├── .gitignore ├── CMakeLists.txt ├── README.md ├── common ├── common.cpp ├── common.h └── config.h.in ├── config ├── clip │ ├── image_encoder.yaml │ ├── prompts.txt │ └── text_encoder.yaml ├── ocr │ ├── det │ │ └── dbnet.yaml │ └── rec │ │ ├── attn.yaml │ │ ├── charset_36.txt │ │ └── ctc.yaml ├── sam │ ├── image_encoder.yaml │ └── mask_decoder.yaml └── yolo │ ├── coco.txt │ ├── yolo.yaml │ ├── yolo_cutoff.yaml │ ├── yolo_pose.yaml │ ├── yolo_seg.yaml │ └── yolo_seg_cutoff.yaml ├── doc ├── environment │ ├── cuda-on-linux.md │ ├── cuda-on-wsl.md │ └── onnxruntime.md └── model │ ├── abinet.md │ ├── clip.md │ ├── crnn.md │ ├── dbnet.md │ ├── sam.md │ └── yolo.md ├── framework ├── CMakeLists.txt ├── framework.h ├── onnx │ ├── onnx.cpp │ └── onnx.h ├── rknn │ ├── rknn.cpp │ └── rknn.h └── tensorrt │ ├── tensorrt.cpp │ └── tensorrt.h ├── model ├── CMakeLists.txt ├── base │ ├── detection_model.cpp │ ├── detection_model.h │ ├── model.cpp │ ├── model.h │ ├── ocr_model.cpp │ └── ocr_model.h ├── clip │ ├── CMakeLists.txt │ ├── clip.cpp │ ├── clip.h │ ├── image_encoder.cpp │ ├── image_encoder.h │ ├── text_encoder.cpp │ ├── text_encoder.h │ ├── text_tokenizer.cpp │ └── text_tokenizer.h ├── ocr │ ├── attention.cpp │ ├── attention.h │ ├── ctc.cpp │ ├── ctc.h │ ├── dbnet.cpp │ ├── dbnet.h │ └── scripts │ │ ├── abinet_export.py │ │ ├── crnn_export.py │ │ └── dbnet_export.py ├── sam │ ├── image_encoder.cpp │ ├── image_encoder.h │ ├── mask_decoder.cpp │ ├── mask_decoder.h │ ├── sam.cpp │ └── sam.h └── yolo │ ├── common.py │ ├── test.py │ ├── yolo.cpp │ ├── yolo.h │ ├── yolo_cutoff.cpp │ ├── yolo_cutoff.h │ ├── yolo_pose.cpp │ ├── yolo_pose.h │ ├── yolo_seg.cpp │ ├── yolo_seg.h │ ├── yolo_seg_cutoff.cpp │ ├── yolo_seg_cutoff.h │ ├── yolov8-det-export.py │ ├── yolov8-pose-export.py │ ├── yolov8-seg-export.py │ └── yolov9-det-export.py ├── output ├── dbnet │ ├── 01.png │ └── 02.png ├── sam │ └── dogs.jpg └── yolo │ ├── detect │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg │ ├── pose │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg │ └── segment │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg └── test ├── CMakeLists.txt ├── clip_test.cpp ├── image ├── clip │ ├── Mona_Lisa.jpg │ └── franz-kafka.jpg ├── detect │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg ├── ocr │ ├── det │ │ ├── 01.png │ │ └── 02.png │ └── rec │ │ └── demo.png └── sam │ └── dogs.jpg ├── ocr_test.cpp ├── sam_test.cpp ├── test.cpp └── yolo_test.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode/* 2 | /build/* 3 | /build-orin/* 4 | /build-rk/* 5 | /weights/* 6 | __pycache__ -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | option(USE_TENSORRT "Compile tensorrt framework" ON) 4 | option(USE_TENSORRT "Compile rknn framework" OFF) 5 | message(STATUS "USE_TENSORRT: ${USE_TENSORRT}\n") 6 | message(STATUS "USE_RKNN: ${USE_RKNN}\n") 7 | 8 | if(USE_TENSORRT) 9 | set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) 10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 11 | project(model_zoo_cxx LANGUAGES CXX CUDA) 12 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 13 | else() 14 | 
project(model_zoo_cxx) 15 | endif() 16 | 17 | if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") 18 | add_compile_options(-Wall -Wextra) 19 | endif() 20 | 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0") 22 | set(CMAKE_CXX_STANDARD 14) 23 | set(CMAKE_BUILD_TYPE Debug) 24 | 25 | # OpenCV 26 | find_package(OpenCV REQUIRED) 27 | message(STATUS "OpenCV Libs: \n${OpenCV_LIBS}\n") 28 | message(STATUS "OpenCV Libraries: \n${OpenCV_LIBRARIES}\n") 29 | message(STATUS "OpenCV Headers: \n${OpenCV_INCLUDE_DIRS}\n") 30 | 31 | find_package(Eigen3 REQUIRED) 32 | 33 | if(USE_TENSORRT) 34 | find_package(CUDA REQUIRED) 35 | message(STATUS "CUDA Libs: \n${CUDA_LIBRARIES}\n") 36 | get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY) 37 | message(STATUS "CUDA Headers: \n${CUDA_INCLUDE_DIRS}\n") 38 | 39 | # TensorRT 40 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 41 | set(TensorRT_INCLUDE_DIRS /usr/include/aarch64-linux-gnu) 42 | set(TensorRT_LIBRARIES /usr/lib/aarch64-linux-gnu) 43 | else() 44 | set(TensorRT_INCLUDE_DIRS /usr/include/x86_64-linux-gnu) 45 | set(TensorRT_LIBRARIES /usr/lib/x86_64-linux-gnu) 46 | endif() 47 | 48 | message(STATUS "TensorRT Libs: \n${TensorRT_LIBRARIES}\n") 49 | message(STATUS "TensorRT Headers: \n${TensorRT_INCLUDE_DIRS}\n") 50 | 51 | list(APPEND INCLUDE_DIRS 52 | ${CUDA_INCLUDE_DIRS} 53 | ${TensorRT_INCLUDE_DIRS} 54 | ) 55 | endif() 56 | 57 | if (USE_RKNN) 58 | set(RKNN_INCLUDE_DIRS /usr/include) 59 | set(RKNN_LIBS /usr/lib/librknnrt.so) 60 | list(APPEND INCLUDE_DIRS 61 | ${RKNN_INCLUDE_DIRS} 62 | ) 63 | endif() 64 | 65 | list(APPEND INCLUDE_DIRS 66 | ${OpenCV_INCLUDE_DIRS} 67 | ) 68 | 69 | set(ONNXRUNTIME_LIBS /usr/lib/libonnxruntime.so) 70 | 71 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 72 | set(Clipper_LIBS /usr/lib/aarch64-linux-gnu/libpolyclipping.so) 73 | else() 74 | set(Clipper_LIBS /usr/lib/x86_64-linux-gnu/libpolyclipping.so) 75 | endif() 76 | message(STATUS "Clipper Libs: \n${Clipper_LIBS}\n") 77 | 78 | add_subdirectory(framework) 79 | add_subdirectory(model) 80 | add_subdirectory(test) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CXX-DeepLearning-Inference 2 | 3 | ## Introduction 4 | A unified and extensible pipeline for deep learning model inference with C++. 
5 | ### Support framework 6 | - [x] ONNXRuntime 7 | - [x] TensorRT 8 | - [x] RKNN 9 | ### Support model 10 | - [x] object-detection 11 | - [x] [yolo](/doc/model/yolo.md) (including yolov8 & yolov9 for detection, segmentation and pose) 12 | - [x] ocr 13 | - [x] [crnn](/doc/model/crnn.md) 14 | - [x] [abinet](/doc/model/abinet.md) 15 | - [x] [dbnet](/doc/model/dbnet.md) 16 | - [x] [sam](/doc/model/sam.md) 17 | - [x] [clip](/doc/model/clip.md) 18 | 19 | | | | ONNXRuntime | TensorRT | RKNN | 20 | |-|-|:-:|:-:|:-:| 21 | | YOLO | YOLO-Det| √ | √ | | 22 | | | YOLO-Seg| √ | √ | | 23 | | | YOLO-Pose| √ | √ | | 24 | | | YOLO-Det-Cutoff ||| √ | 25 | | | YOLO-Seg-Cutoff ||| √ | 26 | |OCR | CRNN | √ | √ | | 27 | | | ABINet | √ | √ | | 28 | | | DBNet | √ | √ | | 29 | |SAM | | √ | √ | | 30 | |CLIP | | √ | √ | | 31 | 32 | ## Appendix 33 | [How to build TensorRT environment](/doc/environment/cuda-on-linux.md) 34 | -------------------------------------------------------------------------------- /common/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.h" 2 | #include 3 | #include 4 | 5 | bool IsFile(const std::string &path) 6 | { 7 | if (!IsPathExist(path)) 8 | { 9 | printf("%s:%d %s not exist\n", __FILE__, __LINE__, path.c_str()); 10 | return false; 11 | } 12 | struct stat buffer; 13 | return (stat(path.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode)); 14 | } 15 | 16 | bool IsFolder(const std::string &path) 17 | { 18 | if (!IsPathExist(path)) 19 | { 20 | return false; 21 | } 22 | struct stat buffer; 23 | return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); 24 | } 25 | 26 | PreParam Letterbox(const cv::Mat &image, cv::Mat &out, cv::Size size) 27 | { 28 | const float inp_h = size.height; 29 | const float inp_w = size.width; 30 | float height = image.rows; 31 | float width = image.cols; 32 | 33 | float r = std::min(inp_h / height, inp_w / width); 34 | int padw = std::round(width * r); 35 | int padh = std::round(height * r); 36 | 37 | cv::Mat tmp; 38 | if ((int)width != padw || (int)height != padh) 39 | { 40 | cv::resize(image, tmp, cv::Size(padw, padh)); 41 | } 42 | else 43 | { 44 | tmp = image.clone(); 45 | } 46 | 47 | float dw = inp_w - padw; 48 | float dh = inp_h - padh; 49 | 50 | dw /= 2.0f; 51 | dh /= 2.0f; 52 | int top = int(std::round(dh - 0.1f)); 53 | int bottom = int(std::round(dh + 0.1f)); 54 | int left = int(std::round(dw - 0.1f)); 55 | int right = int(std::round(dw + 0.1f)); 56 | 57 | cv::copyMakeBorder(tmp, out, top, bottom, left, right, cv::BORDER_CONSTANT, {114, 114, 114}); 58 | 59 | PreParam pparam; 60 | pparam.ratio = 1 / r; 61 | pparam.dw = dw; 62 | pparam.dh = dh; 63 | pparam.height = height; 64 | pparam.width = width; 65 | return pparam; 66 | } 67 | 68 | PreParam paddimg(const cv::Mat &image, cv::Mat &out, int shortsize) { 69 | int w = image.cols; 70 | int h = image.rows; 71 | float scale = 1.f; 72 | if (w < h) { 73 | scale = (float)shortsize / w; 74 | h = scale * h; 75 | w = shortsize; 76 | } 77 | else { 78 | scale = (float)shortsize / h; 79 | w = scale * w; 80 | h = shortsize; 81 | } 82 | 83 | if (h % 32 != 0) { 84 | h = (h / 32 + 1) * 32; 85 | } 86 | if (w % 32 != 0) { 87 | w = (w / 32 + 1) * 32; 88 | } 89 | 90 | cv::resize(image, out, cv::Size(w, h)); 91 | PreParam pparam; 92 | pparam.ratio = 1 / scale; 93 | pparam.dw = 0; 94 | pparam.dh = 0; 95 | pparam.height = image.rows; 96 | pparam.width = image.cols; 97 | return pparam; 98 | } 99 | 100 | int32_t __clip(float val, float min, float max) { 101 | 
float f = val <= min ? min : (val >= max ? max : val); 102 | return f; 103 | } 104 | 105 | float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } 106 | 107 | float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } 108 | 109 | int8_t qntF32ToAffine(float f32, int32_t zp, float scale) { 110 | float dst_val = (f32 / scale) + zp; 111 | int8_t res = (int8_t)__clip(dst_val, -128, 127); 112 | return res; 113 | } 114 | 115 | float deqntAffineToF32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } -------------------------------------------------------------------------------- /common/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "opencv2/opencv.hpp" 3 | #include 4 | #include 5 | 6 | struct Binding 7 | { 8 | size_t size = 1; 9 | size_t dsize = 1; 10 | std::vector dims; 11 | std::string name; 12 | }; 13 | 14 | struct PreParam 15 | { 16 | float ratio = 1.0f; 17 | float dw = 0.0f; 18 | float dh = 0.0f; 19 | float height = 0; 20 | float width = 0; 21 | }; 22 | 23 | inline static float clamp(float val, float min, float max) 24 | { 25 | return val > min ? (val < max ? val : max) : min; 26 | } 27 | 28 | inline bool IsPathExist(const std::string &path) 29 | { 30 | return (access(path.c_str(), 0) == F_OK); 31 | } 32 | 33 | bool IsFile(const std::string &path); 34 | 35 | bool IsFolder(const std::string &path); 36 | 37 | PreParam Letterbox(const cv::Mat &image, cv::Mat &out, cv::Size size); 38 | 39 | PreParam paddimg(const cv::Mat &image, cv::Mat &out, int shortsize = 960); 40 | 41 | float sigmoid(float x); 42 | 43 | float unsigmoid(float y); 44 | 45 | float deqntAffineToF32(int8_t qnt, int32_t zp, float scale); 46 | 47 | int32_t __clip(float val, float min, float max); 48 | 49 | int8_t qntF32ToAffine(float f32, int32_t zp, float scale); -------------------------------------------------------------------------------- /common/config.h.in: -------------------------------------------------------------------------------- 1 | #cmakedefine USE_TENSORRT 2 | #cmakedefine USE_RKNN -------------------------------------------------------------------------------- /config/clip/image_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "clip_image_encoder" 2 | model_path: "../weights/clip/clip_image_model_vitb32.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/clip/clip_image_model_res18.engine" 5 | # framework: "TensorRT" 6 | max_batch_size: 2 -------------------------------------------------------------------------------- /config/clip/prompts.txt: -------------------------------------------------------------------------------- 1 | a photo of a man 2 | a photo of a woman -------------------------------------------------------------------------------- /config/clip/text_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "clip_text_encoder" 2 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 3 | framework: "ONNX" 4 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 5 | prompts: "../config/clip/prompts.txt" 6 | text_embedding: "../weights/clip/text_embeddings.bin" 7 | online: false -------------------------------------------------------------------------------- /config/ocr/det/dbnet.yaml: -------------------------------------------------------------------------------- 1 | model_name: "dbnet" 2 | model_path: "../weights/ocr/DBNet.onnx" 3 | framework: "ONNX" 4 | # 
framework: "TensorRT" 5 | box_thres: 0.5 6 | max_input_size: [1, 3, 1440, 1440] -------------------------------------------------------------------------------- /config/ocr/rec/attn.yaml: -------------------------------------------------------------------------------- 1 | model_name: "abinet" 2 | model_path: "../weights/ocr/best-train-abinet.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [128,32] # (width, height) 6 | input_channel: 3 7 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 8 | output_size: 26 -------------------------------------------------------------------------------- /config/ocr/rec/charset_36.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | a 12 | b 13 | c 14 | d 15 | e 16 | f 17 | g 18 | h 19 | i 20 | j 21 | k 22 | l 23 | m 24 | n 25 | o 26 | p 27 | q 28 | r 29 | s 30 | t 31 | u 32 | v 33 | w 34 | x 35 | y 36 | z -------------------------------------------------------------------------------- /config/ocr/rec/ctc.yaml: -------------------------------------------------------------------------------- 1 | model_name: "crnn" 2 | model_path: "../weights/ocr/crnn.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [100,32] # (width, height) 6 | input_channel: 1 7 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 8 | output_size: 26 -------------------------------------------------------------------------------- /config/sam/image_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "sam_image_encoder" 2 | model_path: "../weights/sam/resnet18_image_encoder.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/sam/resnet18_image_encoder.engine" 5 | # framework: "TensorRT" -------------------------------------------------------------------------------- /config/sam/mask_decoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "sam_mask_decoder" 2 | model_path: "../weights/sam/mobile_sam_mask_decoder.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/sam/mobile_sam_mask_decoder.engine" 5 | # framework: "TensorRT" -------------------------------------------------------------------------------- /config/yolo/coco.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /config/yolo/yolo.yaml: 
-------------------------------------------------------------------------------- 1 | model_name: "yolo" 2 | model_path: "../weights/yolo/yolov8s_end2end.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | with_nms: true 7 | conf_thres: 0.25 8 | nms_thres: 0.65 9 | topk: 100 -------------------------------------------------------------------------------- /config/yolo/yolo_cutoff.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_det" 2 | model_path: "../weights/yolo/yolov8n.rknn" 3 | framework: "RKNN" 4 | input_size: [640,640] 5 | conf_thres: 0.25 6 | nms_thres: 0.65 7 | class_num: 80 8 | topk: 100 -------------------------------------------------------------------------------- /config/yolo/yolo_pose.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_pose" 2 | model_path: "../weights/yolo/yolov8s-pose.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | conf_thres: 0.25 7 | nms_thres: 0.65 -------------------------------------------------------------------------------- /config/yolo/yolo_seg.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_seg" 2 | model_path: "../weights/yolo/yolov8s-seg.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | conf_thres: 0.25 7 | nms_thres: 0.65 8 | seg_size: [160, 160] 9 | seg_channels: 32 10 | -------------------------------------------------------------------------------- /config/yolo/yolo_seg_cutoff.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_seg" 2 | model_path: "../weights/yolo/yolov8s-seg.rknn" 3 | framework: "RKNN" 4 | input_size: [640,640] 5 | conf_thres: 0.25 6 | nms_thres: 0.65 7 | seg_size: [160, 160] 8 | seg_channels: 32 9 | class_num: 80 10 | topk: 100 -------------------------------------------------------------------------------- /doc/environment/cuda-on-linux.md: -------------------------------------------------------------------------------- 1 | # CUDA ON Ubuntu 2 | 3 | ## 驱动安装 4 | 5 | TODO 6 | 7 | ## cuda安装 8 | 9 | ### 添加源 10 | 11 | For Ubuntu 22.04 12 | ``` 13 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC 14 | sudo sh -c 'echo "deb https://mirrors.aliyun.com/nvidia-cuda/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list' 15 | ``` 16 | 17 | ### 安装 18 | ``` 19 | # 更新列表 20 | sudo apt-get update 21 | 22 | # 查询可用版本 23 | apt search cuda-toolkit 24 | 25 | # 安装 26 | sudo apt install cuda-toolkit- 27 | ``` 28 | 29 | ## TensorRT安装 30 | 31 | ``` 32 | # 查询可用版本 33 | apt policy tensorrt-dev 34 | 35 | # 安装 36 | sudo apt install tensorrt-dev= 37 | 38 | # 安装trtexec 39 | sudo apt install libnvinfer-bin= 40 | ``` -------------------------------------------------------------------------------- /doc/environment/cuda-on-wsl.md: -------------------------------------------------------------------------------- 1 | # CUDA ON WSL 2 | 3 | ## 驱动安装 4 | 5 | [CUDA on WSL 驱动下载地址](https://developer.nvidia.com/cuda/wsl) 6 | 7 | 根据自己的GPU类型(GeForce and Quadro) 选择对应的驱动。 8 | 不需要在wsl下安装nvidia驱动,windows会自动为wsl安装nvidia驱动。 9 | 10 | ## cuda安装 11 | 12 | ### 添加源 13 | 14 | For Ubuntu 22.04 15 | ``` 16 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC 17 | sudo sh -c 'echo "deb https://mirrors.aliyun.com/nvidia-cuda/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list' 18 | ``` 
19 | 20 | ### 安装 21 | ``` 22 | # 更新列表 23 | sudo apt-get update 24 | 25 | # 查询可用版本 26 | apt search cuda-toolkit 27 | 28 | # 安装 29 | sudo apt install cuda-toolkit- 30 | ``` 31 | 32 | ## TensorRT安装 33 | 34 | ``` 35 | # 查询可用版本 36 | apt policy tensorrt-dev 37 | 38 | # 安装 39 | sudo apt install tensorrt-dev= 40 | 41 | # 安装trtexec 42 | sudo apt install libnvinfer-bin= 43 | ``` -------------------------------------------------------------------------------- /doc/environment/onnxruntime.md: -------------------------------------------------------------------------------- 1 | # ONNXRuntime install 2 | 3 | 1. choose a version in [Onnxruntime release](https://github.com/microsoft/onnxruntime/releases) 4 | 2. download 5 | ``` 6 | wget https://github.com/microsoft/onnxruntime/releases/download/v1.16.2/onnxruntime-linux-x64-1.16.2.tgz 7 | ``` 8 | 3. install 9 | ``` 10 | tar -xzf onnxruntime-linux-x64-1.16.2.tgz 11 | cd onnxruntime-linux-x64-1.16.2 12 | sudo cp include/* /usr/include 13 | sudo cp lib/* /usr/lib 14 | ``` -------------------------------------------------------------------------------- /doc/model/abinet.md: -------------------------------------------------------------------------------- 1 | # ABINet 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [ABINet](https://github.com/FangShancheng/ABINet). 5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/FangShancheng/ABINet.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/abinet_export.py ABINet 12 | cd ABINet 13 | python3 abinet_export.py --sim --weights=path-to-weights 14 | ``` 15 | You can checkout the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,3,32,128]) 18 | - outputs 19 | - output (float32[1,26,1]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec 24 | --onnx=path-to-your-onnx-model \ 25 | --saveEngine=save-path \ 26 | --fp16 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/ocr/rec/attn.yaml for abinet model 39 | ``` 40 | model_name: "abinet" 41 | model_path: "../weights/ocr/best-train-abinet.onnx" 42 | framework: "ONNX" 43 | input_size: [128,32] # (width, height) 44 | input_channel: 3 45 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 46 | output_size: 26 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | 54 | ### TensorRT 55 | ``` 56 | mkdir build && cd build 57 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 58 | make 59 | ``` 60 | #### Config 61 | config/ocr/rec/attn.yaml for abinet model 62 | ``` 63 | model_name: "abinet" 64 | model_path: "../weights/ocr/best-train-abinet.engine" 65 | framework: "TensorRT" 66 | input_size: [128,32] # (width, height) 67 | input_channel: 3 68 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 69 | output_size: 26 70 | ``` 71 | #### Run 72 | ``` 73 | cd build 74 | ./test/ocr_test 75 | ``` -------------------------------------------------------------------------------- /doc/model/clip.md: -------------------------------------------------------------------------------- 1 | # Clip 2 | 3 | 4 | ## Related repos 5 | - [Clip](https://github.com/openai/CLIP) 6 | - [onnx_clip](https://github.com/lakeraai/onnx_clip.git) 7 | - [clip-distillation](https://github.com/NVIDIA-AI-IOT/clip-distillation) 8 | 9 | ## ONNXRuntime Inference 10 | ### Model 11 | You can download the ONNX model by 12 | ``` 13 | wget https://lakera-clip.s3.eu-west-1.amazonaws.com/clip_image_model_vitb32.onnx 14 | wget https://lakera-clip.s3.eu-west-1.amazonaws.com/clip_text_model_vitb32.onnx 15 | ``` 16 | Or you can export by yourself. Python code will be like 17 | ``` 18 | torch.onnx.export(image_model, 19 | torch.randn(1, 3, 224, 224), 20 | f, 21 | opset_version=11, 22 | input_names=['IMAGE'], 23 | output_names=['IMAGE_EMBEDDING'], 24 | dynamic_axes={ 25 | 'IMAGE': {0: 'batch_size'}, 26 | 'IMAGE_EMBEDDING': {0: 'batch_size'} 27 | }) 28 | 29 | torch.onnx.export(text_model, 30 | torch.randn(1, 77), 31 | f, 32 | opset_version=11, 33 | input_names=['TEXT'], 34 | output_names=['TEXT_EMBEDDING'], 35 | dynamic_axes={ 36 | 'TEXT': {0: 'batch_size'}, 37 | 'TEXT_EMBEDDING': {0: 'batch_size'} 38 | }) 39 | ``` 40 | 41 | ### Get bpe vocab 42 | ``` 43 | git clone https://github.com/lakeraai/onnx_clip.git 44 | cp onnx_clip/onnx_clip/data/bpe_simple_vocab_16e6.txt.gz model-zoo-cxx/weights/clip 45 | ``` 46 | 47 | ### Inference 48 | #### Build 49 | ``` 50 | mkdir build && cd build 51 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 52 | make 53 | ``` 54 | #### Config 55 | config/clip/image_encoder.yaml 56 | ``` 57 | model_name: "clip_image_encoder" 58 | model_path: "../weights/clip/clip_image_model_vitb32.onnx" 59 | framework: "ONNX" 60 | ``` 61 | config/clip/text_encoder.yaml 62 | ``` 63 | model_name: "clip_text_encoder" 64 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 65 | framework: "ONNX" 66 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 67 | prompts: "../config/clip/prompts.txt" 68 | text_embedding: "../weights/clip/text_embeddings.bin" 69 | online: true 70 | ``` 71 | #### Run 72 | ``` 73 | cd build 74 | ./test/clip_test 75 | ``` 76 | You can get output like 77 | ``` 78 | Input: 79 | IMAGE: [-1,3,224,224,] 80 | Output: 81 | IMAGE_EMBEDDING: [-1,512,] 82 | Input: 83 | TEXT: [-1,77,] 84 | Output: 85 | TEXT_EMBEDDING: [-1,512,] 86 | Shape of IMAGE_EMBEDDING: [2,512,] 87 | Shape of TEXT_EMBEDDING: [2,512,] 88 | [ [ 0.970533 0.0294665 ], [ 0.0195933 0.980407 ], ] 89 | Destruct text encoder 90 | Destruct image encoder 91 | ``` 92 | 93 | ## TensorRT Inference 94 | ### Model 95 | We can simply transfer the onnx model to tensorrt engine by 96 | ``` 97 | /usr/src/tensorrt/bin/trtexec \ 98 | --onnx=${onnx_image_model_path} \ 99 | --saveEngine=${tensorrt_image_model_path} \ 100 | --fp16 \ 101 | --minShapes=IMAGE:1x3x224x224 \ 102 | --optShapes=IMAGE:1x3x224x224 \ 103 | --maxShapes=IMAGE:10x3x224x224 104 | /usr/src/tensorrt/bin/trtexec \ 105 | --onnx=${onnx_text_model_path} \ 106 | --saveEngine=${tensorrt_text_model_path} \ 107 | --fp16 \ 108 | --minShapes=TEXT:1x77 \ 109 | --optShapes=TEXT:1x77 \ 110 | --maxShapes=TEXT:10x77 111 | ``` 112 | However, these models with vit is too large for Jetson. 113 | 114 | Nvidia releases [clip-distillation](https://github.com/NVIDIA-AI-IOT/clip-distillation) to solve this problem. 115 | First, train a smaller image model with knowledge distillation. Though Nvidia does not release weights yet, it publishes a pipeline to train models. 116 | Second, fix the prompt texts and save their embeddings. 117 | 118 | ### Inference 119 | #### Build 120 | ``` 121 | mkdir build && cd build 122 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 123 | make 124 | ``` 125 | #### Config 126 | config/clip/image_encoder.yaml 127 | ``` 128 | model_name: "clip_image_encoder" 129 | model_path: "../weights/clip/clip_image_model_res18.engine" 130 | framework: "TensorRT" 131 | ``` 132 | config/clip/text_encoder.yaml 133 | ``` 134 | model_name: "clip_text_encoder" 135 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 136 | framework: "ONNX" 137 | # model_path: "../weights/sam/resnet18_image_encoder.engine" 138 | # framework: "TensorRT" 139 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 140 | prompts: "../config/clip/prompts.txt" 141 | text_embedding: "../weights/clip/text_embeddings.bin" 142 | online: false 143 | ``` 144 | #### Run 145 | ``` 146 | cd build 147 | ./test/clip_test -g ## generate text_embeddings.bin 148 | ./test/clip_test 149 | ``` -------------------------------------------------------------------------------- /doc/model/crnn.md: -------------------------------------------------------------------------------- 1 | # CRNN 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [crnn.pytorch](https://github.com/meijieru/crnn.pytorch). 
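CRNN is trained with a CTC loss, so the raw recognition output is a per-timestep sequence that still contains blanks and repeated symbols; the final string comes from the standard greedy CTC rule (collapse consecutive repeats, then drop blanks). Below is a minimal sketch of that rule only, assuming the usual crnn.pytorch convention that index 0 is the blank and index i maps to `alphabet[i-1]`; the helper name is illustrative and is not this repo's `ctc.cpp` API.

```cpp
#include <string>
#include <vector>

// Greedy CTC decode: collapse consecutive repeats, then drop blanks (index 0).
// Assumption: index i >= 1 maps to alphabet[i - 1], as in crnn.pytorch.
std::string CtcGreedyDecode(const std::vector<int>& indices, const std::string& alphabet) {
    std::string text;
    int prev = 0;  // start as if the previous symbol were a blank
    for (int idx : indices) {
        if (idx != 0 && idx != prev) {
            text += alphabet[static_cast<size_t>(idx) - 1];
        }
        prev = idx;
    }
    return text;
}
```

With the alphabet from `config/ocr/rec/ctc.yaml` ("0123456789abcdefghijklmnopqrstuvwxyz"), a sequence such as {11, 11, 0, 11, 12} decodes to "aab".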
5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/meijieru/crnn.pytorch.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/crnn_export.py crnn.pytorch 12 | cd crnn.pytorch 13 | python3 crnn_export.py --weights=crnn.pth --sim 14 | ``` 15 | You can checkout the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,1,32,100]) 18 | - outputs 19 | - output (float32[1,26,1]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec 24 | --onnx=path-to-your-onnx-model \ 25 | --saveEngine=save-path \ 26 | --fp16 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/ocr/rec/ctc.yaml for crnn model 39 | ``` 40 | model_name: "crnn" 41 | model_path: "../weights/ocr/crnn.onnx" 42 | framework: "ONNX" 43 | input_size: [100,32] # (width, height) 44 | input_channel: 1 45 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 46 | output_size: 26 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | You can see output like: 54 | ``` 55 | Input: 56 | images: 3200 57 | Output: 58 | output: 26 59 | ../test/image/ocr/demo.png: available 60 | cost 7.9420 ms 61 | Destruct ocr model 62 | ``` 63 | 64 | ### TensorRT 65 | ``` 66 | mkdir build && cd build 67 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 68 | make 69 | ``` 70 | #### Config 71 | config/ocr/rec/ctc.yaml for crnn model 72 | ``` 73 | model_name: "crnn" 74 | model_path: "../weights/ocr/crnn.engine" 75 | framework: "TensorRT" 76 | input_size: [100,32] # (width, height) 77 | input_channel: 1 78 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 79 | output_size: 26 80 | ``` 81 | #### Run 82 | ``` 83 | cd build 84 | ./test/ocr_test 85 | ``` 86 | You can see output like: 87 | ``` 88 | Input bind name: images 89 | Output bind name: output 90 | model warmup 10 times 91 | ../test/image/ocr/demo.png: available 92 | cost 16.3920 ms 93 | Destruct ocr model 94 | ``` -------------------------------------------------------------------------------- /doc/model/dbnet.md: -------------------------------------------------------------------------------- 1 | # DBNet 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [DBNet.pytorch](https://github.com/BaofengZan/DBNet.pytorch). 5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/BaofengZan/DBNet.pytorch.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/dbnet_export.py DBNet.pytorch 12 | cd DBNet.pytorch 13 | python3 dbnet_export.py --sim --weights=path-to-weights 14 | ``` 15 | It's an onnx model with dynamic axes, you can check the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,3,height,width]) 18 | - outputs 19 | - output (float32[Resizeoutput_dim_0,Resizeoutput_dim_1,Resizeoutput_dim_2,Resizeoutput_dim_3]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec \ 24 | --onnx=DBNet.onnx \ 25 | --saveEngine=DBNet.engine \ 26 | --fp16 \ 27 | --minShapes=images:1x3x608x608 \ 28 | --maxShapes=images:1x3x1440x1440 \ 29 | --optShapes=images:1x3x640x1152 30 | ``` 31 | 32 | ## Inference 33 | ### ONNXRuntime 34 | #### Build 35 | ``` 36 | mkdir build && cd build 37 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 38 | make 39 | ``` 40 | #### Config 41 | config/ocr/det/dbnet.yaml 42 | ``` 43 | model_name: "dbnet" 44 | model_path: "../weights/ocr/det/DBNet.onnx" 45 | framework: "ONNX" 46 | box_thres: 0.5 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | 54 | ### TensorRT 55 | ``` 56 | mkdir build && cd build 57 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 58 | make 59 | ``` 60 | #### Config 61 | config/ocr/det/dbnet.yaml 62 | ``` 63 | model_name: "dbnet" 64 | model_path: "../weights/ocr/det/DBNet.engine" 65 | framework: "TensorRT" 66 | box_thres: 0.5 67 | ``` 68 | #### Run 69 | ``` 70 | cd build 71 | ./test/ocr_test 72 | ``` -------------------------------------------------------------------------------- /doc/model/sam.md: -------------------------------------------------------------------------------- 1 | # SAM 2 | This model is based on [NanoSAM](https://github.com/NVIDIA-AI-IOT/nanosam). 3 | 4 | ## Get model 5 | ### ONNX 6 | 1. Download the image encoder ONNX file from [here](https://drive.google.com/file/d/14-SsvoaTl-esC3JOzomHDnI9OGgdO2OR/view?usp=drive_link). 7 | 2. Download the mask decoder ONNX file from [here](https://drive.google.com/file/d/1jYNvnseTL49SNRx9PDcbkZ9DwsY8up7n/view?usp=drive_link). 8 | 3. Or you can export mannally following [NanoSAM](https://github.com/NVIDIA-AI-IOT/nanosam). 9 | 10 | ### TensorRT 11 | - image encoder 12 | ``` 13 | ${tensorrt-install-path}/bin/trtexec \ 14 | --onnx=data/resnet18_image_encoder.onnx \ 15 | --saveEngine=data/resnet18_image_encoder.engine \ 16 | --fp16 17 | ``` 18 | 19 | - mask decoder 20 | ``` 21 | ${tensorrt-install-path}/bin/trtexec \ 22 | --onnx=weights/sam/mobile_sam_mask_decoder.onnx \ 23 | --saveEngine=weights/sam/mobile_sam_mask_decoder.engine \ 24 | --minShapes=point_coords:1x1x2,point_labels:1x1 \ 25 | --optShapes=point_coords:1x1x2,point_labels:1x1 \ 26 | --maxShapes=point_coords:1x10x2,point_labels:1x10 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/sam/image_encoder.yaml 39 | ``` 40 | model_name: "sam_image_encoder" 41 | model_path: "../weights/sam/resnet18_image_encoder.onnx" 42 | framework: "ONNX" 43 | ``` 44 | config/sam/mask_decoder.yaml 45 | ``` 46 | model_name: "sam_mask_decoder" 47 | model_path: "../weights/sam/mobile_sam_mask_decoder.onnx" 48 | framework: "ONNX" 49 | ``` 50 | #### Run 51 | ``` 52 | cd build 53 | ./test/ocr_test 54 | ``` 55 | 56 | ### TensorRT 57 | #### Build 58 | ``` 59 | mkdir build && cd build 60 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 61 | make 62 | ``` 63 | #### Config 64 | config/sam/image_encoder.yaml 65 | ``` 66 | model_name: "sam_image_encoder" 67 | model_path: "../weights/sam/resnet18_image_encoder.engine" 68 | framework: "TensorRT" 69 | ``` 70 | config/sam/mask_decoder.yaml 71 | ``` 72 | model_name: "sam_mask_decoder" 73 | model_path: "../weights/sam/mobile_sam_mask_decoder.engine" 74 | framework: "TensorRT" 75 | ``` 76 | #### Run 77 | ``` 78 | cd build 79 | ./test/ocr_test 80 | ``` 81 | 82 | ### Sample result 83 |
(sample result image, see output/sam/dogs.jpg)
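For reference, the dynamic shapes used in the trtexec command above (`point_coords:1xNx2`, `point_labels:1xN`) map directly onto the `IOTensor` type from `framework/framework.h`. The sketch below only packs a single foreground click into those two prompt tensors; the image-embedding input is omitted, and the helper name and FP32 label type are assumptions rather than this repo's `sam.h` API.

```cpp
#include <cstring>
#include <string>
#include <unordered_map>

#include "framework/framework.h"

// Pack one foreground click (x, y) into the mask decoder's prompt tensors.
// Shapes follow the trtexec flags: point_coords is 1xNx2, point_labels is 1xN.
std::unordered_map<std::string, IOTensor> MakePointPrompt(float x, float y) {
    std::unordered_map<std::string, IOTensor> inputs;

    IOTensor coords;
    coords.shape = {1, 1, 2};            // a single prompt point
    coords.data_type = DataType::FP32;
    coords.resize(2 * sizeof(float));
    const float xy[2] = {x, y};
    std::memcpy(coords.data(), xy, sizeof(xy));

    IOTensor labels;
    labels.shape = {1, 1};
    labels.data_type = DataType::FP32;
    labels.resize(sizeof(float));
    const float fg = 1.0f;               // 1 = foreground, 0 = background
    std::memcpy(labels.data(), &fg, sizeof(fg));

    inputs["point_coords"] = std::move(coords);
    inputs["point_labels"] = std::move(labels);
    return inputs;
}
```

Adding more clicks only changes the middle dimension (N), which is exactly what the min/opt/max profile above allows (1 to 10 points).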
-------------------------------------------------------------------------------- /doc/model/yolo.md: -------------------------------------------------------------------------------- 1 | # YOLO 2 | In this repo, all versions of yolo algorithms are exported with the same inputs and outputs, so we can use the same C++ code to inference. 3 | ## Prepare 4 | ``` 5 | python3 -m pip install ultralytics, onnx, onnxsim 6 | ``` 7 | 8 | ## Get pytorch model 9 | ``` 10 | # yolov8s-detect 11 | wget https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s.pt 12 | # yolov8s-seg 13 | wget https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-seg.pt 14 | # yolov9c 15 | wget https://github.com/WongKinYiu/yolov9/releases/download/v0.1/yolov9-c-converted.pt 16 | ``` 17 | 18 | ## Export 19 | ### ONNX 20 | #### YOLOv8-Detect 21 | ``` 22 | # without-nms 23 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim 24 | # with onnx-nms plugin 25 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim --onnx-nms 26 | # with trt-nms plugin (only for tensorrt transfer, not support to inference with onnxruntime C++) 27 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim --trt-nms 28 | ``` 29 | 30 | #### YOLOv8-Segment 31 | ``` 32 | python3 model/yolo/yolov8-seg-export.py --weights=path-to-your-weights --sim 33 | ``` 34 | 35 | #### YOLOv8-Pose 36 | ``` 37 | python3 model/yolo/yolov8-pose-export.py --weights=path-to-your-weights --sim 38 | ``` 39 | 40 | #### YOLOv9-Detect 41 | ``` 42 | git clone https://github.com/WongKinYiu/yolov9.git 43 | cp model/yolo/yolov9-det-export.py yolov9/ 44 | cd yolov9 45 | # without-nms 46 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim 47 | # with onnx-nms plugin 48 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim --onnx-nms 49 | # with trt-nms plugin (only for tensorrt transfer, not support to inference with onnxruntime C++) 50 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim --trt-nms 51 | ``` 52 | 53 | ### TensorRT 54 | ``` 55 | ${tensorrt-install-path}/bin/trtexec 56 | --onnx=path-to-your-onnx-model \ 57 | --saveEngine=save-path \ 58 | --fp16 59 | ``` 60 | 61 | ### RKNN 62 | - Pytorch to ONNX, see [airockchip/ultralytics_yolov8](https://github.com/airockchip/ultralytics_yolov8) 63 | - ONNX to RKNN, see [airockchip/rknn-model-zoo](https://github.com/airockchip/rknn_model_zoo) 64 | 65 | ## Inference 66 | ### Build 67 | Compile options: 68 | - USE_TENSORRT 69 | - USE_RKNN 70 | ``` 71 | mkdir build && cd build 72 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON/OFF -DUSE_RKNN=ON/OFF 73 | make 74 | ``` 75 | ### Config 76 | #### Detect 77 | ``` 78 | model_name: "yolo" 79 | model_path: "path-to-model-file" 80 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 81 | input_size: [640,640] 82 | with_nms: true 83 | conf_thres: 0.25 84 | nms_thres: 0.65 85 | topk: 100 86 | ``` 87 | #### Segment 88 | ``` 89 | model_name: "yolo_seg" 90 | model_path: "path-to-model-file" 91 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 92 | input_size: [640,640] 93 | conf_thres: 0.25 94 | nms_thres: 0.65 95 | seg_size: [160, 160] 96 | seg_channels: 32 97 | ``` 98 | #### Segment-Cutoff 99 | ``` 100 | model_name: "yolo_seg" 101 | model_path: "../weights/yolo/yolov8s-seg.rknn" 102 | framework: "RKNN" 103 | input_size: [640,640] 104 | conf_thres: 0.25 105 | nms_thres: 0.65 106 | seg_size: [160, 160] 107 | seg_channels: 32 108 | class_num: 80 109 | topk: 100 110 | ``` 111 | 112 | #### Pose 113 | ``` 114 | model_name: "yolo_pose" 115 | model_path: "path-to-model-file" 116 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 117 | input_size: [640,640] 118 | conf_thres: 0.25 119 | nms_thres: 0.65 120 | ``` 121 | 122 | #### Run 123 | ``` 124 | mkdir -p output/yolo/detect 125 | mkdir output/yolo/segment 126 | cd build 127 | ./test/yolo_test 128 | ``` 129 | 130 | ### Example output 131 |
132 | (example output images, see output/yolo/) 133 | (example output images, see output/yolo/)
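For reference, every YOLO variant in this repo resizes input images with `Letterbox()` from `common/common.cpp`, so raw detections live in the 640x640 network frame; they are mapped back to the original image with the returned `PreParam` (`ratio` is 1/r, `dw`/`dh` are the padding offsets). Below is a minimal sketch of that inverse mapping using the `clamp()` helper from `common/common.h`; the `NetBox` struct and function name are illustrative, not the repo's `yolo.h` API.

```cpp
#include "common/common.h"

// A detection box in the 640x640 letterboxed (network) frame; illustrative only.
struct NetBox {
    float x0, y0, x1, y1;
};

// Undo the Letterbox() transform: remove the padding (dw, dh), rescale by
// ratio = 1/r, and clamp to the original image bounds stored in PreParam.
NetBox ToOriginalFrame(const NetBox& b, const PreParam& p) {
    NetBox out;
    out.x0 = clamp((b.x0 - p.dw) * p.ratio, 0.0f, p.width);
    out.y0 = clamp((b.y0 - p.dh) * p.ratio, 0.0f, p.height);
    out.x1 = clamp((b.x1 - p.dw) * p.ratio, 0.0f, p.width);
    out.y1 = clamp((b.y1 - p.dh) * p.ratio, 0.0f, p.height);
    return out;
}
```

The segmentation and pose outputs need the same correction for their box and keypoint coordinates before masks or skeletons are drawn on the original image.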
-------------------------------------------------------------------------------- /framework/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | configure_file(../common/config.h.in config.h) 2 | 3 | if(USE_TENSORRT) 4 | add_library(framework_trt SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 5 | ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt/tensorrt.cpp) 6 | target_include_directories(framework_trt PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 7 | target_link_libraries(framework_trt nvinfer nvinfer_plugin ${CUDA_LIBRARIES}) 8 | target_link_directories(framework_trt PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS}) 9 | endif() 10 | 11 | if(USE_RKNN) 12 | add_library(framework_rknn SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/rknn/rknn.cpp) 14 | target_include_directories(framework_rknn PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 15 | target_link_libraries(framework_rknn ${RKNN_LIBS}) 16 | target_link_directories(framework_rknn PUBLIC ${OpenCV_LIBS}) 17 | endif() 18 | 19 | add_library(framework_onnx SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 20 | ${CMAKE_CURRENT_SOURCE_DIR}/onnx/onnx.cpp) 21 | target_include_directories(framework_onnx PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 22 | target_link_libraries(framework_onnx ${ONNXRUNTIME_LIBS}) 23 | target_link_directories(framework_onnx PUBLIC ${OpenCV_LIBS}) 24 | -------------------------------------------------------------------------------- /framework/framework.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "common/common.h" 7 | 8 | enum DataType { 9 | INT32 = 1, 10 | FP32 = 2, 11 | UINT8 = 3, 12 | INT8 = 4 13 | }; 14 | 15 | struct IOTensor { 16 | std::vector raw_data; 17 | std::vector shape; 18 | int zp; 19 | float scale; 20 | DataType data_type = DataType::FP32; 21 | void resize(size_t size) { 22 | raw_data.resize(size); 23 | } 24 | 25 | size_t size() const { 26 | return raw_data.size(); 27 | } 28 | 29 | uint8_t* data() { 30 | return raw_data.data(); 31 | } 32 | 33 | const uint8_t* data() const{ 34 | return raw_data.data(); 35 | } 36 | }; 37 | 38 | enum Status { SUCCESS = 0, INIT_ERROR = -1, INFERENCE_ERROR = -2}; 39 | 40 | struct Config { 41 | std::string model_path; 42 | std::map input_len; 43 | std::map output_len; 44 | bool is_dynamic; 45 | }; 46 | 47 | class BaseFramework { 48 | public: 49 | BaseFramework() {} 50 | virtual ~BaseFramework() {} 51 | virtual Status Init(Config config) = 0; 52 | virtual Status forward(const std::unordered_map &input, 53 | std::unordered_map &output) = 0; 54 | 55 | protected: 56 | std::vector input_bindings; 57 | std::vector output_bindings; 58 | bool is_dynamic; 59 | }; -------------------------------------------------------------------------------- /framework/onnx/onnx.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/onnx/onnx.h" 2 | 3 | int TypeToSize(const ONNXTensorElementDataType& dataType) { 4 | switch (dataType) { 5 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: 6 | return 4; 7 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: 8 | return 2; 9 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: 10 | return 4; 11 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: 12 | return 1; 13 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: 14 | return 1; 15 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: 16 | return 8; 17 | default: 18 | std::cout << 
"Unknown data type " << dataType << std::endl; 19 | return 4; 20 | } 21 | } 22 | 23 | Status ONNXFramework::Init(Config config) { 24 | is_dynamic = config.is_dynamic; 25 | env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "ONNX_DETECTION"); 26 | session_options = Ort::SessionOptions(); 27 | 28 | Ort::AllocatorWithDefaultOptions allocator; 29 | 30 | #ifdef _WIN32 31 | std::wstring w_modelPath = utils::charToWstring(model_path.c_str()); 32 | session = Ort::Session(env, w_modelPath.c_str(), sessionOptions); 33 | #else 34 | session = new Ort::Session(env, config.model_path.c_str(), session_options); 35 | #endif 36 | 37 | std::cout << "Input: " << std::endl; 38 | int input_num = session->GetInputCount(); 39 | for (int i = 0; i < input_num; i++) { 40 | Ort::TypeInfo input_type_info = session->GetInputTypeInfo(i); 41 | std::vector input_tensor_shape = input_type_info.GetTensorTypeAndShapeInfo().GetShape(); 42 | 43 | Binding binding; 44 | int64_t size = 1; 45 | for (size_t j = 0; j < input_tensor_shape.size(); j++) { 46 | binding.dims.push_back(input_tensor_shape[j]); 47 | size *= input_tensor_shape[j]; 48 | } 49 | 50 | if (size <= 0) { 51 | size = config.input_len[binding.name]; 52 | } 53 | 54 | binding.size = size; 55 | binding.dsize = TypeToSize(input_type_info.GetTensorTypeAndShapeInfo().GetElementType()); 56 | 57 | Ort::AllocatedStringPtr input_name = session->GetInputNameAllocated(i, allocator); 58 | binding.name = input_name.get(); 59 | input_bindings.push_back(binding); 60 | std::cout << binding.name << ": ["; 61 | for (size_t j = 0; j < input_tensor_shape.size(); j++) { 62 | std::cout << input_tensor_shape[j] << ","; 63 | } 64 | std::cout << "]" << std::endl; 65 | 66 | if (!is_dynamic && config.input_len[binding.name] != size) { 67 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 68 | << config.input_len[binding.name] << "!=" << size << ")" << std::endl; 69 | return Status::INIT_ERROR; 70 | } 71 | } 72 | 73 | std::cout << "Output: " << std::endl; 74 | int output_num = session->GetOutputCount(); 75 | for (int i = 0; i < output_num; i++) { 76 | Binding binding; 77 | 78 | Ort::TypeInfo output_type_info = session->GetOutputTypeInfo(i); 79 | std::vector output_tensor_shape = output_type_info.GetTensorTypeAndShapeInfo().GetShape(); 80 | 81 | Ort::AllocatedStringPtr output_name = session->GetOutputNameAllocated(i, allocator); 82 | binding.name = output_name.get(); 83 | 84 | int64_t size = 1; 85 | for (size_t j = 0; j < output_tensor_shape.size(); j++) { 86 | binding.dims.push_back(output_tensor_shape[j]); 87 | size *= output_tensor_shape[j]; 88 | } 89 | 90 | if (size <= 0) { 91 | size = config.output_len[binding.name]; 92 | } 93 | 94 | binding.size = size; 95 | binding.dsize = TypeToSize(output_type_info.GetTensorTypeAndShapeInfo().GetElementType()); 96 | 97 | output_bindings.push_back(binding); 98 | 99 | std::cout << binding.name << ": ["; 100 | for (size_t j = 0; j < output_tensor_shape.size(); j++) { 101 | std::cout << output_tensor_shape[j] << ","; 102 | } 103 | std::cout << "]" << std::endl; 104 | 105 | if (!is_dynamic && config.output_len[binding.name] != size) { 106 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 107 | << config.output_len[binding.name] << "!=" << size << ")" << std::endl; 108 | return Status::INIT_ERROR; 109 | } 110 | 111 | } 112 | 113 | return Status::SUCCESS; 114 | } 115 | 116 | ONNXFramework::~ONNXFramework() { 117 | delete session; 118 | } 119 | 120 | Status ONNXFramework::forward(const std::unordered_map& input, 121 | std::unordered_map& output) { 122 | std::vector input_tensors; 123 | Ort::MemoryInfo memory_info = 124 | Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); 125 | 126 | std::vector input_names; 127 | for (const auto& binding : input_bindings) { 128 | const std::string input_name = binding.name; 129 | input_names.emplace_back(binding.name.c_str()); 130 | if (input.find(input_name) == input.end()) { 131 | std::cout << "Cannot find " << input_name << " from the input tensors!" << std::endl; 132 | return Status::INFERENCE_ERROR; 133 | } 134 | 135 | size_t size = 1; 136 | if (!is_dynamic) { 137 | size = binding.size; 138 | } else { 139 | for (size_t i = 0; i < input.at(input_name).shape.size(); i++) { 140 | size *= input.at(input_name).shape[i]; 141 | } 142 | } 143 | if (input.at(input_name).data_type == DataType::INT32) { 144 | input_tensors.push_back(Ort::Value::CreateTensor( 145 | memory_info, (int*)input.at(input_name).data(), size, input.at(input_name).shape.data(), input.at(input_name).shape.size())); 146 | } else if (input.at(input_name).data_type == DataType::FP32) { 147 | input_tensors.push_back(Ort::Value::CreateTensor( 148 | memory_info, (float*)input.at(input_name).data(), size, input.at(input_name).shape.data(), input.at(input_name).shape.size())); 149 | } else { 150 | std::cout << "Error occur when Ort::Value::CreateTensor" << std::endl; 151 | } 152 | 153 | } 154 | 155 | std::vector output_names; 156 | for (const auto& binding : output_bindings) { 157 | output_names.emplace_back(binding.name.c_str()); 158 | if (output.find(binding.name) == output.end()) { 159 | std::cout << "Cannot find " << binding.name << " from the input tensors!" 
<< std::endl; 160 | return Status::INFERENCE_ERROR; 161 | } 162 | } 163 | 164 | std::vector output_tensors = this->session->Run(Ort::RunOptions{nullptr}, input_names.data(), input_tensors.data(), input_names.size(), 165 | output_names.data(), output_names.size()); 166 | 167 | for (size_t i = 0; i < output_tensors.size(); ++i){ 168 | size_t element_size = TypeToSize(output_tensors[i].GetTensorTypeAndShapeInfo().GetElementType()); 169 | size_t count = output_tensors[i].GetTensorTypeAndShapeInfo().GetElementCount(); 170 | output[output_names[i]].resize(element_size * count); 171 | memcpy(output[output_names[i]].data(), output_tensors[i].GetTensorData(), element_size * count); 172 | output[output_names[i]].shape = output_tensors[i].GetTensorTypeAndShapeInfo().GetShape(); 173 | std::cout << "Shape of " << output_names[i] << ": ["; 174 | for (int64_t j : output[output_names[i]].shape) { 175 | std::cout << j << ","; 176 | } 177 | std::cout << "]" << std::endl; 178 | } 179 | return Status::SUCCESS; 180 | } -------------------------------------------------------------------------------- /framework/onnx/onnx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "framework/framework.h" 9 | 10 | int TypeToSize(const ONNXTensorElementDataType &dataType); 11 | 12 | class ONNXFramework : public BaseFramework { 13 | public: 14 | ONNXFramework() {} 15 | ~ONNXFramework(); 16 | Status Init(Config config) override; 17 | Status forward(const std::unordered_map &input, 18 | std::unordered_map &output) override; 19 | 20 | private: 21 | Ort::Env env{nullptr}; 22 | Ort::SessionOptions session_options{nullptr}; 23 | Ort::Session *session{nullptr}; 24 | std::vector temp_output_ptrs; 25 | }; -------------------------------------------------------------------------------- /framework/rknn/rknn.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/rknn/rknn.h" 2 | 3 | static void dump_tensor_attr(rknn_tensor_attr *attr) { 4 | printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " 5 | "zp=%d, scale=%f\n", 6 | attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], 7 | attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), 8 | get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); 9 | } 10 | 11 | static int read_data_from_file(const char *path, char **out_data) 12 | { 13 | FILE *fp = fopen(path, "rb"); 14 | if(fp == NULL) { 15 | printf("fopen %s fail!\n", path); 16 | return -1; 17 | } 18 | fseek(fp, 0, SEEK_END); 19 | int file_size = ftell(fp); 20 | char *data = (char *)malloc(file_size+1); 21 | data[file_size] = 0; 22 | fseek(fp, 0, SEEK_SET); 23 | if(file_size != fread(data, 1, file_size, fp)) { 24 | printf("fread %s fail!\n", path); 25 | free(data); 26 | fclose(fp); 27 | return -1; 28 | } 29 | if(fp) { 30 | fclose(fp); 31 | } 32 | *out_data = data; 33 | return file_size; 34 | } 35 | 36 | int TypeToSize(const rknn_tensor_type& dataType) { 37 | switch (dataType) { 38 | case RKNN_TENSOR_FLOAT32: 39 | return 4; 40 | case RKNN_TENSOR_FLOAT16: 41 | return 2; 42 | case RKNN_TENSOR_INT32: 43 | return 4; 44 | case RKNN_TENSOR_INT8: 45 | return 1; 46 | case RKNN_TENSOR_BOOL: 47 | return 1; 48 | case RKNN_TENSOR_INT64: 49 | return 8; 50 | default: 51 | std::cout << "Unknown data type " << dataType << std::endl; 52 | return 4; 53 
| } 54 | } 55 | 56 | Status RknnFramework::Init(Config config) { 57 | is_dynamic = config.is_dynamic; 58 | int ret; 59 | int model_len = 0; 60 | char *model; 61 | rknn_context ctx = 0; 62 | 63 | // Load RKNN Model 64 | model_len = read_data_from_file(config.model_path.c_str(), &model); 65 | if (model == NULL) 66 | { 67 | printf("load_model fail!\n"); 68 | return Status::INIT_ERROR; 69 | } 70 | 71 | ret = rknn_init(&ctx, model, model_len, 0, NULL); 72 | free(model); 73 | if (ret < 0) 74 | { 75 | printf("rknn_init fail! ret=%d\n", ret); 76 | return Status::INIT_ERROR; 77 | } 78 | 79 | // Get Model Input Output Number 80 | rknn_input_output_num io_num; 81 | ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); 82 | if (ret != RKNN_SUCC) 83 | { 84 | printf("rknn_query fail! ret=%d\n", ret); 85 | return Status::INIT_ERROR; 86 | } 87 | printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); 88 | 89 | // Get Model Input Info 90 | printf("input tensors:\n"); 91 | rknn_tensor_attr input_attrs[io_num.n_input]; 92 | memset(input_attrs, 0, sizeof(input_attrs)); 93 | for (int i = 0; i < io_num.n_input; i++) 94 | { 95 | input_attrs[i].index = i; 96 | ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); 97 | if (ret != RKNN_SUCC) 98 | { 99 | printf("rknn_query fail! ret=%d\n", ret); 100 | return Status::INIT_ERROR; 101 | } 102 | dump_tensor_attr(&(input_attrs[i])); 103 | Binding binding; 104 | binding.name = input_attrs[i].name; 105 | binding.size = input_attrs[i].n_elems; 106 | binding.dsize = TypeToSize(input_attrs[i].type); 107 | binding.dims = std::vector{input_attrs[i].dims[0], input_attrs[i].dims[1], input_attrs[i].dims[2], input_attrs[i].dims[3]}; 108 | input_bindings.push_back(binding); 109 | in_index_[binding.name] = i; 110 | if (!is_dynamic && config.input_len[binding.name] != binding.size) { 111 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 112 | << config.input_len[binding.name] << "!=" << binding.size << ")" << std::endl; 113 | return Status::INIT_ERROR; 114 | } 115 | } 116 | 117 | // Get Model Output Info 118 | printf("output tensors:\n"); 119 | rknn_tensor_attr output_attrs[io_num.n_output]; 120 | memset(output_attrs, 0, sizeof(output_attrs)); 121 | for (int i = 0; i < io_num.n_output; i++) 122 | { 123 | output_attrs[i].index = i; 124 | ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); 125 | if (ret != RKNN_SUCC) 126 | { 127 | printf("rknn_query fail! ret=%d\n", ret); 128 | return Status::INIT_ERROR; 129 | } 130 | dump_tensor_attr(&(output_attrs[i])); 131 | Binding binding; 132 | binding.name = output_attrs[i].name; 133 | binding.size = output_attrs[i].n_elems; 134 | binding.dsize = TypeToSize(output_attrs[i].type); 135 | binding.dims = std::vector{output_attrs[i].dims[0], output_attrs[i].dims[1], output_attrs[i].dims[2], output_attrs[i].dims[3]}; 136 | output_bindings.push_back(binding); 137 | out_index_[binding.name] = i; 138 | if (!is_dynamic && config.output_len[binding.name] != binding.size) { 139 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 140 | << config.output_len[binding.name] << "!=" << binding.size << ")" << std::endl; 141 | return Status::INIT_ERROR; 142 | } 143 | } 144 | 145 | // Set to context 146 | rknn_ctx = ctx; 147 | 148 | // if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) 149 | // { 150 | // is_quant_ = true; 151 | // } 152 | // else 153 | // { 154 | // is_quant_ = false; 155 | // } 156 | 157 | input_attrs_ = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); 158 | memcpy(input_attrs_, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); 159 | output_attrs_ = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); 160 | memcpy(output_attrs_, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); 161 | 162 | uint32_t model_channel, model_height, model_width; 163 | if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) 164 | { 165 | printf("model is NCHW input fmt\n"); 166 | model_channel = input_attrs[0].dims[1]; 167 | model_height = input_attrs[0].dims[2]; 168 | model_width = input_attrs[0].dims[3]; 169 | } 170 | else 171 | { 172 | printf("model is NHWC input fmt\n"); 173 | model_height = input_attrs[0].dims[1]; 174 | model_width = input_attrs[0].dims[2]; 175 | model_channel = input_attrs[0].dims[3]; 176 | } 177 | printf("model input height=%d, width=%d, channel=%d\n", 178 | model_height, model_width, model_channel); 179 | 180 | return Status::SUCCESS; 181 | } 182 | 183 | RknnFramework::~RknnFramework() { 184 | if (rknn_ctx != 0) 185 | { 186 | rknn_destroy(rknn_ctx); 187 | rknn_ctx = 0; 188 | } 189 | if (input_attrs_ != NULL) 190 | { 191 | free(input_attrs_); 192 | input_attrs_ = NULL; 193 | } 194 | if (output_attrs_ != NULL) 195 | { 196 | free(output_attrs_); 197 | output_attrs_ = NULL; 198 | } 199 | } 200 | 201 | Status RknnFramework::forward(const std::unordered_map &input, std::unordered_map &output) { 202 | rknn_input rknn_input_tensors[input.size()]; 203 | rknn_output rknn_output_tensors[output.size()]; 204 | memset(rknn_input_tensors, 0, sizeof(rknn_input_tensors)); 205 | memset(rknn_output_tensors, 0, sizeof(rknn_output_tensors)); 206 | 207 | int ret = 0; 208 | 209 | for (auto &kv : input) { 210 | size_t idx = in_index_[kv.first]; 211 | auto& binding = this->input_bindings[idx]; 212 | if (input.find(binding.name) == input.end()) { 213 | std::cout << "Cannot find " << binding.name << " from the input tensors!" << std::endl; 214 | return Status::INFERENCE_ERROR; 215 | } 216 | rknn_input_tensors[0].index = idx; 217 | rknn_input_tensors[0].type = RKNN_TENSOR_UINT8; 218 | rknn_input_tensors[0].fmt = RKNN_TENSOR_NHWC; 219 | rknn_input_tensors[0].size = binding.size * binding.dsize; 220 | rknn_input_tensors[0].buf = (void*)kv.second.data(); 221 | } 222 | 223 | ret = rknn_inputs_set(rknn_ctx, input_bindings.size(), rknn_input_tensors); 224 | if (ret < 0) 225 | { 226 | printf("rknn_input_set fail! ret=%d\n", ret); 227 | return Status::INFERENCE_ERROR; 228 | } 229 | 230 | ret = rknn_run(rknn_ctx, nullptr); 231 | if (ret < 0) 232 | { 233 | printf("rknn_run fail! ret=%d\n", ret); 234 | return Status::INFERENCE_ERROR; 235 | } 236 | 237 | memset(rknn_output_tensors, 0, sizeof(rknn_output_tensors)); 238 | for (int i = 0; i < output_bindings.size(); i++) 239 | { 240 | rknn_output_tensors[i].index = i; 241 | rknn_output_tensors[i].want_float = false; 242 | } 243 | ret = rknn_outputs_get(rknn_ctx, output_bindings.size(), rknn_output_tensors, NULL); 244 | if (ret < 0) 245 | { 246 | printf("rknn_outputs_get fail! 
ret=%d\n", ret); 247 | return Status::INFERENCE_ERROR; 248 | } 249 | 250 | for (auto &kv : output) { 251 | auto idx = out_index_[kv.first]; 252 | const auto& binding = this->output_bindings[idx]; 253 | kv.second.resize(binding.size); 254 | if (rknn_output_tensors[idx].size != binding.size) { 255 | return Status::INFERENCE_ERROR; 256 | } 257 | memcpy(kv.second.data(), rknn_output_tensors[idx].buf, kv.second.size()); 258 | kv.second.zp = output_attrs_[idx].zp; 259 | kv.second.scale = output_attrs_[idx].scale; 260 | } 261 | 262 | return Status::SUCCESS; 263 | } 264 | -------------------------------------------------------------------------------- /framework/rknn/rknn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "framework/framework.h" 9 | 10 | int TypeToSize(const rknn_tensor_type &dataType); 11 | 12 | class RknnFramework : public BaseFramework { 13 | public: 14 | RknnFramework() {} 15 | ~RknnFramework(); 16 | Status Init(Config config) override; 17 | Status forward(const std::unordered_map &input, 18 | std::unordered_map &output) override; 19 | 20 | private: 21 | rknn_context rknn_ctx; 22 | rknn_tensor_attr* input_attrs_; 23 | rknn_tensor_attr* output_attrs_; 24 | std::unordered_map in_index_; 25 | std::unordered_map out_index_; 26 | bool is_quant_; 27 | }; -------------------------------------------------------------------------------- /framework/tensorrt/tensorrt.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/tensorrt/tensorrt.h" 2 | 3 | void Logger::log(nvinfer1::ILogger::Severity severity, const char *msg) noexcept { 4 | if (severity > reportableSeverity) { 5 | return; 6 | } 7 | switch (severity) { 8 | case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: 9 | std::cerr << "INTERNAL_ERROR: "; 10 | break; 11 | case nvinfer1::ILogger::Severity::kERROR: 12 | std::cerr << "ERROR: "; 13 | break; 14 | case nvinfer1::ILogger::Severity::kWARNING: 15 | std::cerr << "WARNING: "; 16 | break; 17 | case nvinfer1::ILogger::Severity::kINFO: 18 | std::cerr << "INFO: "; 19 | break; 20 | default: 21 | std::cerr << "VERBOSE: "; 22 | break; 23 | } 24 | std::cerr << msg << std::endl; 25 | } 26 | 27 | int TypeToSize(const nvinfer1::DataType &dataType) { 28 | switch (dataType) { 29 | case nvinfer1::DataType::kFLOAT: 30 | return 4; 31 | case nvinfer1::DataType::kHALF: 32 | return 2; 33 | case nvinfer1::DataType::kINT32: 34 | return 4; 35 | case nvinfer1::DataType::kINT8: 36 | return 1; 37 | case nvinfer1::DataType::kBOOL: 38 | return 1; 39 | default: 40 | return 4; 41 | } 42 | } 43 | 44 | Status TensorRTFramework::Init(Config config) { 45 | // 读取模型文件 46 | std::ifstream file(config.model_path, std::ios::binary); 47 | assert(file.good()); 48 | file.seekg(0, std::ios::end); 49 | auto size = file.tellg(); 50 | file.seekg(0, std::ios::beg); 51 | char *trtModelStream = new char[size]; 52 | assert(trtModelStream); 53 | file.read(trtModelStream, size); 54 | file.close(); 55 | 56 | // 加载插件 57 | initLibNvInferPlugins(&this->gLogger, ""); 58 | 59 | // 创建IRuntime对象 60 | this->runtime = nvinfer1::createInferRuntime(this->gLogger); 61 | assert(this->runtime != nullptr); 62 | 63 | // 反序列化engine文件,创建ICudaEngine对象 64 | this->engine = this->runtime->deserializeCudaEngine(trtModelStream, size); 65 | assert(this->engine != nullptr); 66 | delete[] trtModelStream; 67 | 68 | // 初始化IExecutionContext对象 69 | this->context = 
this->engine->createExecutionContext(); 70 | assert(this->context != nullptr); 71 | 72 | // Create the CUDA stream 73 | cudaStreamCreate(&this->stream); 74 | 75 | this->is_dynamic = config.is_dynamic; 76 | 77 | this->num_bindings = this->engine->getNbIOTensors(); 78 | for (int i = 0; i < this->num_bindings; ++i) 79 | { 80 | Binding binding; 81 | nvinfer1::Dims dims; 82 | std::string name = this->engine->getIOTensorName(i); 83 | nvinfer1::DataType dtype = this->engine->getTensorDataType(name.c_str()); 84 | binding.name = name; 85 | binding.dsize = TypeToSize(dtype); 86 | 87 | nvinfer1::TensorIOMode io_mode = engine->getTensorIOMode(name.c_str()); 88 | if (io_mode == nvinfer1::TensorIOMode::kINPUT) 89 | { 90 | in_index_[name] = this->num_inputs; 91 | this->num_inputs += 1; 92 | dims = this->engine->getProfileShape(name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); 93 | binding.size = 1; 94 | std::cout << binding.name << ": ["; 95 | for (int i = 0; i < dims.nbDims; i++) 96 | { 97 | std::cout << dims.d[i] << ","; 98 | binding.size *= dims.d[i]; 99 | binding.dims.push_back(dims.d[i]); 100 | } 101 | std::cout << "]" << std::endl; 102 | if (!is_dynamic && config.input_len[binding.name] != binding.size) { 103 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 104 | << config.input_len[binding.name] << "!=" << binding.size << ")" << std::endl; 105 | return Status::INIT_ERROR; 106 | } 107 | this->input_bindings.push_back(binding); 108 | // set max opt shape 109 | this->context->setInputShape(name.c_str(), dims); 110 | std::cout << "Input bind name: " << name << std::endl; 111 | } 112 | else if (io_mode == nvinfer1::TensorIOMode::kOUTPUT) 113 | { 114 | out_index_[name] = this->num_outputs; 115 | dims = this->context->getTensorShape(name.c_str()); 116 | binding.size = 1; 117 | std::cout << binding.name << ": ["; 118 | for (int i = 0; i < dims.nbDims; i++) 119 | { 120 | std::cout << dims.d[i] << ","; 121 | binding.size *= dims.d[i]; 122 | binding.dims.push_back(dims.d[i]); 123 | } 124 | std::cout << "]" << std::endl; 125 | if (!is_dynamic && config.output_len[binding.name] != binding.size) { 126 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 127 | << config.output_len[binding.name] << "!=" << binding.size << ")" << std::endl; 128 | return Status::INIT_ERROR; 129 | } 130 | if (is_dynamic) { 131 | binding.size = config.output_len[binding.name]; 132 | } 133 | this->output_bindings.push_back(binding); 134 | this->num_outputs += 1; 135 | std::cout << "Output bind name: " << name << std::endl; 136 | } 137 | } 138 | make_pipe(true); 139 | return Status::SUCCESS; 140 | } 141 | 142 | TensorRTFramework::~TensorRTFramework() { 143 | delete this->context; 144 | delete this->engine; 145 | delete this->runtime; 146 | cudaStreamDestroy(this->stream); 147 | for (auto &ptr : this->device_ptrs) 148 | { 149 | CHECK(cudaFree(ptr)); 150 | } 151 | 152 | for (auto &ptr : this->host_ptrs) 153 | { 154 | CHECK(cudaFreeHost(ptr)); 155 | } 156 | } 157 | 158 | void TensorRTFramework::make_pipe(bool warmup) { 159 | for (auto &bindings : this->input_bindings) 160 | { 161 | void *d_ptr; 162 | CHECK(cudaMalloc(&d_ptr, bindings.size * bindings.dsize)); 163 | this->device_ptrs.push_back(d_ptr); 164 | this->context->setTensorAddress(bindings.name.c_str(), d_ptr); 165 | } 166 | 167 | for (auto &bindings : this->output_bindings) 168 | { 169 | void *d_ptr, *h_ptr; 170 | size_t size = bindings.size * bindings.dsize; 171 | CHECK(cudaMalloc(&d_ptr, size)); 172 | CHECK(cudaHostAlloc(&h_ptr, size, 0)); 173 | this->device_ptrs.push_back(d_ptr); 174 | this->host_ptrs.push_back(h_ptr); 175 | this->context->setTensorAddress(bindings.name.c_str(), d_ptr); 176 | } 177 | 178 | if (warmup) 179 | { 180 | for (int i = 0; i < 10; i++) 181 | { 182 | for (auto &bindings : this->input_bindings) 183 | { 184 | size_t size = bindings.size * bindings.dsize; 185 | void *h_ptr = malloc(size); 186 | memset(h_ptr, 0, size); 187 | CHECK(cudaMemcpyAsync(this->device_ptrs[0], h_ptr, size, cudaMemcpyHostToDevice, this->stream)); 188 | free(h_ptr); 189 | } 190 | this->infer(); 191 | } 192 | printf("model warmup 10 times\n"); 193 | } 194 | } 195 | 196 | bool TensorRTFramework::set_input(const std::unordered_map &input) { 197 | for (auto &kv : input) { 198 | size_t idx = in_index_[kv.first]; 199 | auto& binding = this->input_bindings[idx]; 200 | if (input.find(binding.name) == input.end()) { 201 | std::cout << "Cannot find " << binding.name << " from the input tensors!" 
<< std::endl; 202 | return false; 203 | } 204 | if (is_dynamic) { 205 | std::vector shape = input.at(binding.name).shape; 206 | nvinfer1::Dims dim; 207 | dim.nbDims = shape.size(); 208 | for (size_t i = 0; i < dim.nbDims; i++) { 209 | dim.d[i] = shape[i]; 210 | } 211 | context->setInputShape(binding.name.c_str(), dim); 212 | } 213 | CHECK(cudaMemcpyAsync( 214 | this->device_ptrs[idx], kv.second.data(), kv.second.size(), cudaMemcpyHostToDevice, this->stream)); 215 | } 216 | return true; 217 | } 218 | 219 | bool TensorRTFramework::infer() { 220 | this->context->enqueueV3(this->stream); 221 | for (int i = 0; i < this->num_outputs; i++) 222 | { 223 | size_t osize = this->output_bindings[i].size * this->output_bindings[i].dsize; 224 | CHECK(cudaMemcpyAsync( 225 | this->host_ptrs[i], this->device_ptrs[i + this->num_inputs], osize, cudaMemcpyDeviceToHost, this->stream)); 226 | } 227 | cudaStreamSynchronize(this->stream); 228 | return true; 229 | } 230 | 231 | Status TensorRTFramework::forward(const std::unordered_map &input, 232 | std::unordered_map &output) { 233 | if (!this->set_input(input)) { 234 | return Status::INFERENCE_ERROR; 235 | } 236 | if (!this->infer()) { 237 | return Status::INFERENCE_ERROR; 238 | } 239 | for (auto &kv : output) { 240 | auto cur_idx = out_index_[kv.first]; 241 | const auto& binding = this->output_bindings[cur_idx]; 242 | memcpy(kv.second.data(), this->host_ptrs[cur_idx], kv.second.size()); 243 | } 244 | return Status::SUCCESS; 245 | } -------------------------------------------------------------------------------- /framework/tensorrt/tensorrt.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "NvInferPlugin.h" 4 | #include 5 | #include "framework/framework.h" 6 | #include "common/common.h" 7 | 8 | #define CHECK(call) \ 9 | do \ 10 | { \ 11 | const cudaError_t error_code = call; \ 12 | if (error_code != cudaSuccess) \ 13 | { \ 14 | printf("CUDA Error:\n"); \ 15 | printf(" File: %s\n", __FILE__); \ 16 | printf(" Line: %d\n", __LINE__); \ 17 | printf(" Error code: %d\n", error_code); \ 18 | printf(" Error text: %s\n", cudaGetErrorString(error_code)); \ 19 | exit(1); \ 20 | } \ 21 | } while (0) 22 | 23 | class Logger : public nvinfer1::ILogger 24 | { 25 | public: 26 | nvinfer1::ILogger::Severity reportableSeverity; 27 | 28 | explicit Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO) : reportableSeverity(severity) 29 | { 30 | } 31 | 32 | void log(nvinfer1::ILogger::Severity severity, const char *msg) noexcept override; 33 | }; 34 | 35 | int TypeToSize(const nvinfer1::DataType &dataType); 36 | 37 | class TensorRTFramework: public BaseFramework 38 | { 39 | public: 40 | explicit TensorRTFramework() {} 41 | virtual ~TensorRTFramework(); 42 | Status Init(Config config) override; 43 | Status forward(const std::unordered_map &input, 44 | std::unordered_map &output) override; 45 | 46 | private: 47 | void make_pipe(bool warmup = true); 48 | bool set_input(const std::unordered_map &input); 49 | bool infer(); 50 | 51 | nvinfer1::ICudaEngine *engine = nullptr; 52 | nvinfer1::IRuntime *runtime = nullptr; 53 | nvinfer1::IExecutionContext *context = nullptr; 54 | cudaStream_t stream = nullptr; 55 | Logger gLogger{nvinfer1::ILogger::Severity::kERROR}; 56 | int num_bindings; 57 | int num_inputs = 0; 58 | int num_outputs = 0; 59 | std::vector host_ptrs; 60 | std::vector device_ptrs; 61 | std::unordered_map in_index_; 62 | std::unordered_map out_index_; 63 | 64 | PreParam pparam; 65 | }; 
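Editor's note: the ONNX Runtime, RKNN and TensorRT backends all implement the same BaseFramework contract declared in framework/framework.h: fill in a Config (model path, per-tensor input/output lengths, dynamic-shape flag), call Init(), then call forward() with name-keyed IOTensor maps. The fragment below is a minimal stand-alone sketch of that calling convention and is not part of the repository sources; the engine path is a placeholder, and the "images"/"output" tensor names and sizes simply mirror the OcrModel defaults (1x32x100 single-channel input, output length 26). Real callers normally go through the Model subclasses, which read this wiring from the YAML configs.

// Hypothetical driver for the framework layer (illustration only).
#include "framework/tensorrt/tensorrt.h"
#include <unordered_map>

int main() {
    Config config;
    config.model_path = "weights/crnn.engine";   // placeholder path to a serialized TensorRT engine
    config.is_dynamic = false;
    config.input_len["images"] = 1 * 32 * 100;   // element count expected by the "images" binding (must match the engine)
    config.output_len["output"] = 26;            // element count produced by the "output" binding

    TensorRTFramework framework;
    if (framework.Init(config) != Status::SUCCESS) return 1;

    std::unordered_map<std::string, IOTensor> input, output;
    input["images"].resize(config.input_len["images"] * sizeof(float));   // normally filled by a model's preprocess()
    output["output"].resize(config.output_len["output"] * sizeof(float)); // forward() copies device results into this buffer
    return framework.forward(input, output) == Status::SUCCESS ? 0 : 1;
}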
-------------------------------------------------------------------------------- /model/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(yaml-cpp) 2 | 3 | add_library(base_model SHARED ${CMAKE_CURRENT_SOURCE_DIR}/base/model.cpp) 4 | target_include_directories(base_model PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 5 | target_link_libraries(base_model framework_onnx) 6 | if(USE_TENSORRT) 7 | target_link_libraries(base_model framework_trt) 8 | endif() 9 | if(USE_RKNN) 10 | target_link_libraries(base_model framework_rknn) 11 | endif() 12 | 13 | add_library(yolo_seg SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_seg.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 14 | target_include_directories(yolo_seg PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 15 | target_link_libraries(yolo_seg base_model yaml-cpp) 16 | target_link_libraries(yolo_seg ${OpenCV_LIBS}) 17 | 18 | add_library(yolo_seg_cutoff SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_seg_cutoff.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 19 | target_include_directories(yolo_seg_cutoff PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 20 | target_link_libraries(yolo_seg_cutoff base_model yaml-cpp) 21 | target_link_libraries(yolo_seg_cutoff ${OpenCV_LIBS}) 22 | 23 | add_library(yolo_pose SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_pose.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 24 | target_include_directories(yolo_pose PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 25 | target_link_libraries(yolo_pose base_model yaml-cpp) 26 | target_link_libraries(yolo_pose ${OpenCV_LIBS}) 27 | 28 | add_library(yolo_det SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 29 | target_include_directories(yolo_det PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 30 | target_link_libraries(yolo_det base_model yaml-cpp) 31 | target_link_libraries(yolo_det ${OpenCV_LIBS}) 32 | 33 | add_library(yolo_det_cutoff SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_cutoff.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 34 | target_include_directories(yolo_det_cutoff PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 35 | target_link_libraries(yolo_det_cutoff base_model yaml-cpp) 36 | target_link_libraries(yolo_det_cutoff ${OpenCV_LIBS}) 37 | 38 | add_library(ctc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/ctc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/ocr_model.cpp) 39 | target_include_directories(ctc PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 40 | target_link_libraries(ctc base_model yaml-cpp) 41 | target_link_libraries(ctc ${OpenCV_LIBS}) 42 | 43 | add_library(attn SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/attention.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/ocr_model.cpp) 44 | target_include_directories(attn PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 45 | target_link_libraries(attn base_model yaml-cpp) 46 | target_link_libraries(attn ${OpenCV_LIBS}) 47 | 48 | add_library(dbnet SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/dbnet.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 49 | target_include_directories(dbnet PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 50 | target_link_libraries(dbnet base_model yaml-cpp) 51 | target_link_libraries(dbnet ${OpenCV_LIBS} ${Clipper_LIBS}) 52 | 53 | add_library(sam SHARED ${CMAKE_CURRENT_SOURCE_DIR}/sam/image_encoder.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/sam/mask_decoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sam/sam.cpp) 54 | target_include_directories(sam PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 55 | target_link_libraries(sam base_model yaml-cpp) 56 | target_link_libraries(sam ${OpenCV_LIBS}) 57 | 58 | add_subdirectory(clip) -------------------------------------------------------------------------------- /model/base/detection_model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/detection_model.h" 2 | #include "opencv2/opencv.hpp" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | void ReadClassNames(std::string file_name, std::vector &class_names) 9 | { 10 | std::ifstream in_file; 11 | in_file.open(file_name, std::ios::in); 12 | assert(in_file.good()); 13 | 14 | std::string name; 15 | while (getline(in_file, name, '\n')) 16 | { 17 | class_names.push_back(name); 18 | } 19 | in_file.close(); 20 | } 21 | 22 | void DrawObjects(const cv::Mat &image, 23 | cv::Mat &res, 24 | const std::vector &objs, 25 | const std::vector &CLASS_NAMES, 26 | const std::vector> &COLORS) 27 | { 28 | res = image.clone(); 29 | for (auto &obj : objs) 30 | { 31 | cv::Scalar color = cv::Scalar(COLORS[obj.label][0], COLORS[obj.label][1], COLORS[obj.label][2]); 32 | cv::rectangle(res, obj.rect, color, 2); 33 | 34 | char text[256]; 35 | sprintf(text, "%s %.1f%%", CLASS_NAMES[obj.label].c_str(), obj.prob * 100); 36 | 37 | int baseLine = 0; 38 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 39 | 40 | int x = (int)obj.rect.x; 41 | int y = (int)obj.rect.y + 1; 42 | 43 | if (y > res.rows) 44 | y = res.rows; 45 | 46 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 47 | 48 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 49 | } 50 | } 51 | 52 | void DrawObjectsMasks(const cv::Mat &image, 53 | cv::Mat &res, 54 | const std::vector &objs, 55 | const std::vector &CLASS_NAMES, 56 | const std::vector> &COLORS, 57 | const std::vector> &MASK_COLORS) 58 | { 59 | res = image.clone(); 60 | cv::Mat mask = image.clone(); 61 | for (auto &obj : objs) 62 | { 63 | int idx = obj.label; 64 | cv::Scalar color = cv::Scalar(COLORS[idx][0], COLORS[idx][1], COLORS[idx][2]); 65 | cv::Scalar mask_color = 66 | cv::Scalar(MASK_COLORS[idx % 20][0], MASK_COLORS[idx % 20][1], MASK_COLORS[idx % 20][2]); 67 | cv::rectangle(res, obj.rect, color, 2); 68 | 69 | char text[256]; 70 | sprintf(text, "%s %.1f%%", CLASS_NAMES[idx].c_str(), obj.prob * 100); 71 | mask(obj.rect).setTo(mask_color, obj.boxMask); 72 | 73 | int baseLine = 0; 74 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 75 | 76 | int x = (int)obj.rect.x; 77 | int y = (int)obj.rect.y + 1; 78 | 79 | if (y > res.rows) 80 | y = res.rows; 81 | 82 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 83 | 84 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 85 | } 86 | cv::addWeighted(res, 0.5, mask, 0.8, 1, res); 87 | } 88 | 89 | void DrawObjectsKps(const cv::Mat& image, 90 | cv::Mat& res, 91 | const std::vector& objs, 92 | const std::vector>& SKELETON, 93 | const std::vector>& KPS_COLORS, 94 | const std::vector>& LIMB_COLORS) 95 | { 96 | res = image.clone(); 97 | const int num_point = 17; 98 | for (auto& obj : objs) { 99 | 
cv::rectangle(res, obj.rect, {0, 0, 255}, 2); 100 | 101 | char text[256]; 102 | sprintf(text, "person %.1f%%", obj.prob * 100); 103 | 104 | int baseLine = 0; 105 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 106 | 107 | int x = (int)obj.rect.x; 108 | int y = (int)obj.rect.y + 1; 109 | 110 | if (y > res.rows) 111 | y = res.rows; 112 | 113 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 114 | 115 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 116 | 117 | auto& kps = obj.kps; 118 | for (int k = 0; k < num_point + 2; k++) { 119 | if (k < num_point) { 120 | int kps_x = std::round(kps[k * 3]); 121 | int kps_y = std::round(kps[k * 3 + 1]); 122 | float kps_s = kps[k * 3 + 2]; 123 | if (kps_s > 0.5f) { 124 | cv::Scalar kps_color = cv::Scalar(KPS_COLORS[k][0], KPS_COLORS[k][1], KPS_COLORS[k][2]); 125 | cv::circle(res, {kps_x, kps_y}, 5, kps_color, -1); 126 | } 127 | } 128 | auto& ske = SKELETON[k]; 129 | int pos1_x = std::round(kps[(ske[0] - 1) * 3]); 130 | int pos1_y = std::round(kps[(ske[0] - 1) * 3 + 1]); 131 | 132 | int pos2_x = std::round(kps[(ske[1] - 1) * 3]); 133 | int pos2_y = std::round(kps[(ske[1] - 1) * 3 + 1]); 134 | 135 | float pos1_s = kps[(ske[0] - 1) * 3 + 2]; 136 | float pos2_s = kps[(ske[1] - 1) * 3 + 2]; 137 | 138 | if (pos1_s > 0.5f && pos2_s > 0.5f) { 139 | cv::Scalar limb_color = cv::Scalar(LIMB_COLORS[k][0], LIMB_COLORS[k][1], LIMB_COLORS[k][2]); 140 | cv::line(res, {pos1_x, pos1_y}, {pos2_x, pos2_y}, limb_color, 2); 141 | } 142 | } 143 | } 144 | } 145 | 146 | void DrawBoxes(const cv::Mat &image, 147 | cv::Mat &res, 148 | const std::vector &objs) { 149 | res = image.clone(); 150 | for (auto &obj : objs) 151 | { 152 | cv::Scalar color = cv::Scalar(COLORS[obj.label][0], COLORS[obj.label][1], COLORS[obj.label][2]); 153 | cv::rectangle(res, obj.rect, color, 2); 154 | } 155 | } 156 | 157 | float Iou(cv::Rect bb_test, cv::Rect bb_gt) 158 | { 159 | float in = (bb_test & bb_gt).area(); 160 | float un = bb_test.area() + bb_gt.area() - in; 161 | 162 | if (un < DBL_EPSILON) 163 | return 0; 164 | 165 | return in / un; 166 | } 167 | 168 | void Nms(std::vector &res, float nms_thresh) 169 | { 170 | std::map> m; 171 | for (const auto &obj : res) 172 | { 173 | if (m.count(obj.label) == 0) 174 | { 175 | m.emplace(obj.label, std::vector()); 176 | } 177 | m[obj.label].push_back(obj); 178 | } 179 | auto cmp = [](const Object &a, const Object &b) 180 | { 181 | return a.prob > b.prob; 182 | }; 183 | res.clear(); 184 | for (auto it = m.begin(); it != m.end(); it++) 185 | { 186 | auto &dets = it->second; 187 | std::sort(dets.begin(), dets.end(), cmp); 188 | for (size_t m = 0; m < dets.size(); ++m) 189 | { 190 | auto &item = dets[m]; 191 | res.push_back(item); 192 | for (size_t n = m + 1; n < dets.size(); ++n) 193 | { 194 | if (Iou(item.rect, dets[n].rect) > nms_thresh) 195 | { 196 | dets.erase(dets.begin() + n); 197 | --n; 198 | } 199 | } 200 | } 201 | } 202 | } -------------------------------------------------------------------------------- /model/base/detection_model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "common/common.h" 4 | 5 | const std::vector> COLORS = { 6 | {0, 114, 189}, {217, 83, 25}, {237, 177, 32}, {126, 47, 142}, {119, 172, 48}, {77, 190, 238}, {162, 20, 47}, {76, 76, 76}, {153, 153, 153}, {255, 0, 0}, {255, 128, 
0}, {191, 191, 0}, {0, 255, 0}, {0, 0, 255}, {170, 0, 255}, {85, 85, 0}, {85, 170, 0}, {85, 255, 0}, {170, 85, 0}, {170, 170, 0}, {170, 255, 0}, {255, 85, 0}, {255, 170, 0}, {255, 255, 0}, {0, 85, 128}, {0, 170, 128}, {0, 255, 128}, {85, 0, 128}, {85, 85, 128}, {85, 170, 128}, {85, 255, 128}, {170, 0, 128}, {170, 85, 128}, {170, 170, 128}, {170, 255, 128}, {255, 0, 128}, {255, 85, 128}, {255, 170, 128}, {255, 255, 128}, {0, 85, 255}, {0, 170, 255}, {0, 255, 255}, {85, 0, 255}, {85, 85, 255}, {85, 170, 255}, {85, 255, 255}, {170, 0, 255}, {170, 85, 255}, {170, 170, 255}, {170, 255, 255}, {255, 0, 255}, {255, 85, 255}, {255, 170, 255}, {85, 0, 0}, {128, 0, 0}, {170, 0, 0}, {212, 0, 0}, {255, 0, 0}, {0, 43, 0}, {0, 85, 0}, {0, 128, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, {0, 0, 43}, {0, 0, 85}, {0, 0, 128}, {0, 0, 170}, {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36}, {73, 73, 73}, {109, 109, 109}, {146, 146, 146}, {182, 182, 182}, {219, 219, 219}, {0, 114, 189}, {80, 183, 189}, {128, 128, 0}}; 7 | 8 | const std::vector> MASK_COLORS = { 9 | {255, 56, 56}, {255, 157, 151}, {255, 112, 31}, {255, 178, 29}, {207, 210, 49}, {72, 249, 10}, {146, 204, 23}, {61, 219, 134}, {26, 147, 52}, {0, 212, 187}, {44, 153, 168}, {0, 194, 255}, {52, 69, 147}, {100, 115, 255}, {0, 24, 236}, {132, 56, 255}, {82, 0, 133}, {203, 56, 255}, {255, 149, 200}, {255, 55, 199}}; 10 | 11 | const std::vector> KPS_COLORS = {{0, 255, 0}, 12 | {0, 255, 0}, 13 | {0, 255, 0}, 14 | {0, 255, 0}, 15 | {0, 255, 0}, 16 | {255, 128, 0}, 17 | {255, 128, 0}, 18 | {255, 128, 0}, 19 | {255, 128, 0}, 20 | {255, 128, 0}, 21 | {255, 128, 0}, 22 | {51, 153, 255}, 23 | {51, 153, 255}, 24 | {51, 153, 255}, 25 | {51, 153, 255}, 26 | {51, 153, 255}, 27 | {51, 153, 255}}; 28 | 29 | const std::vector> SKELETON = {{16, 14}, 30 | {14, 12}, 31 | {17, 15}, 32 | {15, 13}, 33 | {12, 13}, 34 | {6, 12}, 35 | {7, 13}, 36 | {6, 7}, 37 | {6, 8}, 38 | {7, 9}, 39 | {8, 10}, 40 | {9, 11}, 41 | {2, 3}, 42 | {1, 2}, 43 | {1, 3}, 44 | {2, 4}, 45 | {3, 5}, 46 | {4, 6}, 47 | {5, 7}}; 48 | 49 | const std::vector> LIMB_COLORS = {{51, 153, 255}, 50 | {51, 153, 255}, 51 | {51, 153, 255}, 52 | {51, 153, 255}, 53 | {255, 51, 255}, 54 | {255, 51, 255}, 55 | {255, 51, 255}, 56 | {255, 128, 0}, 57 | {255, 128, 0}, 58 | {255, 128, 0}, 59 | {255, 128, 0}, 60 | {255, 128, 0}, 61 | {0, 255, 0}, 62 | {0, 255, 0}, 63 | {0, 255, 0}, 64 | {0, 255, 0}, 65 | {0, 255, 0}, 66 | {0, 255, 0}, 67 | {0, 255, 0}}; 68 | 69 | void ReadClassNames(std::string file_name, std::vector &class_names); 70 | 71 | struct Object 72 | { 73 | cv::Rect_ rect; 74 | int label = 0; 75 | float prob = 0.0; 76 | cv::Mat boxMask; 77 | std::vector kps; 78 | }; 79 | 80 | void DrawObjects(const cv::Mat &image, 81 | cv::Mat &res, 82 | const std::vector &objs, 83 | const std::vector &CLASS_NAMES, 84 | const std::vector> &COLORS); 85 | 86 | void DrawObjectsMasks(const cv::Mat &image, 87 | cv::Mat &res, 88 | const std::vector &objs, 89 | const std::vector &CLASS_NAMES, 90 | const std::vector> &COLORS, 91 | const std::vector> &MASK_COLORS); 92 | 93 | void DrawObjectsKps(const cv::Mat& image, 94 | cv::Mat& res, 95 | const std::vector& objs, 96 | const std::vector>& SKELETON, 97 | const std::vector>& KPS_COLORS, 98 | const std::vector>& LIMB_COLORS); 99 | 100 | void DrawBoxes(const cv::Mat &image, 101 | cv::Mat &res, 102 | const std::vector &objs); 103 | 104 | float Iou(cv::Rect bb_test, cv::Rect bb_gt); 105 | 106 | void Nms(std::vector &res, float nms_thresh); 107 | 108 | class DetectionModel : public Model 109 | { 110 
| public: 111 | explicit DetectionModel() {}; 112 | virtual ~DetectionModel() {}; 113 | virtual void detect(const cv::Mat &image, std::vector &objs) = 0; 114 | protected: 115 | virtual void preprocess(const cv::Mat &input_image, cv::Mat &output_image) = 0; 116 | virtual void postprocess(const std::unordered_map &output, std::vector &objs) = 0; 117 | }; -------------------------------------------------------------------------------- /model/base/model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/model.h" 2 | 3 | bool Model::Init(const std::string &model_path, const std::string &framework_type) { 4 | config_.model_path = model_path; 5 | if (framework_type == "TensorRT") 6 | { 7 | #ifdef USE_TENSORRT 8 | framework_ = std::make_shared(); 9 | #else 10 | std::cout << "Framework " << framework_type << " not implemented" <(); 17 | } 18 | else if (framework_type == "RKNN") 19 | { 20 | #ifdef USE_RKNN 21 | framework_ = std::make_shared(); 22 | #else 23 | std::cout << "Framework " << framework_type << " not implemented" < framework_; 24 | }; -------------------------------------------------------------------------------- /model/base/ocr_model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/ocr_model.h" 2 | #include 3 | 4 | OcrModel::OcrModel(const std::string &yaml_file) { 5 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 6 | 7 | std::string model_path = yaml_node["model_path"].as(); 8 | 9 | std::string framework_type = yaml_node["framework"].as(); 10 | if (!Init(model_path, framework_type)) exit(0); 11 | 12 | std::vector input_size = yaml_node["input_size"].as>(); 13 | m_input_size_.width = input_size.at(0); 14 | m_input_size_.height = input_size.at(1); 15 | m_input_channel_ = yaml_node["input_channel"].as(); 16 | 17 | m_output_length_ = yaml_node["output_size"].as(); 18 | 19 | alphabet_ = yaml_node["alphabet"].as(); 20 | 21 | config_.input_len["images"] = m_input_size_.height * m_input_size_.width * m_input_channel_; 22 | config_.output_len["output"] = m_output_length_; 23 | config_.is_dynamic = false; 24 | Status status = framework_->Init(config_); 25 | if (status != Status::SUCCESS) { 26 | std::cout << "Failed to init framework" << std::endl; 27 | exit(0); 28 | } 29 | } 30 | 31 | OcrModel::~OcrModel() { std::cout << "Destruct ocr model" << std::endl; } -------------------------------------------------------------------------------- /model/base/ocr_model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "common/common.h" 4 | 5 | class OcrModel : public Model 6 | { 7 | public: 8 | OcrModel() = delete; 9 | explicit OcrModel(const std::string &yaml_file); 10 | virtual ~OcrModel(); 11 | virtual std::string detect(const cv::Mat &image) = 0; 12 | protected: 13 | virtual std::string postprocess(const std::unordered_map &output) = 0; 14 | cv::Size m_input_size_ = {32, 100}; 15 | size_t m_input_channel_ = 1; 16 | size_t m_output_length_ = 26; 17 | std::string alphabet_; 18 | }; -------------------------------------------------------------------------------- /model/clip/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(ZLIB REQUIRED) 2 | 3 | add_library(clip SHARED ${CMAKE_CURRENT_SOURCE_DIR}/text_tokenizer.cpp 4 | ${CMAKE_CURRENT_SOURCE_DIR}/image_encoder.cpp 5 | ${CMAKE_CURRENT_SOURCE_DIR}/text_encoder.cpp 6 | 
${CMAKE_CURRENT_SOURCE_DIR}/clip.cpp) 7 | target_include_directories(clip PUBLIC 8 | ${CMAKE_SOURCE_DIR} 9 | ${ICU_INCLUDE_DIRS} 10 | ${INCLUDE_DIRS} 11 | ${PROJECT_BINARY_DIR}) 12 | target_link_libraries(clip base_model yaml-cpp) 13 | target_link_libraries(clip ${ZLIB_LIBRARIES} ${ICU_LIBRARIES} ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /model/clip/clip.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/clip.h" 2 | #include 3 | 4 | using namespace clip; 5 | 6 | static void normalize(IOTensor& tensor, size_t size) { 7 | float *ptr = (float*)tensor.data(); 8 | for (size_t i = 0; i < size; i++) { 9 | float norm = 0.0; 10 | for (size_t j = 0; j < 512; j++) { 11 | norm += std::pow(*(ptr+j), 2); 12 | } 13 | norm = std::sqrt(norm); 14 | 15 | for (size_t j = 0; j < 512; j++) { 16 | *ptr = *ptr / norm; 17 | ++ptr; 18 | } 19 | } 20 | } 21 | 22 | static void ReadPrompt(const std::string& prompt_path, std::vector& prompts) { 23 | std::ifstream file(prompt_path); 24 | 25 | if (file.is_open()) { 26 | std::string line; 27 | while (std::getline(file, line)) { 28 | prompts.push_back(line); // read the file line by line and append each line to the vector 29 | } 30 | file.close(); // close the file 31 | } else { 32 | std::cout << "Failed to open the prompt file" << std::endl; 33 | } 34 | } 35 | 36 | static void ReadTextEmbedding(const std::string& path, std::vector& text_embeddings) { 37 | std::streampos size; 38 | std::ifstream fin(path.c_str(), std::ios::binary | std::ios::in); 39 | fin.seekg(0, std::ios::end); 40 | size = fin.tellg(); 41 | text_embeddings.resize(size/sizeof(float)); 42 | fin.seekg(0, std::ios::beg); 43 | fin.read((char *)text_embeddings.data(), size); 44 | fin.close(); 45 | } 46 | 47 | Clip::Clip(const std::string& image_encoder_cfg, const std::string& text_encoder_cfg) { 48 | m_image_encoder_ = std::make_shared(image_encoder_cfg); 49 | 50 | YAML::Node yaml_node = YAML::LoadFile(text_encoder_cfg); 51 | bool online = yaml_node["online"].as(); 52 | if (online) { 53 | m_text_encoder_ = std::make_shared(text_encoder_cfg); 54 | } 55 | 56 | std::string prompt_path = yaml_node["prompts"].as(); 57 | std::vector prompts; 58 | ReadPrompt(prompt_path, prompts); 59 | 60 | std::string text_embedding_path = yaml_node["text_embedding"].as(); 61 | std::vector embeddings; 62 | ReadTextEmbedding(text_embedding_path, embeddings); 63 | 64 | float* ptr = embeddings.data(); 65 | for (size_t i = 0; i < prompts.size(); i++) { 66 | cache_[prompts[i]] = std::vector(512, 0.0); 67 | for (size_t j = 0; j < 512; j++) { 68 | cache_[prompts[i]][j] = *ptr++; 69 | } 70 | } 71 | } 72 | 73 | void Clip::encodeImages(const std::vector& images) { 74 | m_image_encoder_->forward(images, image_embeddings); 75 | normalize(image_embeddings, images.size()); 76 | } 77 | 78 | void Clip::encodeTexts(const std::vector& texts) { 79 | std::vector texts_not_in_cache; 80 | for (const auto& text: texts) { 81 | if (!cache_.count(text)) { 82 | texts_not_in_cache.push_back(text); 83 | } 84 | } 85 | 86 | float* ptr; 87 | 88 | if (!texts_not_in_cache.empty()) { 89 | if (!m_text_encoder_) { 90 | std::cout << "The text encoder is offline. 
Failed to generate text embeddings for text out of prompt list" << std::endl; 91 | exit(0); 92 | } 93 | IOTensor embeddings; 94 | m_text_encoder_->forward(texts_not_in_cache, embeddings); 95 | ptr = (float*)embeddings.data(); 96 | for (size_t i = 0; i < texts_not_in_cache.size(); i++) { 97 | cache_[texts_not_in_cache[i]] = std::vector(512, 0.0); 98 | for (size_t j = 0; j < 512; j++) { 99 | cache_[texts_not_in_cache[i]][j] = *ptr++; 100 | } 101 | } 102 | } 103 | 104 | text_embeddings.resize(texts.size() * 512 * sizeof(float)); 105 | text_embeddings.shape = std::vector{static_cast(texts.size()), 512}; 106 | text_embeddings.data_type = DataType::FP32; 107 | 108 | ptr = (float*)text_embeddings.data(); 109 | for (const auto& text : texts) { 110 | memcpy(ptr, cache_[text].data(), 512 * sizeof(float)); 111 | ptr += 512; 112 | } 113 | 114 | normalize(text_embeddings, texts.size()); 115 | } 116 | 117 | std::vector> Clip::computeProbabilities() { 118 | size_t num_images = image_embeddings.shape[0]; 119 | size_t num_texts = text_embeddings.shape[0]; 120 | cv::Mat image_matrix(num_images, 512, CV_32F, image_embeddings.data()); 121 | cv::Mat text_matrix(num_texts, 512, CV_32F, text_embeddings.data()); 122 | cv::Mat logits; 123 | cv::gemm(image_matrix, text_matrix.t(), 100, cv::Mat(), 0.0, logits); 124 | 125 | std::vector> probs; 126 | float *ptr = logits.ptr(); 127 | for (size_t i = 0; i < num_images; i++) { 128 | float exp_sum = 0.0; 129 | for (size_t j = 0; j < num_texts; j++) { 130 | exp_sum += std::exp(*(ptr+j)); 131 | } 132 | std::vector prob; 133 | for (size_t j = 0; j < num_texts; j++) { 134 | prob.push_back(std::exp(*(ptr++)) / exp_sum); 135 | } 136 | probs.push_back(prob); 137 | } 138 | return probs; 139 | } -------------------------------------------------------------------------------- /model/clip/clip.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/clip/image_encoder.h" 3 | #include "model/clip/text_encoder.h" 4 | 5 | namespace clip { 6 | 7 | class Clip { 8 | public: 9 | Clip() = delete; 10 | Clip(const std::string& image_encoder_cfg, const std::string& text_encoder_cfg); 11 | void encodeImages(const std::vector& images); 12 | void encodeTexts(const std::vector& texts); 13 | std::vector> computeProbabilities(); 14 | private: 15 | std::shared_ptr m_image_encoder_; 16 | std::shared_ptr m_text_encoder_; 17 | IOTensor image_embeddings; 18 | IOTensor text_embeddings; 19 | std::map> cache_; 20 | }; 21 | } -------------------------------------------------------------------------------- /model/clip/image_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/image_encoder.h" 2 | #include 3 | #include 4 | 5 | using namespace clip; 6 | 7 | ImageEncoder::ImageEncoder(const std::string &yaml_file) : m_input_size_(224, 224), m_output_size_(512) 8 | { 9 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 10 | 11 | std::string model_path = yaml_node["model_path"].as(); 12 | std::string framework_type = yaml_node["framework"].as(); 13 | int max_batch_size = yaml_node["max_batch_size"].as(); 14 | 15 | if (!Init(model_path, framework_type)) exit(0); 16 | 17 | config_.input_len["IMAGE"] = max_batch_size * 3 * m_input_size_.height * m_input_size_.width; 18 | config_.output_len["IMAGE_EMBEDDING"] = max_batch_size * m_output_size_; 19 | config_.is_dynamic = true; 20 | Status status = framework_->Init(config_); 21 | if (status != Status::SUCCESS) { 22 | std::cout << "Failed to 
init framework" << std::endl; 23 | exit(0); 24 | } 25 | } 26 | 27 | ImageEncoder::~ImageEncoder() { 28 | std::cout << "Destruct image encoder" << std::endl; 29 | } 30 | 31 | void ImageEncoder::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 32 | int h = input_image.rows; 33 | int w = input_image.cols; 34 | int resized_h, resized_w; 35 | if (h < w) { 36 | resized_h = 224; 37 | resized_w = int(224 * w / h); 38 | } else { 39 | resized_w = 224; 40 | resized_h = int(resized_w * h / w); 41 | } 42 | cv::Mat resized_img; 43 | cv::resize(input_image, resized_img, cv::Size(resized_w, resized_h)); 44 | 45 | int y_from = (resized_h - 224) / 2; 46 | int x_from = (resized_w - 224) / 2; 47 | cv::Rect roi(x_from, y_from, 224, 224); 48 | resized_img = resized_img(roi); 49 | 50 | cv::Scalar mean(0.48145466*255, 0.4578275*255, 0.40821073*255); 51 | float std = (0.26862954 + 0.26130258 + 0.27577711) / 3 * 255; 52 | cv::dnn::blobFromImage(resized_img, output_image, 1 / std, cv::Size(), cv::Scalar(), false, false, CV_32F); 53 | } 54 | 55 | void ImageEncoder::forward(const std::vector &images, IOTensor& features) { 56 | std::unordered_map input, output; 57 | 58 | input["IMAGE"] = IOTensor(); 59 | input["IMAGE"].resize(images.size() * 3 * m_input_size_.height * m_input_size_.width * sizeof(float)); 60 | input["IMAGE"].shape = std::vector{static_cast(images.size()), 3, m_input_size_.height, m_input_size_.width}; 61 | auto ptr = input["IMAGE"].data(); 62 | for (const auto& image: images) { 63 | cv::Mat nchw; 64 | preprocess(image, nchw); 65 | assert(nchw.total() * nchw.elemSize() == 3 * m_input_size_.height * m_input_size_.width * sizeof(float)); 66 | memcpy(ptr, nchw.ptr(), nchw.total() * nchw.elemSize()); 67 | ptr += nchw.total() * nchw.elemSize(); 68 | } 69 | 70 | // 输出张量设置 71 | output["IMAGE_EMBEDDING"] = IOTensor(); 72 | output["IMAGE_EMBEDDING"].resize(images.size() * config_.output_len["IMAGE_EMBEDDING"] * sizeof(float)); 73 | output["IMAGE_EMBEDDING"].shape = std::vector{static_cast(images.size()), config_.output_len["IMAGE_EMBEDDING"]}; 74 | 75 | this->framework_->forward(input, output); 76 | 77 | features.resize(output["IMAGE_EMBEDDING"].size()); 78 | memcpy(features.data(), output["IMAGE_EMBEDDING"].data(), features.size()); 79 | features.shape = output["IMAGE_EMBEDDING"].shape; 80 | } -------------------------------------------------------------------------------- /model/clip/image_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace clip { 5 | 6 | class ImageEncoder : public Model { 7 | public: 8 | ImageEncoder() = delete; 9 | ImageEncoder(const std::string &yaml_file); 10 | virtual ~ImageEncoder(); 11 | void forward(const std::vector &images, IOTensor &features); 12 | 13 | cv::Size input_size() const { return m_input_size_; } 14 | size_t output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image); 18 | 19 | private: 20 | cv::Size m_input_size_; 21 | size_t m_output_size_; 22 | }; 23 | 24 | } -------------------------------------------------------------------------------- /model/clip/text_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/text_encoder.h" 2 | 3 | #include 4 | 5 | using namespace clip; 6 | 7 | TextEncoder::TextEncoder(const std::string &yaml_file) : m_input_size_(77), m_output_size_(512) { 8 | YAML::Node yaml_node = 
YAML::LoadFile(yaml_file); 9 | 10 | std::string bpe_path = yaml_node["bpe_path"].as(); 11 | m_tokenizer_ = std::make_shared(bpe_path); 12 | 13 | std::string model_path = yaml_node["model_path"].as(); 14 | std::string framework_type = yaml_node["framework"].as(); 15 | 16 | if (!Init(model_path, framework_type)) exit(0); 17 | 18 | config_.input_len["TEXT"] = 2 * m_input_size_; 19 | config_.output_len["TEXT_EMBEDDING"] = 2 * m_output_size_; 20 | config_.is_dynamic = true; 21 | Status status = framework_->Init(config_); 22 | if (status != Status::SUCCESS) { 23 | std::cout << "Failed to init framework" << std::endl; 24 | exit(0); 25 | } 26 | } 27 | 28 | TextEncoder::~TextEncoder() { std::cout << "Destruct text encoder" << std::endl; } 29 | 30 | void TextEncoder::preprocess(const std::vector &texts, IOTensor &text_embeddings) { 31 | std::vector> tokens = m_tokenizer_->batchTokenize(texts); 32 | std::vector tensor; 33 | for (const auto &token : tokens) { 34 | for (int i : token) { 35 | tensor.push_back(i); 36 | } 37 | } 38 | 39 | text_embeddings.resize(tensor.size() * sizeof(int)); 40 | text_embeddings.shape = 41 | std::vector{static_cast(texts.size()), static_cast(tokens[0].size())}; 42 | text_embeddings.data_type = DataType::INT32; 43 | memcpy(text_embeddings.data(), tensor.data(), text_embeddings.size()); 44 | } 45 | 46 | void TextEncoder::forward(const std::vector &texts, IOTensor &features) { 47 | std::unordered_map input, output; 48 | 49 | input["TEXT"] = IOTensor(); 50 | preprocess(texts, input["TEXT"]); 51 | 52 | output["TEXT_EMBEDDING"] = IOTensor(); 53 | output["TEXT_EMBEDDING"].resize(texts.size() * m_output_size_ * sizeof(float)); 54 | output["TEXT_EMBEDDING"].shape = 55 | std::vector{static_cast(texts.size()), static_cast(m_output_size_)}; 56 | output["TEXT_EMBEDDING"].data_type = DataType::FP32; 57 | 58 | this->framework_->forward(input, output); 59 | 60 | features.resize(output["TEXT_EMBEDDING"].size()); 61 | memcpy(features.data(), output["TEXT_EMBEDDING"].data(), features.size()); 62 | features.shape = output["TEXT_EMBEDDING"].shape; 63 | features.data_type = output["TEXT_EMBEDDING"].data_type; 64 | } -------------------------------------------------------------------------------- /model/clip/text_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "model/clip/text_tokenizer.h" 4 | 5 | namespace clip { 6 | class TextEncoder : public Model { 7 | public: 8 | TextEncoder() = delete; 9 | TextEncoder(const std::string &yaml_file); 10 | virtual ~TextEncoder(); 11 | void forward(const std::vector &texts, IOTensor &features); 12 | 13 | size_t input_size() const { return m_input_size_; } 14 | size_t output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const std::vector &texts, IOTensor &text_embeddings); 18 | 19 | private: 20 | std::shared_ptr m_tokenizer_; 21 | size_t m_input_size_; 22 | size_t m_output_size_; 23 | }; 24 | } -------------------------------------------------------------------------------- /model/clip/text_tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace clip { 14 | 15 | class TextTokenizer { 16 | public: 17 | TextTokenizer() = delete; 18 | TextTokenizer(const std::string &path); 19 | 20 | std::vector tokenize(const std::string &text, 
size_t context_length = 77); 21 | std::vector> batchTokenize(const std::vector &texts, size_t context_length = 77); 22 | 23 | private: 24 | std::string bpe(const std::string &token); 25 | void encode(const std::string &str, std::vector &bpe_tokens); 26 | std::string decode(const std::vector &bpe_tokens); 27 | 28 | std::map byte_encoder; 29 | std::map byte_decoder; 30 | std::map encoder; 31 | std::map decoder; 32 | std::map, size_t> bpe_ranks; 33 | std::map cache; 34 | std::regex pattern; 35 | }; 36 | } // namespace clip 37 | -------------------------------------------------------------------------------- /model/ocr/attention.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/attention.h" 2 | 3 | #include 4 | 5 | std::string AttnModel::detect(const cv::Mat &image) { 6 | std::unordered_map input, output; 7 | 8 | // Set up the input tensor 9 | cv::Mat nchw; 10 | cv::dnn::blobFromImage(image, nchw, 1 / 64.f, m_input_size_, cv::Scalar(127.5, 127.5, 127.5), false, false, CV_32F); 11 | 12 | input["images"] = IOTensor(); 13 | input["images"].shape = std::vector{1, static_cast(m_input_channel_), m_input_size_.height, m_input_size_.width}; 14 | input["images"].data_type = DataType::FP32; 15 | input["images"].resize(nchw.total() * nchw.elemSize()); 16 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 17 | 18 | // Set up the output tensor 19 | output["output"] = IOTensor(); 20 | output["output"].shape = std::vector{1, static_cast(m_output_length_)}; 21 | output["output"].data_type = DataType::FP32; 22 | output["output"].resize(m_output_length_ * sizeof(float)); 23 | 24 | this->framework_->forward(input, output); 25 | return postprocess(output); 26 | } 27 | 28 | std::string AttnModel::postprocess(const std::unordered_map &output) { 29 | float *const outputs = (float *)output.at("output").data(); 30 | std::string str; 31 | for (size_t i = 0; i < m_output_length_; i++) { 32 | int idx = static_cast(outputs[i]); 33 | if (idx != 0){ 34 | str.push_back(alphabet_[idx - 1]); 35 | } else { 36 | break; 37 | } 38 | } 39 | return str; 40 | } -------------------------------------------------------------------------------- /model/ocr/attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/ocr_model.h" 5 | 6 | class AttnModel : public OcrModel { 7 | public: 8 | AttnModel() = delete; 9 | explicit AttnModel(const std::string &yaml_file) : OcrModel(yaml_file) {} 10 | ~AttnModel() {} 11 | 12 | std::string detect(const cv::Mat &image) override; 13 | 14 | protected: 15 | std::string postprocess(const std::unordered_map &output) override; 16 | }; -------------------------------------------------------------------------------- /model/ocr/ctc.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/ctc.h" 2 | 3 | std::string CtcModel::detect(const cv::Mat &image) { 4 | std::unordered_map input, output; 5 | 6 | // Set up the input tensor 7 | cv::Mat nchw; 8 | cv::dnn::blobFromImage(image, nchw, 1 / 127.5f, m_input_size_, cv::Scalar(127.5, 127.5, 127.5), false, false, CV_32F); 9 | 10 | input["images"] = IOTensor(); 11 | input["images"].shape = std::vector{1, static_cast(m_input_channel_), m_input_size_.height, m_input_size_.width}; 12 | input["images"].data_type = DataType::FP32; 13 | input["images"].resize(nchw.total() * nchw.elemSize()); 14 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 15 | 16 | // Set up the output tensor 
17 | output["output"] = IOTensor(); 18 | output["output"].shape = std::vector{1, static_cast(m_output_length_), 1}; 19 | output["output"].data_type = DataType::FP32; 20 | output["output"].resize(config_.output_len["output"] * sizeof(float)); 21 | 22 | this->framework_->forward(input, output); 23 | return postprocess(output); 24 | } 25 | 26 | std::string CtcModel::postprocess(const std::unordered_map &output) { 27 | float *const outputs = (float *)output.at("output").data(); 28 | std::string str; 29 | for (size_t i = 0; i < m_output_length_; i++) { 30 | int idx = static_cast(outputs[i]); 31 | if (idx == 0 || (i > 0 && static_cast(outputs[i-1]) == idx)) continue; 32 | str.push_back(alphabet_[idx - 1]); 33 | } 34 | return str; 35 | } -------------------------------------------------------------------------------- /model/ocr/ctc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/ocr_model.h" 5 | 6 | class CtcModel : public OcrModel { 7 | public: 8 | CtcModel() = delete; 9 | explicit CtcModel(const std::string &yaml_file): OcrModel(yaml_file) {} 10 | ~CtcModel() {} 11 | 12 | std::string detect(const cv::Mat &image) override; 13 | 14 | protected: 15 | std::string postprocess(const std::unordered_map &output) override; 16 | 17 | }; -------------------------------------------------------------------------------- /model/ocr/dbnet.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/dbnet.h" 2 | 3 | #include 4 | #include "polyclipping/clipper.hpp" 5 | 6 | static cv::RotatedRect expandBox(cv::Point2f temp[], float ratio) 7 | { 8 | ClipperLib::Path path = { 9 | {ClipperLib::cInt(temp[0].x), ClipperLib::cInt(temp[0].y)}, 10 | {ClipperLib::cInt(temp[1].x), ClipperLib::cInt(temp[1].y)}, 11 | {ClipperLib::cInt(temp[2].x), ClipperLib::cInt(temp[2].y)}, 12 | {ClipperLib::cInt(temp[3].x), ClipperLib::cInt(temp[3].y)}}; 13 | double area = ClipperLib::Area(path); 14 | double distance; 15 | double length = 0.0; 16 | for (int i = 0; i < 4; i++) { 17 | length = length + sqrtf(powf((temp[i].x - temp[(i + 1) % 4].x), 2) + 18 | powf((temp[i].y - temp[(i + 1) % 4].y), 2)); 19 | } 20 | 21 | distance = area * ratio / length; 22 | 23 | ClipperLib::ClipperOffset offset; 24 | offset.AddPath(path, ClipperLib::JoinType::jtRound, 25 | ClipperLib::EndType::etClosedPolygon); 26 | ClipperLib::Paths paths; 27 | offset.Execute(paths, distance); 28 | 29 | std::vector contour; 30 | for (size_t i = 0; i < paths[0].size(); i++) { 31 | contour.emplace_back(paths[0][i].X, paths[0][i].Y); 32 | } 33 | offset.Clear(); 34 | return cv::minAreaRect(contour); 35 | } 36 | 37 | static bool get_mini_boxes(cv::RotatedRect& rotated_rect, cv::Point2f rect[], 38 | int min_size) 39 | { 40 | 41 | cv::Point2f temp_rect[4]; 42 | rotated_rect.points(temp_rect); 43 | for (int i = 0; i < 4; i++) { 44 | for (int j = i + 1; j < 4; j++) { 45 | if (temp_rect[i].x > temp_rect[j].x) { 46 | cv::Point2f temp; 47 | temp = temp_rect[i]; 48 | temp_rect[i] = temp_rect[j]; 49 | temp_rect[j] = temp; 50 | } 51 | } 52 | } 53 | int index0 = 0; 54 | int index1 = 1; 55 | int index2 = 2; 56 | int index3 = 3; 57 | if (temp_rect[1].y > temp_rect[0].y) { 58 | index0 = 0; 59 | index3 = 1; 60 | } else { 61 | index0 = 1; 62 | index3 = 0; 63 | } 64 | if (temp_rect[3].y > temp_rect[2].y) { 65 | index1 = 2; 66 | index2 = 3; 67 | } else { 68 | index1 = 3; 69 | index2 = 2; 70 | } 71 | 72 | rect[0] = temp_rect[index0]; // Left top coordinate 
73 | rect[1] = temp_rect[index1]; // Left bottom coordinate 74 | rect[2] = temp_rect[index2]; // Right bottom coordinate 75 | rect[3] = temp_rect[index3]; // Right top coordinate 76 | 77 | if (rotated_rect.size.width < min_size || 78 | rotated_rect.size.height < min_size) { 79 | return false; 80 | } else { 81 | return true; 82 | } 83 | } 84 | 85 | static float get_box_score(float* map, cv::Point2f rect[], int width, int height, 86 | float threshold) 87 | { 88 | 89 | int xmin = width - 1; 90 | int ymin = height - 1; 91 | int xmax = 0; 92 | int ymax = 0; 93 | 94 | for (int j = 0; j < 4; j++) { 95 | if (rect[j].x < xmin) { 96 | xmin = rect[j].x; 97 | } 98 | if (rect[j].y < ymin) { 99 | ymin = rect[j].y; 100 | } 101 | if (rect[j].x > xmax) { 102 | xmax = rect[j].x; 103 | } 104 | if (rect[j].y > ymax) { 105 | ymax = rect[j].y; 106 | } 107 | } 108 | float sum = 0; 109 | int num = 0; 110 | for (int i = ymin; i <= ymax; i++) { 111 | for (int j = xmin; j <= xmax; j++) { 112 | if (map[i * width + j] > threshold) { 113 | sum = sum + map[i * width + j]; 114 | num++; 115 | } 116 | } 117 | } 118 | 119 | return sum / num; 120 | } 121 | 122 | DBNet::DBNet(const std::string &yaml_file) { 123 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 124 | 125 | std::string model_path = yaml_node["model_path"].as(); 126 | std::string framework_type = yaml_node["framework"].as(); 127 | 128 | m_box_thres_ = yaml_node["box_thres"].as(); 129 | std::vector max_input_size = yaml_node["max_input_size"].as>(); 130 | 131 | if (!Init(model_path, framework_type)) exit(0); 132 | 133 | config_.input_len["images"] = max_input_size[0] * max_input_size[1] * max_input_size[2] * max_input_size[3]; 134 | config_.output_len["output"] = max_input_size[0] * 2 * max_input_size[2] * max_input_size[3]; 135 | config_.is_dynamic = true; 136 | Status status = framework_->Init(config_); 137 | if (status != Status::SUCCESS) { 138 | std::cout << "Failed to init framework" << std::endl; 139 | exit(0); 140 | } 141 | } 142 | 143 | DBNet::~DBNet() 144 | { 145 | std::cout << "Destruct dbnet" << std::endl; 146 | } 147 | 148 | void DBNet::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 149 | // mean value [0.406, 0.456, 0.485] * 255 150 | // std value [0.225, 0.225, 0.225] * 255 151 | cv::Mat mask; 152 | this->pparam_ = paddimg(input_image, mask, 640); 153 | cv::dnn::blobFromImage(mask, output_image, 1 / 57.375, cv::Size(), cv::Scalar(103.53f, 116.28f, 123.675f), false, false, CV_32F); 154 | } 155 | 156 | void DBNet::detect(const cv::Mat &image, std::vector &objs) { 157 | std::unordered_map input, output; 158 | 159 | // Set up the input tensor 160 | cv::Mat nchw; 161 | preprocess(image, nchw); 162 | 163 | input["images"] = IOTensor(); 164 | input["images"].resize(nchw.total() * nchw.elemSize()); 165 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 166 | input["images"].shape = std::vector{1, 3, nchw.size[2], nchw.size[3]}; 167 | input["images"].data_type = DataType::FP32; 168 | 169 | 170 | // Set up the output tensor 171 | output["output"] = IOTensor(); 172 | output["output"].resize(2 * nchw.size[2] * nchw.size[3] * sizeof(float)); 173 | output["output"].shape = std::vector{1, 2 ,nchw.size[2] ,nchw.size[3]}; 174 | output["output"].data_type = DataType::FP32; 175 | 176 | this->framework_->forward(input, output); 177 | postprocess(output, objs); 178 | } 179 | 180 | void DBNet::postprocess(const std::unordered_map &output, std::vector &objs) { 181 | objs.clear(); 182 | 183 | float scale = this->pparam_.ratio; 184 | 185 | float * const prob 
= (float *)output.at("output").data(); 186 | int height = output.at("output").shape[2]; 187 | int width = output.at("output").shape[3]; 188 | 189 | cv::Mat map = cv::Mat::zeros(cv::Size(width, height), CV_8UC1); 190 | for (int h = 0; h < height; ++h) { 191 | uchar *ptr = map.ptr(h); 192 | for (int w = 0; w < width; ++w) { 193 | ptr[w] = (prob[h * width + w] > 0.3) ? 255 : 0; 194 | } 195 | } 196 | 197 | // Extracting minimum circumscribed rectangle 198 | std::vector> contours; 199 | std::vector hierarcy; 200 | cv::findContours(map, contours, hierarcy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); 201 | 202 | std::vector boundRect(contours.size()); 203 | std::vector box(contours.size()); 204 | cv::Point2f rect[4]; 205 | cv::Point2f order_rect[4]; 206 | 207 | for (size_t i = 0; i < contours.size(); i++) { 208 | cv::RotatedRect rotated_rect = cv::minAreaRect(cv::Mat(contours[i])); 209 | if (!get_mini_boxes(rotated_rect, rect, m_box_thres_)) { 210 | std::cout << "box too small" << std::endl; 211 | continue; 212 | } 213 | 214 | // drop low score boxes 215 | float score = get_box_score(prob, rect, width, height, 216 | m_score_thres_); 217 | if (score < m_box_thres_) { 218 | // std::cout << "score too low = " << score << ", threshold = " << m_box_thres_ << std::endl; 219 | continue; 220 | } 221 | 222 | // Scaling the predict boxes depend on EXPANDRATIO 223 | cv::RotatedRect expandbox = expandBox(rect, m_expand_ratio_); 224 | expandbox.points(rect); 225 | if (!get_mini_boxes(expandbox, rect, m_box_min_size_ + 2)) { 226 | continue; 227 | } 228 | 229 | // Restore the coordinates to the original image 230 | for (int k = 0; k < 4; k++) { 231 | order_rect[k] = rect[k]; 232 | order_rect[k].x = int(order_rect[k].x * scale); 233 | order_rect[k].y = int(order_rect[k].y * scale); 234 | } 235 | 236 | Object obj; 237 | obj.label = 0; 238 | obj.rect = cv::Rect2i(cv::Point(order_rect[0].x,order_rect[0].y), cv::Point(order_rect[2].x,order_rect[2].y)); 239 | objs.push_back(obj); 240 | } 241 | } -------------------------------------------------------------------------------- /model/ocr/dbnet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class DBNet : public DetectionModel { 7 | public: 8 | DBNet() = delete; 9 | explicit DBNet(const std::string &yaml_file); 10 | ~DBNet(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | float m_box_thres_ = 0.3f; 21 | float m_expand_ratio_ = 1.5f; 22 | float m_score_thres_ = 0.3f; 23 | int m_box_min_size_ = 5; 24 | PreParam pparam_; 25 | }; -------------------------------------------------------------------------------- /model/ocr/scripts/abinet_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import argparse 5 | from io import BytesIO 6 | import torch.nn as nn 7 | 8 | from utils import Config 9 | 10 | try: 11 | import onnxsim 12 | except ImportError: 13 | onnxsim = None 14 | 15 | class ONNXModel(nn.Module): 16 | def __init__(self, config, device): 17 | super().__init__() 18 | self.get_model(config, device) 19 | self.load(config.model_checkpoint, device=device) 20 | print('loading pretrained model 
from %s' % config.model_checkpoint) 21 | 22 | def forward(self, x): 23 | logits, length = self.model(x) 24 | scores, labels = logits.max(dim=-1, keepdim=True) 25 | return labels.to(torch.float32) 26 | 27 | def get_model(self, config, device): 28 | import importlib 29 | names = config.model_name.split('.') 30 | module_name, class_name = '.'.join(names[:-1]), names[-1] 31 | cls = getattr(importlib.import_module(module_name), class_name) 32 | self.model = cls(config).eval().to(device) 33 | 34 | def load(self, file, device=None, strict=True): 35 | if device is None: 36 | device = 'cpu' 37 | elif isinstance(device, int): 38 | device = torch.device('cuda', device) 39 | assert os.path.isfile(file) 40 | state = torch.load(file, map_location=device) 41 | if set(state.keys()) == {'model', 'opt'}: 42 | state = state['model'] 43 | self.model.load_state_dict(state, strict=strict) 44 | 45 | def parse_args(): 46 | parser = argparse.ArgumentParser() 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument('-w', 49 | '--weights', 50 | type=str, 51 | default='workdir/train-abinet/best-train-abinet.pth', 52 | help='PyTorch weights') 53 | parser.add_argument('--opset', 54 | type=int, 55 | default=13, 56 | help='ONNX opset version') 57 | parser.add_argument('--sim', 58 | action='store_true', 59 | help='simplify onnx model') 60 | parser.add_argument('--input-shape', 61 | nargs='+', 62 | type=int, 63 | default=[1, 3, 32, 128], 64 | help='Model input shape only for api builder') 65 | parser.add_argument('--cuda', type=int, default=-1) 66 | parser.add_argument('--config', type=str, default='configs/train_abinet.yaml', 67 | help='path to config file') 68 | parser.add_argument('--model_eval', type=str, default='alignment', 69 | choices=['alignment', 'vision', 'language']) 70 | args = parser.parse_args() 71 | assert len(args.input_shape) == 4 72 | return args 73 | 74 | def main(args): 75 | config = Config(args.config) 76 | if args.weights is not None: config.model_checkpoint = args.weights 77 | if args.model_eval is not None: config.model_eval = args.model_eval 78 | config.global_phase = 'test' 79 | config.model_vision_checkpoint, config.model_language_checkpoint = None, None 80 | device = 'cpu' if args.cuda < 0 else f'cuda:{args.cuda}' 81 | config.export = True 82 | 83 | model = ONNXModel(config, device) 84 | fake_input = torch.randn(args.input_shape).to(device) 85 | for _ in range(2): 86 | model(fake_input) 87 | 88 | with BytesIO() as f: 89 | torch.onnx.export( 90 | model, 91 | fake_input, 92 | f, 93 | opset_version=args.opset, 94 | do_constant_folding=True, 95 | export_params=True, 96 | input_names=['images'], 97 | output_names=['output']) 98 | f.seek(0) 99 | onnx_model = onnx.load(f) 100 | 101 | onnx.checker.check_model(onnx_model) 102 | save_path = args.weights.replace('.pth', '.onnx') 103 | 104 | if args.sim: 105 | try: 106 | onnx_model, check = onnxsim.simplify(onnx_model) 107 | assert check, 'assert check failed' 108 | except Exception as e: 109 | print(f'Simplifier failure: {e}') 110 | onnx.save(onnx_model, save_path) 111 | print(f'ONNX export success, saved as {save_path}') 112 | 113 | if __name__ == '__main__': 114 | main(parse_args()) -------------------------------------------------------------------------------- /model/ocr/scripts/crnn_export.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import torch 3 | import argparse 4 | from io import BytesIO 5 | import torch.nn as nn 6 | 7 | import models.crnn as crnn 8 | try: 9 | import onnxsim 10 
| except ImportError: 11 | onnxsim = None 12 | 13 | class CRNN(nn.Module): 14 | def __init__(self, weights): 15 | super().__init__() 16 | self.crnn = crnn.CRNN(32, 1, 37, 256) 17 | print('loading pretrained model from %s' % weights) 18 | self.crnn.load_state_dict(torch.load(weights)) 19 | 20 | def forward(self, x): 21 | output = self.crnn(x) 22 | scores, labels = output.transpose(0,1).max(dim=-1, keepdim=True) 23 | return labels.to(torch.float32) 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('-w', 28 | '--weights', 29 | type=str, 30 | required=True, 31 | help='PyTorch crnn weights') 32 | parser.add_argument('--opset', 33 | type=int, 34 | default=11, 35 | help='ONNX opset version') 36 | parser.add_argument('--sim', 37 | action='store_true', 38 | help='simplify onnx model') 39 | parser.add_argument('--input-shape', 40 | nargs='+', 41 | type=int, 42 | default=[1, 1, 32, 100], 43 | help='Model input shape only for api builder') 44 | parser.add_argument('--device', 45 | type=str, 46 | default='cpu', 47 | help='Export ONNX device') 48 | args = parser.parse_args() 49 | assert len(args.input_shape) == 4 50 | return args 51 | 52 | def main(args): 53 | model_path = args.weights 54 | 55 | model = CRNN(model_path) 56 | 57 | model.eval() 58 | model.to(args.device) 59 | fake_input = torch.randn(args.input_shape).to(args.device) 60 | for _ in range(2): 61 | model(fake_input) 62 | 63 | with BytesIO() as f: 64 | torch.onnx.export( 65 | model, 66 | fake_input, 67 | f, 68 | opset_version=args.opset, 69 | input_names=['images'], 70 | output_names=['output']) 71 | f.seek(0) 72 | onnx_model = onnx.load(f) 73 | 74 | onnx.checker.check_model(onnx_model) 75 | save_path = args.weights.replace('.pth', '.onnx') 76 | 77 | if args.sim: 78 | try: 79 | onnx_model, check = onnxsim.simplify(onnx_model) 80 | assert check, 'assert check failed' 81 | except Exception as e: 82 | print(f'Simplifier failure: {e}') 83 | onnx.save(onnx_model, save_path) 84 | print(f'ONNX export success, saved as {save_path}') 85 | 86 | if __name__ == '__main__': 87 | main(parse_args()) -------------------------------------------------------------------------------- /model/ocr/scripts/dbnet_export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import onnx 3 | from io import BytesIO 4 | 5 | try: 6 | import onnxsim 7 | except ImportError: 8 | onnxsim = None 9 | 10 | from models import build_model 11 | 12 | def parse_args(): 13 | import argparse 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-w', 16 | '--weights', 17 | type=str, 18 | required=True, 19 | help='PyTorch dbnet weights') 20 | parser.add_argument('--opset', 21 | type=int, 22 | default=11, 23 | help='ONNX opset version') 24 | parser.add_argument('--sim', 25 | action='store_true', 26 | help='simplify onnx model') 27 | parser.add_argument('--device', 28 | type=str, 29 | default='cpu', 30 | help='Export ONNX device') 31 | args = parser.parse_args() 32 | return args 33 | 34 | def main(args): 35 | checkpoint = torch.load(args.weights, map_location=args.device) 36 | config = checkpoint['config'] 37 | config['arch']['backbone']['pretrained'] = False 38 | model = build_model(config['arch']) 39 | model.load_state_dict(checkpoint['state_dict']) 40 | model.to(args.device) 41 | 42 | fake_input = torch.randn((1, 3, 640, 640)).to(args.device) 43 | for _ in range(2): 44 | model(fake_input) 45 | save_path = args.weights.replace('.pth', '.onnx') 46 | 47 | with BytesIO() as f: 48 | 
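# The export below marks height and width as dynamic axes, which is presumably why the
# C++ DBNet wrapper sets config_.is_dynamic = true and sizes its buffers from max_input_size.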
torch.onnx.export(model, fake_input, f, verbose=False, opset_version=12, input_names=['images'], 49 | output_names=['output'], 50 | dynamic_axes={"images": {2: "height", 3: "width"}}) 51 | f.seek(0) 52 | onnx_model = onnx.load(f) 53 | 54 | onnx.checker.check_model(onnx_model) # check onnx model 55 | if args.sim: 56 | try: 57 | onnx_model, check = onnxsim.simplify(onnx_model) 58 | assert check, 'assert check failed' 59 | except Exception as e: 60 | print(f'Simplifier failure: {e}') 61 | onnx.save(onnx_model, save_path) 62 | print('ONNX export success, saved as %s' % save_path) 63 | 64 | 65 | 66 | if __name__ == '__main__': 67 | main(parse_args()) 68 | -------------------------------------------------------------------------------- /model/sam/image_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/image_encoder.h" 2 | #include 3 | 4 | using namespace sam; 5 | 6 | ImageEncoder::ImageEncoder(const std::string &yaml_file) : m_input_size_(1024, 1024), m_output_size_(64, 64) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | if (!Init(model_path, framework_type)) exit(0); 14 | 15 | config_.input_len["image"] = 3 * m_input_size_.height * m_input_size_.width; 16 | config_.output_len["image_embeddings"] = 256 * m_output_size_.height * m_output_size_.width; 17 | config_.is_dynamic = false; 18 | Status status = framework_->Init(config_); 19 | if (status != Status::SUCCESS) { 20 | std::cout << "Failed to init framework" << std::endl; 21 | exit(0); 22 | } 23 | } 24 | 25 | ImageEncoder::~ImageEncoder() { 26 | std::cout << "Destruct image encoder" << std::endl; 27 | } 28 | 29 | void ImageEncoder::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 30 | cv::dnn::blobFromImage(input_image, output_image, 1 / 57.f, cv::Size(), cv::Scalar(123.675, 116.28, 103.53), false, false, CV_32F); 31 | } 32 | 33 | void ImageEncoder::forward(const cv::Mat &image, IOTensor& features) { 34 | std::unordered_map input, output; 35 | 36 | cv::Mat nchw; 37 | preprocess(image, nchw); 38 | 39 | input["image"] = IOTensor(); 40 | input["image"].resize(nchw.total() * nchw.elemSize()); 41 | input["image"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 42 | input["image"].data_type = DataType::FP32; 43 | memcpy(input["image"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 44 | 45 | 46 | // 输出张量设置 47 | output["image_embeddings"] = IOTensor(); 48 | output["image_embeddings"].data_type = DataType::FP32; 49 | output["image_embeddings"].shape = std::vector{1, 256, m_output_size_.height, m_output_size_.width}; 50 | output["image_embeddings"].resize(config_.output_len["image_embeddings"] * sizeof(float)); 51 | 52 | this->framework_->forward(input, output); 53 | 54 | features.resize(config_.output_len["image_embeddings"] * sizeof(float)); 55 | memcpy(features.data(), output["image_embeddings"].data(), features.size()); 56 | features.shape = std::vector{1, 256, 64, 64}; 57 | } -------------------------------------------------------------------------------- /model/sam/image_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace sam { 5 | 6 | class ImageEncoder : public Model { 7 | public: 8 | ImageEncoder() = delete; 9 | ImageEncoder(const std::string &yaml_file); 10 | virtual 
~ImageEncoder(); 11 | void forward(const cv::Mat &image, IOTensor &features); 12 | 13 | cv::Size input_size() const { return m_input_size_; } 14 | cv::Size output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image); 18 | 19 | private: 20 | cv::Size m_input_size_; 21 | cv::Size m_output_size_; 22 | }; 23 | } // namespace sam -------------------------------------------------------------------------------- /model/sam/mask_decoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/mask_decoder.h" 2 | #include 3 | 4 | using namespace sam; 5 | 6 | MaskDecoder::MaskDecoder(const std::string &yaml_file) : features_shape{1, 256, 64, 64}{ 7 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 8 | 9 | std::string model_path = yaml_node["model_path"].as(); 10 | std::string framework_type = yaml_node["framework"].as(); 11 | 12 | if (!Init(model_path, framework_type)) exit(0); 13 | 14 | config_.input_len["image_embeddings"] = 15 | features_shape[0] * features_shape[1] * features_shape[2] * features_shape[3]; 16 | config_.input_len["point_coords"] = 10 * 2; 17 | config_.input_len["point_labels"] = 10; 18 | config_.input_len["mask_input"] = 1 * 1 * 256 * 256; 19 | config_.input_len["has_mask_input"] = 1; 20 | 21 | config_.output_len["iou_predictions"] = 1 * 4; 22 | config_.output_len["low_res_masks"] = 1 * 4 * 256 * 256; 23 | config_.is_dynamic = true; 24 | Status status = framework_->Init(config_); 25 | if (status != Status::SUCCESS) { 26 | std::cout << "Failed to init framework" << std::endl; 27 | exit(0); 28 | } 29 | } 30 | 31 | MaskDecoder::~MaskDecoder() { std::cout << "Destruct sam mask decoder" << std::endl; } 32 | 33 | 34 | // The point labels may be 35 | // | Point Label | Description | 36 | // |:--------------------:|-------------| 37 | // | 0 | Background point | 38 | // | 1 | Foreground point | 39 | // | 2 | Bounding box top-left | 40 | // | 3 | Bounding box bottom-right | 41 | void MaskDecoder::forward(const IOTensor &features, const std::vector &image_point_coords, 42 | const std::vector &image_point_labels, cv::Mat& low_res_mask) { 43 | std::unordered_map input, output; 44 | 45 | input["image_embeddings"] = IOTensor(); 46 | input["image_embeddings"].shape = features_shape; 47 | input["image_embeddings"].resize(config_.input_len["image_embeddings"] * sizeof(float)); 48 | memcpy(input["image_embeddings"].data(), features.data(), input["image_embeddings"].size()); 49 | 50 | input["point_coords"] = IOTensor(); 51 | input["point_coords"].shape = std::vector{1, static_cast(image_point_coords.size()), 2}; 52 | input["point_coords"].resize(image_point_coords.size() * 2 * sizeof(float)); 53 | std::vector points; 54 | for (const auto& point: image_point_coords) { 55 | points.push_back(point.x); 56 | points.push_back(point.y); 57 | } 58 | memcpy(input["point_coords"].data(), points.data(), input["point_coords"].size()); 59 | 60 | input["point_labels"] = IOTensor(); 61 | input["point_labels"].shape = std::vector{1, static_cast(image_point_coords.size())}; 62 | input["point_labels"].resize(image_point_coords.size() * sizeof(float)); 63 | memcpy(input["point_labels"].data(), image_point_labels.data(), input["point_labels"].size()); 64 | 65 | input["mask_input"] = IOTensor(); 66 | input["mask_input"].shape = std::vector{1, 1, 256, 256}; 67 | input["mask_input"].resize(256 * 256 * sizeof(float)); 68 | 69 | input["has_mask_input"] = IOTensor(); 70 | 
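// mask_input is only resized (never written), so it acts as a zero low-res mask placeholder;
// has_mask_input = 0.0f below signals that no mask prompt from a previous pass is supplied.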
input["has_mask_input"].shape = std::vector{1}; 71 | input["has_mask_input"].resize(sizeof(float)); 72 | float has_mask_input = 0.0f; 73 | memcpy(input["has_mask_input"].data(), &has_mask_input, sizeof(float)); 74 | 75 | // 输出张量设置 76 | output["iou_predictions"] = IOTensor(); 77 | output["iou_predictions"].shape = std::vector{1, 4}; 78 | output["iou_predictions"].resize(sizeof(float) * 4); 79 | 80 | output["low_res_masks"] = IOTensor(); 81 | output["low_res_masks"].shape = std::vector{1, 4, 256, 256}; 82 | output["low_res_masks"].resize(4 * 256 * 256 * sizeof(float)); 83 | 84 | this->framework_->forward(input, output); 85 | 86 | low_res_mask = cv::Mat(256, 256, CV_32F, (float *)output.at("low_res_masks").data()); 87 | } -------------------------------------------------------------------------------- /model/sam/mask_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace sam { 5 | class MaskDecoder : public Model { 6 | public: 7 | MaskDecoder() = delete; 8 | MaskDecoder(const std::string &yaml_file); 9 | virtual ~MaskDecoder(); 10 | void forward(const IOTensor &features, const std::vector &image_point_coords, 11 | const std::vector &image_point_labels, cv::Mat &low_res_mask); 12 | 13 | private: 14 | std::vector features_shape; 15 | }; 16 | } -------------------------------------------------------------------------------- /model/sam/sam.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/sam.h" 2 | 3 | using namespace sam; 4 | 5 | SAM::SAM(const std::string& encoder_cfg, const std::string& decoder_cfg) { 6 | encoder_ = std::make_shared(encoder_cfg); 7 | decoder_ = std::make_shared(decoder_cfg); 8 | } 9 | 10 | void SAM::setImage(const cv::Mat &input_image) { 11 | cv::Mat mask; 12 | this->pparam_ = Letterbox(input_image, mask, encoder_->input_size()); 13 | 14 | encoder_->forward(mask, features_); 15 | } 16 | 17 | void SAM::predict(const std::vector &image_point_coords, const std::vector &image_point_labels, cv::Mat &output_mask) { 18 | auto &dw = this->pparam_.dw; 19 | auto &dh = this->pparam_.dh; 20 | auto &width = this->pparam_.width; 21 | auto &height = this->pparam_.height; 22 | auto input_w = encoder_->input_size().width; 23 | auto input_h = encoder_->input_size().height; 24 | int seg_w = 256, seg_h = 256; 25 | 26 | int scale_dw = dw / input_w * seg_w; 27 | int scale_dh = dh / input_h * seg_h; 28 | 29 | std::vector resize_image_point_coords; 30 | preprocessPoints(image_point_coords, resize_image_point_coords); 31 | 32 | cv::Mat low_res_mask; 33 | decoder_->forward(features_, resize_image_point_coords, image_point_labels, low_res_mask); 34 | 35 | cv::Rect roi(scale_dw, scale_dh, seg_w - 2 * scale_dw, seg_h - 2 * scale_dh); 36 | 37 | cv::Mat mask = low_res_mask(roi); 38 | mask = mask > 0.0f; 39 | cv::resize(mask, output_mask, cv::Size((int)width, (int)height), cv::INTER_LINEAR); 40 | } 41 | 42 | void SAM::preprocessPoints(const std::vector &input_points, std::vector &output_points) { 43 | auto &dw = this->pparam_.dw; 44 | auto &dh = this->pparam_.dh; 45 | auto &ratio = this->pparam_.ratio; 46 | 47 | output_points.clear(); 48 | for (const auto& point: input_points) { 49 | float x = point.x / ratio + dw; 50 | float y = point.y / ratio + dh; 51 | output_points.push_back(cv::Point2f(x,y)); 52 | } 53 | } -------------------------------------------------------------------------------- /model/sam/sam.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/sam/image_encoder.h" 3 | #include "model/sam/mask_decoder.h" 4 | 5 | namespace sam { 6 | 7 | class SAM { 8 | public: 9 | SAM() = delete; 10 | SAM(const std::string &encoder_cfg, const std::string &decoder_cfg); 11 | ~SAM(){}; 12 | void setImage(const cv::Mat &image); 13 | void predict(const std::vector &image_point_coords, const std::vector &image_point_labels, 14 | cv::Mat &output_mask); 15 | void preprocessPoints(const std::vector &input_points, std::vector &output_points); 16 | 17 | private: 18 | std::shared_ptr encoder_; 19 | std::shared_ptr decoder_; 20 | PreParam pparam_; 21 | IOTensor features_; 22 | }; 23 | } // namespace sam -------------------------------------------------------------------------------- /model/yolo/common.py: -------------------------------------------------------------------------------- 1 | # copyed from https://github.com/triple-Mu/YOLOv8-TensorRT 2 | 3 | from typing import Tuple 4 | import random 5 | import torch 6 | import torch.nn as nn 7 | from torch import Graph, Tensor, Value 8 | 9 | def make_anchors(feats: Tensor, 10 | strides: Tensor, 11 | grid_cell_offset: float = 0.5) -> Tuple[Tensor, Tensor]: 12 | anchor_points, stride_tensor = [], [] 13 | assert feats is not None 14 | dtype, device = feats[0].dtype, feats[0].device 15 | for i, stride in enumerate(strides): 16 | _, _, h, w = feats[i].shape 17 | sx = torch.arange(end=w, device=device, 18 | dtype=dtype) + grid_cell_offset # shift x 19 | sy = torch.arange(end=h, device=device, 20 | dtype=dtype) + grid_cell_offset # shift y 21 | sy, sx = torch.meshgrid(sy, sx) 22 | anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) 23 | stride_tensor.append( 24 | torch.full((h * w, 1), stride, dtype=dtype, device=device)) 25 | return torch.cat(anchor_points), torch.cat(stride_tensor) 26 | 27 | class C2f(nn.Module): 28 | 29 | def __init__(self, *args, **kwargs): 30 | super().__init__() 31 | 32 | def forward(self, x): 33 | x = self.cv1(x) 34 | x = [x, x[:, self.c:, ...]] 35 | x.extend(m(x[-1]) for m in self.m) 36 | x.pop(1) 37 | return self.cv2(torch.cat(x, 1)) -------------------------------------------------------------------------------- /model/yolo/test.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from ultralytics.cfg import entrypoint 4 | if __name__ == 'main': 5 | sys.argv[0] = re.sub(r'(-script.pyw|.exe)?$', '', sys.argv[0]) 6 | 7 | sys.exit(entrypoint()) -------------------------------------------------------------------------------- /model/yolo/yolo.cpp: -------------------------------------------------------------------------------- 1 | #include "model/yolo/yolo.h" 2 | #include 3 | 4 | YOLO::YOLO(const std::string &yaml_file) { 5 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 6 | 7 | std::string model_path = yaml_node["model_path"].as(); 8 | std::string framework_type = yaml_node["framework"].as(); 9 | if (!Init(model_path, framework_type)) exit(0); 10 | 11 | std::vector input_size = yaml_node["input_size"].as>(); 12 | m_input_size_.width = input_size.at(0); 13 | m_input_size_.height = input_size.at(1); 14 | topk_ = yaml_node["topk"].as(); 15 | 16 | with_nms_ = yaml_node["with_nms"].as(); 17 | if (!with_nms_) { 18 | m_conf_thres_ = yaml_node["conf_thres"].as(); 19 | m_nms_thres_ = yaml_node["nms_thres"].as(); 20 | m_grid_num_ = 0; 21 | for (int i = 0; i < 3; i++) 22 | { 23 | m_grid_num_ += (m_input_size_.width 
/ strides[i]) * (m_input_size_.height / strides[i]); 24 | } 25 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 26 | config_.output_len["output"] = m_grid_num_ * 6; 27 | config_.is_dynamic = false; 28 | } else { 29 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 30 | config_.output_len["num_dets"] = 1; 31 | config_.output_len["bboxes"] = 4 * topk_; 32 | config_.output_len["scores"] = topk_; 33 | config_.output_len["labels"] = topk_; 34 | } 35 | 36 | config_.is_dynamic = false; 37 | Status status = framework_->Init(config_); 38 | if (status != Status::SUCCESS) { 39 | std::cout << "Failed to init framework" << std::endl; 40 | exit(0); 41 | } 42 | } 43 | 44 | YOLO::~YOLO() 45 | { 46 | std::cout << "Destruct yolov8" << std::endl; 47 | } 48 | 49 | void YOLO::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 50 | cv::Mat mask; 51 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 52 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 53 | } 54 | 55 | void YOLO::detect(const cv::Mat &image, std::vector &objs) { 56 | std::unordered_map input, output; 57 | 58 | // 输入tensor设置 59 | cv::Mat nchw; 60 | preprocess(image, nchw); 61 | 62 | if (!with_nms_) { 63 | input["images"] = IOTensor(); 64 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 65 | input["images"].data_type = DataType::FP32; 66 | input["images"].resize(nchw.total() * nchw.elemSize()); 67 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 68 | 69 | // 输出张量设置 70 | output["output"] = IOTensor(); 71 | output["output"].shape = std::vector{1, m_grid_num_, 6}; 72 | output["output"].data_type = DataType::FP32; 73 | output["output"].resize(config_.output_len["output"] * sizeof(float)); 74 | } else { 75 | input["images"] = IOTensor(); 76 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 77 | input["images"].data_type = DataType::FP32; 78 | input["images"].resize(nchw.total() * nchw.elemSize()); 79 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 80 | 81 | // 输出张量设置 82 | output["num_dets"] = IOTensor(); 83 | output["num_dets"].shape = std::vector{1, 1}; 84 | output["num_dets"].data_type = DataType::INT32; 85 | output["num_dets"].resize(config_.output_len["num_dets"] * sizeof(int)); 86 | 87 | output["bboxes"] = IOTensor(); 88 | output["bboxes"].shape = std::vector{1, 100, 4}; 89 | output["bboxes"].data_type = DataType::FP32; 90 | output["bboxes"].resize(config_.output_len["bboxes"] * sizeof(float)); 91 | 92 | output["scores"] = IOTensor(); 93 | output["scores"].shape = std::vector{1, 100}; 94 | output["scores"].data_type = DataType::FP32; 95 | output["scores"].resize(config_.output_len["scores"] * sizeof(float)); 96 | 97 | output["labels"] = IOTensor(); 98 | output["labels"].shape = std::vector{1, 100}; 99 | output["labels"].data_type = DataType::INT32; 100 | output["labels"].resize(config_.output_len["labels"] * sizeof(int)); 101 | } 102 | 103 | this->framework_->forward(input, output); 104 | postprocess(output, objs); 105 | 106 | } 107 | 108 | void YOLO::postprocess(const std::unordered_map &output, std::vector &objs) { 109 | if (!with_nms_) { 110 | postprocess_with_nms(output, objs); 111 | } else { 112 | postprocess_without_nms(output, objs); 113 | } 114 | } 115 | 116 | void YOLO::postprocess_without_nms(const std::unordered_map &output, std::vector &objs) { 117 | 
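// Despite the name, this branch serves models whose exported graph already ends in an NMS stage
// (e.g. the EfficientNMS_TRT head added by the end-to-end export scripts), so the framework returns
// num_dets / bboxes / scores / labels directly; all that is left is to undo the letterbox
// (subtract dw/dh, rescale by ratio, clamp to the original image size).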
objs.clear(); 118 | int *const num_dets = (int*)(output.at("num_dets").data()); 119 | float *const boxes = (float *)(output.at("bboxes").data()); 120 | float *scores = (float *)(output.at("scores").data()); 121 | int *labels = (int*)(output.at("labels").data()); 122 | auto &dw = this->pparam_.dw; 123 | auto &dh = this->pparam_.dh; 124 | auto &width = this->pparam_.width; 125 | auto &height = this->pparam_.height; 126 | auto &ratio = this->pparam_.ratio; 127 | for (int i = 0; i < num_dets[0]; i++) 128 | { 129 | float *ptr = boxes + i * 4; 130 | 131 | float x0 = *ptr++ - dw; 132 | float y0 = *ptr++ - dh; 133 | float x1 = *ptr++ - dw; 134 | float y1 = *ptr - dh; 135 | 136 | x0 = clamp(x0 * ratio, 0.f, width); 137 | y0 = clamp(y0 * ratio, 0.f, height); 138 | x1 = clamp(x1 * ratio, 0.f, width); 139 | y1 = clamp(y1 * ratio, 0.f, height); 140 | Object obj; 141 | obj.rect.x = x0; 142 | obj.rect.y = y0; 143 | obj.rect.width = x1 - x0; 144 | obj.rect.height = y1 - y0; 145 | obj.prob = *(scores + i); 146 | obj.label = *(labels + i); 147 | objs.push_back(obj); 148 | } 149 | } 150 | 151 | void YOLO::postprocess_with_nms(const std::unordered_map &output, std::vector &objs) 152 | { 153 | objs.clear(); 154 | auto num_anchors = m_grid_num_; 155 | 156 | auto &dw = this->pparam_.dw; 157 | auto &dh = this->pparam_.dh; 158 | auto &width = this->pparam_.width; 159 | auto &height = this->pparam_.height; 160 | auto &ratio = this->pparam_.ratio; 161 | 162 | std::vector labels; 163 | std::vector scores; 164 | std::vector bboxes; 165 | std::vector indices; 166 | 167 | float * const outputs = (float *)output.at("output").data(); 168 | 169 | for (int i = 0; i < num_anchors; i++) 170 | { 171 | float *ptr = outputs + i * 6; 172 | float score = *(ptr + 4); 173 | if (score > m_conf_thres_) 174 | { 175 | float x0 = *ptr++ - dw; 176 | float y0 = *ptr++ - dh; 177 | float x1 = *ptr++ - dw; 178 | float y1 = *ptr++ - dh; 179 | 180 | x0 = clamp(x0 * ratio, 0.f, width); 181 | y0 = clamp(y0 * ratio, 0.f, height); 182 | x1 = clamp(x1 * ratio, 0.f, width); 183 | y1 = clamp(y1 * ratio, 0.f, height); 184 | 185 | int label = *(++ptr); 186 | labels.push_back(label); 187 | scores.push_back(score); 188 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 189 | } 190 | } 191 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 192 | 193 | int cnt = 0; 194 | for (auto &i : indices) 195 | { 196 | if (cnt >= topk_) 197 | { 198 | break; 199 | } 200 | cv::Rect tmp = bboxes[i]; 201 | Object obj; 202 | obj.label = labels[i]; 203 | obj.rect = tmp; 204 | obj.prob = scores[i]; 205 | objs.push_back(obj); 206 | cnt += 1; 207 | } 208 | } -------------------------------------------------------------------------------- /model/yolo/yolo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLO : public DetectionModel { 7 | public: 8 | YOLO() = delete; 9 | explicit YOLO(const std::string &yaml_file); 10 | ~YOLO(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | void postprocess_without_nms(const std::unordered_map &output, std::vector &objs); 18 | void postprocess_with_nms(const std::unordered_map &output, std::vector &objs); 19 | 20 | private: 21 | cv::Size m_input_size_ = {640, 640}; 22 | float 
m_conf_thres_ = 0.25f; 23 | float m_nms_thres_ = 0.65f; 24 | int topk_ = 100; 25 | int strides[3] = {8, 16, 32}; 26 | int m_grid_num_ = 8400; 27 | bool with_nms_ = false; 28 | PreParam pparam_; 29 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_cutoff.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOCutoff : public DetectionModel { 7 | public: 8 | YOLOCutoff() = delete; 9 | explicit YOLOCutoff(const std::string &yaml_file); 10 | ~YOLOCutoff(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | int decodeBoxes(const IOTensor &output1, const IOTensor &output2, const IOTensor &output3, 18 | int grid_h, int grid_w, int height, int width, int stride, int dfl_len, 19 | std::vector &boxes, std::vector &objProbs, std::vector &classId, 20 | float threshold); 21 | 22 | private: 23 | cv::Size m_input_size_ = {640, 640}; 24 | int m_class_num_ = 80; 25 | float m_conf_thres_ = 0.25f; 26 | float m_nms_thres_ = 0.65f; 27 | int topk_ = 100; 28 | int strides[3] = {8, 16, 32}; 29 | std::string framework_type_; 30 | 31 | PreParam pparam_; 32 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_pose.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | 3 | #include "model/yolo/yolo_pose.h" 4 | #include 5 | 6 | YOLOPose::YOLOPose(const std::string &yaml_file) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | m_conf_thres_ = yaml_node["conf_thres"].as(); 14 | m_nms_thres_ = yaml_node["nms_thres"].as(); 15 | 16 | std::vector input_size = yaml_node["input_size"].as>(); 17 | m_input_size_.width = input_size.at(0); 18 | m_input_size_.height = input_size.at(1); 19 | 20 | if (!Init(model_path, framework_type)) exit(0); 21 | 22 | m_grid_num_ = 0; 23 | for (int i = 0; i < 3; i++) 24 | { 25 | m_grid_num_ += (m_input_size_.width / strides[i]) * (m_input_size_.height / strides[i]); 26 | } 27 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 28 | config_.output_len["bboxes"] = m_grid_num_ * 4; 29 | config_.output_len["scores"] = m_grid_num_; 30 | config_.output_len["kps"] = m_grid_num_ * 51; 31 | config_.is_dynamic = false; 32 | Status status = framework_->Init(config_); 33 | if (status != Status::SUCCESS) { 34 | std::cout << "Failed to init framework" << std::endl; 35 | exit(0); 36 | } 37 | } 38 | 39 | YOLOPose::~YOLOPose() 40 | { 41 | std::cout << "Destruct yolov8" << std::endl; 42 | } 43 | 44 | void YOLOPose::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 45 | cv::Mat mask; 46 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 47 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 48 | } 49 | 50 | void YOLOPose::detect(const cv::Mat &image, std::vector &objs) 51 | { 52 | std::unordered_map input, output; 53 | 54 | // 输入tensor设置 55 | cv::Mat nchw; 56 | preprocess(image, nchw); 57 | 58 | input["images"] = IOTensor(); 59 | 
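// The pose model is exported with three output heads: bboxes [1, grid_num, 4], scores [1, grid_num, 1]
// and kps [1, grid_num, 51], where 51 = 17 keypoints x (x, y, score); the buffers below mirror that layout.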
input["images"].resize(nchw.total() * nchw.elemSize()); 60 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 61 | input["images"].data_type = DataType::FP32; 62 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 63 | 64 | 65 | // 输出张量设置 66 | output["bboxes"] = IOTensor(); 67 | output["bboxes"].shape = std::vector{1, m_grid_num_, 4}; 68 | output["bboxes"].data_type = DataType::FP32; 69 | output["bboxes"].resize(config_.output_len["bboxes"] * sizeof(float)); 70 | 71 | output["scores"] = IOTensor(); 72 | output["scores"].shape = std::vector{1, m_grid_num_, 1}; 73 | output["scores"].data_type = DataType::FP32; 74 | output["scores"].resize(config_.output_len["scores"] * sizeof(float)); 75 | 76 | output["kps"] = IOTensor(); 77 | output["kps"].shape = std::vector{1, m_grid_num_, 51}; 78 | output["kps"].data_type = DataType::FP32; 79 | output["kps"].resize(config_.output_len["kps"] * sizeof(float)); 80 | 81 | // start = std::chrono::system_clock::now(); 82 | this->framework_->forward(input, output); 83 | // end = std::chrono::system_clock::now(); 84 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 85 | // std::cout << "Inference costs " << tc << " ms" << std::endl; 86 | 87 | // start = std::chrono::system_clock::now(); 88 | postprocess(output, objs); 89 | // end = std::chrono::system_clock::now(); 90 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 91 | // std::cout << "Postprocess costs " << tc << " ms" << std::endl; 92 | } 93 | 94 | void YOLOPose::postprocess(const std::unordered_map &output, std::vector &objs) 95 | { 96 | objs.clear(); 97 | auto num_anchors = m_grid_num_; 98 | 99 | auto &dw = this->pparam_.dw; 100 | auto &dh = this->pparam_.dh; 101 | auto &width = this->pparam_.width; 102 | auto &height = this->pparam_.height; 103 | auto &ratio = this->pparam_.ratio; 104 | 105 | float *bbox_ptr = (float *)output.at("bboxes").data(); 106 | float *score_ptr = (float *)output.at("scores").data(); 107 | float *kps_ptr = (float *)output.at("kps").data(); 108 | 109 | std::vector bboxes; 110 | std::vector scores; 111 | std::vector labels; 112 | std::vector indices; 113 | std::vector> kpss; 114 | 115 | for (int i = 0; i < num_anchors; i++) 116 | { 117 | float score = *(score_ptr++); 118 | if (score > m_conf_thres_) 119 | { 120 | float x0 = *bbox_ptr++ - dw; 121 | float y0 = *bbox_ptr++ - dh; 122 | float x1 = *bbox_ptr++ - dw; 123 | float y1 = *bbox_ptr++ - dh; 124 | 125 | x0 = clamp(x0 * ratio, 0.f, width); 126 | y0 = clamp(y0 * ratio, 0.f, height); 127 | x1 = clamp(x1 * ratio, 0.f, width); 128 | y1 = clamp(y1 * ratio, 0.f, height); 129 | 130 | std::vector kps; 131 | for (int k = 0; k < 17; k++) { 132 | float kps_x = (*(kps_ptr + 3 * k) - dw) * ratio; 133 | float kps_y = (*(kps_ptr + 3 * k + 1) - dh) * ratio; 134 | float kps_s = *(kps_ptr + 3 * k + 2); 135 | kps_x = clamp(kps_x, 0.f, width); 136 | kps_y = clamp(kps_y, 0.f, height); 137 | kps.push_back(kps_x); 138 | kps.push_back(kps_y); 139 | kps.push_back(kps_s); 140 | } 141 | kps_ptr += 51; 142 | 143 | labels.push_back(0); 144 | scores.push_back(score); 145 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 146 | kpss.push_back(kps); 147 | } else { 148 | bbox_ptr += 4; 149 | kps_ptr += 51; 150 | } 151 | } 152 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 153 | 154 | int cnt = 0; 155 | for (auto& i : indices) { 156 | if (cnt >= topk) { 157 | break; 158 | } 159 | Object obj; 160 | obj.rect = bboxes[i]; 
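// obj.kps carries the 17 keypoints as (x, y, score) triplets, already mapped back to
// original-image coordinates above (presumably in COCO keypoint order).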
161 | obj.prob = scores[i]; 162 | obj.label = labels[i]; 163 | obj.kps = kpss[i]; 164 | objs.push_back(obj); 165 | cnt += 1; 166 | } 167 | } -------------------------------------------------------------------------------- /model/yolo/yolo_pose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOPose : public DetectionModel { 7 | public: 8 | YOLOPose() = delete; 9 | explicit YOLOPose(const std::string &yaml_file); 10 | ~YOLOPose(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | float m_conf_thres_ = 0.25f; 21 | float m_nms_thres_ = 0.65f; 22 | int topk = 100; 23 | int strides[3] = {8, 16, 32}; 24 | int m_grid_num_ = 8400; 25 | 26 | PreParam pparam_; 27 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_seg.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | 3 | #include "model/yolo/yolo_seg.h" 4 | #include 5 | 6 | YOLOSeg::YOLOSeg(const std::string &yaml_file) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | m_conf_thres_ = yaml_node["conf_thres"].as(); 14 | m_nms_thres_ = yaml_node["nms_thres"].as(); 15 | 16 | std::vector input_size = yaml_node["input_size"].as>(); 17 | m_input_size_.width = input_size.at(0); 18 | m_input_size_.height = input_size.at(1); 19 | 20 | std::vector seg_size = yaml_node["seg_size"].as>(); 21 | m_seg_size_.width = seg_size.at(0); 22 | m_seg_size_.height = seg_size.at(1); 23 | 24 | m_seg_channels_ = yaml_node["seg_channels"].as(); 25 | 26 | if (!Init(model_path, framework_type)) exit(0); 27 | 28 | m_grid_num_ = 0; 29 | for (int i = 0; i < 3; i++) 30 | { 31 | m_grid_num_ += (m_input_size_.width / strides[i]) * (m_input_size_.height / strides[i]); 32 | } 33 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 34 | config_.output_len["outputs"] = m_grid_num_ * (m_seg_channels_ + 6); 35 | config_.output_len["proto"] = m_seg_channels_ * m_seg_size_.height * m_seg_size_.width; 36 | config_.is_dynamic = false; 37 | Status status = framework_->Init(config_); 38 | if (status != Status::SUCCESS) { 39 | std::cout << "Failed to init framework" << std::endl; 40 | exit(0); 41 | } 42 | } 43 | 44 | YOLOSeg::~YOLOSeg() 45 | { 46 | std::cout << "Destruct yolov8" << std::endl; 47 | } 48 | 49 | void YOLOSeg::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 50 | cv::Mat mask; 51 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 52 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 53 | } 54 | 55 | void YOLOSeg::detect(const cv::Mat &image, std::vector &objs) 56 | { 57 | std::unordered_map input, output; 58 | 59 | // 输入tensor设置 60 | cv::Mat nchw; 61 | preprocess(image, nchw); 62 | 63 | input["images"] = IOTensor(); 64 | input["images"].resize(nchw.total() * nchw.elemSize()); 65 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 66 | input["images"].data_type = 
DataType::FP32; 67 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 68 | 69 | 70 | // 输出张量设置 71 | output["outputs"] = IOTensor(); 72 | output["outputs"].shape = std::vector{1, m_grid_num_, m_seg_channels_ + 6}; 73 | output["outputs"].data_type = DataType::FP32; 74 | output["outputs"].resize(config_.output_len["outputs"] * sizeof(float)); 75 | 76 | output["proto"] = IOTensor(); 77 | output["proto"].shape = std::vector{1, m_seg_channels_, m_seg_size_.height, m_seg_size_.width}; 78 | output["proto"].data_type = DataType::FP32; 79 | output["proto"].resize(config_.output_len["proto"] * sizeof(float)); 80 | 81 | // start = std::chrono::system_clock::now(); 82 | this->framework_->forward(input, output); 83 | // end = std::chrono::system_clock::now(); 84 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 85 | // std::cout << "Inference costs " << tc << " ms" << std::endl; 86 | 87 | // start = std::chrono::system_clock::now(); 88 | postprocess(output, objs); 89 | // end = std::chrono::system_clock::now(); 90 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 91 | // std::cout << "Postprocess costs " << tc << " ms" << std::endl; 92 | } 93 | 94 | void YOLOSeg::postprocess(const std::unordered_map &output, std::vector &objs) 95 | { 96 | objs.clear(); 97 | auto seg_h = m_seg_size_.height; 98 | auto seg_w = m_seg_size_.width; 99 | auto input_h = m_input_size_.height; 100 | auto input_w = m_input_size_.width; 101 | auto num_anchors = m_grid_num_; 102 | auto num_channels = m_num_channels_; 103 | 104 | auto &dw = this->pparam_.dw; 105 | auto &dh = this->pparam_.dh; 106 | auto &width = this->pparam_.width; 107 | auto &height = this->pparam_.height; 108 | auto &ratio = this->pparam_.ratio; 109 | 110 | float * const outputs = (float *)output.at("outputs").data(); 111 | cv::Mat protos = cv::Mat(m_seg_channels_, seg_h * seg_w, CV_32F, (float *)output.at("proto").data()); 112 | assert(!protos.empty()); 113 | 114 | std::vector labels; 115 | std::vector scores; 116 | std::vector bboxes; 117 | std::vector mask_confs; 118 | std::vector indices; 119 | 120 | for (int i = 0; i < num_anchors; i++) 121 | { 122 | float *ptr = outputs + i * num_channels; 123 | float score = *(ptr + 4); 124 | if (score > m_conf_thres_) 125 | { 126 | float x0 = *ptr++ - dw; 127 | float y0 = *ptr++ - dh; 128 | float x1 = *ptr++ - dw; 129 | float y1 = *ptr++ - dh; 130 | 131 | x0 = clamp(x0 * ratio, 0.f, width); 132 | y0 = clamp(y0 * ratio, 0.f, height); 133 | x1 = clamp(x1 * ratio, 0.f, width); 134 | y1 = clamp(y1 * ratio, 0.f, height); 135 | 136 | int label = *(++ptr); 137 | cv::Mat mask_conf = cv::Mat(1, m_seg_channels_, CV_32F, ++ptr); 138 | mask_confs.push_back(mask_conf); 139 | labels.push_back(label); 140 | scores.push_back(score); 141 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 142 | } 143 | } 144 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 145 | 146 | cv::Mat masks; 147 | int cnt = 0; 148 | for (auto &i : indices) 149 | { 150 | if (cnt >= topk) 151 | { 152 | break; 153 | } 154 | cv::Rect tmp = bboxes[i]; 155 | Object obj; 156 | obj.label = labels[i]; 157 | obj.rect = tmp; 158 | obj.prob = scores[i]; 159 | masks.push_back(mask_confs[i]); 160 | objs.push_back(obj); 161 | cnt += 1; 162 | } 163 | if (masks.empty()) 164 | { 165 | // masks is empty 166 | } 167 | else 168 | { 169 | cv::Mat matmulRes = (masks * protos).t(); 170 | cv::Mat maskMat = matmulRes.reshape(indices.size(), {seg_w, seg_h}); 171 | 172 | std::vector 
maskChannels; 173 | cv::split(maskMat, maskChannels); 174 | int scale_dw = dw / input_w * seg_w; 175 | int scale_dh = dh / input_h * seg_h; 176 | 177 | cv::Rect roi(scale_dw, scale_dh, seg_w - 2 * scale_dw, seg_h - 2 * scale_dh); 178 | 179 | for (long unsigned int i = 0; i < indices.size(); i++) 180 | { 181 | cv::Mat dest, mask; 182 | cv::exp(-maskChannels[i], dest); 183 | dest = 1.0 / (1.0 + dest); 184 | dest = dest(roi); 185 | // std::cout << dest.size() << " " << dest.size().empty() << std::endl; 186 | cv::resize(dest, mask, cv::Size((int)width, (int)height), cv::INTER_LINEAR); 187 | objs[i].boxMask = mask(objs[i].rect) > 0.5f; 188 | } 189 | } 190 | } -------------------------------------------------------------------------------- /model/yolo/yolo_seg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOSeg : public DetectionModel { 7 | public: 8 | YOLOSeg() = delete; 9 | explicit YOLOSeg(const std::string &yaml_file); 10 | ~YOLOSeg(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | cv::Size m_seg_size_ = {160, 160}; 21 | int m_seg_channels_ = 32; 22 | float m_conf_thres_ = 0.25f; 23 | float m_nms_thres_ = 0.65f; 24 | int topk = 100; 25 | int strides[3] = {8, 16, 32}; 26 | int m_grid_num_ = 8400; 27 | int m_num_channels_ = 38; 28 | 29 | PreParam pparam_; 30 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_seg_cutoff.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOSegCutoff : public DetectionModel { 7 | public: 8 | YOLOSegCutoff() = delete; 9 | explicit YOLOSegCutoff(const std::string &yaml_file); 10 | ~YOLOSegCutoff(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | int decodeBoxes(const IOTensor &output1, const IOTensor &output2, const IOTensor &output3, const IOTensor &output4, 18 | int grid_h, int grid_w, int height, int width, int stride, int dfl_len, 19 | std::vector &boxes, std::vector &segments, std::vector &objProbs, 20 | std::vector &classId, float threshold); 21 | void decodeMask(const IOTensor &input, cv::Mat &protos); 22 | 23 | private: 24 | cv::Size m_input_size_ = {640, 640}; 25 | cv::Size m_seg_size_ = {160, 160}; 26 | int m_seg_channels_ = 32; 27 | int m_class_num_ = 80; 28 | float m_conf_thres_ = 0.25f; 29 | float m_nms_thres_ = 0.65f; 30 | int topk_ = 100; 31 | int strides[3] = {8, 16, 32}; 32 | std::string framework_type_; 33 | 34 | PreParam pparam_; 35 | }; -------------------------------------------------------------------------------- /model/yolo/yolov8-pose-export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from io import BytesIO 3 | 4 | import onnx 5 | import torch 6 | from typing import Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import Graph, Tensor, Value 11 | from ultralytics 
import YOLO 12 | 13 | try: 14 | import onnxsim 15 | except ImportError: 16 | onnxsim = None 17 | 18 | class YOLOv8Pose(nn.Module): 19 | export = True 20 | shape = None 21 | dynamic = False 22 | 23 | def __init__(self, weights, device): 24 | super().__init__() 25 | self.device = device 26 | self.model = YOLO(weights).to(self.device).model.fuse().eval() 27 | self.convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], 28 | dtype=torch.float32, 29 | device=self.device) 30 | 31 | def forward(self, x): 32 | out, _ = self.model(x) 33 | boxes, scores, kps = out.split((4,1,51), 1) 34 | boxes = (boxes.transpose(1,2) @ self.convert_matrix) 35 | return boxes, scores.transpose(1,2), kps.transpose(1,2) 36 | 37 | def export(self, save_path, opset_version=11, sim=True): 38 | fake_input = torch.randn(1,3,640,640).to(self.device) 39 | for _ in range(2): 40 | self.forward(fake_input) 41 | with BytesIO() as f: 42 | torch.onnx.export( 43 | self, 44 | fake_input, 45 | f, 46 | opset_version=opset_version, 47 | input_names=['images'], 48 | output_names=['bboxes', 'scores', 'kps']) 49 | f.seek(0) 50 | onnx_model = onnx.load(f) 51 | onnx.checker.check_model(onnx_model) 52 | if sim: 53 | try: 54 | onnx_model, check = onnxsim.simplify(onnx_model) 55 | assert check, 'assert check failed' 56 | except Exception as e: 57 | print(f'Simplifier failure: {e}') 58 | onnx.save(onnx_model, save_path) 59 | print(f'ONNX export success, saved as {save_path}') 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('-w', 64 | '--weights', 65 | type=str, 66 | required=True, 67 | help='PyTorch weights') 68 | parser.add_argument('--opset', 69 | type=int, 70 | default=11, 71 | help='ONNX opset version') 72 | parser.add_argument('--sim', 73 | action='store_true', 74 | help='simplify onnx model') 75 | args = parser.parse_args() 76 | return args 77 | 78 | if __name__=='__main__': 79 | args = parse_args() 80 | model = YOLOv8Pose(args.weights, 'cpu') 81 | save_path = args.weights.replace('.pt', '.onnx') 82 | model.export(save_path, args.opset, args.sim) -------------------------------------------------------------------------------- /model/yolo/yolov8-seg-export.py: -------------------------------------------------------------------------------- 1 | ## copyed from https://github.com/triple-Mu/YOLOv8-TensorRT 2 | 3 | import argparse 4 | from io import BytesIO 5 | 6 | import onnx 7 | import torch 8 | from typing import Tuple 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch import Graph, Tensor, Value 13 | from ultralytics import YOLO 14 | 15 | from common import make_anchors, C2f 16 | 17 | try: 18 | import onnxsim 19 | except ImportError: 20 | onnxsim = None 21 | 22 | class PostSeg(nn.Module): 23 | export = True 24 | shape = None 25 | dynamic = False 26 | 27 | def __init__(self, *args, **kwargs): 28 | super().__init__() 29 | 30 | def forward(self, x): 31 | p = self.proto(x[0]) # mask protos 32 | bs = p.shape[0] # batch size 33 | mc = torch.cat( 34 | [self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 35 | 2) # mask coefficients 36 | boxes, scores, labels = self.forward_det(x) 37 | out = torch.cat([boxes, scores, labels.float(), mc.transpose(1, 2)], 2) 38 | return out, p.flatten(2) 39 | 40 | def forward_det(self, x): 41 | shape = x[0].shape 42 | b, res, b_reg_num = shape[0], [], self.reg_max * 4 43 | for i in range(self.nl): 44 | res.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)) 45 | if self.dynamic or self.shape != 
shape: 46 | self.anchors, self.strides = \ 47 | (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)) 48 | self.shape = shape 49 | x = [i.view(b, self.no, -1) for i in res] 50 | y = torch.cat(x, 2) 51 | boxes, scores = y[:, :b_reg_num, ...], y[:, b_reg_num:, ...].sigmoid() 52 | boxes = boxes.view(b, 4, self.reg_max, -1).permute(0, 1, 3, 2) 53 | boxes = boxes.softmax(-1) @ torch.arange(self.reg_max).to(boxes) 54 | boxes0, boxes1 = -boxes[:, :2, ...], boxes[:, 2:, ...] 55 | boxes = self.anchors.repeat(b, 2, 1) + torch.cat([boxes0, boxes1], 1) 56 | boxes = boxes * self.strides 57 | scores, labels = scores.transpose(1, 2).max(dim=-1, keepdim=True) 58 | return boxes.transpose(1, 2), scores, labels 59 | 60 | def parse_args(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('-w', 63 | '--weights', 64 | type=str, 65 | required=True, 66 | help='PyTorch yolov8 weights') 67 | parser.add_argument('--opset', 68 | type=int, 69 | default=11, 70 | help='ONNX opset version') 71 | parser.add_argument('--sim', 72 | action='store_true', 73 | help='simplify onnx model') 74 | parser.add_argument('--input-shape', 75 | nargs='+', 76 | type=int, 77 | default=[1, 3, 640, 640], 78 | help='Model input shape only for api builder') 79 | parser.add_argument('--device', 80 | type=str, 81 | default='cpu', 82 | help='Export ONNX device') 83 | args = parser.parse_args() 84 | assert len(args.input_shape) == 4 85 | return args 86 | 87 | 88 | def main(args): 89 | YOLOv8 = YOLO(args.weights) 90 | model = YOLOv8.model.fuse().eval() 91 | for m in model.modules(): 92 | s = str(type(m))[6:-2].split('.')[-1] 93 | if s == 'Segment': 94 | setattr(m, '__class__', PostSeg) 95 | elif s == 'C2f': 96 | setattr(m, '__class__', C2f) 97 | m.to(args.device) 98 | model.to(args.device) 99 | fake_input = torch.randn(args.input_shape).to(args.device) 100 | for _ in range(2): 101 | model(fake_input) 102 | save_path = args.weights.replace('.pt', '.onnx') 103 | with BytesIO() as f: 104 | torch.onnx.export(model, 105 | fake_input, 106 | f, 107 | opset_version=args.opset, 108 | input_names=['images'], 109 | output_names=['outputs', 'proto']) 110 | f.seek(0) 111 | onnx_model = onnx.load(f) 112 | onnx.checker.check_model(onnx_model) 113 | if args.sim: 114 | try: 115 | onnx_model, check = onnxsim.simplify(onnx_model) 116 | assert check, 'assert check failed' 117 | except Exception as e: 118 | print(f'Simplifier failure: {e}') 119 | onnx.save(onnx_model, save_path) 120 | print(f'ONNX export success, saved as {save_path}') 121 | 122 | 123 | if __name__ == '__main__': 124 | main(parse_args()) -------------------------------------------------------------------------------- /model/yolo/yolov9-det-export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from io import BytesIO 4 | from typing import Tuple 5 | 6 | import onnx 7 | import torch 8 | from onnx import TensorProto 9 | from ultralytics import YOLO 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch import Graph, Tensor, Value 15 | 16 | try: 17 | import onnxsim 18 | except ImportError: 19 | onnxsim = None 20 | 21 | from models.experimental import attempt_load 22 | 23 | class TRT_NMS(torch.autograd.Function): 24 | 25 | @staticmethod 26 | def forward( 27 | ctx: Graph, 28 | boxes: Tensor, 29 | scores: Tensor, 30 | iou_threshold: float = 0.65, 31 | score_threshold: float = 0.25, 32 | max_output_boxes: int = 100, 33 | background_class: int = -1, 34 | 
box_coding: int = 0, 35 | plugin_version: str = '1', 36 | score_activation: int = 0 37 | ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: 38 | batch_size, num_boxes, num_classes = scores.shape 39 | num_dets = torch.randint(0, 40 | max_output_boxes, (batch_size, 1), 41 | dtype=torch.int32) 42 | boxes = torch.randn(batch_size, max_output_boxes, 4) 43 | scores = torch.randn(batch_size, max_output_boxes) 44 | labels = torch.randint(0, 45 | num_classes, (batch_size, max_output_boxes), 46 | dtype=torch.int32) 47 | 48 | return num_dets, boxes, scores, labels 49 | 50 | @staticmethod 51 | def symbolic( 52 | g, 53 | boxes: Value, 54 | scores: Value, 55 | iou_threshold: float = 0.45, 56 | score_threshold: float = 0.25, 57 | max_output_boxes: int = 100, 58 | background_class: int = -1, 59 | box_coding: int = 0, 60 | score_activation: int = 0, 61 | plugin_version: str = '1') -> Tuple[Value, Value, Value, Value]: 62 | out = g.op('TRT::EfficientNMS_TRT', 63 | boxes, 64 | scores, 65 | iou_threshold_f=iou_threshold, 66 | score_threshold_f=score_threshold, 67 | max_output_boxes_i=max_output_boxes, 68 | background_class_i=background_class, 69 | box_coding_i=box_coding, 70 | plugin_version_s=plugin_version, 71 | score_activation_i=score_activation, 72 | outputs=4) 73 | nums_dets, boxes, scores, classes = out 74 | return nums_dets, boxes, scores, classes 75 | 76 | class ORT_NMS(torch.autograd.Function): 77 | '''ONNX-Runtime NMS operation''' 78 | @staticmethod 79 | def forward(ctx, 80 | boxes, 81 | scores, 82 | max_output_boxes_per_class=torch.tensor([100]), 83 | iou_threshold=torch.tensor([0.45]), 84 | score_threshold=torch.tensor([0.25])): 85 | device = boxes.device 86 | batch = scores.shape[0] 87 | num_det = random.randint(0, 100) 88 | batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device) 89 | idxs = torch.arange(100, 100 + num_det).to(device) 90 | zeros = torch.zeros((num_det,), dtype=torch.int64).to(device) 91 | selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], 0).T.contiguous() 92 | selected_indices = selected_indices.to(torch.int64) 93 | return selected_indices 94 | 95 | @staticmethod 96 | def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold): 97 | return g.op("NonMaxSuppression", boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold) 98 | 99 | class YOLOv9(nn.Module): 100 | export = True 101 | shape = None 102 | dynamic = True 103 | iou_thres = 0.65 104 | conf_thres = 0.25 105 | topk = 100 106 | use_trt_nms = False 107 | use_onnx_nms = False 108 | def __init__(self, weights, device='cpu'): 109 | super().__init__() 110 | self.device = device 111 | self.model = attempt_load(weights, device=self.device, inplace=True, fuse=True) 112 | self.convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], 113 | dtype=torch.float32, 114 | device=self.device) 115 | 116 | 117 | def forward(self, x): 118 | out, _ = self.model(x) 119 | bs = out.shape[0] # batch size 120 | nc = out.shape[1] - 4 # number of classes 121 | boxes, scores = out.split((4,nc), 1) 122 | boxes = (boxes.transpose(1,2) @ self.convert_matrix) 123 | 124 | if self.use_trt_nms: 125 | return TRT_NMS.apply(boxes, scores.transpose(1, 2), 126 | self.iou_thres, self.conf_thres, self.topk) 127 | elif self.use_onnx_nms: 128 | max_output_boxes_per_class = torch.tensor([self.topk]) 129 | iou_thres = torch.tensor([self.iou_thres]) 130 | conf_thres = torch.tensor([self.conf_thres]) 131 | num_selected_indices = ORT_NMS.apply(boxes, scores, 
max_output_boxes_per_class, iou_thres, conf_thres) 132 | 133 | scores = scores.transpose(1, 2) 134 | bbox_result = self.gather(boxes, num_selected_indices) 135 | score_intermediate_result = self.gather(scores, num_selected_indices).max(axis=-1) 136 | score_result = score_intermediate_result.values 137 | classes_result = score_intermediate_result.indices.to(torch.int32) 138 | num_dets = torch.tensor(score_result.shape[-1]).reshape([1,1]).to(torch.int32).clone().detach() 139 | 140 | return (num_dets, bbox_result, score_result, classes_result) 141 | else: 142 | scores, labels = scores.transpose(1, 2).max(dim=-1, keepdim=True) 143 | return torch.cat([boxes, scores, labels], dim=2) 144 | 145 | def gather(self, target, idx): 146 | pick_indices = idx[:, -1:].repeat(1, target.shape[2]).unsqueeze(0) 147 | return torch.gather(target, 1, pick_indices) 148 | 149 | def parse_args(): 150 | parser = argparse.ArgumentParser() 151 | parser.add_argument('-w', 152 | '--weights', 153 | type=str, 154 | required=True, 155 | help='PyTorch yolov8 weights') 156 | parser.add_argument('--trt-nms', 157 | action='store_true', 158 | required=False, 159 | help='Use TensorRT Efficient NMS plugins') 160 | parser.add_argument('--onnx-nms', 161 | action='store_true', 162 | required=False, 163 | help='Use onnx NMS ops') 164 | parser.add_argument('--iou-thres', 165 | type=float, 166 | default=0.65, 167 | help='IOU threshoud for NMS plugin') 168 | parser.add_argument('--conf-thres', 169 | type=float, 170 | default=0.25, 171 | help='CONF threshoud for NMS plugin') 172 | parser.add_argument('--topk', 173 | type=int, 174 | default=100, 175 | help='Max number of detection bboxes') 176 | parser.add_argument('--opset', 177 | type=int, 178 | default=11, 179 | help='ONNX opset version') 180 | parser.add_argument('--sim', 181 | action='store_true', 182 | help='simplify onnx model') 183 | parser.add_argument('--input-shape', 184 | nargs='+', 185 | type=int, 186 | default=[1, 3, 640, 640], 187 | help='Model input shape only for api builder') 188 | parser.add_argument('--device', 189 | type=str, 190 | default='cpu', 191 | help='Export ONNX device') 192 | args = parser.parse_args() 193 | assert len(args.input_shape) == 4 194 | YOLOv9.conf_thres = args.conf_thres 195 | YOLOv9.iou_thres = args.iou_thres 196 | YOLOv9.topk = args.topk 197 | YOLOv9.use_trt_nms = args.trt_nms 198 | YOLOv9.use_onnx_nms = args.onnx_nms 199 | return args 200 | 201 | 202 | def export_end2end(args): 203 | b = args.input_shape[0] 204 | model = YOLOv9(args.weights) 205 | model.to(args.device) 206 | fake_input = torch.randn(args.input_shape).to(args.device) 207 | for _ in range(2): 208 | model(fake_input) 209 | save_path = args.weights[:-3]+ '_end2end.onnx' 210 | with BytesIO() as f: 211 | torch.onnx.export( 212 | model, 213 | fake_input, 214 | f, 215 | opset_version=args.opset, 216 | input_names=['images'], 217 | output_names=['num_dets', 'bboxes', 'scores', 'labels']) 218 | f.seek(0) 219 | onnx_model = onnx.load(f) 220 | onnx.checker.check_model(onnx_model) 221 | shapes = [b, 1, b, args.topk, 4, b, args.topk, b, args.topk] 222 | for i in onnx_model.graph.output: 223 | for j in i.type.tensor_type.shape.dim: 224 | j.dim_param = str(shapes.pop(0)) 225 | if args.sim: 226 | try: 227 | onnx_model, check = onnxsim.simplify(onnx_model) 228 | assert check, 'assert check failed' 229 | except Exception as e: 230 | print(f'Simplifier failure: {e}') 231 | onnx.save(onnx_model, save_path) 232 | print(f'ONNX export success, saved as {save_path}') 233 | 234 | def export_normal(args): 
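# Without --trt-nms / --onnx-nms the exported graph contains no NMS op: it emits a single
# [1, num_anchors, 6] tensor (x0, y0, x1, y1, score, label) that the C++ side decodes in
# YOLO::postprocess_with_nms before running cv::dnn::NMSBoxes.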
235 | b = args.input_shape[0] 236 | model = YOLOv9(args.weights) 237 | model.to(args.device) 238 | fake_input = torch.randn(args.input_shape).to(args.device) 239 | for _ in range(2): 240 | model(fake_input) 241 | # save_path = args.weights.replace('.pt', '.onnx') 242 | save_path = args.weights[:-3] + '_normal.onnx' 243 | with BytesIO() as f: 244 | torch.onnx.export( 245 | model, 246 | fake_input, 247 | f, 248 | opset_version=args.opset, 249 | input_names=['images'], 250 | output_names=['output']) 251 | f.seek(0) 252 | onnx_model = onnx.load(f) 253 | onnx.checker.check_model(onnx_model) 254 | 255 | if args.sim: 256 | try: 257 | onnx_model, check = onnxsim.simplify(onnx_model) 258 | assert check, 'assert check failed' 259 | except Exception as e: 260 | print(f'Simplifier failure: {e}') 261 | onnx.save(onnx_model, save_path) 262 | print(f'ONNX export success, saved as {save_path}') 263 | 264 | def main(args): 265 | if args.trt_nms or args.onnx_nms: 266 | export_end2end(args) 267 | else: 268 | export_normal(args) 269 | 270 | if __name__=='__main__': 271 | main(parse_args()) -------------------------------------------------------------------------------- /output/dbnet/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/dbnet/01.png -------------------------------------------------------------------------------- /output/dbnet/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/dbnet/02.png -------------------------------------------------------------------------------- /output/sam/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/sam/dogs.jpg -------------------------------------------------------------------------------- /output/yolo/detect/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/detect/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/detect/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/bus.jpg -------------------------------------------------------------------------------- /output/yolo/detect/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/zidane.jpg -------------------------------------------------------------------------------- 
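For reference, the four outputs produced by export_end2end in model/yolo/yolov9-det-export.py can be consumed from onnxruntime without any further post-processing, since NMS is already embedded in the graph. A minimal sketch, assuming onnxruntime is installed, the model was exported as yolov9-c_end2end.onnx (hypothetical file name), and blob is an image already preprocessed to the default 1x3x640x640 float32 input:

import numpy as np
import onnxruntime as ort

# Hypothetical file name; use the *_end2end.onnx file written by the export script.
session = ort.InferenceSession("yolov9-c_end2end.onnx", providers=["CPUExecutionProvider"])
blob = np.zeros((1, 3, 640, 640), dtype=np.float32)  # placeholder; substitute a real preprocessed image
# Output names and shapes follow the export script: num_dets (1,1), bboxes (1,topk,4), scores (1,topk), labels (1,topk)
num_dets, bboxes, scores, labels = session.run(["num_dets", "bboxes", "scores", "labels"], {"images": blob})
for i in range(int(num_dets[0, 0])):
    print(bboxes[0, i], scores[0, i], labels[0, i])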
/output/yolo/pose/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/pose/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/pose/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/bus.jpg -------------------------------------------------------------------------------- /output/yolo/pose/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/zidane.jpg -------------------------------------------------------------------------------- /output/yolo/segment/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/segment/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/segment/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/bus.jpg -------------------------------------------------------------------------------- /output/yolo/segment/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/zidane.jpg -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(yolo_test ${CMAKE_CURRENT_SOURCE_DIR}/yolo_test.cpp) 2 | target_include_directories(yolo_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 3 | target_link_libraries(yolo_test PUBLIC yolo_det yolo_seg yolo_pose yolo_det_cutoff yolo_seg_cutoff) 4 | target_link_directories(yolo_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 5 | 6 | add_executable(ocr_test ${CMAKE_CURRENT_SOURCE_DIR}/ocr_test.cpp) 7 | target_include_directories(ocr_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 8 | 
target_link_libraries(ocr_test PUBLIC ctc attn dbnet) 9 | target_link_directories(ocr_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 10 | 11 | add_executable(sam_test ${CMAKE_CURRENT_SOURCE_DIR}/sam_test.cpp) 12 | target_include_directories(sam_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 13 | target_link_libraries(sam_test PUBLIC sam) 14 | target_link_directories(sam_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 15 | 16 | add_executable(clip_test ${CMAKE_CURRENT_SOURCE_DIR}/clip_test.cpp) 17 | target_include_directories(clip_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 18 | target_link_libraries(clip_test PUBLIC clip) 19 | target_link_directories(clip_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 20 | 21 | # add_executable(test ${CMAKE_CURRENT_SOURCE_DIR}/test.cpp) 22 | # target_include_directories(test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}) 23 | # target_link_libraries(test ${ONNXRUNTIME_LIBS}) 24 | # target_link_directories(test PUBLIC ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /test/clip_test.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/text_tokenizer.h" 2 | #include "model/clip/image_encoder.h" 3 | #include "model/clip/text_encoder.h" 4 | #include "model/clip/clip.h" 5 | 6 | void ModuleTest() { 7 | clip::TextTokenizer tokenizer("/home/stardust/my_work/model-zoo-cxx/weights/clip/bpe_simple_vocab_16e6.txt.gz"); 8 | std::vector<int> tokens = tokenizer.tokenize("a photo of a woman"); 9 | for(int token : tokens) { 10 | std::cout << token << ","; 11 | } 12 | std::cout << std::endl; 13 | 14 | std::string current_path = "../"; 15 | std::string image_encoder_cfg = current_path + "config/clip/image_encoder.yaml"; 16 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 17 | 18 | clip::ImageEncoder image_encoder(image_encoder_cfg); 19 | 20 | std::vector<cv::Mat> images; 21 | images.push_back(cv::imread("../test/image/clip/franz-kafka.jpg")); 22 | 23 | IOTensor image_embeddings; 24 | image_encoder.forward(images, image_embeddings); 25 | std::cout << "Shape of image_embeddings: ["; 26 | for (int64_t i : image_embeddings.shape) { 27 | std::cout << i << ","; 28 | } 29 | std::cout << "]" << std::endl; 30 | 31 | float* ptr = (float*)image_embeddings.data(); 32 | float min_val = FLT_MAX, max_val = -FLT_MAX; 33 | for (size_t i = 0; i < image_embeddings.size() / 4; i++) { 34 | float val = *ptr++; 35 | if (val > max_val) { 36 | max_val = val; 37 | } 38 | if (val < min_val) { 39 | min_val = val; 40 | } 41 | } 42 | std::cout << "Range of image_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 43 | 44 | clip::TextEncoder text_encoder(text_encoder_cfg); 45 | 46 | std::vector<std::string> texts{"a photo of a man", "a photo of a woman"}; 47 | 48 | IOTensor text_embeddings; 49 | text_encoder.forward(texts, text_embeddings); 50 | std::cout << "Shape of text_embeddings: ["; 51 | for (int64_t i : text_embeddings.shape) { 52 | std::cout << i << ","; 53 | } 54 | std::cout << "]" << std::endl; 55 | 56 | ptr = (float*)text_embeddings.data(); 57 | min_val = FLT_MAX; 58 | max_val = -FLT_MAX; 59 | for (size_t i = 0; i < text_embeddings.size() / 4; i++) { 60 | float val = *ptr++; 61 | if (val > max_val) { 62 | max_val = val; 63 | } 64 | if (val < min_val) { 65 | min_val = val; 66 | } 67 | } 68 | std::cout << "Range of text_embeddings: [" << min_val << 
"," << max_val << "]" << std::endl; 69 | 70 | std::vector norm_image_embeddings; 71 | 72 | ptr = (float*)image_embeddings.data(); 73 | for (size_t i = 0; i < images.size(); i++) { 74 | float norm = 0.0; 75 | for (size_t j = 0; j < 512; j++) { 76 | norm += std::pow(*(ptr+j), 2); 77 | } 78 | norm = std::sqrt(norm); 79 | 80 | for (size_t j = 0; j < 512; j++) { 81 | *ptr = *ptr / norm; 82 | ++ptr; 83 | } 84 | } 85 | 86 | ptr = (float*)text_embeddings.data(); 87 | for (size_t i = 0; i < texts.size(); i++) { 88 | float norm = 0.0; 89 | for (size_t j = 0; j < 512; j++) { 90 | norm += std::pow(*(ptr+j), 2); 91 | } 92 | norm = std::sqrt(norm); 93 | 94 | for (size_t j = 0; j < 512; j++) { 95 | *ptr = *ptr / norm; 96 | ++ptr; 97 | } 98 | } 99 | 100 | ptr = (float*)image_embeddings.data(); 101 | min_val = FLT_MAX; 102 | max_val = -FLT_MAX; 103 | for (size_t i = 0; i < image_embeddings.size() / 4; i++) { 104 | float val = *ptr++; 105 | if (val > max_val) { 106 | max_val = val; 107 | } 108 | if (val < min_val) { 109 | min_val = val; 110 | } 111 | } 112 | std::cout << "After normalization, range of image_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 113 | 114 | ptr = (float*)text_embeddings.data(); 115 | min_val = FLT_MAX; 116 | max_val = -FLT_MAX; 117 | for (size_t i = 0; i < text_embeddings.size() / 4; i++) { 118 | float val = *ptr++; 119 | if (val > max_val) { 120 | max_val = val; 121 | } 122 | if (val < min_val) { 123 | min_val = val; 124 | } 125 | } 126 | std::cout << "After normalization, range of text_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 127 | 128 | cv::Mat image_matrix(images.size(), 512, CV_32F, image_embeddings.data()); 129 | cv::Mat text_matrix(texts.size(), 512, CV_32F, text_embeddings.data()); 130 | cv::Mat result; 131 | cv::gemm(image_matrix, text_matrix.t(), 100, cv::Mat(), 0.0, result); 132 | std::cout << result << std::endl; 133 | } 134 | 135 | void GetTextEmbeddings() { 136 | std::string current_path = "../"; 137 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 138 | clip::TextEncoder text_encoder(text_encoder_cfg); 139 | 140 | std::string prompt_path = current_path + "config/clip/prompts.txt"; 141 | std::ifstream file(prompt_path); 142 | std::vector texts; 143 | 144 | if (file.is_open()) { 145 | std::string line; 146 | while (std::getline(file, line)) { 147 | texts.push_back(line); // 逐行读取文件内容并存储到 vector 中 148 | } 149 | file.close(); // 关闭文件 150 | } else { 151 | std::cout << "无法打开文件" << std::endl; 152 | } 153 | 154 | std::cout << "Prompts: "; 155 | for (const auto& l : texts) { 156 | std::cout << l << ", "; 157 | } 158 | std::cout << std::endl; 159 | 160 | IOTensor text_embeddings; 161 | text_encoder.forward(texts, text_embeddings); 162 | std::cout << "Shape of image text_embeddings: ["; 163 | for (int64_t i : text_embeddings.shape) { 164 | std::cout << i << ","; 165 | } 166 | std::cout << "]" << std::endl; 167 | 168 | 169 | std::string fname = current_path + "weights/clip/text_embeddings.bin"; 170 | std::ofstream fout(fname.c_str(), std::ios::binary | std::ios::out); 171 | fout.write((char *)text_embeddings.data(), text_embeddings.size()); 172 | fout.close(); 173 | } 174 | 175 | void PipeLineTest() { 176 | std::string current_path = "../"; 177 | std::string image_encoder_cfg = current_path + "config/clip/image_encoder.yaml"; 178 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 179 | 180 | clip::Clip clip_model(image_encoder_cfg, text_encoder_cfg); 181 | 182 | std::vector 
images; 183 | images.push_back(cv::imread("../test/image/clip/franz-kafka.jpg")); 184 | images.push_back(cv::imread("../test/image/clip/Mona_Lisa.jpg")); 185 | clip_model.encodeImages(images); 186 | 187 | std::vector<std::string> texts{"a photo of a man", "a photo of a woman"}; 188 | clip_model.encodeTexts(texts); 189 | 190 | std::vector<std::vector<float>> probs = clip_model.computeProbabilities(); 191 | 192 | std::cout << "[ "; 193 | for (size_t i = 0; i < probs.size(); i++) { 194 | std::cout << "[ "; 195 | for (size_t j = 0; j < probs[0].size(); j++) { 196 | std::cout << probs[i][j] << " "; 197 | } 198 | std::cout << " ], "; 199 | } 200 | std::cout << " ]" << std::endl; 201 | } 202 | 203 | int main(int argc, char** argv) { 204 | if (argc == 2 && std::string(argv[1]) == "-g") { 205 | GetTextEmbeddings(); 206 | } 207 | PipeLineTest(); 208 | } -------------------------------------------------------------------------------- /test/image/clip/Mona_Lisa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/clip/Mona_Lisa.jpg -------------------------------------------------------------------------------- /test/image/clip/franz-kafka.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/clip/franz-kafka.jpg -------------------------------------------------------------------------------- /test/image/detect/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /test/image/detect/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /test/image/detect/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/bus.jpg -------------------------------------------------------------------------------- /test/image/detect/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/zidane.jpg -------------------------------------------------------------------------------- /test/image/ocr/det/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/det/01.png -------------------------------------------------------------------------------- /test/image/ocr/det/02.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/det/02.png -------------------------------------------------------------------------------- /test/image/ocr/rec/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/rec/demo.png -------------------------------------------------------------------------------- /test/image/sam/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/sam/dogs.jpg -------------------------------------------------------------------------------- /test/ocr_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/ocr/ctc.h" 5 | #include "model/ocr/attention.h" 6 | #include "model/ocr/dbnet.h" 7 | 8 | void CtcModelTest() { 9 | std::string current_path = "../"; 10 | std::string yaml_file = current_path + "config/ocr/rec/ctc.yaml"; 11 | 12 | CtcModel model(yaml_file); 13 | 14 | std::vector<std::string> imagePathList; 15 | std::string input_path = current_path + "test/image/ocr/rec"; 16 | cv::glob(input_path + "/*.png", imagePathList); 17 | 18 | cv::Mat image, res; 19 | 20 | for (auto& path : imagePathList) { 21 | auto start = std::chrono::system_clock::now(); 22 | image = cv::imread(path, 0); 23 | std::string output = model.detect(image); 24 | std::cout << path << ": " << output << std::endl; 25 | auto end = std::chrono::system_clock::now(); 26 | auto tc = (double)std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000.; 27 | printf("cost %2.4lf ms\n", tc); 28 | } 29 | } 30 | 31 | void AttnModelTest() { 32 | std::string current_path = "../"; 33 | std::string yaml_file = current_path + "config/ocr/rec/attn.yaml"; 34 | 35 | AttnModel model(yaml_file); 36 | 37 | std::vector<std::string> imagePathList; 38 | std::string input_path = current_path + "test/image/ocr/rec"; 39 | cv::glob(input_path + "/*.png", imagePathList); 40 | 41 | cv::Mat image, res; 42 | 43 | for (auto& path : imagePathList) { 44 | auto start = std::chrono::system_clock::now(); 45 | image = cv::imread(path); 46 | std::string output = model.detect(image); 47 | std::cout << path << ": " << output << std::endl; 48 | auto end = std::chrono::system_clock::now(); 49 | auto tc = (double)std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000.; 50 | printf("cost %2.4lf ms\n", tc); 51 | } 52 | } 53 | 54 | void DBNetTest() { 55 | std::string current_path = "../"; 56 | std::string yaml_file = current_path + "config/ocr/det/dbnet.yaml"; 57 | 58 | DBNet model(yaml_file); 59 | 60 | std::vector<std::string> imagePathList; 61 | std::string input_path = current_path + "test/image/ocr/det"; 62 | std::string output_path = current_path + "output/dbnet"; 63 | cv::glob(input_path + "/*.png", imagePathList); 64 | 65 | cv::Mat image, input_image, res; 66 | std::vector objs; 67 | 68 | for (auto& path : imagePathList) { 69 | objs.clear(); 70 | std::cout << path << std::endl; 71 | image = cv::imread(path); 72 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 73 | model.detect(input_image, objs); 74 | DrawBoxes(image, res, objs); 75 | 76 | std::string::size_type iPos = path.find_last_of('/') + 1; 77 | std::string filename = path.substr(iPos, 
path.length() - iPos); 78 | std::string out_path = output_path + "/" + filename; 79 | // cv::imshow("image", res); 80 | // cv::waitKey(0); 81 | cv::imwrite(out_path, res); 82 | } 83 | } 84 | 85 | int main() { 86 | // CtcModelTest(); 87 | // AttnModelTest(); 88 | DBNetTest(); 89 | } -------------------------------------------------------------------------------- /test/sam_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/sam/sam.h" 5 | 6 | int main() { 7 | std::string current_path = "../"; 8 | std::string encoder_cfg = current_path + "config/sam/image_encoder.yaml"; 9 | std::string decoder_cfg = current_path + "config/sam/mask_decoder.yaml"; 10 | 11 | sam::SAM sam_model(encoder_cfg, decoder_cfg); 12 | 13 | cv::Mat image, input_image; 14 | 15 | image = cv::imread("../test/image/sam/dogs.jpg"); 16 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 17 | 18 | sam_model.setImage(input_image); 19 | 20 | std::vector<cv::Point2f> points; 21 | points.push_back(cv::Point2f(100, 100)); 22 | points.push_back(cv::Point2f(850, 759)); 23 | std::vector labels{2, 3}; 24 | 25 | cv::Mat output_mask; 26 | sam_model.predict(points, labels, output_mask); 27 | 28 | cv::Mat res = image.clone(); 29 | cv::Mat mask = image.clone(); 30 | 31 | cv::rectangle(res, cv::Rect(100, 100, 750, 659), {0, 0, 255}, 2); 32 | mask.setTo(cv::Scalar(255, 56, 56), output_mask); 33 | cv::addWeighted(res, 0.5, mask, 0.8, 1, res); 34 | cv::imwrite("../output/sam/dogs.jpg", res); 35 | } -------------------------------------------------------------------------------- /test/test.cpp: -------------------------------------------------------------------------------- 1 | #include "onnxruntime_cxx_api.h" 2 | 3 | int main() { 4 | // Allocate ONNXRuntime session 5 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); 6 | Ort::Env env; 7 | Ort::Session session{env, ORT_TSTR("../weights/ocr/best-train-abinet.onnx"), Ort::SessionOptions{nullptr}}; 8 | 9 | // Allocate model inputs: fill in shape and size 10 | std::array<float, 1 * 3 * 32 * 128> input; 11 | std::array<int64_t, 4> input_shape{1, 3, 32, 128}; 12 | Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input.data(), input.size(), input_shape.data(), input_shape.size()); 13 | const char* input_names[] = {"images"}; 14 | 15 | // Allocate model outputs: fill in shape and size 16 | std::array<float, 1 * 26 * 1> output; 17 | std::array<int64_t, 3> output_shape{1, 26, 1}; 18 | Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory_info, output.data(), output.size(), output_shape.data(), output_shape.size()); 19 | const char* output_names[] = {"output"}; 20 | 21 | // Run the model 22 | session.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor, 1, output_names, &output_tensor, 1); 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /test/yolo_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/yolo/yolo_seg.h" 5 | #include "model/yolo/yolo_pose.h" 6 | #include "model/yolo/yolo.h" 7 | #include "model/yolo/yolo_seg_cutoff.h" 8 | #include "model/yolo/yolo_cutoff.h" 9 | 10 | void YOLODetTest() { 11 | std::string current_path = "../"; 12 | std::string yaml_file = current_path + "config/yolo/yolo.yaml"; 13 | 14 | YOLO model(yaml_file); 15 | 16 | std::vector<std::string> imagePathList; 17 | std::string input_path = current_path 
+ "test/image/detect"; 18 | std::string output_path = current_path + "output/yolo/detect"; 19 | cv::glob(input_path + "/*.jpg", imagePathList); 20 | 21 | cv::Mat image, input_image, res; 22 | std::vector objs; 23 | 24 | std::vector class_names; 25 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 26 | 27 | for (auto& path : imagePathList) { 28 | objs.clear(); 29 | std::cout << path << std::endl; 30 | image = cv::imread(path); 31 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 32 | model.detect(input_image, objs); 33 | DrawObjects(image, res, objs, class_names, COLORS); 34 | 35 | std::string::size_type iPos = path.find_last_of('/') + 1; 36 | std::string filename = path.substr(iPos, path.length() - iPos); 37 | std::string out_path = output_path + "/" + filename; 38 | // cv::imshow("image", res); 39 | // cv::waitKey(0); 40 | cv::imwrite(out_path, res); 41 | } 42 | } 43 | 44 | void YOLODetCutoffTest() { 45 | std::string current_path = "../"; 46 | std::string yaml_file = current_path + "config/yolo/yolo_cutoff.yaml"; 47 | 48 | YOLOCutoff model(yaml_file); 49 | 50 | std::vector imagePathList; 51 | std::string input_path = current_path + "test/image/detect"; 52 | std::string output_path = current_path + "output/yolo/detect"; 53 | cv::glob(input_path + "/*.jpg", imagePathList); 54 | 55 | cv::Mat image, input_image, res; 56 | std::vector objs; 57 | 58 | std::vector class_names; 59 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 60 | 61 | for (auto& path : imagePathList) { 62 | objs.clear(); 63 | std::cout << path << std::endl; 64 | image = cv::imread(path); 65 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 66 | 67 | auto start = std::chrono::system_clock::now(); 68 | model.detect(input_image, objs); 69 | auto end = std::chrono::system_clock::now(); 70 | std::cout << "Costs: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 71 | 72 | DrawObjects(image, res, objs, class_names, COLORS); 73 | 74 | std::string::size_type iPos = path.find_last_of('/') + 1; 75 | std::string filename = path.substr(iPos, path.length() - iPos); 76 | std::string out_path = output_path + "/" + filename; 77 | // cv::imshow("image", res); 78 | // cv::waitKey(0); 79 | cv::imwrite(out_path, res); 80 | } 81 | } 82 | 83 | void YOLOSegTest() { 84 | std::string current_path = "../"; 85 | std::string yaml_file = current_path + "config/yolo/yolo_seg.yaml"; 86 | 87 | YOLOSeg model(yaml_file); 88 | 89 | std::vector imagePathList; 90 | std::string input_path = current_path + "test/image/detect"; 91 | std::string output_path = current_path + "output/yolo/segment"; 92 | cv::glob(input_path + "/*.jpg", imagePathList); 93 | 94 | cv::Mat image, input_image, res; 95 | std::vector objs; 96 | 97 | std::vector class_names; 98 | ReadClassNames(current_path + "config/yolo/stardust.txt", class_names); 99 | 100 | for (auto& path : imagePathList) { 101 | objs.clear(); 102 | std::cout << path << std::endl; 103 | image = cv::imread(path); 104 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 105 | 106 | auto start = std::chrono::system_clock::now(); 107 | model.detect(input_image, objs); 108 | auto end = std::chrono::system_clock::now(); 109 | std::cout << "Costs: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 110 | 111 | DrawObjectsMasks(image, res, objs, class_names, COLORS, MASK_COLORS); 112 | 113 | std::string::size_type iPos = path.find_last_of('/') + 1; 114 | std::string filename = path.substr(iPos, path.length() - iPos); 115 | 
std::string out_path = output_path + "/" + filename; 116 | // cv::imshow("image", res); 117 | // cv::waitKey(0); 118 | cv::imwrite(out_path, res); 119 | } 120 | } 121 | 122 | void YOLOSegCutoffTest() { 123 | std::string current_path = "../"; 124 | std::string yaml_file = current_path + "config/yolo/yolo_seg_cutoff.yaml"; 125 | 126 | YOLOSegCutoff model(yaml_file); 127 | 128 | std::vector<std::string> imagePathList; 129 | std::string input_path = current_path + "test/image/detect"; 130 | std::string output_path = current_path + "output/yolo/segment"; 131 | cv::glob(input_path + "/*.jpg", imagePathList); 132 | 133 | cv::Mat image, input_image, res; 134 | std::vector objs; 135 | 136 | std::vector<std::string> class_names; 137 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 138 | 139 | for (auto& path : imagePathList) { 140 | objs.clear(); 141 | std::cout << path << std::endl; 142 | image = cv::imread(path); 143 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 144 | 145 | auto start = std::chrono::system_clock::now(); 146 | model.detect(input_image, objs); 147 | auto end = std::chrono::system_clock::now(); 148 | std::cout << "Costs: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; 149 | 150 | DrawObjectsMasks(image, res, objs, class_names, COLORS, MASK_COLORS); 151 | 152 | std::string::size_type iPos = path.find_last_of('/') + 1; 153 | std::string filename = path.substr(iPos, path.length() - iPos); 154 | std::string out_path = output_path + "/" + filename; 155 | // cv::imshow("image", res); 156 | // cv::waitKey(0); 157 | cv::imwrite(out_path, res); 158 | } 159 | } 160 | 161 | void YOLOPoseTest() { 162 | std::string current_path = "../"; 163 | std::string yaml_file = current_path + "config/yolo/yolo_pose.yaml"; 164 | 165 | YOLOPose model(yaml_file); 166 | 167 | std::vector<std::string> imagePathList; 168 | std::string input_path = current_path + "test/image/detect"; 169 | std::string output_path = current_path + "output/yolo/pose"; 170 | cv::glob(input_path + "/*.jpg", imagePathList); 171 | 172 | cv::Mat image, input_image, res; 173 | std::vector objs; 174 | 175 | std::vector<std::string> class_names; 176 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 177 | 178 | for (auto& path : imagePathList) { 179 | objs.clear(); 180 | std::cout << path << std::endl; 181 | image = cv::imread(path); 182 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 183 | model.detect(input_image, objs); 184 | DrawObjectsKps(image, res, objs, SKELETON, KPS_COLORS, LIMB_COLORS); 185 | 186 | std::string::size_type iPos = path.find_last_of('/') + 1; 187 | std::string filename = path.substr(iPos, path.length() - iPos); 188 | std::string out_path = output_path + "/" + filename; 189 | // cv::imshow("image", res); 190 | // cv::waitKey(0); 191 | cv::imwrite(out_path, res); 192 | } 193 | } 194 | 195 | int main() { 196 | // YOLODetTest(); 197 | YOLOSegTest(); 198 | // YOLOPoseTest(); 199 | // YOLODetCutoffTest(); 200 | // YOLOSegCutoffTest(); 201 | } 202 | --------------------------------------------------------------------------------