├── .gitignore ├── CMakeLists.txt ├── README.md ├── common ├── common.cpp ├── common.h └── config.h.in ├── config ├── clip │ ├── image_encoder.yaml │ ├── prompts.txt │ └── text_encoder.yaml ├── ocr │ ├── det │ │ └── dbnet.yaml │ └── rec │ │ ├── attn.yaml │ │ ├── charset_36.txt │ │ └── ctc.yaml ├── sam │ ├── image_encoder.yaml │ └── mask_decoder.yaml └── yolo │ ├── coco.txt │ ├── yolo.yaml │ ├── yolo_cutoff.yaml │ ├── yolo_pose.yaml │ ├── yolo_seg.yaml │ └── yolo_seg_cutoff.yaml ├── doc ├── environment │ ├── cuda-on-linux.md │ ├── cuda-on-wsl.md │ └── onnxruntime.md └── model │ ├── abinet.md │ ├── clip.md │ ├── crnn.md │ ├── dbnet.md │ ├── sam.md │ └── yolo.md ├── framework ├── CMakeLists.txt ├── framework.h ├── onnx │ ├── onnx.cpp │ └── onnx.h ├── rknn │ ├── rknn.cpp │ └── rknn.h └── tensorrt │ ├── tensorrt.cpp │ └── tensorrt.h ├── model ├── CMakeLists.txt ├── base │ ├── detection_model.cpp │ ├── detection_model.h │ ├── model.cpp │ ├── model.h │ ├── ocr_model.cpp │ └── ocr_model.h ├── clip │ ├── CMakeLists.txt │ ├── clip.cpp │ ├── clip.h │ ├── image_encoder.cpp │ ├── image_encoder.h │ ├── text_encoder.cpp │ ├── text_encoder.h │ ├── text_tokenizer.cpp │ └── text_tokenizer.h ├── ocr │ ├── attention.cpp │ ├── attention.h │ ├── ctc.cpp │ ├── ctc.h │ ├── dbnet.cpp │ ├── dbnet.h │ └── scripts │ │ ├── abinet_export.py │ │ ├── crnn_export.py │ │ └── dbnet_export.py ├── sam │ ├── image_encoder.cpp │ ├── image_encoder.h │ ├── mask_decoder.cpp │ ├── mask_decoder.h │ ├── sam.cpp │ └── sam.h └── yolo │ ├── common.py │ ├── test.py │ ├── yolo.cpp │ ├── yolo.h │ ├── yolo_cutoff.cpp │ ├── yolo_cutoff.h │ ├── yolo_pose.cpp │ ├── yolo_pose.h │ ├── yolo_seg.cpp │ ├── yolo_seg.h │ ├── yolo_seg_cutoff.cpp │ ├── yolo_seg_cutoff.h │ ├── yolov8-det-export.py │ ├── yolov8-pose-export.py │ ├── yolov8-seg-export.py │ └── yolov9-det-export.py ├── output ├── dbnet │ ├── 01.png │ └── 02.png ├── sam │ └── dogs.jpg └── yolo │ ├── detect │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg │ ├── pose │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg │ └── segment │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg └── test ├── CMakeLists.txt ├── clip_test.cpp ├── image ├── clip │ ├── Mona_Lisa.jpg │ └── franz-kafka.jpg ├── detect │ ├── COCO_train2014_000000181904.jpg │ ├── COCO_train2014_000000291797.jpg │ ├── bus.jpg │ └── zidane.jpg ├── ocr │ ├── det │ │ ├── 01.png │ │ └── 02.png │ └── rec │ │ └── demo.png └── sam │ └── dogs.jpg ├── ocr_test.cpp ├── sam_test.cpp ├── test.cpp └── yolo_test.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode/* 2 | /build/* 3 | /build-orin/* 4 | /build-rk/* 5 | /weights/* 6 | __pycache__ -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | 3 | option(USE_TENSORRT "Compile tensorrt framework" ON) 4 | option(USE_TENSORRT "Compile rknn framework" OFF) 5 | message(STATUS "USE_TENSORRT: ${USE_TENSORRT}\n") 6 | message(STATUS "USE_RKNN: ${USE_RKNN}\n") 7 | 8 | if(USE_TENSORRT) 9 | set(CMAKE_CUDA_ARCHITECTURES 60 61 62 70 72 75 86) 10 | set(CMAKE_CUDA_COMPILER /usr/local/cuda/bin/nvcc) 11 | project(model_zoo_cxx LANGUAGES CXX CUDA) 12 | option(CUDA_USE_STATIC_CUDA_RUNTIME OFF) 13 | else() 14 | 
project(model_zoo_cxx) 15 | endif() 16 | 17 | if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") 18 | add_compile_options(-Wall -Wextra) 19 | endif() 20 | 21 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0") 22 | set(CMAKE_CXX_STANDARD 14) 23 | set(CMAKE_BUILD_TYPE Debug) 24 | 25 | # OpenCV 26 | find_package(OpenCV REQUIRED) 27 | message(STATUS "OpenCV Libs: \n${OpenCV_LIBS}\n") 28 | message(STATUS "OpenCV Libraries: \n${OpenCV_LIBRARIES}\n") 29 | message(STATUS "OpenCV Headers: \n${OpenCV_INCLUDE_DIRS}\n") 30 | 31 | find_package(Eigen3 REQUIRED) 32 | 33 | if(USE_TENSORRT) 34 | find_package(CUDA REQUIRED) 35 | message(STATUS "CUDA Libs: \n${CUDA_LIBRARIES}\n") 36 | get_filename_component(CUDA_LIB_DIR ${CUDA_LIBRARIES} DIRECTORY) 37 | message(STATUS "CUDA Headers: \n${CUDA_INCLUDE_DIRS}\n") 38 | 39 | # TensorRT 40 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 41 | set(TensorRT_INCLUDE_DIRS /usr/include/aarch64-linux-gnu) 42 | set(TensorRT_LIBRARIES /usr/lib/aarch64-linux-gnu) 43 | else() 44 | set(TensorRT_INCLUDE_DIRS /usr/include/x86_64-linux-gnu) 45 | set(TensorRT_LIBRARIES /usr/lib/x86_64-linux-gnu) 46 | endif() 47 | 48 | message(STATUS "TensorRT Libs: \n${TensorRT_LIBRARIES}\n") 49 | message(STATUS "TensorRT Headers: \n${TensorRT_INCLUDE_DIRS}\n") 50 | 51 | list(APPEND INCLUDE_DIRS 52 | ${CUDA_INCLUDE_DIRS} 53 | ${TensorRT_INCLUDE_DIRS} 54 | ) 55 | endif() 56 | 57 | if (USE_RKNN) 58 | set(RKNN_INCLUDE_DIRS /usr/include) 59 | set(RKNN_LIBS /usr/lib/librknnrt.so) 60 | list(APPEND INCLUDE_DIRS 61 | ${RKNN_INCLUDE_DIRS} 62 | ) 63 | endif() 64 | 65 | list(APPEND INCLUDE_DIRS 66 | ${OpenCV_INCLUDE_DIRS} 67 | ) 68 | 69 | set(ONNXRUNTIME_LIBS /usr/lib/libonnxruntime.so) 70 | 71 | if(${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") 72 | set(Clipper_LIBS /usr/lib/aarch64-linux-gnu/libpolyclipping.so) 73 | else() 74 | set(Clipper_LIBS /usr/lib/x86_64-linux-gnu/libpolyclipping.so) 75 | endif() 76 | message(STATUS "Clipper Libs: \n${Clipper_LIBS}\n") 77 | 78 | add_subdirectory(framework) 79 | add_subdirectory(model) 80 | add_subdirectory(test) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CXX-DeepLearning-Inference 2 | 3 | ## Introduction 4 | A unified and extensible pipeline for deep learning model inference with C++. 
5 | ### Support framework 6 | - [x] ONNXRuntime 7 | - [x] TensorRT 8 | - [x] RKNN 9 | ### Support model 10 | - [x] object-detection 11 | - [x] [yolo](/doc/model/yolo.md) (including yolov8 & yolov9 for detection, segmentation and pose) 12 | - [x] ocr 13 | - [x] [crnn](/doc/model/crnn.md) 14 | - [x] [abinet](/doc/model/abinet.md) 15 | - [x] [dbnet](/doc/model/dbnet.md) 16 | - [x] [sam](/doc/model/sam.md) 17 | - [x] [clip](/doc/model/clip.md) 18 | 19 | | | | ONNXRuntime | TensorRT | RKNN | 20 | |-|-|:-:|:-:|:-:| 21 | | YOLO | YOLO-Det| √ | √ | | 22 | | | YOLO-Seg| √ | √ | | 23 | | | YOLO-Pose| √ | √ | | 24 | | | YOLO-Det-Cutoff ||| √ | 25 | | | YOLO-Seg-Cutoff ||| √ | 26 | |OCR | CRNN | √ | √ | | 27 | | | ABINet | √ | √ | | 28 | | | DBNet | √ | √ | | 29 | |SAM | | √ | √ | | 30 | |CLIP | | √ | √ | | 31 | 32 | ## Appendix 33 | [How to build TensorRT environment](/doc/environment/cuda-on-linux.md) 34 | -------------------------------------------------------------------------------- /common/common.cpp: -------------------------------------------------------------------------------- 1 | #include "common/common.h" 2 | #include 3 | #include 4 | 5 | bool IsFile(const std::string &path) 6 | { 7 | if (!IsPathExist(path)) 8 | { 9 | printf("%s:%d %s not exist\n", __FILE__, __LINE__, path.c_str()); 10 | return false; 11 | } 12 | struct stat buffer; 13 | return (stat(path.c_str(), &buffer) == 0 && S_ISREG(buffer.st_mode)); 14 | } 15 | 16 | bool IsFolder(const std::string &path) 17 | { 18 | if (!IsPathExist(path)) 19 | { 20 | return false; 21 | } 22 | struct stat buffer; 23 | return (stat(path.c_str(), &buffer) == 0 && S_ISDIR(buffer.st_mode)); 24 | } 25 | 26 | PreParam Letterbox(const cv::Mat &image, cv::Mat &out, cv::Size size) 27 | { 28 | const float inp_h = size.height; 29 | const float inp_w = size.width; 30 | float height = image.rows; 31 | float width = image.cols; 32 | 33 | float r = std::min(inp_h / height, inp_w / width); 34 | int padw = std::round(width * r); 35 | int padh = std::round(height * r); 36 | 37 | cv::Mat tmp; 38 | if ((int)width != padw || (int)height != padh) 39 | { 40 | cv::resize(image, tmp, cv::Size(padw, padh)); 41 | } 42 | else 43 | { 44 | tmp = image.clone(); 45 | } 46 | 47 | float dw = inp_w - padw; 48 | float dh = inp_h - padh; 49 | 50 | dw /= 2.0f; 51 | dh /= 2.0f; 52 | int top = int(std::round(dh - 0.1f)); 53 | int bottom = int(std::round(dh + 0.1f)); 54 | int left = int(std::round(dw - 0.1f)); 55 | int right = int(std::round(dw + 0.1f)); 56 | 57 | cv::copyMakeBorder(tmp, out, top, bottom, left, right, cv::BORDER_CONSTANT, {114, 114, 114}); 58 | 59 | PreParam pparam; 60 | pparam.ratio = 1 / r; 61 | pparam.dw = dw; 62 | pparam.dh = dh; 63 | pparam.height = height; 64 | pparam.width = width; 65 | return pparam; 66 | } 67 | 68 | PreParam paddimg(const cv::Mat &image, cv::Mat &out, int shortsize) { 69 | int w = image.cols; 70 | int h = image.rows; 71 | float scale = 1.f; 72 | if (w < h) { 73 | scale = (float)shortsize / w; 74 | h = scale * h; 75 | w = shortsize; 76 | } 77 | else { 78 | scale = (float)shortsize / h; 79 | w = scale * w; 80 | h = shortsize; 81 | } 82 | 83 | if (h % 32 != 0) { 84 | h = (h / 32 + 1) * 32; 85 | } 86 | if (w % 32 != 0) { 87 | w = (w / 32 + 1) * 32; 88 | } 89 | 90 | cv::resize(image, out, cv::Size(w, h)); 91 | PreParam pparam; 92 | pparam.ratio = 1 / scale; 93 | pparam.dw = 0; 94 | pparam.dh = 0; 95 | pparam.height = image.rows; 96 | pparam.width = image.cols; 97 | return pparam; 98 | } 99 | 100 | int32_t __clip(float val, float min, float max) { 101 | 
float f = val <= min ? min : (val >= max ? max : val); 102 | return f; 103 | } 104 | 105 | float sigmoid(float x) { return 1.0 / (1.0 + expf(-x)); } 106 | 107 | float unsigmoid(float y) { return -1.0 * logf((1.0 / y) - 1.0); } 108 | 109 | int8_t qntF32ToAffine(float f32, int32_t zp, float scale) { 110 | float dst_val = (f32 / scale) + zp; 111 | int8_t res = (int8_t)__clip(dst_val, -128, 127); 112 | return res; 113 | } 114 | 115 | float deqntAffineToF32(int8_t qnt, int32_t zp, float scale) { return ((float)qnt - (float)zp) * scale; } -------------------------------------------------------------------------------- /common/common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "opencv2/opencv.hpp" 3 | #include 4 | #include 5 | 6 | struct Binding 7 | { 8 | size_t size = 1; 9 | size_t dsize = 1; 10 | std::vector dims; 11 | std::string name; 12 | }; 13 | 14 | struct PreParam 15 | { 16 | float ratio = 1.0f; 17 | float dw = 0.0f; 18 | float dh = 0.0f; 19 | float height = 0; 20 | float width = 0; 21 | }; 22 | 23 | inline static float clamp(float val, float min, float max) 24 | { 25 | return val > min ? (val < max ? val : max) : min; 26 | } 27 | 28 | inline bool IsPathExist(const std::string &path) 29 | { 30 | return (access(path.c_str(), 0) == F_OK); 31 | } 32 | 33 | bool IsFile(const std::string &path); 34 | 35 | bool IsFolder(const std::string &path); 36 | 37 | PreParam Letterbox(const cv::Mat &image, cv::Mat &out, cv::Size size); 38 | 39 | PreParam paddimg(const cv::Mat &image, cv::Mat &out, int shortsize = 960); 40 | 41 | float sigmoid(float x); 42 | 43 | float unsigmoid(float y); 44 | 45 | float deqntAffineToF32(int8_t qnt, int32_t zp, float scale); 46 | 47 | int32_t __clip(float val, float min, float max); 48 | 49 | int8_t qntF32ToAffine(float f32, int32_t zp, float scale); -------------------------------------------------------------------------------- /common/config.h.in: -------------------------------------------------------------------------------- 1 | #cmakedefine USE_TENSORRT 2 | #cmakedefine USE_RKNN -------------------------------------------------------------------------------- /config/clip/image_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "clip_image_encoder" 2 | model_path: "../weights/clip/clip_image_model_vitb32.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/clip/clip_image_model_res18.engine" 5 | # framework: "TensorRT" 6 | max_batch_size: 2 -------------------------------------------------------------------------------- /config/clip/prompts.txt: -------------------------------------------------------------------------------- 1 | a photo of a man 2 | a photo of a woman -------------------------------------------------------------------------------- /config/clip/text_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "clip_text_encoder" 2 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 3 | framework: "ONNX" 4 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 5 | prompts: "../config/clip/prompts.txt" 6 | text_embedding: "../weights/clip/text_embeddings.bin" 7 | online: false -------------------------------------------------------------------------------- /config/ocr/det/dbnet.yaml: -------------------------------------------------------------------------------- 1 | model_name: "dbnet" 2 | model_path: "../weights/ocr/DBNet.onnx" 3 | framework: "ONNX" 4 | # 
framework: "TensorRT" 5 | box_thres: 0.5 6 | max_input_size: [1, 3, 1440, 1440] -------------------------------------------------------------------------------- /config/ocr/rec/attn.yaml: -------------------------------------------------------------------------------- 1 | model_name: "abinet" 2 | model_path: "../weights/ocr/best-train-abinet.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [128,32] # (width, height) 6 | input_channel: 3 7 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 8 | output_size: 26 -------------------------------------------------------------------------------- /config/ocr/rec/charset_36.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | a 12 | b 13 | c 14 | d 15 | e 16 | f 17 | g 18 | h 19 | i 20 | j 21 | k 22 | l 23 | m 24 | n 25 | o 26 | p 27 | q 28 | r 29 | s 30 | t 31 | u 32 | v 33 | w 34 | x 35 | y 36 | z -------------------------------------------------------------------------------- /config/ocr/rec/ctc.yaml: -------------------------------------------------------------------------------- 1 | model_name: "crnn" 2 | model_path: "../weights/ocr/crnn.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [100,32] # (width, height) 6 | input_channel: 1 7 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 8 | output_size: 26 -------------------------------------------------------------------------------- /config/sam/image_encoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "sam_image_encoder" 2 | model_path: "../weights/sam/resnet18_image_encoder.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/sam/resnet18_image_encoder.engine" 5 | # framework: "TensorRT" -------------------------------------------------------------------------------- /config/sam/mask_decoder.yaml: -------------------------------------------------------------------------------- 1 | model_name: "sam_mask_decoder" 2 | model_path: "../weights/sam/mobile_sam_mask_decoder.onnx" 3 | framework: "ONNX" 4 | # model_path: "../weights/sam/mobile_sam_mask_decoder.engine" 5 | # framework: "TensorRT" -------------------------------------------------------------------------------- /config/yolo/coco.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorcycle 5 | airplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | couch 59 | potted plant 60 | bed 61 | dining table 62 | toilet 63 | tv 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush -------------------------------------------------------------------------------- /config/yolo/yolo.yaml: 
-------------------------------------------------------------------------------- 1 | model_name: "yolo" 2 | model_path: "../weights/yolo/yolov8s_end2end.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | with_nms: true 7 | conf_thres: 0.25 8 | nms_thres: 0.65 9 | topk: 100 -------------------------------------------------------------------------------- /config/yolo/yolo_cutoff.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_det" 2 | model_path: "../weights/yolo/yolov8n.rknn" 3 | framework: "RKNN" 4 | input_size: [640,640] 5 | conf_thres: 0.25 6 | nms_thres: 0.65 7 | class_num: 80 8 | topk: 100 -------------------------------------------------------------------------------- /config/yolo/yolo_pose.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_pose" 2 | model_path: "../weights/yolo/yolov8s-pose.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | conf_thres: 0.25 7 | nms_thres: 0.65 -------------------------------------------------------------------------------- /config/yolo/yolo_seg.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_seg" 2 | model_path: "../weights/yolo/yolov8s-seg.onnx" 3 | framework: "ONNX" 4 | # framework: "TensorRT" 5 | input_size: [640,640] 6 | conf_thres: 0.25 7 | nms_thres: 0.65 8 | seg_size: [160, 160] 9 | seg_channels: 32 10 | -------------------------------------------------------------------------------- /config/yolo/yolo_seg_cutoff.yaml: -------------------------------------------------------------------------------- 1 | model_name: "yolo_seg" 2 | model_path: "../weights/yolo/yolov8s-seg.rknn" 3 | framework: "RKNN" 4 | input_size: [640,640] 5 | conf_thres: 0.25 6 | nms_thres: 0.65 7 | seg_size: [160, 160] 8 | seg_channels: 32 9 | class_num: 80 10 | topk: 100 -------------------------------------------------------------------------------- /doc/environment/cuda-on-linux.md: -------------------------------------------------------------------------------- 1 | # CUDA ON Ubuntu 2 | 3 | ## 驱动安装 4 | 5 | TODO 6 | 7 | ## cuda安装 8 | 9 | ### 添加源 10 | 11 | For Ubuntu 22.04 12 | ``` 13 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC 14 | sudo sh -c 'echo "deb https://mirrors.aliyun.com/nvidia-cuda/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list' 15 | ``` 16 | 17 | ### 安装 18 | ``` 19 | # 更新列表 20 | sudo apt-get update 21 | 22 | # 查询可用版本 23 | apt search cuda-toolkit 24 | 25 | # 安装 26 | sudo apt install cuda-toolkit- 27 | ``` 28 | 29 | ## TensorRT安装 30 | 31 | ``` 32 | # 查询可用版本 33 | apt policy tensorrt-dev 34 | 35 | # 安装 36 | sudo apt install tensorrt-dev= 37 | 38 | # 安装trtexec 39 | sudo apt install libnvinfer-bin= 40 | ``` -------------------------------------------------------------------------------- /doc/environment/cuda-on-wsl.md: -------------------------------------------------------------------------------- 1 | # CUDA ON WSL 2 | 3 | ## 驱动安装 4 | 5 | [CUDA on WSL 驱动下载地址](https://developer.nvidia.com/cuda/wsl) 6 | 7 | 根据自己的GPU类型(GeForce and Quadro) 选择对应的驱动。 8 | 不需要在wsl下安装nvidia驱动,windows会自动为wsl安装nvidia驱动。 9 | 10 | ## cuda安装 11 | 12 | ### 添加源 13 | 14 | For Ubuntu 22.04 15 | ``` 16 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys A4B469963BF863CC 17 | sudo sh -c 'echo "deb https://mirrors.aliyun.com/nvidia-cuda/ubuntu2204/x86_64 /" > /etc/apt/sources.list.d/cuda.list' 18 | ``` 
19 | 20 | ### 安装 21 | ``` 22 | # 更新列表 23 | sudo apt-get update 24 | 25 | # 查询可用版本 26 | apt search cuda-toolkit 27 | 28 | # 安装 29 | sudo apt install cuda-toolkit- 30 | ``` 31 | 32 | ## TensorRT安装 33 | 34 | ``` 35 | # 查询可用版本 36 | apt policy tensorrt-dev 37 | 38 | # 安装 39 | sudo apt install tensorrt-dev= 40 | 41 | # 安装trtexec 42 | sudo apt install libnvinfer-bin= 43 | ``` -------------------------------------------------------------------------------- /doc/environment/onnxruntime.md: -------------------------------------------------------------------------------- 1 | # ONNXRuntime install 2 | 3 | 1. choose a version in [Onnxruntime release](https://github.com/microsoft/onnxruntime/releases) 4 | 2. download 5 | ``` 6 | wget https://github.com/microsoft/onnxruntime/releases/download/v1.16.2/onnxruntime-linux-x64-1.16.2.tgz 7 | ``` 8 | 3. install 9 | ``` 10 | tar -xzf onnxruntime-linux-x64-1.16.2.tgz 11 | cd onnxruntime-linux-x64-1.16.2 12 | sudo cp include/* /usr/include 13 | sudo cp lib/* /usr/lib 14 | ``` -------------------------------------------------------------------------------- /doc/model/abinet.md: -------------------------------------------------------------------------------- 1 | # ABINet 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [ABINet](https://github.com/FangShancheng/ABINet). 5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/FangShancheng/ABINet.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/abinet_export.py ABINet 12 | cd ABINet 13 | python3 abinet_export.py --sim --weights=path-to-weights 14 | ``` 15 | You can checkout the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,3,32,128]) 18 | - outputs 19 | - output (float32[1,26,1]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec 24 | --onnx=path-to-your-onnx-model \ 25 | --saveEngine=save-path \ 26 | --fp16 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/ocr/rec/attn.yaml for abinet model 39 | ``` 40 | model_name: "abinet" 41 | model_path: "../weights/ocr/best-train-abinet.onnx" 42 | framework: "ONNX" 43 | input_size: [128,32] # (width, height) 44 | input_channel: 3 45 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 46 | output_size: 26 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | 54 | ### TensorRT 55 | ``` 56 | mkdir build && cd build 57 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 58 | make 59 | ``` 60 | #### Config 61 | config/ocr/rec/attn.yaml for abinet model 62 | ``` 63 | model_name: "abinet" 64 | model_path: "../weights/ocr/best-train-abinet.engine" 65 | framework: "TensorRT" 66 | input_size: [128,32] # (width, height) 67 | input_channel: 3 68 | alphabet: "abcdefghijklmnopqrstuvwxyz0123456789" 69 | output_size: 26 70 | ``` 71 | #### Run 72 | ``` 73 | cd build 74 | ./test/ocr_test 75 | ``` -------------------------------------------------------------------------------- /doc/model/clip.md: -------------------------------------------------------------------------------- 1 | # Clip 2 | 3 | 4 | ## Related repos 5 | - [Clip](https://github.com/openai/CLIP) 6 | - [onnx_clip](https://github.com/lakeraai/onnx_clip.git) 7 | - [clip-distillation](https://github.com/NVIDIA-AI-IOT/clip-distillation) 8 | 9 | ## ONNXRuntime Inference 10 | ### Model 11 | You can download the ONNX model by 12 | ``` 13 | wget https://lakera-clip.s3.eu-west-1.amazonaws.com/clip_image_model_vitb32.onnx 14 | wget https://lakera-clip.s3.eu-west-1.amazonaws.com/clip_text_model_vitb32.onnx 15 | ``` 16 | Or you can export by yourself. Python code will be like 17 | ``` 18 | torch.onnx.export(image_model, 19 | torch.randn(1, 3, 224, 224), 20 | f, 21 | opset_version=11, 22 | input_names=['IMAGE'], 23 | output_names=['IMAGE_EMBEDDING'], 24 | dynamic_axes={ 25 | 'IMAGE': {0: 'batch_size'}, 26 | 'IMAGE_EMBEDDING': {0: 'batch_size'} 27 | }) 28 | 29 | torch.onnx.export(text_model, 30 | torch.randn(1, 77), 31 | f, 32 | opset_version=11, 33 | input_names=['TEXT'], 34 | output_names=['TEXT_EMBEDDING'], 35 | dynamic_axes={ 36 | 'TEXT': {0: 'batch_size'}, 37 | 'TEXT_EMBEDDING': {0: 'batch_size'} 38 | }) 39 | ``` 40 | 41 | ### Get bpe vocab 42 | ``` 43 | git clone https://github.com/lakeraai/onnx_clip.git 44 | cp onnx_clip/onnx_clip/data/bpe_simple_vocab_16e6.txt.gz model-zoo-cxx/weights/clip 45 | ``` 46 | 47 | ### Inference 48 | #### Build 49 | ``` 50 | mkdir build && cd build 51 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 52 | make 53 | ``` 54 | #### Config 55 | config/clip/image_encoder.yaml 56 | ``` 57 | model_name: "clip_image_encoder" 58 | model_path: "../weights/clip/clip_image_model_vitb32.onnx" 59 | framework: "ONNX" 60 | ``` 61 | config/clip/text_encoder.yaml 62 | ``` 63 | model_name: "clip_text_encoder" 64 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 65 | framework: "ONNX" 66 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 67 | prompts: "../config/clip/prompts.txt" 68 | text_embedding: "../weights/clip/text_embeddings.bin" 69 | online: true 70 | ``` 71 | #### Run 72 | ``` 73 | cd build 74 | ./test/clip_test 75 | ``` 76 | You can get output like 77 | ``` 78 | Input: 79 | IMAGE: [-1,3,224,224,] 80 | Output: 81 | IMAGE_EMBEDDING: [-1,512,] 82 | Input: 83 | TEXT: [-1,77,] 84 | Output: 85 | TEXT_EMBEDDING: [-1,512,] 86 | Shape of IMAGE_EMBEDDING: [2,512,] 87 | Shape of TEXT_EMBEDDING: [2,512,] 88 | [ [ 0.970533 0.0294665 ], [ 0.0195933 0.980407 ], ] 89 | Destruct text encoder 90 | Destruct image encoder 91 | ``` 92 | 93 | ## TensorRT Inference 94 | ### Model 95 | We can simply transfer the onnx model to tensorrt engine by 96 | ``` 97 | /usr/src/tensorrt/bin/trtexec \ 98 | --onnx=${onnx_image_model_path} \ 99 | --saveEngine=${tensorrt_image_model_path} \ 100 | --fp16 \ 101 | --minShapes=IMAGE:1x3x224x224 \ 102 | --optShapes=IMAGE:1x3x224x224 \ 103 | --maxShapes=IMAGE:10x3x224x224 104 | /usr/src/tensorrt/bin/trtexec \ 105 | --onnx=${onnx_text_model_path} \ 106 | --saveEngine=${tensorrt_text_model_path} \ 107 | --fp16 \ 108 | --minShapes=TEXT:1x77 \ 109 | --optShapes=TEXT:1x77 \ 110 | --maxShapes=TEXT:10x77 111 | ``` 112 | However, these models with vit is too large for Jetson. 113 | 114 | Nvidia releases [clip-distillation](https://github.com/NVIDIA-AI-IOT/clip-distillation) to solve this problem. 115 | First, train a smaller image model with knowledge distillation. Though Nvidia does not release weights yet, it publishes a pipeline to train models. 116 | Second, fix the prompt texts and save their embeddings. 117 | 118 | ### Inference 119 | #### Build 120 | ``` 121 | mkdir build && cd build 122 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 123 | make 124 | ``` 125 | #### Config 126 | config/clip/image_encoder.yaml 127 | ``` 128 | model_name: "clip_image_encoder" 129 | model_path: "../weights/clip/clip_image_model_res18.engine" 130 | framework: "TensorRT" 131 | ``` 132 | config/clip/text_encoder.yaml 133 | ``` 134 | model_name: "clip_text_encoder" 135 | model_path: "../weights/clip/clip_text_model_vitb32.onnx" 136 | framework: "ONNX" 137 | # model_path: "../weights/sam/resnet18_image_encoder.engine" 138 | # framework: "TensorRT" 139 | bpe_path: "../weights/clip/bpe_simple_vocab_16e6.txt.gz" 140 | prompts: "../config/clip/prompts.txt" 141 | text_embedding: "../weights/clip/text_embeddings.bin" 142 | online: false 143 | ``` 144 | #### Run 145 | ``` 146 | cd build 147 | ./test/clip_test -g ## generate text_embeddings.bin 148 | ./test/clip_test 149 | ``` -------------------------------------------------------------------------------- /doc/model/crnn.md: -------------------------------------------------------------------------------- 1 | # CRNN 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [crnn.pytorch](https://github.com/meijieru/crnn.pytorch). 
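CRNN is trained with a CTC loss, so the raw recognition output is a per-timestep sequence that still contains blanks and repeated symbols; the final string comes from the standard greedy CTC rule (collapse consecutive repeats, then drop blanks). Below is a minimal sketch of that rule only, assuming the usual crnn.pytorch convention that index 0 is the blank and index i maps to `alphabet[i-1]`; the helper name is illustrative and is not this repo's `ctc.cpp` API.

```cpp
#include <string>
#include <vector>

// Greedy CTC decode: collapse consecutive repeats, then drop blanks (index 0).
// Assumption: index i >= 1 maps to alphabet[i - 1], as in crnn.pytorch.
std::string CtcGreedyDecode(const std::vector<int>& indices, const std::string& alphabet) {
    std::string text;
    int prev = 0;  // start as if the previous symbol were a blank
    for (int idx : indices) {
        if (idx != 0 && idx != prev) {
            text += alphabet[static_cast<size_t>(idx) - 1];
        }
        prev = idx;
    }
    return text;
}
```

With the alphabet from `config/ocr/rec/ctc.yaml` ("0123456789abcdefghijklmnopqrstuvwxyz"), a sequence such as {11, 11, 0, 11, 12} decodes to "aab".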
5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/meijieru/crnn.pytorch.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/crnn_export.py crnn.pytorch 12 | cd crnn.pytorch 13 | python3 crnn_export.py --weights=crnn.pth --sim 14 | ``` 15 | You can checkout the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,1,32,100]) 18 | - outputs 19 | - output (float32[1,26,1]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec 24 | --onnx=path-to-your-onnx-model \ 25 | --saveEngine=save-path \ 26 | --fp16 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/ocr/rec/ctc.yaml for crnn model 39 | ``` 40 | model_name: "crnn" 41 | model_path: "../weights/ocr/crnn.onnx" 42 | framework: "ONNX" 43 | input_size: [100,32] # (width, height) 44 | input_channel: 1 45 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 46 | output_size: 26 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | You can see output like: 54 | ``` 55 | Input: 56 | images: 3200 57 | Output: 58 | output: 26 59 | ../test/image/ocr/demo.png: available 60 | cost 7.9420 ms 61 | Destruct ocr model 62 | ``` 63 | 64 | ### TensorRT 65 | ``` 66 | mkdir build && cd build 67 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 68 | make 69 | ``` 70 | #### Config 71 | config/ocr/rec/ctc.yaml for crnn model 72 | ``` 73 | model_name: "crnn" 74 | model_path: "../weights/ocr/crnn.engine" 75 | framework: "TensorRT" 76 | input_size: [100,32] # (width, height) 77 | input_channel: 1 78 | alphabet: "0123456789abcdefghijklmnopqrstuvwxyz" 79 | output_size: 26 80 | ``` 81 | #### Run 82 | ``` 83 | cd build 84 | ./test/ocr_test 85 | ``` 86 | You can see output like: 87 | ``` 88 | Input bind name: images 89 | Output bind name: output 90 | model warmup 10 times 91 | ../test/image/ocr/demo.png: available 92 | cost 16.3920 ms 93 | Destruct ocr model 94 | ``` -------------------------------------------------------------------------------- /doc/model/dbnet.md: -------------------------------------------------------------------------------- 1 | # DBNet 2 | 3 | ## Get pytorch model 4 | The pytorch implementation is [DBNet.pytorch](https://github.com/BaofengZan/DBNet.pytorch). 5 | 6 | ## Export 7 | ### ONNX 8 | ``` 9 | git clone https://github.com/Huntersdeng/CXX-DeepLearning-Inference.git 10 | git clone https://github.com/BaofengZan/DBNet.pytorch.git 11 | cp CXX-DeepLearning-Inference/model/ocr/scripts/dbnet_export.py DBNet.pytorch 12 | cd DBNet.pytorch 13 | python3 dbnet_export.py --sim --weights=path-to-weights 14 | ``` 15 | It's an onnx model with dynamic axes, you can check the onnx model in [netron](netron.app). 16 | - inputs 17 | - images (float32[1,3,height,width]) 18 | - outputs 19 | - output (float32[Resizeoutput_dim_0,Resizeoutput_dim_1,Resizeoutput_dim_2,Resizeoutput_dim_3]) 20 | 21 | ### TensorRT 22 | ``` 23 | ${tensorrt-install-path}/bin/trtexec \ 24 | --onnx=DBNet.onnx \ 25 | --saveEngine=DBNet.engine \ 26 | --fp16 \ 27 | --minShapes=images:1x3x608x608 \ 28 | --maxShapes=images:1x3x1440x1440 \ 29 | --optShapes=images:1x3x640x1152 30 | ``` 31 | 32 | ## Inference 33 | ### ONNXRuntime 34 | #### Build 35 | ``` 36 | mkdir build && cd build 37 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 38 | make 39 | ``` 40 | #### Config 41 | config/ocr/det/dbnet.yaml 42 | ``` 43 | model_name: "dbnet" 44 | model_path: "../weights/ocr/det/DBNet.onnx" 45 | framework: "ONNX" 46 | box_thres: 0.5 47 | ``` 48 | #### Run 49 | ``` 50 | cd build 51 | ./test/ocr_test 52 | ``` 53 | 54 | ### TensorRT 55 | ``` 56 | mkdir build && cd build 57 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 58 | make 59 | ``` 60 | #### Config 61 | config/ocr/det/dbnet.yaml 62 | ``` 63 | model_name: "dbnet" 64 | model_path: "../weights/ocr/det/DBNet.engine" 65 | framework: "TensorRT" 66 | box_thres: 0.5 67 | ``` 68 | #### Run 69 | ``` 70 | cd build 71 | ./test/ocr_test 72 | ``` -------------------------------------------------------------------------------- /doc/model/sam.md: -------------------------------------------------------------------------------- 1 | # SAM 2 | This model is based on [NanoSAM](https://github.com/NVIDIA-AI-IOT/nanosam). 3 | 4 | ## Get model 5 | ### ONNX 6 | 1. Download the image encoder ONNX file from [here](https://drive.google.com/file/d/14-SsvoaTl-esC3JOzomHDnI9OGgdO2OR/view?usp=drive_link). 7 | 2. Download the mask decoder ONNX file from [here](https://drive.google.com/file/d/1jYNvnseTL49SNRx9PDcbkZ9DwsY8up7n/view?usp=drive_link). 8 | 3. Or you can export mannally following [NanoSAM](https://github.com/NVIDIA-AI-IOT/nanosam). 9 | 10 | ### TensorRT 11 | - image encoder 12 | ``` 13 | ${tensorrt-install-path}/bin/trtexec \ 14 | --onnx=data/resnet18_image_encoder.onnx \ 15 | --saveEngine=data/resnet18_image_encoder.engine \ 16 | --fp16 17 | ``` 18 | 19 | - mask decoder 20 | ``` 21 | ${tensorrt-install-path}/bin/trtexec \ 22 | --onnx=weights/sam/mobile_sam_mask_decoder.onnx \ 23 | --saveEngine=weights/sam/mobile_sam_mask_decoder.engine \ 24 | --minShapes=point_coords:1x1x2,point_labels:1x1 \ 25 | --optShapes=point_coords:1x1x2,point_labels:1x1 \ 26 | --maxShapes=point_coords:1x10x2,point_labels:1x10 27 | ``` 28 | 29 | ## Inference 30 | ### ONNXRuntime 31 | #### Build 32 | ``` 33 | mkdir build && cd build 34 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=OFF 35 | make 36 | ``` 37 | #### Config 38 | config/sam/image_encoder.yaml 39 | ``` 40 | model_name: "sam_image_encoder" 41 | model_path: "../weights/sam/resnet18_image_encoder.onnx" 42 | framework: "ONNX" 43 | ``` 44 | config/sam/mask_decoder.yaml 45 | ``` 46 | model_name: "sam_mask_decoder" 47 | model_path: "../weights/sam/mobile_sam_mask_decoder.onnx" 48 | framework: "ONNX" 49 | ``` 50 | #### Run 51 | ``` 52 | cd build 53 | ./test/ocr_test 54 | ``` 55 | 56 | ### TensorRT 57 | #### Build 58 | ``` 59 | mkdir build && cd build 60 | cmake .. -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON 61 | make 62 | ``` 63 | #### Config 64 | config/sam/image_encoder.yaml 65 | ``` 66 | model_name: "sam_image_encoder" 67 | model_path: "../weights/sam/resnet18_image_encoder.engine" 68 | framework: "TensorRT" 69 | ``` 70 | config/sam/mask_decoder.yaml 71 | ``` 72 | model_name: "sam_mask_decoder" 73 | model_path: "../weights/sam/mobile_sam_mask_decoder.engine" 74 | framework: "TensorRT" 75 | ``` 76 | #### Run 77 | ``` 78 | cd build 79 | ./test/ocr_test 80 | ``` 81 | 82 | ### Sample result 83 |
(sample result image, see output/sam/dogs.jpg)
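For reference, the dynamic shapes used in the trtexec command above (`point_coords:1xNx2`, `point_labels:1xN`) map directly onto the `IOTensor` type from `framework/framework.h`. The sketch below only packs a single foreground click into those two prompt tensors; the image-embedding input is omitted, and the helper name and FP32 label type are assumptions rather than this repo's `sam.h` API.

```cpp
#include <cstring>
#include <string>
#include <unordered_map>

#include "framework/framework.h"

// Pack one foreground click (x, y) into the mask decoder's prompt tensors.
// Shapes follow the trtexec flags: point_coords is 1xNx2, point_labels is 1xN.
std::unordered_map<std::string, IOTensor> MakePointPrompt(float x, float y) {
    std::unordered_map<std::string, IOTensor> inputs;

    IOTensor coords;
    coords.shape = {1, 1, 2};            // a single prompt point
    coords.data_type = DataType::FP32;
    coords.resize(2 * sizeof(float));
    const float xy[2] = {x, y};
    std::memcpy(coords.data(), xy, sizeof(xy));

    IOTensor labels;
    labels.shape = {1, 1};
    labels.data_type = DataType::FP32;
    labels.resize(sizeof(float));
    const float fg = 1.0f;               // 1 = foreground, 0 = background
    std::memcpy(labels.data(), &fg, sizeof(fg));

    inputs["point_coords"] = std::move(coords);
    inputs["point_labels"] = std::move(labels);
    return inputs;
}
```

Adding more clicks only changes the middle dimension (N), which is exactly what the min/opt/max profile above allows (1 to 10 points).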
-------------------------------------------------------------------------------- /doc/model/yolo.md: -------------------------------------------------------------------------------- 1 | # YOLO 2 | In this repo, all versions of yolo algorithms are exported with the same inputs and outputs, so we can use the same C++ code to inference. 3 | ## Prepare 4 | ``` 5 | python3 -m pip install ultralytics, onnx, onnxsim 6 | ``` 7 | 8 | ## Get pytorch model 9 | ``` 10 | # yolov8s-detect 11 | wget https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s.pt 12 | # yolov8s-seg 13 | wget https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8s-seg.pt 14 | # yolov9c 15 | wget https://github.com/WongKinYiu/yolov9/releases/download/v0.1/yolov9-c-converted.pt 16 | ``` 17 | 18 | ## Export 19 | ### ONNX 20 | #### YOLOv8-Detect 21 | ``` 22 | # without-nms 23 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim 24 | # with onnx-nms plugin 25 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim --onnx-nms 26 | # with trt-nms plugin (only for tensorrt transfer, not support to inference with onnxruntime C++) 27 | python3 model/yolo/yolov8-det-export.py --weights=path-to-your-weights --sim --trt-nms 28 | ``` 29 | 30 | #### YOLOv8-Segment 31 | ``` 32 | python3 model/yolo/yolov8-seg-export.py --weights=path-to-your-weights --sim 33 | ``` 34 | 35 | #### YOLOv8-Pose 36 | ``` 37 | python3 model/yolo/yolov8-pose-export.py --weights=path-to-your-weights --sim 38 | ``` 39 | 40 | #### YOLOv9-Detect 41 | ``` 42 | git clone https://github.com/WongKinYiu/yolov9.git 43 | cp model/yolo/yolov9-det-export.py yolov9/ 44 | cd yolov9 45 | # without-nms 46 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim 47 | # with onnx-nms plugin 48 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim --onnx-nms 49 | # with trt-nms plugin (only for tensorrt transfer, not support to inference with onnxruntime C++) 50 | python3 yolov9-det-export.py --weights=path-to-your-weights --sim --trt-nms 51 | ``` 52 | 53 | ### TensorRT 54 | ``` 55 | ${tensorrt-install-path}/bin/trtexec 56 | --onnx=path-to-your-onnx-model \ 57 | --saveEngine=save-path \ 58 | --fp16 59 | ``` 60 | 61 | ### RKNN 62 | - Pytorch to ONNX, see [airockchip/ultralytics_yolov8](https://github.com/airockchip/ultralytics_yolov8) 63 | - ONNX to RKNN, see [airockchip/rknn-model-zoo](https://github.com/airockchip/rknn_model_zoo) 64 | 65 | ## Inference 66 | ### Build 67 | Compile options: 68 | - USE_TENSORRT 69 | - USE_RKNN 70 | ``` 71 | mkdir build && cd build 72 | cmake .. 
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_TENSORRT=ON/OFF -DUSE_RKNN=ON/OFF 73 | make 74 | ``` 75 | ### Config 76 | #### Detect 77 | ``` 78 | model_name: "yolo" 79 | model_path: "path-to-model-file" 80 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 81 | input_size: [640,640] 82 | with_nms: true 83 | conf_thres: 0.25 84 | nms_thres: 0.65 85 | topk: 100 86 | ``` 87 | #### Segment 88 | ``` 89 | model_name: "yolo_seg" 90 | model_path: "path-to-model-file" 91 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 92 | input_size: [640,640] 93 | conf_thres: 0.25 94 | nms_thres: 0.65 95 | seg_size: [160, 160] 96 | seg_channels: 32 97 | ``` 98 | #### Segment-Cutoff 99 | ``` 100 | model_name: "yolo_seg" 101 | model_path: "../weights/yolo/yolov8s-seg.rknn" 102 | framework: "RKNN" 103 | input_size: [640,640] 104 | conf_thres: 0.25 105 | nms_thres: 0.65 106 | seg_size: [160, 160] 107 | seg_channels: 32 108 | class_num: 80 109 | topk: 100 110 | ``` 111 | 112 | #### Pose 113 | ``` 114 | model_name: "yolo_pose" 115 | model_path: "path-to-model-file" 116 | framework: "ONNX" # ("ONNX" or "TensorRT", corresponding to your model) 117 | input_size: [640,640] 118 | conf_thres: 0.25 119 | nms_thres: 0.65 120 | ``` 121 | 122 | #### Run 123 | ``` 124 | mkdir -p output/yolo/detect 125 | mkdir output/yolo/segment 126 | cd build 127 | ./test/yolo_test 128 | ``` 129 | 130 | ### Example output 131 |
132 | (example output images, see output/yolo/) 133 | (example output images, see output/yolo/)
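For reference, every YOLO variant in this repo resizes input images with `Letterbox()` from `common/common.cpp`, so raw detections live in the 640x640 network frame; they are mapped back to the original image with the returned `PreParam` (`ratio` is 1/r, `dw`/`dh` are the padding offsets). Below is a minimal sketch of that inverse mapping using the `clamp()` helper from `common/common.h`; the `NetBox` struct and function name are illustrative, not the repo's `yolo.h` API.

```cpp
#include "common/common.h"

// A detection box in the 640x640 letterboxed (network) frame; illustrative only.
struct NetBox {
    float x0, y0, x1, y1;
};

// Undo the Letterbox() transform: remove the padding (dw, dh), rescale by
// ratio = 1/r, and clamp to the original image bounds stored in PreParam.
NetBox ToOriginalFrame(const NetBox& b, const PreParam& p) {
    NetBox out;
    out.x0 = clamp((b.x0 - p.dw) * p.ratio, 0.0f, p.width);
    out.y0 = clamp((b.y0 - p.dh) * p.ratio, 0.0f, p.height);
    out.x1 = clamp((b.x1 - p.dw) * p.ratio, 0.0f, p.width);
    out.y1 = clamp((b.y1 - p.dh) * p.ratio, 0.0f, p.height);
    return out;
}
```

The segmentation and pose outputs need the same correction for their box and keypoint coordinates before masks or skeletons are drawn on the original image.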
-------------------------------------------------------------------------------- /framework/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | configure_file(../common/config.h.in config.h) 2 | 3 | if(USE_TENSORRT) 4 | add_library(framework_trt SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 5 | ${CMAKE_CURRENT_SOURCE_DIR}/tensorrt/tensorrt.cpp) 6 | target_include_directories(framework_trt PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 7 | target_link_libraries(framework_trt nvinfer nvinfer_plugin ${CUDA_LIBRARIES}) 8 | target_link_directories(framework_trt PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS}) 9 | endif() 10 | 11 | if(USE_RKNN) 12 | add_library(framework_rknn SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/rknn/rknn.cpp) 14 | target_include_directories(framework_rknn PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 15 | target_link_libraries(framework_rknn ${RKNN_LIBS}) 16 | target_link_directories(framework_rknn PUBLIC ${OpenCV_LIBS}) 17 | endif() 18 | 19 | add_library(framework_onnx SHARED ${CMAKE_SOURCE_DIR}/common/common.cpp 20 | ${CMAKE_CURRENT_SOURCE_DIR}/onnx/onnx.cpp) 21 | target_include_directories(framework_onnx PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 22 | target_link_libraries(framework_onnx ${ONNXRUNTIME_LIBS}) 23 | target_link_directories(framework_onnx PUBLIC ${OpenCV_LIBS}) 24 | -------------------------------------------------------------------------------- /framework/framework.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "common/common.h" 7 | 8 | enum DataType { 9 | INT32 = 1, 10 | FP32 = 2, 11 | UINT8 = 3, 12 | INT8 = 4 13 | }; 14 | 15 | struct IOTensor { 16 | std::vector raw_data; 17 | std::vector shape; 18 | int zp; 19 | float scale; 20 | DataType data_type = DataType::FP32; 21 | void resize(size_t size) { 22 | raw_data.resize(size); 23 | } 24 | 25 | size_t size() const { 26 | return raw_data.size(); 27 | } 28 | 29 | uint8_t* data() { 30 | return raw_data.data(); 31 | } 32 | 33 | const uint8_t* data() const{ 34 | return raw_data.data(); 35 | } 36 | }; 37 | 38 | enum Status { SUCCESS = 0, INIT_ERROR = -1, INFERENCE_ERROR = -2}; 39 | 40 | struct Config { 41 | std::string model_path; 42 | std::map input_len; 43 | std::map output_len; 44 | bool is_dynamic; 45 | }; 46 | 47 | class BaseFramework { 48 | public: 49 | BaseFramework() {} 50 | virtual ~BaseFramework() {} 51 | virtual Status Init(Config config) = 0; 52 | virtual Status forward(const std::unordered_map &input, 53 | std::unordered_map &output) = 0; 54 | 55 | protected: 56 | std::vector input_bindings; 57 | std::vector output_bindings; 58 | bool is_dynamic; 59 | }; -------------------------------------------------------------------------------- /framework/onnx/onnx.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/onnx/onnx.h" 2 | 3 | int TypeToSize(const ONNXTensorElementDataType& dataType) { 4 | switch (dataType) { 5 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: 6 | return 4; 7 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: 8 | return 2; 9 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: 10 | return 4; 11 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: 12 | return 1; 13 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: 14 | return 1; 15 | case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: 16 | return 8; 17 | default: 18 | std::cout << 
"Unknown data type " << dataType << std::endl; 19 | return 4; 20 | } 21 | } 22 | 23 | Status ONNXFramework::Init(Config config) { 24 | is_dynamic = config.is_dynamic; 25 | env = Ort::Env(OrtLoggingLevel::ORT_LOGGING_LEVEL_WARNING, "ONNX_DETECTION"); 26 | session_options = Ort::SessionOptions(); 27 | 28 | Ort::AllocatorWithDefaultOptions allocator; 29 | 30 | #ifdef _WIN32 31 | std::wstring w_modelPath = utils::charToWstring(model_path.c_str()); 32 | session = Ort::Session(env, w_modelPath.c_str(), sessionOptions); 33 | #else 34 | session = new Ort::Session(env, config.model_path.c_str(), session_options); 35 | #endif 36 | 37 | std::cout << "Input: " << std::endl; 38 | int input_num = session->GetInputCount(); 39 | for (int i = 0; i < input_num; i++) { 40 | Ort::TypeInfo input_type_info = session->GetInputTypeInfo(i); 41 | std::vector input_tensor_shape = input_type_info.GetTensorTypeAndShapeInfo().GetShape(); 42 | 43 | Binding binding; 44 | int64_t size = 1; 45 | for (size_t j = 0; j < input_tensor_shape.size(); j++) { 46 | binding.dims.push_back(input_tensor_shape[j]); 47 | size *= input_tensor_shape[j]; 48 | } 49 | 50 | if (size <= 0) { 51 | size = config.input_len[binding.name]; 52 | } 53 | 54 | binding.size = size; 55 | binding.dsize = TypeToSize(input_type_info.GetTensorTypeAndShapeInfo().GetElementType()); 56 | 57 | Ort::AllocatedStringPtr input_name = session->GetInputNameAllocated(i, allocator); 58 | binding.name = input_name.get(); 59 | input_bindings.push_back(binding); 60 | std::cout << binding.name << ": ["; 61 | for (size_t j = 0; j < input_tensor_shape.size(); j++) { 62 | std::cout << input_tensor_shape[j] << ","; 63 | } 64 | std::cout << "]" << std::endl; 65 | 66 | if (!is_dynamic && config.input_len[binding.name] != size) { 67 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 68 | << config.input_len[binding.name] << "!=" << size << ")" << std::endl; 69 | return Status::INIT_ERROR; 70 | } 71 | } 72 | 73 | std::cout << "Output: " << std::endl; 74 | int output_num = session->GetOutputCount(); 75 | for (int i = 0; i < output_num; i++) { 76 | Binding binding; 77 | 78 | Ort::TypeInfo output_type_info = session->GetOutputTypeInfo(i); 79 | std::vector output_tensor_shape = output_type_info.GetTensorTypeAndShapeInfo().GetShape(); 80 | 81 | Ort::AllocatedStringPtr output_name = session->GetOutputNameAllocated(i, allocator); 82 | binding.name = output_name.get(); 83 | 84 | int64_t size = 1; 85 | for (size_t j = 0; j < output_tensor_shape.size(); j++) { 86 | binding.dims.push_back(output_tensor_shape[j]); 87 | size *= output_tensor_shape[j]; 88 | } 89 | 90 | if (size <= 0) { 91 | size = config.output_len[binding.name]; 92 | } 93 | 94 | binding.size = size; 95 | binding.dsize = TypeToSize(output_type_info.GetTensorTypeAndShapeInfo().GetElementType()); 96 | 97 | output_bindings.push_back(binding); 98 | 99 | std::cout << binding.name << ": ["; 100 | for (size_t j = 0; j < output_tensor_shape.size(); j++) { 101 | std::cout << output_tensor_shape[j] << ","; 102 | } 103 | std::cout << "]" << std::endl; 104 | 105 | if (!is_dynamic && config.output_len[binding.name] != size) { 106 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 107 | << config.output_len[binding.name] << "!=" << size << ")" << std::endl; 108 | return Status::INIT_ERROR; 109 | } 110 | 111 | } 112 | 113 | return Status::SUCCESS; 114 | } 115 | 116 | ONNXFramework::~ONNXFramework() { 117 | delete session; 118 | } 119 | 120 | Status ONNXFramework::forward(const std::unordered_map& input, 121 | std::unordered_map& output) { 122 | std::vector input_tensors; 123 | Ort::MemoryInfo memory_info = 124 | Ort::MemoryInfo::CreateCpu(OrtAllocatorType::OrtArenaAllocator, OrtMemType::OrtMemTypeDefault); 125 | 126 | std::vector input_names; 127 | for (const auto& binding : input_bindings) { 128 | const std::string input_name = binding.name; 129 | input_names.emplace_back(binding.name.c_str()); 130 | if (input.find(input_name) == input.end()) { 131 | std::cout << "Cannot find " << input_name << " from the input tensors!" << std::endl; 132 | return Status::INFERENCE_ERROR; 133 | } 134 | 135 | size_t size = 1; 136 | if (!is_dynamic) { 137 | size = binding.size; 138 | } else { 139 | for (size_t i = 0; i < input.at(input_name).shape.size(); i++) { 140 | size *= input.at(input_name).shape[i]; 141 | } 142 | } 143 | if (input.at(input_name).data_type == DataType::INT32) { 144 | input_tensors.push_back(Ort::Value::CreateTensor( 145 | memory_info, (int*)input.at(input_name).data(), size, input.at(input_name).shape.data(), input.at(input_name).shape.size())); 146 | } else if (input.at(input_name).data_type == DataType::FP32) { 147 | input_tensors.push_back(Ort::Value::CreateTensor( 148 | memory_info, (float*)input.at(input_name).data(), size, input.at(input_name).shape.data(), input.at(input_name).shape.size())); 149 | } else { 150 | std::cout << "Error occur when Ort::Value::CreateTensor" << std::endl; 151 | } 152 | 153 | } 154 | 155 | std::vector output_names; 156 | for (const auto& binding : output_bindings) { 157 | output_names.emplace_back(binding.name.c_str()); 158 | if (output.find(binding.name) == output.end()) { 159 | std::cout << "Cannot find " << binding.name << " from the input tensors!" 
<< std::endl; 160 | return Status::INFERENCE_ERROR; 161 | } 162 | } 163 | 164 | std::vector output_tensors = this->session->Run(Ort::RunOptions{nullptr}, input_names.data(), input_tensors.data(), input_names.size(), 165 | output_names.data(), output_names.size()); 166 | 167 | for (size_t i = 0; i < output_tensors.size(); ++i){ 168 | size_t element_size = TypeToSize(output_tensors[i].GetTensorTypeAndShapeInfo().GetElementType()); 169 | size_t count = output_tensors[i].GetTensorTypeAndShapeInfo().GetElementCount(); 170 | output[output_names[i]].resize(element_size * count); 171 | memcpy(output[output_names[i]].data(), output_tensors[i].GetTensorData(), element_size * count); 172 | output[output_names[i]].shape = output_tensors[i].GetTensorTypeAndShapeInfo().GetShape(); 173 | std::cout << "Shape of " << output_names[i] << ": ["; 174 | for (int64_t j : output[output_names[i]].shape) { 175 | std::cout << j << ","; 176 | } 177 | std::cout << "]" << std::endl; 178 | } 179 | return Status::SUCCESS; 180 | } -------------------------------------------------------------------------------- /framework/onnx/onnx.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "framework/framework.h" 9 | 10 | int TypeToSize(const ONNXTensorElementDataType &dataType); 11 | 12 | class ONNXFramework : public BaseFramework { 13 | public: 14 | ONNXFramework() {} 15 | ~ONNXFramework(); 16 | Status Init(Config config) override; 17 | Status forward(const std::unordered_map &input, 18 | std::unordered_map &output) override; 19 | 20 | private: 21 | Ort::Env env{nullptr}; 22 | Ort::SessionOptions session_options{nullptr}; 23 | Ort::Session *session{nullptr}; 24 | std::vector temp_output_ptrs; 25 | }; -------------------------------------------------------------------------------- /framework/rknn/rknn.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/rknn/rknn.h" 2 | 3 | static void dump_tensor_attr(rknn_tensor_attr *attr) { 4 | printf(" index=%d, name=%s, n_dims=%d, dims=[%d, %d, %d, %d], n_elems=%d, size=%d, fmt=%s, type=%s, qnt_type=%s, " 5 | "zp=%d, scale=%f\n", 6 | attr->index, attr->name, attr->n_dims, attr->dims[0], attr->dims[1], attr->dims[2], attr->dims[3], 7 | attr->n_elems, attr->size, get_format_string(attr->fmt), get_type_string(attr->type), 8 | get_qnt_type_string(attr->qnt_type), attr->zp, attr->scale); 9 | } 10 | 11 | static int read_data_from_file(const char *path, char **out_data) 12 | { 13 | FILE *fp = fopen(path, "rb"); 14 | if(fp == NULL) { 15 | printf("fopen %s fail!\n", path); 16 | return -1; 17 | } 18 | fseek(fp, 0, SEEK_END); 19 | int file_size = ftell(fp); 20 | char *data = (char *)malloc(file_size+1); 21 | data[file_size] = 0; 22 | fseek(fp, 0, SEEK_SET); 23 | if(file_size != fread(data, 1, file_size, fp)) { 24 | printf("fread %s fail!\n", path); 25 | free(data); 26 | fclose(fp); 27 | return -1; 28 | } 29 | if(fp) { 30 | fclose(fp); 31 | } 32 | *out_data = data; 33 | return file_size; 34 | } 35 | 36 | int TypeToSize(const rknn_tensor_type& dataType) { 37 | switch (dataType) { 38 | case RKNN_TENSOR_FLOAT32: 39 | return 4; 40 | case RKNN_TENSOR_FLOAT16: 41 | return 2; 42 | case RKNN_TENSOR_INT32: 43 | return 4; 44 | case RKNN_TENSOR_INT8: 45 | return 1; 46 | case RKNN_TENSOR_BOOL: 47 | return 1; 48 | case RKNN_TENSOR_INT64: 49 | return 8; 50 | default: 51 | std::cout << "Unknown data type " << dataType << std::endl; 52 | return 4; 53 
| } 54 | } 55 | 56 | Status RknnFramework::Init(Config config) { 57 | is_dynamic = config.is_dynamic; 58 | int ret; 59 | int model_len = 0; 60 | char *model; 61 | rknn_context ctx = 0; 62 | 63 | // Load RKNN Model 64 | model_len = read_data_from_file(config.model_path.c_str(), &model); 65 | if (model == NULL) 66 | { 67 | printf("load_model fail!\n"); 68 | return Status::INIT_ERROR; 69 | } 70 | 71 | ret = rknn_init(&ctx, model, model_len, 0, NULL); 72 | free(model); 73 | if (ret < 0) 74 | { 75 | printf("rknn_init fail! ret=%d\n", ret); 76 | return Status::INIT_ERROR; 77 | } 78 | 79 | // Get Model Input Output Number 80 | rknn_input_output_num io_num; 81 | ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num)); 82 | if (ret != RKNN_SUCC) 83 | { 84 | printf("rknn_query fail! ret=%d\n", ret); 85 | return Status::INIT_ERROR; 86 | } 87 | printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output); 88 | 89 | // Get Model Input Info 90 | printf("input tensors:\n"); 91 | rknn_tensor_attr input_attrs[io_num.n_input]; 92 | memset(input_attrs, 0, sizeof(input_attrs)); 93 | for (int i = 0; i < io_num.n_input; i++) 94 | { 95 | input_attrs[i].index = i; 96 | ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr)); 97 | if (ret != RKNN_SUCC) 98 | { 99 | printf("rknn_query fail! ret=%d\n", ret); 100 | return Status::INIT_ERROR; 101 | } 102 | dump_tensor_attr(&(input_attrs[i])); 103 | Binding binding; 104 | binding.name = input_attrs[i].name; 105 | binding.size = input_attrs[i].n_elems; 106 | binding.dsize = TypeToSize(input_attrs[i].type); 107 | binding.dims = std::vector{input_attrs[i].dims[0], input_attrs[i].dims[1], input_attrs[i].dims[2], input_attrs[i].dims[3]}; 108 | input_bindings.push_back(binding); 109 | in_index_[binding.name] = i; 110 | if (!is_dynamic && config.input_len[binding.name] != binding.size) { 111 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 112 | << config.input_len[binding.name] << "!=" << binding.size << ")" << std::endl; 113 | return Status::INIT_ERROR; 114 | } 115 | } 116 | 117 | // Get Model Output Info 118 | printf("output tensors:\n"); 119 | rknn_tensor_attr output_attrs[io_num.n_output]; 120 | memset(output_attrs, 0, sizeof(output_attrs)); 121 | for (int i = 0; i < io_num.n_output; i++) 122 | { 123 | output_attrs[i].index = i; 124 | ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr)); 125 | if (ret != RKNN_SUCC) 126 | { 127 | printf("rknn_query fail! ret=%d\n", ret); 128 | return Status::INIT_ERROR; 129 | } 130 | dump_tensor_attr(&(output_attrs[i])); 131 | Binding binding; 132 | binding.name = output_attrs[i].name; 133 | binding.size = output_attrs[i].n_elems; 134 | binding.dsize = TypeToSize(output_attrs[i].type); 135 | binding.dims = std::vector{output_attrs[i].dims[0], output_attrs[i].dims[1], output_attrs[i].dims[2], output_attrs[i].dims[3]}; 136 | output_bindings.push_back(binding); 137 | out_index_[binding.name] = i; 138 | if (!is_dynamic && config.output_len[binding.name] != binding.size) { 139 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 140 | << config.output_len[binding.name] << "!=" << binding.size << ")" << std::endl; 141 | return Status::INIT_ERROR; 142 | } 143 | } 144 | 145 | // Set to context 146 | rknn_ctx = ctx; 147 | 148 | // if (output_attrs[0].qnt_type == RKNN_TENSOR_QNT_AFFINE_ASYMMETRIC && output_attrs[0].type != RKNN_TENSOR_FLOAT16) 149 | // { 150 | // is_quant_ = true; 151 | // } 152 | // else 153 | // { 154 | // is_quant_ = false; 155 | // } 156 | 157 | input_attrs_ = (rknn_tensor_attr *)malloc(io_num.n_input * sizeof(rknn_tensor_attr)); 158 | memcpy(input_attrs_, input_attrs, io_num.n_input * sizeof(rknn_tensor_attr)); 159 | output_attrs_ = (rknn_tensor_attr *)malloc(io_num.n_output * sizeof(rknn_tensor_attr)); 160 | memcpy(output_attrs_, output_attrs, io_num.n_output * sizeof(rknn_tensor_attr)); 161 | 162 | uint32_t model_channel, model_height, model_width; 163 | if (input_attrs[0].fmt == RKNN_TENSOR_NCHW) 164 | { 165 | printf("model is NCHW input fmt\n"); 166 | model_channel = input_attrs[0].dims[1]; 167 | model_height = input_attrs[0].dims[2]; 168 | model_width = input_attrs[0].dims[3]; 169 | } 170 | else 171 | { 172 | printf("model is NHWC input fmt\n"); 173 | model_height = input_attrs[0].dims[1]; 174 | model_width = input_attrs[0].dims[2]; 175 | model_channel = input_attrs[0].dims[3]; 176 | } 177 | printf("model input height=%d, width=%d, channel=%d\n", 178 | model_height, model_width, model_channel); 179 | 180 | return Status::SUCCESS; 181 | } 182 | 183 | RknnFramework::~RknnFramework() { 184 | if (rknn_ctx != 0) 185 | { 186 | rknn_destroy(rknn_ctx); 187 | rknn_ctx = 0; 188 | } 189 | if (input_attrs_ != NULL) 190 | { 191 | free(input_attrs_); 192 | input_attrs_ = NULL; 193 | } 194 | if (output_attrs_ != NULL) 195 | { 196 | free(output_attrs_); 197 | output_attrs_ = NULL; 198 | } 199 | } 200 | 201 | Status RknnFramework::forward(const std::unordered_map &input, std::unordered_map &output) { 202 | rknn_input rknn_input_tensors[input.size()]; 203 | rknn_output rknn_output_tensors[output.size()]; 204 | memset(rknn_input_tensors, 0, sizeof(rknn_input_tensors)); 205 | memset(rknn_output_tensors, 0, sizeof(rknn_output_tensors)); 206 | 207 | int ret = 0; 208 | 209 | for (auto &kv : input) { 210 | size_t idx = in_index_[kv.first]; 211 | auto& binding = this->input_bindings[idx]; 212 | if (input.find(binding.name) == input.end()) { 213 | std::cout << "Cannot find " << binding.name << " from the input tensors!" << std::endl; 214 | return Status::INFERENCE_ERROR; 215 | } 216 | rknn_input_tensors[0].index = idx; 217 | rknn_input_tensors[0].type = RKNN_TENSOR_UINT8; 218 | rknn_input_tensors[0].fmt = RKNN_TENSOR_NHWC; 219 | rknn_input_tensors[0].size = binding.size * binding.dsize; 220 | rknn_input_tensors[0].buf = (void*)kv.second.data(); 221 | } 222 | 223 | ret = rknn_inputs_set(rknn_ctx, input_bindings.size(), rknn_input_tensors); 224 | if (ret < 0) 225 | { 226 | printf("rknn_input_set fail! ret=%d\n", ret); 227 | return Status::INFERENCE_ERROR; 228 | } 229 | 230 | ret = rknn_run(rknn_ctx, nullptr); 231 | if (ret < 0) 232 | { 233 | printf("rknn_run fail! ret=%d\n", ret); 234 | return Status::INFERENCE_ERROR; 235 | } 236 | 237 | memset(rknn_output_tensors, 0, sizeof(rknn_output_tensors)); 238 | for (int i = 0; i < output_bindings.size(); i++) 239 | { 240 | rknn_output_tensors[i].index = i; 241 | rknn_output_tensors[i].want_float = false; 242 | } 243 | ret = rknn_outputs_get(rknn_ctx, output_bindings.size(), rknn_output_tensors, NULL); 244 | if (ret < 0) 245 | { 246 | printf("rknn_outputs_get fail! 
ret=%d\n", ret); 247 | return Status::INFERENCE_ERROR; 248 | } 249 | 250 | for (auto &kv : output) { 251 | auto idx = out_index_[kv.first]; 252 | const auto& binding = this->output_bindings[idx]; 253 | kv.second.resize(binding.size); 254 | if (rknn_output_tensors[idx].size != binding.size) { 255 | return Status::INFERENCE_ERROR; 256 | } 257 | memcpy(kv.second.data(), rknn_output_tensors[idx].buf, kv.second.size()); 258 | kv.second.zp = output_attrs_[idx].zp; 259 | kv.second.scale = output_attrs_[idx].scale; 260 | } 261 | 262 | return Status::SUCCESS; 263 | } 264 | -------------------------------------------------------------------------------- /framework/rknn/rknn.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include "framework/framework.h" 9 | 10 | int TypeToSize(const rknn_tensor_type &dataType); 11 | 12 | class RknnFramework : public BaseFramework { 13 | public: 14 | RknnFramework() {} 15 | ~RknnFramework(); 16 | Status Init(Config config) override; 17 | Status forward(const std::unordered_map &input, 18 | std::unordered_map &output) override; 19 | 20 | private: 21 | rknn_context rknn_ctx; 22 | rknn_tensor_attr* input_attrs_; 23 | rknn_tensor_attr* output_attrs_; 24 | std::unordered_map in_index_; 25 | std::unordered_map out_index_; 26 | bool is_quant_; 27 | }; -------------------------------------------------------------------------------- /framework/tensorrt/tensorrt.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/tensorrt/tensorrt.h" 2 | 3 | void Logger::log(nvinfer1::ILogger::Severity severity, const char *msg) noexcept { 4 | if (severity > reportableSeverity) { 5 | return; 6 | } 7 | switch (severity) { 8 | case nvinfer1::ILogger::Severity::kINTERNAL_ERROR: 9 | std::cerr << "INTERNAL_ERROR: "; 10 | break; 11 | case nvinfer1::ILogger::Severity::kERROR: 12 | std::cerr << "ERROR: "; 13 | break; 14 | case nvinfer1::ILogger::Severity::kWARNING: 15 | std::cerr << "WARNING: "; 16 | break; 17 | case nvinfer1::ILogger::Severity::kINFO: 18 | std::cerr << "INFO: "; 19 | break; 20 | default: 21 | std::cerr << "VERBOSE: "; 22 | break; 23 | } 24 | std::cerr << msg << std::endl; 25 | } 26 | 27 | int TypeToSize(const nvinfer1::DataType &dataType) { 28 | switch (dataType) { 29 | case nvinfer1::DataType::kFLOAT: 30 | return 4; 31 | case nvinfer1::DataType::kHALF: 32 | return 2; 33 | case nvinfer1::DataType::kINT32: 34 | return 4; 35 | case nvinfer1::DataType::kINT8: 36 | return 1; 37 | case nvinfer1::DataType::kBOOL: 38 | return 1; 39 | default: 40 | return 4; 41 | } 42 | } 43 | 44 | Status TensorRTFramework::Init(Config config) { 45 | // 读取模型文件 46 | std::ifstream file(config.model_path, std::ios::binary); 47 | assert(file.good()); 48 | file.seekg(0, std::ios::end); 49 | auto size = file.tellg(); 50 | file.seekg(0, std::ios::beg); 51 | char *trtModelStream = new char[size]; 52 | assert(trtModelStream); 53 | file.read(trtModelStream, size); 54 | file.close(); 55 | 56 | // 加载插件 57 | initLibNvInferPlugins(&this->gLogger, ""); 58 | 59 | // 创建IRuntime对象 60 | this->runtime = nvinfer1::createInferRuntime(this->gLogger); 61 | assert(this->runtime != nullptr); 62 | 63 | // 反序列化engine文件,创建ICudaEngine对象 64 | this->engine = this->runtime->deserializeCudaEngine(trtModelStream, size); 65 | assert(this->engine != nullptr); 66 | delete[] trtModelStream; 67 | 68 | // 初始化IExecutionContext对象 69 | this->context = 
this->engine->createExecutionContext(); 70 | assert(this->context != nullptr); 71 | 72 | // Create the CUDA stream 73 | cudaStreamCreate(&this->stream); 74 | 75 | this->is_dynamic = config.is_dynamic; 76 | 77 | this->num_bindings = this->engine->getNbIOTensors(); 78 | for (int i = 0; i < this->num_bindings; ++i) 79 | { 80 | Binding binding; 81 | nvinfer1::Dims dims; 82 | std::string name = this->engine->getIOTensorName(i); 83 | nvinfer1::DataType dtype = this->engine->getTensorDataType(name.c_str()); 84 | binding.name = name; 85 | binding.dsize = TypeToSize(dtype); 86 | 87 | nvinfer1::TensorIOMode io_mode = engine->getTensorIOMode(name.c_str()); 88 | if (io_mode == nvinfer1::TensorIOMode::kINPUT) 89 | { 90 | in_index_[name] = this->num_inputs; 91 | this->num_inputs += 1; 92 | dims = this->engine->getProfileShape(name.c_str(), 0, nvinfer1::OptProfileSelector::kMAX); 93 | binding.size = 1; 94 | std::cout << binding.name << ": ["; 95 | for (int i = 0; i < dims.nbDims; i++) 96 | { 97 | std::cout << dims.d[i] << ","; 98 | binding.size *= dims.d[i]; 99 | binding.dims.push_back(dims.d[i]); 100 | } 101 | std::cout << "]" << std::endl; 102 | if (!is_dynamic && config.input_len[binding.name] != binding.size) { 103 | std::cout << "Input size of " << binding.name << " mismatch the model file " << config.model_path << ". (" 104 | << config.input_len[binding.name] << "!=" << binding.size << ")" << std::endl; 105 | return Status::INIT_ERROR; 106 | } 107 | this->input_bindings.push_back(binding); 108 | // set max opt shape 109 | this->context->setInputShape(name.c_str(), dims); 110 | std::cout << "Input bind name: " << name << std::endl; 111 | } 112 | else if (io_mode == nvinfer1::TensorIOMode::kOUTPUT) 113 | { 114 | out_index_[name] = this->num_outputs; 115 | dims = this->context->getTensorShape(name.c_str()); 116 | binding.size = 1; 117 | std::cout << binding.name << ": ["; 118 | for (int i = 0; i < dims.nbDims; i++) 119 | { 120 | std::cout << dims.d[i] << ","; 121 | binding.size *= dims.d[i]; 122 | binding.dims.push_back(dims.d[i]); 123 | } 124 | std::cout << "]" << std::endl; 125 | if (!is_dynamic && config.output_len[binding.name] != binding.size) { 126 | std::cout << "Output size of " << binding.name << " mismatch the model file " << config.model_path << ". 
(" 127 | << config.output_len[binding.name] << "!=" << binding.size << ")" << std::endl; 128 | return Status::INIT_ERROR; 129 | } 130 | if (is_dynamic) { 131 | binding.size = config.output_len[binding.name]; 132 | } 133 | this->output_bindings.push_back(binding); 134 | this->num_outputs += 1; 135 | std::cout << "Output bind name: " << name << std::endl; 136 | } 137 | } 138 | make_pipe(true); 139 | return Status::SUCCESS; 140 | } 141 | 142 | TensorRTFramework::~TensorRTFramework() { 143 | delete this->context; 144 | delete this->engine; 145 | delete this->runtime; 146 | cudaStreamDestroy(this->stream); 147 | for (auto &ptr : this->device_ptrs) 148 | { 149 | CHECK(cudaFree(ptr)); 150 | } 151 | 152 | for (auto &ptr : this->host_ptrs) 153 | { 154 | CHECK(cudaFreeHost(ptr)); 155 | } 156 | } 157 | 158 | void TensorRTFramework::make_pipe(bool warmup) { 159 | for (auto &bindings : this->input_bindings) 160 | { 161 | void *d_ptr; 162 | CHECK(cudaMalloc(&d_ptr, bindings.size * bindings.dsize)); 163 | this->device_ptrs.push_back(d_ptr); 164 | this->context->setTensorAddress(bindings.name.c_str(), d_ptr); 165 | } 166 | 167 | for (auto &bindings : this->output_bindings) 168 | { 169 | void *d_ptr, *h_ptr; 170 | size_t size = bindings.size * bindings.dsize; 171 | CHECK(cudaMalloc(&d_ptr, size)); 172 | CHECK(cudaHostAlloc(&h_ptr, size, 0)); 173 | this->device_ptrs.push_back(d_ptr); 174 | this->host_ptrs.push_back(h_ptr); 175 | this->context->setTensorAddress(bindings.name.c_str(), d_ptr); 176 | } 177 | 178 | if (warmup) 179 | { 180 | for (int i = 0; i < 10; i++) 181 | { 182 | for (auto &bindings : this->input_bindings) 183 | { 184 | size_t size = bindings.size * bindings.dsize; 185 | void *h_ptr = malloc(size); 186 | memset(h_ptr, 0, size); 187 | CHECK(cudaMemcpyAsync(this->device_ptrs[0], h_ptr, size, cudaMemcpyHostToDevice, this->stream)); 188 | free(h_ptr); 189 | } 190 | this->infer(); 191 | } 192 | printf("model warmup 10 times\n"); 193 | } 194 | } 195 | 196 | bool TensorRTFramework::set_input(const std::unordered_map &input) { 197 | for (auto &kv : input) { 198 | size_t idx = in_index_[kv.first]; 199 | auto& binding = this->input_bindings[idx]; 200 | if (input.find(binding.name) == input.end()) { 201 | std::cout << "Cannot find " << binding.name << " from the input tensors!" 
<< std::endl; 202 | return false; 203 | } 204 | if (is_dynamic) { 205 | std::vector shape = input.at(binding.name).shape; 206 | nvinfer1::Dims dim; 207 | dim.nbDims = shape.size(); 208 | for (size_t i = 0; i < dim.nbDims; i++) { 209 | dim.d[i] = shape[i]; 210 | } 211 | context->setInputShape(binding.name.c_str(), dim); 212 | } 213 | CHECK(cudaMemcpyAsync( 214 | this->device_ptrs[idx], kv.second.data(), kv.second.size(), cudaMemcpyHostToDevice, this->stream)); 215 | } 216 | return true; 217 | } 218 | 219 | bool TensorRTFramework::infer() { 220 | this->context->enqueueV3(this->stream); 221 | for (int i = 0; i < this->num_outputs; i++) 222 | { 223 | size_t osize = this->output_bindings[i].size * this->output_bindings[i].dsize; 224 | CHECK(cudaMemcpyAsync( 225 | this->host_ptrs[i], this->device_ptrs[i + this->num_inputs], osize, cudaMemcpyDeviceToHost, this->stream)); 226 | } 227 | cudaStreamSynchronize(this->stream); 228 | return true; 229 | } 230 | 231 | Status TensorRTFramework::forward(const std::unordered_map &input, 232 | std::unordered_map &output) { 233 | if (!this->set_input(input)) { 234 | return Status::INFERENCE_ERROR; 235 | } 236 | if (!this->infer()) { 237 | return Status::INFERENCE_ERROR; 238 | } 239 | for (auto &kv : output) { 240 | auto cur_idx = out_index_[kv.first]; 241 | const auto& binding = this->output_bindings[cur_idx]; 242 | memcpy(kv.second.data(), this->host_ptrs[cur_idx], kv.second.size()); 243 | } 244 | return Status::SUCCESS; 245 | } -------------------------------------------------------------------------------- /framework/tensorrt/tensorrt.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "NvInferPlugin.h" 4 | #include 5 | #include "framework/framework.h" 6 | #include "common/common.h" 7 | 8 | #define CHECK(call) \ 9 | do \ 10 | { \ 11 | const cudaError_t error_code = call; \ 12 | if (error_code != cudaSuccess) \ 13 | { \ 14 | printf("CUDA Error:\n"); \ 15 | printf(" File: %s\n", __FILE__); \ 16 | printf(" Line: %d\n", __LINE__); \ 17 | printf(" Error code: %d\n", error_code); \ 18 | printf(" Error text: %s\n", cudaGetErrorString(error_code)); \ 19 | exit(1); \ 20 | } \ 21 | } while (0) 22 | 23 | class Logger : public nvinfer1::ILogger 24 | { 25 | public: 26 | nvinfer1::ILogger::Severity reportableSeverity; 27 | 28 | explicit Logger(nvinfer1::ILogger::Severity severity = nvinfer1::ILogger::Severity::kINFO) : reportableSeverity(severity) 29 | { 30 | } 31 | 32 | void log(nvinfer1::ILogger::Severity severity, const char *msg) noexcept override; 33 | }; 34 | 35 | int TypeToSize(const nvinfer1::DataType &dataType); 36 | 37 | class TensorRTFramework: public BaseFramework 38 | { 39 | public: 40 | explicit TensorRTFramework() {} 41 | virtual ~TensorRTFramework(); 42 | Status Init(Config config) override; 43 | Status forward(const std::unordered_map &input, 44 | std::unordered_map &output) override; 45 | 46 | private: 47 | void make_pipe(bool warmup = true); 48 | bool set_input(const std::unordered_map &input); 49 | bool infer(); 50 | 51 | nvinfer1::ICudaEngine *engine = nullptr; 52 | nvinfer1::IRuntime *runtime = nullptr; 53 | nvinfer1::IExecutionContext *context = nullptr; 54 | cudaStream_t stream = nullptr; 55 | Logger gLogger{nvinfer1::ILogger::Severity::kERROR}; 56 | int num_bindings; 57 | int num_inputs = 0; 58 | int num_outputs = 0; 59 | std::vector host_ptrs; 60 | std::vector device_ptrs; 61 | std::unordered_map in_index_; 62 | std::unordered_map out_index_; 63 | 64 | PreParam pparam; 65 | }; 
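Editor's note: the ONNX Runtime, RKNN and TensorRT backends all implement the same BaseFramework contract declared in framework/framework.h: fill in a Config (model path, per-tensor input/output lengths, dynamic-shape flag), call Init(), then call forward() with name-keyed IOTensor maps. The fragment below is a minimal stand-alone sketch of that calling convention and is not part of the repository sources; the engine path is a placeholder, and the "images"/"output" tensor names and sizes simply mirror the OcrModel defaults (1x32x100 single-channel input, output length 26). Real callers normally go through the Model subclasses, which read this wiring from the YAML configs.

// Hypothetical driver for the framework layer (illustration only).
#include "framework/tensorrt/tensorrt.h"
#include <unordered_map>

int main() {
    Config config;
    config.model_path = "weights/crnn.engine";   // placeholder path to a serialized TensorRT engine
    config.is_dynamic = false;
    config.input_len["images"] = 1 * 32 * 100;   // element count expected by the "images" binding (must match the engine)
    config.output_len["output"] = 26;            // element count produced by the "output" binding

    TensorRTFramework framework;
    if (framework.Init(config) != Status::SUCCESS) return 1;

    std::unordered_map<std::string, IOTensor> input, output;
    input["images"].resize(config.input_len["images"] * sizeof(float));   // normally filled by a model's preprocess()
    output["output"].resize(config.output_len["output"] * sizeof(float)); // forward() copies device results into this buffer
    return framework.forward(input, output) == Status::SUCCESS ? 0 : 1;
}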
-------------------------------------------------------------------------------- /model/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(yaml-cpp) 2 | 3 | add_library(base_model SHARED ${CMAKE_CURRENT_SOURCE_DIR}/base/model.cpp) 4 | target_include_directories(base_model PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 5 | target_link_libraries(base_model framework_onnx) 6 | if(USE_TENSORRT) 7 | target_link_libraries(base_model framework_trt) 8 | endif() 9 | if(USE_RKNN) 10 | target_link_libraries(base_model framework_rknn) 11 | endif() 12 | 13 | add_library(yolo_seg SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_seg.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 14 | target_include_directories(yolo_seg PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 15 | target_link_libraries(yolo_seg base_model yaml-cpp) 16 | target_link_libraries(yolo_seg ${OpenCV_LIBS}) 17 | 18 | add_library(yolo_seg_cutoff SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_seg_cutoff.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 19 | target_include_directories(yolo_seg_cutoff PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 20 | target_link_libraries(yolo_seg_cutoff base_model yaml-cpp) 21 | target_link_libraries(yolo_seg_cutoff ${OpenCV_LIBS}) 22 | 23 | add_library(yolo_pose SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_pose.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 24 | target_include_directories(yolo_pose PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 25 | target_link_libraries(yolo_pose base_model yaml-cpp) 26 | target_link_libraries(yolo_pose ${OpenCV_LIBS}) 27 | 28 | add_library(yolo_det SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 29 | target_include_directories(yolo_det PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 30 | target_link_libraries(yolo_det base_model yaml-cpp) 31 | target_link_libraries(yolo_det ${OpenCV_LIBS}) 32 | 33 | add_library(yolo_det_cutoff SHARED ${CMAKE_CURRENT_SOURCE_DIR}/yolo/yolo_cutoff.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 34 | target_include_directories(yolo_det_cutoff PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 35 | target_link_libraries(yolo_det_cutoff base_model yaml-cpp) 36 | target_link_libraries(yolo_det_cutoff ${OpenCV_LIBS}) 37 | 38 | add_library(ctc SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/ctc.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/ocr_model.cpp) 39 | target_include_directories(ctc PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 40 | target_link_libraries(ctc base_model yaml-cpp) 41 | target_link_libraries(ctc ${OpenCV_LIBS}) 42 | 43 | add_library(attn SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/attention.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/ocr_model.cpp) 44 | target_include_directories(attn PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 45 | target_link_libraries(attn base_model yaml-cpp) 46 | target_link_libraries(attn ${OpenCV_LIBS}) 47 | 48 | add_library(dbnet SHARED ${CMAKE_CURRENT_SOURCE_DIR}/ocr/dbnet.cpp ${CMAKE_CURRENT_SOURCE_DIR}/base/detection_model.cpp) 49 | target_include_directories(dbnet PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 50 | target_link_libraries(dbnet base_model yaml-cpp) 51 | target_link_libraries(dbnet ${OpenCV_LIBS} ${Clipper_LIBS}) 52 | 53 | add_library(sam SHARED ${CMAKE_CURRENT_SOURCE_DIR}/sam/image_encoder.cpp 
${CMAKE_CURRENT_SOURCE_DIR}/sam/mask_decoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sam/sam.cpp) 54 | target_include_directories(sam PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 55 | target_link_libraries(sam base_model yaml-cpp) 56 | target_link_libraries(sam ${OpenCV_LIBS}) 57 | 58 | add_subdirectory(clip) -------------------------------------------------------------------------------- /model/base/detection_model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/detection_model.h" 2 | #include "opencv2/opencv.hpp" 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | void ReadClassNames(std::string file_name, std::vector &class_names) 9 | { 10 | std::ifstream in_file; 11 | in_file.open(file_name, std::ios::in); 12 | assert(in_file.good()); 13 | 14 | std::string name; 15 | while (getline(in_file, name, '\n')) 16 | { 17 | class_names.push_back(name); 18 | } 19 | in_file.close(); 20 | } 21 | 22 | void DrawObjects(const cv::Mat &image, 23 | cv::Mat &res, 24 | const std::vector &objs, 25 | const std::vector &CLASS_NAMES, 26 | const std::vector> &COLORS) 27 | { 28 | res = image.clone(); 29 | for (auto &obj : objs) 30 | { 31 | cv::Scalar color = cv::Scalar(COLORS[obj.label][0], COLORS[obj.label][1], COLORS[obj.label][2]); 32 | cv::rectangle(res, obj.rect, color, 2); 33 | 34 | char text[256]; 35 | sprintf(text, "%s %.1f%%", CLASS_NAMES[obj.label].c_str(), obj.prob * 100); 36 | 37 | int baseLine = 0; 38 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 39 | 40 | int x = (int)obj.rect.x; 41 | int y = (int)obj.rect.y + 1; 42 | 43 | if (y > res.rows) 44 | y = res.rows; 45 | 46 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 47 | 48 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 49 | } 50 | } 51 | 52 | void DrawObjectsMasks(const cv::Mat &image, 53 | cv::Mat &res, 54 | const std::vector &objs, 55 | const std::vector &CLASS_NAMES, 56 | const std::vector> &COLORS, 57 | const std::vector> &MASK_COLORS) 58 | { 59 | res = image.clone(); 60 | cv::Mat mask = image.clone(); 61 | for (auto &obj : objs) 62 | { 63 | int idx = obj.label; 64 | cv::Scalar color = cv::Scalar(COLORS[idx][0], COLORS[idx][1], COLORS[idx][2]); 65 | cv::Scalar mask_color = 66 | cv::Scalar(MASK_COLORS[idx % 20][0], MASK_COLORS[idx % 20][1], MASK_COLORS[idx % 20][2]); 67 | cv::rectangle(res, obj.rect, color, 2); 68 | 69 | char text[256]; 70 | sprintf(text, "%s %.1f%%", CLASS_NAMES[idx].c_str(), obj.prob * 100); 71 | mask(obj.rect).setTo(mask_color, obj.boxMask); 72 | 73 | int baseLine = 0; 74 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 75 | 76 | int x = (int)obj.rect.x; 77 | int y = (int)obj.rect.y + 1; 78 | 79 | if (y > res.rows) 80 | y = res.rows; 81 | 82 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 83 | 84 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 85 | } 86 | cv::addWeighted(res, 0.5, mask, 0.8, 1, res); 87 | } 88 | 89 | void DrawObjectsKps(const cv::Mat& image, 90 | cv::Mat& res, 91 | const std::vector& objs, 92 | const std::vector>& SKELETON, 93 | const std::vector>& KPS_COLORS, 94 | const std::vector>& LIMB_COLORS) 95 | { 96 | res = image.clone(); 97 | const int num_point = 17; 98 | for (auto& obj : objs) { 99 | 
cv::rectangle(res, obj.rect, {0, 0, 255}, 2); 100 | 101 | char text[256]; 102 | sprintf(text, "person %.1f%%", obj.prob * 100); 103 | 104 | int baseLine = 0; 105 | cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.4, 1, &baseLine); 106 | 107 | int x = (int)obj.rect.x; 108 | int y = (int)obj.rect.y + 1; 109 | 110 | if (y > res.rows) 111 | y = res.rows; 112 | 113 | cv::rectangle(res, cv::Rect(x, y, label_size.width, label_size.height + baseLine), {0, 0, 255}, -1); 114 | 115 | cv::putText(res, text, cv::Point(x, y + label_size.height), cv::FONT_HERSHEY_SIMPLEX, 0.4, {255, 255, 255}, 1); 116 | 117 | auto& kps = obj.kps; 118 | for (int k = 0; k < num_point + 2; k++) { 119 | if (k < num_point) { 120 | int kps_x = std::round(kps[k * 3]); 121 | int kps_y = std::round(kps[k * 3 + 1]); 122 | float kps_s = kps[k * 3 + 2]; 123 | if (kps_s > 0.5f) { 124 | cv::Scalar kps_color = cv::Scalar(KPS_COLORS[k][0], KPS_COLORS[k][1], KPS_COLORS[k][2]); 125 | cv::circle(res, {kps_x, kps_y}, 5, kps_color, -1); 126 | } 127 | } 128 | auto& ske = SKELETON[k]; 129 | int pos1_x = std::round(kps[(ske[0] - 1) * 3]); 130 | int pos1_y = std::round(kps[(ske[0] - 1) * 3 + 1]); 131 | 132 | int pos2_x = std::round(kps[(ske[1] - 1) * 3]); 133 | int pos2_y = std::round(kps[(ske[1] - 1) * 3 + 1]); 134 | 135 | float pos1_s = kps[(ske[0] - 1) * 3 + 2]; 136 | float pos2_s = kps[(ske[1] - 1) * 3 + 2]; 137 | 138 | if (pos1_s > 0.5f && pos2_s > 0.5f) { 139 | cv::Scalar limb_color = cv::Scalar(LIMB_COLORS[k][0], LIMB_COLORS[k][1], LIMB_COLORS[k][2]); 140 | cv::line(res, {pos1_x, pos1_y}, {pos2_x, pos2_y}, limb_color, 2); 141 | } 142 | } 143 | } 144 | } 145 | 146 | void DrawBoxes(const cv::Mat &image, 147 | cv::Mat &res, 148 | const std::vector &objs) { 149 | res = image.clone(); 150 | for (auto &obj : objs) 151 | { 152 | cv::Scalar color = cv::Scalar(COLORS[obj.label][0], COLORS[obj.label][1], COLORS[obj.label][2]); 153 | cv::rectangle(res, obj.rect, color, 2); 154 | } 155 | } 156 | 157 | float Iou(cv::Rect bb_test, cv::Rect bb_gt) 158 | { 159 | float in = (bb_test & bb_gt).area(); 160 | float un = bb_test.area() + bb_gt.area() - in; 161 | 162 | if (un < DBL_EPSILON) 163 | return 0; 164 | 165 | return in / un; 166 | } 167 | 168 | void Nms(std::vector &res, float nms_thresh) 169 | { 170 | std::map> m; 171 | for (const auto &obj : res) 172 | { 173 | if (m.count(obj.label) == 0) 174 | { 175 | m.emplace(obj.label, std::vector()); 176 | } 177 | m[obj.label].push_back(obj); 178 | } 179 | auto cmp = [](const Object &a, const Object &b) 180 | { 181 | return a.prob > b.prob; 182 | }; 183 | res.clear(); 184 | for (auto it = m.begin(); it != m.end(); it++) 185 | { 186 | auto &dets = it->second; 187 | std::sort(dets.begin(), dets.end(), cmp); 188 | for (size_t m = 0; m < dets.size(); ++m) 189 | { 190 | auto &item = dets[m]; 191 | res.push_back(item); 192 | for (size_t n = m + 1; n < dets.size(); ++n) 193 | { 194 | if (Iou(item.rect, dets[n].rect) > nms_thresh) 195 | { 196 | dets.erase(dets.begin() + n); 197 | --n; 198 | } 199 | } 200 | } 201 | } 202 | } -------------------------------------------------------------------------------- /model/base/detection_model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "common/common.h" 4 | 5 | const std::vector> COLORS = { 6 | {0, 114, 189}, {217, 83, 25}, {237, 177, 32}, {126, 47, 142}, {119, 172, 48}, {77, 190, 238}, {162, 20, 47}, {76, 76, 76}, {153, 153, 153}, {255, 0, 0}, {255, 128, 
0}, {191, 191, 0}, {0, 255, 0}, {0, 0, 255}, {170, 0, 255}, {85, 85, 0}, {85, 170, 0}, {85, 255, 0}, {170, 85, 0}, {170, 170, 0}, {170, 255, 0}, {255, 85, 0}, {255, 170, 0}, {255, 255, 0}, {0, 85, 128}, {0, 170, 128}, {0, 255, 128}, {85, 0, 128}, {85, 85, 128}, {85, 170, 128}, {85, 255, 128}, {170, 0, 128}, {170, 85, 128}, {170, 170, 128}, {170, 255, 128}, {255, 0, 128}, {255, 85, 128}, {255, 170, 128}, {255, 255, 128}, {0, 85, 255}, {0, 170, 255}, {0, 255, 255}, {85, 0, 255}, {85, 85, 255}, {85, 170, 255}, {85, 255, 255}, {170, 0, 255}, {170, 85, 255}, {170, 170, 255}, {170, 255, 255}, {255, 0, 255}, {255, 85, 255}, {255, 170, 255}, {85, 0, 0}, {128, 0, 0}, {170, 0, 0}, {212, 0, 0}, {255, 0, 0}, {0, 43, 0}, {0, 85, 0}, {0, 128, 0}, {0, 170, 0}, {0, 212, 0}, {0, 255, 0}, {0, 0, 43}, {0, 0, 85}, {0, 0, 128}, {0, 0, 170}, {0, 0, 212}, {0, 0, 255}, {0, 0, 0}, {36, 36, 36}, {73, 73, 73}, {109, 109, 109}, {146, 146, 146}, {182, 182, 182}, {219, 219, 219}, {0, 114, 189}, {80, 183, 189}, {128, 128, 0}}; 7 | 8 | const std::vector> MASK_COLORS = { 9 | {255, 56, 56}, {255, 157, 151}, {255, 112, 31}, {255, 178, 29}, {207, 210, 49}, {72, 249, 10}, {146, 204, 23}, {61, 219, 134}, {26, 147, 52}, {0, 212, 187}, {44, 153, 168}, {0, 194, 255}, {52, 69, 147}, {100, 115, 255}, {0, 24, 236}, {132, 56, 255}, {82, 0, 133}, {203, 56, 255}, {255, 149, 200}, {255, 55, 199}}; 10 | 11 | const std::vector> KPS_COLORS = {{0, 255, 0}, 12 | {0, 255, 0}, 13 | {0, 255, 0}, 14 | {0, 255, 0}, 15 | {0, 255, 0}, 16 | {255, 128, 0}, 17 | {255, 128, 0}, 18 | {255, 128, 0}, 19 | {255, 128, 0}, 20 | {255, 128, 0}, 21 | {255, 128, 0}, 22 | {51, 153, 255}, 23 | {51, 153, 255}, 24 | {51, 153, 255}, 25 | {51, 153, 255}, 26 | {51, 153, 255}, 27 | {51, 153, 255}}; 28 | 29 | const std::vector> SKELETON = {{16, 14}, 30 | {14, 12}, 31 | {17, 15}, 32 | {15, 13}, 33 | {12, 13}, 34 | {6, 12}, 35 | {7, 13}, 36 | {6, 7}, 37 | {6, 8}, 38 | {7, 9}, 39 | {8, 10}, 40 | {9, 11}, 41 | {2, 3}, 42 | {1, 2}, 43 | {1, 3}, 44 | {2, 4}, 45 | {3, 5}, 46 | {4, 6}, 47 | {5, 7}}; 48 | 49 | const std::vector> LIMB_COLORS = {{51, 153, 255}, 50 | {51, 153, 255}, 51 | {51, 153, 255}, 52 | {51, 153, 255}, 53 | {255, 51, 255}, 54 | {255, 51, 255}, 55 | {255, 51, 255}, 56 | {255, 128, 0}, 57 | {255, 128, 0}, 58 | {255, 128, 0}, 59 | {255, 128, 0}, 60 | {255, 128, 0}, 61 | {0, 255, 0}, 62 | {0, 255, 0}, 63 | {0, 255, 0}, 64 | {0, 255, 0}, 65 | {0, 255, 0}, 66 | {0, 255, 0}, 67 | {0, 255, 0}}; 68 | 69 | void ReadClassNames(std::string file_name, std::vector &class_names); 70 | 71 | struct Object 72 | { 73 | cv::Rect_ rect; 74 | int label = 0; 75 | float prob = 0.0; 76 | cv::Mat boxMask; 77 | std::vector kps; 78 | }; 79 | 80 | void DrawObjects(const cv::Mat &image, 81 | cv::Mat &res, 82 | const std::vector &objs, 83 | const std::vector &CLASS_NAMES, 84 | const std::vector> &COLORS); 85 | 86 | void DrawObjectsMasks(const cv::Mat &image, 87 | cv::Mat &res, 88 | const std::vector &objs, 89 | const std::vector &CLASS_NAMES, 90 | const std::vector> &COLORS, 91 | const std::vector> &MASK_COLORS); 92 | 93 | void DrawObjectsKps(const cv::Mat& image, 94 | cv::Mat& res, 95 | const std::vector& objs, 96 | const std::vector>& SKELETON, 97 | const std::vector>& KPS_COLORS, 98 | const std::vector>& LIMB_COLORS); 99 | 100 | void DrawBoxes(const cv::Mat &image, 101 | cv::Mat &res, 102 | const std::vector &objs); 103 | 104 | float Iou(cv::Rect bb_test, cv::Rect bb_gt); 105 | 106 | void Nms(std::vector &res, float nms_thresh); 107 | 108 | class DetectionModel : public Model 109 | { 110 
| public: 111 | explicit DetectionModel() {}; 112 | virtual ~DetectionModel() {}; 113 | virtual void detect(const cv::Mat &image, std::vector &objs) = 0; 114 | protected: 115 | virtual void preprocess(const cv::Mat &input_image, cv::Mat &output_image) = 0; 116 | virtual void postprocess(const std::unordered_map &output, std::vector &objs) = 0; 117 | }; -------------------------------------------------------------------------------- /model/base/model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/model.h" 2 | 3 | bool Model::Init(const std::string &model_path, const std::string &framework_type) { 4 | config_.model_path = model_path; 5 | if (framework_type == "TensorRT") 6 | { 7 | #ifdef USE_TENSORRT 8 | framework_ = std::make_shared(); 9 | #else 10 | std::cout << "Framework " << framework_type << " not implemented" <(); 17 | } 18 | else if (framework_type == "RKNN") 19 | { 20 | #ifdef USE_RKNN 21 | framework_ = std::make_shared(); 22 | #else 23 | std::cout << "Framework " << framework_type << " not implemented" < framework_; 24 | }; -------------------------------------------------------------------------------- /model/base/ocr_model.cpp: -------------------------------------------------------------------------------- 1 | #include "model/base/ocr_model.h" 2 | #include 3 | 4 | OcrModel::OcrModel(const std::string &yaml_file) { 5 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 6 | 7 | std::string model_path = yaml_node["model_path"].as(); 8 | 9 | std::string framework_type = yaml_node["framework"].as(); 10 | if (!Init(model_path, framework_type)) exit(0); 11 | 12 | std::vector input_size = yaml_node["input_size"].as>(); 13 | m_input_size_.width = input_size.at(0); 14 | m_input_size_.height = input_size.at(1); 15 | m_input_channel_ = yaml_node["input_channel"].as(); 16 | 17 | m_output_length_ = yaml_node["output_size"].as(); 18 | 19 | alphabet_ = yaml_node["alphabet"].as(); 20 | 21 | config_.input_len["images"] = m_input_size_.height * m_input_size_.width * m_input_channel_; 22 | config_.output_len["output"] = m_output_length_; 23 | config_.is_dynamic = false; 24 | Status status = framework_->Init(config_); 25 | if (status != Status::SUCCESS) { 26 | std::cout << "Failed to init framework" << std::endl; 27 | exit(0); 28 | } 29 | } 30 | 31 | OcrModel::~OcrModel() { std::cout << "Destruct ocr model" << std::endl; } -------------------------------------------------------------------------------- /model/base/ocr_model.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "common/common.h" 4 | 5 | class OcrModel : public Model 6 | { 7 | public: 8 | OcrModel() = delete; 9 | explicit OcrModel(const std::string &yaml_file); 10 | virtual ~OcrModel(); 11 | virtual std::string detect(const cv::Mat &image) = 0; 12 | protected: 13 | virtual std::string postprocess(const std::unordered_map &output) = 0; 14 | cv::Size m_input_size_ = {32, 100}; 15 | size_t m_input_channel_ = 1; 16 | size_t m_output_length_ = 26; 17 | std::string alphabet_; 18 | }; -------------------------------------------------------------------------------- /model/clip/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | find_package(ZLIB REQUIRED) 2 | 3 | add_library(clip SHARED ${CMAKE_CURRENT_SOURCE_DIR}/text_tokenizer.cpp 4 | ${CMAKE_CURRENT_SOURCE_DIR}/image_encoder.cpp 5 | ${CMAKE_CURRENT_SOURCE_DIR}/text_encoder.cpp 6 | 
${CMAKE_CURRENT_SOURCE_DIR}/clip.cpp) 7 | target_include_directories(clip PUBLIC 8 | ${CMAKE_SOURCE_DIR} 9 | ${ICU_INCLUDE_DIRS} 10 | ${INCLUDE_DIRS} 11 | ${PROJECT_BINARY_DIR}) 12 | target_link_libraries(clip base_model yaml-cpp) 13 | target_link_libraries(clip ${ZLIB_LIBRARIES} ${ICU_LIBRARIES} ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /model/clip/clip.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/clip.h" 2 | #include 3 | 4 | using namespace clip; 5 | 6 | static void normalize(IOTensor& tensor, size_t size) { 7 | float *ptr = (float*)tensor.data(); 8 | for (size_t i = 0; i < size; i++) { 9 | float norm = 0.0; 10 | for (size_t j = 0; j < 512; j++) { 11 | norm += std::pow(*(ptr+j), 2); 12 | } 13 | norm = std::sqrt(norm); 14 | 15 | for (size_t j = 0; j < 512; j++) { 16 | *ptr = *ptr / norm; 17 | ++ptr; 18 | } 19 | } 20 | } 21 | 22 | static void ReadPrompt(const std::string& prompt_path, std::vector& prompts) { 23 | std::ifstream file(prompt_path); 24 | 25 | if (file.is_open()) { 26 | std::string line; 27 | while (std::getline(file, line)) { 28 | prompts.push_back(line); // read the file line by line and append each line to the vector 29 | } 30 | file.close(); // close the file 31 | } else { 32 | std::cout << "Failed to open the prompt file" << std::endl; 33 | } 34 | } 35 | 36 | static void ReadTextEmbedding(const std::string& path, std::vector& text_embeddings) { 37 | std::streampos size; 38 | std::ifstream fin(path.c_str(), std::ios::binary | std::ios::in); 39 | fin.seekg(0, std::ios::end); 40 | size = fin.tellg(); 41 | text_embeddings.resize(size/sizeof(float)); 42 | fin.seekg(0, std::ios::beg); 43 | fin.read((char *)text_embeddings.data(), size); 44 | fin.close(); 45 | } 46 | 47 | Clip::Clip(const std::string& image_encoder_cfg, const std::string& text_encoder_cfg) { 48 | m_image_encoder_ = std::make_shared(image_encoder_cfg); 49 | 50 | YAML::Node yaml_node = YAML::LoadFile(text_encoder_cfg); 51 | bool online = yaml_node["online"].as(); 52 | if (online) { 53 | m_text_encoder_ = std::make_shared(text_encoder_cfg); 54 | } 55 | 56 | std::string prompt_path = yaml_node["prompts"].as(); 57 | std::vector prompts; 58 | ReadPrompt(prompt_path, prompts); 59 | 60 | std::string text_embedding_path = yaml_node["text_embedding"].as(); 61 | std::vector embeddings; 62 | ReadTextEmbedding(text_embedding_path, embeddings); 63 | 64 | float* ptr = embeddings.data(); 65 | for (size_t i = 0; i < prompts.size(); i++) { 66 | cache_[prompts[i]] = std::vector(512, 0.0); 67 | for (size_t j = 0; j < 512; j++) { 68 | cache_[prompts[i]][j] = *ptr++; 69 | } 70 | } 71 | } 72 | 73 | void Clip::encodeImages(const std::vector& images) { 74 | m_image_encoder_->forward(images, image_embeddings); 75 | normalize(image_embeddings, images.size()); 76 | } 77 | 78 | void Clip::encodeTexts(const std::vector& texts) { 79 | std::vector texts_not_in_cache; 80 | for (const auto& text: texts) { 81 | if (!cache_.count(text)) { 82 | texts_not_in_cache.push_back(text); 83 | } 84 | } 85 | 86 | float* ptr; 87 | 88 | if (!texts_not_in_cache.empty()) { 89 | if (!m_text_encoder_) { 90 | std::cout << "The text encoder is offline. 
Failed to generate text embeddings for text out of prompt list" << std::endl; 91 | exit(0); 92 | } 93 | IOTensor embeddings; 94 | m_text_encoder_->forward(texts_not_in_cache, embeddings); 95 | ptr = (float*)embeddings.data(); 96 | for (size_t i = 0; i < texts_not_in_cache.size(); i++) { 97 | cache_[texts_not_in_cache[i]] = std::vector(512, 0.0); 98 | for (size_t j = 0; j < 512; j++) { 99 | cache_[texts_not_in_cache[i]][j] = *ptr++; 100 | } 101 | } 102 | } 103 | 104 | text_embeddings.resize(texts.size() * 512 * sizeof(float)); 105 | text_embeddings.shape = std::vector{static_cast(texts.size()), 512}; 106 | text_embeddings.data_type = DataType::FP32; 107 | 108 | ptr = (float*)text_embeddings.data(); 109 | for (const auto& text : texts) { 110 | memcpy(ptr, cache_[text].data(), 512 * sizeof(float)); 111 | ptr += 512; 112 | } 113 | 114 | normalize(text_embeddings, texts.size()); 115 | } 116 | 117 | std::vector> Clip::computeProbabilities() { 118 | size_t num_images = image_embeddings.shape[0]; 119 | size_t num_texts = text_embeddings.shape[0]; 120 | cv::Mat image_matrix(num_images, 512, CV_32F, image_embeddings.data()); 121 | cv::Mat text_matrix(num_texts, 512, CV_32F, text_embeddings.data()); 122 | cv::Mat logits; 123 | cv::gemm(image_matrix, text_matrix.t(), 100, cv::Mat(), 0.0, logits); 124 | 125 | std::vector> probs; 126 | float *ptr = logits.ptr(); 127 | for (size_t i = 0; i < num_images; i++) { 128 | float exp_sum = 0.0; 129 | for (size_t j = 0; j < num_texts; j++) { 130 | exp_sum += std::exp(*(ptr+j)); 131 | } 132 | std::vector prob; 133 | for (size_t j = 0; j < num_texts; j++) { 134 | prob.push_back(std::exp(*(ptr++)) / exp_sum); 135 | } 136 | probs.push_back(prob); 137 | } 138 | return probs; 139 | } -------------------------------------------------------------------------------- /model/clip/clip.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/clip/image_encoder.h" 3 | #include "model/clip/text_encoder.h" 4 | 5 | namespace clip { 6 | 7 | class Clip { 8 | public: 9 | Clip() = delete; 10 | Clip(const std::string& image_encoder_cfg, const std::string& text_encoder_cfg); 11 | void encodeImages(const std::vector& images); 12 | void encodeTexts(const std::vector& texts); 13 | std::vector> computeProbabilities(); 14 | private: 15 | std::shared_ptr m_image_encoder_; 16 | std::shared_ptr m_text_encoder_; 17 | IOTensor image_embeddings; 18 | IOTensor text_embeddings; 19 | std::map> cache_; 20 | }; 21 | } -------------------------------------------------------------------------------- /model/clip/image_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/image_encoder.h" 2 | #include 3 | #include 4 | 5 | using namespace clip; 6 | 7 | ImageEncoder::ImageEncoder(const std::string &yaml_file) : m_input_size_(224, 224), m_output_size_(512) 8 | { 9 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 10 | 11 | std::string model_path = yaml_node["model_path"].as(); 12 | std::string framework_type = yaml_node["framework"].as(); 13 | int max_batch_size = yaml_node["max_batch_size"].as(); 14 | 15 | if (!Init(model_path, framework_type)) exit(0); 16 | 17 | config_.input_len["IMAGE"] = max_batch_size * 3 * m_input_size_.height * m_input_size_.width; 18 | config_.output_len["IMAGE_EMBEDDING"] = max_batch_size * m_output_size_; 19 | config_.is_dynamic = true; 20 | Status status = framework_->Init(config_); 21 | if (status != Status::SUCCESS) { 22 | std::cout << "Failed to 
init framework" << std::endl; 23 | exit(0); 24 | } 25 | } 26 | 27 | ImageEncoder::~ImageEncoder() { 28 | std::cout << "Destruct image encoder" << std::endl; 29 | } 30 | 31 | void ImageEncoder::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 32 | int h = input_image.rows; 33 | int w = input_image.cols; 34 | int resized_h, resized_w; 35 | if (h < w) { 36 | resized_h = 224; 37 | resized_w = int(224 * w / h); 38 | } else { 39 | resized_w = 224; 40 | resized_h = int(resized_w * h / w); 41 | } 42 | cv::Mat resized_img; 43 | cv::resize(input_image, resized_img, cv::Size(resized_w, resized_h)); 44 | 45 | int y_from = (resized_h - 224) / 2; 46 | int x_from = (resized_w - 224) / 2; 47 | cv::Rect roi(x_from, y_from, 224, 224); 48 | resized_img = resized_img(roi); 49 | 50 | cv::Scalar mean(0.48145466*255, 0.4578275*255, 0.40821073*255); 51 | float std = (0.26862954 + 0.26130258 + 0.27577711) / 3 * 255; 52 | cv::dnn::blobFromImage(resized_img, output_image, 1 / std, cv::Size(), cv::Scalar(), false, false, CV_32F); 53 | } 54 | 55 | void ImageEncoder::forward(const std::vector &images, IOTensor& features) { 56 | std::unordered_map input, output; 57 | 58 | input["IMAGE"] = IOTensor(); 59 | input["IMAGE"].resize(images.size() * 3 * m_input_size_.height * m_input_size_.width * sizeof(float)); 60 | input["IMAGE"].shape = std::vector{static_cast(images.size()), 3, m_input_size_.height, m_input_size_.width}; 61 | auto ptr = input["IMAGE"].data(); 62 | for (const auto& image: images) { 63 | cv::Mat nchw; 64 | preprocess(image, nchw); 65 | assert(nchw.total() * nchw.elemSize() == 3 * m_input_size_.height * m_input_size_.width * sizeof(float)); 66 | memcpy(ptr, nchw.ptr(), nchw.total() * nchw.elemSize()); 67 | ptr += nchw.total() * nchw.elemSize(); 68 | } 69 | 70 | // 输出张量设置 71 | output["IMAGE_EMBEDDING"] = IOTensor(); 72 | output["IMAGE_EMBEDDING"].resize(images.size() * config_.output_len["IMAGE_EMBEDDING"] * sizeof(float)); 73 | output["IMAGE_EMBEDDING"].shape = std::vector{static_cast(images.size()), config_.output_len["IMAGE_EMBEDDING"]}; 74 | 75 | this->framework_->forward(input, output); 76 | 77 | features.resize(output["IMAGE_EMBEDDING"].size()); 78 | memcpy(features.data(), output["IMAGE_EMBEDDING"].data(), features.size()); 79 | features.shape = output["IMAGE_EMBEDDING"].shape; 80 | } -------------------------------------------------------------------------------- /model/clip/image_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace clip { 5 | 6 | class ImageEncoder : public Model { 7 | public: 8 | ImageEncoder() = delete; 9 | ImageEncoder(const std::string &yaml_file); 10 | virtual ~ImageEncoder(); 11 | void forward(const std::vector &images, IOTensor &features); 12 | 13 | cv::Size input_size() const { return m_input_size_; } 14 | size_t output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image); 18 | 19 | private: 20 | cv::Size m_input_size_; 21 | size_t m_output_size_; 22 | }; 23 | 24 | } -------------------------------------------------------------------------------- /model/clip/text_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/text_encoder.h" 2 | 3 | #include 4 | 5 | using namespace clip; 6 | 7 | TextEncoder::TextEncoder(const std::string &yaml_file) : m_input_size_(77), m_output_size_(512) { 8 | YAML::Node yaml_node = 
YAML::LoadFile(yaml_file); 9 | 10 | std::string bpe_path = yaml_node["bpe_path"].as(); 11 | m_tokenizer_ = std::make_shared(bpe_path); 12 | 13 | std::string model_path = yaml_node["model_path"].as(); 14 | std::string framework_type = yaml_node["framework"].as(); 15 | 16 | if (!Init(model_path, framework_type)) exit(0); 17 | 18 | config_.input_len["TEXT"] = 2 * m_input_size_; 19 | config_.output_len["TEXT_EMBEDDING"] = 2 * m_output_size_; 20 | config_.is_dynamic = true; 21 | Status status = framework_->Init(config_); 22 | if (status != Status::SUCCESS) { 23 | std::cout << "Failed to init framework" << std::endl; 24 | exit(0); 25 | } 26 | } 27 | 28 | TextEncoder::~TextEncoder() { std::cout << "Destruct text encoder" << std::endl; } 29 | 30 | void TextEncoder::preprocess(const std::vector &texts, IOTensor &text_embeddings) { 31 | std::vector> tokens = m_tokenizer_->batchTokenize(texts); 32 | std::vector tensor; 33 | for (const auto &token : tokens) { 34 | for (int i : token) { 35 | tensor.push_back(i); 36 | } 37 | } 38 | 39 | text_embeddings.resize(tensor.size() * sizeof(int)); 40 | text_embeddings.shape = 41 | std::vector{static_cast(texts.size()), static_cast(tokens[0].size())}; 42 | text_embeddings.data_type = DataType::INT32; 43 | memcpy(text_embeddings.data(), tensor.data(), text_embeddings.size()); 44 | } 45 | 46 | void TextEncoder::forward(const std::vector &texts, IOTensor &features) { 47 | std::unordered_map input, output; 48 | 49 | input["TEXT"] = IOTensor(); 50 | preprocess(texts, input["TEXT"]); 51 | 52 | output["TEXT_EMBEDDING"] = IOTensor(); 53 | output["TEXT_EMBEDDING"].resize(texts.size() * m_output_size_ * sizeof(float)); 54 | output["TEXT_EMBEDDING"].shape = 55 | std::vector{static_cast(texts.size()), static_cast(m_output_size_)}; 56 | output["TEXT_EMBEDDING"].data_type = DataType::FP32; 57 | 58 | this->framework_->forward(input, output); 59 | 60 | features.resize(output["TEXT_EMBEDDING"].size()); 61 | memcpy(features.data(), output["TEXT_EMBEDDING"].data(), features.size()); 62 | features.shape = output["TEXT_EMBEDDING"].shape; 63 | features.data_type = output["TEXT_EMBEDDING"].data_type; 64 | } -------------------------------------------------------------------------------- /model/clip/text_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | #include "model/clip/text_tokenizer.h" 4 | 5 | namespace clip { 6 | class TextEncoder : public Model { 7 | public: 8 | TextEncoder() = delete; 9 | TextEncoder(const std::string &yaml_file); 10 | virtual ~TextEncoder(); 11 | void forward(const std::vector &texts, IOTensor &features); 12 | 13 | size_t input_size() const { return m_input_size_; } 14 | size_t output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const std::vector &texts, IOTensor &text_embeddings); 18 | 19 | private: 20 | std::shared_ptr m_tokenizer_; 21 | size_t m_input_size_; 22 | size_t m_output_size_; 23 | }; 24 | } -------------------------------------------------------------------------------- /model/clip/text_tokenizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace clip { 14 | 15 | class TextTokenizer { 16 | public: 17 | TextTokenizer() = delete; 18 | TextTokenizer(const std::string &path); 19 | 20 | std::vector tokenize(const std::string &text, 
size_t context_length = 77); 21 | std::vector> batchTokenize(const std::vector &texts, size_t context_length = 77); 22 | 23 | private: 24 | std::string bpe(const std::string &token); 25 | void encode(const std::string &str, std::vector &bpe_tokens); 26 | std::string decode(const std::vector &bpe_tokens); 27 | 28 | std::map byte_encoder; 29 | std::map byte_decoder; 30 | std::map encoder; 31 | std::map decoder; 32 | std::map, size_t> bpe_ranks; 33 | std::map cache; 34 | std::regex pattern; 35 | }; 36 | } // namespace clip 37 | -------------------------------------------------------------------------------- /model/ocr/attention.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/attention.h" 2 | 3 | #include 4 | 5 | std::string AttnModel::detect(const cv::Mat &image) { 6 | std::unordered_map input, output; 7 | 8 | // Set up the input tensor 9 | cv::Mat nchw; 10 | cv::dnn::blobFromImage(image, nchw, 1 / 64.f, m_input_size_, cv::Scalar(127.5, 127.5, 127.5), false, false, CV_32F); 11 | 12 | input["images"] = IOTensor(); 13 | input["images"].shape = std::vector{1, static_cast(m_input_channel_), m_input_size_.height, m_input_size_.width}; 14 | input["images"].data_type = DataType::FP32; 15 | input["images"].resize(nchw.total() * nchw.elemSize()); 16 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 17 | 18 | // Set up the output tensor 19 | output["output"] = IOTensor(); 20 | output["output"].shape = std::vector{1, static_cast(m_output_length_)}; 21 | output["output"].data_type = DataType::FP32; 22 | output["output"].resize(m_output_length_ * sizeof(float)); 23 | 24 | this->framework_->forward(input, output); 25 | return postprocess(output); 26 | } 27 | 28 | std::string AttnModel::postprocess(const std::unordered_map &output) { 29 | float *const outputs = (float *)output.at("output").data(); 30 | std::string str; 31 | for (size_t i = 0; i < m_output_length_; i++) { 32 | int idx = static_cast(outputs[i]); 33 | if (idx != 0){ 34 | str.push_back(alphabet_[idx - 1]); 35 | } else { 36 | break; 37 | } 38 | } 39 | return str; 40 | } -------------------------------------------------------------------------------- /model/ocr/attention.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/ocr_model.h" 5 | 6 | class AttnModel : public OcrModel { 7 | public: 8 | AttnModel() = delete; 9 | explicit AttnModel(const std::string &yaml_file) : OcrModel(yaml_file) {} 10 | ~AttnModel() {} 11 | 12 | std::string detect(const cv::Mat &image) override; 13 | 14 | protected: 15 | std::string postprocess(const std::unordered_map &output) override; 16 | }; -------------------------------------------------------------------------------- /model/ocr/ctc.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/ctc.h" 2 | 3 | std::string CtcModel::detect(const cv::Mat &image) { 4 | std::unordered_map input, output; 5 | 6 | // Set up the input tensor 7 | cv::Mat nchw; 8 | cv::dnn::blobFromImage(image, nchw, 1 / 127.5f, m_input_size_, cv::Scalar(127.5, 127.5, 127.5), false, false, CV_32F); 9 | 10 | input["images"] = IOTensor(); 11 | input["images"].shape = std::vector{1, static_cast(m_input_channel_), m_input_size_.height, m_input_size_.width}; 12 | input["images"].data_type = DataType::FP32; 13 | input["images"].resize(nchw.total() * nchw.elemSize()); 14 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 15 | 16 | // Set up the output tensor 
17 | output["output"] = IOTensor(); 18 | output["output"].shape = std::vector{1, static_cast(m_output_length_), 1}; 19 | output["output"].data_type = DataType::FP32; 20 | output["output"].resize(config_.output_len["output"] * sizeof(float)); 21 | 22 | this->framework_->forward(input, output); 23 | return postprocess(output); 24 | } 25 | 26 | std::string CtcModel::postprocess(const std::unordered_map &output) { 27 | float *const outputs = (float *)output.at("output").data(); 28 | std::string str; 29 | for (size_t i = 0; i < m_output_length_; i++) { 30 | int idx = static_cast(outputs[i]); 31 | if (idx == 0 || (i > 0 && static_cast(outputs[i-1]) == idx)) continue; 32 | str.push_back(alphabet_[idx - 1]); 33 | } 34 | return str; 35 | } -------------------------------------------------------------------------------- /model/ocr/ctc.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/ocr_model.h" 5 | 6 | class CtcModel : public OcrModel { 7 | public: 8 | CtcModel() = delete; 9 | explicit CtcModel(const std::string &yaml_file): OcrModel(yaml_file) {} 10 | ~CtcModel() {} 11 | 12 | std::string detect(const cv::Mat &image) override; 13 | 14 | protected: 15 | std::string postprocess(const std::unordered_map &output) override; 16 | 17 | }; -------------------------------------------------------------------------------- /model/ocr/dbnet.cpp: -------------------------------------------------------------------------------- 1 | #include "model/ocr/dbnet.h" 2 | 3 | #include 4 | #include "polyclipping/clipper.hpp" 5 | 6 | static cv::RotatedRect expandBox(cv::Point2f temp[], float ratio) 7 | { 8 | ClipperLib::Path path = { 9 | {ClipperLib::cInt(temp[0].x), ClipperLib::cInt(temp[0].y)}, 10 | {ClipperLib::cInt(temp[1].x), ClipperLib::cInt(temp[1].y)}, 11 | {ClipperLib::cInt(temp[2].x), ClipperLib::cInt(temp[2].y)}, 12 | {ClipperLib::cInt(temp[3].x), ClipperLib::cInt(temp[3].y)}}; 13 | double area = ClipperLib::Area(path); 14 | double distance; 15 | double length = 0.0; 16 | for (int i = 0; i < 4; i++) { 17 | length = length + sqrtf(powf((temp[i].x - temp[(i + 1) % 4].x), 2) + 18 | powf((temp[i].y - temp[(i + 1) % 4].y), 2)); 19 | } 20 | 21 | distance = area * ratio / length; 22 | 23 | ClipperLib::ClipperOffset offset; 24 | offset.AddPath(path, ClipperLib::JoinType::jtRound, 25 | ClipperLib::EndType::etClosedPolygon); 26 | ClipperLib::Paths paths; 27 | offset.Execute(paths, distance); 28 | 29 | std::vector contour; 30 | for (size_t i = 0; i < paths[0].size(); i++) { 31 | contour.emplace_back(paths[0][i].X, paths[0][i].Y); 32 | } 33 | offset.Clear(); 34 | return cv::minAreaRect(contour); 35 | } 36 | 37 | static bool get_mini_boxes(cv::RotatedRect& rotated_rect, cv::Point2f rect[], 38 | int min_size) 39 | { 40 | 41 | cv::Point2f temp_rect[4]; 42 | rotated_rect.points(temp_rect); 43 | for (int i = 0; i < 4; i++) { 44 | for (int j = i + 1; j < 4; j++) { 45 | if (temp_rect[i].x > temp_rect[j].x) { 46 | cv::Point2f temp; 47 | temp = temp_rect[i]; 48 | temp_rect[i] = temp_rect[j]; 49 | temp_rect[j] = temp; 50 | } 51 | } 52 | } 53 | int index0 = 0; 54 | int index1 = 1; 55 | int index2 = 2; 56 | int index3 = 3; 57 | if (temp_rect[1].y > temp_rect[0].y) { 58 | index0 = 0; 59 | index3 = 1; 60 | } else { 61 | index0 = 1; 62 | index3 = 0; 63 | } 64 | if (temp_rect[3].y > temp_rect[2].y) { 65 | index1 = 2; 66 | index2 = 3; 67 | } else { 68 | index1 = 3; 69 | index2 = 2; 70 | } 71 | 72 | rect[0] = temp_rect[index0]; // Left top coordinate 
73 | rect[1] = temp_rect[index1]; // Left bottom coordinate 74 | rect[2] = temp_rect[index2]; // Right bottom coordinate 75 | rect[3] = temp_rect[index3]; // Right top coordinate 76 | 77 | if (rotated_rect.size.width < min_size || 78 | rotated_rect.size.height < min_size) { 79 | return false; 80 | } else { 81 | return true; 82 | } 83 | } 84 | 85 | static float get_box_score(float* map, cv::Point2f rect[], int width, int height, 86 | float threshold) 87 | { 88 | 89 | int xmin = width - 1; 90 | int ymin = height - 1; 91 | int xmax = 0; 92 | int ymax = 0; 93 | 94 | for (int j = 0; j < 4; j++) { 95 | if (rect[j].x < xmin) { 96 | xmin = rect[j].x; 97 | } 98 | if (rect[j].y < ymin) { 99 | ymin = rect[j].y; 100 | } 101 | if (rect[j].x > xmax) { 102 | xmax = rect[j].x; 103 | } 104 | if (rect[j].y > ymax) { 105 | ymax = rect[j].y; 106 | } 107 | } 108 | float sum = 0; 109 | int num = 0; 110 | for (int i = ymin; i <= ymax; i++) { 111 | for (int j = xmin; j <= xmax; j++) { 112 | if (map[i * width + j] > threshold) { 113 | sum = sum + map[i * width + j]; 114 | num++; 115 | } 116 | } 117 | } 118 | 119 | return sum / num; 120 | } 121 | 122 | DBNet::DBNet(const std::string &yaml_file) { 123 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 124 | 125 | std::string model_path = yaml_node["model_path"].as(); 126 | std::string framework_type = yaml_node["framework"].as(); 127 | 128 | m_box_thres_ = yaml_node["box_thres"].as(); 129 | std::vector max_input_size = yaml_node["max_input_size"].as>(); 130 | 131 | if (!Init(model_path, framework_type)) exit(0); 132 | 133 | config_.input_len["images"] = max_input_size[0] * max_input_size[1] * max_input_size[2] * max_input_size[3]; 134 | config_.output_len["output"] = max_input_size[0] * 2 * max_input_size[2] * max_input_size[3]; 135 | config_.is_dynamic = true; 136 | Status status = framework_->Init(config_); 137 | if (status != Status::SUCCESS) { 138 | std::cout << "Failed to init framework" << std::endl; 139 | exit(0); 140 | } 141 | } 142 | 143 | DBNet::~DBNet() 144 | { 145 | std::cout << "Destruct dbnet" << std::endl; 146 | } 147 | 148 | void DBNet::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 149 | // mean value [0.406, 0.456, 0.485] * 255 150 | // std value [0.225, 0.225, 0.225] * 255 151 | cv::Mat mask; 152 | this->pparam_ = paddimg(input_image, mask, 640); 153 | cv::dnn::blobFromImage(mask, output_image, 1 / 57.375, cv::Size(), cv::Scalar(103.53f, 116.28f, 123.675f), false, false, CV_32F); 154 | } 155 | 156 | void DBNet::detect(const cv::Mat &image, std::vector &objs) { 157 | std::unordered_map input, output; 158 | 159 | // Set up the input tensor 160 | cv::Mat nchw; 161 | preprocess(image, nchw); 162 | 163 | input["images"] = IOTensor(); 164 | input["images"].resize(nchw.total() * nchw.elemSize()); 165 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 166 | input["images"].shape = std::vector{1, 3, nchw.size[2], nchw.size[3]}; 167 | input["images"].data_type = DataType::FP32; 168 | 169 | 170 | // Set up the output tensor 171 | output["output"] = IOTensor(); 172 | output["output"].resize(2 * nchw.size[2] * nchw.size[3] * sizeof(float)); 173 | output["output"].shape = std::vector{1, 2 ,nchw.size[2] ,nchw.size[3]}; 174 | output["output"].data_type = DataType::FP32; 175 | 176 | this->framework_->forward(input, output); 177 | postprocess(output, objs); 178 | } 179 | 180 | void DBNet::postprocess(const std::unordered_map &output, std::vector &objs) { 181 | objs.clear(); 182 | 183 | float scale = this->pparam_.ratio; 184 | 185 | float * const prob 
= (float *)output.at("output").data(); 186 | int height = output.at("output").shape[2]; 187 | int width = output.at("output").shape[3]; 188 | 189 | cv::Mat map = cv::Mat::zeros(cv::Size(width, height), CV_8UC1); 190 | for (int h = 0; h < height; ++h) { 191 | uchar *ptr = map.ptr(h); 192 | for (int w = 0; w < width; ++w) { 193 | ptr[w] = (prob[h * width + w] > 0.3) ? 255 : 0; 194 | } 195 | } 196 | 197 | // Extracting minimum circumscribed rectangle 198 | std::vector> contours; 199 | std::vector hierarcy; 200 | cv::findContours(map, contours, hierarcy, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE); 201 | 202 | std::vector boundRect(contours.size()); 203 | std::vector box(contours.size()); 204 | cv::Point2f rect[4]; 205 | cv::Point2f order_rect[4]; 206 | 207 | for (size_t i = 0; i < contours.size(); i++) { 208 | cv::RotatedRect rotated_rect = cv::minAreaRect(cv::Mat(contours[i])); 209 | if (!get_mini_boxes(rotated_rect, rect, m_box_thres_)) { 210 | std::cout << "box too small" << std::endl; 211 | continue; 212 | } 213 | 214 | // drop low score boxes 215 | float score = get_box_score(prob, rect, width, height, 216 | m_score_thres_); 217 | if (score < m_box_thres_) { 218 | // std::cout << "score too low = " << score << ", threshold = " << m_box_thres_ << std::endl; 219 | continue; 220 | } 221 | 222 | // Scaling the predict boxes depend on EXPANDRATIO 223 | cv::RotatedRect expandbox = expandBox(rect, m_expand_ratio_); 224 | expandbox.points(rect); 225 | if (!get_mini_boxes(expandbox, rect, m_box_min_size_ + 2)) { 226 | continue; 227 | } 228 | 229 | // Restore the coordinates to the original image 230 | for (int k = 0; k < 4; k++) { 231 | order_rect[k] = rect[k]; 232 | order_rect[k].x = int(order_rect[k].x * scale); 233 | order_rect[k].y = int(order_rect[k].y * scale); 234 | } 235 | 236 | Object obj; 237 | obj.label = 0; 238 | obj.rect = cv::Rect2i(cv::Point(order_rect[0].x,order_rect[0].y), cv::Point(order_rect[2].x,order_rect[2].y)); 239 | objs.push_back(obj); 240 | } 241 | } -------------------------------------------------------------------------------- /model/ocr/dbnet.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class DBNet : public DetectionModel { 7 | public: 8 | DBNet() = delete; 9 | explicit DBNet(const std::string &yaml_file); 10 | ~DBNet(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | float m_box_thres_ = 0.3f; 21 | float m_expand_ratio_ = 1.5f; 22 | float m_score_thres_ = 0.3f; 23 | int m_box_min_size_ = 5; 24 | PreParam pparam_; 25 | }; -------------------------------------------------------------------------------- /model/ocr/scripts/abinet_export.py: -------------------------------------------------------------------------------- 1 | import os 2 | import onnx 3 | import torch 4 | import argparse 5 | from io import BytesIO 6 | import torch.nn as nn 7 | 8 | from utils import Config 9 | 10 | try: 11 | import onnxsim 12 | except ImportError: 13 | onnxsim = None 14 | 15 | class ONNXModel(nn.Module): 16 | def __init__(self, config, device): 17 | super().__init__() 18 | self.get_model(config, device) 19 | self.load(config.model_checkpoint, device=device) 20 | print('loading pretrained model 
from %s' % config.model_checkpoint) 21 | 22 | def forward(self, x): 23 | logits, length = self.model(x) 24 | scores, labels = logits.max(dim=-1, keepdim=True) 25 | return labels.to(torch.float32) 26 | 27 | def get_model(self, config, device): 28 | import importlib 29 | names = config.model_name.split('.') 30 | module_name, class_name = '.'.join(names[:-1]), names[-1] 31 | cls = getattr(importlib.import_module(module_name), class_name) 32 | self.model = cls(config).eval().to(device) 33 | 34 | def load(self, file, device=None, strict=True): 35 | if device is None: 36 | device = 'cpu' 37 | elif isinstance(device, int): 38 | device = torch.device('cuda', device) 39 | assert os.path.isfile(file) 40 | state = torch.load(file, map_location=device) 41 | if set(state.keys()) == {'model', 'opt'}: 42 | state = state['model'] 43 | self.model.load_state_dict(state, strict=strict) 44 | 45 | def parse_args(): 46 | parser = argparse.ArgumentParser() 47 | parser = argparse.ArgumentParser() 48 | parser.add_argument('-w', 49 | '--weights', 50 | type=str, 51 | default='workdir/train-abinet/best-train-abinet.pth', 52 | help='PyTorch weights') 53 | parser.add_argument('--opset', 54 | type=int, 55 | default=13, 56 | help='ONNX opset version') 57 | parser.add_argument('--sim', 58 | action='store_true', 59 | help='simplify onnx model') 60 | parser.add_argument('--input-shape', 61 | nargs='+', 62 | type=int, 63 | default=[1, 3, 32, 128], 64 | help='Model input shape only for api builder') 65 | parser.add_argument('--cuda', type=int, default=-1) 66 | parser.add_argument('--config', type=str, default='configs/train_abinet.yaml', 67 | help='path to config file') 68 | parser.add_argument('--model_eval', type=str, default='alignment', 69 | choices=['alignment', 'vision', 'language']) 70 | args = parser.parse_args() 71 | assert len(args.input_shape) == 4 72 | return args 73 | 74 | def main(args): 75 | config = Config(args.config) 76 | if args.weights is not None: config.model_checkpoint = args.weights 77 | if args.model_eval is not None: config.model_eval = args.model_eval 78 | config.global_phase = 'test' 79 | config.model_vision_checkpoint, config.model_language_checkpoint = None, None 80 | device = 'cpu' if args.cuda < 0 else f'cuda:{args.cuda}' 81 | config.export = True 82 | 83 | model = ONNXModel(config, device) 84 | fake_input = torch.randn(args.input_shape).to(device) 85 | for _ in range(2): 86 | model(fake_input) 87 | 88 | with BytesIO() as f: 89 | torch.onnx.export( 90 | model, 91 | fake_input, 92 | f, 93 | opset_version=args.opset, 94 | do_constant_folding=True, 95 | export_params=True, 96 | input_names=['images'], 97 | output_names=['output']) 98 | f.seek(0) 99 | onnx_model = onnx.load(f) 100 | 101 | onnx.checker.check_model(onnx_model) 102 | save_path = args.weights.replace('.pth', '.onnx') 103 | 104 | if args.sim: 105 | try: 106 | onnx_model, check = onnxsim.simplify(onnx_model) 107 | assert check, 'assert check failed' 108 | except Exception as e: 109 | print(f'Simplifier failure: {e}') 110 | onnx.save(onnx_model, save_path) 111 | print(f'ONNX export success, saved as {save_path}') 112 | 113 | if __name__ == '__main__': 114 | main(parse_args()) -------------------------------------------------------------------------------- /model/ocr/scripts/crnn_export.py: -------------------------------------------------------------------------------- 1 | import onnx 2 | import torch 3 | import argparse 4 | from io import BytesIO 5 | import torch.nn as nn 6 | 7 | import models.crnn as crnn 8 | try: 9 | import onnxsim 10 
| except ImportError: 11 | onnxsim = None 12 | 13 | class CRNN(nn.Module): 14 | def __init__(self, weights): 15 | super().__init__() 16 | self.crnn = crnn.CRNN(32, 1, 37, 256) 17 | print('loading pretrained model from %s' % weights) 18 | self.crnn.load_state_dict(torch.load(weights)) 19 | 20 | def forward(self, x): 21 | output = self.crnn(x) 22 | scores, labels = output.transpose(0,1).max(dim=-1, keepdim=True) 23 | return labels.to(torch.float32) 24 | 25 | def parse_args(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('-w', 28 | '--weights', 29 | type=str, 30 | required=True, 31 | help='PyTorch crnn weights') 32 | parser.add_argument('--opset', 33 | type=int, 34 | default=11, 35 | help='ONNX opset version') 36 | parser.add_argument('--sim', 37 | action='store_true', 38 | help='simplify onnx model') 39 | parser.add_argument('--input-shape', 40 | nargs='+', 41 | type=int, 42 | default=[1, 1, 32, 100], 43 | help='Model input shape only for api builder') 44 | parser.add_argument('--device', 45 | type=str, 46 | default='cpu', 47 | help='Export ONNX device') 48 | args = parser.parse_args() 49 | assert len(args.input_shape) == 4 50 | return args 51 | 52 | def main(args): 53 | model_path = args.weights 54 | 55 | model = CRNN(model_path) 56 | 57 | model.eval() 58 | model.to(args.device) 59 | fake_input = torch.randn(args.input_shape).to(args.device) 60 | for _ in range(2): 61 | model(fake_input) 62 | 63 | with BytesIO() as f: 64 | torch.onnx.export( 65 | model, 66 | fake_input, 67 | f, 68 | opset_version=args.opset, 69 | input_names=['images'], 70 | output_names=['output']) 71 | f.seek(0) 72 | onnx_model = onnx.load(f) 73 | 74 | onnx.checker.check_model(onnx_model) 75 | save_path = args.weights.replace('.pth', '.onnx') 76 | 77 | if args.sim: 78 | try: 79 | onnx_model, check = onnxsim.simplify(onnx_model) 80 | assert check, 'assert check failed' 81 | except Exception as e: 82 | print(f'Simplifier failure: {e}') 83 | onnx.save(onnx_model, save_path) 84 | print(f'ONNX export success, saved as {save_path}') 85 | 86 | if __name__ == '__main__': 87 | main(parse_args()) -------------------------------------------------------------------------------- /model/ocr/scripts/dbnet_export.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import onnx 3 | from io import BytesIO 4 | 5 | try: 6 | import onnxsim 7 | except ImportError: 8 | onnxsim = None 9 | 10 | from models import build_model 11 | 12 | def parse_args(): 13 | import argparse 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('-w', 16 | '--weights', 17 | type=str, 18 | required=True, 19 | help='PyTorch dbnet weights') 20 | parser.add_argument('--opset', 21 | type=int, 22 | default=11, 23 | help='ONNX opset version') 24 | parser.add_argument('--sim', 25 | action='store_true', 26 | help='simplify onnx model') 27 | parser.add_argument('--device', 28 | type=str, 29 | default='cpu', 30 | help='Export ONNX device') 31 | args = parser.parse_args() 32 | return args 33 | 34 | def main(args): 35 | checkpoint = torch.load(args.weights, map_location=args.device) 36 | config = checkpoint['config'] 37 | config['arch']['backbone']['pretrained'] = False 38 | model = build_model(config['arch']) 39 | model.load_state_dict(checkpoint['state_dict']) 40 | model.to(args.device) 41 | 42 | fake_input = torch.randn((1, 3, 640, 640)).to(args.device) 43 | for _ in range(2): 44 | model(fake_input) 45 | save_path = args.weights.replace('.pth', '.onnx') 46 | 47 | with BytesIO() as f: 48 | 
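# The export below marks height and width as dynamic axes, which is presumably why the
# C++ DBNet wrapper sets config_.is_dynamic = true and sizes its buffers from max_input_size.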
torch.onnx.export(model, fake_input, f, verbose=False, opset_version=12, input_names=['images'], 49 | output_names=['output'], 50 | dynamic_axes={"images": {2: "height", 3: "width"}}) 51 | f.seek(0) 52 | onnx_model = onnx.load(f) 53 | 54 | onnx.checker.check_model(onnx_model) # check onnx model 55 | if args.sim: 56 | try: 57 | onnx_model, check = onnxsim.simplify(onnx_model) 58 | assert check, 'assert check failed' 59 | except Exception as e: 60 | print(f'Simplifier failure: {e}') 61 | onnx.save(onnx_model, save_path) 62 | print('ONNX export success, saved as %s' % save_path) 63 | 64 | 65 | 66 | if __name__ == '__main__': 67 | main(parse_args()) 68 | -------------------------------------------------------------------------------- /model/sam/image_encoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/image_encoder.h" 2 | #include 3 | 4 | using namespace sam; 5 | 6 | ImageEncoder::ImageEncoder(const std::string &yaml_file) : m_input_size_(1024, 1024), m_output_size_(64, 64) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | if (!Init(model_path, framework_type)) exit(0); 14 | 15 | config_.input_len["image"] = 3 * m_input_size_.height * m_input_size_.width; 16 | config_.output_len["image_embeddings"] = 256 * m_output_size_.height * m_output_size_.width; 17 | config_.is_dynamic = false; 18 | Status status = framework_->Init(config_); 19 | if (status != Status::SUCCESS) { 20 | std::cout << "Failed to init framework" << std::endl; 21 | exit(0); 22 | } 23 | } 24 | 25 | ImageEncoder::~ImageEncoder() { 26 | std::cout << "Destruct image encoder" << std::endl; 27 | } 28 | 29 | void ImageEncoder::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 30 | cv::dnn::blobFromImage(input_image, output_image, 1 / 57.f, cv::Size(), cv::Scalar(123.675, 116.28, 103.53), false, false, CV_32F); 31 | } 32 | 33 | void ImageEncoder::forward(const cv::Mat &image, IOTensor& features) { 34 | std::unordered_map input, output; 35 | 36 | cv::Mat nchw; 37 | preprocess(image, nchw); 38 | 39 | input["image"] = IOTensor(); 40 | input["image"].resize(nchw.total() * nchw.elemSize()); 41 | input["image"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 42 | input["image"].data_type = DataType::FP32; 43 | memcpy(input["image"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 44 | 45 | 46 | // 输出张量设置 47 | output["image_embeddings"] = IOTensor(); 48 | output["image_embeddings"].data_type = DataType::FP32; 49 | output["image_embeddings"].shape = std::vector{1, 256, m_output_size_.height, m_output_size_.width}; 50 | output["image_embeddings"].resize(config_.output_len["image_embeddings"] * sizeof(float)); 51 | 52 | this->framework_->forward(input, output); 53 | 54 | features.resize(config_.output_len["image_embeddings"] * sizeof(float)); 55 | memcpy(features.data(), output["image_embeddings"].data(), features.size()); 56 | features.shape = std::vector{1, 256, 64, 64}; 57 | } -------------------------------------------------------------------------------- /model/sam/image_encoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace sam { 5 | 6 | class ImageEncoder : public Model { 7 | public: 8 | ImageEncoder() = delete; 9 | ImageEncoder(const std::string &yaml_file); 10 | virtual 
~ImageEncoder(); 11 | void forward(const cv::Mat &image, IOTensor &features); 12 | 13 | cv::Size input_size() const { return m_input_size_; } 14 | cv::Size output_size() const { return m_output_size_; } 15 | 16 | protected: 17 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image); 18 | 19 | private: 20 | cv::Size m_input_size_; 21 | cv::Size m_output_size_; 22 | }; 23 | } // namespace sam -------------------------------------------------------------------------------- /model/sam/mask_decoder.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/mask_decoder.h" 2 | #include 3 | 4 | using namespace sam; 5 | 6 | MaskDecoder::MaskDecoder(const std::string &yaml_file) : features_shape{1, 256, 64, 64}{ 7 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 8 | 9 | std::string model_path = yaml_node["model_path"].as(); 10 | std::string framework_type = yaml_node["framework"].as(); 11 | 12 | if (!Init(model_path, framework_type)) exit(0); 13 | 14 | config_.input_len["image_embeddings"] = 15 | features_shape[0] * features_shape[1] * features_shape[2] * features_shape[3]; 16 | config_.input_len["point_coords"] = 10 * 2; 17 | config_.input_len["point_labels"] = 10; 18 | config_.input_len["mask_input"] = 1 * 1 * 256 * 256; 19 | config_.input_len["has_mask_input"] = 1; 20 | 21 | config_.output_len["iou_predictions"] = 1 * 4; 22 | config_.output_len["low_res_masks"] = 1 * 4 * 256 * 256; 23 | config_.is_dynamic = true; 24 | Status status = framework_->Init(config_); 25 | if (status != Status::SUCCESS) { 26 | std::cout << "Failed to init framework" << std::endl; 27 | exit(0); 28 | } 29 | } 30 | 31 | MaskDecoder::~MaskDecoder() { std::cout << "Destruct sam mask decoder" << std::endl; } 32 | 33 | 34 | // The point labels may be 35 | // | Point Label | Description | 36 | // |:--------------------:|-------------| 37 | // | 0 | Background point | 38 | // | 1 | Foreground point | 39 | // | 2 | Bounding box top-left | 40 | // | 3 | Bounding box bottom-right | 41 | void MaskDecoder::forward(const IOTensor &features, const std::vector &image_point_coords, 42 | const std::vector &image_point_labels, cv::Mat& low_res_mask) { 43 | std::unordered_map input, output; 44 | 45 | input["image_embeddings"] = IOTensor(); 46 | input["image_embeddings"].shape = features_shape; 47 | input["image_embeddings"].resize(config_.input_len["image_embeddings"] * sizeof(float)); 48 | memcpy(input["image_embeddings"].data(), features.data(), input["image_embeddings"].size()); 49 | 50 | input["point_coords"] = IOTensor(); 51 | input["point_coords"].shape = std::vector{1, static_cast(image_point_coords.size()), 2}; 52 | input["point_coords"].resize(image_point_coords.size() * 2 * sizeof(float)); 53 | std::vector points; 54 | for (const auto& point: image_point_coords) { 55 | points.push_back(point.x); 56 | points.push_back(point.y); 57 | } 58 | memcpy(input["point_coords"].data(), points.data(), input["point_coords"].size()); 59 | 60 | input["point_labels"] = IOTensor(); 61 | input["point_labels"].shape = std::vector{1, static_cast(image_point_coords.size())}; 62 | input["point_labels"].resize(image_point_coords.size() * sizeof(float)); 63 | memcpy(input["point_labels"].data(), image_point_labels.data(), input["point_labels"].size()); 64 | 65 | input["mask_input"] = IOTensor(); 66 | input["mask_input"].shape = std::vector{1, 1, 256, 256}; 67 | input["mask_input"].resize(256 * 256 * sizeof(float)); 68 | 69 | input["has_mask_input"] = IOTensor(); 70 | 
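// mask_input is only resized (never written), so it acts as a zero low-res mask placeholder;
// has_mask_input = 0.0f below signals that no mask prompt from a previous pass is supplied.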
input["has_mask_input"].shape = std::vector{1}; 71 | input["has_mask_input"].resize(sizeof(float)); 72 | float has_mask_input = 0.0f; 73 | memcpy(input["has_mask_input"].data(), &has_mask_input, sizeof(float)); 74 | 75 | // 输出张量设置 76 | output["iou_predictions"] = IOTensor(); 77 | output["iou_predictions"].shape = std::vector{1, 4}; 78 | output["iou_predictions"].resize(sizeof(float) * 4); 79 | 80 | output["low_res_masks"] = IOTensor(); 81 | output["low_res_masks"].shape = std::vector{1, 4, 256, 256}; 82 | output["low_res_masks"].resize(4 * 256 * 256 * sizeof(float)); 83 | 84 | this->framework_->forward(input, output); 85 | 86 | low_res_mask = cv::Mat(256, 256, CV_32F, (float *)output.at("low_res_masks").data()); 87 | } -------------------------------------------------------------------------------- /model/sam/mask_decoder.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/base/model.h" 3 | 4 | namespace sam { 5 | class MaskDecoder : public Model { 6 | public: 7 | MaskDecoder() = delete; 8 | MaskDecoder(const std::string &yaml_file); 9 | virtual ~MaskDecoder(); 10 | void forward(const IOTensor &features, const std::vector &image_point_coords, 11 | const std::vector &image_point_labels, cv::Mat &low_res_mask); 12 | 13 | private: 14 | std::vector features_shape; 15 | }; 16 | } -------------------------------------------------------------------------------- /model/sam/sam.cpp: -------------------------------------------------------------------------------- 1 | #include "model/sam/sam.h" 2 | 3 | using namespace sam; 4 | 5 | SAM::SAM(const std::string& encoder_cfg, const std::string& decoder_cfg) { 6 | encoder_ = std::make_shared(encoder_cfg); 7 | decoder_ = std::make_shared(decoder_cfg); 8 | } 9 | 10 | void SAM::setImage(const cv::Mat &input_image) { 11 | cv::Mat mask; 12 | this->pparam_ = Letterbox(input_image, mask, encoder_->input_size()); 13 | 14 | encoder_->forward(mask, features_); 15 | } 16 | 17 | void SAM::predict(const std::vector &image_point_coords, const std::vector &image_point_labels, cv::Mat &output_mask) { 18 | auto &dw = this->pparam_.dw; 19 | auto &dh = this->pparam_.dh; 20 | auto &width = this->pparam_.width; 21 | auto &height = this->pparam_.height; 22 | auto input_w = encoder_->input_size().width; 23 | auto input_h = encoder_->input_size().height; 24 | int seg_w = 256, seg_h = 256; 25 | 26 | int scale_dw = dw / input_w * seg_w; 27 | int scale_dh = dh / input_h * seg_h; 28 | 29 | std::vector resize_image_point_coords; 30 | preprocessPoints(image_point_coords, resize_image_point_coords); 31 | 32 | cv::Mat low_res_mask; 33 | decoder_->forward(features_, resize_image_point_coords, image_point_labels, low_res_mask); 34 | 35 | cv::Rect roi(scale_dw, scale_dh, seg_w - 2 * scale_dw, seg_h - 2 * scale_dh); 36 | 37 | cv::Mat mask = low_res_mask(roi); 38 | mask = mask > 0.0f; 39 | cv::resize(mask, output_mask, cv::Size((int)width, (int)height), cv::INTER_LINEAR); 40 | } 41 | 42 | void SAM::preprocessPoints(const std::vector &input_points, std::vector &output_points) { 43 | auto &dw = this->pparam_.dw; 44 | auto &dh = this->pparam_.dh; 45 | auto &ratio = this->pparam_.ratio; 46 | 47 | output_points.clear(); 48 | for (const auto& point: input_points) { 49 | float x = point.x / ratio + dw; 50 | float y = point.y / ratio + dh; 51 | output_points.push_back(cv::Point2f(x,y)); 52 | } 53 | } -------------------------------------------------------------------------------- /model/sam/sam.h: 
-------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "model/sam/image_encoder.h" 3 | #include "model/sam/mask_decoder.h" 4 | 5 | namespace sam { 6 | 7 | class SAM { 8 | public: 9 | SAM() = delete; 10 | SAM(const std::string &encoder_cfg, const std::string &decoder_cfg); 11 | ~SAM(){}; 12 | void setImage(const cv::Mat &image); 13 | void predict(const std::vector &image_point_coords, const std::vector &image_point_labels, 14 | cv::Mat &output_mask); 15 | void preprocessPoints(const std::vector &input_points, std::vector &output_points); 16 | 17 | private: 18 | std::shared_ptr encoder_; 19 | std::shared_ptr decoder_; 20 | PreParam pparam_; 21 | IOTensor features_; 22 | }; 23 | } // namespace sam -------------------------------------------------------------------------------- /model/yolo/common.py: -------------------------------------------------------------------------------- 1 | # copyed from https://github.com/triple-Mu/YOLOv8-TensorRT 2 | 3 | from typing import Tuple 4 | import random 5 | import torch 6 | import torch.nn as nn 7 | from torch import Graph, Tensor, Value 8 | 9 | def make_anchors(feats: Tensor, 10 | strides: Tensor, 11 | grid_cell_offset: float = 0.5) -> Tuple[Tensor, Tensor]: 12 | anchor_points, stride_tensor = [], [] 13 | assert feats is not None 14 | dtype, device = feats[0].dtype, feats[0].device 15 | for i, stride in enumerate(strides): 16 | _, _, h, w = feats[i].shape 17 | sx = torch.arange(end=w, device=device, 18 | dtype=dtype) + grid_cell_offset # shift x 19 | sy = torch.arange(end=h, device=device, 20 | dtype=dtype) + grid_cell_offset # shift y 21 | sy, sx = torch.meshgrid(sy, sx) 22 | anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) 23 | stride_tensor.append( 24 | torch.full((h * w, 1), stride, dtype=dtype, device=device)) 25 | return torch.cat(anchor_points), torch.cat(stride_tensor) 26 | 27 | class C2f(nn.Module): 28 | 29 | def __init__(self, *args, **kwargs): 30 | super().__init__() 31 | 32 | def forward(self, x): 33 | x = self.cv1(x) 34 | x = [x, x[:, self.c:, ...]] 35 | x.extend(m(x[-1]) for m in self.m) 36 | x.pop(1) 37 | return self.cv2(torch.cat(x, 1)) -------------------------------------------------------------------------------- /model/yolo/test.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from ultralytics.cfg import entrypoint 4 | if __name__ == 'main': 5 | sys.argv[0] = re.sub(r'(-script.pyw|.exe)?$', '', sys.argv[0]) 6 | 7 | sys.exit(entrypoint()) -------------------------------------------------------------------------------- /model/yolo/yolo.cpp: -------------------------------------------------------------------------------- 1 | #include "model/yolo/yolo.h" 2 | #include 3 | 4 | YOLO::YOLO(const std::string &yaml_file) { 5 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 6 | 7 | std::string model_path = yaml_node["model_path"].as(); 8 | std::string framework_type = yaml_node["framework"].as(); 9 | if (!Init(model_path, framework_type)) exit(0); 10 | 11 | std::vector input_size = yaml_node["input_size"].as>(); 12 | m_input_size_.width = input_size.at(0); 13 | m_input_size_.height = input_size.at(1); 14 | topk_ = yaml_node["topk"].as(); 15 | 16 | with_nms_ = yaml_node["with_nms"].as(); 17 | if (!with_nms_) { 18 | m_conf_thres_ = yaml_node["conf_thres"].as(); 19 | m_nms_thres_ = yaml_node["nms_thres"].as(); 20 | m_grid_num_ = 0; 21 | for (int i = 0; i < 3; i++) 22 | { 23 | m_grid_num_ += (m_input_size_.width 
/ strides[i]) * (m_input_size_.height / strides[i]); 24 | } 25 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 26 | config_.output_len["output"] = m_grid_num_ * 6; 27 | config_.is_dynamic = false; 28 | } else { 29 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 30 | config_.output_len["num_dets"] = 1; 31 | config_.output_len["bboxes"] = 4 * topk_; 32 | config_.output_len["scores"] = topk_; 33 | config_.output_len["labels"] = topk_; 34 | } 35 | 36 | config_.is_dynamic = false; 37 | Status status = framework_->Init(config_); 38 | if (status != Status::SUCCESS) { 39 | std::cout << "Failed to init framework" << std::endl; 40 | exit(0); 41 | } 42 | } 43 | 44 | YOLO::~YOLO() 45 | { 46 | std::cout << "Destruct yolov8" << std::endl; 47 | } 48 | 49 | void YOLO::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 50 | cv::Mat mask; 51 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 52 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 53 | } 54 | 55 | void YOLO::detect(const cv::Mat &image, std::vector &objs) { 56 | std::unordered_map input, output; 57 | 58 | // 输入tensor设置 59 | cv::Mat nchw; 60 | preprocess(image, nchw); 61 | 62 | if (!with_nms_) { 63 | input["images"] = IOTensor(); 64 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 65 | input["images"].data_type = DataType::FP32; 66 | input["images"].resize(nchw.total() * nchw.elemSize()); 67 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 68 | 69 | // 输出张量设置 70 | output["output"] = IOTensor(); 71 | output["output"].shape = std::vector{1, m_grid_num_, 6}; 72 | output["output"].data_type = DataType::FP32; 73 | output["output"].resize(config_.output_len["output"] * sizeof(float)); 74 | } else { 75 | input["images"] = IOTensor(); 76 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 77 | input["images"].data_type = DataType::FP32; 78 | input["images"].resize(nchw.total() * nchw.elemSize()); 79 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 80 | 81 | // 输出张量设置 82 | output["num_dets"] = IOTensor(); 83 | output["num_dets"].shape = std::vector{1, 1}; 84 | output["num_dets"].data_type = DataType::INT32; 85 | output["num_dets"].resize(config_.output_len["num_dets"] * sizeof(int)); 86 | 87 | output["bboxes"] = IOTensor(); 88 | output["bboxes"].shape = std::vector{1, 100, 4}; 89 | output["bboxes"].data_type = DataType::FP32; 90 | output["bboxes"].resize(config_.output_len["bboxes"] * sizeof(float)); 91 | 92 | output["scores"] = IOTensor(); 93 | output["scores"].shape = std::vector{1, 100}; 94 | output["scores"].data_type = DataType::FP32; 95 | output["scores"].resize(config_.output_len["scores"] * sizeof(float)); 96 | 97 | output["labels"] = IOTensor(); 98 | output["labels"].shape = std::vector{1, 100}; 99 | output["labels"].data_type = DataType::INT32; 100 | output["labels"].resize(config_.output_len["labels"] * sizeof(int)); 101 | } 102 | 103 | this->framework_->forward(input, output); 104 | postprocess(output, objs); 105 | 106 | } 107 | 108 | void YOLO::postprocess(const std::unordered_map &output, std::vector &objs) { 109 | if (!with_nms_) { 110 | postprocess_with_nms(output, objs); 111 | } else { 112 | postprocess_without_nms(output, objs); 113 | } 114 | } 115 | 116 | void YOLO::postprocess_without_nms(const std::unordered_map &output, std::vector &objs) { 117 | 
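// Despite the name, this branch serves models whose exported graph already ends in an NMS stage
// (e.g. the EfficientNMS_TRT head added by the end-to-end export scripts), so the framework returns
// num_dets / bboxes / scores / labels directly; all that is left is to undo the letterbox
// (subtract dw/dh, rescale by ratio, clamp to the original image size).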
objs.clear(); 118 | int *const num_dets = (int*)(output.at("num_dets").data()); 119 | float *const boxes = (float *)(output.at("bboxes").data()); 120 | float *scores = (float *)(output.at("scores").data()); 121 | int *labels = (int*)(output.at("labels").data()); 122 | auto &dw = this->pparam_.dw; 123 | auto &dh = this->pparam_.dh; 124 | auto &width = this->pparam_.width; 125 | auto &height = this->pparam_.height; 126 | auto &ratio = this->pparam_.ratio; 127 | for (int i = 0; i < num_dets[0]; i++) 128 | { 129 | float *ptr = boxes + i * 4; 130 | 131 | float x0 = *ptr++ - dw; 132 | float y0 = *ptr++ - dh; 133 | float x1 = *ptr++ - dw; 134 | float y1 = *ptr - dh; 135 | 136 | x0 = clamp(x0 * ratio, 0.f, width); 137 | y0 = clamp(y0 * ratio, 0.f, height); 138 | x1 = clamp(x1 * ratio, 0.f, width); 139 | y1 = clamp(y1 * ratio, 0.f, height); 140 | Object obj; 141 | obj.rect.x = x0; 142 | obj.rect.y = y0; 143 | obj.rect.width = x1 - x0; 144 | obj.rect.height = y1 - y0; 145 | obj.prob = *(scores + i); 146 | obj.label = *(labels + i); 147 | objs.push_back(obj); 148 | } 149 | } 150 | 151 | void YOLO::postprocess_with_nms(const std::unordered_map &output, std::vector &objs) 152 | { 153 | objs.clear(); 154 | auto num_anchors = m_grid_num_; 155 | 156 | auto &dw = this->pparam_.dw; 157 | auto &dh = this->pparam_.dh; 158 | auto &width = this->pparam_.width; 159 | auto &height = this->pparam_.height; 160 | auto &ratio = this->pparam_.ratio; 161 | 162 | std::vector labels; 163 | std::vector scores; 164 | std::vector bboxes; 165 | std::vector indices; 166 | 167 | float * const outputs = (float *)output.at("output").data(); 168 | 169 | for (int i = 0; i < num_anchors; i++) 170 | { 171 | float *ptr = outputs + i * 6; 172 | float score = *(ptr + 4); 173 | if (score > m_conf_thres_) 174 | { 175 | float x0 = *ptr++ - dw; 176 | float y0 = *ptr++ - dh; 177 | float x1 = *ptr++ - dw; 178 | float y1 = *ptr++ - dh; 179 | 180 | x0 = clamp(x0 * ratio, 0.f, width); 181 | y0 = clamp(y0 * ratio, 0.f, height); 182 | x1 = clamp(x1 * ratio, 0.f, width); 183 | y1 = clamp(y1 * ratio, 0.f, height); 184 | 185 | int label = *(++ptr); 186 | labels.push_back(label); 187 | scores.push_back(score); 188 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 189 | } 190 | } 191 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 192 | 193 | int cnt = 0; 194 | for (auto &i : indices) 195 | { 196 | if (cnt >= topk_) 197 | { 198 | break; 199 | } 200 | cv::Rect tmp = bboxes[i]; 201 | Object obj; 202 | obj.label = labels[i]; 203 | obj.rect = tmp; 204 | obj.prob = scores[i]; 205 | objs.push_back(obj); 206 | cnt += 1; 207 | } 208 | } -------------------------------------------------------------------------------- /model/yolo/yolo.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLO : public DetectionModel { 7 | public: 8 | YOLO() = delete; 9 | explicit YOLO(const std::string &yaml_file); 10 | ~YOLO(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | void postprocess_without_nms(const std::unordered_map &output, std::vector &objs); 18 | void postprocess_with_nms(const std::unordered_map &output, std::vector &objs); 19 | 20 | private: 21 | cv::Size m_input_size_ = {640, 640}; 22 | float 
m_conf_thres_ = 0.25f; 23 | float m_nms_thres_ = 0.65f; 24 | int topk_ = 100; 25 | int strides[3] = {8, 16, 32}; 26 | int m_grid_num_ = 8400; 27 | bool with_nms_ = false; 28 | PreParam pparam_; 29 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_cutoff.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOCutoff : public DetectionModel { 7 | public: 8 | YOLOCutoff() = delete; 9 | explicit YOLOCutoff(const std::string &yaml_file); 10 | ~YOLOCutoff(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | int decodeBoxes(const IOTensor &output1, const IOTensor &output2, const IOTensor &output3, 18 | int grid_h, int grid_w, int height, int width, int stride, int dfl_len, 19 | std::vector &boxes, std::vector &objProbs, std::vector &classId, 20 | float threshold); 21 | 22 | private: 23 | cv::Size m_input_size_ = {640, 640}; 24 | int m_class_num_ = 80; 25 | float m_conf_thres_ = 0.25f; 26 | float m_nms_thres_ = 0.65f; 27 | int topk_ = 100; 28 | int strides[3] = {8, 16, 32}; 29 | std::string framework_type_; 30 | 31 | PreParam pparam_; 32 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_pose.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | 3 | #include "model/yolo/yolo_pose.h" 4 | #include 5 | 6 | YOLOPose::YOLOPose(const std::string &yaml_file) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | m_conf_thres_ = yaml_node["conf_thres"].as(); 14 | m_nms_thres_ = yaml_node["nms_thres"].as(); 15 | 16 | std::vector input_size = yaml_node["input_size"].as>(); 17 | m_input_size_.width = input_size.at(0); 18 | m_input_size_.height = input_size.at(1); 19 | 20 | if (!Init(model_path, framework_type)) exit(0); 21 | 22 | m_grid_num_ = 0; 23 | for (int i = 0; i < 3; i++) 24 | { 25 | m_grid_num_ += (m_input_size_.width / strides[i]) * (m_input_size_.height / strides[i]); 26 | } 27 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 28 | config_.output_len["bboxes"] = m_grid_num_ * 4; 29 | config_.output_len["scores"] = m_grid_num_; 30 | config_.output_len["kps"] = m_grid_num_ * 51; 31 | config_.is_dynamic = false; 32 | Status status = framework_->Init(config_); 33 | if (status != Status::SUCCESS) { 34 | std::cout << "Failed to init framework" << std::endl; 35 | exit(0); 36 | } 37 | } 38 | 39 | YOLOPose::~YOLOPose() 40 | { 41 | std::cout << "Destruct yolov8" << std::endl; 42 | } 43 | 44 | void YOLOPose::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 45 | cv::Mat mask; 46 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 47 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 48 | } 49 | 50 | void YOLOPose::detect(const cv::Mat &image, std::vector &objs) 51 | { 52 | std::unordered_map input, output; 53 | 54 | // 输入tensor设置 55 | cv::Mat nchw; 56 | preprocess(image, nchw); 57 | 58 | input["images"] = IOTensor(); 59 | 
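// The pose model is exported with three output heads: bboxes [1, grid_num, 4], scores [1, grid_num, 1]
// and kps [1, grid_num, 51], where 51 = 17 keypoints x (x, y, score); the buffers below mirror that layout.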
input["images"].resize(nchw.total() * nchw.elemSize()); 60 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 61 | input["images"].data_type = DataType::FP32; 62 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 63 | 64 | 65 | // 输出张量设置 66 | output["bboxes"] = IOTensor(); 67 | output["bboxes"].shape = std::vector{1, m_grid_num_, 4}; 68 | output["bboxes"].data_type = DataType::FP32; 69 | output["bboxes"].resize(config_.output_len["bboxes"] * sizeof(float)); 70 | 71 | output["scores"] = IOTensor(); 72 | output["scores"].shape = std::vector{1, m_grid_num_, 1}; 73 | output["scores"].data_type = DataType::FP32; 74 | output["scores"].resize(config_.output_len["scores"] * sizeof(float)); 75 | 76 | output["kps"] = IOTensor(); 77 | output["kps"].shape = std::vector{1, m_grid_num_, 51}; 78 | output["kps"].data_type = DataType::FP32; 79 | output["kps"].resize(config_.output_len["kps"] * sizeof(float)); 80 | 81 | // start = std::chrono::system_clock::now(); 82 | this->framework_->forward(input, output); 83 | // end = std::chrono::system_clock::now(); 84 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 85 | // std::cout << "Inference costs " << tc << " ms" << std::endl; 86 | 87 | // start = std::chrono::system_clock::now(); 88 | postprocess(output, objs); 89 | // end = std::chrono::system_clock::now(); 90 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 91 | // std::cout << "Postprocess costs " << tc << " ms" << std::endl; 92 | } 93 | 94 | void YOLOPose::postprocess(const std::unordered_map &output, std::vector &objs) 95 | { 96 | objs.clear(); 97 | auto num_anchors = m_grid_num_; 98 | 99 | auto &dw = this->pparam_.dw; 100 | auto &dh = this->pparam_.dh; 101 | auto &width = this->pparam_.width; 102 | auto &height = this->pparam_.height; 103 | auto &ratio = this->pparam_.ratio; 104 | 105 | float *bbox_ptr = (float *)output.at("bboxes").data(); 106 | float *score_ptr = (float *)output.at("scores").data(); 107 | float *kps_ptr = (float *)output.at("kps").data(); 108 | 109 | std::vector bboxes; 110 | std::vector scores; 111 | std::vector labels; 112 | std::vector indices; 113 | std::vector> kpss; 114 | 115 | for (int i = 0; i < num_anchors; i++) 116 | { 117 | float score = *(score_ptr++); 118 | if (score > m_conf_thres_) 119 | { 120 | float x0 = *bbox_ptr++ - dw; 121 | float y0 = *bbox_ptr++ - dh; 122 | float x1 = *bbox_ptr++ - dw; 123 | float y1 = *bbox_ptr++ - dh; 124 | 125 | x0 = clamp(x0 * ratio, 0.f, width); 126 | y0 = clamp(y0 * ratio, 0.f, height); 127 | x1 = clamp(x1 * ratio, 0.f, width); 128 | y1 = clamp(y1 * ratio, 0.f, height); 129 | 130 | std::vector kps; 131 | for (int k = 0; k < 17; k++) { 132 | float kps_x = (*(kps_ptr + 3 * k) - dw) * ratio; 133 | float kps_y = (*(kps_ptr + 3 * k + 1) - dh) * ratio; 134 | float kps_s = *(kps_ptr + 3 * k + 2); 135 | kps_x = clamp(kps_x, 0.f, width); 136 | kps_y = clamp(kps_y, 0.f, height); 137 | kps.push_back(kps_x); 138 | kps.push_back(kps_y); 139 | kps.push_back(kps_s); 140 | } 141 | kps_ptr += 51; 142 | 143 | labels.push_back(0); 144 | scores.push_back(score); 145 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 146 | kpss.push_back(kps); 147 | } else { 148 | bbox_ptr += 4; 149 | kps_ptr += 51; 150 | } 151 | } 152 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 153 | 154 | int cnt = 0; 155 | for (auto& i : indices) { 156 | if (cnt >= topk) { 157 | break; 158 | } 159 | Object obj; 160 | obj.rect = bboxes[i]; 
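// obj.kps carries the 17 keypoints as (x, y, score) triplets, already mapped back to
// original-image coordinates above (presumably in COCO keypoint order).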
161 | obj.prob = scores[i]; 162 | obj.label = labels[i]; 163 | obj.kps = kpss[i]; 164 | objs.push_back(obj); 165 | cnt += 1; 166 | } 167 | } -------------------------------------------------------------------------------- /model/yolo/yolo_pose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOPose : public DetectionModel { 7 | public: 8 | YOLOPose() = delete; 9 | explicit YOLOPose(const std::string &yaml_file); 10 | ~YOLOPose(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | float m_conf_thres_ = 0.25f; 21 | float m_nms_thres_ = 0.65f; 22 | int topk = 100; 23 | int strides[3] = {8, 16, 32}; 24 | int m_grid_num_ = 8400; 25 | 26 | PreParam pparam_; 27 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_seg.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | 3 | #include "model/yolo/yolo_seg.h" 4 | #include 5 | 6 | YOLOSeg::YOLOSeg(const std::string &yaml_file) 7 | { 8 | YAML::Node yaml_node = YAML::LoadFile(yaml_file); 9 | 10 | std::string model_path = yaml_node["model_path"].as(); 11 | std::string framework_type = yaml_node["framework"].as(); 12 | 13 | m_conf_thres_ = yaml_node["conf_thres"].as(); 14 | m_nms_thres_ = yaml_node["nms_thres"].as(); 15 | 16 | std::vector input_size = yaml_node["input_size"].as>(); 17 | m_input_size_.width = input_size.at(0); 18 | m_input_size_.height = input_size.at(1); 19 | 20 | std::vector seg_size = yaml_node["seg_size"].as>(); 21 | m_seg_size_.width = seg_size.at(0); 22 | m_seg_size_.height = seg_size.at(1); 23 | 24 | m_seg_channels_ = yaml_node["seg_channels"].as(); 25 | 26 | if (!Init(model_path, framework_type)) exit(0); 27 | 28 | m_grid_num_ = 0; 29 | for (int i = 0; i < 3; i++) 30 | { 31 | m_grid_num_ += (m_input_size_.width / strides[i]) * (m_input_size_.height / strides[i]); 32 | } 33 | config_.input_len["images"] = 3 * m_input_size_.height * m_input_size_.width; 34 | config_.output_len["outputs"] = m_grid_num_ * (m_seg_channels_ + 6); 35 | config_.output_len["proto"] = m_seg_channels_ * m_seg_size_.height * m_seg_size_.width; 36 | config_.is_dynamic = false; 37 | Status status = framework_->Init(config_); 38 | if (status != Status::SUCCESS) { 39 | std::cout << "Failed to init framework" << std::endl; 40 | exit(0); 41 | } 42 | } 43 | 44 | YOLOSeg::~YOLOSeg() 45 | { 46 | std::cout << "Destruct yolov8" << std::endl; 47 | } 48 | 49 | void YOLOSeg::preprocess(const cv::Mat &input_image, cv::Mat &output_image) { 50 | cv::Mat mask; 51 | this->pparam_ = Letterbox(input_image, mask, m_input_size_); 52 | cv::dnn::blobFromImage(mask, output_image, 1 / 255.f, cv::Size(), cv::Scalar(0, 0, 0), false, false, CV_32F); 53 | } 54 | 55 | void YOLOSeg::detect(const cv::Mat &image, std::vector &objs) 56 | { 57 | std::unordered_map input, output; 58 | 59 | // 输入tensor设置 60 | cv::Mat nchw; 61 | preprocess(image, nchw); 62 | 63 | input["images"] = IOTensor(); 64 | input["images"].resize(nchw.total() * nchw.elemSize()); 65 | input["images"].shape = std::vector{1, 3, m_input_size_.height, m_input_size_.width}; 66 | input["images"].data_type = 
DataType::FP32; 67 | memcpy(input["images"].data(), nchw.ptr(), nchw.total() * nchw.elemSize()); 68 | 69 | 70 | // 输出张量设置 71 | output["outputs"] = IOTensor(); 72 | output["outputs"].shape = std::vector{1, m_grid_num_, m_seg_channels_ + 6}; 73 | output["outputs"].data_type = DataType::FP32; 74 | output["outputs"].resize(config_.output_len["outputs"] * sizeof(float)); 75 | 76 | output["proto"] = IOTensor(); 77 | output["proto"].shape = std::vector{1, m_seg_channels_, m_seg_size_.height, m_seg_size_.width}; 78 | output["proto"].data_type = DataType::FP32; 79 | output["proto"].resize(config_.output_len["proto"] * sizeof(float)); 80 | 81 | // start = std::chrono::system_clock::now(); 82 | this->framework_->forward(input, output); 83 | // end = std::chrono::system_clock::now(); 84 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 85 | // std::cout << "Inference costs " << tc << " ms" << std::endl; 86 | 87 | // start = std::chrono::system_clock::now(); 88 | postprocess(output, objs); 89 | // end = std::chrono::system_clock::now(); 90 | // tc = (double)std::chrono::duration_cast(end - start).count() / 1000.; 91 | // std::cout << "Postprocess costs " << tc << " ms" << std::endl; 92 | } 93 | 94 | void YOLOSeg::postprocess(const std::unordered_map &output, std::vector &objs) 95 | { 96 | objs.clear(); 97 | auto seg_h = m_seg_size_.height; 98 | auto seg_w = m_seg_size_.width; 99 | auto input_h = m_input_size_.height; 100 | auto input_w = m_input_size_.width; 101 | auto num_anchors = m_grid_num_; 102 | auto num_channels = m_num_channels_; 103 | 104 | auto &dw = this->pparam_.dw; 105 | auto &dh = this->pparam_.dh; 106 | auto &width = this->pparam_.width; 107 | auto &height = this->pparam_.height; 108 | auto &ratio = this->pparam_.ratio; 109 | 110 | float * const outputs = (float *)output.at("outputs").data(); 111 | cv::Mat protos = cv::Mat(m_seg_channels_, seg_h * seg_w, CV_32F, (float *)output.at("proto").data()); 112 | assert(!protos.empty()); 113 | 114 | std::vector labels; 115 | std::vector scores; 116 | std::vector bboxes; 117 | std::vector mask_confs; 118 | std::vector indices; 119 | 120 | for (int i = 0; i < num_anchors; i++) 121 | { 122 | float *ptr = outputs + i * num_channels; 123 | float score = *(ptr + 4); 124 | if (score > m_conf_thres_) 125 | { 126 | float x0 = *ptr++ - dw; 127 | float y0 = *ptr++ - dh; 128 | float x1 = *ptr++ - dw; 129 | float y1 = *ptr++ - dh; 130 | 131 | x0 = clamp(x0 * ratio, 0.f, width); 132 | y0 = clamp(y0 * ratio, 0.f, height); 133 | x1 = clamp(x1 * ratio, 0.f, width); 134 | y1 = clamp(y1 * ratio, 0.f, height); 135 | 136 | int label = *(++ptr); 137 | cv::Mat mask_conf = cv::Mat(1, m_seg_channels_, CV_32F, ++ptr); 138 | mask_confs.push_back(mask_conf); 139 | labels.push_back(label); 140 | scores.push_back(score); 141 | bboxes.push_back(cv::Rect_(x0, y0, x1 - x0, y1 - y0)); 142 | } 143 | } 144 | cv::dnn::NMSBoxes(bboxes, scores, m_conf_thres_, m_nms_thres_, indices); 145 | 146 | cv::Mat masks; 147 | int cnt = 0; 148 | for (auto &i : indices) 149 | { 150 | if (cnt >= topk) 151 | { 152 | break; 153 | } 154 | cv::Rect tmp = bboxes[i]; 155 | Object obj; 156 | obj.label = labels[i]; 157 | obj.rect = tmp; 158 | obj.prob = scores[i]; 159 | masks.push_back(mask_confs[i]); 160 | objs.push_back(obj); 161 | cnt += 1; 162 | } 163 | if (masks.empty()) 164 | { 165 | // masks is empty 166 | } 167 | else 168 | { 169 | cv::Mat matmulRes = (masks * protos).t(); 170 | cv::Mat maskMat = matmulRes.reshape(indices.size(), {seg_w, seg_h}); 171 | 172 | std::vector 
maskChannels; 173 | cv::split(maskMat, maskChannels); 174 | int scale_dw = dw / input_w * seg_w; 175 | int scale_dh = dh / input_h * seg_h; 176 | 177 | cv::Rect roi(scale_dw, scale_dh, seg_w - 2 * scale_dw, seg_h - 2 * scale_dh); 178 | 179 | for (long unsigned int i = 0; i < indices.size(); i++) 180 | { 181 | cv::Mat dest, mask; 182 | cv::exp(-maskChannels[i], dest); 183 | dest = 1.0 / (1.0 + dest); 184 | dest = dest(roi); 185 | // std::cout << dest.size() << " " << dest.size().empty() << std::endl; 186 | cv::resize(dest, mask, cv::Size((int)width, (int)height), cv::INTER_LINEAR); 187 | objs[i].boxMask = mask(objs[i].rect) > 0.5f; 188 | } 189 | } 190 | } -------------------------------------------------------------------------------- /model/yolo/yolo_seg.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOSeg : public DetectionModel { 7 | public: 8 | YOLOSeg() = delete; 9 | explicit YOLOSeg(const std::string &yaml_file); 10 | ~YOLOSeg(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | 18 | private: 19 | cv::Size m_input_size_ = {640, 640}; 20 | cv::Size m_seg_size_ = {160, 160}; 21 | int m_seg_channels_ = 32; 22 | float m_conf_thres_ = 0.25f; 23 | float m_nms_thres_ = 0.65f; 24 | int topk = 100; 25 | int strides[3] = {8, 16, 32}; 26 | int m_grid_num_ = 8400; 27 | int m_num_channels_ = 38; 28 | 29 | PreParam pparam_; 30 | }; -------------------------------------------------------------------------------- /model/yolo/yolo_seg_cutoff.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include "model/base/detection_model.h" 5 | 6 | class YOLOSegCutoff : public DetectionModel { 7 | public: 8 | YOLOSegCutoff() = delete; 9 | explicit YOLOSegCutoff(const std::string &yaml_file); 10 | ~YOLOSegCutoff(); 11 | 12 | void detect(const cv::Mat &image, std::vector &objs) override; 13 | 14 | protected: 15 | void preprocess(const cv::Mat &input_image, cv::Mat &output_image) override; 16 | void postprocess(const std::unordered_map &output, std::vector &objs) override; 17 | int decodeBoxes(const IOTensor &output1, const IOTensor &output2, const IOTensor &output3, const IOTensor &output4, 18 | int grid_h, int grid_w, int height, int width, int stride, int dfl_len, 19 | std::vector &boxes, std::vector &segments, std::vector &objProbs, 20 | std::vector &classId, float threshold); 21 | void decodeMask(const IOTensor &input, cv::Mat &protos); 22 | 23 | private: 24 | cv::Size m_input_size_ = {640, 640}; 25 | cv::Size m_seg_size_ = {160, 160}; 26 | int m_seg_channels_ = 32; 27 | int m_class_num_ = 80; 28 | float m_conf_thres_ = 0.25f; 29 | float m_nms_thres_ = 0.65f; 30 | int topk_ = 100; 31 | int strides[3] = {8, 16, 32}; 32 | std::string framework_type_; 33 | 34 | PreParam pparam_; 35 | }; -------------------------------------------------------------------------------- /model/yolo/yolov8-pose-export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from io import BytesIO 3 | 4 | import onnx 5 | import torch 6 | from typing import Tuple 7 | 8 | import torch 9 | import torch.nn as nn 10 | from torch import Graph, Tensor, Value 11 | from ultralytics 
import YOLO 12 | 13 | try: 14 | import onnxsim 15 | except ImportError: 16 | onnxsim = None 17 | 18 | class YOLOv8Pose(nn.Module): 19 | export = True 20 | shape = None 21 | dynamic = False 22 | 23 | def __init__(self, weights, device): 24 | super().__init__() 25 | self.device = device 26 | self.model = YOLO(weights).to(self.device).model.fuse().eval() 27 | self.convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], 28 | dtype=torch.float32, 29 | device=self.device) 30 | 31 | def forward(self, x): 32 | out, _ = self.model(x) 33 | boxes, scores, kps = out.split((4,1,51), 1) 34 | boxes = (boxes.transpose(1,2) @ self.convert_matrix) 35 | return boxes, scores.transpose(1,2), kps.transpose(1,2) 36 | 37 | def export(self, save_path, opset_version=11, sim=True): 38 | fake_input = torch.randn(1,3,640,640).to(self.device) 39 | for _ in range(2): 40 | self.forward(fake_input) 41 | with BytesIO() as f: 42 | torch.onnx.export( 43 | self, 44 | fake_input, 45 | f, 46 | opset_version=opset_version, 47 | input_names=['images'], 48 | output_names=['bboxes', 'scores', 'kps']) 49 | f.seek(0) 50 | onnx_model = onnx.load(f) 51 | onnx.checker.check_model(onnx_model) 52 | if sim: 53 | try: 54 | onnx_model, check = onnxsim.simplify(onnx_model) 55 | assert check, 'assert check failed' 56 | except Exception as e: 57 | print(f'Simplifier failure: {e}') 58 | onnx.save(onnx_model, save_path) 59 | print(f'ONNX export success, saved as {save_path}') 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('-w', 64 | '--weights', 65 | type=str, 66 | required=True, 67 | help='PyTorch weights') 68 | parser.add_argument('--opset', 69 | type=int, 70 | default=11, 71 | help='ONNX opset version') 72 | parser.add_argument('--sim', 73 | action='store_true', 74 | help='simplify onnx model') 75 | args = parser.parse_args() 76 | return args 77 | 78 | if __name__=='__main__': 79 | args = parse_args() 80 | model = YOLOv8Pose(args.weights, 'cpu') 81 | save_path = args.weights.replace('.pt', '.onnx') 82 | model.export(save_path, args.opset, args.sim) -------------------------------------------------------------------------------- /model/yolo/yolov8-seg-export.py: -------------------------------------------------------------------------------- 1 | ## copyed from https://github.com/triple-Mu/YOLOv8-TensorRT 2 | 3 | import argparse 4 | from io import BytesIO 5 | 6 | import onnx 7 | import torch 8 | from typing import Tuple 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch import Graph, Tensor, Value 13 | from ultralytics import YOLO 14 | 15 | from common import make_anchors, C2f 16 | 17 | try: 18 | import onnxsim 19 | except ImportError: 20 | onnxsim = None 21 | 22 | class PostSeg(nn.Module): 23 | export = True 24 | shape = None 25 | dynamic = False 26 | 27 | def __init__(self, *args, **kwargs): 28 | super().__init__() 29 | 30 | def forward(self, x): 31 | p = self.proto(x[0]) # mask protos 32 | bs = p.shape[0] # batch size 33 | mc = torch.cat( 34 | [self.cv4[i](x[i]).view(bs, self.nm, -1) for i in range(self.nl)], 35 | 2) # mask coefficients 36 | boxes, scores, labels = self.forward_det(x) 37 | out = torch.cat([boxes, scores, labels.float(), mc.transpose(1, 2)], 2) 38 | return out, p.flatten(2) 39 | 40 | def forward_det(self, x): 41 | shape = x[0].shape 42 | b, res, b_reg_num = shape[0], [], self.reg_max * 4 43 | for i in range(self.nl): 44 | res.append(torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)) 45 | if self.dynamic or self.shape != 
shape: 46 | self.anchors, self.strides = \ 47 | (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5)) 48 | self.shape = shape 49 | x = [i.view(b, self.no, -1) for i in res] 50 | y = torch.cat(x, 2) 51 | boxes, scores = y[:, :b_reg_num, ...], y[:, b_reg_num:, ...].sigmoid() 52 | boxes = boxes.view(b, 4, self.reg_max, -1).permute(0, 1, 3, 2) 53 | boxes = boxes.softmax(-1) @ torch.arange(self.reg_max).to(boxes) 54 | boxes0, boxes1 = -boxes[:, :2, ...], boxes[:, 2:, ...] 55 | boxes = self.anchors.repeat(b, 2, 1) + torch.cat([boxes0, boxes1], 1) 56 | boxes = boxes * self.strides 57 | scores, labels = scores.transpose(1, 2).max(dim=-1, keepdim=True) 58 | return boxes.transpose(1, 2), scores, labels 59 | 60 | def parse_args(): 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('-w', 63 | '--weights', 64 | type=str, 65 | required=True, 66 | help='PyTorch yolov8 weights') 67 | parser.add_argument('--opset', 68 | type=int, 69 | default=11, 70 | help='ONNX opset version') 71 | parser.add_argument('--sim', 72 | action='store_true', 73 | help='simplify onnx model') 74 | parser.add_argument('--input-shape', 75 | nargs='+', 76 | type=int, 77 | default=[1, 3, 640, 640], 78 | help='Model input shape only for api builder') 79 | parser.add_argument('--device', 80 | type=str, 81 | default='cpu', 82 | help='Export ONNX device') 83 | args = parser.parse_args() 84 | assert len(args.input_shape) == 4 85 | return args 86 | 87 | 88 | def main(args): 89 | YOLOv8 = YOLO(args.weights) 90 | model = YOLOv8.model.fuse().eval() 91 | for m in model.modules(): 92 | s = str(type(m))[6:-2].split('.')[-1] 93 | if s == 'Segment': 94 | setattr(m, '__class__', PostSeg) 95 | elif s == 'C2f': 96 | setattr(m, '__class__', C2f) 97 | m.to(args.device) 98 | model.to(args.device) 99 | fake_input = torch.randn(args.input_shape).to(args.device) 100 | for _ in range(2): 101 | model(fake_input) 102 | save_path = args.weights.replace('.pt', '.onnx') 103 | with BytesIO() as f: 104 | torch.onnx.export(model, 105 | fake_input, 106 | f, 107 | opset_version=args.opset, 108 | input_names=['images'], 109 | output_names=['outputs', 'proto']) 110 | f.seek(0) 111 | onnx_model = onnx.load(f) 112 | onnx.checker.check_model(onnx_model) 113 | if args.sim: 114 | try: 115 | onnx_model, check = onnxsim.simplify(onnx_model) 116 | assert check, 'assert check failed' 117 | except Exception as e: 118 | print(f'Simplifier failure: {e}') 119 | onnx.save(onnx_model, save_path) 120 | print(f'ONNX export success, saved as {save_path}') 121 | 122 | 123 | if __name__ == '__main__': 124 | main(parse_args()) -------------------------------------------------------------------------------- /model/yolo/yolov9-det-export.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | from io import BytesIO 4 | from typing import Tuple 5 | 6 | import onnx 7 | import torch 8 | from onnx import TensorProto 9 | from ultralytics import YOLO 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch import Graph, Tensor, Value 15 | 16 | try: 17 | import onnxsim 18 | except ImportError: 19 | onnxsim = None 20 | 21 | from models.experimental import attempt_load 22 | 23 | class TRT_NMS(torch.autograd.Function): 24 | 25 | @staticmethod 26 | def forward( 27 | ctx: Graph, 28 | boxes: Tensor, 29 | scores: Tensor, 30 | iou_threshold: float = 0.65, 31 | score_threshold: float = 0.25, 32 | max_output_boxes: int = 100, 33 | background_class: int = -1, 34 | 
box_coding: int = 0, 35 | plugin_version: str = '1', 36 | score_activation: int = 0 37 | ) -> Tuple[Tensor, Tensor, Tensor, Tensor]: 38 | batch_size, num_boxes, num_classes = scores.shape 39 | num_dets = torch.randint(0, 40 | max_output_boxes, (batch_size, 1), 41 | dtype=torch.int32) 42 | boxes = torch.randn(batch_size, max_output_boxes, 4) 43 | scores = torch.randn(batch_size, max_output_boxes) 44 | labels = torch.randint(0, 45 | num_classes, (batch_size, max_output_boxes), 46 | dtype=torch.int32) 47 | 48 | return num_dets, boxes, scores, labels 49 | 50 | @staticmethod 51 | def symbolic( 52 | g, 53 | boxes: Value, 54 | scores: Value, 55 | iou_threshold: float = 0.45, 56 | score_threshold: float = 0.25, 57 | max_output_boxes: int = 100, 58 | background_class: int = -1, 59 | box_coding: int = 0, 60 | score_activation: int = 0, 61 | plugin_version: str = '1') -> Tuple[Value, Value, Value, Value]: 62 | out = g.op('TRT::EfficientNMS_TRT', 63 | boxes, 64 | scores, 65 | iou_threshold_f=iou_threshold, 66 | score_threshold_f=score_threshold, 67 | max_output_boxes_i=max_output_boxes, 68 | background_class_i=background_class, 69 | box_coding_i=box_coding, 70 | plugin_version_s=plugin_version, 71 | score_activation_i=score_activation, 72 | outputs=4) 73 | nums_dets, boxes, scores, classes = out 74 | return nums_dets, boxes, scores, classes 75 | 76 | class ORT_NMS(torch.autograd.Function): 77 | '''ONNX-Runtime NMS operation''' 78 | @staticmethod 79 | def forward(ctx, 80 | boxes, 81 | scores, 82 | max_output_boxes_per_class=torch.tensor([100]), 83 | iou_threshold=torch.tensor([0.45]), 84 | score_threshold=torch.tensor([0.25])): 85 | device = boxes.device 86 | batch = scores.shape[0] 87 | num_det = random.randint(0, 100) 88 | batches = torch.randint(0, batch, (num_det,)).sort()[0].to(device) 89 | idxs = torch.arange(100, 100 + num_det).to(device) 90 | zeros = torch.zeros((num_det,), dtype=torch.int64).to(device) 91 | selected_indices = torch.cat([batches[None], zeros[None], idxs[None]], 0).T.contiguous() 92 | selected_indices = selected_indices.to(torch.int64) 93 | return selected_indices 94 | 95 | @staticmethod 96 | def symbolic(g, boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold): 97 | return g.op("NonMaxSuppression", boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold) 98 | 99 | class YOLOv9(nn.Module): 100 | export = True 101 | shape = None 102 | dynamic = True 103 | iou_thres = 0.65 104 | conf_thres = 0.25 105 | topk = 100 106 | use_trt_nms = False 107 | use_onnx_nms = False 108 | def __init__(self, weights, device='cpu'): 109 | super().__init__() 110 | self.device = device 111 | self.model = attempt_load(weights, device=self.device, inplace=True, fuse=True) 112 | self.convert_matrix = torch.tensor([[1, 0, 1, 0], [0, 1, 0, 1], [-0.5, 0, 0.5, 0], [0, -0.5, 0, 0.5]], 113 | dtype=torch.float32, 114 | device=self.device) 115 | 116 | 117 | def forward(self, x): 118 | out, _ = self.model(x) 119 | bs = out.shape[0] # batch size 120 | nc = out.shape[1] - 4 # number of classes 121 | boxes, scores = out.split((4,nc), 1) 122 | boxes = (boxes.transpose(1,2) @ self.convert_matrix) 123 | 124 | if self.use_trt_nms: 125 | return TRT_NMS.apply(boxes, scores.transpose(1, 2), 126 | self.iou_thres, self.conf_thres, self.topk) 127 | elif self.use_onnx_nms: 128 | max_output_boxes_per_class = torch.tensor([self.topk]) 129 | iou_thres = torch.tensor([self.iou_thres]) 130 | conf_thres = torch.tensor([self.conf_thres]) 131 | num_selected_indices = ORT_NMS.apply(boxes, scores, 
max_output_boxes_per_class, iou_thres, conf_thres) 132 | 133 | scores = scores.transpose(1, 2) 134 | bbox_result = self.gather(boxes, num_selected_indices) 135 | score_intermediate_result = self.gather(scores, num_selected_indices).max(axis=-1) 136 | score_result = score_intermediate_result.values 137 | classes_result = score_intermediate_result.indices.to(torch.int32) 138 | num_dets = torch.tensor(score_result.shape[-1]).reshape([1,1]).to(torch.int32).clone().detach() 139 | 140 | return (num_dets, bbox_result, score_result, classes_result) 141 | else: 142 | scores, labels = scores.transpose(1, 2).max(dim=-1, keepdim=True) 143 | return torch.cat([boxes, scores, labels], dim=2) 144 | 145 | def gather(self, target, idx): 146 | pick_indices = idx[:, -1:].repeat(1, target.shape[2]).unsqueeze(0) 147 | return torch.gather(target, 1, pick_indices) 148 | 149 | def parse_args(): 150 | parser = argparse.ArgumentParser() 151 | parser.add_argument('-w', 152 | '--weights', 153 | type=str, 154 | required=True, 155 | help='PyTorch yolov8 weights') 156 | parser.add_argument('--trt-nms', 157 | action='store_true', 158 | required=False, 159 | help='Use TensorRT Efficient NMS plugins') 160 | parser.add_argument('--onnx-nms', 161 | action='store_true', 162 | required=False, 163 | help='Use onnx NMS ops') 164 | parser.add_argument('--iou-thres', 165 | type=float, 166 | default=0.65, 167 | help='IOU threshoud for NMS plugin') 168 | parser.add_argument('--conf-thres', 169 | type=float, 170 | default=0.25, 171 | help='CONF threshoud for NMS plugin') 172 | parser.add_argument('--topk', 173 | type=int, 174 | default=100, 175 | help='Max number of detection bboxes') 176 | parser.add_argument('--opset', 177 | type=int, 178 | default=11, 179 | help='ONNX opset version') 180 | parser.add_argument('--sim', 181 | action='store_true', 182 | help='simplify onnx model') 183 | parser.add_argument('--input-shape', 184 | nargs='+', 185 | type=int, 186 | default=[1, 3, 640, 640], 187 | help='Model input shape only for api builder') 188 | parser.add_argument('--device', 189 | type=str, 190 | default='cpu', 191 | help='Export ONNX device') 192 | args = parser.parse_args() 193 | assert len(args.input_shape) == 4 194 | YOLOv9.conf_thres = args.conf_thres 195 | YOLOv9.iou_thres = args.iou_thres 196 | YOLOv9.topk = args.topk 197 | YOLOv9.use_trt_nms = args.trt_nms 198 | YOLOv9.use_onnx_nms = args.onnx_nms 199 | return args 200 | 201 | 202 | def export_end2end(args): 203 | b = args.input_shape[0] 204 | model = YOLOv9(args.weights) 205 | model.to(args.device) 206 | fake_input = torch.randn(args.input_shape).to(args.device) 207 | for _ in range(2): 208 | model(fake_input) 209 | save_path = args.weights[:-3]+ '_end2end.onnx' 210 | with BytesIO() as f: 211 | torch.onnx.export( 212 | model, 213 | fake_input, 214 | f, 215 | opset_version=args.opset, 216 | input_names=['images'], 217 | output_names=['num_dets', 'bboxes', 'scores', 'labels']) 218 | f.seek(0) 219 | onnx_model = onnx.load(f) 220 | onnx.checker.check_model(onnx_model) 221 | shapes = [b, 1, b, args.topk, 4, b, args.topk, b, args.topk] 222 | for i in onnx_model.graph.output: 223 | for j in i.type.tensor_type.shape.dim: 224 | j.dim_param = str(shapes.pop(0)) 225 | if args.sim: 226 | try: 227 | onnx_model, check = onnxsim.simplify(onnx_model) 228 | assert check, 'assert check failed' 229 | except Exception as e: 230 | print(f'Simplifier failure: {e}') 231 | onnx.save(onnx_model, save_path) 232 | print(f'ONNX export success, saved as {save_path}') 233 | 234 | def export_normal(args): 
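# Without --trt-nms / --onnx-nms the exported graph contains no NMS op: it emits a single
# [1, num_anchors, 6] tensor (x0, y0, x1, y1, score, label) that the C++ side decodes in
# YOLO::postprocess_with_nms before running cv::dnn::NMSBoxes.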
235 | b = args.input_shape[0] 236 | model = YOLOv9(args.weights) 237 | model.to(args.device) 238 | fake_input = torch.randn(args.input_shape).to(args.device) 239 | for _ in range(2): 240 | model(fake_input) 241 | # save_path = args.weights.replace('.pt', '.onnx') 242 | save_path = args.weights[:-3] + '_normal.onnx' 243 | with BytesIO() as f: 244 | torch.onnx.export( 245 | model, 246 | fake_input, 247 | f, 248 | opset_version=args.opset, 249 | input_names=['images'], 250 | output_names=['output']) 251 | f.seek(0) 252 | onnx_model = onnx.load(f) 253 | onnx.checker.check_model(onnx_model) 254 | 255 | if args.sim: 256 | try: 257 | onnx_model, check = onnxsim.simplify(onnx_model) 258 | assert check, 'assert check failed' 259 | except Exception as e: 260 | print(f'Simplifier failure: {e}') 261 | onnx.save(onnx_model, save_path) 262 | print(f'ONNX export success, saved as {save_path}') 263 | 264 | def main(args): 265 | if args.trt_nms or args.onnx_nms: 266 | export_end2end(args) 267 | else: 268 | export_normal(args) 269 | 270 | if __name__=='__main__': 271 | main(parse_args()) -------------------------------------------------------------------------------- /output/dbnet/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/dbnet/01.png -------------------------------------------------------------------------------- /output/dbnet/02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/dbnet/02.png -------------------------------------------------------------------------------- /output/sam/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/sam/dogs.jpg -------------------------------------------------------------------------------- /output/yolo/detect/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/detect/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/detect/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/bus.jpg -------------------------------------------------------------------------------- /output/yolo/detect/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/detect/zidane.jpg -------------------------------------------------------------------------------- 
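For reference, the four outputs produced by export_end2end in model/yolo/yolov9-det-export.py can be consumed from onnxruntime without any further post-processing, since NMS is already embedded in the graph. A minimal sketch, assuming onnxruntime is installed, the model was exported as yolov9-c_end2end.onnx (hypothetical file name), and blob is an image already preprocessed to the default 1x3x640x640 float32 input:

import numpy as np
import onnxruntime as ort

# Hypothetical file name; use the *_end2end.onnx file written by the export script.
session = ort.InferenceSession("yolov9-c_end2end.onnx", providers=["CPUExecutionProvider"])
blob = np.zeros((1, 3, 640, 640), dtype=np.float32)  # placeholder; substitute a real preprocessed image
# Output names and shapes follow the export script: num_dets (1,1), bboxes (1,topk,4), scores (1,topk), labels (1,topk)
num_dets, bboxes, scores, labels = session.run(["num_dets", "bboxes", "scores", "labels"], {"images": blob})
for i in range(int(num_dets[0, 0])):
    print(bboxes[0, i], scores[0, i], labels[0, i])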
/output/yolo/pose/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/pose/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/pose/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/bus.jpg -------------------------------------------------------------------------------- /output/yolo/pose/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/pose/zidane.jpg -------------------------------------------------------------------------------- /output/yolo/segment/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /output/yolo/segment/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /output/yolo/segment/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/bus.jpg -------------------------------------------------------------------------------- /output/yolo/segment/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/output/yolo/segment/zidane.jpg -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(yolo_test ${CMAKE_CURRENT_SOURCE_DIR}/yolo_test.cpp) 2 | target_include_directories(yolo_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 3 | target_link_libraries(yolo_test PUBLIC yolo_det yolo_seg yolo_pose yolo_det_cutoff yolo_seg_cutoff) 4 | target_link_directories(yolo_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 5 | 6 | add_executable(ocr_test ${CMAKE_CURRENT_SOURCE_DIR}/ocr_test.cpp) 7 | target_include_directories(ocr_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 8 | 
target_link_libraries(ocr_test PUBLIC ctc attn dbnet) 9 | target_link_directories(ocr_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 10 | 11 | add_executable(sam_test ${CMAKE_CURRENT_SOURCE_DIR}/sam_test.cpp) 12 | target_include_directories(sam_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 13 | target_link_libraries(sam_test PUBLIC sam) 14 | target_link_directories(sam_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 15 | 16 | add_executable(clip_test ${CMAKE_CURRENT_SOURCE_DIR}/clip_test.cpp) 17 | target_include_directories(clip_test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} ${PROJECT_BINARY_DIR}) 18 | target_link_libraries(clip_test PUBLIC clip) 19 | target_link_directories(clip_test PUBLIC ${TensorRT_LIBRARIES} ${OpenCV_LIBS} ${ONNXRUNTIME_LIBS}) 20 | 21 | # add_executable(test ${CMAKE_CURRENT_SOURCE_DIR}/test.cpp) 22 | # target_include_directories(test PUBLIC ${INCLUDE_DIRS} ${CMAKE_SOURCE_DIR}) 23 | # target_link_libraries(test ${ONNXRUNTIME_LIBS}) 24 | # target_link_directories(test PUBLIC ${OpenCV_LIBS}) -------------------------------------------------------------------------------- /test/clip_test.cpp: -------------------------------------------------------------------------------- 1 | #include "model/clip/text_tokenizer.h" 2 | #include "model/clip/image_encoder.h" 3 | #include "model/clip/text_encoder.h" 4 | #include "model/clip/clip.h" 5 | 6 | void ModuleTest() { 7 | clip::TextTokenizer tokenizer("/home/stardust/my_work/model-zoo-cxx/weights/clip/bpe_simple_vocab_16e6.txt.gz"); 8 | std::vector<int> tokens = tokenizer.tokenize("a photo of a woman"); 9 | for(int token : tokens) { 10 | std::cout << token << ","; 11 | } 12 | std::cout << std::endl; 13 | 14 | std::string current_path = "../"; 15 | std::string image_encoder_cfg = current_path + "config/clip/image_encoder.yaml"; 16 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 17 | 18 | clip::ImageEncoder image_encoder(image_encoder_cfg); 19 | 20 | std::vector<cv::Mat> images; 21 | images.push_back(cv::imread("../test/image/clip/franz-kafka.jpg")); 22 | 23 | IOTensor image_embeddings; 24 | image_encoder.forward(images, image_embeddings); 25 | std::cout << "Shape of image_embeddings: ["; 26 | for (int64_t i : image_embeddings.shape) { 27 | std::cout << i << ","; 28 | } 29 | std::cout << "]" << std::endl; 30 | 31 | float* ptr = (float*)image_embeddings.data(); 32 | float min_val = FLT_MAX, max_val = -FLT_MAX; 33 | for (size_t i = 0; i < image_embeddings.size() / 4; i++) { 34 | float val = *ptr++; 35 | if (val > max_val) { 36 | max_val = val; 37 | } 38 | if (val < min_val) { 39 | min_val = val; 40 | } 41 | } 42 | std::cout << "Range of image_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 43 | 44 | clip::TextEncoder text_encoder(text_encoder_cfg); 45 | 46 | std::vector<std::string> texts{"a photo of a man", "a photo of a woman"}; 47 | 48 | IOTensor text_embeddings; 49 | text_encoder.forward(texts, text_embeddings); 50 | std::cout << "Shape of text_embeddings: ["; 51 | for (int64_t i : text_embeddings.shape) { 52 | std::cout << i << ","; 53 | } 54 | std::cout << "]" << std::endl; 55 | 56 | ptr = (float*)text_embeddings.data(); 57 | min_val = FLT_MAX; 58 | max_val = -FLT_MAX; 59 | for (size_t i = 0; i < text_embeddings.size() / 4; i++) { 60 | float val = *ptr++; 61 | if (val > max_val) { 62 | max_val = val; 63 | } 64 | if (val < min_val) { 65 | min_val = val; 66 | } 67 | } 68 | std::cout << "Range of text_embeddings: [" << min_val << 
"," << max_val << "]" << std::endl; 69 | 70 | std::vector norm_image_embeddings; 71 | 72 | ptr = (float*)image_embeddings.data(); 73 | for (size_t i = 0; i < images.size(); i++) { 74 | float norm = 0.0; 75 | for (size_t j = 0; j < 512; j++) { 76 | norm += std::pow(*(ptr+j), 2); 77 | } 78 | norm = std::sqrt(norm); 79 | 80 | for (size_t j = 0; j < 512; j++) { 81 | *ptr = *ptr / norm; 82 | ++ptr; 83 | } 84 | } 85 | 86 | ptr = (float*)text_embeddings.data(); 87 | for (size_t i = 0; i < texts.size(); i++) { 88 | float norm = 0.0; 89 | for (size_t j = 0; j < 512; j++) { 90 | norm += std::pow(*(ptr+j), 2); 91 | } 92 | norm = std::sqrt(norm); 93 | 94 | for (size_t j = 0; j < 512; j++) { 95 | *ptr = *ptr / norm; 96 | ++ptr; 97 | } 98 | } 99 | 100 | ptr = (float*)image_embeddings.data(); 101 | min_val = FLT_MAX; 102 | max_val = -FLT_MAX; 103 | for (size_t i = 0; i < image_embeddings.size() / 4; i++) { 104 | float val = *ptr++; 105 | if (val > max_val) { 106 | max_val = val; 107 | } 108 | if (val < min_val) { 109 | min_val = val; 110 | } 111 | } 112 | std::cout << "After normalization, range of image_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 113 | 114 | ptr = (float*)text_embeddings.data(); 115 | min_val = FLT_MAX; 116 | max_val = -FLT_MAX; 117 | for (size_t i = 0; i < text_embeddings.size() / 4; i++) { 118 | float val = *ptr++; 119 | if (val > max_val) { 120 | max_val = val; 121 | } 122 | if (val < min_val) { 123 | min_val = val; 124 | } 125 | } 126 | std::cout << "After normalization, range of text_embeddings: [" << min_val << "," << max_val << "]" << std::endl; 127 | 128 | cv::Mat image_matrix(images.size(), 512, CV_32F, image_embeddings.data()); 129 | cv::Mat text_matrix(texts.size(), 512, CV_32F, text_embeddings.data()); 130 | cv::Mat result; 131 | cv::gemm(image_matrix, text_matrix.t(), 100, cv::Mat(), 0.0, result); 132 | std::cout << result << std::endl; 133 | } 134 | 135 | void GetTextEmbeddings() { 136 | std::string current_path = "../"; 137 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 138 | clip::TextEncoder text_encoder(text_encoder_cfg); 139 | 140 | std::string prompt_path = current_path + "config/clip/prompts.txt"; 141 | std::ifstream file(prompt_path); 142 | std::vector texts; 143 | 144 | if (file.is_open()) { 145 | std::string line; 146 | while (std::getline(file, line)) { 147 | texts.push_back(line); // 逐行读取文件内容并存储到 vector 中 148 | } 149 | file.close(); // 关闭文件 150 | } else { 151 | std::cout << "无法打开文件" << std::endl; 152 | } 153 | 154 | std::cout << "Prompts: "; 155 | for (const auto& l : texts) { 156 | std::cout << l << ", "; 157 | } 158 | std::cout << std::endl; 159 | 160 | IOTensor text_embeddings; 161 | text_encoder.forward(texts, text_embeddings); 162 | std::cout << "Shape of image text_embeddings: ["; 163 | for (int64_t i : text_embeddings.shape) { 164 | std::cout << i << ","; 165 | } 166 | std::cout << "]" << std::endl; 167 | 168 | 169 | std::string fname = current_path + "weights/clip/text_embeddings.bin"; 170 | std::ofstream fout(fname.c_str(), std::ios::binary | std::ios::out); 171 | fout.write((char *)text_embeddings.data(), text_embeddings.size()); 172 | fout.close(); 173 | } 174 | 175 | void PipeLineTest() { 176 | std::string current_path = "../"; 177 | std::string image_encoder_cfg = current_path + "config/clip/image_encoder.yaml"; 178 | std::string text_encoder_cfg = current_path + "config/clip/text_encoder.yaml"; 179 | 180 | clip::Clip clip_model(image_encoder_cfg, text_encoder_cfg); 181 | 182 | std::vector 
images; 183 | images.push_back(cv::imread("../test/image/clip/franz-kafka.jpg")); 184 | images.push_back(cv::imread("../test/image/clip/Mona_Lisa.jpg")); 185 | clip_model.encodeImages(images); 186 | 187 | std::vector<std::string> texts{"a photo of a man", "a photo of a woman"}; 188 | clip_model.encodeTexts(texts); 189 | 190 | std::vector<std::vector<float>> probs = clip_model.computeProbabilities(); 191 | 192 | std::cout << "[ "; 193 | for (size_t i = 0; i < probs.size(); i++) { 194 | std::cout << "[ "; 195 | for (size_t j = 0; j < probs[0].size(); j++) { 196 | std::cout << probs[i][j] << " "; 197 | } 198 | std::cout << " ], "; 199 | } 200 | std::cout << " ]" << std::endl; 201 | } 202 | 203 | int main(int argc, char** argv) { 204 | if (argc == 2 && std::string(argv[1]) == "-g") { 205 | GetTextEmbeddings(); 206 | } 207 | PipeLineTest(); 208 | } -------------------------------------------------------------------------------- /test/image/clip/Mona_Lisa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/clip/Mona_Lisa.jpg -------------------------------------------------------------------------------- /test/image/clip/franz-kafka.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/clip/franz-kafka.jpg -------------------------------------------------------------------------------- /test/image/detect/COCO_train2014_000000181904.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/COCO_train2014_000000181904.jpg -------------------------------------------------------------------------------- /test/image/detect/COCO_train2014_000000291797.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/COCO_train2014_000000291797.jpg -------------------------------------------------------------------------------- /test/image/detect/bus.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/bus.jpg -------------------------------------------------------------------------------- /test/image/detect/zidane.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/detect/zidane.jpg -------------------------------------------------------------------------------- /test/image/ocr/det/01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/det/01.png -------------------------------------------------------------------------------- /test/image/ocr/det/02.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/det/02.png -------------------------------------------------------------------------------- /test/image/ocr/rec/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/ocr/rec/demo.png -------------------------------------------------------------------------------- /test/image/sam/dogs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Huntersdeng/CXX-DeepLearning-Inference/9086157643f47a5c761caf0f45df0ee377477eb4/test/image/sam/dogs.jpg -------------------------------------------------------------------------------- /test/ocr_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/ocr/ctc.h" 5 | #include "model/ocr/attention.h" 6 | #include "model/ocr/dbnet.h" 7 | 8 | void CtcModelTest() { 9 | std::string current_path = "../"; 10 | std::string yaml_file = current_path + "config/ocr/rec/ctc.yaml"; 11 | 12 | CtcModel model(yaml_file); 13 | 14 | std::vector<std::string> imagePathList; 15 | std::string input_path = current_path + "test/image/ocr/rec"; 16 | cv::glob(input_path + "/*.png", imagePathList); 17 | 18 | cv::Mat image, res; 19 | 20 | for (auto& path : imagePathList) { 21 | auto start = std::chrono::system_clock::now(); 22 | image = cv::imread(path, 0); 23 | std::string output = model.detect(image); 24 | std::cout << path << ": " << output << std::endl; 25 | auto end = std::chrono::system_clock::now(); 26 | auto tc = (double)std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000.; 27 | printf("cost %2.4lf ms\n", tc); 28 | } 29 | } 30 | 31 | void AttnModelTest() { 32 | std::string current_path = "../"; 33 | std::string yaml_file = current_path + "config/ocr/rec/attn.yaml"; 34 | 35 | AttnModel model(yaml_file); 36 | 37 | std::vector<std::string> imagePathList; 38 | std::string input_path = current_path + "test/image/ocr/rec"; 39 | cv::glob(input_path + "/*.png", imagePathList); 40 | 41 | cv::Mat image, res; 42 | 43 | for (auto& path : imagePathList) { 44 | auto start = std::chrono::system_clock::now(); 45 | image = cv::imread(path); 46 | std::string output = model.detect(image); 47 | std::cout << path << ": " << output << std::endl; 48 | auto end = std::chrono::system_clock::now(); 49 | auto tc = (double)std::chrono::duration_cast<std::chrono::microseconds>(end - start).count() / 1000.; 50 | printf("cost %2.4lf ms\n", tc); 51 | } 52 | } 53 | 54 | void DBNetTest() { 55 | std::string current_path = "../"; 56 | std::string yaml_file = current_path + "config/ocr/det/dbnet.yaml"; 57 | 58 | DBNet model(yaml_file); 59 | 60 | std::vector<std::string> imagePathList; 61 | std::string input_path = current_path + "test/image/ocr/det"; 62 | std::string output_path = current_path + "output/dbnet"; 63 | cv::glob(input_path + "/*.png", imagePathList); 64 | 65 | cv::Mat image, input_image, res; 66 | std::vector objs; 67 | 68 | for (auto& path : imagePathList) { 69 | objs.clear(); 70 | std::cout << path << std::endl; 71 | image = cv::imread(path); 72 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 73 | model.detect(input_image, objs); 74 | DrawBoxes(image, res, objs); 75 | 76 | std::string::size_type iPos = path.find_last_of('/') + 1; 77 | std::string filename = path.substr(iPos, 
path.length() - iPos); 78 | std::string out_path = output_path + "/" + filename; 79 | // cv::imshow("image", res); 80 | // cv::waitKey(0); 81 | cv::imwrite(out_path, res); 82 | } 83 | } 84 | 85 | int main() { 86 | // CtcModelTest(); 87 | // AttnModelTest(); 88 | DBNetTest(); 89 | } -------------------------------------------------------------------------------- /test/sam_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/sam/sam.h" 5 | 6 | int main() { 7 | std::string current_path = "../"; 8 | std::string encoder_cfg = current_path + "config/sam/image_encoder.yaml"; 9 | std::string decoder_cfg = current_path + "config/sam/mask_decoder.yaml"; 10 | 11 | sam::SAM sam_model(encoder_cfg, decoder_cfg); 12 | 13 | cv::Mat image, input_image; 14 | 15 | image = cv::imread("../test/image/sam/dogs.jpg"); 16 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 17 | 18 | sam_model.setImage(input_image); 19 | 20 | std::vector<cv::Point2f> points; 21 | points.push_back(cv::Point2f(100, 100)); 22 | points.push_back(cv::Point2f(850, 759)); 23 | std::vector labels{2, 3}; 24 | 25 | cv::Mat output_mask; 26 | sam_model.predict(points, labels, output_mask); 27 | 28 | cv::Mat res = image.clone(); 29 | cv::Mat mask = image.clone(); 30 | 31 | cv::rectangle(res, cv::Rect(100, 100, 750, 659), {0, 0, 255}, 2); 32 | mask.setTo(cv::Scalar(255, 56, 56), output_mask); 33 | cv::addWeighted(res, 0.5, mask, 0.8, 1, res); 34 | cv::imwrite("../output/sam/dogs.jpg", res); 35 | } -------------------------------------------------------------------------------- /test/test.cpp: -------------------------------------------------------------------------------- 1 | #include "onnxruntime_cxx_api.h" 2 | 3 | int main() { 4 | // Allocate ONNXRuntime session 5 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU); 6 | Ort::Env env; 7 | Ort::Session session{env, ORT_TSTR("../weights/ocr/best-train-abinet.onnx"), Ort::SessionOptions{nullptr}}; 8 | 9 | // Allocate model inputs: fill in shape and size 10 | std::array<float, 1 * 3 * 32 * 128> input; 11 | std::array<int64_t, 4> input_shape{1, 3, 32, 128}; 12 | Ort::Value input_tensor = Ort::Value::CreateTensor<float>(memory_info, input.data(), input.size(), input_shape.data(), input_shape.size()); 13 | const char* input_names[] = {"images"}; 14 | 15 | // Allocate model outputs: fill in shape and size 16 | std::array<float, 1 * 26 * 1> output; 17 | std::array<int64_t, 3> output_shape{1, 26, 1}; 18 | Ort::Value output_tensor = Ort::Value::CreateTensor<float>(memory_info, output.data(), output.size(), output_shape.data(), output_shape.size()); 19 | const char* output_names[] = {"output"}; 20 | 21 | // Run the model 22 | session.Run(Ort::RunOptions{nullptr}, input_names, &input_tensor, 1, output_names, &output_tensor, 1); 23 | return 0; 24 | } 25 | -------------------------------------------------------------------------------- /test/yolo_test.cpp: -------------------------------------------------------------------------------- 1 | #include "framework/framework.h" 2 | #include "common/common.h" 3 | 4 | #include "model/yolo/yolo_seg.h" 5 | #include "model/yolo/yolo_pose.h" 6 | #include "model/yolo/yolo.h" 7 | #include "model/yolo/yolo_seg_cutoff.h" 8 | #include "model/yolo/yolo_cutoff.h" 9 | 10 | void YOLODetTest() { 11 | std::string current_path = "../"; 12 | std::string yaml_file = current_path + "config/yolo/yolo.yaml"; 13 | 14 | YOLO model(yaml_file); 15 | 16 | std::vector<std::string> imagePathList; 17 | std::string input_path = current_path 
+ "test/image/detect"; 18 | std::string output_path = current_path + "output/yolo/detect"; 19 | cv::glob(input_path + "/*.jpg", imagePathList); 20 | 21 | cv::Mat image, input_image, res; 22 | std::vector objs; 23 | 24 | std::vector class_names; 25 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 26 | 27 | for (auto& path : imagePathList) { 28 | objs.clear(); 29 | std::cout << path << std::endl; 30 | image = cv::imread(path); 31 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 32 | model.detect(input_image, objs); 33 | DrawObjects(image, res, objs, class_names, COLORS); 34 | 35 | std::string::size_type iPos = path.find_last_of('/') + 1; 36 | std::string filename = path.substr(iPos, path.length() - iPos); 37 | std::string out_path = output_path + "/" + filename; 38 | // cv::imshow("image", res); 39 | // cv::waitKey(0); 40 | cv::imwrite(out_path, res); 41 | } 42 | } 43 | 44 | void YOLODetCutoffTest() { 45 | std::string current_path = "../"; 46 | std::string yaml_file = current_path + "config/yolo/yolo_cutoff.yaml"; 47 | 48 | YOLOCutoff model(yaml_file); 49 | 50 | std::vector imagePathList; 51 | std::string input_path = current_path + "test/image/detect"; 52 | std::string output_path = current_path + "output/yolo/detect"; 53 | cv::glob(input_path + "/*.jpg", imagePathList); 54 | 55 | cv::Mat image, input_image, res; 56 | std::vector objs; 57 | 58 | std::vector class_names; 59 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 60 | 61 | for (auto& path : imagePathList) { 62 | objs.clear(); 63 | std::cout << path << std::endl; 64 | image = cv::imread(path); 65 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 66 | 67 | auto start = std::chrono::system_clock::now(); 68 | model.detect(input_image, objs); 69 | auto end = std::chrono::system_clock::now(); 70 | std::cout << "Costs: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 71 | 72 | DrawObjects(image, res, objs, class_names, COLORS); 73 | 74 | std::string::size_type iPos = path.find_last_of('/') + 1; 75 | std::string filename = path.substr(iPos, path.length() - iPos); 76 | std::string out_path = output_path + "/" + filename; 77 | // cv::imshow("image", res); 78 | // cv::waitKey(0); 79 | cv::imwrite(out_path, res); 80 | } 81 | } 82 | 83 | void YOLOSegTest() { 84 | std::string current_path = "../"; 85 | std::string yaml_file = current_path + "config/yolo/yolo_seg.yaml"; 86 | 87 | YOLOSeg model(yaml_file); 88 | 89 | std::vector imagePathList; 90 | std::string input_path = current_path + "test/image/detect"; 91 | std::string output_path = current_path + "output/yolo/segment"; 92 | cv::glob(input_path + "/*.jpg", imagePathList); 93 | 94 | cv::Mat image, input_image, res; 95 | std::vector objs; 96 | 97 | std::vector class_names; 98 | ReadClassNames(current_path + "config/yolo/stardust.txt", class_names); 99 | 100 | for (auto& path : imagePathList) { 101 | objs.clear(); 102 | std::cout << path << std::endl; 103 | image = cv::imread(path); 104 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 105 | 106 | auto start = std::chrono::system_clock::now(); 107 | model.detect(input_image, objs); 108 | auto end = std::chrono::system_clock::now(); 109 | std::cout << "Costs: " << std::chrono::duration_cast(end - start).count() << "ms" << std::endl; 110 | 111 | DrawObjectsMasks(image, res, objs, class_names, COLORS, MASK_COLORS); 112 | 113 | std::string::size_type iPos = path.find_last_of('/') + 1; 114 | std::string filename = path.substr(iPos, path.length() - iPos); 115 | 
std::string out_path = output_path + "/" + filename; 116 | // cv::imshow("image", res); 117 | // cv::waitKey(0); 118 | cv::imwrite(out_path, res); 119 | } 120 | } 121 | 122 | void YOLOSegCutoffTest() { 123 | std::string current_path = "../"; 124 | std::string yaml_file = current_path + "config/yolo/yolo_seg_cutoff.yaml"; 125 | 126 | YOLOSegCutoff model(yaml_file); 127 | 128 | std::vector<std::string> imagePathList; 129 | std::string input_path = current_path + "test/image/detect"; 130 | std::string output_path = current_path + "output/yolo/segment"; 131 | cv::glob(input_path + "/*.jpg", imagePathList); 132 | 133 | cv::Mat image, input_image, res; 134 | std::vector objs; 135 | 136 | std::vector<std::string> class_names; 137 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 138 | 139 | for (auto& path : imagePathList) { 140 | objs.clear(); 141 | std::cout << path << std::endl; 142 | image = cv::imread(path); 143 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 144 | 145 | auto start = std::chrono::system_clock::now(); 146 | model.detect(input_image, objs); 147 | auto end = std::chrono::system_clock::now(); 148 | std::cout << "Costs: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl; 149 | 150 | DrawObjectsMasks(image, res, objs, class_names, COLORS, MASK_COLORS); 151 | 152 | std::string::size_type iPos = path.find_last_of('/') + 1; 153 | std::string filename = path.substr(iPos, path.length() - iPos); 154 | std::string out_path = output_path + "/" + filename; 155 | // cv::imshow("image", res); 156 | // cv::waitKey(0); 157 | cv::imwrite(out_path, res); 158 | } 159 | } 160 | 161 | void YOLOPoseTest() { 162 | std::string current_path = "../"; 163 | std::string yaml_file = current_path + "config/yolo/yolo_pose.yaml"; 164 | 165 | YOLOPose model(yaml_file); 166 | 167 | std::vector<std::string> imagePathList; 168 | std::string input_path = current_path + "test/image/detect"; 169 | std::string output_path = current_path + "output/yolo/pose"; 170 | cv::glob(input_path + "/*.jpg", imagePathList); 171 | 172 | cv::Mat image, input_image, res; 173 | std::vector objs; 174 | 175 | std::vector<std::string> class_names; 176 | ReadClassNames(current_path + "config/yolo/coco.txt", class_names); 177 | 178 | for (auto& path : imagePathList) { 179 | objs.clear(); 180 | std::cout << path << std::endl; 181 | image = cv::imread(path); 182 | cv::cvtColor(image, input_image, cv::COLOR_BGR2RGB); 183 | model.detect(input_image, objs); 184 | DrawObjectsKps(image, res, objs, SKELETON, KPS_COLORS, LIMB_COLORS); 185 | 186 | std::string::size_type iPos = path.find_last_of('/') + 1; 187 | std::string filename = path.substr(iPos, path.length() - iPos); 188 | std::string out_path = output_path + "/" + filename; 189 | // cv::imshow("image", res); 190 | // cv::waitKey(0); 191 | cv::imwrite(out_path, res); 192 | } 193 | } 194 | 195 | int main() { 196 | // YOLODetTest(); 197 | YOLOSegTest(); 198 | // YOLOPoseTest(); 199 | // YOLODetCutoffTest(); 200 | // YOLOSegCutoffTest(); 201 | } 202 | --------------------------------------------------------------------------------