├── .gitignore
├── Makefile
├── README.cn.md
├── README.md
├── README.onnx.plugin.md
├── README.onnx.plugin.pdf
├── TensorRT.sln
├── TensorRT.vcxproj
├── TensorRT.vcxproj.filters
├── TensorRT.vcxproj.user
├── dbface
│   └── DBFace.py
├── dcn_onnx
│   ├── DCNv2
│   │   ├── .gitignore
│   │   ├── LICENSE
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── dcn_v2.py
│   │   ├── make.sh
│   │   ├── setup.py
│   │   ├── src
│   │   │   ├── cpu
│   │   │   │   ├── dcn_v2_cpu.cpp
│   │   │   │   └── vision.h
│   │   │   ├── cuda
│   │   │   │   ├── dcn_v2_cuda.cu
│   │   │   │   ├── dcn_v2_im2col_cuda.cu
│   │   │   │   ├── dcn_v2_im2col_cuda.h
│   │   │   │   ├── dcn_v2_psroi_pooling_cuda.cu
│   │   │   │   └── vision.h
│   │   │   ├── dcn_v2.h
│   │   │   └── vision.cpp
│   │   └── test.py
│   ├── README.md
│   ├── dcn_v2.py
│   ├── dladcn_export_onnx.py
│   └── pose_dla_dcn.py
├── lean
│   ├── .gitignore
│   └── README.md
├── plugin_onnx_export.py
├── scripts
│   ├── getALL.sh
│   ├── getCenterTrack.sh
│   ├── getDBFace.sh
│   └── getDLADCN.sh
├── src
│   ├── builder
│   │   ├── trt_builder.cpp
│   │   └── trt_builder.hpp
│   ├── caffeplugin
│   │   ├── caffeplugin.cpp
│   │   ├── caffeplugin.hpp
│   │   └── plugins
│   │       ├── ChannelMultiplicationLayer.cu
│   │       ├── ChannelMultiplicationLayer.hpp
│   │       ├── ClipLayer.cu
│   │       ├── ClipLayer.hpp
│   │       ├── DCNLayer.cu
│   │       ├── DCNLayer.hpp
│   │       ├── PlexShuffleLayer.cu
│   │       ├── PlexShuffleLayer.hpp
│   │       ├── TestPlugin.cu
│   │       └── TestPlugin.hpp
│   ├── common
│   │   ├── cc_util.cpp
│   │   ├── cc_util.hpp
│   │   ├── json.cpp
│   │   ├── json.hpp
│   │   ├── trt_common.cpp
│   │   └── trt_common.hpp
│   ├── examples
│   │   ├── center_net_coco2x_dcn.cpp
│   │   ├── center_track_coco_tracking.cpp
│   │   ├── dbface.cpp
│   │   └── onnx.cpp
│   ├── import_lib.cpp
│   ├── infer
│   │   ├── ct_detect_backend.cu
│   │   ├── ct_detect_backend.hpp
│   │   ├── dbface_backend.cu
│   │   ├── dbface_backend.hpp
│   │   ├── task_pool.hpp
│   │   ├── trt_backend.cpp
│   │   ├── trt_backend.hpp
│   │   ├── trt_infer.cpp
│   │   ├── trt_infer.hpp
│   │   └── trt_infer_norm.cu
│   ├── main.cpp
│   ├── onnx
│   │   ├── onnx-operators_ONNX_NAMESPACE-ml.pb.cpp
│   │   ├── onnx-operators_ONNX_NAMESPACE-ml.pb.h
│   │   ├── onnx_ONNX_NAMESPACE-ml.pb.cpp
│   │   ├── onnx_ONNX_NAMESPACE-ml.pb.h
│   │   ├── onnx_pb.h
│   │   └── onnxifi.h
│   ├── onnx_parser
│   │   ├── ImporterContext.hpp
│   │   ├── InstanceNormalization.cpp
│   │   ├── InstanceNormalization.hpp
│   │   ├── ModelImporter.cpp
│   │   ├── ModelImporter.hpp
│   │   ├── NvOnnxParser.cpp
│   │   ├── NvOnnxParser.h
│   │   ├── NvOnnxParserTypedefs.h
│   │   ├── OnnxAttrs.cpp
│   │   ├── OnnxAttrs.hpp
│   │   ├── ResizeNearest.cu
│   │   ├── ResizeNearest.hpp
│   │   ├── ShapedWeights.cpp
│   │   ├── ShapedWeights.hpp
│   │   ├── Split.cu
│   │   ├── Split.hpp
│   │   ├── Status.hpp
│   │   ├── TensorOrWeights.hpp
│   │   ├── builtin_op_importers.cpp
│   │   ├── builtin_op_importers.hpp
│   │   ├── common.hpp
│   │   ├── onnx2trt.hpp
│   │   ├── onnx2trt_common.hpp
│   │   ├── onnx2trt_utils.cpp
│   │   ├── onnx2trt_utils.hpp
│   │   ├── onnx_utils.hpp
│   │   ├── plugin.cpp
│   │   ├── plugin.hpp
│   │   ├── plugin_common.hpp
│   │   ├── serialize.hpp
│   │   ├── toposort.hpp
│   │   ├── trt_utils.hpp
│   │   └── utils.hpp
│   └── onnxplugin
│       ├── onnxplugin.cpp
│       ├── onnxplugin.hpp
│       └── plugins
│           ├── DCNv2.cu
│           ├── DCNv2.hpp
│           ├── HSigmoid.cu
│           ├── HSigmoid.hpp
│           ├── HSwish.cu
│           ├── HSwish.hpp
│           ├── MReLU.cu
│           └── MReLU.hpp
└── workspace
    ├── imgs
    │   ├── 000020.jpg
    │   ├── 000023.jpg
    │   ├── 17790319373_bd19b24cfc_k.jpg
    │   ├── selfie.jpg
    │   └── www.jpg
    ├── logs
    │   ├── 2020-04-15.log
    │   └── 2020-04-17.log
    ├── models
    │   ├── .gitignore
    │   └── demo.onnx
    └── results
        ├── 0.centernet.coco2x.dcn.jpg
        ├── 1.centernet.coco2x.dcn.jpg
        ├── coco.tracking.jpg
        └── selfie.draw.jpg
/.gitignore:
--------------------------------------------------------------------------------
1 | **/.trtmodel
2 | /build
3 | .vs
4 | **/*.pdb
5 | **/*.exe
6 | **/*.ilk
7 | **/*.suo
8 | objs
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Builds the TensorRT-Integrate demo binary from the C++ (*.cpp) and CUDA (*.cu)
2 | # sources found under $(SRCDIR).
3 | #   make        build $(BINDIR)/$(OUTNAME)
4 | #   make run    build, then run the binary from inside $(BINDIR)
5 | #   make clean  remove $(OBJDIR) and the binary
6 |
7 | ECHO = @echo
8 | OUTNAME = trtrun
9 | CC := g++
10 | CUCC := nvcc
11 | SRCDIR := src
12 | OBJDIR := objs
13 | # Root of the third-party SDK tree (OpenCV, CUDA, cuDNN, TensorRT) referenced below.
14 | LEAN := /datav/newbb/lean
15 | #BINDIR := $(LEAN)/tensorRTIntegrate
16 | BINDIR := workspace
17 |
18 | TENSORRT_NAME := TensorRT-7.0.0.11
19 | #TENSORRT_NAME := TensorRT-6.0.1.8-cuda10.2-cudnn7.6
20 | CFLAGS := -std=c++11 -fPIC -m64 -g -O3 -fopenmp -w -DONNX_ML -DNDEBUG
21 | CUFLAGS := -std=c++11 -m64 -Xcompiler -fPIC -g -O3 -w -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_61,code=sm_61
22 | INC_OPENCV := $(LEAN)/opencv4.2.0/include/opencv4
23 | INC_LOCAL := ./src ./src/builder ./src/common ./src/infer ./src/plugin ./src/plugin/plugins
24 | INC_SYS := /usr/local/protobuf/include
25 | INC_CUDA := $(LEAN)/cuda10.2/include $(LEAN)/$(TENSORRT_NAME)/include $(LEAN)/cudnn7.6.5.32-cuda10.2
26 | INCS := $(INC_SYS) $(INC_OPENCV) $(INC_LOCAL) $(INC_CUDA)
27 | INCS := $(foreach inc, $(INCS), -I$(inc))
28 |
29 | LIB_CUDA := $(LEAN)/cuda10.2/lib $(LEAN)/$(TENSORRT_NAME)/lib $(LEAN)/cudnn7.6.5.32-cuda10.2
30 | LIB_SYS := /usr/local/protobuf/lib
31 | LIB_OPENCV := $(LEAN)/opencv4.2.0/lib
32 | LIBS := $(LIB_SYS) $(LIB_CUDA) $(LIB_OPENCV)
33 | LIBS := $(foreach lib, $(LIBS),-L$(lib))
34 |
35 | # The same library directories are also passed to the linker as -rpath entries.
36 | RPATH := $(LIB_SYS) $(LIB_CUDA) $(LIB_OPENCV)
37 | RPATH := $(foreach lib, $(RPATH),-Wl,-rpath=$(lib))
38 |
39 | LD_OPENCV := opencv_core opencv_highgui opencv_imgproc opencv_video opencv_videoio opencv_imgcodecs
40 | LD_NVINFER := nvinfer nvinfer_plugin nvparsers
41 | LD_CUDA := cuda curand cublas cudart cudnn
42 | LD_SYS := stdc++
43 | LDS := $(LD_SYS) $(LD_OPENCV) $(LD_NVINFER) $(LD_CUDA)
44 | LDS := $(foreach lib, $(LDS), -l$(lib))
45 |
46 | # Discover sources relative to $(SRCDIR) and map each one to $(OBJDIR)/<path>.o.
47 | # An explicit start directory (`find .`) is given so non-GNU find also accepts it.
48 | SRCS := $(shell cd $(SRCDIR) && find . -name "*.cpp")
49 | OBJS := $(patsubst %.cpp,%.o,$(SRCS))
50 | OBJS := $(foreach item,$(OBJS),$(OBJDIR)/$(item))
51 | CUS := $(shell cd $(SRCDIR) && find . -name "*.cu")
52 | CUOBJS := $(patsubst %.cu,%.o,$(CUS))
53 | CUOBJS := $(foreach item,$(CUOBJS),$(OBJDIR)/$(item))
54 | # find prints ./a/b.cpp, so prefixing $(OBJDIR) leaves a "/./" that is removed here.
55 | OBJS := $(subst /./,/,$(OBJS))
56 | CUOBJS := $(subst /./,/,$(CUOBJS))
57 |
58 | # These targets name commands, not files.
59 | .PHONY: all run clean
60 |
61 | all: $(BINDIR)/$(OUTNAME)
62 | 	$(ECHO) Done, now you can run this program with \"make run\" command.
63 |
64 | run: all
65 | 	@cd $(BINDIR) && ./$(OUTNAME)
66 |
67 | $(BINDIR)/$(OUTNAME): $(OBJS) $(CUOBJS)
68 | 	$(ECHO) Linking: $@
69 | 	@$(CC) $(LIBS) -o $@ $^ $(LDS) $(RPATH) -pthread -lprotobuf
70 |
71 | # Static pattern rules: one object per source file. The object's directory is
72 | # created first; `mkdir -p` is a no-op when it already exists.
73 | $(CUOBJS) : $(OBJDIR)/%.o : $(SRCDIR)/%.cu
74 | 	@mkdir -p $(dir $@)
75 | 	$(ECHO) Compiling: $<
76 | 	@$(CUCC) $(CUFLAGS) $(INCS) -c -o $@ $<
77 |
78 | $(OBJS) : $(OBJDIR)/%.o : $(SRCDIR)/%.cpp
79 | 	@mkdir -p $(dir $@)
80 | 	$(ECHO) Compiling: $<
81 | 	@$(CC) $(CFLAGS) $(INCS) -c -o $@ $<
82 |
83 | clean:
84 | 	rm -rf $(OBJDIR) $(BINDIR)/$(OUTNAME)
85 |
--------------------------------------------------------------------------------
/README.cn.md:
--------------------------------------------------------------------------------
1 | # TensorRT
2 |
3 | ---
4 | * 1、支持ONNX的插件开发,并且实现[CenterNet](https://github.com/xingyizhou/CenterNet)的DCNv2插件demo(fp32/fp16)和Inference实现,附有案例
5 | * 2、不建议使用pytorch->caffemodel->tensorRT,改用pytorch->onnx->tensorRT,对于任何特定需求(例如dcn、例如双线性插值),可以用插件实现
6 | * 3、如果不用这里提供的框架,自己实现onnx插件,这里有[一份指导](README.onnx.plugin.md),说明了关键点,可以做参考
7 | * 4、视频讲解点击这里:https://www.bilibili.com/video/BV1Pe411x7qr
8 |
9 |
10 | ## 复现centerNetDCN的检测结果
11 | 
12 |
13 |
14 |
15 | ## 复现centerTrack的结果
16 |
17 |
18 |
19 | 
20 |
21 |
22 |
23 | ## 复现DBFace
24 |
25 | 
26 |
27 |
28 |
29 |
30 | ## 快速使用
31 | * 安装protobuf v3.8.x,点击[README.onnx.plugin.md](README.onnx.plugin.md)有提到怎么装
32 | ```bash
33 | bash scripts/getDLADCN.sh
34 | make run -j32
35 | ```
36 |
37 | ---
38 | ## 案例-Inference
39 | ```
40 | auto engine = TRTInfer::loadEngine("models/efficientnet-b0.fp32.trtmodel");
41 | float mean[3] = {0.485, 0.456, 0.406};
42 | float std[3] = {0.229, 0.224, 0.225};
43 | Mat image = imread("img.jpg");
44 | engine->input()->setNormMat(0, image, mean, std);
45 | engine->forward();
46 | engine->output(0)->print();
47 | ```
48 |
49 | ## 环境-Windows
50 | * tensorRT7.0.0.11 (如果修改为6或者其他版本,可能会面临一点改动)
51 | * opencv3.4.6(可以任意修改为其他版本)
52 | * cudnn7.6.3(可以任意修改为其他版本)
53 | * cuda10.0(可以任意修改为其他版本)
54 | * protobuf v3.8.x
55 | * Visual Studio 2017(可以用其他版本打开,但需要修改对应opencv版本)
56 | * 如果要修改版本,你需要下载cuda/cudnn/tensorRT三者同时匹配的版本,因为他们互相存在依赖,否则只要你是cuda10.0就可以很轻易编译这个项目
57 | * Windows下的[依赖库lean.zip下载](http://zifuture.com:1000/fs/25.shared/lean.zip)
58 | ---
59 |
60 |
61 | ## 环境-Linux
62 | * protobuf v3.8.x
63 | * cuda10.2 (可以任意修改为其他版本)
64 | * cudnn7.6.5.32-cuda10.2 (可以任意修改为其他版本)
65 | * opencv4.2.0 (可以任意修改为其他版本)
66 | * TensorRT-7.0.0.11 (如果修改为6或者其他版本,可能会面临一点改动)
67 | ---
68 |
69 |
70 | ## 说明
71 | * pytorch到onnx(autograd.Function类特殊自定义实现函数导出成插件),参考[plugin_onnx_export.py](plugin_onnx_export.py)
72 | * onnx插件MReLU参考[MReLU.cu](src/onnxplugin/plugins/MReLU.cu),和HSwish参考[HSwish.cu](src/onnxplugin/plugins/HSwish.cu)
73 | * src/caffeplugin底下的插件是实现caffemodel的插件方法,与onnx不兼容,并且不被推荐使用
74 | * int8已经失效,如果需要int8,可以使用[之前的版本并替换为tensorRT6.0](https://github.com/dlunion/tensorRTIntegrate/tree/59e933efc8011bc304d3ccd9fdd1d6cbc7b2e9a0)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 这个版本已经废弃,最新版本,请移步
2 | - https://github.com/shouxieai/tensorRT_cpp
3 | - 新版本支持了最新版的tensorRT、yolov5,替换了新的解析器,模型编译报错更少
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 | # YoloV5 Support
12 | http://zifuture.com:1556/fs/16.std/release_tensorRT_yolov5.zip
13 |
14 |
15 | # TensorRT-Integrate
16 |
17 | 1. Supports PyTorch ONNX plugins (DCN, HSwish, etc.)
18 | 2. Simpler inference and plugin APIs
19 |
20 |
21 |
22 |
23 | ## Re-implement
24 | ##### [CenterNet : ctdet_coco_dla_2x](https://github.com/xingyizhou/CenterNet)
25 |
26 | 
27 |
28 |
29 |
30 | ##### [CenterTrack: coco_tracking](https://github.com/xingyizhou/CenterTrack)
31 |
32 | 
33 |
34 | * [coco_tracking.onnx download](http://zifuture.com:1556/fs/public_models/coco_tracking.onnx)
35 |
36 | * [nuScenes_3Dtracking.onnx download](http://zifuture.com:1556/fs/public_models/nuScenes_3Dtracking.onnx)
37 |
38 |
39 |
40 | ##### [DBFace](https://github.com/dlunion/DBFace)
41 |
42 | 
43 |
44 |
45 |
46 | ## Use TensorRT-Integrate
47 |
48 | Install protobuf == 3.11.4 (versions >= 3.8.x also work, but are more troublesome to set up).
49 |
50 | ```bash
51 | bash scripts/getALL.sh
52 | make run -j32
53 | ```
54 |
55 |
56 |
57 | ## Inference Code
58 |
59 | ```
60 | auto engine = TRTInfer::loadEngine("models/efficientnet-b0.fp32.trtmodel");
61 | float mean[3] = {0.485, 0.456, 0.406};
62 | float std[3] = {0.229, 0.224, 0.225};
63 | Mat image = imread("img.jpg");
64 | auto input = engine->input();
65 |
66 | // multi batch sample
67 | input->resize(2);
68 | input->setNormMatGPU(0, image, mean, std);
69 | input->setNormMatGPU(1, image, mean, std);
70 |
71 | engine->forward();
72 |
73 | // get result and copy to cpu
74 | engine->output(0)->cpu();
75 | engine->tensor("hm")->cpu();
76 | ```
77 |
78 |
79 |
80 | ## Environment
81 |
82 | * tensorRT7.0 or tensorRT6.0
83 | * opencv3.4.6
84 | * cudnn7.6.3
85 | * cuda10.0
86 | * protobuf v3.8.x
87 | * Visual Studio 2017
88 | * [lean-windows.zip (include tensorRT、opencv、cudnn、cuda、protobuf)](http://zifuture.com:1556/fs/25.shared/lean.zip)
89 |
90 |
91 |
92 | ## Plugin
93 |
94 | 1. Pytorch export ONNX: [plugin_onnx_export.py](plugin_onnx_export.py)
95 | 2. [MReLU.cu](src/onnxplugin/plugins/MReLU.cu) 、[HSwish.cu](src/onnxplugin/plugins/HSwish.cu)、[DCNv2.cu](src/onnxplugin/plugins/DCNv2.cu)
96 |
97 |
98 |
--------------------------------------------------------------------------------
/README.onnx.plugin.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/README.onnx.plugin.pdf
--------------------------------------------------------------------------------
/TensorRT.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28307.136
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "TensorRT", "TensorRT.vcxproj", "{FBF775F5-DAB4-4BC1-97A9-D36301073438}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|x64 = Debug|x64
11 | Release|x64 = Release|x64
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {FBF775F5-DAB4-4BC1-97A9-D36301073438}.Debug|x64.ActiveCfg = Debug|x64
15 | {FBF775F5-DAB4-4BC1-97A9-D36301073438}.Debug|x64.Build.0 = Debug|x64
16 | {FBF775F5-DAB4-4BC1-97A9-D36301073438}.Release|x64.ActiveCfg = Release|x64
17 | {FBF775F5-DAB4-4BC1-97A9-D36301073438}.Release|x64.Build.0 = Release|x64
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | GlobalSection(ExtensibilityGlobals) = postSolution
23 | SolutionGuid = {679F35F0-20AA-4D18-8610-D369E2BE97E8}
24 | EndGlobalSection
25 | EndGlobal
26 |
--------------------------------------------------------------------------------
/TensorRT.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | workspace
5 | PATH=$(projectDir)lean/cuda10.0/bin;$(projectDir)lean/opencv3.4.6/lib;$(projectDir)lean/cudnn7.6.3;$(projectDir)lean/TensorRT-7.0.0.11/lib
6 | WindowsLocalDebugger
7 | false
8 |
9 |
10 | workspace
11 | PATH=$(projectDir)lean/cuda10.0/bin;$(projectDir)lean/opencv3.4.6/lib;$(projectDir)lean/cudnn7.6.3;$(projectDir)lean/TensorRT-7.0.0.11/lib
12 | WindowsLocalDebugger
13 | false
14 |
15 |
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | .idea
3 | *.so
4 | *.o
5 | *pyc
6 | _ext
7 | build
8 | DCNv2.egg-info
9 | dist
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, Charles Shang
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/README.md:
--------------------------------------------------------------------------------
1 | ## Deformable Convolutional Networks V2 with Pytorch 1.0
2 |
3 | ### Build
4 | ```bash
5 | ./make.sh # build
6 | python test.py # run examples and gradient check
7 | ```
8 |
9 | ### An Example
10 | - deformable conv
11 | ```python
12 | from dcn_v2 import DCN
13 | input = torch.randn(2, 64, 128, 128).cuda()
14 | # wrap all things (offset and mask) in DCN
15 | dcn = DCN(64, 64, kernel_size=(3,3), stride=1, padding=1, deformable_groups=2).cuda()
16 | output = dcn(input)
17 | print(output.shape)
18 | ```
19 | - deformable roi pooling
20 | ```python
21 | from dcn_v2 import DCNPooling
22 | input = torch.randn(2, 32, 64, 64).cuda()
23 | batch_inds = torch.randint(2, (20, 1)).cuda().float()
24 | x = torch.randint(256, (20, 1)).cuda().float()
25 | y = torch.randint(256, (20, 1)).cuda().float()
26 | w = torch.randint(64, (20, 1)).cuda().float()
27 | h = torch.randint(64, (20, 1)).cuda().float()
28 | rois = torch.cat((batch_inds, x, y, x + w, y + h), dim=1)
29 |
30 | # modulated deformable pooling (V2)
31 | # wrap all things (offset and mask) in DCNPooling
32 | dpooling = DCNPooling(spatial_scale=1.0 / 4,
33 | pooled_size=7,
34 | output_dim=32,
35 | no_trans=False,
36 | group_size=1,
37 | trans_std=0.1).cuda()
38 |
39 | dout = dpooling(input, rois)
40 | ```
41 | ### Note
42 | The master branch now targets PyTorch 1.0 (new ATen API); to switch back to PyTorch 0.4, run:
43 | ```bash
44 | git checkout pytorch_0.4
45 | ```
46 |
47 | ### Known Issues:
48 |
49 | - [x] Gradient check w.r.t offset (solved)
50 | - [ ] Backward is not reentrant (minor)
51 |
52 | This is an adaptation of the official [Deformable-ConvNets](https://github.com/msracver/Deformable-ConvNets/tree/master/DCNv2_op).
53 |
54 | I have run the gradient check many times with the DOUBLE type. Every tensor **except offset** passes.
55 | However, when I set the offset to 0.5, it passes. I'm still wondering what causes this problem. Is it because of some
56 | non-differentiable points?
57 |
58 | Update: all gradient check passes with double precision.
59 |
60 | Another issue is that it raises `RuntimeError: Backward is not reentrant`. However, the error is very small (`<1e-7` for
61 | float and `<1e-15` for double),
62 | so it may not be a serious problem (?)
63 |
64 | Please post an issue or PR if you have any comments.
65 |
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/dcn_onnx/DCNv2/__init__.py
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | python setup.py build develop  # run setup.py's `build` and then `develop` commands
3 |
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import os
4 | import glob
5 |
6 | import torch
7 |
8 | from torch.utils.cpp_extension import CUDA_HOME
9 | from torch.utils.cpp_extension import CppExtension
10 | from torch.utils.cpp_extension import CUDAExtension
11 |
12 | from setuptools import find_packages
13 | from setuptools import setup
14 |
15 | requirements = ["torch", "torchvision"]
16 |
17 | def get_extensions():
18 | this_dir = os.path.dirname(os.path.abspath(__file__))
19 | extensions_dir = os.path.join(this_dir, "src")
20 |
21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
24 |
25 | sources = main_file + source_cpu
26 | extension = CppExtension
27 | extra_compile_args = {"cxx": []}
28 | define_macros = []
29 |
30 | if torch.cuda.is_available() and CUDA_HOME is not None:
31 | extension = CUDAExtension
32 | sources += source_cuda
33 | define_macros += [("WITH_CUDA", None)]
34 | extra_compile_args["nvcc"] = [
35 | "-DCUDA_HAS_FP16=1",
36 | "-D__CUDA_NO_HALF_OPERATORS__",
37 | "-D__CUDA_NO_HALF_CONVERSIONS__",
38 | "-D__CUDA_NO_HALF2_OPERATORS__",
39 | ]
40 | else:
41 | raise NotImplementedError('Cuda is not availabel')
42 |
43 | sources = [os.path.join(extensions_dir, s) for s in sources]
44 | include_dirs = [extensions_dir, "/usr/local/cuda-10.1/include"]
45 | ext_modules = [
46 | extension(
47 | "_ext",
48 | sources,
49 | include_dirs=include_dirs,
50 | define_macros=define_macros,
51 | extra_compile_args=extra_compile_args,
52 | )
53 | ]
54 | return ext_modules
55 |
56 | setup(  # package metadata; the compiled extension list comes from get_extensions()
57 |     name="DCNv2",
58 |     version="0.1",
59 |     author="charlesshang",
60 |     url="https://github.com/charlesshang/DCNv2",
61 |     description="deformable convolutional networks",
62 |     packages=find_packages(exclude=("configs", "tests",)),
63 |     # install_requires=requirements,
64 |     ext_modules=get_extensions(),
65 |     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},  # use PyTorch's BuildExtension as the build_ext command
66 | )
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/cpu/dcn_v2_cpu.cpp:
--------------------------------------------------------------------------------
1 | #include <tuple>
2 | #include <vector>
3 |
4 | #include <ATen/ATen.h>
5 | #include <ATen/cuda/CUDAContext.h>
6 |
7 | // CPU entry points of the DCNv2 extension. No CPU implementation is provided:
8 | // every function in this file unconditionally raises through
9 | // AT_ERROR("Not implement on cpu"), so none of them ever returns a value.
10 |
11 | // DCNv2 forward pass for CPU tensors (stub; always raises).
12 | at::Tensor
13 | dcn_v2_cpu_forward(const at::Tensor &input,
14 |                    const at::Tensor &weight,
15 |                    const at::Tensor &bias,
16 |                    const at::Tensor &offset,
17 |                    const at::Tensor &mask,
18 |                    const int kernel_h,
19 |                    const int kernel_w,
20 |                    const int stride_h,
21 |                    const int stride_w,
22 |                    const int pad_h,
23 |                    const int pad_w,
24 |                    const int dilation_h,
25 |                    const int dilation_w,
26 |                    const int deformable_group)
27 | {
28 |     AT_ERROR("Not implement on cpu");
29 | }
30 |
31 | // DCNv2 backward pass for CPU tensors (stub; always raises).
32 | std::vector<at::Tensor>
33 | dcn_v2_cpu_backward(const at::Tensor &input,
34 |                     const at::Tensor &weight,
35 |                     const at::Tensor &bias,
36 |                     const at::Tensor &offset,
37 |                     const at::Tensor &mask,
38 |                     const at::Tensor &grad_output,
39 |                     int kernel_h, int kernel_w,
40 |                     int stride_h, int stride_w,
41 |                     int pad_h, int pad_w,
42 |                     int dilation_h, int dilation_w,
43 |                     int deformable_group)
44 | {
45 |     AT_ERROR("Not implement on cpu");
46 | }
47 |
48 | // PS-ROI pooling forward pass for CPU tensors (stub; always raises).
49 | std::tuple<at::Tensor, at::Tensor>
50 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
51 |                                  const at::Tensor &bbox,
52 |                                  const at::Tensor &trans,
53 |                                  const int no_trans,
54 |                                  const float spatial_scale,
55 |                                  const int output_dim,
56 |                                  const int group_size,
57 |                                  const int pooled_size,
58 |                                  const int part_size,
59 |                                  const int sample_per_part,
60 |                                  const float trans_std)
61 | {
62 |     AT_ERROR("Not implement on cpu");
63 | }
64 |
65 | // PS-ROI pooling backward pass for CPU tensors (stub; always raises).
66 | std::tuple<at::Tensor, at::Tensor>
67 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
68 |                                   const at::Tensor &input,
69 |                                   const at::Tensor &bbox,
70 |                                   const at::Tensor &trans,
71 |                                   const at::Tensor &top_count,
72 |                                   const int no_trans,
73 |                                   const float spatial_scale,
74 |                                   const int output_dim,
75 |                                   const int group_size,
76 |                                   const int pooled_size,
77 |                                   const int part_size,
78 |                                   const int sample_per_part,
79 |                                   const float trans_std)
80 | {
81 |     AT_ERROR("Not implement on cpu");
82 | }
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/cpu/vision.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | // Declarations of the CPU entry points of the DCNv2 extension.
5 | // The definitions in dcn_v2_cpu.cpp are stubs that raise "Not implement on cpu".
6 |
7 | // DCNv2 forward pass for CPU tensors.
8 | at::Tensor
9 | dcn_v2_cpu_forward(const at::Tensor &input,
10 |                    const at::Tensor &weight,
11 |                    const at::Tensor &bias,
12 |                    const at::Tensor &offset,
13 |                    const at::Tensor &mask,
14 |                    const int kernel_h,
15 |                    const int kernel_w,
16 |                    const int stride_h,
17 |                    const int stride_w,
18 |                    const int pad_h,
19 |                    const int pad_w,
20 |                    const int dilation_h,
21 |                    const int dilation_w,
22 |                    const int deformable_group);
23 |
24 | // DCNv2 backward pass for CPU tensors; takes the forward inputs plus grad_output.
25 | std::vector<at::Tensor>
26 | dcn_v2_cpu_backward(const at::Tensor &input,
27 |                     const at::Tensor &weight,
28 |                     const at::Tensor &bias,
29 |                     const at::Tensor &offset,
30 |                     const at::Tensor &mask,
31 |                     const at::Tensor &grad_output,
32 |                     int kernel_h, int kernel_w,
33 |                     int stride_h, int stride_w,
34 |                     int pad_h, int pad_w,
35 |                     int dilation_h, int dilation_w,
36 |                     int deformable_group);
37 |
38 |
39 | // PS-ROI pooling forward pass for CPU tensors.
40 | std::tuple<at::Tensor, at::Tensor>
41 | dcn_v2_psroi_pooling_cpu_forward(const at::Tensor &input,
42 |                                  const at::Tensor &bbox,
43 |                                  const at::Tensor &trans,
44 |                                  const int no_trans,
45 |                                  const float spatial_scale,
46 |                                  const int output_dim,
47 |                                  const int group_size,
48 |                                  const int pooled_size,
49 |                                  const int part_size,
50 |                                  const int sample_per_part,
51 |                                  const float trans_std);
52 |
53 | // PS-ROI pooling backward pass for CPU tensors; additionally takes out_grad and top_count.
54 | std::tuple<at::Tensor, at::Tensor>
55 | dcn_v2_psroi_pooling_cpu_backward(const at::Tensor &out_grad,
56 |                                   const at::Tensor &input,
57 |                                   const at::Tensor &bbox,
58 |                                   const at::Tensor &trans,
59 |                                   const at::Tensor &top_count,
60 |                                   const int no_trans,
61 |                                   const float spatial_scale,
62 |                                   const int output_dim,
63 |                                   const int group_size,
64 |                                   const int pooled_size,
65 |                                   const int part_size,
66 |                                   const int sample_per_part,
67 |                                   const float trans_std);
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/cuda/dcn_v2_im2col_cuda.h:
--------------------------------------------------------------------------------
1 |
2 | /*!
3 | ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
4 | *
5 | * COPYRIGHT
6 | *
7 | * All contributions by the University of California:
8 | * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
9 | * All rights reserved.
10 | *
11 | * All other contributions:
12 | * Copyright (c) 2014-2017, the respective contributors
13 | * All rights reserved.
14 | *
15 | * Caffe uses a shared copyright model: each contributor holds copyright over
16 | * their contributions to Caffe. The project versioning records all such
17 | * contribution and copyright details. If a contributor wants to further mark
18 | * their specific copyright on a particular contribution, they should indicate
19 | * their copyright solely in the commit message of the change when it is
20 | * committed.
21 | *
22 | * LICENSE
23 | *
24 | * Redistribution and use in source and binary forms, with or without
25 | * modification, are permitted provided that the following conditions are met:
26 | *
27 | * 1. Redistributions of source code must retain the above copyright notice, this
28 | * list of conditions and the following disclaimer.
29 | * 2. Redistributions in binary form must reproduce the above copyright notice,
30 | * this list of conditions and the following disclaimer in the documentation
31 | * and/or other materials provided with the distribution.
32 | *
33 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
34 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
35 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
36 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
37 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
38 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
39 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
40 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
41 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
42 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43 | *
44 | * CONTRIBUTION AGREEMENT
45 | *
46 | * By contributing to the BVLC/caffe repository through pull-request, comment,
47 | * or otherwise, the contributor releases their content to the
48 | * license and copyright terms herein.
49 | *
50 | ***************** END Caffe Copyright Notice and Disclaimer ********************
51 | *
52 | * Copyright (c) 2018 Microsoft
53 | * Licensed under The MIT License [see LICENSE for details]
54 | * \file modulated_deformable_im2col.h
55 | * \brief Function definitions of converting an image to
56 | * column matrix based on kernel, padding, dilation, and offset.
57 | * These functions are mainly used in deformable convolution operators.
58 | * \ref: https://arxiv.org/abs/1811.11168
59 | * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu
60 | */
61 |
62 | /***************** Adapted by Charles Shang *********************/
63 |
64 | #ifndef DCN_V2_IM2COL_CUDA
65 | #define DCN_V2_IM2COL_CUDA
66 |
67 | // NOTE: cudaStream_t is used below but this header includes nothing itself, so
68 | // the including translation unit must already have that type declared.
69 |
70 | #ifdef __cplusplus
71 | extern "C"
72 | {
73 | #endif
74 |
75 |   // Image-to-column conversion for modulated deformable convolution.
76 |   // data_im, data_offset and data_mask are read-only; data_col is the only
77 |   // writable buffer.
78 |   void modulated_deformable_im2col_cuda(cudaStream_t stream,
79 |                                         const float *data_im, const float *data_offset, const float *data_mask,
80 |                                         const int batch_size, const int channels, const int height_im, const int width_im,
81 |                                         const int height_col, const int width_col, const int kernel_h, const int kernel_w,
82 |                                         const int pad_h, const int pad_w, const int stride_h, const int stride_w,
83 |                                         const int dilation_h, const int dilation_w,
84 |                                         const int deformable_group, float *data_col);
85 |
86 |   // Column-to-image counterpart: data_col, data_offset and data_mask are
87 |   // read-only; grad_im is the only writable buffer.
88 |   void modulated_deformable_col2im_cuda(cudaStream_t stream,
89 |                                         const float *data_col, const float *data_offset, const float *data_mask,
90 |                                         const int batch_size, const int channels, const int height_im, const int width_im,
91 |                                         const int height_col, const int width_col, const int kernel_h, const int kernel_w,
92 |                                         const int pad_h, const int pad_w, const int stride_h, const int stride_w,
93 |                                         const int dilation_h, const int dilation_w,
94 |                                         const int deformable_group, float *grad_im);
95 |
96 |   // Variant with two writable buffers, grad_offset and grad_mask; data_col,
97 |   // data_im, data_offset and data_mask are read-only.
98 |   void modulated_deformable_col2im_coord_cuda(cudaStream_t stream,
99 |                                               const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
100 |                                               const int batch_size, const int channels, const int height_im, const int width_im,
101 |                                               const int height_col, const int width_col, const int kernel_h, const int kernel_w,
102 |                                               const int pad_h, const int pad_w, const int stride_h, const int stride_w,
103 |                                               const int dilation_h, const int dilation_w,
104 |                                               const int deformable_group,
105 |                                               float *grad_offset, float *grad_mask);
106 |
107 | #ifdef __cplusplus
108 | }
109 | #endif
110 |
111 | #endif
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/cuda/vision.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <torch/extension.h>
3 |
4 | // Declarations of the CUDA entry points of the DCNv2 extension.
5 | // dcn_v2.h includes this header only when WITH_CUDA is defined and forwards
6 | // CUDA tensors to these functions.
7 |
8 | // DCNv2 forward pass for CUDA tensors.
9 | at::Tensor
10 | dcn_v2_cuda_forward(const at::Tensor &input,
11 |                     const at::Tensor &weight,
12 |                     const at::Tensor &bias,
13 |                     const at::Tensor &offset,
14 |                     const at::Tensor &mask,
15 |                     const int kernel_h,
16 |                     const int kernel_w,
17 |                     const int stride_h,
18 |                     const int stride_w,
19 |                     const int pad_h,
20 |                     const int pad_w,
21 |                     const int dilation_h,
22 |                     const int dilation_w,
23 |                     const int deformable_group);
24 |
25 | // DCNv2 backward pass for CUDA tensors; takes the forward inputs plus grad_output.
26 | std::vector<at::Tensor>
27 | dcn_v2_cuda_backward(const at::Tensor &input,
28 |                      const at::Tensor &weight,
29 |                      const at::Tensor &bias,
30 |                      const at::Tensor &offset,
31 |                      const at::Tensor &mask,
32 |                      const at::Tensor &grad_output,
33 |                      int kernel_h, int kernel_w,
34 |                      int stride_h, int stride_w,
35 |                      int pad_h, int pad_w,
36 |                      int dilation_h, int dilation_w,
37 |                      int deformable_group);
38 |
39 |
40 | // PS-ROI pooling forward pass for CUDA tensors.
41 | std::tuple<at::Tensor, at::Tensor>
42 | dcn_v2_psroi_pooling_cuda_forward(const at::Tensor &input,
43 |                                   const at::Tensor &bbox,
44 |                                   const at::Tensor &trans,
45 |                                   const int no_trans,
46 |                                   const float spatial_scale,
47 |                                   const int output_dim,
48 |                                   const int group_size,
49 |                                   const int pooled_size,
50 |                                   const int part_size,
51 |                                   const int sample_per_part,
52 |                                   const float trans_std);
53 |
54 | // PS-ROI pooling backward pass for CUDA tensors; additionally takes out_grad and top_count.
55 | std::tuple<at::Tensor, at::Tensor>
56 | dcn_v2_psroi_pooling_cuda_backward(const at::Tensor &out_grad,
57 |                                    const at::Tensor &input,
58 |                                    const at::Tensor &bbox,
59 |                                    const at::Tensor &trans,
60 |                                    const at::Tensor &top_count,
61 |                                    const int no_trans,
62 |                                    const float spatial_scale,
63 |                                    const int output_dim,
64 |                                    const int group_size,
65 |                                    const int pooled_size,
66 |                                    const int part_size,
67 |                                    const int sample_per_part,
68 |                                    const float trans_std);
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/dcn_v2.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "cpu/vision.h"
4 |
5 | #ifdef WITH_CUDA
6 | #include "cuda/vision.h"
7 | #endif
8 |
9 | // Forward pass of DCNv2, dispatched on the device of `input`.
10 | // CUDA tensors go to dcn_v2_cuda_forward (needs a WITH_CUDA build); any other
11 | // tensor raises an error because no CPU path is provided.
12 | at::Tensor
13 | dcn_v2_forward(const at::Tensor &input,
14 |                const at::Tensor &weight,
15 |                const at::Tensor &bias,
16 |                const at::Tensor &offset,
17 |                const at::Tensor &mask,
18 |                const int kernel_h, const int kernel_w,
19 |                const int stride_h, const int stride_w,
20 |                const int pad_h, const int pad_w,
21 |                const int dilation_h, const int dilation_w,
22 |                const int deformable_group)
23 | {
24 |     // Guard clause: reject everything that is not a CUDA tensor up front.
25 |     if (!input.type().is_cuda())
26 |     {
27 |         AT_ERROR("Not implemented on the CPU");
28 |     }
29 | #ifdef WITH_CUDA
30 |     return dcn_v2_cuda_forward(input, weight, bias, offset, mask,
31 |                                kernel_h, kernel_w,
32 |                                stride_h, stride_w,
33 |                                pad_h, pad_w,
34 |                                dilation_h, dilation_w,
35 |                                deformable_group);
36 | #else
37 |     AT_ERROR("Not compiled with GPU support");
38 | #endif
39 | }
40 |
41 | std::vector
42 | dcn_v2_backward(const at::Tensor &input,
43 | const at::Tensor &weight,
44 | const at::Tensor &bias,
45 | const at::Tensor &offset,
46 | const at::Tensor &mask,
47 | const at::Tensor &grad_output,
48 | int kernel_h, int kernel_w,
49 | int stride_h, int stride_w,
50 | int pad_h, int pad_w,
51 | int dilation_h, int dilation_w,
52 | int deformable_group)
53 | {
54 | if (input.type().is_cuda())
55 | {
56 | #ifdef WITH_CUDA
57 | return dcn_v2_cuda_backward(input,
58 | weight,
59 | bias,
60 | offset,
61 | mask,
62 | grad_output,
63 | kernel_h, kernel_w,
64 | stride_h, stride_w,
65 | pad_h, pad_w,
66 | dilation_h, dilation_w,
67 | deformable_group);
68 | #else
69 | AT_ERROR("Not compiled with GPU support");
70 | #endif
71 | }
72 | AT_ERROR("Not implemented on the CPU");
73 | }
74 |
75 | std::tuple
76 | dcn_v2_psroi_pooling_forward(const at::Tensor &input,
77 | const at::Tensor &bbox,
78 | const at::Tensor &trans,
79 | const int no_trans,
80 | const float spatial_scale,
81 | const int output_dim,
82 | const int group_size,
83 | const int pooled_size,
84 | const int part_size,
85 | const int sample_per_part,
86 | const float trans_std)
87 | {
88 | if (input.type().is_cuda())
89 | {
90 | #ifdef WITH_CUDA
91 | return dcn_v2_psroi_pooling_cuda_forward(input,
92 | bbox,
93 | trans,
94 | no_trans,
95 | spatial_scale,
96 | output_dim,
97 | group_size,
98 | pooled_size,
99 | part_size,
100 | sample_per_part,
101 | trans_std);
102 | #else
103 | AT_ERROR("Not compiled with GPU support");
104 | #endif
105 | }
106 | AT_ERROR("Not implemented on the CPU");
107 | }
108 |
109 | std::tuple
110 | dcn_v2_psroi_pooling_backward(const at::Tensor &out_grad,
111 | const at::Tensor &input,
112 | const at::Tensor &bbox,
113 | const at::Tensor &trans,
114 | const at::Tensor &top_count,
115 | const int no_trans,
116 | const float spatial_scale,
117 | const int output_dim,
118 | const int group_size,
119 | const int pooled_size,
120 | const int part_size,
121 | const int sample_per_part,
122 | const float trans_std)
123 | {
124 | if (input.type().is_cuda())
125 | {
126 | #ifdef WITH_CUDA
127 | return dcn_v2_psroi_pooling_cuda_backward(out_grad,
128 | input,
129 | bbox,
130 | trans,
131 | top_count,
132 | no_trans,
133 | spatial_scale,
134 | output_dim,
135 | group_size,
136 | pooled_size,
137 | part_size,
138 | sample_per_part,
139 | trans_std);
140 | #else
141 | AT_ERROR("Not compiled with GPU support");
142 | #endif
143 | }
144 | AT_ERROR("Not implemented on the CPU");
145 | }
--------------------------------------------------------------------------------
/dcn_onnx/DCNv2/src/vision.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dcn_v2.h"
3 |
4 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
5 | m.def("dcn_v2_forward", &dcn_v2_forward, "dcn_v2_forward");
6 | m.def("dcn_v2_backward", &dcn_v2_backward, "dcn_v2_backward");
7 | m.def("dcn_v2_psroi_pooling_forward", &dcn_v2_psroi_pooling_forward, "dcn_v2_psroi_pooling_forward");
8 | m.def("dcn_v2_psroi_pooling_backward", &dcn_v2_psroi_pooling_backward, "dcn_v2_psroi_pooling_backward");
9 | }
10 |
--------------------------------------------------------------------------------
/dcn_onnx/README.md:
--------------------------------------------------------------------------------
1 | ## 说明
2 | 1. 下载官方的代码
3 | 2. 下载模型
4 | 3. 替换这里的几个py(请着重看一下这几个py里面,用==================修改的部分===================做的标记,没有修改太多,可以自己尝试改下)
5 | 4. 执行dladcn_export_onnx.py,导出onnx即可
--------------------------------------------------------------------------------
/dcn_onnx/dladcn_export_onnx.py:
--------------------------------------------------------------------------------
1 |
2 | # 请下载官方的代码,然后执行这个就可以生成了
3 | import numpy as np
4 | import torch
5 | import torch.onnx.utils as onnx
6 | import models.networks.pose_dla_dcn as net
7 | from collections import OrderedDict
8 | import cv2
9 |
# Build the pose network with 34 layers and the hm / wh / reg heads.
model = net.get_pose_net(num_layers=34, heads={'hm': 80, 'wh': 2, 'reg': 2})

# Checkpoint downloaded from https://github.com/xingyizhou/CenterNet/blob/master/readme/MODEL_ZOO.md
# If that is unreachable, try this mirror: http://zifuture.com:1000/fs/public_models/ctdet_coco_dla_2x.pth
# NOTE(security): torch.load unpickles the file and can therefore execute
# arbitrary code; only load checkpoints obtained from a source you trust.
checkpoint = torch.load(r"ctdet_coco_dla_2x.pth", map_location="cpu")
checkpoint = checkpoint["state_dict"]

# Strip the "module." prefix from parameter names (presumably added by a
# DataParallel wrapper when the checkpoint was saved -- confirm). Only a
# *leading* prefix is removed, so a key that merely contains "module."
# further in is left untouched.
prefix = "module."
change = OrderedDict(
    (key[len(prefix):] if key.startswith(prefix) else key, op)
    for key, op in checkpoint.items()
)

model.load_state_dict(change)
model.eval()
model.cuda()

# Dummy input used to trace the model; named so it does not shadow the
# builtin input().
dummy_input = torch.zeros((1, 3, 32, 32)).cuda()

# An already exported model is available at: http://zifuture.com:1000/fs/public_models/dladcnv2.onnx
onnx.export(model, (dummy_input,), "dladcnv2.onnx", output_names=["hm", "wh", "reg", "hm_pool"], verbose=True)
28 |
--------------------------------------------------------------------------------
/lean/.gitignore:
--------------------------------------------------------------------------------
1 | /tensorRT6.0.1.5
2 | /cuda10.0
3 | /cudnn7.6.3
4 | /opencv3.4.6
5 | /TensorRT-7.0.0.11
6 | /protobuf3.11.4
--------------------------------------------------------------------------------
/lean/README.md:
--------------------------------------------------------------------------------
1 | # Lean
2 |
3 | 编译好的protobuf Windows版本:http://zifuture.com:1000/fs/25.shared/lean.zip
--------------------------------------------------------------------------------
/plugin_onnx_export.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn.functional as F
3 | import torch.nn as nn
4 | import json
5 |
# A built-in plugin
class HSwishImplementation(torch.autograd.Function):
    """Autograd function computing ``i * relu6(i + 3) / 6 + bias``.

    When exported to ONNX it is emitted as a single ``HSwish`` node that takes
    the input and the bias and carries one string, one int and one float
    attribute. No ``backward`` is defined.
    """

    @staticmethod
    def forward(ctx, i, bias):
        # The input is stashed on the context exactly as in a trainable op,
        # although nothing in this class reads it back.
        ctx.save_for_backward(i)
        gate = F.relu6(i + 3)
        return i * gate / 6 + bias

    @staticmethod
    def symbolic(g, input, bias):
        # The _s / _i / _f suffixes select the ONNX attribute type.
        attributes = dict(info_s="string attribute", kernel_size_i=3, eps_f=3e-2)
        return g.op("HSwish", input, bias, **attributes)
17 |
class MemoryEfficientHSwish(nn.Module):
    """Module wrapper around ``HSwishImplementation`` with a learnable bias.

    Args:
        shape: size passed to ``torch.zeros`` to allocate the bias; every
            element of the bias starts at 3.15.
    """

    def __init__(self, shape):
        super(MemoryEfficientHSwish, self).__init__()
        initial_bias = torch.zeros(shape)
        initial_bias.fill_(3.15)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Apply the custom function to ``x`` using this module's bias."""
        bias = self.bias
        return HSwishImplementation.apply(x, bias)
26 |
27 |
# A plugin implemented through this framework
class MReLUImplementation(torch.autograd.Function):
    """Autograd function computing ``relu(i) + bias``.

    When exported to ONNX it is emitted as a ``Plugin`` node whose ``name``
    attribute is ``MReLU`` and whose ``info`` attribute is a JSON string with
    the plugin's settings. No ``backward`` is defined.
    """

    @staticmethod
    def forward(ctx, i, bias):
        # The input is stashed on the context, although nothing in this class
        # reads it back.
        ctx.save_for_backward(i)
        rectified = F.relu(i)
        return rectified + bias

    @staticmethod
    def symbolic(g, input, bias):
        plugin_info = {
            "kernel_size": 3,
            "eps": 3e-2,
            "other": "Hello Onnx Plugin"
        }
        serialized_info = json.dumps(plugin_info)
        return g.op("Plugin", input, bias, name_s="MReLU", info_s=serialized_info)
43 |
class MReLU(nn.Module):
    """Module wrapper around ``MReLUImplementation`` with a learnable bias.

    Args:
        *shape: dimensions of the bias tensor; every element starts at 0.5.
    """

    def __init__(self, *shape):
        super(MReLU, self).__init__()
        initial_bias = torch.zeros(shape)
        initial_bias.fill_(0.5)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Apply the custom function to ``x`` using this module's bias."""
        bias = self.bias
        return MReLUImplementation.apply(x, bias)
52 |
class FooModel(torch.nn.Module):
    """Two-input demo model combining both custom-op modules.

    The output is ``mrelu(input2) + hswish(input1)``.
    """

    def __init__(self):
        super(FooModel, self).__init__()
        # Registration order (hswish, then mrelu) is kept as-is because it
        # determines the order of the module's parameters.
        self.hswish = MemoryEfficientHSwish(1)
        self.mrelu = MReLU(1)

    def forward(self, input1, input2):
        # mrelu runs before hswish, matching the evaluation order of the
        # single-expression form.
        mrelu_out = self.mrelu(input2)
        hswish_out = self.hswish(input1)
        return mrelu_out + hswish_out
61 |
# Example inputs for the demo model: the first is filled with 0.25, the
# second with zeros.
dummy_input1 = torch.full((1, 3, 3, 3), 0.25)
dummy_input2 = torch.zeros((1, 3, 3, 3))
model = FooModel()

# Run the model once in PyTorch and show the result.
out = model(dummy_input1, dummy_input2)
print(out)

# Export the model, custom ops included, to ONNX.
export_inputs = (dummy_input1, dummy_input2)
torch.onnx.export(model, export_inputs, 'workspace/models/demo.onnx', verbose=True)
--------------------------------------------------------------------------------
/scripts/getALL.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download every demo model by running the per-model fetch scripts in turn.
# Run from the repository root: the script paths below are relative.
#
# Every script is attempted even if an earlier one fails; the exit status is
# non-zero when at least one of them failed, so a failed download is no
# longer hidden by a later successful one.
status=0
for script in scripts/getCenterTrack.sh scripts/getDBFace.sh scripts/getDLADCN.sh; do
    if ! bash "$script"; then
        echo "getALL.sh: $script failed" >&2
        status=1
    fi
done
exit "$status"
--------------------------------------------------------------------------------
/scripts/getCenterTrack.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download coco_tracking.onnx into workspace/models.
# Run from the repository root: the output path is relative.
set -e

# wget -O does not create missing parent directories, so make sure the
# target directory exists before downloading.
mkdir -p workspace/models
wget http://zifuture.com:1000/fs/public_models/coco_tracking.onnx -O workspace/models/coco_tracking.onnx
--------------------------------------------------------------------------------
/scripts/getDBFace.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download dbface.onnx into workspace/models.
# Run from the repository root: the output path is relative.
set -e

# wget -O does not create missing parent directories, so make sure the
# target directory exists before downloading.
mkdir -p workspace/models
wget http://zifuture.com:1000/fs/public_models/dbface.onnx -O workspace/models/dbface.onnx
--------------------------------------------------------------------------------
/scripts/getDLADCN.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Download dladcnv2.onnx into workspace/models.
# Run from the repository root: the output path is relative.
set -e

# wget -O does not create missing parent directories, so make sure the
# target directory exists before downloading.
mkdir -p workspace/models
wget http://zifuture.com:1000/fs/public_models/dladcnv2.onnx -O workspace/models/dladcnv2.onnx
3 |
--------------------------------------------------------------------------------
/src/builder/trt_builder.hpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #ifndef TRT_BUILDER_HPP
4 | #define TRT_BUILDER_HPP
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | namespace TRTBuilder {
12 |
13 | typedef std::function Int8Process;
14 |
15 | void setDevice(int device_id);
16 |
17 | enum ModelSourceType {
18 | ModelSourceType_FromCaffe,
19 | ModelSourceType_FromONNX
20 | };
21 |
22 | class ModelSource {
23 | public:
24 | ModelSource(const std::string& prototxt, const std::string& caffemodel);
25 | ModelSource(const std::string& onnxmodel);
26 | ModelSourceType type() const;
27 | std::string prototxt() const;
28 | std::string caffemodel() const;
29 | std::string onnxmodel() const;
30 |
31 | private:
32 | std::string prototxt_, caffemodel_;
33 | std::string onnxmodel_;
34 | ModelSourceType type_;
35 | };
36 |
37 | class InputDims {
38 | public:
39 | InputDims(int channels, int height, int width);
40 |
41 | int channels() const;
42 | int height() const;
43 | int width() const;
44 |
45 | private:
46 | int channels_, height_, width_;
47 | };
48 |
49 | enum TRTMode {
50 | TRTMode_FP32,
51 | TRTMode_FP16
52 | };
53 |
54 | const char* modeString(TRTMode type);
55 |
56 | bool compileTRT(
57 | TRTMode mode,
58 | const std::vector& outputs,
59 | unsigned int batchSize,
60 | const ModelSource& source,
61 | const std::string& savepath,
62 | const std::vector inputsDimsSetup = {});
63 | };
64 |
65 | #endif //TRT_BUILDER_HPP
--------------------------------------------------------------------------------
/src/caffeplugin/caffeplugin.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef PLUGIN_BASE_HPP
3 | #define PLUGIN_BASE_HPP
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 |
16 | namespace Plugin {
17 |
18 | enum Phase {
19 | CompilePhase,
20 | InferencePhase
21 | };
22 |
23 | struct GTensor {
24 | GTensor() {}
25 | GTensor(const TRTInfer::Tensor& tensor);
26 | GTensor(float* ptr, int n, int c, int h, int w);
27 | GTensor(TRTInfer::halfloat* ptr, int n, int c, int h, int w);
28 | int count(int start_axis = 0) const;
29 | inline int offset(int n = 0, int c = 0, int h = 0, int w = 0) const { return ((n * this->channel_ + c) * this->height_ + h) * this->width_ + w; }
30 |
31 | template
32 | inline _T* ptr() const { return (_T*)ptr_; }
33 |
34 | template
35 | inline _T* ptr(int n, int c = 0, int h = 0, int w = 0) const { return (_T*)ptr_ + offset(n, c, h, w); }
36 |
37 | inline float* ptr_float() const { return (float*)ptr_; }
38 | inline float* ptr_float(int n, int c = 0, int h = 0, int w = 0) const { return (float*)ptr_ + offset(n, c, h, w); }
39 | inline TRTInfer::halfloat* ptr_half() const { return (TRTInfer::halfloat*)ptr_; }
40 | inline TRTInfer::halfloat* ptr_half(int n, int c = 0, int h = 0, int w = 0) const { return (TRTInfer::halfloat*)ptr_ + offset(n, c, h, w); }
41 |
42 | int num_ = 0, channel_ = 0, height_ = 0, width_ = 0;
43 | void* ptr_ = nullptr;
44 | TRTInfer::DataType dtType_ = TRTInfer::DataType::dtFloat;
45 | };
46 |
47 | struct LayerConfig {
48 |
49 | ///////////////////////////////////
50 | int nbOutput_ = 1;
51 | size_t workspaceSize_ = 0;
52 | std::set supportDataType_;
53 | std::set supportPluginFormat_;
54 |
55 | std::vector> weights_;
56 | TRTInfer::DataType configDataType_;
57 | nvinfer1::PluginFormat configPluginFormat_;
58 | int configMaxbatchSize_ = 0;
59 |
60 | ///////////////////////////////////
61 | std::vector input;
62 | std::vector output;
63 | std::string serializeData_;
64 |
65 | LayerConfig();
66 | void serialCopyTo(void* buffer);
67 | int serialize();
68 | void deserialize(const void* ptr, size_t length);
69 | void loadWeights(const nvinfer1::Weights* weights, int nbWeights);
70 | virtual void seril(ccutil::BinIO& out) {}
71 | virtual void deseril(ccutil::BinIO& in) {}
72 | };
73 |
74 | class TRTPlugin : public nvinfer1::IPluginExt {
75 | public:
76 |
77 | virtual ~TRTPlugin();
78 | virtual nvinfer1::Dims outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) = 0;
79 | virtual int enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) = 0;
80 |
81 | void pluginInit(const std::string& name, const nvinfer1::Weights* weights, int nbWeights);
82 | void pluginInit(const std::string& name, const void* serialData, size_t serialLength);
83 |
84 | virtual std::shared_ptr config(const std::string& layerName);
85 | virtual bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const;
86 | virtual void configureWithFormat(
87 | const nvinfer1::Dims* inputDims, int nbInputs, const nvinfer1::Dims* outputDims,
88 | int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize);
89 | virtual int getNbOutputs() const;
90 | virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims);
91 | virtual int initialize();
92 | virtual void terminate();
93 | virtual size_t getWorkspaceSize(int maxBatchSize) const override;
94 | virtual int enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream);
95 | virtual size_t getSerializationSize();
96 | virtual void serialize(void* buffer);
97 |
98 | private:
99 | void mappingToGTensor();
100 |
101 | protected:
102 | std::string layerName_;
103 | Phase phase_ = CompilePhase;
104 | std::shared_ptr config_;
105 | std::vector inputTensors_;
106 | std::vector outputTensors_;
107 | std::vector weightTensors_;
108 | };
109 |
110 | #define SETUP_PLUGIN(class_, pattern_) \
111 | static std::string pattern() { \
112 | return pattern_; \
113 | } \
114 | \
115 | static std::shared_ptr creator() { \
116 | return std::shared_ptr(new class_()); \
117 | }
118 |
119 | #define RegisterPlugin(class_) \
120 | static Plugin::PluginRegister __register##class_(class_::creator, class_::pattern())
121 |
122 | typedef std::shared_ptr(*PluginCreater)();
123 |
124 | struct PluginInfo {
125 | PluginCreater creater;
126 | std::string pattern;
127 | };
128 |
129 | class PluginRegistry {
130 | public:
131 | virtual void addPlugin(PluginCreater creater, const std::string& pattern) = 0;
132 | virtual PluginInfo* findPlugin(const std::string& layerName) = 0;
133 | };
134 |
135 | class PluginRegister {
136 | public:
137 | PluginRegister(PluginCreater creater, const std::string& pattern);
138 | };
139 |
140 | class TRTBuilderPluginFactory : public nvcaffeparser1::IPluginFactoryExt, public nvinfer1::IPluginFactory {
141 |
142 | public:
143 | virtual bool isPluginExt(const char* layerName) override;
144 | virtual bool isPlugin(const char* layerName) override;
145 | virtual nvinfer1::IPluginExt* createPlugin(const char* layerName, const nvinfer1::Weights* weights, int nbWeights) override;
146 | virtual nvinfer1::IPluginExt* createPlugin(const char* layerName, const void* serialData, size_t serialLength) override;
147 |
148 | virtual bool support(const std::string& layerName);
149 |
150 | virtual std::shared_ptr createPlugin(const std::string& layerName);
151 | virtual nvinfer1::IPluginExt* builderCreate(const std::string& layerName, const nvinfer1::Weights* weights, int nbWeights);
152 | virtual nvinfer1::IPluginExt* inferCreate(const std::string& layerName, const void* serialData, size_t serialLength);
153 |
154 | private:
155 | std::vector> plugins_;
156 | };
157 |
158 | ///////////////////////////////////////////////////////////////////////////////////////////
159 | std::shared_ptr createPluginFactoryForBuildPhase();
160 | std::shared_ptr createPluginFactoryForInferPhase();
161 | PluginRegistry* getPluginRegistry();
162 |
163 | #define ExecuteKernel(numJobs, kernel, stream) kernel<<>>
164 | }; //namespace Plugin
165 |
166 | #endif //PLUGIN_BASE_HPP
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/ChannelMultiplicationLayer.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "ChannelMultiplicationLayer.hpp"
4 | #include
5 |
6 | using namespace Plugin;
7 |
// Element-wise scaling by a per-plane factor.
//
// One thread handles one element. The element at `position` belongs to plane
// `position / input_area`, and is multiplied by that plane's entry in `muld`.
//
//   in         : source elements
//   muld       : one multiplier per plane of `input_area` elements
//   out        : destination, same layout as `in`
//   input_area : number of elements per plane (the caller passes height * width)
//   edge       : total element count; threads at or past it return immediately
template<typename _T>
__global__ void channelMultiplicationKernel(const _T* in, const _T* muld, _T* out, int input_area, int edge)
{
    KERNEL_POSITION;

    int c = position / input_area;
    out[position] = in[position] * muld[c];
}
16 |
17 | namespace Plugin {
18 |
19 | std::shared_ptr ChannelMultiplicationLayer::config(const std::string& layerName) {
20 | auto cfg = TRTPlugin::config(layerName);
21 | cfg->supportDataType_ = {nvinfer1::DataType::kHALF, nvinfer1::DataType::kFLOAT};
22 | //cfg->supportDataType_ = {nvinfer1::DataType::kHALF};
23 | return cfg;
24 | }
25 |
26 | int ChannelMultiplicationLayer::enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) {
27 | auto& data = inputs[0];
28 | auto& mul = inputs[1];
29 | auto& out = outputs[0];
30 | int edge = data.count();
31 |
32 | if (config_->configDataType_ == TRTInfer::DataType::dtFloat) {
33 | channelMultiplicationKernel <<>> (data.ptr_float(), mul.ptr_float(), out.ptr_float(), data.height_ * data.width_, edge);
34 | }
35 | else if (config_->configDataType_ == TRTInfer::DataType::dtHalfloat) {
36 | channelMultiplicationKernel <<>> (
37 | (const TRTInfer::halfloat*)data.ptr_half(), (const TRTInfer::halfloat*)mul.ptr_half(), out.ptr_half(), data.height_ * data.width_, edge
38 | );
39 | }
40 | return 0;
41 | }
42 |
43 | nvinfer1::Dims ChannelMultiplicationLayer::outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) {
44 | return inputDims[0];
45 | }
46 | }
47 |
48 | RegisterPlugin(ChannelMultiplicationLayer);
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/ChannelMultiplicationLayer.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/src/caffeplugin/plugins/ChannelMultiplicationLayer.hpp
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/ClipLayer.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "ClipLayer.hpp"
4 | #include
5 |
6 | using namespace Plugin;
7 |
// Copies the top-left outh x outw window of every input plane to the output.
//
// One thread handles one output row. `position` enumerates output rows across
// all planes: `position % outh` is the row inside its plane and
// `position / outh` is the plane index. The thread copies the first `outw`
// elements of the matching input row.
//
//   in, out    : source and destination buffers
//   inw, inh   : width and height of an input plane
//   outw, outh : width and height of an output plane
//   edge       : total number of output rows; threads at or past it return
template<typename _T>
__global__ void clipKernel(const _T* in, _T* out, int inw, int inh, int outw, int outh, int edge)
{
    KERNEL_POSITION;

    int outHeightIndex = position;
    int selectOutInnerY = outHeightIndex % outh;
    int nc = outHeightIndex / outh;
    // Same plane and same row index on the input side.
    int inHeightIndex = nc * inh + selectOutInnerY;
    in += inHeightIndex * inw;
    out += outHeightIndex * outw;
    for (int i = 0; i < outw; ++i)
        *out++ = *in++;
}
22 |
23 | namespace Plugin {
24 |
25 | std::shared_ptr ClipLayer::config(const std::string& layerName) {
26 | auto cfg = TRTPlugin::config(layerName);
27 | cfg->supportDataType_ = {nvinfer1::DataType::kHALF, nvinfer1::DataType::kFLOAT};
28 | //cfg->supportDataType_ = {nvinfer1::DataType::kHALF};
29 | return cfg;
30 | }
31 |
32 | int ClipLayer::enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) {
33 | auto& data = inputs[0];
34 | auto& out = outputs[0];
35 |
36 | int edge = out.num_ * out.channel_ * out.height_;
37 | if (config_->configDataType_ == TRTInfer::DataType::dtFloat) {
38 | clipKernel <<>> (data.ptr_float(), out.ptr_float(), data.width_, data.height_, out.width_, out.height_, edge);
39 | }
40 | else if(config_->configDataType_ == TRTInfer::DataType::dtHalfloat) {
41 | clipKernel <<>> (data.ptr_half(), out.ptr_half(), data.width_, data.height_, out.width_, out.height_, edge);
42 | }
43 | return 0;
44 | }
45 |
46 | nvinfer1::Dims ClipLayer::outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) {
47 | return nvinfer1::Dims3(inputDims[0].d[0], inputDims[0].d[1] - 1, inputDims[0].d[2] - 1);
48 | }
49 | }
50 |
51 | RegisterPlugin(ClipLayer);
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/ClipLayer.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/src/caffeplugin/plugins/ClipLayer.hpp
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/DCNLayer.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef DCNLayer_HPP
3 | #define DCNLayer_HPP
4 |
5 | #include
6 | #include
7 |
8 | namespace Plugin {
9 | class DCNLayer : public TRTPlugin {
10 | public:
11 | SETUP_PLUGIN(DCNLayer, "DCN*");
12 |
13 | DCNLayer();
14 | virtual ~DCNLayer();
15 |
16 | virtual std::shared_ptr config(const std::string& layerName) override;
17 |
18 | virtual nvinfer1::Dims outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims);
19 | virtual size_t getWorkspaceSize(int maxBatchSize) const override;
20 | virtual int enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) override;
21 | private:
22 | cublasHandle_t cublasHandle_ = nullptr;
23 | };
24 | }
25 |
26 | #endif //DCNLayer_HPP
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/PlexShuffleLayer.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "PlexShuffleLayer.hpp"
4 | #include
5 |
6 | using namespace Plugin;
7 |
// Rearranges groups of four input planes into one output plane with twice
// the row and column resolution.
//
// One thread handles one input element. For input plane p and pixel
// (row, col), the element is written to output plane p / 4 at
// (row * 2 + (p % 4) / 2, col * 2 + (p % 4) % 2).
//
//   bottom_data : source elements
//   top_data    : destination elements
//   input_c     : not used by the kernel
//   input_h     : not used by the kernel
//   input_w     : width of an input plane
//   input_area  : elements per input plane
//   output_h    : not used by the kernel
//   output_w    : width of an output plane
//   output_area : elements per output plane
//   edge        : total input element count; threads at or past it return
template<typename _T>
__global__ void pixelShuffleKernel(
    const _T* bottom_data, _T* top_data, int input_c, int input_h,
    int input_w, int input_area, int output_h, int output_w, int output_area, int edge)
{
    KERNEL_POSITION;

    int input_c_index = position / input_area;
    int f_input_index = position % input_area;
    int input_row = f_input_index / input_w;
    int input_col = f_input_index % input_w;
    int output_c_index = input_c_index / 4;
    // From here on input_c_index is the position (0..3) inside its group of four.
    input_c_index = input_c_index % 4;
    int output_row = input_row * 2 + input_c_index / 2;
    int output_col = input_col * 2 + input_c_index % 2;
    int output_index = output_c_index * output_area + output_row * output_w + output_col;
    top_data[output_index] = bottom_data[position];
}
26 |
27 | namespace Plugin {
28 |
29 | std::shared_ptr PlexShuffleLayer::config(const std::string& layerName){
30 | auto cfg = TRTPlugin::config(layerName);
31 | cfg->supportDataType_ = {nvinfer1::DataType::kHALF, nvinfer1::DataType::kFLOAT};
32 | //cfg->supportDataType_ = {nvinfer1::DataType::kHALF};
33 | return cfg;
34 | }
35 |
36 | int PlexShuffleLayer::enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) {
37 | auto& data = inputs[0];
38 | auto& out = outputs[0];
39 |
40 | int edge = data.count();
41 | if (config_->configDataType_ == TRTInfer::DataType::dtFloat) {
42 | pixelShuffleKernel <<>> (data.ptr_float(), out.ptr_float(), data.channel_, data.height_, data.width_,
43 | data.height_ * data.width_, out.height_, out.width_, out.height_ * out.width_, edge);
44 | }
45 | else {
46 | pixelShuffleKernel <<>> (data.ptr_half(), out.ptr_half(), data.channel_, data.height_, data.width_,
47 | data.height_ * data.width_, out.height_, out.width_, out.height_ * out.width_, edge);
48 | }
49 | return 0;
50 | }
51 |
52 | nvinfer1::Dims PlexShuffleLayer::outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) {
53 | return nvinfer1::Dims3(inputDims[0].d[0] / 4, inputDims[0].d[1] * 2, inputDims[0].d[2] * 2);
54 | }
55 | }
56 |
57 | RegisterPlugin(PlexShuffleLayer);
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/PlexShuffleLayer.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/src/caffeplugin/plugins/PlexShuffleLayer.hpp
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/TestPlugin.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "TestPlugin.hpp"
4 |
5 | typedef TRTInfer::halfloat halfloat;
6 |
7 | template
8 | __global__ void MyPluginKenel(_T* input, _T* output, int edge);
9 |
10 | template<>
11 | __global__ void MyPluginKenel(float* input, float* output, int edge) {
12 |
13 | KERNEL_POSITION;
14 | output[position] = (input[position] < 0 ? 0 : input[position]) + 1.3f;
15 | }
16 |
17 | template<>
18 | __global__ void MyPluginKenel(halfloat* input, halfloat* output, int edge) {
19 |
20 | KERNEL_POSITION;
21 |
22 | halfloat zero = 0.0f;
23 | halfloat add = 1.3f;
24 | output[position] = (input[position] < zero ? zero : input[position]) + add;
25 | }
26 |
27 | nvinfer1::Dims TestPlugin::outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) {
28 | return inputDims[0];
29 | }
30 |
31 | std::shared_ptr TestPlugin::config(const std::string& layerName) {
32 | auto cfg = TRTPlugin::config(layerName);
33 |
34 | //定义我们这个插件支持half和float格式
35 | cfg->supportDataType_ = {nvinfer1::DataType::kHALF, nvinfer1::DataType::kFLOAT};
36 | //cfg->supportDataType_ = {nvinfer1::DataType::kHALF};
37 | return cfg;
38 | }
39 |
40 | int TestPlugin::enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) {
41 |
42 | int count = inputs[0].count();
43 | auto grid = gridDims(count);
44 | auto block = blockDims(count);
45 |
46 | if (config_->configDataType_ == TRTInfer::DataType::dtFloat) {
47 | MyPluginKenel <<>> (inputs[0].ptr(), outputs[0].ptr(), count);
48 | }
49 | else if (config_->configDataType_ == TRTInfer::DataType::dtHalfloat) {
50 | MyPluginKenel <<>> (inputs[0].ptr(), outputs[0].ptr(), count);
51 | }
52 | return 0;
53 | }
54 |
55 | RegisterPlugin(TestPlugin);
--------------------------------------------------------------------------------
/src/caffeplugin/plugins/TestPlugin.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef WReLU_HPP
3 | #define WReLU_HPP
4 |
5 | #include
6 |
7 | using namespace Plugin;
8 |
9 | class TestPlugin : public TRTPlugin {
10 | public:
11 | //设置插件函数,通过宏,执行插件创建函数,同时执行模式匹配名字,用来对每个层的名字做模式匹配
12 | //该匹配方法用的是ccutil::patternMatch,请参照这个函数
13 | SETUP_PLUGIN(TestPlugin, "TestPlugin*");
14 |
15 | virtual std::shared_ptr config(const std::string& layerName) override;
16 |
17 | //这个插件只有一个输出,输出的shape等于输入0的shape,因此返回input0的shape
18 | virtual nvinfer1::Dims outputDims(int index, const nvinfer1::Dims* inputDims, int nbInputDims) override;
19 |
20 | //执行过程
21 | int enqueue(const std::vector& inputs, std::vector& outputs, const std::vector& weights, void* workspace, cudaStream_t stream) override;
22 | };
23 |
24 | #endif //WReLU_HPP
--------------------------------------------------------------------------------
/src/common/trt_common.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "trt_common.hpp"
4 | #include
5 |
6 | dim3 gridDims(int numJobs) {
7 | int numBlockThreads = numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
8 | return dim3(ceil(numJobs / (float)numBlockThreads));
9 | }
10 |
11 | dim3 blockDims(int numJobs) {
12 | return numJobs < GPU_BLOCK_THREADS ? numJobs : GPU_BLOCK_THREADS;
13 | }
--------------------------------------------------------------------------------
/src/common/trt_common.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef TRT_COMMON_HPP
3 | #define TRT_COMMON_HPP
4 |
5 | #include
6 |
7 | #define GPU_BLOCK_THREADS 512
8 | #define KERNEL_POSITION \
9 | int position = (blockDim.x * blockIdx.x + threadIdx.x); \
10 | if (position >= (edge)) return;
11 |
12 | dim3 gridDims(int numJobs);
13 | dim3 blockDims(int numJobs);
14 |
15 | #endif //TRT_COMMON_HPP
--------------------------------------------------------------------------------
/src/examples/center_net_coco2x_dcn.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include "builder/trt_builder.hpp"
5 | #include "infer/trt_infer.hpp"
6 | #include "infer/ct_detect_backend.hpp"
7 |
8 | using namespace cv;
9 | using namespace std;
10 |
11 | namespace examples {
12 |
13 | static Rect restoreCenterNetBox(float dx, float dy, float dw, float dh, float cellx, float celly, int stride, Size netSize, Size imageSize) {
14 |
15 | float scale = 0;
16 | if (imageSize.width >= imageSize.height)
17 | scale = netSize.width / (float)imageSize.width;
18 | else
19 | scale = netSize.height / (float)imageSize.height;
20 |
21 | float x = ((cellx + dx - dw * 0.5) * stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
22 | float y = ((celly + dy - dh * 0.5) * stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
23 | float r = ((cellx + dx + dw * 0.5) * stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
24 | float b = ((celly + dy + dh * 0.5) * stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
25 | return Rect(Point(x, y), Point(r + 1, b + 1));
26 | }
27 |
28 | static void preprocessCenterNetImageToTensor(const Mat& image, int numIndex, const shared_ptr& tensor) {
29 |
30 | int outH = tensor->height();
31 | int outW = tensor->width();
32 | float sw = outW / (float)image.cols;
33 | float sh = outH / (float)image.rows;
34 | float scale = std::min(sw, sh);
35 |
36 | Mat matrix = getRotationMatrix2D(Point2f(image.cols*0.5, image.rows*0.5), 0, scale);
37 | matrix.at(0, 2) -= image.cols*0.5 - outW * 0.5;
38 | matrix.at(1, 2) -= image.rows*0.5 - outH * 0.5;
39 |
40 | float mean[3] = {0.40789654, 0.44719302, 0.47026115};
41 | float std[3] = {0.28863828, 0.27408164, 0.27809835};
42 |
43 | Mat outimage;
44 | cv::warpAffine(image, outimage, matrix, Size(outW, outH));
45 | tensor->setNormMatGPU(numIndex, outimage, mean, std);
46 | }
47 |
48 | static vector detectBoundingbox(const shared_ptr& boundingboxDetect_, const Mat& image, float threshold = 0.3) {
49 |
50 | if (boundingboxDetect_ == nullptr) {
51 | INFO("detectBoundingbox failure call, model is nullptr");
52 | return vector();
53 | }
54 |
55 | preprocessCenterNetImageToTensor(image, 0, boundingboxDetect_->input());
56 | boundingboxDetect_->forward();
57 |
58 | auto outHM = boundingboxDetect_->tensor("hm");
59 | auto outHMPool = boundingboxDetect_->tensor("hm_pool");
60 | auto outWH = boundingboxDetect_->tensor("wh");
61 | auto outXY = boundingboxDetect_->tensor("reg");
62 | const int stride = 4;
63 |
64 | vector bboxs;
65 | Size inputSize = boundingboxDetect_->input()->size();
66 | float sx = image.cols / (float)inputSize.width * stride;
67 | float sy = image.rows / (float)inputSize.height * stride;
68 |
69 | for (int class_ = 0; class_ < outHM->channel(); ++class_) {
70 | for (int i = 0; i < outHM->height(); ++i) {
71 | float* ohmptr = outHM->cpu(0, class_, i);
72 | float* ohmpoolptr = outHMPool->cpu(0, class_, i);
73 | for (int j = 0; j < outHM->width(); ++j) {
74 | if (*ohmptr == *ohmpoolptr && *ohmpoolptr > threshold) {
75 |
76 | float dx = outXY->at(0, 0, i, j);
77 | float dy = outXY->at(0, 1, i, j);
78 | float dw = outWH->at(0, 0, i, j);
79 | float dh = outWH->at(0, 1, i, j);
80 | ccutil::BBox box = restoreCenterNetBox(dx, dy, dw, dh, j, i, stride, inputSize, image.size());
81 | box = box.box() & Rect(0, 0, image.cols, image.rows);
82 | box.label = class_;
83 | box.score = *ohmptr;
84 |
85 | if (box.area() > 0)
86 | bboxs.push_back(box);
87 | }
88 | ++ohmptr;
89 | ++ohmpoolptr;
90 | }
91 | }
92 | }
93 | return bboxs;
94 | }
95 |
96 | static vector> detectBoundingboxOptim(const shared_ptr& boundingboxDetect_, const vector& images,
97 | float threshold, int maxobjs, TRTInfer::CTDetectBackend* detectBackend) {
98 |
99 | if (boundingboxDetect_ == nullptr) {
100 | INFO("detectBoundingbox failure call, model is nullptr");
101 | return vector>();
102 | }
103 | boundingboxDetect_->input()->resize(images.size());
104 |
105 | vector imsize;
106 | for (int i = 0; i < images.size(); ++i) {
107 | preprocessCenterNetImageToTensor(images[i], i, boundingboxDetect_->input()); //1.0 ms
108 | imsize.emplace_back(images[i].size());
109 | }
110 |
111 | boundingboxDetect_->forward(false); //41.5 ms
112 | auto outHM = boundingboxDetect_->tensor("hm");
113 | auto outHMPool = boundingboxDetect_->tensor("hm_pool");
114 | auto outWH = boundingboxDetect_->tensor("wh");
115 | auto outXY = boundingboxDetect_->tensor("reg");
116 | return detectBackend->forwardGPU(outHM, outHMPool, outWH, outXY, imsize, threshold, maxobjs); // 0.25 ms
117 | }
118 |
119 | void center_net_coco2x_dcn() {
120 |
121 | INFOW("onnx to trtmodel...");
122 |
123 | // tensorRT 7.0 + OnnX: Must be an explicit batchsize, that is, batchsize must be specified at compile time.
124 | int batchSize = 2;
125 | auto modelFile = ccutil::format("models/dladcnv2.fp32.b%d.trtmodel", batchSize);
126 | if (!ccutil::exists(modelFile)) {
127 |
128 | if (!ccutil::exists("models/dladcnv2.onnx")) {
129 | INFOW(
130 | "models/dladcnv2.onnx not found, download url: http://zifuture.com:1000/fs/public_models/dladcnv2.onnx "
131 | "or use centerNetDLADCNOnnX/dladcn_export_onnx.py to generate"
132 | );
133 | return;
134 | }
135 |
136 | TRTBuilder::compileTRT(
137 | TRTBuilder::TRTMode_FP32, {}, batchSize,
138 | TRTBuilder::ModelSource("models/dladcnv2.onnx"),
139 | modelFile, {TRTBuilder::InputDims(3, 512, 512)}
140 | );
141 | }
142 |
143 | INFO("load model: %s", modelFile.c_str());
144 | auto engine = TRTInfer::loadEngine(modelFile);
145 | if (!engine) {
146 | INFO("can not load model.");
147 | return;
148 | }
149 |
150 | INFO("forward...");
151 | vector images{
152 | imread("imgs/www.jpg"),
153 | imread("imgs/17790319373_bd19b24cfc_k.jpg")
154 | };
155 |
156 | TRTInfer::CTDetectBackend backend(engine->getCUStream());
157 | auto imobjs = detectBoundingboxOptim(engine, images, 0.3, 100, &backend); // 43.86 ms
158 |
159 | for (int j = 0; j < images.size(); ++j) {
160 | auto& objs = imobjs[j];
161 | objs = ccutil::nms(objs, 0.5);
162 |
163 | INFO("objs.length = %d", objs.size());
164 | for (int i = 0; i < objs.size(); ++i) {
165 | auto& obj = objs[i];
166 | ccutil::drawbbox(images[j], obj);
167 | }
168 | imwrite(ccutil::format("results/%d.centernet.coco2x.dcn.jpg", j), images[j]);
169 | }
170 |
171 | #ifdef _WIN32
172 | cv::imshow("dla dcn detect 1", images[0]);
173 | cv::imshow("dla dcn detect 2", images[1]);
174 | cv::waitKey();
175 | cv::destroyAllWindows();
176 | #endif
177 | INFO("done.");
178 | }
179 | };
--------------------------------------------------------------------------------
/src/examples/center_track_coco_tracking.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include "builder/trt_builder.hpp"
5 | #include "infer/trt_infer.hpp"
6 |
7 | using namespace cv;
8 | using namespace std;
9 |
10 | namespace examples {
11 |
12 | static void drawArrow(cv::Mat& img, cv::Point pStart, cv::Point pEnd, int len, int alpha,
13 | cv::Scalar color, int thickness, int lineType)
14 | {
15 | const double PI = 3.1415926;
16 | Point arrow;
17 | double angle = atan2((double)(pStart.y - pEnd.y), (double)(pStart.x - pEnd.x));
18 | line(img, pStart, pEnd, color, thickness, lineType);
19 |
20 | arrow.x = pEnd.x + len * cos(angle + PI * alpha / 180);
21 | arrow.y = pEnd.y + len * sin(angle + PI * alpha / 180);
22 | line(img, pEnd, arrow, color, thickness, lineType);
23 | arrow.x = pEnd.x + len * cos(angle - PI * alpha / 180);
24 | arrow.y = pEnd.y + len * sin(angle - PI * alpha / 180);
25 | line(img, pEnd, arrow, color, thickness, lineType);
26 | }
27 |
28 | static Rect restoreCenterNetBox(float dx, float dy, float dw, float dh, float cellx, float celly, int stride, Size netSize, Size imageSize) {
29 |
30 | float scale = 0;
31 | if (imageSize.width >= imageSize.height)
32 | scale = netSize.width / (float)imageSize.width;
33 | else
34 | scale = netSize.height / (float)imageSize.height;
35 |
36 | float x = ((cellx + dx - dw * 0.5) * stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
37 | float y = ((celly + dy - dh * 0.5) * stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
38 | float r = ((cellx + dx + dw * 0.5) * stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
39 | float b = ((celly + dy + dh * 0.5) * stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
40 | return Rect(Point(x, y), Point(r + 1, b + 1));
41 | }
42 |
43 | static Scalar restoreCenterTracking(float ox, float oy, float cellx, float celly, int stride, Size netSize, Size imageSize) {
44 |
45 | float scale = 0;
46 | if (imageSize.width >= imageSize.height)
47 | scale = netSize.width / (float)imageSize.width;
48 | else
49 | scale = netSize.height / (float)imageSize.height;
50 |
51 | float x = ((cellx + ox) * stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
52 | float y = ((celly + oy) * stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
53 | float x0 = ((cellx)* stride - netSize.width * 0.5) / scale + imageSize.width * 0.5;
54 | float y0 = ((celly)* stride - netSize.height * 0.5) / scale + imageSize.height * 0.5;
55 | return Scalar(x0, y0, x, y);
56 | }
57 |
58 | static void preprocessCenterNetImageToTensor(const Mat& image, int numIndex, const shared_ptr& tensor) {
59 |
60 | int outH = tensor->height();
61 | int outW = tensor->width();
62 | float sw = outW / (float)image.cols;
63 | float sh = outH / (float)image.rows;
64 | float scale = std::min(sw, sh);
65 |
66 | Mat matrix = getRotationMatrix2D(Point2f(image.cols*0.5, image.rows*0.5), 0, scale);
67 | matrix.at(0, 2) -= image.cols*0.5 - outW * 0.5;
68 | matrix.at(1, 2) -= image.rows*0.5 - outH * 0.5;
69 |
70 | float mean[3] = {0.40789654, 0.44719302, 0.47026115};
71 | float std[3] = {0.28863828, 0.27408164, 0.27809835};
72 |
73 | Mat outimage;
74 | cv::warpAffine(image, outimage, matrix, Size(outW, outH));
75 | tensor->setNormMatGPU(numIndex, outimage, mean, std);
76 | }
77 |
78 | static vector> detectBoundingboxAndTracking(const shared_ptr& boundingboxAndTrackingDetect_, const Mat& image, const Mat& prevImage, float threshold = 0.3) {
79 |
80 | if (boundingboxAndTrackingDetect_ == nullptr) {
81 | INFO("detectBoundingbox failure call, model is nullptr");
82 | return vector>();
83 | }
84 |
85 | preprocessCenterNetImageToTensor(image, 0, boundingboxAndTrackingDetect_->input(0));
86 | preprocessCenterNetImageToTensor(prevImage, 0, boundingboxAndTrackingDetect_->input(1));
87 | boundingboxAndTrackingDetect_->forward();
88 | auto outHM = boundingboxAndTrackingDetect_->tensor("hm");
89 | auto outHMPool = boundingboxAndTrackingDetect_->tensor("hm_pool");
90 | auto outWH = boundingboxAndTrackingDetect_->tensor("wh");
91 | auto outXY = boundingboxAndTrackingDetect_->tensor("reg");
92 | auto outTracking = boundingboxAndTrackingDetect_->tensor("tracking");
93 | const int stride = 4;
94 |
95 | vector> bboxs;
96 | Size inputSize = boundingboxAndTrackingDetect_->input()->size();
97 | float sx = image.cols / (float)inputSize.width * stride;
98 | float sy = image.rows / (float)inputSize.height * stride;
99 |
100 | for (int class_ = 0; class_ < outHM->channel(); ++class_) {
101 | for (int i = 0; i < outHM->height(); ++i) {
102 | float* ohmptr = outHM->cpu(0, class_, i);
103 | float* ohmpoolptr = outHMPool->cpu(0, class_, i);
104 | for (int j = 0; j < outHM->width(); ++j) {
105 | if (*ohmptr == *ohmpoolptr && *ohmpoolptr > threshold) {
106 |
107 | float dx = outXY->at(0, 0, i, j);
108 | float dy = outXY->at(0, 1, i, j);
109 | float dw = outWH->at(0, 0, i, j);
110 | float dh = outWH->at(0, 1, i, j);
111 | float ox = outTracking->at(0, 0, i, j);
112 | float oy = outTracking->at(0, 1, i, j);
113 | ccutil::BBox box = restoreCenterNetBox(dx, dy, dw, dh, j, i, stride, inputSize, image.size());
114 | auto offset = restoreCenterTracking(ox, oy, j, i, stride, inputSize, image.size());
115 | box = box.box() & Rect(0, 0, image.cols, image.rows);
116 | box.label = class_;
117 | box.score = *ohmptr;
118 |
119 | if (box.area() > 0)
120 | bboxs.push_back(make_tuple(box, offset));
121 | }
122 | ++ohmptr;
123 | ++ohmpoolptr;
124 | }
125 | }
126 | }
127 | return bboxs;
128 | }
129 |
130 | void center_track_coco_tracking() {
131 | INFOW("onnx to trtmodel...");
132 |
133 | if (!ccutil::exists("models/coco_tracking.fp32.trtmodel")) {
134 |
135 | if (!ccutil::exists("models/coco_tracking.onnx")) {
136 |
137 | INFOW(
138 | "models/coco_tracking.onnx not found, download url: http://zifuture.com:1000/fs/public_models/coco_tracking.onnx"
139 | );
140 | return;
141 | }
142 |
143 | TRTBuilder::compileTRT(
144 | TRTBuilder::TRTMode_FP32, {}, 1,
145 | TRTBuilder::ModelSource("models/coco_tracking.onnx"),
146 | "models/coco_tracking.fp32.trtmodel",
147 | {TRTBuilder::InputDims(3, 512, 512), TRTBuilder::InputDims(3, 512, 512)}
148 | );
149 | }
150 |
151 | INFO("load model: models/coco_tracking.fp32.trtmodel");
152 | auto engine = TRTInfer::loadEngine("models/coco_tracking.fp32.trtmodel");
153 | if (!engine) {
154 | INFO("can not load model.");
155 | return;
156 | }
157 |
158 | INFO("forward...");
159 | Mat prevImage = imread("imgs/000020.jpg");
160 | Mat image = imread("imgs/000023.jpg");
161 |
162 | auto objs = detectBoundingboxAndTracking(engine, image, prevImage, 0.35);
163 |
164 | INFO("objs.length = %d", objs.size());
165 | for (int i = 0; i < objs.size(); ++i) {
166 | auto& obj = objs[i];
167 | auto& box = get<0>(obj);
168 | auto& offset = get<1>(obj);
169 | ccutil::drawbbox(image, box, ccutil::DrawType::NoName);
170 | drawArrow(image, Point(offset[0], offset[1]), Point(offset[2], offset[3]), 10, 35, Scalar(0, 255, 0), 2, 16);
171 |
172 | ccutil::drawbbox(prevImage, box, ccutil::DrawType::NoName);
173 | drawArrow(prevImage, Point(offset[0], offset[1]), Point(offset[2], offset[3]), 10, 35, Scalar(0, 255, 0), 2, 16);
174 | }
175 |
176 | imwrite("results/coco.tracking.jpg", image);
177 |
178 | #ifdef _WIN32
179 | cv::imshow("coco.tracking.current", image);
180 | cv::imshow("coco.tracking.prev", prevImage);
181 | cv::waitKey();
182 | cv::destroyAllWindows();
183 | #endif
184 | INFO("done.");
185 | }
186 | };
--------------------------------------------------------------------------------
/src/examples/dbface.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include "builder/trt_builder.hpp"
5 | #include "infer/trt_infer.hpp"
6 | #include "infer/dbface_backend.hpp"
7 |
8 | using namespace cv;
9 | using namespace std;
10 |
11 | namespace examples {
12 |
// Odd-symmetric exponential with a linear segment around zero.
//
//   |value| <  1 : value * e
//   value   >= 1 : exp(value)
//   value  <= -1 : -exp(-value)
static float commonExp(float value) {

    const float gate = 1;
    const float slope = exp(gate);

    // Linear inside the gate.
    if (fabs(value) < gate)
        return value * slope;

    // Exponential outside, mirrored for negative inputs.
    return value > 0 ? exp(value) : -exp(-value);
}
27 |
28 | static vector detectDBFace(const shared_ptr& dbfaceDetect_, const Mat& image, float threshold = 0.3) {
29 |
30 | Assert(image.cols % 32 == 0 || image.rows % 32 == 0);
31 |
32 | float mean[3] = {0.408, 0.447, 0.47};
33 | float std[3] = {0.289, 0.274, 0.278};
34 |
35 | //dbfaceDetect_->input()->setNormMat(0, image, mean, std); // 20 ms
36 | dbfaceDetect_->input()->setNormMatGPU(0, image, mean, std); // 5 ms
37 | dbfaceDetect_->forward();
38 | auto outHM = dbfaceDetect_->tensor("hm");
39 | auto outHMPool = dbfaceDetect_->tensor("pool_hm");
40 | auto outTLRB = dbfaceDetect_->tensor("tlrb");
41 | auto outLandmark = dbfaceDetect_->tensor("landmark");
42 | const int stride = 4;
43 |
44 | vector bboxs;
45 | Size inputSize = dbfaceDetect_->input()->size();
46 | float sx = image.cols / (float)inputSize.width * stride;
47 | float sy = image.rows / (float)inputSize.height * stride;
48 |
49 | for (int class_ = 0; class_ < outHM->channel(); ++class_) {
50 | for (int i = 0; i < outHM->height(); ++i) {
51 | float* ohmptr = outHM->cpu(0, class_, i);
52 | float* ohmpoolptr = outHMPool->cpu(0, class_, i);
53 | for (int j = 0; j < outHM->width(); ++j) {
54 | if (*ohmptr == *ohmpoolptr && *ohmpoolptr > threshold) {
55 |
56 | float dx = outTLRB->at(0, 0, i, j);
57 | float dy = outTLRB->at(0, 1, i, j);
58 | float dr = outTLRB->at(0, 2, i, j);
59 | float db = outTLRB->at(0, 3, i, j);
60 | float cx = j;
61 | float cy = i;
62 | float x = (cx - dx) * stride;
63 | float y = (cy - dy) * stride;
64 | float r = (cx + dr) * stride;
65 | float b = (cy + db) * stride;
66 |
67 | TRTInfer::FaceBox box(ccutil::BBox(x, y, r, b, *ohmptr, class_));
68 | if (box.area() > 0) {
69 |
70 | for (int k = 0; k < 5; ++k) {
71 | float landmark_x = outLandmark->at(0, k, i, j) * 4;
72 | float landmark_y = outLandmark->at(0, k + 5, i, j) * 4;
73 | landmark_x = (commonExp(landmark_x) + cx) * stride;
74 | landmark_y = (commonExp(landmark_y) + cy) * stride;
75 | box.landmark[k] = Point2f(landmark_x, landmark_y);
76 | }
77 | bboxs.push_back(box);
78 | }
79 | }
80 | ++ohmptr;
81 | ++ohmpoolptr;
82 | }
83 | }
84 | }
85 | return bboxs;
86 | }
87 |
88 | static vector detectDBFaceOptim(const shared_ptr& dbfaceDetect_, const Mat& image, float threshold, TRTInfer::DBFaceBackend* backend) {
89 |
90 | Assert(image.cols % 32 == 0 || image.rows % 32 == 0);
91 |
92 | float mean[3] = {0.408, 0.447, 0.47};
93 | float std[3] = {0.289, 0.274, 0.278};
94 |
95 | dbfaceDetect_->input()->setNormMatGPU(0, image, mean, std); // 5 ms
96 | dbfaceDetect_->forward(false);
97 | auto outHM = dbfaceDetect_->tensor("hm");
98 | auto outHMPool = dbfaceDetect_->tensor("pool_hm");
99 | auto outTLRB = dbfaceDetect_->tensor("tlrb");
100 | auto outLandmark = dbfaceDetect_->tensor("landmark");
101 | const int stride = 4;
102 | return backend->forwardGPU(outHM, outHMPool, outTLRB, outLandmark, threshold, 1000)[0]; // 0.25 ms
103 | }
104 |
105 | static Mat padImage(const Mat& image, int stride = 32) {
106 |
107 | int w = image.cols;
108 | if (image.cols % stride != 0)
109 | w = image.cols + (stride - (image.cols % stride));
110 |
111 | int h = image.rows;
112 | if (image.rows % stride != 0)
113 | h = image.rows + (stride - (image.rows % stride));
114 |
115 | if (Size(w, h) == image.size())
116 | return image;
117 |
118 | Mat output(h, w, image.type(), Scalar(0));
119 | image.copyTo(output(Rect(0, 0, image.cols, image.rows)));
120 | return output;
121 | }
122 |
123 | void dbface() {
124 |
125 | Mat image = imread("imgs/selfie.jpg");
126 | if (image.empty()) {
127 | INFOW("image load fail");
128 | return;
129 | }
130 |
131 | Mat padimage = padImage(image);
132 | int maxBatchSize = 1;
133 | string modelPath = ccutil::format("models/dbface.%dx%d.fp32.b%d.trtmodel", padimage.cols, padimage.rows, maxBatchSize);
134 |
135 | if (!ccutil::exists(modelPath)) {
136 |
137 | if (!ccutil::exists("models/dbface.onnx")) {
138 | INFOW(
139 | "models/dbface.onnx not found, download url: http://zifuture.com:1000/fs/public_models/dbface.onnx"
140 | );
141 | return;
142 | }
143 |
144 | TRTBuilder::compileTRT(
145 | TRTBuilder::TRTMode_FP32, {}, maxBatchSize,
146 | TRTBuilder::ModelSource("models/dbface.onnx"),
147 | modelPath,
148 | {TRTBuilder::InputDims(3, padimage.rows, padimage.cols)}
149 | );
150 | }
151 |
152 | INFO("load model: %s", modelPath.c_str());
153 | auto engine = TRTInfer::loadEngine(modelPath);
154 | if (!engine) {
155 | INFO("can not load model: %s", modelPath.c_str());
156 | return;
157 | }
158 |
159 | INFO("forward...");
160 | TRTInfer::DBFaceBackend backend(engine->getCUStream());
161 | auto objs = detectDBFaceOptim(engine, image, 0.25, &backend);
162 |
163 | INFO("objs.length = %d", objs.size());
164 | for (int i = 0; i < objs.size(); ++i) {
165 | auto& obj = objs[i];
166 | ccutil::drawbbox(image, obj, ccutil::DrawType::Empty);
167 |
168 | for (int k = 0; k < 5; ++k) {
169 | cv::circle(image, obj.landmark[k], 3, Scalar(0, 0, 255), -1, 16);
170 | }
171 | }
172 |
173 | imwrite("results/selfie.draw.jpg", image);
174 |
175 | #ifdef _WIN32
176 | cv::imshow("dbface selfie detect", image);
177 | cv::waitKey();
178 | cv::destroyAllWindows();
179 | #endif
180 | INFO("done.");
181 | }
182 | };
--------------------------------------------------------------------------------
/src/examples/onnx.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include "builder/trt_builder.hpp"
5 | #include "infer/trt_infer.hpp"
6 |
7 | using namespace cv;
8 | using namespace std;
9 |
10 | namespace examples {
11 |
12 | void onnx() {
13 |
14 | if (!ccutil::exists("models/demo.onnx")) {
15 | INFOE("models/demo.onnx not exists, run< python plugin_onnx_export.py > generate demo.onnx.");
16 | return;
17 | }
18 |
19 | INFOW("onnx to trtmodel...");
20 | TRTBuilder::compileTRT(
21 | TRTBuilder::TRTMode_FP32, {}, 4,
22 | TRTBuilder::ModelSource("models/demo.onnx"),
23 | "models/demo.fp32.trtmodel",
24 | {TRTBuilder::InputDims(3, 5, 5), TRTBuilder::InputDims(3, 5, 5)}
25 | );
26 | INFO("done.");
27 |
28 | INFO("load model: models/demo.fp32.trtmodel");
29 | auto engine = TRTInfer::loadEngine("models/demo.fp32.trtmodel");
30 | if (!engine) {
31 | INFO("can not load model.");
32 | return;
33 | }
34 |
35 | INFO("forward...");
36 |
37 | engine->input(0)->setTo(0.25);
38 | engine->input(1)->setTo(0);
39 | engine->forward();
40 | auto output = engine->output(0);
41 | output->print();
42 | INFO("done.");
43 | }
44 | };
--------------------------------------------------------------------------------
/src/import_lib.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | // Import OpenCV; pick the library that matches the build configuration (debug / release)
4 | #if defined(_DEBUG)
5 | # pragma comment(lib, "opencv_world346d.lib")
6 | #else
7 | # pragma comment(lib, "opencv_world346.lib")
8 | #endif
9 |
10 | // Import CUDA
11 | #pragma comment(lib, "cuda.lib")
12 | #pragma comment(lib, "cudart.lib")
13 | #pragma comment(lib, "cublas.lib")
14 | #pragma comment(lib, "cudnn.lib")
15 |
16 | // Import TensorRT
17 | #pragma comment(lib, "nvinfer.lib")
18 | #pragma comment(lib, "nvinfer_plugin.lib")
19 | #pragma comment(lib, "nvparsers.lib")
20 |
21 | #if defined(_DEBUG)
22 | #pragma comment(lib, "libprotobufd.lib")
23 | #else
24 | #pragma comment(lib, "libprotobuf.lib")
25 | #endif
26 | //#pragma comment(lib, "nvonnxparser.lib")
--------------------------------------------------------------------------------
/src/infer/ct_detect_backend.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "ct_detect_backend.hpp"
4 | #include
5 | #include
6 |
7 | namespace TRTInfer {
8 |
// Forwards the stream to the Backend base class.
CTDetectBackend::CTDetectBackend(CUStream stream)
    : Backend(stream)
{
}
10 |
// Decode kernel for one image: each thread inspects one heat-map element.
//
// An element is kept when it equals its hmpool counterpart and is not below
// `threshold`. Kept elements claim an output slot through an atomic counter;
// once `maxobjs` slots are taken, further hits are counted but not stored.
// `reg` / `wh` are read as two planes of w_x_h elements each (x then y,
// width then height) at the element's in-plane offset.
static __global__ void CTDetectBackend_forwardGPU(float* hm, float* hmpool, float* wh, float* reg, int* countptr, ccutil::BBox* boxptr, int width, int height, int w_x_h, int channels, int stride, float threshold,
    int maxobjs, int imageWidth, int imageHeight, float scale, int edge) {

    // Project macro that supplies `position` for this thread (presumably it
    // also bounds-checks against `edge` - defined outside this file).
    KERNEL_POSITION;

    const float score = hm[position];
    if (score < threshold || score != hmpool[position])
        return;

    const int slot = atomicAdd(countptr, 1);
    if (slot >= maxobjs)
        return;

    // Split the flat position into class plane and in-plane cell.
    const int cls = position / w_x_h;
    const int cell = position - cls * w_x_h;
    const int cx = cell % width;
    const int cy = cell / width;

    const float dx = reg[cell];
    const float dy = reg[cell + w_x_h];
    const float dw = wh[cell];
    const float dh = wh[cell + w_x_h];

    // Inverse of the letterbox transform: feature-map cells -> image pixels.
    ccutil::BBox& out = boxptr[slot];
    out.x = ((cx + dx - dw * 0.5 - width * 0.5) * stride) / scale + imageWidth * 0.5;
    out.y = ((cy + dy - dh * 0.5 - height * 0.5) * stride) / scale + imageHeight * 0.5;
    out.r = ((cx + dx + dw * 0.5 - width * 0.5) * stride) / scale + imageWidth * 0.5;
    out.b = ((cy + dy + dh * 0.5 - height * 0.5) * stride) / scale + imageHeight * 0.5;
    out.score = score;
    out.label = cls;
}
45 |
// Decode a batch of CenterNet outputs on the GPU.
//
//   hm, hmpool  heat map and its pooled counterpart
//   wh, reg     box extent and centre-offset tensors
//   imageSize   source size of every batch entry (needs at least hm->num() items)
//   threshold   minimum heat-map score
//   maxobjs     maximum number of boxes stored per image
//
// For each batch entry a record of [int count][maxobjs boxes] is reserved in
// the backend's GPU scratch buffer, filled by the kernel and copied to the
// matching host record; all work is queued on the backend stream and awaited
// once. Returns a reference to the internal per-image box lists, which is
// overwritten by the next call.
//
// When `imageSize` is shorter than the batch, `maxobjs` is not positive or
// the heat map is empty, every list is returned empty.
const std::vector>& CTDetectBackend::forwardGPU(std::shared_ptr hm, std::shared_ptr hmpool, std::shared_ptr wh,
    std::shared_ptr reg, const std::vector& imageSize, float threshold, int maxobjs) {

    int count = hm->count(1);    // w * h * c
    int width = hm->width();
    int height = hm->height();
    int batchSize = hm->num();
    int channels = hm->channel();
    int stride = 4;

    // Start from one empty list per batch entry.
    outputs_.resize(batchSize);
    for (auto& output : outputs_)
        output.clear();

    // imageSize[i] is read for every batch entry, the record size is derived
    // from maxobjs, and an empty heat map leaves nothing to launch.
    if (imageSize.size() < (size_t)batchSize || maxobjs <= 0 || count <= 0)
        return outputs_;

    auto grid = gridDims(count);
    auto block = blockDims(count);

    size_t objsStoreSize = maxobjs * sizeof(ccutil::BBox) + sizeof(int);
    int heatmapArea = width * height;
    void* cpuPtr = getCPUMemory(objsStoreSize * batchSize);
    char* cpuPtrInput = (char*)cpuPtr;
    void* gpuPtr = getGPUMemory(objsStoreSize * batchSize);
    char* gpuPtrInput = (char*)gpuPtr;
    auto stream = getStream();

    for (int i = 0; i < batchSize; ++i) {

        // Same letterbox scale as used during preprocessing.
        auto& imsize = imageSize[i];
        float sw = width * stride / (float)imsize.width;
        float sh = height * stride / (float)imsize.height;
        float scale = std::min(sw, sh);

        float* hm_ptr = hm->gpu(i);
        float* hm_pool_ptr = hmpool->gpu(i);
        float* wh_ptr = wh->gpu(i);
        float* reg_ptr = reg->gpu(i);

        int* countPtr = (int*)gpuPtrInput;
        ccutil::BBox* boxPtr = (ccutil::BBox*)((char*)gpuPtrInput + sizeof(int));

        // Reset this record's counter, decode, then queue the copy back.
        cudaMemsetAsync(gpuPtrInput, 0, sizeof(int), stream);
        CTDetectBackend_forwardGPU <<< grid, block, 0, stream >>> (hm_ptr, hm_pool_ptr, wh_ptr, reg_ptr, countPtr, boxPtr,
            width, height, heatmapArea, channels, stride, threshold, maxobjs, imsize.width, imsize.height, scale, count);

        cudaMemcpyAsync(cpuPtrInput, gpuPtrInput, objsStoreSize, cudaMemcpyKind::cudaMemcpyDeviceToHost, stream);

        cpuPtrInput += objsStoreSize;
        gpuPtrInput += objsStoreSize;
    }
    cudaStreamSynchronize(stream);

    cpuPtrInput = (char*)cpuPtr;

    for (int i = 0; i < batchSize; ++i, cpuPtrInput += objsStoreSize) {
        auto& output = outputs_[i];

        // The counter keeps growing past maxobjs; only maxobjs boxes were stored.
        int num = *((int*)cpuPtrInput);
        num = std::min(num, maxobjs);

        if (num == 0)
            continue;

        ccutil::BBox* ptr = (ccutil::BBox*)(cpuPtrInput + sizeof(int));
        output.assign(ptr, ptr + num);
    }
    return outputs_;
}
110 | };
--------------------------------------------------------------------------------
/src/infer/ct_detect_backend.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef CT_DETECT_BACKEND_HPP
3 | #define CT_DETECT_BACKEND_HPP
4 |
5 | #include
6 | #include
7 | #include
8 | #include "trt_backend.hpp"
9 |
10 | namespace TRTInfer {
11 |
12 | class CTDetectBackend : public Backend{
13 | public:
14 | CTDetectBackend(CUStream stream = nullptr);
15 |
16 | const std::vector>& forwardGPU(std::shared_ptr hm, std::shared_ptr hmpool, std::shared_ptr wh, std::shared_ptr reg,
17 | const std::vector& imageSize, float threshold, int maxobjs);
18 |
19 | private:
20 | std::vector> outputs_;
21 | };
22 | };
23 |
24 | #endif // CT_DETECT_BACKEND_HPP
--------------------------------------------------------------------------------
/src/infer/dbface_backend.cu:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "dbface_backend.hpp"
4 | #include
5 | #include
6 |
7 | namespace TRTInfer {
8 |
// Forwards the stream to the Backend base class.
DBFaceBackend::DBFaceBackend(CUStream stream)
    : Backend(stream)
{
}
10 |
// Device-side odd-symmetric exponential with a linear segment around zero:
//   |value| <  1 : value * e
//   value   >= 1 : exp(value)
//   value  <= -1 : -exp(-value)
static __device__ float commonExp(float value) {

    const float gate = 1.0f;

    // Linear inside the gate.
    if (fabs(value) < gate)
        return value * exp(gate);

    // Exponential outside, mirrored for negative inputs.
    return value > 0 ? exp(value) : -exp(-value);
}
22 |
// Decode kernel for one image: each thread inspects one heat-map element.
//
// An element is kept when it equals its hmpool counterpart and is not below
// `threshold`. Kept elements claim an output slot through an atomic counter;
// once `maxobjs` slots are taken, further hits are counted but not stored.
//
// The host launches this over every element of the heat map (all channels),
// so the flat position is split into a channel and an in-plane cell. The
// cell drives the coordinates and the tlrb / landmark lookups, and the
// channel becomes the label - the same convention as the CPU reference
// decoder. With a single-channel heat map this is identical to indexing by
// the raw position with label 0.
static __global__ void DBFaceBackend_forwardGPU(float* hm, float* hmpool, float* tlrb, float* landmark, int* countptr, FaceBox* boxptr,
    int width, int height, int w_x_h, int stride, float threshold,
    int maxobjs, int edge) {

    // Project macro that supplies `position` for this thread (presumably it
    // also bounds-checks against `edge` - defined outside this file).
    KERNEL_POSITION;

    float confidence = hm[position];
    if (confidence != hmpool[position] || confidence < threshold)
        return;

    int index = atomicAdd(countptr, 1);
    if (index >= maxobjs)
        return;

    int channel_index = position / w_x_h;
    int cell = position - channel_index * w_x_h;

    int cx = cell % width;
    int cy = cell / width;

    // tlrb holds four planes of w_x_h elements: left, top, right, bottom.
    FaceBox* ptr = boxptr + index;
    float dx = tlrb[cell];
    float dy = tlrb[cell + w_x_h];
    float dr = tlrb[cell + w_x_h * 2];
    float db = tlrb[cell + w_x_h * 3];

    ptr->x = (cx - dx) * stride;
    ptr->y = (cy - dy) * stride;
    ptr->r = (cx + dr) * stride;
    ptr->b = (cy + db) * stride;
    ptr->score = confidence;
    ptr->label = channel_index;

    for (int k = 0; k < 5; ++k) {
        // Plane layout: xxxxx yyyyy
        float landmark_x = landmark[cell + w_x_h * k] * 4;
        float landmark_y = landmark[cell + w_x_h * (k + 5)] * 4;

        cv::Point2f& point = ptr->landmark[k];
        point.x = (commonExp(landmark_x) + cx) * stride;
        point.y = (commonExp(landmark_y) + cy) * stride;
    }
}
67 |
// Decode a batch of DBFace outputs on the GPU.
//
// For each batch entry a record of [int count][maxobjs FaceBox] is reserved
// in the backend's GPU scratch buffer, filled by the kernel and copied to the
// matching host record; all work is queued on the backend stream and awaited
// once. Returns a reference to the internal per-image face lists, which is
// overwritten by the next call.
const std::vector>& DBFaceBackend::forwardGPU(std::shared_ptr hm, std::shared_ptr hmpool, std::shared_ptr tlrb,
    std::shared_ptr landmark, float threshold, int maxobjs) {

    const int width = hm->width();
    const int height = hm->height();
    const int batchSize = hm->num();
    const int count = hm->count(1);    // c * h * w
    auto grid = gridDims(count);
    auto block = blockDims(count);

    const size_t recordBytes = maxobjs * sizeof(FaceBox) + sizeof(int);
    const int heatmapArea = width * height;
    char* const hostBase = (char*)getCPUMemory(recordBytes * batchSize);
    char* const deviceBase = (char*)getGPUMemory(recordBytes * batchSize);
    const int stride = 4;
    auto stream = getStream();

    for (int n = 0; n < batchSize; ++n) {

        float* hm_ptr = hm->gpu(n);
        float* hm_pool_ptr = hmpool->gpu(n);
        float* tlrb_ptr = tlrb->gpu(n);
        float* landmark_ptr = landmark->gpu(n);

        // Record n: counter first, boxes right behind it.
        char* hostRecord = hostBase + recordBytes * n;
        char* deviceRecord = deviceBase + recordBytes * n;
        int* countPtr = (int*)deviceRecord;
        FaceBox* boxPtr = (FaceBox*)(deviceRecord + sizeof(int));

        // Reset the counter, decode, then queue the copy back.
        cudaMemsetAsync(deviceRecord, 0, sizeof(int), stream);
        DBFaceBackend_forwardGPU <<< grid, block, 0, stream >>> (hm_ptr, hm_pool_ptr, tlrb_ptr, landmark_ptr, countPtr, boxPtr,
            width, height, heatmapArea, stride, threshold, maxobjs, count);

        cudaMemcpyAsync(hostRecord, deviceRecord, recordBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost, stream);
    }
    cudaStreamSynchronize(stream);

    outputs_.resize(batchSize);

    for (int n = 0; n < batchSize; ++n) {
        auto& faces = outputs_[n];
        faces.clear();

        // The counter keeps growing past maxobjs; only maxobjs boxes were stored.
        char* hostRecord = hostBase + recordBytes * n;
        const int stored = std::min(*((int*)hostRecord), maxobjs);
        if (stored == 0)
            continue;

        FaceBox* first = (FaceBox*)(hostRecord + sizeof(int));
        faces.insert(faces.begin(), first, first + stored);
    }
    return outputs_;
}
125 | };
--------------------------------------------------------------------------------
/src/infer/dbface_backend.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef DBFACE_BACKEND_HPP
3 | #define DBFACE_BACKEND_HPP
4 |
5 | #include
6 | #include
7 | #include
8 | #include "trt_backend.hpp"
9 |
10 | namespace TRTInfer {
11 |
12 | struct FaceBox : ccutil::BBox {
13 | cv::Point2f landmark[5];
14 |
15 | FaceBox() {}
16 | FaceBox(const ccutil::BBox& other):ccutil::BBox(other) {}
17 | };
18 |
19 | class DBFaceBackend : public Backend{
20 | public:
21 | DBFaceBackend(CUStream stream = nullptr);
22 |
23 | const std::vector>& forwardGPU(
24 | std::shared_ptr hm, std::shared_ptr hmpool, std::shared_ptr tlrb, std::shared_ptr landmark,
25 | float threshold, int maxobjs = 100);
26 |
27 | private:
28 | std::vector> outputs_;
29 | };
30 | };
31 |
32 | #endif // DBFACE_BACKEND_HPP
--------------------------------------------------------------------------------
/src/infer/task_pool.hpp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dlunion/tensorRTIntegrate/145aec3faeef0d761a8f2752951deede2ed661a6/src/infer/task_pool.hpp
--------------------------------------------------------------------------------
/src/infer/trt_backend.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include "trt_backend.hpp"
4 | #include
5 | #include
6 |
7 | #define cuCheck(op) Assert((op) == cudaSuccess)
8 |
9 | namespace TRTInfer {
10 |
11 | Backend::Backend(CUStream stream) {
12 |
13 | this->stream_ = stream;
14 | this->ownStream_ = false;
15 |
16 | if (stream == nullptr) {
17 | cuCheck(cudaStreamCreate(&stream_));
18 | ownStream_ = true;
19 | }
20 | }
21 |
22 | void* Backend::getCPUMemory(size_t size) {
23 | if (cpuMemSize_ >= size)
24 | return cpuMemory_;
25 |
26 | releaseCPUMemory();
27 | cpuMemSize_ = size;
28 | cpuMemory_ = malloc(size);
29 | Assert(cpuMemory_ != nullptr);
30 | return cpuMemory_;
31 | }
32 |
33 | CUStream Backend::getStream() const {
34 | return stream_;
35 | }
36 |
37 | void Backend::releaseCPUMemory() {
38 | if (cpuMemory_) {
39 | free(cpuMemory_);
40 | cpuMemory_ = nullptr;
41 | cpuMemSize_ = 0;
42 | }
43 | }
44 |
45 | void* Backend::getGPUMemory(size_t size) {
46 | if (gpuMemSize_ >= size)
47 | return gpuMemory_;
48 |
49 | releaseGPUMemory();
50 | gpuMemSize_ = size;
51 |
52 | cuCheck(cudaMalloc(&gpuMemory_, gpuMemSize_));
53 | return gpuMemory_;
54 | }
55 |
56 | void Backend::releaseGPUMemory() {
57 | if (gpuMemory_) {
58 | cudaFree(gpuMemory_);
59 | gpuMemory_ = nullptr;
60 | gpuMemSize_ = 0;
61 | }
62 | }
63 |
64 | Backend::~Backend() {
65 | releaseGPUMemory();
66 | releaseCPUMemory();
67 |
68 | if (ownStream_) {
69 | cudaStreamDestroy(stream_);
70 | }
71 | stream_ = nullptr;
72 | }
73 | };
--------------------------------------------------------------------------------
/src/infer/trt_backend.hpp:
--------------------------------------------------------------------------------
1 |
2 | #ifndef TRT_BACKEND_HPP
3 | #define TRT_BACKEND_HPP
4 |
5 | #include
6 | #include
7 | #include
8 | #include "trt_infer.hpp"
9 |
10 | namespace TRTInfer {
11 |
12 | class Backend {
13 | public:
14 | Backend(CUStream stream = nullptr);
15 | virtual ~Backend();
16 |
17 | protected:
18 | void* getCPUMemory(size_t size);
19 | void releaseCPUMemory();
20 |
21 | void* getGPUMemory(size_t size);
22 | void releaseGPUMemory();
23 |
24 | CUStream getStream() const;
25 |
26 | private:
27 | void* cpuMemory_ = nullptr;
28 | size_t cpuMemSize_ = 0;
29 | void* gpuMemory_ = nullptr;
30 | size_t gpuMemSize_ = 0;
31 |
32 | CUStream stream_ = nullptr;
33 | bool ownStream_ = false;
34 | };
35 | };
36 |
37 | #endif // TRT_BACKEND_HPP
--------------------------------------------------------------------------------
/src/infer/trt_infer.hpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #ifndef TRT_INFER_HPP
4 | #define TRT_INFER_HPP
5 |
6 | #include
7 | #include
8 | #include
9 | #include