├── .gitignore
├── 3rdparty
│   ├── opencv
│   │   ├── linux.zip
│   │   └── win.zip
│   └── pybind11.zip
├── README.md
├── docs
│   ├── README.pdf
│   ├── files
│   │   ├── Custom C++ and CUDA Extensions — PyTorch Tutorials 1.12.1+cu102 documentation.pdf
│   │   ├── Keywords - setuptools 65.4.1.post20221009 documentation.pdf
│   │   ├── PyTorch 源码解读之 cpp_extension:揭秘 C++_CUDA 算子实现和调用全流程 - 知乎.pdf
│   │   ├── Python中使用C++_CUDA|以PointNet中的ball query 为例 - 知乎.pdf
│   │   ├── Pytorch拓展进阶(二):Pytorch结合C++以及Cuda拓展 - Oldpan的个人博客.pdf
│   │   ├── mmdetection源码剖析(1)--NMS.pdf
│   │   └── setup.py实现C++扩展和python库编译.pdf
│   └── imgs
│       ├── Dependencies_x64_Release.png
│       ├── compile_win.png
│       ├── dali.png
│       ├── pbind11_pipeline.pptx
│       ├── permute.png
│       ├── pybind11.png
│       ├── pybind11_pipeline.png
│       ├── windows_opencv_cfg.png
│       ├── 动态库拷贝到python文件同路径.png
│       └── 安装成功.png
├── mmcv_setup.py
├── orbbec
│   ├── __init__.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── nms_wrapper.py
│   │   └── src
│   │       ├── cpu
│   │       │   └── nms_cpu.cpp
│   │       ├── cuda
│   │       │   ├── nms_cuda.cpp
│   │       │   └── nms_kernel.cu
│   │       └── nms_ext.cpp
│   ├── roi_align
│   │   ├── __init__.py
│   │   ├── gradcheck.py
│   │   ├── roi_align.py
│   │   └── src
│   │       ├── cpu
│   │       │   └── roi_align_v2.cpp
│   │       ├── cuda
│   │       │   ├── roi_align_kernel.cu
│   │       │   └── roi_align_kernel_v2.cu
│   │       └── roi_align_ext.cpp
│   ├── utils
│   │   ├── __init__.py
│   │   └── src
│   │       └── compiling_info.cpp
│   └── warpaffine
│       ├── __init__.py
│       └── src
│           ├── cpu
│           │   ├── warpaffine_opencv.cpp
│           │   ├── warpaffine_torch_v1.cpp
│           │   └── warpaffine_torch_v2.cpp
│           ├── cuda
│           │   ├── warpaffine_cuda.cpp
│           │   └── warpaffine_kernel.cu
│           └── warpaffine_ext.cpp
├── requirements.txt
├── requirements
│   ├── build.txt
│   ├── docs.txt
│   ├── optional.txt
│   ├── readthedocs.txt
│   ├── runtime.txt
│   └── tests.txt
├── scripts
│   ├── benchmark.py
│   ├── check.py
│   ├── collect_env.py
│   ├── cpp_img.png
│   ├── demo.png
│   ├── grad_check.py
│   ├── image.jpg
│   ├── opencv_world453.dll
│   ├── py_img.png
│   ├── test_nms.py
│   ├── test_warpaffine_opencv.py
│   ├── test_warpaffine_torch_cpu.py
│   ├── test_warpaffine_torch_gpu.py
│   ├── torch_affine_cpu.png
│   └── torch_affine_gpu.png
├── setup.py
└── tools
    └── Dependencies_x64_Release.7z
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
*.pyd

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

mmdet/version.py
./data/
.vscode
.idea
.DS_Store

# custom
*.pkl
*.pkl.json
*.log.json
work_dirs/

# Pytorch
*.pth
*.py~
*.sh~
--------------------------------------------------------------------------------
/3rdparty/opencv/linux.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/3rdparty/opencv/linux.zip
--------------------------------------------------------------------------------
/3rdparty/opencv/win.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/3rdparty/opencv/win.zip
--------------------------------------------------------------------------------
/3rdparty/pybind11.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/3rdparty/pybind11.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Introduction:

For years, thanks to steady advances in GPUs, both network training and deployment/inference in deep learning have kept getting faster, and mainstream frameworks such as Pytorch, TensorFlow and OneFlow ship GPU acceleration for a large number of operators. From the network-structure point of view, although Pytorch already builds on low-level libraries such as NVIDIA cuDNN, Intel MKL and NNPACK to speed up training, its built-in operators are far from enough in some situations, for example when implementing a particular algorithm. Individual Pytorch operations are well optimized, but when we compose them into a new operator (OP), Pytorch knows nothing about the algorithm's overall execution flow: it simply dispatches each predefined operation to the GPU, which may leave the GPU under-utilized or overloaded, and the Python interpreter cannot optimize across those calls either, so training can become much slower[^1]. From the data-flow point of view, **deep learning typically needs a complex, multi-stage data-processing pipeline (data loading, decoding and a fair amount of augmentation/preprocessing), and these pipelines, which currently run on the CPU, have become a bottleneck that makes model training very time-consuming**. To address this, NVIDIA introduced the Data Loading Library (DALI)[^2], which relieves the CPU bottleneck by offloading data preprocessing to the GPU. DALI relies on its own execution engine, built to maximize the throughput of the input pipeline; features such as prefetching, parallel execution and batching are handled transparently for the user, as shown below:

*Figure: DALI Pipeline (docs/imgs/dali.png)*
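For readers who have not used DALI before, the sketch below shows roughly what such a pipeline looks like. It is not part of this project; it assumes a recent `nvidia-dali` release with the `fn` API, and the function `train_pipe` and the dataset path are purely illustrative:

```python
# A minimal DALI pipeline sketch (not from this repo): files are read and
# shuffled, JPEGs are decoded on the GPU ("mixed" device) and resized, while
# prefetching, parallel execution and batching are handled by DALI itself.
from nvidia.dali import pipeline_def, fn


@pipeline_def(batch_size=32, num_threads=4, device_id=0)
def train_pipe(data_dir):
    jpegs, labels = fn.readers.file(file_root=data_dir, random_shuffle=True)
    images = fn.decoders.image(jpegs, device="mixed")  # JPEG decode on the GPU
    images = fn.resize(images, resize_x=224, resize_y=224)
    return images, labels


pipe = train_pipe("/path/to/dataset")  # illustrative path
pipe.build()
images, labels = pipe.run()
```
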
Using DALI, and setting up a DALI environment, is fairly involved, and the set of operations DALI currently implements is limited; see its documentation[^2] for details. In practice, complex project-specific augmentation operations often have to be implemented by hand. This project therefore uses Pytorch's C++/CUDA extension mechanism to run data augmentation on the GPU and push the result straight to the network, obtaining a training speed-up. To help readers master this area systematically, the project also covers C++ extensions for plain python, and explains in detail how to write setup.py (and the related configuration) when third-party libraries are involved; compilation and testing are walked through in detail later.

## Project Structure:

```python
├── 3rdparty          # third-party libraries the project depends on
│   ├── opencv
│   │   ├── linux
│   │   └── win
│   └── pybind11
├── docs              # documentation and related material
├── requirements      # python installation dependencies
├── requirements.txt  # python installation dependencies, used together with the requirements folder
├── scripts           # test scripts
├── tools             # analysis tools
├── orbbec            # source code
│   ├── nms           # non-maximum suppression
│   ├── roi_align     # ROI Align
│   ├── utils         # compilation helper functions
│   └── warpaffine    # affine-transform augmentation
└── setup.py          # builds the python package (.egg); comparable to a CMakeLists.txt
```

## Compilation And Python Environment:

### Compile Environment:

- [x] **GCC/G++ >= 5.5.0 (Visual Studio 2017 or newer for Windows)**
- [x] **CUDA (NVCC): 10.1~11.5**

### Python Environment (requirements.txt):

```python
certifi==2021.5.30
cycler==0.11.0
future==0.18.2
kiwisolver==1.3.1
matplotlib==3.3.4
mkl-fft==1.3.0
mkl-random==1.1.1
mkl-service==2.3.0
numpy @ file:///C:/ci/numpy_and_numpy_base_1603480701039/work
olefile==0.46
opencv-python==3.4.0.12
Pillow @ file:///C:/ci/pillow_1625663293114/work
pyparsing==3.0.9
python-dateutil==2.8.2
six @ file:///tmp/build/80754af9/six_1644875935023/work
terminaltables==3.1.10
torch==1.5.0
torchvision==0.6.0
wincertstore==0.2
```

### Python Package Infos:

```python
Package         Version
--------------- ---------
certifi         2016.2.28
cycler          0.11.0
Cython          0.29.32
future          0.18.2
kiwisolver      1.3.1
matplotlib      3.3.4
mkl-fft         1.3.0
mkl-random      1.1.1
mkl-service     2.3.0
numpy           1.19.2
olefile         0.44
opencv-python   3.4.0.12
Pillow          8.3.1
pip             21.3.1
pyparsing       3.0.9
python-dateutil 2.8.2
setuptools      59.6.0
six             1.10.0
terminaltables  3.1.10
torch           1.5.0
torchvision     0.6.0
wheel           0.29.0
wincertstore    0.2
```

**Note:** the Pytorch version in the environment above must match the installed CUDA version. Supported Pytorch versions: **1.5.0~latest**.

## C++ And CUDA Extensions For Python/Pytorch:

The mainstream way for C++ to interoperate with python or pytorch is **pybind11**; reference [^15] gives a more detailed introduction. Its core principle is shown below:

*Figure: pybind11 pipeline (docs/imgs/pybind11_pipeline.png)*
A Pytorch C++ extension differs somewhat from a pure-Python one, because Pytorch's fundamental data type is **torch.Tensor**, which can be thought of as Pytorch's higher-level wrapper around np.array. As a result, the data types taken by the extension's interface functions, and the libraries being called, differ slightly; the details are explained below.

### C++ Extensions For Python:

Let us start from the Python side (scripts/test_warpaffine_opencv.py):

```python
import cv2
import torch  # do not remove: some of torch's shared libraries must be loaded dynamically (explained in detail later).
import numpy as np
from orbbec.warpaffine import affine_opencv  # C++ interface

data_path = "./demo.png"
img = cv2.imread(data_path, cv2.IMREAD_GRAYSCALE)

# numpy.array() in python corresponds one-to-one to py::array_t in pybind11.
src_point = np.array([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=np.float32)
dst_point = np.array([[38.29, 51.69], [73.53, 51.69], [56.02, 71.73]], dtype=np.float32)
# python interface
mat_trans = cv2.getAffineTransform(src_point, dst_point)
res = cv2.warpAffine(img, mat_trans, (600, 800))
cv2.imwrite("py_img.png", res)

# C++ interface
warpffine_img = affine_opencv(img, src_point, dst_point)
cv2.imwrite("cpp_img.png", warpffine_img)
```

As the code shows, the Python file calls affine_opencv, whose C++ implementation lives in orbbec/warpaffine/src/cpu/warpaffine_opencv.cpp:

```c++
#include <iostream>
#include <vector>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>


namespace py = pybind11;

/* Python->C++ Mat */
cv::Mat numpy_uint8_1c_to_cv_mat(py::array_t<unsigned char>& input)
{
    ...
}

cv::Mat numpy_uint8_3c_to_cv_mat(py::array_t<unsigned char>& input)
{
    ...
}

/* C++ Mat ->numpy */
py::array_t<unsigned char> cv_mat_uint8_1c_to_numpy(cv::Mat& input)
{
    ...
}

py::array_t<unsigned char> cv_mat_uint8_3c_to_numpy(cv::Mat& input)
{
    ...
}

py::array_t<unsigned char> affine_opencv(py::array_t<unsigned char>& input,
                                         py::array_t<float>& from_point,
                                         py::array_t<float>& to_point)
{
    ...
}
```

Since the project also supports Pytorch C++/CUDA extensions, the bindings are, for uniformity, all defined via PYBIND11_MODULE in the extension entry file (orbbec/warpaffine/src/warpaffine_ext.cpp):

```c++
#include <torch/extension.h>
#include <pybind11/numpy.h>

// declaration of the C++ extension for python
py::array_t<unsigned char> affine_opencv(py::array_t<unsigned char>& input,
                                         py::array_t<float>& from_point,
                                         py::array_t<float>& to_point);

// declaration of the Pytorch C++ extension (CPU)
at::Tensor affine_cpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w);

// declaration of the Pytorch CUDA extension (GPU)
#ifdef WITH_CUDA
at::Tensor affine_gpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w);
#endif

// wrap the Pytorch extension interface behind the WITH_CUDA macro
at::Tensor affine_torch(const at::Tensor& input,         /*[B, C, H, W]*/
                        const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                        const int out_h,
                        const int out_w)
{
    if (input.device().is_cuda())
    {
#ifdef WITH_CUDA
        return affine_gpu(input, affine_matrix, out_h, out_w);
#else
        AT_ERROR("affine is not compiled with GPU support");
#endif
    }
    return affine_cpu(input, affine_matrix, out_h, out_w);
}

// define the python/pytorch bindings with pybind11
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("affine_opencv", &affine_opencv, "affine with c++ opencv");
  m.def("affine_torch", &affine_torch, "affine with c++ libtorch");
}
```

As the code above shows, **np.array in python corresponds to py::array_t in pybind11: an np.array passed to the python interface function is received as a py::array_t by the corresponding C++ function, and working with Numpy arrays requires including the header `pybind11/numpy.h`**. At bottom, an array is a one-dimensional block of contiguous memory; pybind11's request() method parses an array into a py::buffer_info struct. buffer_info exposes a buffer view that provides fast, direct access to the underlying data:

```c++
struct buffer_info {
    void *ptr;                        // pointer to the array's (buffer's) data
    py::ssize_t itemsize;             // size in bytes of a single element
    py::ssize_t size;                 // total number of elements
    std::string format;               // element format (python struct-style type string)
    py::ssize_t ndim;                 // number of dimensions
    std::vector<py::ssize_t> shape;   // shape of the array
    std::vector<py::ssize_t> strides; // step in bytes between adjacent elements, per dimension
};
```

Once the C++ sources are written, fill the source files, as well as the paths of the third-party dependencies opencv and pybind11, into the corresponding fields of setup.py (already done for this project; see setup.py for the specifics), then build and install:

```python
# switch to the project directory
step 1: cd F:/code/python_cpp_extension
# build
step 2: python setup.py develop
# install; if --prefix is not given, the built package (.egg) is installed into
# site-packages of the current python environment.
step 3: python setup.py install
```

**Note:** references [^7][^12][^13] explain the setup.py machinery in detail. After step 2 and step 3, as shown in the figures below, the source files are compiled into a .pyd binary (a .so on Linux) and a python package is generated: **orbbec-0.0.1-py36-win-amd64.egg**, whose name is determined by the name and version fields in setup.py; the package is installed into site-packages of the current python environment. Running **pip list** afterwards shows the installed package and its version. Once installed, any python file run in this environment (here, cpp_extension) can import the interface functions from the orbbec package, e.g. **`from orbbec.warpaffine import affine_opencv`** in scripts/test_warpaffine_opencv.py above.

*Figure: successful compilation and installation (docs/imgs/compile_win.png)*

*Figure: package info after a successful install (docs/imgs/安装成功.png)*
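For reference, the sketch below shows the general pattern that a setup.py for this kind of mixed Pytorch/pybind11 extension follows (this repo's actual setup.py is more elaborate). The helpers `CUDAExtension` and `BuildExtension` come from `torch.utils.cpp_extension`; the module name, source list and OpenCV include path below are illustrative, while the `WITH_CUDA` macro matches the `#ifdef WITH_CUDA` guard shown above:

```python
# Sketch only: a trimmed-down setup.py for a torch C++/CUDA extension.
from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

ext = CUDAExtension(
    name='orbbec.warpaffine.warpaffine_ext',          # import path of the built .pyd/.so (illustrative)
    sources=[
        'orbbec/warpaffine/src/warpaffine_ext.cpp',
        'orbbec/warpaffine/src/cpu/warpaffine_opencv.cpp',
        'orbbec/warpaffine/src/cpu/warpaffine_torch_v2.cpp',
        'orbbec/warpaffine/src/cuda/warpaffine_cuda.cpp',
        'orbbec/warpaffine/src/cuda/warpaffine_kernel.cu',
    ],
    include_dirs=['3rdparty/opencv/linux/include'],   # illustrative third-party header path
    define_macros=[('WITH_CUDA', None)],              # enables the GPU branch shown above
)

setup(
    name='orbbec',
    version='0.0.1',
    ext_modules=[ext],
    cmdclass={'build_ext': BuildExtension},           # picks the right nvcc/compiler flags
)
```

With such a file in place, `python setup.py develop` builds in place and `python setup.py install` produces the .egg, exactly as in the steps above.
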
After the build completes, you can run scripts/collect_env.py to print the versions of the relevant tools and other environment information:

```python
sys.platform : win32
Python : 3.6.13 |Anaconda, Inc.| (default, Mar 16 2021, 11:37:27) [MSC v.1916 64 bit (AMD64)]
CUDA available : True
CUDA_HOME : C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1
NVCC : Not Available
GPU 0 : NVIDIA GeForce GTX 1650
OpenCV : 3.4.0
PyTorch : 1.5.0
PyTorch compiling details : PyTorch built with:
  - C++ Version: 199711
  - MSVC 191627039
  - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191125 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v0.21.1 (Git Hash 7d2fd500bc78936d1d648ca713b901012f470dbc)
  - OpenMP 200203
  - CPU capability usage: AVX2
  - CUDA Runtime 10.1
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_37,code=compute_37
  - CuDNN 7.6.4
  - Magma 2.5.2
  - Build settings: BLAS=MKL, BUILD_TYPE=Release, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /w /EHa /bigobj -openmp -DNDEBUG -DUSE_FBGEMM, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, USE_CUDA=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=OFF, USE_NNPACK=OFF, USE_OPENMP=ON, USE_STATIC_DISPATCH=OFF,

TorchVision : 0.6.0
C/C++ Compiler : MSVC 191627045
CUDA Compiler : 10.1
```

Before running scripts/test_warpaffine_opencv.py, the opencv dynamic-library path still has to be configured, because the warpaffine_opencv.cpp sources use the opencv libraries. On Windows the configuration looks like this:

*Figure: OpenCV library path configuration (docs/imgs/windows_opencv_cfg.png)*
Linux needs the equivalent configuration:

```sh
root@aistation:/xxx/code/python_cpp_extension# export LD_LIBRARY_PATH=/xxx/code/python_cpp_extension/3rdparty/opencv/linux/lib
root@aistation:/xxx/code/python_cpp_extension# ldconfig
```

Alternatively, add the `export LD_LIBRARY_PATH=/...` line to ~/.bashrc and then run `source ~/.bashrc`. Editing the system-wide configuration file /etc/profile works the same way as editing .bashrc, but takes effect for all users.

When training a model on a server, for example Inspur's AIStation, the commands above can be written into a .sh script and training launched from the script directly:

```sh
# add dll path to env
export LD_LIBRARY_PATH=/jiaozhu01/code/insightface_ir_train
ldconfig

# run
cd /jiaozhu01/code/insightface_ir_train/
OMP_NUM_THREADS=4 torchrun --standalone --nproc_per_node=2 --master_addr="127.0.0.1" --master_port=12581 train.py configs/mbf200.py

# kill process
ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh
```

The Dependencies_x64_Release tool under tools/ (run DependenciesGui.exe) can be used to check whether all the dynamic libraries that the compiled .pyd file depends on are resolved, as shown below:

*Figure: checking the DLL dependencies of the compiled extension (docs/imgs/Dependencies_x64_Release.png)*
As the figure shows, the tool fails to locate python36.dll, c10.dll, torch_cpu.dll, torch_python.dll and c10_cuda.dll. This is expected: the python- and torch-related DLLs are loaded dynamically, i.e. writing **import torch** in a python file only loads the torch libraries at run time, so the Dependencies_x64_Release tool cannot fully verify the dependencies of the compiled warpaffine_ext.cp36-win_amd64.pyd. As for why warpaffine_ext.cp36-win_amd64.pyd depends on torch libraries at all: the source file orbbec/warpaffine/src/warpaffine_ext.cpp also implements the pytorch C++ extension, so it links against the torch- and cuda-related dynamic libraries; if that file implemented only a pure-python C++ extension, it would not depend on them.

After the configuration, the dynamic library that warpaffine_ext.cp36-win_amd64.pyd cannot **load dynamically** (opencv_world453.dll) still has to be placed in the same directory as scripts/test_warpaffine_opencv.py (likewise on Linux), as shown below:

*Figure: copy the DLL into the same directory as the test script (docs/imgs/动态库拷贝到python文件同路径.png)*
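As an aside, copying the DLL is not the only option. A hedged sketch, not used by this repo: on Python 3.8+ `os.add_dll_directory()` registers an extra DLL search path, and on the Python 3.6 pinned by this project, prepending the directory to `PATH` before the import has a similar effect. The OpenCV path below is illustrative:

```python
# Sketch: make opencv_world453.dll resolvable without copying it next to the script.
import os

opencv_bin = r"F:\code\python_cpp_extension\3rdparty\opencv\win\bin"  # illustrative path
if hasattr(os, "add_dll_directory"):             # Python 3.8+
    os.add_dll_directory(opencv_bin)
else:                                            # Python 3.6/3.7: fall back to PATH
    os.environ["PATH"] = opencv_bin + os.pathsep + os.environ["PATH"]

import torch                                     # torch DLLs are loaded dynamically here
from orbbec.warpaffine import affine_opencv      # the .pyd can now resolve OpenCV
```
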
One caveat: when building and installing inside docker, the generated python package (.egg) sometimes does not get installed into site-packages of the current python environment. That means `from orbbec.warpaffine import affine_opencv` fails, because `orbbec.warpaffine` is not on the python search path. There are two workarounds. One is to pass `--prefix='install path'` when running `python setup.py install`, which in my experience does not always work; the other is to add the path containing the `orbbec` folder to the python search path inside the python file:

```python
import os
import sys

import cv2
import torch  # do not remove: some of torch's shared libraries must be loaded dynamically.
import numpy as np

# add the two lines below; this assumes the orbbec folder sits in the parent
# directory of the directory containing this script.
_FILE_PATH = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, os.path.join(_FILE_PATH, "../"))

from orbbec.warpaffine import affine_opencv  # C++ interface
```

### C++/CUDA Extensions For Pytorch:

Pytorch's C++/CUDA extensions likewise build on pybind11. However, because Pytorch's fundamental data type is torch.Tensor, the extension must use the matching data type from libtorch (Pytorch's C++ distribution) so that arguments are passed correctly. A few frequently used libtorch libraries and namespaces are worth knowing.

Commonly used **namespaces**:
① at (ATen) declares and defines the tensor operations and is the namespace used most often;
② c10 is the foundation of ATen; it contains PyTorch's core abstractions and the actual implementations of the Tensor and Storage data structures;
③ tensors defined under the torch namespace add automatic differentiation on top of ATen.

Main components of Pytorch's aten directory:
① ATen (ATen core source files);
② TH (Torch tensor library);
③ THC (Torch CUDA tensor library);
④ THCUNN (Torch CUDA neural-network library);
⑤ THNN (Torch neural-network library).

C10 is short for Caffe Tensor Library. It holds the most fundamental tensor code, can run on both server and mobile, and one of its main goals is to unify the tensor-computation back ends of Pytorch and caffe2.
libtorch also contains the csrc module, which mainly handles the **mapping between the C++ and python APIs** (for example, pytorch's nn.Conv2d corresponds to at::conv2d in torch), plus autograd and the automatic-differentiation machinery. With this background, let us first look at the python test code (scripts/test_warpaffine_torch_cpu.py):

```python
import cv2
import torch
import numpy as np
from orbbec.warpaffine import affine_torch  # C++ interface

data_path = "demo.png"

img = cv2.imread(data_path)
# transform img (numpy.array) to tensor (torch.Tensor)
# use permute
img_tensor = torch.from_numpy(img / 255.0).permute(2, 0, 1).contiguous()
img_tensor = img_tensor.unsqueeze(0).float()

src_tensor = torch.tensor([[38.29, 51.69, 1.0], [73.53, 51.69, 1.0], [56.02, 71.73, 1.0]], dtype=torch.float32).unsqueeze(0)
dst_tensor = torch.tensor([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=torch.float32).unsqueeze(0)

# compute affine transform matrix
matrix_l = torch.transpose(src_tensor, 1, 2).bmm(src_tensor)
matrix_l = torch.inverse(matrix_l)
matrix_r = torch.transpose(src_tensor, 1, 2).bmm(dst_tensor)
affine_matrix = torch.transpose(matrix_l.bmm(matrix_r), 1, 2)

warpffine_img = affine_torch(img_tensor, affine_matrix, 112, 112)

warpffine_img = warpffine_img.squeeze(0).permute(1, 2, 0).numpy()
cv2.imwrite("torch_affine_cpu.png", np.uint8(warpffine_img * 255.0))
```

As the code shows, the python file calls affine_torch and passes in CPU tensors; the C++ entry point of affine_torch is in orbbec/warpaffine/src/warpaffine_ext.cpp:

```c++
#include <torch/extension.h>
#include <pybind11/numpy.h>

// declaration of the C++ extension for python
py::array_t<unsigned char> affine_opencv(py::array_t<unsigned char>& input,
                                         py::array_t<float>& from_point,
                                         py::array_t<float>& to_point);

// declaration of the Pytorch C++ extension (CPU)
at::Tensor affine_cpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w);

// declaration of the Pytorch CUDA extension (GPU)
#ifdef WITH_CUDA
at::Tensor affine_gpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w);
#endif

// wrap the Pytorch extension interface behind the WITH_CUDA macro
at::Tensor affine_torch(const at::Tensor& input,         /*[B, C, H, W]*/
                        const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                        const int out_h,
                        const int out_w)
{
    if (input.device().is_cuda())
    {
#ifdef WITH_CUDA
        return affine_gpu(input, affine_matrix, out_h, out_w);
#else
        AT_ERROR("affine is not compiled with GPU support");
#endif
    }
    return affine_cpu(input, affine_matrix, out_h, out_w);
}

// define the python/pytorch bindings with pybind11
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("affine_opencv", &affine_opencv, "affine with c++ opencv");
  m.def("affine_torch", &affine_torch, "affine with c++ libtorch");
}
```

The WITH_CUDA macro and the tensor's device type decide whether affine_torch ultimately runs affine_cpu or affine_gpu. Note also that **torch.Tensor on the python side corresponds to at::Tensor in libtorch**. Next, the implementation of affine_cpu (orbbec/warpaffine/src/cpu/warpaffine_torch_v2.cpp):

```c++
at::Tensor affine_cpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w)
{
    at::Tensor result;
    // AT_DISPATCH_FLOATING_TYPES: input.scalar_type() => scalar_t
    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "affine_cpu", [&] {
        result = affine_torch_cpu<scalar_t>(input, affine_matrix, out_h, out_w);
    });
    return result;
}
```

Digging one level deeper, affine_torch_cpu (same file):

```c++
template <typename scalar_t>
at::Tensor affine_torch_cpu(const at::Tensor& input,         /*[B, C, H, W]*/
                            const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                            const int out_h,
                            const int out_w)
{
    AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor");
    AT_ASSERTM(affine_matrix.device().is_cpu(), "affine_matrix must be a CPU tensor");

    auto matrix_ptr = affine_matrix.contiguous().data_ptr<scalar_t>();
    auto input_ptr = input.contiguous().data_ptr<scalar_t>();
    auto nimgs = input.size(0);
    auto img_c = input.size(1);
    auto img_h = input.size(2);
    auto img_w = input.size(3);
    auto in_img_size = img_c * img_h * img_w;
    auto out_img_size = img_c * out_h * out_w;

    // build dst tensor
    auto output_tensor = at::zeros({nimgs, img_c, out_h, out_w}, input.options());
    auto output_ptr = output_tensor.contiguous().data_ptr<scalar_t>();

    for (int i = 0; i < nimgs; i++)
    {
        scalar_t* matrix = matrix_ptr + i * 6;
        scalar_t* in = input_ptr + i * in_img_size;
        scalar_t* out = output_ptr + i * out_img_size;
        affine_cpu_kernel<scalar_t>(img_h, img_w, img_c, img_w * img_h,
                                    out_h, out_w, out_h * out_w, out, in, matrix, 0.0f);
    }

    return output_tensor;
}
```

One point deserves special attention: the **`.contiguous()`** calls on the tensors above before their data pointers are taken with **`data_ptr<scalar_t>()`**; Pytorch's official example code and related code in mmdetection/mmcv all recommend doing this first. The reason is that, in python as in C++, methods such as **permute(), transpose() and view()** return a new tensor that **shares its data storage** with the old one: the storage does not change, only a new view is returned. This avoids data copies, reduces memory consumption and to some extent speeds up training or inference, but it can leave the view non-contiguous in memory. `.contiguous()` copies the data into a compact layout when the tensor is not already contiguous (and is a no-op otherwise), so if `.contiguous()` has been applied on the python side, the C++ side does not need to apply it again.

*Figure: analysis of the permute operation (docs/imgs/permute.png)*
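The storage-sharing behaviour described above is easy to verify interactively; the snippet below relies only on standard torch semantics:

```python
import torch

img = torch.arange(24.).reshape(2, 3, 4)   # contiguous, strides (12, 4, 1)
view = img.permute(2, 0, 1)                # same storage, only strides change
print(view.is_contiguous())                # False: just a new view
print(view.data_ptr() == img.data_ptr())   # True: no copy happened
comp = view.contiguous()                   # copies into a compact layout
print(comp.is_contiguous())                # True
print(comp.data_ptr() == img.data_ptr())   # False: a real copy was made
```
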
Next, the pytorch CUDA extension. The test file test_warpaffine_torch_gpu.py first:

```python
import cv2
import torch
import numpy as np
from orbbec.warpaffine import affine_torch  # CUDA interface

data_path = "demo.png"

img = cv2.imread(data_path)
# transform img (numpy.array) to tensor (torch.Tensor)
# use permute
img_tensor = torch.from_numpy(img / 255.0).permute(2, 0, 1).contiguous()
img_tensor = img_tensor.unsqueeze(0).float()
img_tensor = img_tensor.cuda()  # gpu tensor

# dst -> src
src_tensor = torch.tensor([[38.29, 51.69, 1.0], [73.53, 51.69, 1.0], [56.02, 71.73, 1.0]], dtype=torch.float32).unsqueeze(0)
dst_tensor = torch.tensor([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=torch.float32).unsqueeze(0)
src_tensor = src_tensor.cuda()  # gpu tensor
dst_tensor = dst_tensor.cuda()  # gpu tensor

# compute affine transform matrix
matrix_l = torch.transpose(src_tensor, 1, 2).bmm(src_tensor)
matrix_l = torch.inverse(matrix_l)
matrix_r = torch.transpose(src_tensor, 1, 2).bmm(dst_tensor)
affine_matrix = torch.transpose(matrix_l.bmm(matrix_r), 1, 2)
affine_matrix = affine_matrix.contiguous().cuda()  # gpu tensor

warpffine_img = affine_torch(img_tensor, affine_matrix, 112, 112)
warpffine_img = warpffine_img.cpu().squeeze(0).permute(1, 2, 0).numpy()
cv2.imwrite("torch_affine_gpu.png", np.uint8(warpffine_img * 255.0))
```

As the script shows, affine_torch receives GPU tensors, and the underlying computation runs on the GPU. Following the CUDA path of affine_torch() in orbbec/warpaffine/src/warpaffine_ext.cpp, the call ends up in affine_gpu():

```c++
at::Tensor affine_gpu(const at::Tensor& input,         /*[B, C, H, W]*/
                      const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                      const int out_h,
                      const int out_w)
{
    CHECK_INPUT(input);
    CHECK_INPUT(affine_matrix);

    // Ensure CUDA uses the input tensor device.
    at::DeviceGuard guard(input.device());

    return affine_cuda_forward(input, affine_matrix, out_h, out_w);
}
```

which finally executes affine_cuda_forward():

```c++
at::Tensor affine_cuda_forward(const at::Tensor& input,         /*[B, C, H, W]*/
                               const at::Tensor& affine_matrix, /*[B, 2, 3]*/
                               const int out_h,
                               const int out_w)
{
    // build dst tensor
    auto nimgs = input.size(0);
    auto img_c = input.size(1);
    auto img_h = input.size(2);
    auto img_w = input.size(3);
    const int output_size = nimgs * img_c * out_h * out_w;
    auto output_tensor = at::zeros({nimgs, img_c, out_h, out_w}, input.options());

    AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "affine_cuda", [&] {
        auto matrix_ptr = affine_matrix.data_ptr<scalar_t>();
        auto input_ptr = input.data_ptr<scalar_t>();
        auto output_ptr = output_tensor.data_ptr<scalar_t>();

        // launch kernel function on GPU with CUDA.
        affine_gpu_kernel<scalar_t><<<grid_size, block_size>>>(output_size, img_h,
            img_w, img_c, out_h, out_w, output_ptr, input_ptr, matrix_ptr, 0.0f);
    });

    return output_tensor;
}
```

After grid_size and block_size are configured, the kernel **affine_gpu_kernel** is launched; the kernel itself involves a fair amount of CUDA knowledge and is not expanded on here. It finally returns the GPU tensor output_tensor to the python interface.

## GPU-Accelerated Augmentation:

Once pytorch's C++/CUDA extension mechanism is mastered, we can obtain the same kind of speed-up that NVIDIA's DALI library provides: however complex the data augmentation, it can be accelerated to some degree with the approach above. Pseudocode (assuming the build and install steps are already done):

```python
for _, (img, local_labels) in enumerate(train_loader):
    global_step += 1
    # this assumes train_loader yields GPU tensors; CPU tensors must first be
    # moved to the GPU whose index is local_rank.
    # local_rank = torch.distributed.get_rank()
    # ================== add data augmentation (illustration only) =================
    batch = img.shape[0]  # get batchsize
    device = img.device   # get local_rank
    src_tensor = torch.tensor([[38.29, 51.69, 1.0], [73.53, 51.69, 1.0], [56.02, 71.73, 1.0]], dtype=torch.float32).unsqueeze(0)
    dst_tensor = torch.tensor([[42.0, 52.0], [78.0, 55.0], [58.0, 74.0]], dtype=torch.float32).unsqueeze(0)
    src_tensor = src_tensor.repeat(batch, 1, 1)
    dst_tensor = dst_tensor.repeat(batch, 1, 1)
    # compute affine transform matrix
    matrix_l = torch.transpose(src_tensor, 1, 2).bmm(src_tensor)
    matrix_l = torch.inverse(matrix_l)
    matrix_r = torch.transpose(src_tensor, 1, 2).bmm(dst_tensor)
    affine_matrix = torch.transpose(matrix_l.bmm(matrix_r), 1, 2)
    affine_matrix = affine_matrix.contiguous().to(device)  # .contiguous() done in python, so the CUDA extension need not repeat it.
    img = affine_torch(img, affine_matrix, 112, 112)       # augmentation on the gpu
    # ==============================================================================
    local_embeddings = backbone(img)
    loss: torch.Tensor = module_partial_fc(local_embeddings, local_labels, opt)
```

**Note:** after this project is built, the orbbec folder can be copied straight into your own training project; then, in the python file that calls the extension functions (train.py in the example above), add the path containing the orbbec folder to the python search path as described earlier, and the extension functions (e.g. affine_torch) can be called normally.

## Reference:

[^1]: https://pytorch.org/tutorials/advanced/cpp_extension.html
[^2]: https://github.com/NVIDIA/DALI
[^3]: https://github.com/open-mmlab/mmdetection/tree/v2.0.0
[^4]: https://github.com/open-mmlab/mmcv
[^5]: https://github.com/openppl-public/ppl.cv
[^6]: https://github.com/pytorch/extension-cpp
[^7]: https://setuptools.pypa.io/en/latest/references/keywords.html
[^8]: https://www.bbsmax.com/A/MAzAwZAo59/
[^9]: https://zhuanlan.zhihu.com/p/419076427
[^10]: https://zhuanlan.zhihu.com/p/348555597
[^11]: https://oldpan.me/archives/pytorch-cuda-c-plus-plus
[^12]: https://docs.python.org/zh-cn/3/extending/building.html
[^13]: https://zhuanlan.zhihu.com/p/276461821
[^14]: https://blog.51cto.com/u_15357586/3784891
[^15]: https://zhuanlan.zhihu.com/p/383572973

--------------------------------------------------------------------------------
/docs/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/README.pdf
--------------------------------------------------------------------------------
/docs/files/Custom C++ and CUDA Extensions — PyTorch Tutorials 1.12.1+cu102 documentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/Custom C++ and CUDA Extensions — PyTorch Tutorials 1.12.1+cu102 documentation.pdf -------------------------------------------------------------------------------- /docs/files/Keywords - setuptools 65.4.1.post20221009 documentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/Keywords - setuptools 65.4.1.post20221009 documentation.pdf -------------------------------------------------------------------------------- /docs/files/PyTorch 源码解读之 cpp_extension:揭秘 C++_CUDA 算子实现和调用全流程 - 知乎.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/PyTorch 源码解读之 cpp_extension:揭秘 C++_CUDA 算子实现和调用全流程 - 知乎.pdf -------------------------------------------------------------------------------- /docs/files/Python中使用C++_CUDA|以PointNet中的ball query 为例 - 知乎.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/Python中使用C++_CUDA|以PointNet中的ball query 为例 - 知乎.pdf -------------------------------------------------------------------------------- /docs/files/Pytorch拓展进阶(二):Pytorch结合C++以及Cuda拓展 - Oldpan的个人博客.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/Pytorch拓展进阶(二):Pytorch结合C++以及Cuda拓展 - Oldpan的个人博客.pdf -------------------------------------------------------------------------------- /docs/files/mmdetection源码剖析(1)--NMS.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/mmdetection源码剖析(1)--NMS.pdf -------------------------------------------------------------------------------- /docs/files/setup.py实现C++扩展和python库编译.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/files/setup.py实现C++扩展和python库编译.pdf -------------------------------------------------------------------------------- /docs/imgs/Dependencies_x64_Release.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/Dependencies_x64_Release.png -------------------------------------------------------------------------------- /docs/imgs/compile_win.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/compile_win.png -------------------------------------------------------------------------------- /docs/imgs/dali.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/dali.png 
-------------------------------------------------------------------------------- /docs/imgs/pbind11_pipeline.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/pbind11_pipeline.pptx -------------------------------------------------------------------------------- /docs/imgs/permute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/permute.png -------------------------------------------------------------------------------- /docs/imgs/pybind11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/pybind11.png -------------------------------------------------------------------------------- /docs/imgs/pybind11_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/pybind11_pipeline.png -------------------------------------------------------------------------------- /docs/imgs/windows_opencv_cfg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/windows_opencv_cfg.png -------------------------------------------------------------------------------- /docs/imgs/动态库拷贝到python文件同路径.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/动态库拷贝到python文件同路径.png -------------------------------------------------------------------------------- /docs/imgs/安装成功.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/docs/imgs/安装成功.png -------------------------------------------------------------------------------- /mmcv_setup.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import platform 4 | import re 5 | import warnings 6 | from pkg_resources import DistributionNotFound, get_distribution 7 | from setuptools import find_packages, setup 8 | 9 | EXT_TYPE = '' 10 | try: 11 | import torch 12 | if torch.__version__ == 'parrots': 13 | from parrots.utils.build_extension import BuildExtension 14 | EXT_TYPE = 'parrots' 15 | elif (hasattr(torch, 'is_mlu_available') and torch.is_mlu_available()) or \ 16 | os.getenv('FORCE_MLU', '0') == '1': 17 | from torch_mlu.utils.cpp_extension import BuildExtension 18 | EXT_TYPE = 'pytorch' 19 | else: 20 | from torch.utils.cpp_extension import BuildExtension 21 | EXT_TYPE = 'pytorch' 22 | cmd_class = {'build_ext': BuildExtension} 23 | except ModuleNotFoundError: 24 | cmd_class = {} 25 | print('Skip building ext ops due to the absence of torch.') 26 | 27 | 28 | def choose_requirement(primary, secondary): 29 | """If some version of primary requirement installed, return primary, else 30 | return secondary.""" 31 | try: 32 | name = re.split(r'[!<>=]', primary)[0] 33 | get_distribution(name) 34 | except 
DistributionNotFound: 35 | return secondary 36 | 37 | return str(primary) 38 | 39 | 40 | def get_version(): 41 | version_file = 'mmcv/version.py' 42 | with open(version_file, encoding='utf-8') as f: 43 | exec(compile(f.read(), version_file, 'exec')) 44 | return locals()['__version__'] 45 | 46 | 47 | def parse_requirements(fname='requirements/runtime.txt', with_version=True): 48 | """Parse the package dependencies listed in a requirements file but strips 49 | specific versioning information. 50 | 51 | Args: 52 | fname (str): path to requirements file 53 | with_version (bool, default=False): if True include version specs 54 | 55 | Returns: 56 | List[str]: list of requirements items 57 | 58 | CommandLine: 59 | python -c "import setup; print(setup.parse_requirements())" 60 | """ 61 | import sys 62 | from os.path import exists 63 | require_fpath = fname 64 | 65 | def parse_line(line): 66 | """Parse information from a line in a requirements text file.""" 67 | if line.startswith('-r '): 68 | # Allow specifying requirements in other files 69 | target = line.split(' ')[1] 70 | for info in parse_require_file(target): 71 | yield info 72 | else: 73 | info = {'line': line} 74 | if line.startswith('-e '): 75 | info['package'] = line.split('#egg=')[1] 76 | else: 77 | # Remove versioning from the package 78 | pat = '(' + '|'.join(['>=', '==', '>']) + ')' 79 | parts = re.split(pat, line, maxsplit=1) 80 | parts = [p.strip() for p in parts] 81 | 82 | info['package'] = parts[0] 83 | if len(parts) > 1: 84 | op, rest = parts[1:] 85 | if ';' in rest: 86 | # Handle platform specific dependencies 87 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies 88 | version, platform_deps = map(str.strip, 89 | rest.split(';')) 90 | info['platform_deps'] = platform_deps 91 | else: 92 | version = rest # NOQA 93 | info['version'] = (op, version) 94 | yield info 95 | 96 | def parse_require_file(fpath): 97 | with open(fpath) as f: 98 | for line in f.readlines(): 99 | line = line.strip() 100 | if line and not line.startswith('#'): 101 | yield from parse_line(line) 102 | 103 | def gen_packages_items(): 104 | if exists(require_fpath): 105 | for info in parse_require_file(require_fpath): 106 | parts = [info['package']] 107 | if with_version and 'version' in info: 108 | parts.extend(info['version']) 109 | if not sys.version.startswith('3.4'): 110 | # apparently package_deps are broken in 3.4 111 | platform_deps = info.get('platform_deps') 112 | if platform_deps is not None: 113 | parts.append(';' + platform_deps) 114 | item = ''.join(parts) 115 | yield item 116 | 117 | packages = list(gen_packages_items()) 118 | return packages 119 | 120 | 121 | install_requires = parse_requirements() 122 | 123 | try: 124 | # OpenCV installed via conda. 
125 | import cv2 # NOQA: F401 126 | major, minor, *rest = cv2.__version__.split('.') 127 | if int(major) < 3: 128 | raise RuntimeError( 129 | f'OpenCV >=3 is required but {cv2.__version__} is installed') 130 | except ImportError: 131 | # If first not installed install second package 132 | CHOOSE_INSTALL_REQUIRES = [('opencv-python-headless>=3', 133 | 'opencv-python>=3')] 134 | for main, secondary in CHOOSE_INSTALL_REQUIRES: 135 | install_requires.append(choose_requirement(main, secondary)) 136 | 137 | 138 | def get_extensions(): 139 | extensions = [] 140 | 141 | if os.getenv('MMCV_WITH_TRT', '0') != '0': 142 | 143 | # Following strings of text style are from colorama package 144 | bright_style, reset_style = '\x1b[1m', '\x1b[0m' 145 | red_text, blue_text = '\x1b[31m', '\x1b[34m' 146 | white_background = '\x1b[107m' 147 | 148 | msg = white_background + bright_style + red_text 149 | msg += 'DeprecationWarning: ' + \ 150 | 'Custom TensorRT Ops will be deprecated in future. ' 151 | msg += blue_text + \ 152 | 'Welcome to use the unified model deployment toolbox ' 153 | msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' 154 | msg += reset_style 155 | warnings.warn(msg) 156 | 157 | ext_name = 'mmcv._ext_trt' 158 | from torch.utils.cpp_extension import include_paths, library_paths 159 | library_dirs = [] 160 | libraries = [] 161 | include_dirs = [] 162 | tensorrt_path = os.getenv('TENSORRT_DIR', '0') 163 | tensorrt_lib_path = glob.glob( 164 | os.path.join(tensorrt_path, 'targets', '*', 'lib'))[0] 165 | library_dirs += [tensorrt_lib_path] 166 | libraries += ['nvinfer', 'nvparsers', 'nvinfer_plugin'] 167 | libraries += ['cudart'] 168 | define_macros = [] 169 | extra_compile_args = {'cxx': []} 170 | 171 | include_path = os.path.abspath('./mmcv/ops/csrc/common/cuda') 172 | include_trt_path = os.path.abspath('./mmcv/ops/csrc/tensorrt') 173 | include_dirs.append(include_path) 174 | include_dirs.append(include_trt_path) 175 | include_dirs.append(os.path.join(tensorrt_path, 'include')) 176 | include_dirs += include_paths(cuda=True) 177 | 178 | op_files = glob.glob('./mmcv/ops/csrc/tensorrt/plugins/*') 179 | define_macros += [('MMCV_WITH_CUDA', None)] 180 | define_macros += [('MMCV_WITH_TRT', None)] 181 | cuda_args = os.getenv('MMCV_CUDA_ARGS') 182 | extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] 183 | # prevent cub/thrust conflict with other python library 184 | # More context See issues #1454 185 | extra_compile_args['nvcc'] += ['-Xcompiler=-fno-gnu-unique'] 186 | library_dirs += library_paths(cuda=True) 187 | 188 | from setuptools import Extension 189 | ext_ops = Extension( 190 | name=ext_name, 191 | sources=op_files, 192 | include_dirs=include_dirs, 193 | define_macros=define_macros, 194 | extra_compile_args=extra_compile_args, 195 | language='c++', 196 | library_dirs=library_dirs, 197 | libraries=libraries) 198 | extensions.append(ext_ops) 199 | 200 | if os.getenv('MMCV_WITH_OPS', '0') == '0': 201 | return extensions 202 | 203 | if EXT_TYPE == 'parrots': 204 | ext_name = 'mmcv._ext' 205 | from parrots.utils.build_extension import Extension 206 | 207 | # new parrots op impl do not use MMCV_USE_PARROTS 208 | # define_macros = [('MMCV_USE_PARROTS', None)] 209 | define_macros = [] 210 | include_dirs = [] 211 | op_files = glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') +\ 212 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') +\ 213 | glob.glob('./mmcv/ops/csrc/parrots/*.cpp') 214 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 215 | 
include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda')) 216 | cuda_args = os.getenv('MMCV_CUDA_ARGS') 217 | extra_compile_args = { 218 | 'nvcc': [cuda_args, '-std=c++14'] if cuda_args else ['-std=c++14'], 219 | 'cxx': ['-std=c++14'], 220 | } 221 | if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': 222 | define_macros += [('MMCV_WITH_CUDA', None)] 223 | extra_compile_args['nvcc'] += [ 224 | '-D__CUDA_NO_HALF_OPERATORS__', 225 | '-D__CUDA_NO_HALF_CONVERSIONS__', 226 | '-D__CUDA_NO_HALF2_OPERATORS__', 227 | ] 228 | ext_ops = Extension( 229 | name=ext_name, 230 | sources=op_files, 231 | include_dirs=include_dirs, 232 | define_macros=define_macros, 233 | extra_compile_args=extra_compile_args, 234 | cuda=True, 235 | pytorch=True) 236 | extensions.append(ext_ops) 237 | elif EXT_TYPE == 'pytorch': 238 | ext_name = 'mmcv._ext' 239 | from torch.utils.cpp_extension import CppExtension, CUDAExtension 240 | 241 | # prevent ninja from using too many resources 242 | try: 243 | import psutil 244 | num_cpu = len(psutil.Process().cpu_affinity()) 245 | cpu_use = max(4, num_cpu - 1) 246 | except (ModuleNotFoundError, AttributeError): 247 | cpu_use = 4 248 | 249 | os.environ.setdefault('MAX_JOBS', str(cpu_use)) 250 | define_macros = [] 251 | 252 | # Before PyTorch1.8.0, when compiling CUDA code, `cxx` is a 253 | # required key passed to PyTorch. Even if there is no flag passed 254 | # to cxx, users also need to pass an empty list to PyTorch. 255 | # Since PyTorch1.8.0, it has a default value so users do not need 256 | # to pass an empty list anymore. 257 | # More details at https://github.com/pytorch/pytorch/pull/45956 258 | extra_compile_args = {'cxx': []} 259 | 260 | # Since the PR (https://github.com/open-mmlab/mmcv/pull/1463) uses 261 | # c++14 features, the argument ['std=c++14'] must be added here. 262 | # However, in the windows environment, some standard libraries 263 | # will depend on c++17 or higher. 
In fact, for the windows 264 | # environment, the compiler will choose the appropriate compiler 265 | # to compile those cpp files, so there is no need to add the 266 | # argument 267 | if platform.system() != 'Windows': 268 | extra_compile_args['cxx'] = ['-std=c++14'] 269 | 270 | include_dirs = [] 271 | 272 | is_rocm_pytorch = False 273 | try: 274 | from torch.utils.cpp_extension import ROCM_HOME 275 | is_rocm_pytorch = True if ((torch.version.hip is not None) and 276 | (ROCM_HOME is not None)) else False 277 | except ImportError: 278 | pass 279 | 280 | if is_rocm_pytorch or torch.cuda.is_available() or os.getenv( 281 | 'FORCE_CUDA', '0') == '1': 282 | if is_rocm_pytorch: 283 | define_macros += [('MMCV_WITH_HIP', None)] 284 | define_macros += [('MMCV_WITH_CUDA', None)] 285 | cuda_args = os.getenv('MMCV_CUDA_ARGS') 286 | extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] 287 | op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ 288 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ 289 | glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cu') + \ 290 | glob.glob('./mmcv/ops/csrc/pytorch/cuda/*.cpp') 291 | extension = CUDAExtension 292 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/pytorch')) 293 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 294 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/cuda')) 295 | elif (hasattr(torch, 'is_mlu_available') and 296 | torch.is_mlu_available()) or \ 297 | os.getenv('FORCE_MLU', '0') == '1': 298 | from torch_mlu.utils.cpp_extension import MLUExtension 299 | define_macros += [('MMCV_WITH_MLU', None)] 300 | mlu_args = os.getenv('MMCV_MLU_ARGS') 301 | extra_compile_args['cncc'] = [mlu_args] if mlu_args else [] 302 | op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ 303 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ 304 | glob.glob('./mmcv/ops/csrc/pytorch/mlu/*.cpp') + \ 305 | glob.glob('./mmcv/ops/csrc/common/mlu/*.mlu') 306 | extension = MLUExtension 307 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 308 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mlu')) 309 | elif (hasattr(torch.backends, 'mps') 310 | and torch.backends.mps.is_available()) or os.getenv( 311 | 'FORCE_MPS', '0') == '1': 312 | # objc compiler support 313 | from distutils.unixccompiler import UnixCCompiler 314 | if '.mm' not in UnixCCompiler.src_extensions: 315 | UnixCCompiler.src_extensions.append('.mm') 316 | UnixCCompiler.language_map['.mm'] = 'objc' 317 | 318 | define_macros += [('MMCV_WITH_MPS', None)] 319 | extra_compile_args = {} 320 | extra_compile_args['cxx'] = ['-Wall', '-std=c++17'] 321 | extra_compile_args['cxx'] += [ 322 | '-framework', 'Metal', '-framework', 'Foundation' 323 | ] 324 | extra_compile_args['cxx'] += ['-ObjC++'] 325 | # src 326 | op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ 327 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ 328 | glob.glob('./mmcv/ops/csrc/common/mps/*.mm') + \ 329 | glob.glob('./mmcv/ops/csrc/pytorch/mps/*.mm') 330 | extension = CppExtension 331 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 332 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/mps')) 333 | elif (os.getenv('FORCE_NPU', '0') == '1'): 334 | print(f'Compiling {ext_name} only with CPU and NPU') 335 | try: 336 | from torch_npu.utils.cpp_extension import NpuExtension 337 | define_macros += [('MMCV_WITH_NPU', None)] 338 | extension = NpuExtension 339 | except Exception: 340 | raise ImportError('can not find any torch_npu') 341 | # src 342 | 
op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ 343 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') + \ 344 | glob.glob('./mmcv/ops/csrc/common/npu/*.cpp') + \ 345 | glob.glob('./mmcv/ops/csrc/pytorch/npu/*.cpp') 346 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 347 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common/npu')) 348 | else: 349 | print(f'Compiling {ext_name} only with CPU') 350 | op_files = glob.glob('./mmcv/ops/csrc/pytorch/*.cpp') + \ 351 | glob.glob('./mmcv/ops/csrc/pytorch/cpu/*.cpp') 352 | extension = CppExtension 353 | include_dirs.append(os.path.abspath('./mmcv/ops/csrc/common')) 354 | 355 | # Since the PR (https://github.com/open-mmlab/mmcv/pull/1463) uses 356 | # c++14 features, the argument ['std=c++14'] must be added here. 357 | # However, in the windows environment, some standard libraries 358 | # will depend on c++17 or higher. In fact, for the windows 359 | # environment, the compiler will choose the appropriate compiler 360 | # to compile those cpp files, so there is no need to add the 361 | # argument 362 | if 'nvcc' in extra_compile_args and platform.system() != 'Windows': 363 | extra_compile_args['nvcc'] += ['-std=c++14'] 364 | 365 | ext_ops = extension( 366 | name=ext_name, 367 | sources=op_files, 368 | include_dirs=include_dirs, 369 | define_macros=define_macros, 370 | extra_compile_args=extra_compile_args) 371 | extensions.append(ext_ops) 372 | 373 | if EXT_TYPE == 'pytorch' and os.getenv('MMCV_WITH_ORT', '0') != '0': 374 | 375 | # Following strings of text style are from colorama package 376 | bright_style, reset_style = '\x1b[1m', '\x1b[0m' 377 | red_text, blue_text = '\x1b[31m', '\x1b[34m' 378 | white_background = '\x1b[107m' 379 | 380 | msg = white_background + bright_style + red_text 381 | msg += 'DeprecationWarning: ' + \ 382 | 'Custom ONNXRuntime Ops will be deprecated in future. 
' 383 | msg += blue_text + \ 384 | 'Welcome to use the unified model deployment toolbox ' 385 | msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' 386 | msg += reset_style 387 | warnings.warn(msg) 388 | ext_name = 'mmcv._ext_ort' 389 | import onnxruntime 390 | from torch.utils.cpp_extension import include_paths, library_paths 391 | library_dirs = [] 392 | libraries = [] 393 | include_dirs = [] 394 | ort_path = os.getenv('ONNXRUNTIME_DIR', '0') 395 | library_dirs += [os.path.join(ort_path, 'lib')] 396 | libraries.append('onnxruntime') 397 | define_macros = [] 398 | extra_compile_args = {'cxx': []} 399 | 400 | include_path = os.path.abspath('./mmcv/ops/csrc/onnxruntime') 401 | include_dirs.append(include_path) 402 | include_dirs.append(os.path.join(ort_path, 'include')) 403 | 404 | op_files = glob.glob('./mmcv/ops/csrc/onnxruntime/cpu/*') 405 | if onnxruntime.get_device() == 'GPU' or os.getenv('FORCE_CUDA', 406 | '0') == '1': 407 | define_macros += [('MMCV_WITH_CUDA', None)] 408 | cuda_args = os.getenv('MMCV_CUDA_ARGS') 409 | extra_compile_args['nvcc'] = [cuda_args] if cuda_args else [] 410 | op_files += glob.glob('./mmcv/ops/csrc/onnxruntime/gpu/*') 411 | include_dirs += include_paths(cuda=True) 412 | library_dirs += library_paths(cuda=True) 413 | else: 414 | include_dirs += include_paths(cuda=False) 415 | library_dirs += library_paths(cuda=False) 416 | 417 | from setuptools import Extension 418 | ext_ops = Extension( 419 | name=ext_name, 420 | sources=op_files, 421 | include_dirs=include_dirs, 422 | define_macros=define_macros, 423 | extra_compile_args=extra_compile_args, 424 | language='c++', 425 | library_dirs=library_dirs, 426 | libraries=libraries) 427 | extensions.append(ext_ops) 428 | 429 | return extensions 430 | 431 | 432 | setup( 433 | name='mmcv' if os.getenv('MMCV_WITH_OPS', '0') == '0' else 'mmcv-full', 434 | version=get_version(), 435 | description='OpenMMLab Computer Vision Foundation', 436 | keywords='computer vision', 437 | packages=find_packages(), 438 | include_package_data=True, 439 | classifiers=[ 440 | 'Development Status :: 4 - Beta', 441 | 'License :: OSI Approved :: Apache Software License', 442 | 'Operating System :: OS Independent', 443 | 'Programming Language :: Python :: 3', 444 | 'Programming Language :: Python :: 3.6', 445 | 'Programming Language :: Python :: 3.7', 446 | 'Programming Language :: Python :: 3.8', 447 | 'Programming Language :: Python :: 3.9', 448 | 'Programming Language :: Python :: 3.10', 449 | 'Topic :: Utilities', 450 | ], 451 | url='https://github.com/open-mmlab/mmcv', 452 | author='MMCV Contributors', 453 | author_email='openmmlab@gmail.com', 454 | install_requires=install_requires, 455 | extras_require={ 456 | 'all': parse_requirements('requirements.txt'), 457 | 'tests': parse_requirements('requirements/test.txt'), 458 | 'build': parse_requirements('requirements/build.txt'), 459 | 'optional': parse_requirements('requirements/optional.txt'), 460 | }, 461 | ext_modules=get_extensions(), 462 | cmdclass=cmd_class, 463 | zip_safe=False) 464 | -------------------------------------------------------------------------------- /orbbec/__init__.py: -------------------------------------------------------------------------------- 1 | from .warpaffine import affine_opencv, affine_torch 2 | from .nms import batched_nms, nms, soft_nms 3 | from .roi_align import RoIAlign, roi_align 4 | from .utils import get_compiler_version, get_compiling_cuda_version 5 | 6 | __all__ = [ 7 | 'nms', 'soft_nms', 'batched_nms', 8 | 'RoIAlign', 'roi_align', 9 | 
'get_compiler_version', 'get_compiling_cuda_version', 10 | 'affine_opencv','affine_torch' 11 | ] 12 | -------------------------------------------------------------------------------- /orbbec/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from .nms_wrapper import batched_nms, nms, soft_nms 2 | 3 | __all__ = ['nms', 'soft_nms', 'batched_nms'] 4 | -------------------------------------------------------------------------------- /orbbec/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | from re import I 2 | import numpy as np 3 | import torch 4 | 5 | from . import nms_ext 6 | 7 | 8 | def nms(dets, iou_thr, device_id=None): 9 | """Dispatch to either CPU or GPU NMS implementations. 10 | 11 | The input can be either a torch tensor or numpy array. GPU NMS will be used 12 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 13 | will be used. The returned type will always be the same as inputs. 14 | 15 | Arguments: 16 | dets (torch.Tensor or np.ndarray): bboxes with scores. 17 | iou_thr (float): IoU threshold for NMS. 18 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 19 | is None, then cpu nms is used, otherwise gpu_nms will be used. 20 | 21 | Returns: 22 | tuple: kept bboxes and indice, which is always the same data type as 23 | the input. 24 | 25 | Example: 26 | >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], 27 | >>> [49.3, 32.9, 51.0, 35.3, 0.9], 28 | >>> [49.2, 31.8, 51.0, 35.4, 0.5], 29 | >>> [35.1, 11.5, 39.1, 15.7, 0.5], 30 | >>> [35.6, 11.8, 39.3, 14.2, 0.5], 31 | >>> [35.3, 11.5, 39.9, 14.5, 0.4], 32 | >>> [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32) 33 | >>> iou_thr = 0.6 34 | >>> suppressed, inds = nms(dets, iou_thr) 35 | >>> assert len(inds) == len(suppressed) == 3 36 | """ 37 | # convert dets (tensor or numpy array) to tensor 38 | if isinstance(dets, torch.Tensor): 39 | is_numpy = False 40 | dets_th = dets 41 | elif isinstance(dets, np.ndarray): 42 | is_numpy = True 43 | device = 'cpu' if device_id is None else f'cuda:{device_id}' 44 | dets_th = torch.from_numpy(dets).to(device) 45 | else: 46 | raise TypeError('dets must be either a Tensor or numpy array, ' 47 | f'but got {type(dets)}') 48 | 49 | # execute cpu or cuda nms 50 | if dets_th.shape[0] == 0: 51 | inds = dets_th.new_zeros(0, dtype=torch.long) 52 | else: 53 | if dets_th.is_cuda: 54 | inds = nms_ext.nms(dets_th, iou_thr) 55 | else: 56 | inds = nms_ext.nms(dets_th, iou_thr) 57 | 58 | if is_numpy: 59 | inds = inds.cpu().numpy() 60 | return dets[inds, :], inds 61 | 62 | 63 | def soft_nms(dets, iou_thr, method='linear', sigma=0.5, min_score=1e-3): 64 | """Dispatch to only CPU Soft NMS implementations. 65 | 66 | The input can be either a torch tensor or numpy array. 67 | The returned type will always be the same as inputs. 68 | 69 | Arguments: 70 | dets (torch.Tensor or np.ndarray): bboxes with scores. 71 | iou_thr (float): IoU threshold for Soft NMS. 72 | method (str): either 'linear' or 'gaussian' 73 | sigma (float): hyperparameter for gaussian method 74 | min_score (float): score filter threshold 75 | 76 | Returns: 77 | tuple: new det bboxes and indice, which is always the same 78 | data type as the input. 
79 | 80 | Example: 81 | >>> dets = np.array([[4., 3., 5., 3., 0.9], 82 | >>> [4., 3., 5., 4., 0.9], 83 | >>> [3., 1., 3., 1., 0.5], 84 | >>> [3., 1., 3., 1., 0.5], 85 | >>> [3., 1., 3., 1., 0.4], 86 | >>> [3., 1., 3., 1., 0.0]], dtype=np.float32) 87 | >>> iou_thr = 0.6 88 | >>> new_dets, inds = soft_nms(dets, iou_thr, sigma=0.5) 89 | >>> assert len(inds) == len(new_dets) == 5 90 | """ 91 | # convert dets (tensor or numpy array) to tensor 92 | if isinstance(dets, torch.Tensor): 93 | is_tensor = True 94 | dets_t = dets.detach().cpu() 95 | elif isinstance(dets, np.ndarray): 96 | is_tensor = False 97 | dets_t = torch.from_numpy(dets) 98 | else: 99 | raise TypeError('dets must be either a Tensor or numpy array, ' 100 | f'but got {type(dets)}') 101 | 102 | method_codes = {'linear': 1, 'gaussian': 2} 103 | if method not in method_codes: 104 | raise ValueError(f'Invalid method for SoftNMS: {method}') 105 | results = nms_ext.soft_nms(dets_t, iou_thr, method_codes[method], sigma, 106 | min_score) 107 | 108 | new_dets = results[:, :5] 109 | inds = results[:, 5] 110 | 111 | if is_tensor: 112 | return new_dets.to( 113 | device=dets.device, dtype=dets.dtype), inds.to( 114 | device=dets.device, dtype=torch.long) 115 | else: 116 | return new_dets.numpy().astype(dets.dtype), inds.numpy().astype( 117 | np.int64) 118 | 119 | 120 | def batched_nms(bboxes, scores, inds, nms_cfg): 121 | """Performs non-maximum suppression in a batched fashion. 122 | 123 | Modified from https://github.com/pytorch/vision/blob 124 | /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. 125 | In order to perform NMS independently per class, we add an offset to all 126 | the boxes. The offset is dependent only on the class idx, and is large 127 | enough so that boxes from different classes do not overlap. 128 | 129 | Arguments: 130 | bboxes (torch.Tensor): bboxes in shape (N, 4). 131 | scores (torch.Tensor): scores in shape (N, ). 132 | inds (torch.Tensor): each index value correspond to a bbox cluster, 133 | and NMS will not be applied between elements of different inds, 134 | shape (N, ). 135 | nms_cfg (dict): specify nms type and other parameters like iou_thr. 136 | 137 | Returns: 138 | tuple: kept bboxes and indice. 139 | """ 140 | max_coordinate = bboxes.max() 141 | offsets = inds.to(bboxes) * (max_coordinate + 1) 142 | bboxes_for_nms = bboxes + offsets[:, None] 143 | nms_cfg_ = nms_cfg.copy() 144 | nms_type = nms_cfg_.pop('type', 'nms') 145 | nms_op = eval(nms_type) 146 | dets, keep = nms_op( 147 | torch.cat([bboxes_for_nms, scores[:, None]], -1), **nms_cfg_) 148 | bboxes = bboxes[keep] 149 | scores = dets[:, -1] 150 | return torch.cat([bboxes, scores[:, None]], -1), keep 151 | -------------------------------------------------------------------------------- /orbbec/nms/src/cpu/nms_cpu.cpp: -------------------------------------------------------------------------------- 1 | // Soft-NMS is added by MMDetection. 2 | // Modified from 3 | // https://github.com/bharatsingh430/soft-nms/blob/master/lib/nms/cpu_nms.pyx. 4 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
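//
// What follows is classic greedy hard-NMS on the CPU: boxes are visited in
// descending score order, and each surviving box suppresses every remaining
// box whose IoU with it is >= threshold. The kernel returns the indices of
// the unsuppressed boxes (positions in the original `dets`); the double loop
// makes this O(N^2) in the number of boxes.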
5 | #include <torch/extension.h>
6 |
7 | template <typename scalar_t>
8 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
9 |   AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
10 |
11 |   if (dets.numel() == 0) {
12 |     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
13 |   }
14 |
15 |   auto x1_t = dets.select(1, 0).contiguous();
16 |   auto y1_t = dets.select(1, 1).contiguous();
17 |   auto x2_t = dets.select(1, 2).contiguous();
18 |   auto y2_t = dets.select(1, 3).contiguous();
19 |   auto scores = dets.select(1, 4).contiguous();
20 |
21 |   at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
22 |
23 |   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
24 |
25 |   auto ndets = dets.size(0);
26 |   at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
27 |
28 |   auto suppressed = suppressed_t.data_ptr<uint8_t>();
29 |   auto order = order_t.data_ptr<int64_t>();
30 |   auto x1 = x1_t.data_ptr<scalar_t>();
31 |   auto y1 = y1_t.data_ptr<scalar_t>();
32 |   auto x2 = x2_t.data_ptr<scalar_t>();
33 |   auto y2 = y2_t.data_ptr<scalar_t>();
34 |   auto areas = areas_t.data_ptr<scalar_t>();
35 |
36 |   for (int64_t _i = 0; _i < ndets; _i++) {
37 |     auto i = order[_i];
38 |     if (suppressed[i] == 1) continue;
39 |     auto ix1 = x1[i];
40 |     auto iy1 = y1[i];
41 |     auto ix2 = x2[i];
42 |     auto iy2 = y2[i];
43 |     auto iarea = areas[i];
44 |
45 |     for (int64_t _j = _i + 1; _j < ndets; _j++) {
46 |       auto j = order[_j];
47 |       if (suppressed[j] == 1) continue;
48 |       auto xx1 = std::max(ix1, x1[j]);
49 |       auto yy1 = std::max(iy1, y1[j]);
50 |       auto xx2 = std::min(ix2, x2[j]);
51 |       auto yy2 = std::min(iy2, y2[j]);
52 |
53 |       auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1);
54 |       auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1);
55 |       auto inter = w * h;
56 |       auto ovr = inter / (iarea + areas[j] - inter);
57 |       if (ovr >= threshold) suppressed[j] = 1;
58 |     }
59 |   }
60 |   return at::nonzero(suppressed_t == 0).squeeze(1);
61 | }
62 |
63 | at::Tensor nms_cpu(const at::Tensor& dets, const float threshold) {
64 |   at::Tensor result;
65 |   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] {
66 |     result = nms_cpu_kernel<scalar_t>(dets, threshold);
67 |   });
68 |   return result;
69 | }
70 |
71 | template <typename scalar_t>
72 | at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold,
73 |                                const unsigned char method, const float sigma,
74 |                                const float min_score) {
75 |   AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
76 |
77 |   if (dets.numel() == 0) {
78 |     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
79 |   }
80 |
81 |   auto x1_t = dets.select(1, 0).contiguous();
82 |   auto y1_t = dets.select(1, 1).contiguous();
83 |   auto x2_t = dets.select(1, 2).contiguous();
84 |   auto y2_t = dets.select(1, 3).contiguous();
85 |   auto scores_t = dets.select(1, 4).contiguous();
86 |
87 |   at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
88 |
89 |   auto ndets = dets.size(0);
90 |   auto x1 = x1_t.data_ptr<scalar_t>();
91 |   auto y1 = y1_t.data_ptr<scalar_t>();
92 |   auto x2 = x2_t.data_ptr<scalar_t>();
93 |   auto y2 = y2_t.data_ptr<scalar_t>();
94 |   auto scores = scores_t.data_ptr<scalar_t>();
95 |   auto areas = areas_t.data_ptr<scalar_t>();
96 |
97 |   int64_t pos = 0;
98 |   at::Tensor inds_t = at::arange(ndets, dets.options());
99 |   auto inds = inds_t.data_ptr<scalar_t>();
100 |
101 |   for (int64_t i = 0; i < ndets; i++) {
102 |     auto max_score = scores[i];
103 |     auto max_pos = i;
104 |
105 |     auto ix1 = x1[i];
106 |     auto iy1 = y1[i];
107 |     auto ix2 = x2[i];
108 |     auto iy2 = y2[i];
109 |     auto iscore = scores[i];
110 |     auto iarea = areas[i];
111 |     auto iind = inds[i];
112 |
113 |     pos = i + 1;
114 |     // get max box
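    // Selection-sort step: scan the not-yet-processed boxes [pos, ndets)
    // for the highest remaining score and swap that box into slot i, so
    // that Soft-NMS always decays scores against the current best box.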
115 |     while (pos < ndets) {
116 |       if (max_score < scores[pos]) {
117 |         max_score = scores[pos];
118 |         max_pos = pos;
119 |       }
120 |       pos = pos + 1;
121 |     }
122 |     // add max box as a detection
123 |     x1[i] = x1[max_pos];
124 |     y1[i] = y1[max_pos];
125 |     x2[i] = x2[max_pos];
126 |     y2[i] = y2[max_pos];
127 |     scores[i] = scores[max_pos];
128 |     areas[i] = areas[max_pos];
129 |     inds[i] = inds[max_pos];
130 |
131 |     // swap ith box with position of max box
132 |     x1[max_pos] = ix1;
133 |     y1[max_pos] = iy1;
134 |     x2[max_pos] = ix2;
135 |     y2[max_pos] = iy2;
136 |     scores[max_pos] = iscore;
137 |     areas[max_pos] = iarea;
138 |     inds[max_pos] = iind;
139 |
140 |     ix1 = x1[i];
141 |     iy1 = y1[i];
142 |     ix2 = x2[i];
143 |     iy2 = y2[i];
144 |     iscore = scores[i];
145 |     iarea = areas[i];
146 |
147 |     pos = i + 1;
148 |     // NMS iterations, note that N changes if detection boxes fall below
149 |     // threshold
150 |     while (pos < ndets) {
151 |       auto xx1 = std::max(ix1, x1[pos]);
152 |       auto yy1 = std::max(iy1, y1[pos]);
153 |       auto xx2 = std::min(ix2, x2[pos]);
154 |       auto yy2 = std::min(iy2, y2[pos]);
155 |
156 |       auto w = std::max(static_cast<scalar_t>(0), xx2 - xx1);
157 |       auto h = std::max(static_cast<scalar_t>(0), yy2 - yy1);
158 |       auto inter = w * h;
159 |       auto ovr = inter / (iarea + areas[pos] - inter);
160 |
161 |       scalar_t weight = 1.;
162 |       if (method == 1) {
163 |         if (ovr > threshold) weight = 1 - ovr;
164 |       } else if (method == 2) {
165 |         weight = std::exp(-(ovr * ovr) / sigma);
166 |       } else {
167 |         // original NMS
168 |         if (ovr > threshold) {
169 |           weight = 0;
170 |         } else {
171 |           weight = 1;
172 |         }
173 |       }
174 |       scores[pos] = weight * scores[pos];
175 |       // if the box score falls below threshold, discard the box by
176 |       // swapping it with the last box, then update N
177 |       if (scores[pos] < min_score) {
178 |         x1[pos] = x1[ndets - 1];
179 |         y1[pos] = y1[ndets - 1];
180 |         x2[pos] = x2[ndets - 1];
181 |         y2[pos] = y2[ndets - 1];
182 |         scores[pos] = scores[ndets - 1];
183 |         areas[pos] = areas[ndets - 1];
184 |         inds[pos] = inds[ndets - 1];
185 |         ndets = ndets - 1;
186 |         pos = pos - 1;
187 |       }
188 |       pos = pos + 1;
189 |     }
190 |   }
191 |   at::Tensor result = at::zeros({6, ndets}, dets.options());
192 |   result[0] = x1_t.slice(0, 0, ndets);
193 |   result[1] = y1_t.slice(0, 0, ndets);
194 |   result[2] = x2_t.slice(0, 0, ndets);
195 |   result[3] = y2_t.slice(0, 0, ndets);
196 |   result[4] = scores_t.slice(0, 0, ndets);
197 |   result[5] = inds_t.slice(0, 0, ndets);
198 |
199 |   result = result.t().contiguous();
200 |   return result;
201 | }
202 |
203 | at::Tensor soft_nms_cpu(const at::Tensor& dets, const float threshold,
204 |                         const unsigned char method, const float sigma,
205 |                         const float min_score) {
206 |   at::Tensor result;
207 |   AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "soft_nms", [&] {
208 |     result = soft_nms_cpu_kernel<scalar_t>(dets, threshold, method, sigma,
209 |                                            min_score);
210 |   });
211 |   return result;
212 | }
213 |
--------------------------------------------------------------------------------
/orbbec/nms/src/cuda/nms_cuda.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
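//
// nms_cuda below is a thin host-side wrapper: it checks that the input
// actually lives on a CUDA device, short-circuits empty inputs, and forwards
// to nms_cuda_forward() from nms_kernel.cu. This translation unit is only
// compiled into the extension when the build enables CUDA (the WITH_CUDA
// define used by nms_ext.cpp).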
2 | #include 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") 5 | 6 | at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh); 7 | 8 | at::Tensor nms_cuda(const at::Tensor& dets, const float threshold) { 9 | CHECK_CUDA(dets); 10 | if (dets.numel() == 0) 11 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 12 | return nms_cuda_forward(dets, threshold); 13 | } 14 | -------------------------------------------------------------------------------- /orbbec/nms/src/cuda/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | // /* pytorch: 1.5.0 ~ 1.10.x */ 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | // /* ---------------------- */ 10 | 11 | // /* pytorch: 1.11.0 ~ latest */ 12 | // #include 13 | // #include 14 | // #include 15 | // #include 16 | 17 | // #include 18 | // #include 19 | 20 | #include 21 | #include 22 | 23 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 24 | 25 | __device__ inline float devIoU(float const * const a, float const * const b) { 26 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 27 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 28 | float width = max(right - left, 0.f), height = max(bottom - top, 0.f); 29 | float interS = width * height; 30 | float Sa = (a[2] - a[0]) * (a[3] - a[1]); 31 | float Sb = (b[2] - b[0]) * (b[3] - b[1]); 32 | return interS / (Sa + Sb - interS); 33 | } 34 | 35 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 36 | const float *dev_boxes, unsigned long long *dev_mask) { 37 | const int row_start = blockIdx.y; 38 | const int col_start = blockIdx.x; 39 | 40 | // if (row_start > col_start) return; 41 | 42 | const int row_size = 43 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 44 | const int col_size = 45 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 46 | 47 | __shared__ float block_boxes[threadsPerBlock * 5]; 48 | if (threadIdx.x < col_size) { 49 | block_boxes[threadIdx.x * 5 + 0] = 50 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 51 | block_boxes[threadIdx.x * 5 + 1] = 52 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 53 | block_boxes[threadIdx.x * 5 + 2] = 54 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 55 | block_boxes[threadIdx.x * 5 + 3] = 56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 57 | block_boxes[threadIdx.x * 5 + 4] = 58 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 59 | } 60 | __syncthreads(); 61 | 62 | if (threadIdx.x < row_size) { 63 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 64 | const float *cur_box = dev_boxes + cur_box_idx * 5; 65 | int i = 0; 66 | unsigned long long t = 0; 67 | int start = 0; 68 | if (row_start == col_start) { 69 | start = threadIdx.x + 1; 70 | } 71 | for (i = start; i < col_size; i++) { 72 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 73 | t |= 1ULL << i; 74 | } 75 | } 76 | const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock); 77 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 78 | } 79 | } 80 | 81 | // boxes is a N x 5 tensor 82 | at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { 83 | 84 | // Ensure CUDA uses the input tensor device. 
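// Note on the algorithm: boxes are tiled into chunks of threadsPerBlock
// (sizeof(unsigned long long) * 8 = 64). Each (row_start, col_start) block of
// the kernel above compares one row-chunk of boxes against one col-chunk
// cached in shared memory, and every thread emits a 64-bit word in dev_mask
// whose set bits mark the col-chunk boxes it would suppress. The final greedy
// scan over that bitmask is cheap and is performed on the host below.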
85 |   at::DeviceGuard guard(boxes.device());
86 |
87 |   using scalar_t = float;
88 |   AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
89 |   auto scores = boxes.select(1, 4);
90 |   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
91 |   auto boxes_sorted = boxes.index_select(0, order_t);
92 |
93 |   int boxes_num = boxes.size(0);
94 |
95 |   const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
96 |
97 |   scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
98 |
99 |   THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
100 |
101 |   unsigned long long* mask_dev = NULL;
102 |   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
103 |   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
104 |
105 |   mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
106 |
107 |   dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
108 |               THCCeilDiv(boxes_num, threadsPerBlock));
109 |   dim3 threads(threadsPerBlock);
110 |   nms_kernel<<<blocks, threads>>>(boxes_num,
111 |                                   nms_overlap_thresh,
112 |                                   boxes_dev,
113 |                                   mask_dev);
114 |
115 |   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
116 |   THCudaCheck(cudaMemcpyAsync(
117 |       &mask_host[0],
118 |       mask_dev,
119 |       sizeof(unsigned long long) * boxes_num * col_blocks,
120 |       cudaMemcpyDeviceToHost,
121 |       at::cuda::getCurrentCUDAStream()
122 |   ));
123 |
124 |   std::vector<unsigned long long> remv(col_blocks);
125 |   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
126 |
127 |   at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
128 |   int64_t* keep_out = keep.data_ptr<int64_t>();
129 |
130 |   int num_to_keep = 0;
131 |   for (int i = 0; i < boxes_num; i++) {
132 |     int nblock = i / threadsPerBlock;
133 |     int inblock = i % threadsPerBlock;
134 |
135 |     if (!(remv[nblock] & (1ULL << inblock))) {
136 |       keep_out[num_to_keep++] = i;
137 |       unsigned long long *p = &mask_host[0] + i * col_blocks;
138 |       for (int j = nblock; j < col_blocks; j++) {
139 |         remv[j] |= p[j];
140 |       }
141 |     }
142 |   }
143 |
144 |   THCudaFree(state, mask_dev);
145 |   // TODO improve this part
146 |   return order_t.index({
147 |       keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
148 |           order_t.device(), keep.scalar_type())});
149 | }
--------------------------------------------------------------------------------
/orbbec/nms/src/nms_ext.cpp:
--------------------------------------------------------------------------------
1 | // Modified from https://github.com/bharatsingh430/soft-nms/blob/master/lib/nms/cpu_nms.pyx, Soft-NMS is added
2 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
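//
// This file is the pybind11 entry point of the nms extension: nms() routes to
// nms_cuda() or nms_cpu() based on the device of `dets`, with WITH_CUDA
// injected at compile time from setup.py. TORCH_EXTENSION_NAME expands to the
// module name the extension was built under -- here "nms_ext", which is what
// "from . import nms_ext" in nms_wrapper.py loads.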
3 | #include <torch/extension.h>
4 |
5 | at::Tensor nms_cpu(const at::Tensor& dets, const float threshold);
6 |
7 | at::Tensor soft_nms_cpu(const at::Tensor& dets, const float threshold,
8 |                         const unsigned char method, const float sigma,
9 |                         const float min_score);
10 |
11 | #ifdef WITH_CUDA
12 | at::Tensor nms_cuda(const at::Tensor& dets, const float threshold);
13 | #endif
14 |
15 | at::Tensor nms(const at::Tensor& dets, const float threshold){
16 |   if (dets.device().is_cuda()) {
17 | #ifdef WITH_CUDA
18 |     return nms_cuda(dets, threshold);
19 | #else
20 |     AT_ERROR("nms is not compiled with GPU support");
21 | #endif
22 |   }
23 |   return nms_cpu(dets, threshold);
24 | }
25 |
26 | at::Tensor soft_nms(const at::Tensor& dets, const float threshold,
27 |                     const unsigned char method, const float sigma,
28 |                     const float min_score) {
29 |   if (dets.device().is_cuda()) {
30 |     AT_ERROR("soft_nms is not implemented on GPU");
31 |   }
32 |   return soft_nms_cpu(dets, threshold, method, sigma, min_score);
33 | }
34 |
35 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
36 |   m.def("nms", &nms, "non-maximum suppression");
37 |   m.def("soft_nms", &soft_nms, "soft non-maximum suppression");
38 | }
--------------------------------------------------------------------------------
/orbbec/roi_align/__init__.py:
--------------------------------------------------------------------------------
1 | from .roi_align import RoIAlign, roi_align
2 |
3 | __all__ = ['roi_align', 'RoIAlign']
4 |
--------------------------------------------------------------------------------
/orbbec/roi_align/gradcheck.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import sys
3 |
4 | import numpy as np
5 | import torch
6 | from torch.autograd import gradcheck
7 |
8 | sys.path.append(osp.abspath(osp.join(__file__, '../../')))
9 | from roi_align import RoIAlign  # noqa: E402, isort:skip
10 |
11 | feat_size = 15
12 | spatial_scale = 1.0 / 8
13 | img_size = feat_size / spatial_scale
14 | num_imgs = 2
15 | num_rois = 20
16 |
17 | batch_ind = np.random.randint(num_imgs, size=(num_rois, 1))
18 | rois = np.random.rand(num_rois, 4) * img_size * 0.5
19 | rois[:, 2:] += img_size * 0.5
20 | rois = np.hstack((batch_ind, rois))
21 |
22 | feat = torch.randn(
23 |     num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0')
24 | rois = torch.from_numpy(rois).float().cuda()
25 | inputs = (feat, rois)
26 | print('Gradcheck for roi align...')
27 | test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3)
28 | print(test)
29 | test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3)
30 | print(test)
31 |
--------------------------------------------------------------------------------
/orbbec/roi_align/roi_align.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 | from torch.autograd import Function
3 | from torch.autograd.function import once_differentiable
4 | from torch.nn.modules.utils import _pair
5 |
6 | from .
import roi_align_ext 7 | 8 | 9 | class RoIAlignFunction(Function): 10 | 11 | @staticmethod 12 | def forward(ctx, 13 | features, 14 | rois, 15 | out_size, 16 | spatial_scale, 17 | sample_num=0, 18 | aligned=True): 19 | out_h, out_w = _pair(out_size) 20 | assert isinstance(out_h, int) and isinstance(out_w, int) 21 | ctx.spatial_scale = spatial_scale 22 | ctx.sample_num = sample_num 23 | ctx.save_for_backward(rois) 24 | ctx.feature_size = features.size() 25 | ctx.aligned = aligned 26 | 27 | if aligned: 28 | output = roi_align_ext.forward_v2(features, rois, spatial_scale, 29 | out_h, out_w, sample_num, 30 | aligned) 31 | elif features.is_cuda: 32 | (batch_size, num_channels, data_height, 33 | data_width) = features.size() 34 | num_rois = rois.size(0) 35 | 36 | output = features.new_zeros(num_rois, num_channels, out_h, out_w) 37 | roi_align_ext.forward_v1(features, rois, out_h, out_w, 38 | spatial_scale, sample_num, output) 39 | else: 40 | raise NotImplementedError 41 | 42 | return output 43 | 44 | @staticmethod 45 | @once_differentiable 46 | def backward(ctx, grad_output): 47 | feature_size = ctx.feature_size 48 | spatial_scale = ctx.spatial_scale 49 | sample_num = ctx.sample_num 50 | rois = ctx.saved_tensors[0] 51 | aligned = ctx.aligned 52 | assert feature_size is not None 53 | 54 | batch_size, num_channels, data_height, data_width = feature_size 55 | out_w = grad_output.size(3) 56 | out_h = grad_output.size(2) 57 | 58 | grad_input = grad_rois = None 59 | if not aligned: 60 | if ctx.needs_input_grad[0]: 61 | grad_input = rois.new_zeros(batch_size, num_channels, 62 | data_height, data_width) 63 | roi_align_ext.backward_v1(grad_output.contiguous(), rois, 64 | out_h, out_w, spatial_scale, 65 | sample_num, grad_input) 66 | else: 67 | grad_input = roi_align_ext.backward_v2(grad_output, rois, 68 | spatial_scale, out_h, out_w, 69 | batch_size, num_channels, 70 | data_height, data_width, 71 | sample_num, aligned) 72 | 73 | return grad_input, grad_rois, None, None, None, None 74 | 75 | 76 | roi_align = RoIAlignFunction.apply 77 | 78 | 79 | class RoIAlign(nn.Module): 80 | 81 | def __init__(self, 82 | out_size, 83 | spatial_scale, 84 | sample_num=0, 85 | use_torchvision=False, 86 | aligned=True): 87 | """ 88 | Args: 89 | out_size (tuple): h, w 90 | spatial_scale (float): scale the input boxes by this number 91 | sample_num (int): number of inputs samples to take for each 92 | output sample. 2 to take samples densely for current models. 93 | use_torchvision (bool): whether to use roi_align from torchvision 94 | aligned (bool): if False, use the legacy implementation in 95 | MMDetection. If True, align the results more perfectly. 96 | 97 | Note: 98 | The implementation of RoIAlign when aligned=True is modified from 99 | https://github.com/facebookresearch/detectron2/ 100 | 101 | The meaning of aligned=True: 102 | 103 | Given a continuous coordinate c, its two neighboring pixel 104 | indices (in our pixel model) are computed by floor(c - 0.5) and 105 | ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete 106 | indices [0] and [1] (which are sampled from the underlying signal 107 | at continuous coordinates 0.5 and 1.5). But the original roi_align 108 | (aligned=False) does not subtract the 0.5 when computing 109 | neighboring pixel indices and therefore it uses pixels with a 110 | slightly incorrect alignment (relative to our pixel model) when 111 | performing bilinear interpolation. 
112 |
113 |             With `aligned=True`,
114 |             we first appropriately scale the ROI and then shift it by -0.5
115 |             prior to calling roi_align. This produces the correct neighbors;
116 |
117 |             This difference does not affect the model's performance when
118 |             ROIAlign is used together with conv layers.
119 |         """
120 |         super(RoIAlign, self).__init__()
121 |         self.out_size = _pair(out_size)
122 |         self.spatial_scale = float(spatial_scale)
123 |         self.aligned = aligned
124 |         self.sample_num = int(sample_num)
125 |         self.use_torchvision = use_torchvision
126 |         assert not (use_torchvision and
127 |                     aligned), 'Torchvision does not support aligned RoIAlign'
128 |
129 |     def forward(self, features, rois):
130 |         """
131 |         Args:
132 |             features: NCHW images
133 |             rois: Bx5 boxes. First column is the index into N. The other 4
134 |                 columns are xyxy.
135 |         """
136 |         assert rois.dim() == 2 and rois.size(1) == 5
137 |
138 |         if self.use_torchvision:
139 |             from torchvision.ops import roi_align as tv_roi_align
140 |             return tv_roi_align(features, rois, self.out_size,
141 |                                 self.spatial_scale, self.sample_num)
142 |         else:
143 |             return roi_align(features, rois, self.out_size, self.spatial_scale,
144 |                              self.sample_num, self.aligned)
145 |
146 |     def __repr__(self):
147 |         indent_str = '\n    '
148 |         format_str = self.__class__.__name__
149 |         format_str += f'({indent_str}out_size={self.out_size},'
150 |         format_str += f'{indent_str}spatial_scale={self.spatial_scale},'
151 |         format_str += f'{indent_str}sample_num={self.sample_num},'
152 |         format_str += f'{indent_str}use_torchvision={self.use_torchvision},'
153 |         format_str += f'{indent_str}aligned={self.aligned})'
154 |         return format_str
155 |
--------------------------------------------------------------------------------
/orbbec/roi_align/src/cpu/roi_align_v2.cpp:
--------------------------------------------------------------------------------
1 | // Modified from
2 | // https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
3 | // Copyright (c) Facebook, Inc. and its affiliates.
All Rights Reserved 4 | #include 5 | #include 6 | 7 | // implementation taken from Caffe2 8 | template 9 | struct PreCalc { 10 | int pos1; 11 | int pos2; 12 | int pos3; 13 | int pos4; 14 | T w1; 15 | T w2; 16 | T w3; 17 | T w4; 18 | }; 19 | 20 | template 21 | void pre_calc_for_bilinear_interpolate( 22 | const int height, const int width, const int pooled_height, 23 | const int pooled_width, const int iy_upper, const int ix_upper, 24 | T roi_start_h, T roi_start_w, T bin_size_h, T bin_size_w, 25 | int roi_bin_grid_h, int roi_bin_grid_w, std::vector>& pre_calc) { 26 | int pre_calc_index = 0; 27 | for (int ph = 0; ph < pooled_height; ph++) { 28 | for (int pw = 0; pw < pooled_width; pw++) { 29 | for (int iy = 0; iy < iy_upper; iy++) { 30 | const T yy = roi_start_h + ph * bin_size_h + 31 | static_cast(iy + .5f) * bin_size_h / 32 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 33 | for (int ix = 0; ix < ix_upper; ix++) { 34 | const T xx = roi_start_w + pw * bin_size_w + 35 | static_cast(ix + .5f) * bin_size_w / 36 | static_cast(roi_bin_grid_w); 37 | 38 | T x = xx; 39 | T y = yy; 40 | // deal with: inverse elements are out of feature map boundary 41 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 42 | // empty 43 | PreCalc pc; 44 | pc.pos1 = 0; 45 | pc.pos2 = 0; 46 | pc.pos3 = 0; 47 | pc.pos4 = 0; 48 | pc.w1 = 0; 49 | pc.w2 = 0; 50 | pc.w3 = 0; 51 | pc.w4 = 0; 52 | pre_calc[pre_calc_index] = pc; 53 | pre_calc_index += 1; 54 | continue; 55 | } 56 | 57 | if (y <= 0) { 58 | y = 0; 59 | } 60 | if (x <= 0) { 61 | x = 0; 62 | } 63 | 64 | int y_low = (int)y; 65 | int x_low = (int)x; 66 | int y_high; 67 | int x_high; 68 | 69 | if (y_low >= height - 1) { 70 | y_high = y_low = height - 1; 71 | y = (T)y_low; 72 | } else { 73 | y_high = y_low + 1; 74 | } 75 | 76 | if (x_low >= width - 1) { 77 | x_high = x_low = width - 1; 78 | x = (T)x_low; 79 | } else { 80 | x_high = x_low + 1; 81 | } 82 | 83 | T ly = y - y_low; 84 | T lx = x - x_low; 85 | T hy = 1. - ly, hx = 1. - lx; 86 | T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 87 | 88 | // save weights and indices 89 | PreCalc pc; 90 | pc.pos1 = y_low * width + x_low; 91 | pc.pos2 = y_low * width + x_high; 92 | pc.pos3 = y_high * width + x_low; 93 | pc.pos4 = y_high * width + x_high; 94 | pc.w1 = w1; 95 | pc.w2 = w2; 96 | pc.w3 = w3; 97 | pc.w4 = w4; 98 | pre_calc[pre_calc_index] = pc; 99 | 100 | pre_calc_index += 1; 101 | } 102 | } 103 | } 104 | } 105 | } 106 | 107 | template 108 | void ROIAlignForward(const int nthreads, const T* input, const T& spatial_scale, 109 | const int channels, const int height, const int width, 110 | const int pooled_height, const int pooled_width, 111 | const int sampling_ratio, const T* rois, T* output, 112 | bool aligned) { 113 | int n_rois = nthreads / channels / pooled_width / pooled_height; 114 | // (n, c, ph, pw) is an element in the pooled output 115 | // can be parallelized using omp 116 | // #pragma omp parallel for num_threads(32) 117 | for (int n = 0; n < n_rois; n++) { 118 | int index_n = n * channels * pooled_width * pooled_height; 119 | 120 | const T* offset_rois = rois + n * 5; 121 | int roi_batch_ind = offset_rois[0]; 122 | 123 | // Do not use rounding; this implementation detail is critical 124 | T offset = aligned ? 
(T)0.5 : (T)0.0; 125 | T roi_start_w = offset_rois[1] * spatial_scale - offset; 126 | T roi_start_h = offset_rois[2] * spatial_scale - offset; 127 | T roi_end_w = offset_rois[3] * spatial_scale - offset; 128 | T roi_end_h = offset_rois[4] * spatial_scale - offset; 129 | 130 | T roi_width = roi_end_w - roi_start_w; 131 | T roi_height = roi_end_h - roi_start_h; 132 | if (aligned) { 133 | AT_ASSERTM(roi_width >= 0 && roi_height >= 0, 134 | "ROIs in ROIAlign cannot have non-negative size!"); 135 | } else { // for backward-compatibility only 136 | roi_width = std::max(roi_width, (T)1.); 137 | roi_height = std::max(roi_height, (T)1.); 138 | } 139 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 140 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 141 | 142 | // We use roi_bin_grid to sample the grid and mimic integral 143 | int roi_bin_grid_h = (sampling_ratio > 0) 144 | ? sampling_ratio 145 | : ceil(roi_height / pooled_height); // e.g., = 2 146 | int roi_bin_grid_w = 147 | (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); 148 | 149 | // We do average (integral) pooling inside a bin 150 | // When the grid is empty, output zeros == 0/1, instead of NaN. 151 | const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 152 | 153 | // we want to precalculate indices and weights shared by all channels, 154 | // this is the key point of optimization 155 | std::vector> pre_calc(roi_bin_grid_h * roi_bin_grid_w * 156 | pooled_width * pooled_height); 157 | pre_calc_for_bilinear_interpolate( 158 | height, width, pooled_height, pooled_width, roi_bin_grid_h, 159 | roi_bin_grid_w, roi_start_h, roi_start_w, bin_size_h, bin_size_w, 160 | roi_bin_grid_h, roi_bin_grid_w, pre_calc); 161 | 162 | for (int c = 0; c < channels; c++) { 163 | int index_n_c = index_n + c * pooled_width * pooled_height; 164 | const T* offset_input = 165 | input + (roi_batch_ind * channels + c) * height * width; 166 | int pre_calc_index = 0; 167 | 168 | for (int ph = 0; ph < pooled_height; ph++) { 169 | for (int pw = 0; pw < pooled_width; pw++) { 170 | int index = index_n_c + ph * pooled_width + pw; 171 | 172 | T output_val = 0.; 173 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 174 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 175 | PreCalc pc = pre_calc[pre_calc_index]; 176 | output_val += pc.w1 * offset_input[pc.pos1] + 177 | pc.w2 * offset_input[pc.pos2] + 178 | pc.w3 * offset_input[pc.pos3] + 179 | pc.w4 * offset_input[pc.pos4]; 180 | 181 | pre_calc_index += 1; 182 | } 183 | } 184 | output_val /= count; 185 | 186 | output[index] = output_val; 187 | } // for pw 188 | } // for ph 189 | } // for c 190 | } // for n 191 | } 192 | 193 | template 194 | void bilinear_interpolate_gradient(const int height, const int width, T y, T x, 195 | T& w1, T& w2, T& w3, T& w4, int& x_low, 196 | int& x_high, int& y_low, int& y_high, 197 | const int index /* index for debug only*/) { 198 | // deal with cases that inverse elements are out of feature map boundary 199 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 200 | // empty 201 | w1 = w2 = w3 = w4 = 0.; 202 | x_low = x_high = y_low = y_high = -1; 203 | return; 204 | } 205 | 206 | if (y <= 0) y = 0; 207 | if (x <= 0) x = 0; 208 | 209 | y_low = (int)y; 210 | x_low = (int)x; 211 | 212 | if (y_low >= height - 1) { 213 | y_high = y_low = height - 1; 214 | y = (T)y_low; 215 | } else { 216 | y_high = y_low + 1; 217 | } 218 | 219 | if (x_low >= width - 1) { 220 | x_high = x_low = width - 1; 221 | x = (T)x_low; 222 | } 
else { 223 | x_high = x_low + 1; 224 | } 225 | 226 | T ly = y - y_low; 227 | T lx = x - x_low; 228 | T hy = 1. - ly, hx = 1. - lx; 229 | 230 | // reference in forward 231 | // T v1 = input[y_low * width + x_low]; 232 | // T v2 = input[y_low * width + x_high]; 233 | // T v3 = input[y_high * width + x_low]; 234 | // T v4 = input[y_high * width + x_high]; 235 | // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 236 | 237 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 238 | 239 | return; 240 | } 241 | 242 | template 243 | inline void add(T* address, const T& val) { 244 | *address += val; 245 | } 246 | 247 | template 248 | void ROIAlignBackward(const int nthreads, const T* grad_output, 249 | const T& spatial_scale, const int channels, 250 | const int height, const int width, 251 | const int pooled_height, const int pooled_width, 252 | const int sampling_ratio, T* grad_input, const T* rois, 253 | const int n_stride, const int c_stride, 254 | const int h_stride, const int w_stride, bool aligned) { 255 | for (int index = 0; index < nthreads; index++) { 256 | // (n, c, ph, pw) is an element in the pooled output 257 | int pw = index % pooled_width; 258 | int ph = (index / pooled_width) % pooled_height; 259 | int c = (index / pooled_width / pooled_height) % channels; 260 | int n = index / pooled_width / pooled_height / channels; 261 | 262 | const T* offset_rois = rois + n * 5; 263 | int roi_batch_ind = offset_rois[0]; 264 | 265 | // Do not use rounding; this implementation detail is critical 266 | T offset = aligned ? (T)0.5 : (T)0.0; 267 | T roi_start_w = offset_rois[1] * spatial_scale - offset; 268 | T roi_start_h = offset_rois[2] * spatial_scale - offset; 269 | T roi_end_w = offset_rois[3] * spatial_scale - offset; 270 | T roi_end_h = offset_rois[4] * spatial_scale - offset; 271 | 272 | T roi_width = roi_end_w - roi_start_w; 273 | T roi_height = roi_end_h - roi_start_h; 274 | if (aligned) { 275 | AT_ASSERTM(roi_width >= 0 && roi_height >= 0, 276 | "ROIs in ROIAlign do not have non-negative size!"); 277 | } else { // for backward-compatibility only 278 | roi_width = std::max(roi_width, (T)1.); 279 | roi_height = std::max(roi_height, (T)1.); 280 | } 281 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 282 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 283 | 284 | T* offset_grad_input = 285 | grad_input + ((roi_batch_ind * channels + c) * height * width); 286 | 287 | int output_offset = n * n_stride + c * c_stride; 288 | const T* offset_grad_output = grad_output + output_offset; 289 | const T grad_output_this_bin = 290 | offset_grad_output[ph * h_stride + pw * w_stride]; 291 | 292 | // We use roi_bin_grid to sample the grid and mimic integral 293 | int roi_bin_grid_h = (sampling_ratio > 0) 294 | ? sampling_ratio 295 | : ceil(roi_height / pooled_height); // e.g., = 2 296 | int roi_bin_grid_w = 297 | (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); 298 | 299 | // We do average (integral) pooling inside a bin 300 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 301 | 302 | for (int iy = 0; iy < roi_bin_grid_h; iy++) { 303 | const T y = roi_start_h + ph * bin_size_h + 304 | static_cast(iy + .5f) * bin_size_h / 305 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 306 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 307 | const T x = roi_start_w + pw * bin_size_w + 308 | static_cast(ix + .5f) * bin_size_w / 309 | static_cast(roi_bin_grid_w); 310 | 311 | T w1, w2, w3, w4; 312 | int x_low, x_high, y_low, y_high; 313 | 314 | bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, 315 | x_low, x_high, y_low, y_high, index); 316 | 317 | T g1 = grad_output_this_bin * w1 / count; 318 | T g2 = grad_output_this_bin * w2 / count; 319 | T g3 = grad_output_this_bin * w3 / count; 320 | T g4 = grad_output_this_bin * w4 / count; 321 | 322 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 323 | // atomic add is not needed for now since it is single threaded 324 | add(offset_grad_input + y_low * width + x_low, static_cast(g1)); 325 | add(offset_grad_input + y_low * width + x_high, static_cast(g2)); 326 | add(offset_grad_input + y_high * width + x_low, static_cast(g3)); 327 | add(offset_grad_input + y_high * width + x_high, static_cast(g4)); 328 | } // if 329 | } // ix 330 | } // iy 331 | } // for 332 | } // ROIAlignBackward 333 | 334 | at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input, 335 | const at::Tensor& rois, 336 | const float spatial_scale, 337 | const int pooled_height, 338 | const int pooled_width, 339 | const int sampling_ratio, bool aligned) { 340 | AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); 341 | AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); 342 | 343 | at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; 344 | 345 | at::CheckedFrom c = "ROIAlignForwardV2CPULaucher"; 346 | at::checkAllSameType(c, {input_t, rois_t}); 347 | 348 | auto num_rois = rois.size(0); 349 | auto channels = input.size(1); 350 | auto height = input.size(2); 351 | auto width = input.size(3); 352 | 353 | at::Tensor output = at::zeros( 354 | {num_rois, channels, pooled_height, pooled_width}, input.options()); 355 | 356 | auto output_size = num_rois * pooled_height * pooled_width * channels; 357 | 358 | if (output.numel() == 0) return output; 359 | 360 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] { 361 | ROIAlignForward( 362 | output_size, input.contiguous().data_ptr(), spatial_scale, 363 | channels, height, width, pooled_height, pooled_width, sampling_ratio, 364 | rois.contiguous().data_ptr(), output.data_ptr(), aligned); 365 | }); 366 | return output; 367 | } 368 | 369 | at::Tensor ROIAlignBackwardV2CPULaucher( 370 | const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, 371 | const int pooled_height, const int pooled_width, const int batch_size, 372 | const int channels, const int height, const int width, 373 | const int sampling_ratio, bool aligned) { 374 | AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); 375 | AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); 376 | 377 | at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; 378 | 379 | at::CheckedFrom c = "ROIAlignBackwardV2CPULaucher"; 380 | at::checkAllSameType(c, {grad_t, rois_t}); 381 | 382 | at::Tensor grad_input = 383 | at::zeros({batch_size, channels, height, width}, grad.options()); 384 | 385 | // handle possibly empty gradients 386 | if (grad.numel() == 0) { 387 | return grad_input; 388 | } 389 | 390 | // get stride values to 
ensure indexing into gradients is correct. 391 | int n_stride = grad.stride(0); 392 | int c_stride = grad.stride(1); 393 | int h_stride = grad.stride(2); 394 | int w_stride = grad.stride(3); 395 | 396 | AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] { 397 | ROIAlignBackward( 398 | grad.numel(), grad.contiguous().data_ptr(), spatial_scale, 399 | channels, height, width, pooled_height, pooled_width, sampling_ratio, 400 | grad_input.data_ptr(), rois.contiguous().data_ptr(), 401 | n_stride, c_stride, h_stride, w_stride, aligned); 402 | }); 403 | return grad_input; 404 | } 405 | -------------------------------------------------------------------------------- /orbbec/roi_align/src/cuda/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | // /* pytorch: 1.5.0 ~ 1.10.x */ 2 | #include 3 | #include 4 | #include 5 | 6 | // /* pytorch: 1.11.0 ~ latest */ 7 | // #include 8 | // #include 9 | // #include 10 | // #include 11 | 12 | // #include 13 | // #include 14 | 15 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 16 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 17 | i += blockDim.x * gridDim.x) 18 | 19 | #define THREADS_PER_BLOCK 1024 20 | 21 | inline int GET_BLOCKS(const int N) { 22 | int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 23 | int max_block_num = 65000; 24 | return min(optimal_block_num, max_block_num); 25 | } 26 | 27 | template 28 | __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, 29 | const int height, const int width, 30 | scalar_t y, scalar_t x) { 31 | // deal with cases that inverse elements are out of feature map boundary 32 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 33 | return 0; 34 | } 35 | 36 | if (y <= 0) y = 0; 37 | if (x <= 0) x = 0; 38 | 39 | int y_low = (int)y; 40 | int x_low = (int)x; 41 | int y_high; 42 | int x_high; 43 | 44 | if (y_low >= height - 1) { 45 | y_high = y_low = height - 1; 46 | y = (scalar_t)y_low; 47 | } else { 48 | y_high = y_low + 1; 49 | } 50 | 51 | if (x_low >= width - 1) { 52 | x_high = x_low = width - 1; 53 | x = (scalar_t)x_low; 54 | } else { 55 | x_high = x_low + 1; 56 | } 57 | 58 | scalar_t ly = y - y_low; 59 | scalar_t lx = x - x_low; 60 | scalar_t hy = 1. - ly; 61 | scalar_t hx = 1. 
- lx; 62 | // do bilinear interpolation 63 | scalar_t lt = bottom_data[y_low * width + x_low]; 64 | scalar_t rt = bottom_data[y_low * width + x_high]; 65 | scalar_t lb = bottom_data[y_high * width + x_low]; 66 | scalar_t rb = bottom_data[y_high * width + x_high]; 67 | scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 68 | 69 | scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb); 70 | 71 | return val; 72 | } 73 | 74 | template 75 | __global__ void ROIAlignForwardV1( 76 | const int nthreads, const scalar_t *bottom_data, 77 | const scalar_t *bottom_rois, const scalar_t spatial_scale, 78 | const int sample_num, const int channels, const int height, const int width, 79 | const int pooled_height, const int pooled_width, scalar_t *top_data) { 80 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 81 | // (n, c, ph, pw) is an element in the aligned output 82 | int pw = index % pooled_width; 83 | int ph = (index / pooled_width) % pooled_height; 84 | int c = (index / pooled_width / pooled_height) % channels; 85 | int n = index / pooled_width / pooled_height / channels; 86 | 87 | const scalar_t *offset_bottom_rois = bottom_rois + n * 5; 88 | int roi_batch_ind = offset_bottom_rois[0]; 89 | scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; 90 | scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; 91 | scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; 92 | scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; 93 | 94 | // Force malformed ROIs to be 1x1 95 | scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); 96 | scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); 97 | 98 | scalar_t bin_size_h = roi_height / pooled_height; 99 | scalar_t bin_size_w = roi_width / pooled_width; 100 | 101 | const scalar_t *offset_bottom_data = 102 | bottom_data + (roi_batch_ind * channels + c) * height * width; 103 | 104 | int sample_num_h = (sample_num > 0) 105 | ? sample_num 106 | : ceil(roi_height / pooled_height); // e.g., = 2 107 | int sample_num_w = 108 | (sample_num > 0) ? 
sample_num : ceil(roi_width / pooled_width); 109 | 110 | scalar_t output_val = 0; 111 | for (int iy = 0; iy < sample_num_h; iy++) { 112 | const scalar_t y = roi_start_h + ph * bin_size_h + 113 | (scalar_t)(iy + scalar_t(.5f)) * bin_size_h / 114 | (scalar_t)(sample_num_h); 115 | for (int ix = 0; ix < sample_num_w; ix++) { 116 | const scalar_t x = roi_start_w + pw * bin_size_w + 117 | (scalar_t)(ix + scalar_t(.5f)) * bin_size_w / 118 | (scalar_t)(sample_num_w); 119 | scalar_t val = bilinear_interpolate(offset_bottom_data, 120 | height, width, y, x); 121 | output_val += val; 122 | } 123 | } 124 | output_val /= (sample_num_h * sample_num_w); 125 | top_data[index] = output_val; 126 | } 127 | } 128 | 129 | int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, 130 | const float spatial_scale, const int sample_num, 131 | const int channels, const int height, 132 | const int width, const int num_rois, 133 | const int pooled_height, const int pooled_width, 134 | at::Tensor output) { 135 | const int output_size = num_rois * pooled_height * pooled_width * channels; 136 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 137 | features.scalar_type(), "ROIAlignLaucherForward", ([&] { 138 | const scalar_t *bottom_data = features.data_ptr(); 139 | const scalar_t *rois_data = rois.data_ptr(); 140 | scalar_t *top_data = output.data_ptr(); 141 | 142 | ROIAlignForwardV1 143 | <<>>( 145 | output_size, bottom_data, rois_data, scalar_t(spatial_scale), 146 | sample_num, channels, height, width, pooled_height, 147 | pooled_width, top_data); 148 | })); 149 | THCudaCheck(cudaGetLastError()); 150 | return 1; 151 | } 152 | 153 | template 154 | __device__ void bilinear_interpolate_gradient(const int height, const int width, 155 | scalar_t y, scalar_t x, 156 | scalar_t &w1, scalar_t &w2, 157 | scalar_t &w3, scalar_t &w4, 158 | int &x_low, int &x_high, 159 | int &y_low, int &y_high) { 160 | // deal with cases that inverse elements are out of feature map boundary 161 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 162 | w1 = w2 = w3 = w4 = 0.; 163 | x_low = x_high = y_low = y_high = -1; 164 | return; 165 | } 166 | 167 | if (y <= 0) y = 0; 168 | if (x <= 0) x = 0; 169 | 170 | y_low = (int)y; 171 | x_low = (int)x; 172 | 173 | if (y_low >= height - 1) { 174 | y_high = y_low = height - 1; 175 | y = (scalar_t)y_low; 176 | } else { 177 | y_high = y_low + 1; 178 | } 179 | 180 | if (x_low >= width - 1) { 181 | x_high = x_low = width - 1; 182 | x = (scalar_t)x_low; 183 | } else { 184 | x_high = x_low + 1; 185 | } 186 | 187 | scalar_t ly = y - y_low; 188 | scalar_t lx = x - x_low; 189 | scalar_t hy = 1. - ly; 190 | scalar_t hx = 1. 
- lx; 191 | 192 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 193 | 194 | return; 195 | } 196 | 197 | template 198 | __global__ void ROIAlignBackwardV1( 199 | const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, 200 | const scalar_t spatial_scale, const int sample_num, const int channels, 201 | const int height, const int width, const int pooled_height, 202 | const int pooled_width, scalar_t *bottom_diff) { 203 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 204 | // (n, c, ph, pw) is an element in the aligned output 205 | int pw = index % pooled_width; 206 | int ph = (index / pooled_width) % pooled_height; 207 | int c = (index / pooled_width / pooled_height) % channels; 208 | int n = index / pooled_width / pooled_height / channels; 209 | 210 | const scalar_t *offset_bottom_rois = bottom_rois + n * 5; 211 | int roi_batch_ind = offset_bottom_rois[0]; 212 | scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; 213 | scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; 214 | scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; 215 | scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; 216 | 217 | // Force malformed ROIs to be 1x1 218 | scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); 219 | scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); 220 | 221 | scalar_t bin_size_h = roi_height / pooled_height; 222 | scalar_t bin_size_w = roi_width / pooled_width; 223 | 224 | scalar_t *offset_bottom_diff = 225 | bottom_diff + (roi_batch_ind * channels + c) * height * width; 226 | int offset_top = (n * channels + c) * pooled_height * pooled_width + 227 | ph * pooled_width + pw; 228 | scalar_t offset_top_diff = top_diff[offset_top]; 229 | 230 | int sample_num_h = (sample_num > 0) 231 | ? sample_num 232 | : ceil(roi_height / pooled_height); // e.g., = 2 233 | int sample_num_w = 234 | (sample_num > 0) ? 
sample_num : ceil(roi_width / pooled_width); 235 | 236 | const scalar_t count = (scalar_t)(sample_num_h * sample_num_w); 237 | 238 | for (int iy = 0; iy < sample_num_h; iy++) { 239 | const scalar_t y = 240 | roi_start_h + ph * bin_size_h + 241 | (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h); 242 | for (int ix = 0; ix < sample_num_w; ix++) { 243 | const scalar_t x = 244 | roi_start_w + pw * bin_size_w + 245 | (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w); 246 | scalar_t w1, w2, w3, w4; 247 | int x_low, x_high, y_low, y_high; 248 | 249 | bilinear_interpolate_gradient( 250 | height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); 251 | scalar_t g1 = offset_top_diff * w1 / count; 252 | scalar_t g2 = offset_top_diff * w2 / count; 253 | scalar_t g3 = offset_top_diff * w3 / count; 254 | scalar_t g4 = offset_top_diff * w4 / count; 255 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 256 | atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); 257 | atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); 258 | atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); 259 | atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); 260 | } 261 | } 262 | } 263 | } 264 | } 265 | 266 | int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, 267 | const float spatial_scale, const int sample_num, 268 | const int channels, const int height, 269 | const int width, const int num_rois, 270 | const int pooled_height, const int pooled_width, 271 | at::Tensor bottom_grad) { 272 | const int output_size = num_rois * pooled_height * pooled_width * channels; 273 | 274 | AT_DISPATCH_FLOATING_TYPES_AND_HALF( 275 | top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] { 276 | const scalar_t *top_diff = top_grad.data_ptr(); 277 | const scalar_t *rois_data = rois.data_ptr(); 278 | scalar_t *bottom_diff = bottom_grad.data_ptr(); 279 | if (sizeof(scalar_t) == sizeof(double)) { 280 | fprintf(stderr, "double is not supported\n"); 281 | exit(-1); 282 | } 283 | 284 | ROIAlignBackwardV1 285 | <<>>( 287 | output_size, top_diff, rois_data, spatial_scale, sample_num, 288 | channels, height, width, pooled_height, pooled_width, 289 | bottom_diff); 290 | })); 291 | THCudaCheck(cudaGetLastError()); 292 | return 1; 293 | } 294 | -------------------------------------------------------------------------------- /orbbec/roi_align/src/cuda/roi_align_kernel_v2.cu: -------------------------------------------------------------------------------- 1 | // Modified from 2 | // https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign 3 | // Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 4 | // /* pytorch: 1.5.0 ~ 1.10.x */ 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | 11 | // /* pytorch: 1.11.0 ~ latest */ 12 | // #include 13 | // #include 14 | // #include 15 | // #include 16 | 17 | // #include 18 | // #include 19 | 20 | 21 | // TODO make it in a common file 22 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 23 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 24 | i += blockDim.x * gridDim.x) 25 | 26 | template 27 | __device__ T bilinear_interpolate(const T* bottom_data, const int height, 28 | const int width, T y, T x, 29 | const int index /* index for debug only*/) { 30 | // deal with cases that inverse elements are out of feature map boundary 31 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 32 | // empty 33 | return 0; 34 | } 35 | 36 | if (y <= 0) y = 0; 37 | if (x <= 0) x = 0; 38 | 39 | int y_low = (int)y; 40 | int x_low = (int)x; 41 | int y_high; 42 | int x_high; 43 | 44 | if (y_low >= height - 1) { 45 | y_high = y_low = height - 1; 46 | y = (T)y_low; 47 | } else { 48 | y_high = y_low + 1; 49 | } 50 | 51 | if (x_low >= width - 1) { 52 | x_high = x_low = width - 1; 53 | x = (T)x_low; 54 | } else { 55 | x_high = x_low + 1; 56 | } 57 | 58 | T ly = y - y_low; 59 | T lx = x - x_low; 60 | T hy = 1. - ly, hx = 1. - lx; 61 | // do bilinear interpolation 62 | T v1 = bottom_data[y_low * width + x_low]; 63 | T v2 = bottom_data[y_low * width + x_high]; 64 | T v3 = bottom_data[y_high * width + x_low]; 65 | T v4 = bottom_data[y_high * width + x_high]; 66 | T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 67 | 68 | T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 69 | 70 | return val; 71 | } 72 | 73 | template 74 | __global__ void RoIAlignForwardV2( 75 | const int nthreads, const T* bottom_data, const T spatial_scale, 76 | const int channels, const int height, const int width, 77 | const int pooled_height, const int pooled_width, const int sampling_ratio, 78 | const T* bottom_rois, T* top_data, bool aligned) { 79 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 80 | // (n, c, ph, pw) is an element in the pooled output 81 | int pw = index % pooled_width; 82 | int ph = (index / pooled_width) % pooled_height; 83 | int c = (index / pooled_width / pooled_height) % channels; 84 | int n = index / pooled_width / pooled_height / channels; 85 | 86 | const T* offset_bottom_rois = bottom_rois + n * 5; 87 | int roi_batch_ind = offset_bottom_rois[0]; 88 | 89 | // Do not use rounding; this implementation detail is critical 90 | T offset = aligned ? (T)0.5 : (T)0.0; 91 | T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; 92 | T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; 93 | T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; 94 | T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; 95 | 96 | T roi_width = roi_end_w - roi_start_w; 97 | T roi_height = roi_end_h - roi_start_h; 98 | if (!aligned) { // for backward-compatibility only 99 | roi_width = max(roi_width, (T)1.); 100 | roi_height = max(roi_height, (T)1.); 101 | } 102 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 103 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 104 | 105 | const T* offset_bottom_data = 106 | bottom_data + (roi_batch_ind * channels + c) * height * width; 107 | 108 | // We use roi_bin_grid to sample the grid and mimic integral 109 | int roi_bin_grid_h = (sampling_ratio > 0) 110 | ? 
sampling_ratio 111 | : ceil(roi_height / pooled_height); // e.g., = 2 112 | int roi_bin_grid_w = 113 | (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); 114 | 115 | // We do average (integral) pooling inside a bin 116 | // When the grid is empty, output zeros. 117 | const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 118 | 119 | T output_val = 0.; 120 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 121 | { 122 | const T y = roi_start_h + ph * bin_size_h + 123 | static_cast(iy + .5f) * bin_size_h / 124 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 125 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 126 | const T x = roi_start_w + pw * bin_size_w + 127 | static_cast(ix + .5f) * bin_size_w / 128 | static_cast(roi_bin_grid_w); 129 | 130 | T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, 131 | index); 132 | output_val += val; 133 | } 134 | } 135 | output_val /= count; 136 | 137 | top_data[index] = output_val; 138 | } 139 | } 140 | 141 | template 142 | __device__ void bilinear_interpolate_gradient( 143 | const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, 144 | int& x_low, int& x_high, int& y_low, int& y_high, 145 | const int index /* index for debug only*/) { 146 | // deal with cases that inverse elements are out of feature map boundary 147 | if (y < -1.0 || y > height || x < -1.0 || x > width) { 148 | // empty 149 | w1 = w2 = w3 = w4 = 0.; 150 | x_low = x_high = y_low = y_high = -1; 151 | return; 152 | } 153 | 154 | if (y <= 0) y = 0; 155 | if (x <= 0) x = 0; 156 | 157 | y_low = (int)y; 158 | x_low = (int)x; 159 | 160 | if (y_low >= height - 1) { 161 | y_high = y_low = height - 1; 162 | y = (T)y_low; 163 | } else { 164 | y_high = y_low + 1; 165 | } 166 | 167 | if (x_low >= width - 1) { 168 | x_high = x_low = width - 1; 169 | x = (T)x_low; 170 | } else { 171 | x_high = x_low + 1; 172 | } 173 | 174 | T ly = y - y_low; 175 | T lx = x - x_low; 176 | T hy = 1. - ly, hx = 1. - lx; 177 | 178 | // reference in forward 179 | // T v1 = bottom_data[y_low * width + x_low]; 180 | // T v2 = bottom_data[y_low * width + x_high]; 181 | // T v3 = bottom_data[y_high * width + x_low]; 182 | // T v4 = bottom_data[y_high * width + x_high]; 183 | // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); 184 | 185 | w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; 186 | 187 | return; 188 | } 189 | 190 | template 191 | __global__ void RoIAlignBackwardFeatureV2( 192 | const int nthreads, const T* top_diff, const int num_rois, 193 | const T spatial_scale, const int channels, const int height, 194 | const int width, const int pooled_height, const int pooled_width, 195 | const int sampling_ratio, T* bottom_diff, const T* bottom_rois, 196 | bool aligned) { 197 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 198 | // (n, c, ph, pw) is an element in the pooled output 199 | int pw = index % pooled_width; 200 | int ph = (index / pooled_width) % pooled_height; 201 | int c = (index / pooled_width / pooled_height) % channels; 202 | int n = index / pooled_width / pooled_height / channels; 203 | 204 | const T* offset_bottom_rois = bottom_rois + n * 5; 205 | int roi_batch_ind = offset_bottom_rois[0]; 206 | 207 | // Do not use rounding; this implementation detail is critical 208 | T offset = aligned ? 
(T)0.5 : (T)0.0; 209 | T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; 210 | T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; 211 | T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; 212 | T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; 213 | 214 | T roi_width = roi_end_w - roi_start_w; 215 | T roi_height = roi_end_h - roi_start_h; 216 | if (!aligned) { // for backward-compatibility only 217 | roi_width = max(roi_width, (T)1.); 218 | roi_height = max(roi_height, (T)1.); 219 | } 220 | T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); 221 | T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); 222 | 223 | T* offset_bottom_diff = 224 | bottom_diff + (roi_batch_ind * channels + c) * height * width; 225 | 226 | int top_offset = (n * channels + c) * pooled_height * pooled_width; 227 | const T* offset_top_diff = top_diff + top_offset; 228 | const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; 229 | 230 | // We use roi_bin_grid to sample the grid and mimic integral 231 | int roi_bin_grid_h = (sampling_ratio > 0) 232 | ? sampling_ratio 233 | : ceil(roi_height / pooled_height); // e.g., = 2 234 | int roi_bin_grid_w = 235 | (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); 236 | 237 | // We do average (integral) pooling inside a bin 238 | const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 239 | 240 | for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 241 | { 242 | const T y = roi_start_h + ph * bin_size_h + 243 | static_cast(iy + .5f) * bin_size_h / 244 | static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 245 | for (int ix = 0; ix < roi_bin_grid_w; ix++) { 246 | const T x = roi_start_w + pw * bin_size_w + 247 | static_cast(ix + .5f) * bin_size_w / 248 | static_cast(roi_bin_grid_w); 249 | 250 | T w1, w2, w3, w4; 251 | int x_low, x_high, y_low, y_high; 252 | 253 | bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, 254 | x_low, x_high, y_low, y_high, index); 255 | 256 | T g1 = top_diff_this_bin * w1 / count; 257 | T g2 = top_diff_this_bin * w2 / count; 258 | T g3 = top_diff_this_bin * w3 / count; 259 | T g4 = top_diff_this_bin * w4 / count; 260 | 261 | if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { 262 | atomicAdd(offset_bottom_diff + y_low * width + x_low, 263 | static_cast(g1)); 264 | atomicAdd(offset_bottom_diff + y_low * width + x_high, 265 | static_cast(g2)); 266 | atomicAdd(offset_bottom_diff + y_high * width + x_low, 267 | static_cast(g3)); 268 | atomicAdd(offset_bottom_diff + y_high * width + x_high, 269 | static_cast(g4)); 270 | } // if 271 | } // ix 272 | } // iy 273 | } // CUDA_1D_KERNEL_LOOP 274 | } // RoIAlignBackward 275 | 276 | at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, 277 | const at::Tensor& rois, 278 | const float spatial_scale, 279 | const int pooled_height, 280 | const int pooled_width, 281 | const int sampling_ratio, bool aligned) { 282 | AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); 283 | AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); 284 | at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; 285 | 286 | at::CheckedFrom c = "ROIAlign_forward_cuda"; 287 | at::checkAllSameGPU(c, {input_t, rois_t}); 288 | at::checkAllSameType(c, {input_t, rois_t}); 289 | at::cuda::CUDAGuard device_guard(input.device()); 290 | 291 | auto num_rois = rois.size(0); 292 | auto channels = input.size(1); 293 | auto height = input.size(2); 294 | auto width = 
input.size(3); 295 | 296 | auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, 297 | input.options()); 298 | auto output_size = num_rois * pooled_height * pooled_width * channels; 299 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 300 | 301 | dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), static_cast<int64_t>(4096))); 302 | dim3 block(512); 303 | 304 | if (output.numel() == 0) { 305 | AT_CUDA_CHECK(cudaGetLastError()); 306 | return output; 307 | } 308 | 309 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { 310 | RoIAlignForwardV2<scalar_t><<<grid, block, 0, stream>>>( 311 | output_size, input.contiguous().data_ptr<scalar_t>(), spatial_scale, 312 | channels, height, width, pooled_height, pooled_width, sampling_ratio, 313 | rois.contiguous().data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), aligned); 314 | }); 315 | cudaDeviceSynchronize(); 316 | AT_CUDA_CHECK(cudaGetLastError()); 317 | return output; 318 | } 319 | 320 | // TODO remove the dependency on input and use instead its sizes -> save memory 321 | at::Tensor ROIAlignBackwardV2Laucher( 322 | const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, 323 | const int pooled_height, const int pooled_width, const int batch_size, 324 | const int channels, const int height, const int width, 325 | const int sampling_ratio, bool aligned) { 326 | AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); 327 | AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); 328 | 329 | at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; 330 | at::CheckedFrom c = "ROIAlign_backward_cuda"; 331 | at::checkAllSameGPU(c, {grad_t, rois_t}); 332 | at::checkAllSameType(c, {grad_t, rois_t}); 333 | at::cuda::CUDAGuard device_guard(grad.device()); 334 | 335 | auto num_rois = rois.size(0); 336 | auto grad_input = 337 | at::zeros({batch_size, channels, height, width}, grad.options()); 338 | 339 | cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 340 | 341 | dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), static_cast<int64_t>(4096))); 342 | dim3 block(512); 343 | 344 | // handle possibly empty gradients 345 | if (grad.numel() == 0) { 346 | AT_CUDA_CHECK(cudaGetLastError()); 347 | return grad_input; 348 | } 349 | 350 | AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { 351 | RoIAlignBackwardFeatureV2<scalar_t><<<grid, block, 0, stream>>>( 352 | grad.numel(), grad.contiguous().data_ptr<scalar_t>(), num_rois, 353 | spatial_scale, channels, height, width, pooled_height, pooled_width, 354 | sampling_ratio, grad_input.data_ptr<scalar_t>(), 355 | rois.contiguous().data_ptr<scalar_t>(), aligned); 356 | }); 357 | AT_CUDA_CHECK(cudaGetLastError()); 358 | return grad_input; 359 | } 360 | -------------------------------------------------------------------------------- /orbbec/roi_align/src/roi_align_ext.cpp: -------------------------------------------------------------------------------- 1 | #include <ATen/ATen.h> 2 | #include <torch/extension.h> 3 | 4 | #include <cmath> 5 | #include <vector> 6 | 7 | #ifdef WITH_CUDA 8 | int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, 9 | const float spatial_scale, const int sample_num, 10 | const int channels, const int height, 11 | const int width, const int num_rois, 12 | const int pooled_height, const int pooled_width, 13 | at::Tensor output); 14 | 15 | int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, 16 | const float spatial_scale, const int sample_num, 17 | const int channels, const int height, 18 | const int width, const int num_rois, 19 | const int
pooled_height, const int pooled_width, 20 | at::Tensor bottom_grad); 21 | 22 | at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, 23 | const at::Tensor& rois, 24 | const float spatial_scale, 25 | const int pooled_height, 26 | const int pooled_width, 27 | const int sampling_ratio, bool aligned); 28 | 29 | at::Tensor ROIAlignBackwardV2Laucher( 30 | const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, 31 | const int pooled_height, const int pooled_width, const int batch_size, 32 | const int channels, const int height, const int width, 33 | const int sampling_ratio, bool aligned); 34 | #endif 35 | 36 | at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input, 37 | const at::Tensor& rois, 38 | const float spatial_scale, 39 | const int pooled_height, 40 | const int pooled_width, 41 | const int sampling_ratio, bool aligned); 42 | 43 | at::Tensor ROIAlignBackwardV2CPULaucher( 44 | const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, 45 | const int pooled_height, const int pooled_width, const int batch_size, 46 | const int channels, const int height, const int width, 47 | const int sampling_ratio, bool aligned); 48 | 49 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") 50 | #define CHECK_CONTIGUOUS(x) \ 51 | TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") 52 | #define CHECK_INPUT(x) \ 53 | CHECK_CUDA(x); \ 54 | CHECK_CONTIGUOUS(x) 55 | 56 | int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height, 57 | int pooled_width, float spatial_scale, int sample_num, 58 | at::Tensor output) { 59 | if (features.device().is_cuda()) { 60 | #ifdef WITH_CUDA 61 | CHECK_INPUT(features); 62 | CHECK_INPUT(rois); 63 | CHECK_INPUT(output); 64 | at::DeviceGuard guard(features.device()); 65 | 66 | // Number of ROIs 67 | int num_rois = rois.size(0); 68 | int size_rois = rois.size(1); 69 | 70 | if (size_rois != 5) { 71 | printf("wrong roi size\n"); 72 | return 0; 73 | } 74 | 75 | int num_channels = features.size(1); 76 | int data_height = features.size(2); 77 | int data_width = features.size(3); 78 | 79 | ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num, 80 | num_channels, data_height, data_width, num_rois, 81 | pooled_height, pooled_width, output); 82 | 83 | return 1; 84 | #else 85 | AT_ERROR("ROIAlign is not compiled with GPU support"); 86 | #endif 87 | } 88 | AT_ERROR("ROIAlign is not implemented on CPU"); 89 | } 90 | 91 | int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height, 92 | int pooled_width, float spatial_scale, int sample_num, 93 | at::Tensor bottom_grad) { 94 | if (top_grad.device().is_cuda()) { 95 | #ifdef WITH_CUDA 96 | CHECK_INPUT(top_grad); 97 | CHECK_INPUT(rois); 98 | CHECK_INPUT(bottom_grad); 99 | at::DeviceGuard guard(top_grad.device()); 100 | 101 | // Number of ROIs 102 | int num_rois = rois.size(0); 103 | int size_rois = rois.size(1); 104 | if (size_rois != 5) { 105 | printf("wrong roi size\n"); 106 | return 0; 107 | } 108 | 109 | int num_channels = bottom_grad.size(1); 110 | int data_height = bottom_grad.size(2); 111 | int data_width = bottom_grad.size(3); 112 | 113 | ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num, 114 | num_channels, data_height, data_width, num_rois, 115 | pooled_height, pooled_width, bottom_grad); 116 | 117 | return 1; 118 | #else 119 | AT_ERROR("ROIAlign is not compiled with GPU support"); 120 | #endif 121 | } 122 | AT_ERROR("ROIAlign is not implemented on CPU"); 123 | } 124 | 125 | // 
Interface for Python 126 | inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input, 127 | const at::Tensor& rois, 128 | const float spatial_scale, 129 | const int pooled_height, 130 | const int pooled_width, 131 | const int sampling_ratio, bool aligned) { 132 | if (input.device().is_cuda()) { 133 | #ifdef WITH_CUDA 134 | return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height, 135 | pooled_width, sampling_ratio, aligned); 136 | #else 137 | AT_ERROR("ROIAlignV2 is not compiled with GPU support"); 138 | #endif 139 | } 140 | return ROIAlignForwardV2CPULaucher(input, rois, spatial_scale, pooled_height, 141 | pooled_width, sampling_ratio, aligned); 142 | } 143 | 144 | inline at::Tensor ROIAlign_backwardV2( 145 | const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, 146 | const int pooled_height, const int pooled_width, const int batch_size, 147 | const int channels, const int height, const int width, 148 | const int sampling_ratio, bool aligned) { 149 | if (grad.device().is_cuda()) { 150 | #ifdef WITH_CUDA 151 | return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height, 152 | pooled_width, batch_size, channels, height, 153 | width, sampling_ratio, aligned); 154 | #else 155 | AT_ERROR("ROIAlignV2 is not compiled with GPU support"); 156 | #endif 157 | } 158 | return ROIAlignBackwardV2CPULaucher(grad, rois, spatial_scale, pooled_height, 159 | pooled_width, batch_size, channels, 160 | height, width, sampling_ratio, aligned); 161 | } 162 | 163 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 164 | m.def("forward_v1", &ROIAlign_forwardV1, "Roi_Align V1 forward"); 165 | m.def("backward_v1", &ROIAlign_backwardV1, "Roi_Align V1 backward"); 166 | m.def("forward_v2", &ROIAlign_forwardV2, "Roi_Align V2 forward"); 167 | m.def("backward_v2", &ROIAlign_backwardV2, "Roi_Align V2 backward"); 168 | } 169 | -------------------------------------------------------------------------------- /orbbec/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # from . import compiling_info 2 | from .compiling_info import get_compiler_version, get_compiling_cuda_version 3 | 4 | # get_compiler_version = compiling_info.get_compiler_version 5 | # get_compiling_cuda_version = compiling_info.get_compiling_cuda_version 6 | 7 | __all__ = ['get_compiler_version', 'get_compiling_cuda_version'] 8 | -------------------------------------------------------------------------------- /orbbec/utils/src/compiling_info.cpp: -------------------------------------------------------------------------------- 1 | // modified from 2 | // https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/vision.cpp 3 | #include <torch/extension.h> 4 | 5 | #ifdef WITH_CUDA 6 | #include <cuda_runtime_api.h> 7 | int get_cudart_version() { return CUDART_VERSION; } 8 | #endif 9 | 10 | std::string get_compiling_cuda_version() { 11 | #ifdef WITH_CUDA 12 | std::ostringstream oss; 13 | 14 | // copied from 15 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 16 | auto printCudaStyleVersion = [&](int v) { 17 | oss << (v / 1000) << "." << (v / 10 % 100); 18 | if (v % 10 != 0) { 19 | oss << "."
<< (v % 10); 20 | } 21 | }; 22 | printCudaStyleVersion(get_cudart_version()); 23 | return oss.str(); 24 | #else 25 | return std::string("not available"); 26 | #endif 27 | } 28 | 29 | // similar to 30 | // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp 31 | std::string get_compiler_version() { 32 | std::ostringstream ss; 33 | #if defined(__GNUC__) 34 | #ifndef __clang__ 35 | { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } 36 | #endif 37 | #endif 38 | 39 | #if defined(__clang_major__) 40 | { 41 | ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 42 | << __clang_patchlevel__; 43 | } 44 | #endif 45 | 46 | #if defined(_MSC_VER) 47 | { ss << "MSVC " << _MSC_FULL_VER; } 48 | #endif 49 | return ss.str(); 50 | } 51 | 52 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 53 | m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); 54 | m.def("get_compiling_cuda_version", &get_compiling_cuda_version, "get_compiling_cuda_version"); 55 | } 56 | -------------------------------------------------------------------------------- /orbbec/warpaffine/__init__.py: -------------------------------------------------------------------------------- 1 | from .warpaffine_ext import affine_opencv, affine_torch 2 | 3 | __all__ = ['affine_opencv', 'affine_torch'] -------------------------------------------------------------------------------- /orbbec/warpaffine/src/cpu/warpaffine_opencv.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | namespace py = pybind11; 10 | 11 | /* Python->C++ Mat */ 12 | cv::Mat numpy_uint8_1c_to_cv_mat(py::array_t<unsigned char>& input) 13 | { 14 | if (input.ndim() != 2) 15 | throw std::runtime_error("1-channel image must be 2 dims "); 16 | 17 | py::buffer_info buf = input.request(); 18 | cv::Mat mat(buf.shape[0], buf.shape[1], CV_8UC1, (unsigned char*)buf.ptr); 19 | 20 | return mat; 21 | } 22 | 23 | cv::Mat numpy_uint8_3c_to_cv_mat(py::array_t<unsigned char>& input) 24 | { 25 | if (input.ndim() != 3) 26 | throw std::runtime_error("3-channel image must be 3 dims "); 27 | 28 | py::buffer_info buf = input.request(); 29 | cv::Mat mat(buf.shape[0], buf.shape[1], CV_8UC3, (unsigned char*)buf.ptr); 30 | 31 | return mat; 32 | } 33 | 34 | /* C++ Mat ->numpy */ 35 | py::array_t<unsigned char> cv_mat_uint8_1c_to_numpy(cv::Mat& input) 36 | { 37 | py::array_t<unsigned char> dst; 38 | dst = py::array_t<unsigned char>({ input.rows,input.cols }, input.data); 39 | 40 | return dst; 41 | } 42 | 43 | py::array_t<unsigned char> cv_mat_uint8_3c_to_numpy(cv::Mat& input) 44 | { 45 | py::array_t<unsigned char> dst; 46 | dst = py::array_t<unsigned char>({ input.rows,input.cols,3}, input.data); 47 | 48 | return dst; 49 | } 50 | 51 | py::array_t<unsigned char> affine_opencv(py::array_t<unsigned char>& input, 52 | py::array_t<float>& from_point, 53 | py::array_t<float>& to_point) 54 | { 55 | // step1: get affine transform matrix 56 | py::buffer_info from_p_buf = from_point.request(); 57 | py::buffer_info from_t_buf = to_point.request(); 58 | float* fp = (float*)from_p_buf.ptr; 59 | float* tp = (float*)from_t_buf.ptr; 60 | int fp_stride = from_p_buf.shape[1]; 61 | int tp_stride = from_t_buf.shape[1]; 62 | 63 | cv::Point2f src[3] = {}; 64 | cv::Point2f dst[3] = {}; 65 | 66 | for(int i = 0; i < from_p_buf.shape[0]; i++) 67 | { 68 | src[i] = cv::Point2f(fp[fp_stride * i + 0], fp[fp_stride * i + 1]); 69 | dst[i] = cv::Point2f(tp[tp_stride * i + 0], tp[tp_stride * i + 1]); 70 | } 71 | 72 | cv::Mat H = cv::getAffineTransform(src, dst); 73 | 74 | // step2: run affine transform 75 | cv::Mat input_mat =
numpy_uint8_1c_to_cv_mat(input); 76 | cv::Mat output; 77 | cv::warpAffine(input_mat, output, H, cv::Size(600, 800), cv::INTER_LINEAR); 78 | 79 | return cv_mat_uint8_1c_to_numpy(output); 80 | } -------------------------------------------------------------------------------- /orbbec/warpaffine/src/cpu/warpaffine_torch_v1.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | 3 | template <typename scalar_t> 4 | void affine_cpu_kernel( 5 | int32_t inHeight, 6 | int32_t inWidth, 7 | int32_t inChannel, 8 | int32_t inPlanarSize, 9 | int32_t outHeight, 10 | int32_t outWidth, 11 | int32_t outPlanarSize, 12 | scalar_t* dst, 13 | const scalar_t* src, 14 | const scalar_t* M, 15 | float delta); 16 | 17 | template <typename scalar_t> 18 | at::Tensor affine_torch_cpu(const at::Tensor& input, /*[B, C, H, W]*/ 19 | const at::Tensor& from, /*[B, 3, 3]*/ 20 | const at::Tensor& to, /*[B, 3, 2]*/ 21 | const int out_h, 22 | const int out_w) 23 | { 24 | // step1. get affine transform matrix 25 | AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); 26 | AT_ASSERTM(from.device().is_cpu(), "from point must be a CPU tensor"); 27 | AT_ASSERTM(to.device().is_cpu(), "to point must be a CPU tensor"); 28 | 29 | // F = ((X^T*X)^-1)*(X^T*Y) 30 | auto matrix_l = (torch::transpose(from,1, 2).bmm(from)).inverse(); //(X^T*X)^-1) 31 | auto matrix_l_ptr = matrix_l.contiguous().data_ptr<scalar_t>(); 32 | auto matrix_r = (torch::transpose(from, 1, 2)).bmm(to); //(X^T*Y) 33 | auto matrix_r_ptr = matrix_r.contiguous().data_ptr<scalar_t>(); 34 | auto affine = matrix_l.bmm(matrix_r); 35 | auto affine_ptr = affine.data_ptr<scalar_t>(); 36 | 37 | auto affine_matrix = torch::transpose(affine, 1, 2); // ((X^T*X)^-1) * (X^T*Y))^T --> [B, 2, 3]. 38 | 39 | // step2. affine per imgs 40 | // get data pointer 41 | auto matrix_ptr = affine_matrix.contiguous().data_ptr<scalar_t>(); 42 | auto input_ptr = input.contiguous().data_ptr<scalar_t>(); 43 | 44 | auto nimgs = input.size(0); 45 | auto img_c = input.size(1); 46 | auto img_h = input.size(2); 47 | auto img_w = input.size(3); 48 | auto input_size = img_c * img_h * img_w; 49 | auto output_size = img_c * out_h * out_w; 50 | 51 | // build dst tensor 52 | auto output_tensor = at::zeros({nimgs, img_c, out_h, out_w}, input.options()); 53 | auto output_ptr = output_tensor.contiguous().data_ptr<scalar_t>(); 54 | 55 | for(int i = 0; i < nimgs; i++) 56 | { 57 | scalar_t* matrix = matrix_ptr + i * 6; 58 | scalar_t* in = input_ptr + i * input_size; 59 | scalar_t* out = output_ptr + i * output_size; 60 | affine_cpu_kernel<scalar_t>(img_h, img_w, img_c, img_w*img_h, 61 | out_h, out_w, out_h*out_w, out, in, matrix, 0.0f); 62 | } 63 | 64 | return output_tensor; 65 | } 66 | 67 | 68 | at::Tensor affine_cpu(const at::Tensor& input, /*[B, C, H, W]*/ 69 | const at::Tensor& from, /*[B, 3, 3]*/ 70 | const at::Tensor& to, /*[B, 3, 2]*/ 71 | const int out_h, 72 | const int out_w) 73 | { 74 | at::Tensor result; 75 | // AT_DISPATCH_FLOATING_TYPES: input.scalar_type() => scalar_t 76 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "affine_cpu", [&] { 77 | result = affine_torch_cpu<scalar_t>(input, from, to, out_h, out_w); 78 | }); 79 | 80 | return result; 81 | } -------------------------------------------------------------------------------- /orbbec/warpaffine/src/cpu/warpaffine_torch_v2.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | // #include <opencv2/opencv.hpp> 3 | 4 | /* 5 | https://www.cnblogs.com/shine-lee/p/10950963.html 6 | */ 7 | template <typename scalar_t> 8 | void affine_cpu_kernel( 9 | int32_t inHeight, 10 | int32_t inWidth, 11 | int32_t
inChannel, 12 | int32_t inPlanarSize, 13 | int32_t outHeight, 14 | int32_t outWidth, 15 | int32_t outPlanarSize, 16 | scalar_t* dst, 17 | const scalar_t* src, 18 | const scalar_t* M, 19 | float delta) 20 | { 21 | for (int32_t i = 0; i < outHeight; i++) 22 | { 23 | float base_x = M[1] * i + M[2]; 24 | float base_y = M[4] * i + M[5]; 25 | 26 | for (int32_t j = 0; j < outWidth; j++) 27 | { 28 | float x = base_x + M[0] * j; 29 | float y = base_y + M[3] * j; 30 | int32_t sx0 = (int32_t)x; 31 | int32_t sy0 = (int32_t)y; 32 | 33 | float u = x - sx0; 34 | float v = y - sy0; 35 | 36 | float tab[4]; 37 | float taby[2], tabx[2]; 38 | float v0, v1, v2, v3; 39 | taby[0] = 1.0f - v; 40 | taby[1] = v; 41 | tabx[0] = 1.0f - u; 42 | tabx[1] = u; 43 | 44 | tab[0] = taby[0] * tabx[0]; 45 | tab[1] = taby[0] * tabx[1]; 46 | tab[2] = taby[1] * tabx[0]; 47 | tab[3] = taby[1] * tabx[1]; 48 | 49 | int32_t idxDst = (i * outWidth + j); 50 | 51 | bool flag0 = (sx0 >= 0 && sx0 < inWidth && sy0 >= 0 && sy0 < inHeight); 52 | bool flag1 = (sx0 + 1 >= 0 && sx0 + 1 < inWidth && sy0 >= 0 && sy0 < inHeight); 53 | bool flag2 = (sx0 >= 0 && sx0 < inWidth && sy0 + 1 >= 0 && sy0 + 1 < inHeight); 54 | bool flag3 = (sx0 + 1 >= 0 && sx0 + 1 < inWidth && sy0 + 1 >= 0 && sy0 + 1 < inHeight); 55 | 56 | for(int32_t c = 0; c < inChannel; c++) 57 | { 58 | int32_t position1 = ((sy0 + 0) * inWidth + sx0); 59 | int32_t position2 = ((sy0 + 1) * inWidth + sx0); 60 | v0 = flag0 ? src[position1 + c * inPlanarSize + 0] : delta; 61 | v1 = flag1 ? src[position1 + c * inPlanarSize + 1] : delta; 62 | v2 = flag2 ? src[position2 + c * inPlanarSize + 0] : delta; 63 | v3 = flag3 ? src[position2 + c * inPlanarSize + 1] : delta; 64 | scalar_t sum = 0.0f; 65 | sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; 66 | dst[idxDst + c * outPlanarSize] = static_cast<scalar_t>(sum); 67 | } 68 | } 69 | } 70 | } 71 | 72 | template <typename scalar_t> 73 | at::Tensor affine_torch_cpu(const at::Tensor& input, /*[B, C, H, W]*/ 74 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 75 | const int out_h, 76 | const int out_w) 77 | { 78 | AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); 79 | AT_ASSERTM(affine_matrix.device().is_cpu(), "affine_matrix must be a CPU tensor"); 80 | 81 | // auto input_cpy = input.contiguous(); 82 | // auto in_tensor = input_cpy.squeeze().permute({1, 2, 0}).contiguous(); 83 | // in_tensor = in_tensor.mul(255).clamp(0, 255).to(torch::kU8); 84 | // in_tensor = in_tensor.to(torch::kCPU); 85 | // cv::Mat resultImg(800, 600, CV_8UC3); 86 | // std::memcpy((void *) resultImg.data, in_tensor.data_ptr(), sizeof(torch::kU8) * in_tensor.numel()); 87 | // cv::imwrite("input.png", resultImg); 88 | 89 | auto matrix_ptr = affine_matrix.contiguous().data_ptr<scalar_t>(); 90 | auto input_ptr = input.contiguous().data_ptr<scalar_t>(); 91 | auto nimgs = input.size(0); 92 | auto img_c = input.size(1); 93 | auto img_h = input.size(2); 94 | auto img_w = input.size(3); 95 | auto in_img_size = img_c * img_h * img_w; 96 | auto out_img_size = img_c * out_h * out_w; 97 | 98 | // build dst tensor 99 | auto output_tensor = at::zeros({nimgs, img_c, out_h, out_w}, input.options()); 100 | auto output_ptr = output_tensor.contiguous().data_ptr<scalar_t>(); 101 | 102 | for(int i = 0; i < nimgs; i++) 103 | { 104 | scalar_t* matrix = matrix_ptr + i * 6; 105 | scalar_t* in = input_ptr + i * in_img_size; 106 | scalar_t* out = output_ptr + i * out_img_size; 107 | affine_cpu_kernel<scalar_t>(img_h, img_w, img_c, img_w*img_h, 108 | out_h, out_w, out_h*out_w, out, in, matrix, 0.0f); 109 | } 110 | 111 | return output_tensor; 112
| } 113 | 114 | 115 | at::Tensor affine_cpu(const at::Tensor& input, /*[B, C, H, W]*/ 116 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 117 | const int out_h, 118 | const int out_w) 119 | { 120 | at::Tensor result; 121 | // AT_DISPATCH_FLOATING_TYPES: input.scalar_type() => scalar_t 122 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "affine_cpu", [&] { 123 | result = affine_torch_cpu<scalar_t>(input, affine_matrix, out_h, out_w); 124 | }); 125 | return result; 126 | } -------------------------------------------------------------------------------- /orbbec/warpaffine/src/cuda/warpaffine_cuda.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include <torch/extension.h> 3 | 4 | #define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") 5 | #define CHECK_CONTIGUOUS(x) \ 6 | TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") 7 | #define CHECK_INPUT(x) \ 8 | CHECK_CUDA(x); \ 9 | CHECK_CONTIGUOUS(x) 10 | 11 | at::Tensor affine_cuda_forward(const at::Tensor& input, /*[B, C, H, W]*/ 12 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 13 | const int out_h, 14 | const int out_w); 15 | 16 | at::Tensor affine_gpu(const at::Tensor& input, /*[B, C, H, W]*/ 17 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 18 | const int out_h, 19 | const int out_w) 20 | { 21 | CHECK_INPUT(input); 22 | CHECK_INPUT(affine_matrix); 23 | 24 | // Ensure CUDA uses the input tensor device. 25 | at::DeviceGuard guard(input.device()); 26 | 27 | return affine_cuda_forward(input, affine_matrix, out_h, out_w); 28 | } -------------------------------------------------------------------------------- /orbbec/warpaffine/src/cuda/warpaffine_kernel.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
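// Reading guide for this file: affine_gpu_kernel computes one output pixel per
// CUDA thread. The linear thread index is decomposed into (n, c, ph, pw); the
// 2x3 affine matrix of image n maps the output coordinate (pw, ph) back into
// the source image, and the four neighbouring source pixels are blended with
// bilinear weights (taps that fall outside the image read as `delta`).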
2 | // /* pytorch: 1.5.0 ~ 1.10.x */ 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | // /* ---------------------- */ 10 | 11 | // /* pytorch: 1.11.0 ~ latest */ 12 | // #include 13 | // #include 14 | // #include 15 | // #include 16 | 17 | // #include 18 | // #include 19 | 20 | 21 | #define THREADS_PER_BLOCK 1024 22 | 23 | inline int GET_BLOCKS(const int N) 24 | { 25 | int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; 26 | int max_block_num = 65000; 27 | return min(optimal_block_num, max_block_num); 28 | } 29 | 30 | 31 | template <typename scalar_t> 32 | __global__ void affine_gpu_kernel( 33 | int64_t output_size, 34 | int32_t inHeight, 35 | int32_t inWidth, 36 | int32_t inChannel, 37 | int32_t outHeight, 38 | int32_t outWidth, 39 | scalar_t* dst, 40 | const scalar_t* src, 41 | const scalar_t* M, 42 | float delta) 43 | { 44 | // thread idx 45 | const int64_t thread_idx = blockIdx.x * blockDim.x + threadIdx.x; 46 | if (thread_idx >= output_size) 47 | return; 48 | 49 | // (n, c, ph, pw) is an element in the pooled output 50 | int pw = thread_idx % outWidth; 51 | int ph = (thread_idx / outWidth) % outHeight; 52 | int c = (thread_idx / outWidth / outHeight) % inChannel; 53 | int n = thread_idx / outWidth / outHeight / inChannel; 54 | 55 | // convert pw, ph to x, y (image n has its own 2x3 matrix: M is laid out as [B, 2, 3]) 56 | const scalar_t* m = M + n * 6; 57 | float base_x = m[1] * ph + m[2], base_y = m[4] * ph + m[5]; 58 | float x = base_x + m[0] * pw; 59 | float y = base_y + m[3] * pw; 60 | int32_t sx0 = (int32_t)x; 61 | int32_t sy0 = (int32_t)y; 62 | 63 | float u = x - sx0; 64 | float v = y - sy0; 65 | 66 | float tab[4]; 67 | float taby[2], tabx[2]; 68 | float v0, v1, v2, v3; 69 | taby[0] = 1.0f - v; 70 | taby[1] = v; 71 | tabx[0] = 1.0f - u; 72 | tabx[1] = u; 73 | 74 | tab[0] = taby[0] * tabx[0]; 75 | tab[1] = taby[0] * tabx[1]; 76 | tab[2] = taby[1] * tabx[0]; 77 | tab[3] = taby[1] * tabx[1]; 78 | 79 | bool flag0 = (sx0 >= 0 && sx0 < inWidth && sy0 >= 0 && sy0 < inHeight); 80 | bool flag1 = (sx0 + 1 >= 0 && sx0 + 1 < inWidth && sy0 >= 0 && sy0 < inHeight); 81 | bool flag2 = (sx0 >= 0 && sx0 < inWidth && sy0 + 1 >= 0 && sy0 + 1 < inHeight); 82 | bool flag3 = (sx0 + 1 >= 0 && sx0 + 1 < inWidth && sy0 + 1 >= 0 && sy0 + 1 < inHeight); 83 | 84 | int64_t input_feature_size = inHeight * inWidth * inChannel; 85 | int64_t in_planar_size = inHeight * inWidth; 86 | int64_t position1 = (n * input_feature_size + c * in_planar_size + (sy0 + 0) * inWidth + sx0); 87 | int64_t position2 = (n * input_feature_size + c * in_planar_size + (sy0 + 1) * inWidth + sx0); 88 | v0 = flag0 ? src[position1 + 0] : delta; 89 | v1 = flag1 ? src[position1 + 1] : delta; 90 | v2 = flag2 ? src[position2 + 0] : delta; 91 | v3 = flag3 ?
src[position2 + 1] : delta; 92 | scalar_t sum = 0.0f; 93 | sum += v0 * tab[0] + v1 * tab[1] + v2 * tab[2] + v3 * tab[3]; 94 | dst[thread_idx] = static_cast<scalar_t>(sum); 95 | } 96 | 97 | 98 | at::Tensor affine_cuda_forward(const at::Tensor& input, /*[B, C, H, W]*/ 99 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 100 | const int out_h, 101 | const int out_w) 102 | { 103 | // build dst tensor 104 | auto nimgs = input.size(0); 105 | auto img_c = input.size(1); 106 | auto img_h = input.size(2); 107 | auto img_w = input.size(3); 108 | const int output_size = nimgs * img_c * out_h * out_w; 109 | auto output_tensor = at::zeros({nimgs, img_c, out_h, out_w}, input.options()); 110 | 111 | AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "affine_cuda", [&] { 112 | auto matrix_ptr = affine_matrix.data_ptr<scalar_t>(); 113 | auto input_ptr = input.data_ptr<scalar_t>(); 114 | auto output_ptr = output_tensor.data_ptr<scalar_t>(); 115 | 116 | // launch kernel function on GPU with CUDA. 117 | affine_gpu_kernel<scalar_t><<<GET_BLOCKS(output_size), 118 | THREADS_PER_BLOCK>>>(output_size, img_h, 119 | img_w, img_c, out_h, out_w, output_ptr, input_ptr, matrix_ptr, 0.0f); 120 | }); 121 | 122 | return output_tensor; 123 | } -------------------------------------------------------------------------------- /orbbec/warpaffine/src/warpaffine_ext.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <pybind11/numpy.h> 3 | 4 | 5 | py::array_t<unsigned char> affine_opencv(py::array_t<unsigned char>& input, 6 | py::array_t<float>& from_point, 7 | py::array_t<float>& to_point); 8 | 9 | 10 | at::Tensor affine_cpu(const at::Tensor& input, /*[B, C, H, W]*/ 11 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 12 | const int out_h, 13 | const int out_w); 14 | 15 | #ifdef WITH_CUDA 16 | at::Tensor affine_gpu(const at::Tensor& input, /*[B, C, H, W]*/ 17 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 18 | const int out_h, 19 | const int out_w); 20 | #endif 21 | 22 | 23 | at::Tensor affine_torch(const at::Tensor& input, /*[B, C, H, W]*/ 24 | const at::Tensor& affine_matrix, /*[B, 2, 3]*/ 25 | const int out_h, 26 | const int out_w) 27 | { 28 | if (input.device().is_cuda()) 29 | { 30 | #ifdef WITH_CUDA 31 | return affine_gpu(input, affine_matrix, out_h, out_w); 32 | #else 33 | AT_ERROR("affine is not compiled with GPU support"); 34 | #endif 35 | } 36 | return affine_cpu(input, affine_matrix, out_h, out_w); 37 | } 38 | 39 | 40 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 41 | m.def("affine_opencv", &affine_opencv, "affine with c++ opencv"); 42 | m.def("affine_torch", &affine_torch, "affine with c++ libtorch"); 43 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements/build.txt 2 | -r requirements/optional.txt 3 | -r requirements/runtime.txt 4 | -r requirements/tests.txt 5 | -------------------------------------------------------------------------------- /requirements/build.txt: -------------------------------------------------------------------------------- 1 | # These must be installed before building mmdetection 2 | numpy 3 | torch>=1.3 4 | -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | recommonmark 2 | sphinx 3 | sphinx_markdown_tables 4 | sphinx_rtd_theme 5 | -------------------------------------------------------------------------------- /requirements/optional.txt:
-------------------------------------------------------------------------------- 1 | albumentations>=0.3.2 2 | cityscapesscripts 3 | imagecorruptions 4 | -------------------------------------------------------------------------------- /requirements/readthedocs.txt: -------------------------------------------------------------------------------- 1 | mmcv 2 | torch 3 | torchvision 4 | -------------------------------------------------------------------------------- /requirements/runtime.txt: -------------------------------------------------------------------------------- 1 | # matplotlib 2 | # mmcv>=0.5.1 3 | numpy 4 | # need older pillow until torchvision is fixed 5 | # Pillow<=6.2.2 6 | six 7 | terminaltables 8 | torch>=1.3 9 | torchvision 10 | -------------------------------------------------------------------------------- /requirements/tests.txt: -------------------------------------------------------------------------------- 1 | asynctest 2 | codecov 3 | flake8 4 | isort 5 | # Note: used for kwarray.group_items, this may be ported to mmcv in the future. 6 | kwarray 7 | pytest 8 | pytest-cov 9 | pytest-runner 10 | ubelt 11 | xdoctest >= 0.10.0 12 | yapf 13 | -------------------------------------------------------------------------------- /scripts/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/benchmark.py -------------------------------------------------------------------------------- /scripts/check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/check.py -------------------------------------------------------------------------------- /scripts/collect_env.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cv2 3 | import torch 4 | import subprocess 5 | import torchvision 6 | import os.path as osp 7 | from collections import defaultdict 8 | from orbbec.utils import get_compiler_version, get_compiling_cuda_version 9 | 10 | 11 | def collect_env(): 12 | env_info = {} 13 | env_info['sys.platform'] = sys.platform 14 | env_info['Python'] = sys.version.replace('\n', '') 15 | 16 | cuda_available = torch.cuda.is_available() 17 | env_info['CUDA available'] = cuda_available 18 | 19 | if cuda_available: 20 | from torch.utils.cpp_extension import CUDA_HOME 21 | env_info['CUDA_HOME'] = CUDA_HOME 22 | 23 | if CUDA_HOME is not None and osp.isdir(CUDA_HOME): 24 | try: 25 | nvcc = osp.join(CUDA_HOME, 'bin/nvcc') 26 | nvcc = subprocess.check_output( 27 | f'"{nvcc}" -V | tail -n1', shell=True) 28 | nvcc = nvcc.decode('utf-8').strip() 29 | except subprocess.SubprocessError: 30 | nvcc = 'Not Available' 31 | env_info['NVCC'] = nvcc 32 | 33 | devices = defaultdict(list) 34 | for k in range(torch.cuda.device_count()): 35 | devices[torch.cuda.get_device_name(k)].append(str(k)) 36 | for name, devids in devices.items(): 37 | env_info['GPU ' + ','.join(devids)] = name 38 | 39 | if sys.platform == 'linux': 40 | gcc = subprocess.check_output('gcc --version | head -n1', shell=True) 41 | gcc = gcc.decode('utf-8').strip() 42 | env_info['GCC'] = gcc 43 | env_info['OpenCV'] = cv2.__version__ 44 | env_info['PyTorch'] = torch.__version__ 45 | env_info['PyTorch compiling details'] = torch.__config__.show() 46 | env_info['TorchVision'] = torchvision.__version__ 47 | 
48 | env_info['C/C++ Compiler'] = get_compiler_version() 49 | env_info['CUDA Compiler'] = get_compiling_cuda_version() 50 | return env_info 51 | 52 | 53 | if __name__ == '__main__': 54 | for name, val in collect_env().items(): 55 | print(f'{name}\t: {val}') -------------------------------------------------------------------------------- /scripts/cpp_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/cpp_img.png -------------------------------------------------------------------------------- /scripts/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/demo.png -------------------------------------------------------------------------------- /scripts/grad_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/grad_check.py -------------------------------------------------------------------------------- /scripts/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/image.jpg -------------------------------------------------------------------------------- /scripts/opencv_world453.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/opencv_world453.dll -------------------------------------------------------------------------------- /scripts/py_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/py_img.png -------------------------------------------------------------------------------- /scripts/test_nms.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from orbbec.nms import nms_ext 5 | 6 | 7 | def nms(dets, iou_thr, device_id=None): 8 | """Dispatch to either CPU or GPU NMS implementations. 9 | 10 | The input can be either a torch tensor or numpy array. GPU NMS will be used 11 | if the input is a gpu tensor or device_id is specified, otherwise CPU NMS 12 | will be used. The returned type will always be the same as the inputs. 13 | 14 | Arguments: 15 | dets (torch.Tensor or np.ndarray): bboxes with scores. 16 | iou_thr (float): IoU threshold for NMS. 17 | device_id (int, optional): when `dets` is a numpy array, if `device_id` 18 | is None, then cpu nms is used, otherwise gpu_nms will be used. 19 | 20 | Returns: 21 | tuple: kept bboxes and indices, which are always of the same data type as 22 | the input.
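Note: `dets` is expected to have shape (N, 5), one row per box, laid out as
[x1, y1, x2, y2, score] in float32, as in the example below.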
23 | 24 | Example: 25 | >>> dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], 26 | >>> [49.3, 32.9, 51.0, 35.3, 0.9], 27 | >>> [49.2, 31.8, 51.0, 35.4, 0.5], 28 | >>> [35.1, 11.5, 39.1, 15.7, 0.5], 29 | >>> [35.6, 11.8, 39.3, 14.2, 0.5], 30 | >>> [35.3, 11.5, 39.9, 14.5, 0.4], 31 | >>> [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32) 32 | >>> iou_thr = 0.6 33 | >>> suppressed, inds = nms(dets, iou_thr) 34 | >>> assert len(inds) == len(suppressed) == 3 35 | """ 36 | # convert dets (tensor or numpy array) to tensor 37 | if isinstance(dets, torch.Tensor): 38 | is_numpy = False 39 | dets_th = dets 40 | elif isinstance(dets, np.ndarray): 41 | is_numpy = True 42 | device = 'cpu' if device_id is None else f'cuda:{device_id}' 43 | dets_th = torch.from_numpy(dets).to(device) 44 | else: 45 | raise TypeError('dets must be either a Tensor or numpy array, ' 46 | f'but got {type(dets)}') 47 | 48 | # execute cpu or cuda nms 49 | if dets_th.shape[0] == 0: 50 | inds = dets_th.new_zeros(0, dtype=torch.long) 51 | else: 52 | # nms_ext.nms dispatches internally to the CPU or CUDA 53 | # implementation according to the device of dets_th, 54 | # so a single call covers both cases. 55 | inds = nms_ext.nms(dets_th, iou_thr) 56 | 57 | if is_numpy: 58 | inds = inds.cpu().numpy() 59 | return dets[inds, :], inds 60 | 61 | 62 | if __name__ == "__main__": 63 | dets = np.array([[49.1, 32.4, 51.0, 35.9, 0.9], 64 | [49.3, 32.9, 51.0, 35.3, 0.9], 65 | [49.2, 31.8, 51.0, 35.4, 0.5], 66 | [35.1, 11.5, 39.1, 15.7, 0.5], 67 | [35.6, 11.8, 39.3, 14.2, 0.5], 68 | [35.3, 11.5, 39.9, 14.5, 0.4], 69 | [35.2, 11.7, 39.7, 15.7, 0.3]], dtype=np.float32) 70 | iou_thr = 0.6 71 | suppressed, inds = nms(dets, iou_thr) 72 | print(suppressed) 73 | print(inds) -------------------------------------------------------------------------------- /scripts/test_warpaffine_opencv.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch # must not be removed: some of torch's shared libraries need to be loaded dynamically first. 3 | import numpy as np 4 | from orbbec.warpaffine import affine_opencv 5 | 6 | data_path = "./demo.png" 7 | img = cv2.imread(data_path, cv2.IMREAD_GRAYSCALE) 8 | 9 | # numpy.array() on the Python side corresponds one-to-one to py::array_t in pybind11.
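# (For reference: on the C++ side in warpaffine_opencv.cpp these arrays arrive as
# py::array_t<unsigned char> / py::array_t<float> and are read through the pybind11
# buffer protocol, e.g. `py::buffer_info buf = from_point.request(); float* fp =
# (float*)buf.ptr;` -- so the image must be uint8 and the points float32, row-major.)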
10 | src_point = np.array([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=np.float32) 11 | dst_point = np.array([[38.29, 51.69], [73.53, 51.69], [56.02, 71.73]], dtype=np.float32) 12 | # python interface 13 | mat_trans = cv2.getAffineTransform(src_point, dst_point) 14 | res = cv2.warpAffine(img, mat_trans, (600,800)) 15 | cv2.imwrite("py_img.png", res) 16 | 17 | # C++ interface 18 | warpffine_img = affine_opencv(img, src_point, dst_point) 19 | cv2.imwrite("cpp_img.png", warpffine_img) -------------------------------------------------------------------------------- /scripts/test_warpaffine_torch_cpu.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from orbbec.warpaffine import affine_torch 5 | 6 | data_path = "demo.png" 7 | 8 | img = cv2.imread(data_path) 9 | # transform img(numpy.array) to tensor(torch.Tensor) 10 | # use permute 11 | img_tensor = torch.from_numpy(img / 255.0).permute(2, 0, 1).contiguous() 12 | img_tensor = img_tensor.unsqueeze(0).float() 13 | 14 | # src -> dst 15 | # src_tensor = torch.tensor([[262.0, 324.0, 1.0], [325.0, 323.0, 1.0], [295.0, 349.0, 1.0]], dtype=torch.float32).unsqueeze(0) 16 | # dst_tensor = torch.tensor([[38.29, 51.69], [73.53, 51.69], [56.02, 71.73]], dtype=torch.float32).unsqueeze(0) 17 | # dst -> src 18 | src_tensor = torch.tensor([[38.29, 51.69, 1.0], [73.53, 51.69, 1.0], [56.02, 71.73, 1.0]], dtype=torch.float32).unsqueeze(0) 19 | dst_tensor = torch.tensor([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=torch.float32).unsqueeze(0) 20 | 21 | # compute affine transform matrix 22 | matrix_l = torch.transpose(src_tensor, 1, 2).bmm(src_tensor) 23 | matrix_l = torch.inverse(matrix_l) 24 | matrix_r = torch.transpose(src_tensor, 1, 2).bmm(dst_tensor) 25 | affine_matrix = torch.transpose(matrix_l.bmm(matrix_r), 1, 2) 26 | 27 | warpffine_img = affine_torch(img_tensor, affine_matrix, 112, 112) 28 | 29 | warpffine_img = warpffine_img.squeeze(0).permute(1, 2, 0).numpy() 30 | cv2.imwrite("torch_affine_cpu.png", np.uint8(warpffine_img * 255.0)) 31 | cv2.imshow("img", warpffine_img) 32 | cv2.waitKey(0) -------------------------------------------------------------------------------- /scripts/test_warpaffine_torch_gpu.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import torch 3 | import numpy as np 4 | from orbbec.warpaffine import affine_torch 5 | 6 | data_path = "demo.png" 7 | 8 | img = cv2.imread(data_path) 9 | # transform img(numpy.array) to tensor(torch.Tensor) 10 | # use permute 11 | img_tensor = torch.from_numpy(img / 255.0).permute(2, 0, 1).contiguous() 12 | img_tensor = img_tensor.unsqueeze(0).float() 13 | img_tensor = img_tensor.cuda() 14 | # src -> dst 15 | # src_tensor = torch.tensor([[262.0, 324.0, 1.0], [325.0, 323.0, 1.0], [295.0, 349.0, 1.0]], dtype=torch.float32).unsqueeze(0) 16 | # dst_tensor = torch.tensor([[38.29, 51.69], [73.53, 51.69], [56.02, 71.73]], dtype=torch.float32).unsqueeze(0) 17 | # dst -> src 18 | src_tensor = torch.tensor([[38.29, 51.69, 1.0], [73.53, 51.69, 1.0], [56.02, 71.73, 1.0]], dtype=torch.float32).unsqueeze(0) 19 | dst_tensor = torch.tensor([[262.0, 324.0], [325.0, 323.0], [295.0, 349.0]], dtype=torch.float32).unsqueeze(0) 20 | src_tensor = src_tensor.cuda() 21 | dst_tensor = dst_tensor.cuda() 22 | 23 | # compute affine transform matrix 24 | matrix_l = torch.transpose(src_tensor, 1, 2).bmm(src_tensor) 25 | matrix_l = torch.inverse(matrix_l) 26 | matrix_r = 
torch.transpose(src_tensor, 1, 2).bmm(dst_tensor) 27 | affine_matrix = torch.transpose(matrix_l.bmm(matrix_r), 1, 2) 28 | affine_matrix = affine_matrix.contiguous().cuda() 29 | 30 | warpffine_img = affine_torch(img_tensor, affine_matrix, 112, 112) 31 | print(warpffine_img.shape) 32 | warpffine_img = warpffine_img.cpu().squeeze(0).permute(1, 2, 0).numpy() 33 | cv2.imwrite("torch_affine_gpu.png", np.uint8(warpffine_img * 255.0)) 34 | cv2.imshow("img", warpffine_img) 35 | cv2.waitKey(0) -------------------------------------------------------------------------------- /scripts/torch_affine_cpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/torch_affine_cpu.png -------------------------------------------------------------------------------- /scripts/torch_affine_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/scripts/torch_affine_gpu.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | import glob 5 | import time 6 | import torch 7 | import warnings 8 | import subprocess 9 | from setuptools import find_packages, setup 10 | from torch.utils.cpp_extension import include_paths, library_paths 11 | from torch.utils.cpp_extension import (BuildExtension, CppExtension, CUDAExtension) 12 | 13 | 14 | def parse_requirements(fname='requirements.txt', with_version=True): 15 | """ 16 | Parse the package dependencies listed in a requirements file, optionally 17 | stripping the specific versioning information.
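Example (illustrative, assuming it is run from the repo root): with the
requirements/build.txt shown above, parse_requirements('requirements/build.txt')
keeps the specifiers and yields ['numpy', 'torch>=1.3'], while
parse_requirements('requirements/build.txt', with_version=False) yields
['numpy', 'torch'].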
18 | 19 | Args: 20 | fname (str): path to requirements file 21 | with_version (bool, default=True): if True include version specs 22 | 23 | Returns: 24 | List[str]: list of requirements items 25 | 26 | CommandLine: 27 | python -c "import setup; print(setup.parse_requirements())" 28 | """ 29 | import sys 30 | from os.path import exists 31 | import re 32 | require_fpath = fname 33 | 34 | def parse_line(line): 35 | """ 36 | Parse information from a line in a requirements text file 37 | """ 38 | if line.startswith('-r '): 39 | # Allow specifying requirements in other files 40 | target = line.split(' ')[1] 41 | for info in parse_require_file(target): 42 | yield info 43 | else: 44 | info = {'line': line} 45 | if line.startswith('-e '): 46 | info['package'] = line.split('#egg=')[1] 47 | else: 48 | # Remove versioning from the package 49 | pat = '(' + '|'.join(['>=', '==', '>']) + ')' 50 | parts = re.split(pat, line, maxsplit=1) 51 | parts = [p.strip() for p in parts] 52 | 53 | info['package'] = parts[0] 54 | if len(parts) > 1: 55 | op, rest = parts[1:] 56 | if ';' in rest: 57 | # Handle platform specific dependencies 58 | # http://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies 59 | version, platform_deps = map(str.strip, rest.split(';')) 60 | info['platform_deps'] = platform_deps 61 | else: 62 | version = rest # NOQA 63 | info['version'] = (op, version) 64 | yield info 65 | 66 | def parse_require_file(fpath): 67 | with open(fpath, 'r') as f: 68 | for line in f.readlines(): 69 | line = line.strip() 70 | if line and not line.startswith('#'): 71 | for info in parse_line(line): 72 | yield info 73 | 74 | def gen_packages_items(): 75 | if exists(require_fpath): 76 | for info in parse_require_file(require_fpath): 77 | parts = [info['package']] 78 | if with_version and 'version' in info: 79 | parts.extend(info['version']) 80 | if not sys.version.startswith('3.4'): 81 | # apparently package_deps are broken in 3.4 82 | platform_deps = info.get('platform_deps') 83 | if platform_deps is not None: 84 | parts.append(';' + platform_deps) 85 | item = ''.join(parts) 86 | yield item 87 | 88 | packages = list(gen_packages_items()) 89 | return packages 90 | 91 | 92 | def make_extension(name, module, sources, sources_cuda=[], use_cpp_opencv=False): 93 | """ 94 | Build a single C++/CUDA extension module (CppExtension or CUDAExtension). 95 | """ 96 | include_dirs = [] 97 | library_dirs = [] 98 | libraries = [] 99 | define_macros = [] 100 | extra_compile_args = {'cxx': []} 101 | 102 | # use 3rdparty: opencv 103 | if use_cpp_opencv: 104 | # pybind11 105 | include_dirs.append(os.path.abspath('./3rdparty/pybind11/include')) 106 | # opencv 107 | if sys.platform == 'win32': 108 | # include_directories() 109 | include_dirs.append(os.path.abspath('./3rdparty/opencv/win/include')) 110 | # target_link_directories() 111 | library_dirs.append(os.path.abspath('./3rdparty/opencv/win/x64/vc15/lib')) 112 | # target_link_libraries() 113 | libraries.append('opencv_world453') 114 | elif sys.platform == "linux": 115 | # include_directories() 116 | include_dirs.append(os.path.abspath('./3rdparty/opencv/linux/include')) 117 | # target_link_directories() 118 | library_dirs.append(os.path.abspath('./3rdparty/opencv/linux/lib')) 119 | # target_link_libraries() 120 | libraries.append("opencv_world") 121 | else: 122 | raise ValueError(f'Unsupported platform: {sys.platform}') 123 | 124 | # gpu files 125 | if torch.cuda.is_available(): 126 | print(f'Compiling {name} with CUDA') 127 | include_dirs += include_paths(cuda=True) 128 | library_dirs +=
library_paths(cuda=True) 129 | libraries.append('cudart') 130 | # add macros WITH_CUDA in cpp code. 131 | define_macros += [('WITH_CUDA', None)] 132 | extension = CUDAExtension 133 | extra_compile_args['nvcc'] = [ 134 | '-D__CUDA_NO_HALF_OPERATORS__', 135 | '-D__CUDA_NO_HALF_CONVERSIONS__', 136 | '-D__CUDA_NO_HALF2_OPERATORS__', 137 | ] 138 | sources += sources_cuda 139 | else: 140 | print(f'Compiling {name} with cpp, but without CUDA') 141 | extension = CppExtension 142 | 143 | ext_ops = extension( 144 | name=f'{module}.{name}', 145 | sources=[os.path.join(*module.split('.'), p) for p in sources], 146 | define_macros=define_macros, 147 | extra_compile_args=extra_compile_args, 148 | include_dirs=include_dirs, 149 | library_dirs=library_dirs, 150 | libraries=libraries) 151 | 152 | return ext_ops 153 | 154 | 155 | if __name__ == '__main__': 156 | setup( 157 | name='orbbec', 158 | version="0.0.1", 159 | description='C++/CUDA extension for python or pytorch', 160 | author='orbbec jiaozhu', 161 | author_email='jiaozhu@orbbec.com', 162 | keywords='C++/CUDA, python, pytorch', 163 | url='https://github.com/ChenCVer/PYTHON_CPP_EXTENSION', 164 | packages=find_packages(exclude=('3rdparty', 'requirements', 'scripts', "tools")), 165 | package_data={'orbbec': ['*/*.pyd']}, 166 | classifiers=[ 167 | 'Development Status :: 4 - Beta', 168 | 'License :: OSI Approved :: Apache Software License', 169 | 'Operating System :: OS Independent', 170 | 'Programming Language :: Python :: 3', 171 | 'Programming Language :: Python :: 3.5', 172 | 'Programming Language :: Python :: 3.6', 173 | 'Programming Language :: Python :: 3.7', 174 | ], 175 | license='Apache License 2.0', 176 | setup_requires=parse_requirements('requirements/build.txt'), 177 | tests_require=parse_requirements('requirements/tests.txt'), 178 | install_requires=parse_requirements('requirements/runtime.txt'), 179 | extras_require={ 180 | 'all': parse_requirements('requirements.txt'), 181 | 'tests': parse_requirements('requirements/tests.txt'), 182 | 'build': parse_requirements('requirements/build.txt'), 183 | 'optional': parse_requirements('requirements/optional.txt'), 184 | }, 185 | ext_modules=[ 186 | # compile info 187 | make_extension( 188 | name='compiling_info', 189 | module='orbbec.utils', 190 | sources=['src/compiling_info.cpp']), 191 | 192 | # nms 193 | make_extension( 194 | name='nms_ext', 195 | module='orbbec.nms', 196 | sources=[ 197 | 'src/nms_ext.cpp', 198 | 'src/cpu/nms_cpu.cpp'], 199 | sources_cuda=[ 200 | 'src/cuda/nms_cuda.cpp', 201 | 'src/cuda/nms_kernel.cu']), 202 | 203 | # roi align 204 | make_extension( 205 | name='roi_align_ext', 206 | module='orbbec.roi_align', 207 | sources=[ 208 | 'src/roi_align_ext.cpp', 209 | 'src/cpu/roi_align_v2.cpp'], 210 | sources_cuda=[ 211 | 'src/cuda/roi_align_kernel.cu', 212 | 'src/cuda/roi_align_kernel_v2.cu']), 213 | 214 | # warpaffine 215 | make_extension( 216 | name='warpaffine_ext', 217 | module='orbbec.warpaffine', 218 | sources=[ 219 | 'src/warpaffine_ext.cpp', 220 | 'src/cpu/warpaffine_opencv.cpp', 221 | 'src/cpu/warpaffine_torch_v2.cpp'], 222 | sources_cuda=[ 223 | 'src/cuda/warpaffine_kernel.cu', 224 | 'src/cuda/warpaffine_cuda.cpp'], 225 | use_cpp_opencv=True), 226 | ], 227 | # if use_ninja is True, it will speed up the compilation process. 
228 | cmdclass={'build_ext': BuildExtension.with_options(use_ninja=False)}, 229 | zip_safe=False) -------------------------------------------------------------------------------- /tools/Dependencies_x64_Release.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChenCVer/python_cpp_extension/7984cc513026d5ffde88ea711fc803adb4044850/tools/Dependencies_x64_Release.7z --------------------------------------------------------------------------------
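For quick reference, a minimal smoke test of the built package (a sketch, not a file from this repo; it assumes the extensions were compiled first, e.g. with `python setup.py develop`, and it uses only the interfaces defined above):

import torch
from orbbec.utils import get_compiler_version, get_compiling_cuda_version
from orbbec.warpaffine import affine_torch

print(get_compiler_version(), get_compiling_cuda_version())

img = torch.rand(1, 3, 800, 600)                               # [B, C, H, W]
identity = torch.tensor([[[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]])  # [B, 2, 3] affine matrix
out = affine_torch(img, identity, 112, 112)                    # CPU path; move both tensors to .cuda() for the GPU path
print(out.shape)                                               # torch.Size([1, 3, 112, 112])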