├── Makefile ├── README.md ├── README.txt ├── data ├── dirtyflow_0001.flo ├── frame_0001.jpg ├── frame_0002.jpg └── gtflow_0001.flo ├── deepflowcuda.cu ├── deepflowcuda.h ├── main.cu └── myvec3f.h /Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | NVCCFLAGS = -c -g -O3 3 | LD = g++ 4 | LDFLAGS = -O3 5 | FINAL_TARGET = deepflow 6 | CUDA_DIR = /usr/local/cuda-9.2 7 | OPENCV_DIR = /usr/local/opencv-4.1.0-build 8 | INCLUDE_DIR = -I$(CUDA_DIR)/include -I$(OPENCV_DIR)/include/opencv4 9 | LIB_DIR = -L$(CUDA_DIR)/lib64 -L$(OPENCV_DIR)/lib 10 | LIBS = -lopencv_cudawarping -lopencv_cudafilters -lopencv_cudaimgproc -lopencv_cudaarithm -lopencv_cudalegacy -lopencv_video -lopencv_imgproc -lopencv_imgcodecs -lopencv_core -lcudart 11 | 12 | default: $(FINAL_TARGET) 13 | 14 | $(FINAL_TARGET): main.o deepflowcuda.o 15 | $(LD) $+ -o $@ $(LDFLAGS) $(LIB_DIR) $(LIBS) 16 | 17 | %.o: %.cu 18 | $(NVCC) $(NVCCFLAGS) $(INCLUDE_DIR) $< -o $@ 19 | 20 | clean: 21 | rm -f *.o $(FINAL_TARGET) 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepFlowCUDA 2 | A C++ port of Philippe Weinzaepfel's C DeepFlow library, using OpenCV/CUDA 3 | 4 | Reference papers are: 5 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 6 | [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. IJCV 2016] 7 | 8 | Provided test images are taken from the MPI Sintel dataset: 9 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012] 10 | 11 | See README.txt for requirements and build instructions. 
12 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | This is a C++/OpenCV/CUDA port of the DeepFlow C library, associated to the following paper: 2 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 3 | See http://lear.inrialpes.fr/src/deepflow/ 4 | 5 | The library is provided with two images and the corresponding ground truth flow from the MPI Sintel dataset: 6 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012] 7 | See http://sintel.is.tue.mpg.de/ 8 | 9 | It should be used as a refinement step, as described for example in the FlowNet paper, 10 | [A. Dosovitskiy, P. Fischer, E. Ilg, P. Häusser, C. Hazirbas, V. Golkov, P. van der Smagt, D. Cremers and T. Brox. FlowNet: Learning Optical Flow with Convolutional Networks. ICCV 2015] 11 | Thus, the input is made up of two images and a coarse optical flow, typically output from a deep neural network. If no input flow is provided, the initial field is set to zero everywhere. As in the OpenCV modules variationalrefinement and cudaoptflow, the red-black successive overrelaxation method (SOR) is used to solve the linear systems. See the paper by Brox et al: 12 | [T. Brox, A. Bruhn, N. Papenberg, J. Weickert. High Accuracy Optical Flow Estimation Based on a Theory for Warping. 
ECCV 2004] 13 | 14 | The DeepFlowCUDA class uses the cv::cudev::GpuMat_ template class, so you'll need OpenCV with additional opencv_contrib modules built (see https://github.com/opencv/opencv_contrib) 15 | 16 | Requirements: 17 | - OpenCV with opencv_contrib modules (need CUDA modules) 18 | - CUDA build environment (nvcc should be in your PATH) 19 | 20 | The Makefile assumes that CUDA is installed in /usr/local/cuda-9.2 and OpenCV in /usr/local/opencv-4.1.0-build 21 | Just edit the paths in the Makefile and run 22 | > make 23 | > ./deepflow 24 | 25 | Please report any bug to julien.mille@insa-cvl.fr 26 | Thanks! 27 | 28 | Copyright 2019 Julien Mille 29 | -------------------------------------------------------------------------------- /data/dirtyflow_0001.flo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/dirtyflow_0001.flo -------------------------------------------------------------------------------- /data/frame_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0001.jpg -------------------------------------------------------------------------------- /data/frame_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0002.jpg -------------------------------------------------------------------------------- /data/gtflow_0001.flo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/gtflow_0001.flo -------------------------------------------------------------------------------- /deepflowcuda.cu: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2019 Julien Mille 3 | 4 | This file is part of DeepFlowCUDA. 5 | 6 | DeepFlowCUDA is free software: you can redistribute 7 | it and/or modify it under the terms of the GNU Lesser General Public License 8 | as published by the Free Software Foundation, either version 3 of the License, 9 | or (at your option) any later version. 10 | 11 | DeepFlowCUDA is distributed in the hope that it will 12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 14 | General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License, 17 | and a copy of the GNU Lesser General Public License, along with 18 | DeepFlowCUDA. If not, see . 19 | */ 20 | 21 | #include "deepflowcuda.h" 22 | #include // for pixelwise operations on cv::cudev::GpuMat_ 23 | #include // for cv::cuda::resize 24 | #include 25 | 26 | using namespace std; 27 | 28 | // Maximum number of CUDA threads per block, per dimension, for 2D thread blocks 29 | // It is equivalent to 32^2=1014 threads per 1D block 30 | #define THREADS_PER_BLOCK_2D 32 31 | 32 | // Rounded up division, to compute number of thread blocks when launching CUDA kernels 33 | int divUp(int a, int b) 34 | { 35 | return (a + b - 1)/b; 36 | } 37 | 38 | DeepFlowCuda::DeepFlowCuda() 39 | { 40 | // We use the same setting as DeepFlow, except for the scale factor 41 | fixedPointIterations = 5; 42 | sorIterations = 25; 43 | alpha = 1.0; 44 | beta = 32.0; 45 | delta = 0.1; 46 | gamma = 0.7; 47 | omega = 1.6; 48 | zeta = 0.1; 49 | epsilon = 0.01; 50 | scaleFactor = 0.5; 51 | sigma = 0; // 0.65; 52 | minSize = 10; 53 | 54 | paramsCuda = nullptr; 55 | copyParamsToCuda(); 56 | 57 | current.step = 0; 58 | current.stepColor = 0; 59 | 60 | padding = 1; 61 | 62 | assert(createFilters()); 63 | } 64 | 65 | 
DeepFlowCuda::~DeepFlowCuda() 66 | { 67 | if (paramsCuda!=nullptr) 68 | cudaFree(paramsCuda); 69 | } 70 | 71 | bool DeepFlowCuda::createFilters() 72 | { 73 | cv::Mat deriv, deriv5pt; 74 | cv::Mat o = cv::Mat::ones(1, 1, CV_32F); 75 | 76 | // CUDA filters for centered finite differences 77 | deriv.create(1, 3, CV_32F); 78 | deriv.at(0,0) = -0.5; 79 | deriv.at(0,1) = 0.0; 80 | deriv.at(0,2) = 0.5; 81 | 82 | filtx = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv, o, cv::Point(-1, -1), cv::BORDER_REPLICATE); 83 | if (filtx==nullptr) 84 | return false; 85 | 86 | filty = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv, cv::Point(-1, -1), cv::BORDER_REPLICATE); 87 | if (filty==nullptr) 88 | return false; 89 | 90 | // CUDA filters for finite differences with 5-point stencil 91 | deriv5pt.create(1, 5, CV_32F); 92 | deriv5pt.at(0,0) = 1.0/12; 93 | deriv5pt.at(0,1) = -8.0/12; 94 | deriv5pt.at(0,2) = 0; 95 | deriv5pt.at(0,3) = 8.0/12; 96 | deriv5pt.at(0,4) = -1.0/12; 97 | 98 | filtx5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv5pt, o, cv::Point(-1, -1), cv::BORDER_REPLICATE); 99 | if (filtx5pt==nullptr) 100 | return false; 101 | 102 | filty5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv5pt, cv::Point(-1, -1), cv::BORDER_REPLICATE); 103 | if (filty5pt==nullptr) 104 | return false; 105 | 106 | if (sigma!=0) 107 | { 108 | filtg = cv::cuda::createGaussianFilter(CV_32FC3, CV_32FC3, cv::Size(0,0), sigma, sigma, cv::BORDER_REPLICATE); 109 | if (filtg==nullptr) 110 | return false; 111 | } 112 | 113 | return true; 114 | } 115 | 116 | __global__ void warpKernel(int width, int height, int padding, int step, int stepColor, const float3 *I, const float *u, const float *v, float3 *warpedI) 117 | { 118 | int j = blockDim.x * blockIdx.x + threadIdx.x; 119 | int i = blockDim.y * blockIdx.y + threadIdx.y; 120 | if (i>=height || j>=width) 121 | return; 122 | 123 | int offset = (i+padding)*step + j+padding; 124 | 
MyVec3f *pWarpedI = (MyVec3f *)((float *)warpedI + ((i+padding)*stepColor + 3*(j+padding))); 125 | 126 | float x, y, xx, yy, dx, dy; 127 | int x1, x2, y1, y2; 128 | 129 | xx = j + u[offset]; 130 | yy = i + v[offset]; 131 | x = floor(xx); 132 | y = floor(yy); 133 | dx = xx-x; 134 | dy = yy-y; 135 | 136 | x1 = (int)x; 137 | x2 = x1+1; 138 | y1 = (int)y; 139 | y2 = y1+1; 140 | 141 | if (x1<0) x1=0; else if (x1>=width) x1 = width-1; 142 | if (x2<0) x2=0; else if (x2>=width) x2 = width-1; 143 | if (y1<0) y1=0; else if (y1>=height) y1 = height-1; 144 | if (y2<0) y2=0; else if (y2>=height) y2 = height-1; 145 | 146 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I + ((y1+padding)*stepColor + 3*padding)); 147 | const MyVec3f *pI2 = (const MyVec3f *)((float *)I + ((y2+padding)*stepColor + 3*padding)); 148 | 149 | *pWarpedI = 150 | pI1[x1]*(1.0f-dx)*(1.0f-dy) + 151 | pI1[x2]*dx*(1.0f-dy) + 152 | pI2[x1]*(1.0f-dx)*dy + 153 | pI2[x2]*dx*dy; 154 | } 155 | 156 | __global__ void averageKernel(int width, int height, int stepColor, const float3 *a, const float3 *b, float3 *c) 157 | { 158 | int j = blockDim.x * blockIdx.x + threadIdx.x; 159 | int i = blockDim.y * blockIdx.y + threadIdx.y; 160 | if (i>=height || j>=width) 161 | return; 162 | 163 | int offset = i*stepColor + 3*j; 164 | const MyVec3f *pA = (const MyVec3f *)((float *)a + offset); 165 | const MyVec3f *pB = (const MyVec3f *)((float *)b + offset); 166 | MyVec3f *pC = (MyVec3f *)((float *)c + offset); 167 | 168 | *pC = (*pA+*pB)*0.5; 169 | } 170 | 171 | void DeepFlowCuda::prepareBuffers(int scale) 172 | { 173 | current.size = pyramid.I0[scale].size(); 174 | 175 | current.sizePadded.width = current.size.width + 2*padding; 176 | current.sizePadded.height = current.size.height + 2*padding; 177 | 178 | current.A11.create(current.sizePadded); 179 | current.A12.create(current.sizePadded); 180 | current.A22.create(current.sizePadded); 181 | current.b1.create(current.sizePadded); 182 | current.b2.create(current.sizePadded); 183 | 
184 | current.luminance.create(current.sizePadded); 185 | current.smoothX.create(current.sizePadded); 186 | current.smoothY.create(current.sizePadded); 187 | current.smoothWeight.create(current.sizePadded); 188 | 189 | current.smoothX.setTo(0.0); 190 | current.smoothY.setTo(0.0); 191 | current.smoothWeight.setTo(0.0); 192 | 193 | if (scale!=nbScales-1) 194 | { 195 | cv::cuda::resize(pyramid.ufinal[scale+1], pyramid.uinit[scale], pyramid.I0[scale].size()); 196 | cv::cuda::resize(pyramid.vfinal[scale+1], pyramid.vinit[scale], pyramid.I0[scale].size()); 197 | 198 | cv::cuda::multiply(pyramid.uinit[scale], 1.0/scaleFactor, pyramid.uinit[scale]); 199 | cv::cuda::multiply(pyramid.vinit[scale], 1.0/scaleFactor, pyramid.vinit[scale]); 200 | } 201 | 202 | if (padding!=0) 203 | { 204 | cv::cuda::copyMakeBorder(pyramid.I0[scale], current.I0, padding, padding, padding, padding, cv::BORDER_REPLICATE); 205 | cv::cuda::copyMakeBorder(pyramid.uinit[scale], current.u, padding, padding, padding, padding, cv::BORDER_REPLICATE); 206 | cv::cuda::copyMakeBorder(pyramid.vinit[scale], current.v, padding, padding, padding, padding, cv::BORDER_REPLICATE); 207 | 208 | if (beta!=0) 209 | { 210 | cv::cuda::copyMakeBorder(pyramid.udesc[scale], current.udesc, padding, padding, padding, padding, cv::BORDER_REPLICATE); 211 | cv::cuda::copyMakeBorder(pyramid.vdesc[scale], current.vdesc, padding, padding, padding, padding, cv::BORDER_REPLICATE); 212 | cv::cuda::copyMakeBorder(pyramid.descWeight[scale], current.descWeight, padding, padding, padding, padding, cv::BORDER_REPLICATE); 213 | } 214 | } 215 | else { 216 | pyramid.I0[scale].copyTo(current.I0); 217 | pyramid.uinit[scale].copyTo(current.u); 218 | pyramid.vinit[scale].copyTo(current.v); 219 | 220 | if (beta!=0) 221 | { 222 | pyramid.udesc[scale].copyTo(current.udesc); 223 | pyramid.vdesc[scale].copyTo(current.vdesc); 224 | pyramid.descWeight[scale].copyTo(current.descWeight); 225 | } 226 | } 227 | 228 | current.step = 
current.A11.step/sizeof(float); 229 | current.stepColor = current.I0.step/sizeof(float); 230 | 231 | // Computing an average of the current and warped next frames (to compute the derivatives on) and temporal derivative Iz 232 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid; 233 | 234 | // Otherwise, pyramid.uinit[scale] and pyramid.vinit[scale] are already initialized 235 | cv::cudev::GpuMat_ gpuwarpedI, gpuaveragedI; 236 | 237 | // Average everywhere 238 | gpuwarpedI.create(current.size); 239 | 240 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 241 | warpKernel<<>>(current.size.width, current.size.height, 0, pyramid.uinit[scale].step/sizeof(float), pyramid.I1[scale].step/sizeof(float), 242 | pyramid.I1[scale][0], pyramid.uinit[scale][0], pyramid.vinit[scale][0], gpuwarpedI[0]); 243 | 244 | if (padding!=0) 245 | cv::cuda::copyMakeBorder(gpuwarpedI, gpuwarpedI, padding, padding, padding, padding, cv::BORDER_REPLICATE); 246 | 247 | // Average everywhere 248 | gpuaveragedI.create(current.sizePadded); 249 | 250 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 251 | averageKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.stepColor, current.I0[0], gpuwarpedI[0], gpuaveragedI[0]); 252 | cv::cuda::subtract(gpuwarpedI, current.I0, current.Iz); 253 | 254 | filtx5pt->apply(gpuaveragedI, current.Ix); 255 | filty5pt->apply(gpuaveragedI, current.Iy); 256 | 257 | filtx5pt->apply(current.Iz, current.Ixz); 258 | filty5pt->apply(current.Iz, current.Iyz); 259 | 260 | filtx5pt->apply(current.Ix, current.Ixx); 261 | filty5pt->apply(current.Ix, current.Ixy); 262 | filty5pt->apply(current.Iy, current.Iyy); 263 | 264 | current.u.copyTo(current.utmp); 265 | current.v.copyTo(current.vtmp); 266 | 267 | current.du.create(current.sizePadded); 268 | current.dv.create(current.sizePadded); 
269 | current.du.setTo(0.0); 270 | current.dv.setTo(0.0); 271 | } 272 | 273 | __global__ void structureTensorKernel(int width, int height, int step, int stepColor, const float3 *Ix, const float3 *Iy, float *stx2, float *stxy, float *sty2) 274 | { 275 | int j = blockDim.x * blockIdx.x + threadIdx.x; 276 | int i = blockDim.y * blockIdx.y + threadIdx.y; 277 | if (i>=height || j>=width) 278 | return; 279 | 280 | int offset = i*step + j; 281 | int offsetColor = i*stepColor + 3*j; 282 | const MyVec3f *pIx = (const MyVec3f *)((float *)Ix + offsetColor); 283 | const MyVec3f *pIy = (const MyVec3f *)((float *)Iy + offsetColor); 284 | 285 | stx2[offset] = (*pIx).dot(*pIx); 286 | stxy[offset] = (*pIx).dot(*pIy); 287 | sty2[offset] = (*pIy).dot(*pIy); 288 | } 289 | 290 | __global__ void minEigenvalueKernel(int width, int height, int step, const float *stx2, const float *stxy, const float *sty2, float *minEigen) 291 | { 292 | int j = blockDim.x * blockIdx.x + threadIdx.x; 293 | int i = blockDim.y * blockIdx.y + threadIdx.y; 294 | if (i>=height || j>=width) 295 | return; 296 | 297 | int offset = i*step + j; 298 | const float *pstx2 = stx2 + offset; 299 | const float *pstxy = stxy + offset; 300 | const float *psty2 = sty2 + offset; 301 | 302 | float t = 0.5f * (*pstx2 + *psty2); 303 | float t2 = t*t + (*pstxy)*(*pstxy) - (*pstx2)*(*psty2); 304 | minEigen[offset] = t - (t2<=0.0f?0.0f:sqrtf(t2)); // may be negative due to floating points approximation 305 | } 306 | 307 | __global__ void matchingScoreKernel(int width, int height, int step, int stepColor, 308 | const float3 *I0, const float3 *I1, 309 | const float3 *I0x, const float3 *I0y, const float3 *I1x, const float3 *I1y, 310 | const float *udesc, const float *vdesc, const float *minEigen, float *descWeight) 311 | { 312 | int j = blockDim.x * blockIdx.x + threadIdx.x; 313 | int i = blockDim.y * blockIdx.y + threadIdx.y; 314 | if (i>=height || j>=width) 315 | return; 316 | 317 | int offset = i*step + j; 318 | int offsetColor = 
i*stepColor + 3*j; 319 | const MyVec3f *pI0 = (const MyVec3f *)((float *)I0 + offsetColor); 320 | const MyVec3f *pI0x = (const MyVec3f *)((float *)I0x + offsetColor); 321 | const MyVec3f *pI0y = (const MyVec3f *)((float *)I0y + offsetColor); 322 | 323 | float xw, yw; 324 | xw = (int)(j + udesc[offset]); 325 | yw = (int)(i + vdesc[offset]); 326 | if (xw<0) xw=0; 327 | else if (xw>width-1) xw = width-1; 328 | if (yw<0) yw=0; 329 | else if (yw>height-1) yw = height-1; 330 | 331 | int offsetColor2 = yw*stepColor + 3*xw; 332 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I1 + offsetColor2); 333 | const MyVec3f *pI1x = (const MyVec3f *)((float *)I1x + offsetColor2); 334 | const MyVec3f *pI1y = (const MyVec3f *)((float *)I1y + offsetColor2); 335 | 336 | float gradWeight = 1.0; 337 | float flow_sigma_score = 50.0f; 338 | float mul_coef = 10.0f; 339 | 340 | float flowscore = (*pI0 - *pI1).l1norm() + gradWeight * ((*pI0x - *pI1x).l1norm() + (*pI0y - *pI1y).l1norm()); 341 | float t2 = minEigen[offset]; 342 | 343 | float t = 1.0f/(flow_sigma_score*sqrtf(2.0f*M_PI)); 344 | float sigmascore2 = -0.5f/(flow_sigma_score*flow_sigma_score); 345 | 346 | t2 = t2<=0.0?0.0:sqrtf(t2); 347 | descWeight[offset] = mul_coef * t2 * t * expf( flowscore*flowscore*sigmascore2 ); 348 | if (descWeight[offset]<0.0) // may be negative due to floating points approximation 349 | descWeight[offset] = 0.0; 350 | } 351 | 352 | void DeepFlowCuda::computeDescWeight() 353 | { 354 | // Input : pyramid.I0[0], pyramid.I1[0] 355 | // Output : pyramid.descWeight[0] 356 | cv::cudev::GpuMat_ I0x, I0y; 357 | 358 | // Structure tensor 359 | cv::cudev::GpuMat_ stx2, stxy, sty2, minEigen; 360 | 361 | filtx->apply(pyramid.I0[0], I0x); 362 | filty->apply(pyramid.I0[0], I0y); 363 | 364 | stx2.create(I0x.size()); 365 | stxy.create(I0x.size()); 366 | sty2.create(I0x.size()); 367 | 368 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(I0x.cols, threadsPerBlock.x), divUp(I0x.rows, 
threadsPerBlock.y), 1); 369 | 370 | // No padding here 371 | structureTensorKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float), 372 | I0x[0], I0y[0], stx2[0], stxy[0], sty2[0]); 373 | 374 | // Smooth structure tensor 375 | shared_ptr fg = cv::cuda::createGaussianFilter(CV_32F, CV_32F, cv::Size(0,0), 3.0, 3.0, cv::BORDER_REPLICATE); 376 | fg->apply(stx2, stx2); 377 | fg->apply(stxy, stxy); 378 | fg->apply(sty2, sty2); 379 | 380 | minEigen.create(I0x.size()); 381 | pyramid.descWeight[0].create(I0x.size()); 382 | 383 | minEigenvalueKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), stx2[0], stxy[0], sty2[0], minEigen[0]); 384 | 385 | cv::cudev::GpuMat_ I0x5pt, I0y5pt, I1x5pt, I1y5pt; 386 | 387 | filtx5pt->apply(pyramid.I0[0], I0x5pt); 388 | filty5pt->apply(pyramid.I0[0], I0y5pt); 389 | filtx5pt->apply(pyramid.I1[0], I1x5pt); 390 | filty5pt->apply(pyramid.I1[0], I1y5pt); 391 | 392 | matchingScoreKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float), 393 | pyramid.I0[0][0], pyramid.I1[0][0], 394 | I0x5pt[0], I0y5pt[0], I1x5pt[0], I1y5pt[0], 395 | pyramid.udesc[0][0], pyramid.vdesc[0][0], minEigen[0], pyramid.descWeight[0][0]); 396 | } 397 | 398 | // Computed on all pixels (including padding) 399 | __global__ void luminanceKernel(int width, int height, int step, int stepColor, const float3 *I, float *lum) 400 | { 401 | int j = blockDim.x * blockIdx.x + threadIdx.x; 402 | int i = blockDim.y * blockIdx.y + threadIdx.y; 403 | if (i>=height || j>=width) 404 | return; 405 | 406 | int offsetI = i*stepColor + 3*j; 407 | int offset = i*step+j; 408 | 409 | const MyVec3f *pI = (const MyVec3f *)((const float *)I + offsetI); 410 | lum[offset] = 0.299f*pI->z + 0.587f*pI->y + 0.114f*pI->x; 411 | } 412 | 413 | __global__ void smoothnessWeightKernel(int width, int height, int padding, int step, const float *lum, float *smoothWeight, float coef) 414 | { 415 | int j = blockDim.x * blockIdx.x + threadIdx.x; 416 | int i = blockDim.y 
* blockIdx.y + threadIdx.y; 417 | if (i>=height || j>=width) 418 | return; 419 | 420 | int offset = (i+padding)*step + padding + j; 421 | float lumx = (lum[offset+1]-lum[offset-1])*0.5; 422 | float lumy = (lum[offset+step]-lum[offset-step])*0.5; 423 | smoothWeight[offset] = 0.5f*expf(-coef*sqrtf(lumx*lumx+lumy*lumy)); 424 | } 425 | 426 | void DeepFlowCuda::computeSmoothnessWeight() 427 | { 428 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid; 429 | 430 | // Luminance everywhere (including padding) 431 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.sizePadded.height, threadsPerBlock.y), 1); 432 | luminanceKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.step, current.stepColor, current.I0[0], current.luminance[0]); 433 | 434 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 435 | smoothnessWeightKernel<<>>(current.size.width, current.size.height, padding, current.step, current.luminance[0], current.smoothWeight[0], 5.0/255.0); 436 | } 437 | 438 | __global__ void dataTermKernel( 439 | const float *params, int width, int height, int padding, int step, int stepColor, 440 | const float3 *Ix, const float3 *Iy, const float3 *Iz, 441 | const float3 *Ixx, const float3 *Ixy, const float3 *Iyy, 442 | const float3 *Ixz, const float3 *Iyz, 443 | float *A11, float *A12, float *A22, float *b1, float *b2, 444 | const float *du, const float *dv) 445 | { 446 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 447 | float delta = params[2], gamma = params[3], zeta = params[5], epsilon = params[6]; 448 | 449 | float zeta_squared = zeta * zeta; 450 | float epsilon_squared = epsilon * epsilon; 451 | 452 | const MyVec3f *pIx, *pIy, *pIz; 453 | const MyVec3f *pIxx, *pIxy, *pIyy, *pIxz, *pIyz; 454 | const float *pdU, *pdV; 455 | float *pa11, *pa12, *pa22, *pb1, *pb2; 456 | 457 | int j = blockDim.x * 
blockIdx.x + threadIdx.x; 458 | int i = blockDim.y * blockIdx.y + threadIdx.y; 459 | if (i>=height || j>=width) 460 | return; 461 | 462 | // stepColor is in number of float elements! 463 | int offsetColor = (i+padding)*stepColor + 3*(padding+j); 464 | pIx = (const MyVec3f *)((float *)Ix + offsetColor); 465 | pIy = (const MyVec3f *)((float *)Iy + offsetColor); 466 | pIz = (const MyVec3f *)((float *)Iz + offsetColor); 467 | pIxx = (const MyVec3f *)((float *)Ixx + offsetColor); 468 | pIxy = (const MyVec3f *)((float *)Ixy + offsetColor); 469 | pIyy = (const MyVec3f *)((float *)Iyy + offsetColor); 470 | pIxz = (const MyVec3f *)((float *)Ixz + offsetColor); 471 | pIyz = (const MyVec3f *)((float *)Iyz + offsetColor); 472 | 473 | int offset = (i+padding)*step + padding+j; 474 | pa11 = A11 + offset; 475 | pa12 = A12 + offset; 476 | pa22 = A22 + offset; 477 | pb1 = b1 + offset; 478 | pb2 = b2 + offset; 479 | pdU = du + offset; 480 | pdV = dv + offset; 481 | 482 | *pa11 = 0; 483 | *pa12 = 0; 484 | *pa22 = 0; 485 | *pb1 = 0; 486 | *pb2 = 0; 487 | 488 | // Color constancy 489 | if (delta!=0.0) 490 | { 491 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared); 492 | float hdover3 = delta*0.5f/3.0f; 493 | float mask = 1.0; 494 | 495 | MyVec3f ngradI = *pIx*(*pIx) + *pIy*(*pIy) + dnorm; 496 | 497 | MyVec3f Ik1z = *pIz + *pIx*(*pdU) + *pIy*(*pdV); 498 | float tmp = mask*hdover3/sqrt((Ik1z*Ik1z/ngradI).sum()+epsilon_squared); 499 | MyVec3f ti = MyVec3f(tmp, tmp, tmp)/ngradI; 500 | 501 | *pa11 += (ti*(*pIx)*(*pIx)).sum(); 502 | *pa12 += (ti*(*pIx)*(*pIy)).sum(); 503 | *pa22 += (ti*(*pIy)*(*pIy)).sum(); 504 | *pb1 -= (ti*(*pIx)*(*pIz)).sum(); 505 | *pb2 -= (ti*(*pIy)*(*pIz)).sum(); 506 | } 507 | 508 | // Gradient constancy 509 | if (gamma!=0) 510 | { 511 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared); 512 | float hgover3 = gamma*0.5f/3.0f; 513 | float mask = 1.0; 514 | 515 | MyVec3f nx = *pIxx*(*pIxx) + *pIxy*(*pIxy) + dnorm; 516 | MyVec3f ny = *pIyy*(*pIyy) + 
*pIxy*(*pIxy) + dnorm; 517 | 518 | MyVec3f tmpx = *pIxz + *pIxx*(*pdU) + *pIxy*(*pdV); 519 | MyVec3f tmpy = *pIyz + *pIxy*(*pdU) + *pIyy*(*pdV); 520 | 521 | float tmp = mask*hgover3/sqrt((tmpx*tmpx/nx).sum() + (tmpy*tmpy/ny).sum() + epsilon_squared); 522 | 523 | MyVec3f tix = MyVec3f(tmp, tmp, tmp)/nx; 524 | MyVec3f tiy = MyVec3f(tmp, tmp, tmp)/ny; 525 | 526 | *pa11 += (tix*(*pIxx)*(*pIxx) + tiy*(*pIxy)*(*pIxy)).sum(); 527 | *pa12 += (tix*(*pIxx)*(*pIxy) + tiy*(*pIxy)*(*pIyy)).sum(); 528 | *pa22 += (tix*(*pIxy)*(*pIxy) + tiy*(*pIyy)*(*pIyy)).sum(); 529 | 530 | *pb1 -= (tix*(*pIxx)*(*pIxz) + tiy*(*pIxy)*(*pIyz)).sum(); 531 | *pb2 -= (tix*(*pIxy)*(*pIxz) + tiy*(*pIyy)*(*pIyz)).sum(); 532 | } 533 | } 534 | 535 | void DeepFlowCuda::computeDataTerm() 536 | { 537 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 538 | 539 | dataTermKernel<<>>( 540 | paramsCuda, current.size.width, current.size.height, padding, current.step, current.stepColor, 541 | current.Ix[0], current.Iy[0], current.Iz[0], 542 | current.Ixx[0], current.Ixy[0], current.Iyy[0], current.Ixz[0], current.Iyz[0], 543 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 544 | current.du[0], current.dv[0]); 545 | } 546 | 547 | __global__ void matchingTermKernel( 548 | const float *params, int width, int height, int padding, int step, 549 | float *A11, float *A22, float *b1, float *b2, 550 | const float *u, const float *v, const float *utmp, const float *vtmp, const float *udesc, const float *vdesc, const float *descWeight) 551 | { 552 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 553 | float beta = params[1], epsilon = params[6]; 554 | 555 | float epsilon_squared = epsilon*epsilon; 556 | 557 | int j = blockDim.x * blockIdx.x + threadIdx.x; 558 | int i = blockDim.y * blockIdx.y + threadIdx.y; 559 | if (i>=height || j>=width) 
560 | return; 561 | 562 | int offset = (i+padding)*step + padding+j; 563 | 564 | const float *pudesc = udesc + offset; 565 | const float *pvdesc = vdesc + offset; 566 | 567 | float tmpx = utmp[offset] - *pudesc; 568 | float tmpy = vtmp[offset] - *pvdesc; 569 | float tmp = 0.5*descWeight[offset]*beta/sqrt(tmpx*tmpx+tmpy*tmpy+epsilon_squared); 570 | A11[offset] += tmp; 571 | A22[offset] += tmp; 572 | b1[offset] -= tmp*(u[offset] - *pudesc); 573 | b2[offset] -= tmp*(v[offset] - *pvdesc); 574 | } 575 | 576 | void DeepFlowCuda::computeMatchingTerm() 577 | { 578 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 579 | 580 | matchingTermKernel<<>>( 581 | paramsCuda, current.size.width, current.size.height, padding, current.step, 582 | current.A11[0], current.A22[0], current.b1[0], current.b2[0], 583 | current.u[0], current.v[0], current.utmp[0], current.vtmp[0], current.udesc[0], current.vdesc[0], current.descWeight[0]); 584 | } 585 | 586 | __global__ void smoothnessTermKernel( 587 | const float *params, int width, int height, int padding, int step, 588 | float *smoothX, float *smoothY, const float *smoothWeight, 589 | const float *utmp, const float *vtmp) 590 | { 591 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 592 | float alpha = params[0], epsilon = params[6]; 593 | 594 | float epsilon_smooth = epsilon*epsilon; // 0.001f*0.001f; 595 | 596 | int j = blockDim.x * blockIdx.x + threadIdx.x; 597 | int i = blockDim.y * blockIdx.y + threadIdx.y; 598 | if (i>=height || j>=width) 599 | return; 600 | 601 | int offset = (i+padding)*step + padding + j; 602 | 603 | float *psmoothX = smoothX + offset; 604 | float *psmoothY = smoothY + offset; 605 | const float *psmoothWeight = smoothWeight + offset; 606 | const float *pu = utmp + offset; 607 | const float *pv = vtmp + offset; 608 | 609 | float ux1 = pu[1]-pu[0]; 610 | float 
vx1 = pv[1]-pv[0]; 611 | float uy1 = pu[step]-pu[0]; 612 | float vy1 = pv[step]-pv[0]; 613 | 614 | float ux2 = (pu[1]-pu[-1])*0.5; 615 | float vx2 = (pv[1]-pv[-1])*0.5; 616 | float uy2 = (pu[step]-pu[-step])*0.5; 617 | float vy2 = (pv[step]-pv[-step])*0.5; 618 | 619 | float tmpu = 0.5*(uy2 + (pu[step+1]-pu[-step+1])*0.5); 620 | float uxsq = ux1*ux1 + tmpu*tmpu; 621 | float tmpv = 0.5*(vy2 + (pv[step+1]-pv[-step+1])*0.5); 622 | float vxsq = vx1*vx1 + tmpv*tmpv; 623 | 624 | *psmoothX = alpha*0.5*(psmoothWeight[0]+psmoothWeight[1])/sqrt(uxsq+vxsq+epsilon_smooth); 625 | 626 | tmpu = 0.5*(ux2 + (pu[step+1]-pu[step-1])*0.5); 627 | float uysq = uy1*uy1 + tmpu*tmpu; 628 | tmpv = 0.5*(vx2 + (pv[step+1]-pv[step-1])*0.5); 629 | float vysq = vy1*vy1 + tmpv*tmpv; 630 | 631 | *psmoothY = alpha*0.5*(psmoothWeight[0]+psmoothWeight[step])/sqrt(uysq+vysq+epsilon_smooth); 632 | } 633 | 634 | __global__ void applySmoothKernel( 635 | int width, int height, int padding, int step, 636 | float *b1, float *b2, 637 | const float *smoothX, const float *smoothY, 638 | const float *u, const float *v 639 | ) 640 | { 641 | int j = blockDim.x * blockIdx.x + threadIdx.x; 642 | int i = blockDim.y * blockIdx.y + threadIdx.y; 643 | if (i>=height || j>=width) 644 | return; 645 | 646 | int offset = (i+padding)*step + padding + j; 647 | const float *pu = u + offset; 648 | const float *pv = v + offset; 649 | const float *psx = smoothX + offset; 650 | const float *psy = smoothY + offset; 651 | 652 | // V1 653 | b1[offset] += -psx[-1]*(pu[0]-pu[-1]) + psx[0]*(pu[1]-pu[0]) -psy[-step]*(pu[0]-pu[-step]) + psy[0]*(pu[step]-pu[0]); 654 | b2[offset] += -psx[-1]*(pv[0]-pv[-1]) + psx[0]*(pv[1]-pv[0]) -psy[-step]*(pv[0]-pv[-step]) + psy[0]*(pv[step]-pv[0]); 655 | 656 | // V2 657 | // b1[offset] += smoothX[offset]*(pu[1]-2*pu[0]+pu[-1]) + smoothY[offset]*(pu[step]-2*pu[0]+pu[-step]); 658 | // b2[offset] += smoothX[offset]*(pv[1]-2*pv[0]+pv[-1]) + smoothY[offset]*(pv[step]-2*pv[0]+pv[-step]); 659 | } 660 | 661 | 
void DeepFlowCuda::computeSmoothnessTerm() 662 | { 663 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 664 | 665 | smoothnessTermKernel<<>>( 666 | paramsCuda, current.size.width, current.size.height, padding, current.step, 667 | current.smoothX[0], current.smoothY[0], current.smoothWeight[0], 668 | current.utmp[0], current.vtmp[0]); 669 | 670 | applySmoothKernel<<>>( 671 | current.size.width, current.size.height, padding, current.step, 672 | current.b1[0], current.b2[0], current.smoothX[0], current.smoothY[0], current.u[0], current.v[0]); 673 | } 674 | 675 | __global__ void RedBlackSORKernel( 676 | const float *params, int width, int height, int padding, int step, bool redpass, 677 | const float *a11, const float *a12, const float *a22, const float *b1, const float *b2, 678 | const float *smoothX, const float *smoothY, 679 | float *du, float *dv) 680 | { 681 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 682 | float omega = params[4]; 683 | 684 | int halfWidth = width/2 + width%2; 685 | int widthRow; // Width of current row 686 | 687 | int j = blockDim.x * blockIdx.x + threadIdx.x; 688 | int i = blockDim.y * blockIdx.y + threadIdx.y; 689 | if (i>=height) 690 | return; 691 | 692 | if (width%2==0) 693 | widthRow = halfWidth; 694 | else { 695 | if (i%2==0) 696 | { 697 | if (redpass) widthRow = halfWidth; 698 | else widthRow = halfWidth-1; 699 | } 700 | else { 701 | if (redpass) widthRow = halfWidth-1; 702 | else widthRow = halfWidth; 703 | } 704 | } 705 | if (j>=widthRow) 706 | return; 707 | 708 | int offset = (i+padding)*step + padding + j*2; 709 | if ((redpass && i%2==1) || (!redpass && i%2==0)) 710 | offset++; 711 | 712 | float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2,det; 713 | 714 | sigma_u = 0.0f; 715 | sigma_v = 0.0f; 716 | sum_dpsis = 0.0f; 717 | 718 | if (i>0) { 719 | sigma_u -= 
smoothY[offset-step] * du[offset-step]; 720 | sigma_v -= smoothY[offset-step] * dv[offset-step]; 721 | sum_dpsis += smoothY[offset-step]; 722 | } 723 | 724 | if(j>0){ 725 | sigma_u -= smoothX[offset-1] * du[offset-1]; 726 | sigma_v -= smoothX[offset-1] * dv[offset-1]; 727 | sum_dpsis += smoothX[offset-1]; 728 | } 729 | if(i>>( 760 | paramsCuda, current.size.width, current.size.height, padding, current.step, true, 761 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 762 | current.smoothX[0], current.smoothY[0], 763 | current.du[0], current.dv[0]); 764 | 765 | RedBlackSORKernel<<>>( 766 | paramsCuda, current.size.width, current.size.height, padding, current.step, false, 767 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 768 | current.smoothX[0], current.smoothY[0], 769 | current.du[0], current.dv[0]); 770 | } 771 | } 772 | 773 | void DeepFlowCuda::computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow) 774 | { 775 | assert(!I0.empty() && I0.type()==CV_32FC3); 776 | assert(I1.size()==I0.size() && I1.type()==CV_32FC3); 777 | 778 | if (flow.empty() || flow.size()!=I0.size() || flow.type()!=CV_32FC2) 779 | { 780 | if (beta!=0) 781 | { 782 | cout<<"No correct input flow was provided but weight of matching term is non-zero. 
Setting it to zero..."<=minSize && heightTmp>=minSize) 798 | { 799 | widthTmp*=scaleFactor; 800 | heightTmp*=scaleFactor; 801 | nbScales++; 802 | } 803 | if (nbScales==0) 804 | nbScales = 1; 805 | 806 | cout<<"Nb scales = "<apply(pyramid.I0[0], pyramid.I0[0]); 833 | filtg->apply(pyramid.I1[0], pyramid.I1[0]); 834 | } 835 | 836 | pyramid.scale[0] = 1.0; 837 | 838 | if (beta!=0) 839 | { 840 | pyramid.udesc[0].upload(uv[0]); 841 | pyramid.vdesc[0].upload(uv[1]); 842 | computeDescWeight(); // Computes pyramid.descWeight[0] 843 | } 844 | 845 | cout<<"Scale 0:"< uinitTmp, vinitTmp; 874 | uinitTmp.upload(uv[0]); 875 | vinitTmp.upload(uv[1]); 876 | 877 | cv::cuda::resize(uinitTmp, pyramid.uinit[nbScales-1], pyramid.I0[nbScales-1].size()); 878 | cv::cuda::resize(vinitTmp, pyramid.vinit[nbScales-1], pyramid.I0[nbScales-1].size()); 879 | cv::cuda::multiply(pyramid.uinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.uinit[nbScales-1]); 880 | cv::cuda::multiply(pyramid.vinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.vinit[nbScales-1]); 881 | } 882 | 883 | for (int k=nbScales-1; k>=0; k--) 884 | computeOneLevel(k); 885 | 886 | // At the end, we have ufinal and vfinal in pyramid.ufinal[0] 887 | pyramid.ufinal[0].download(uv[0]); 888 | pyramid.vfinal[0].download(uv[1]); 889 | 890 | cv::merge(uv, 2, flow); 891 | } 892 | 893 | void DeepFlowCuda::computeOneLevel(int scale) 894 | { 895 | prepareBuffers(scale); 896 | 897 | float betaSave = beta; 898 | float bk = 0.45; 899 | 900 | if (bk>0.0f && nbScales>1) 901 | { 902 | beta = betaSave * pow((float)scale/(float)(nbScales-1), bk); 903 | copyParamsToCuda(); 904 | } 905 | 906 | computeSmoothnessWeight(); 907 | 908 | for (int i = 0; i < fixedPointIterations; i++) 909 | { 910 | computeDataTerm(); // Initializes A11, A12, A22, b1 and b2 911 | if (beta!=0) 912 | computeMatchingTerm(); 913 | 914 | if (alpha!=0) 915 | computeSmoothnessTerm(); // Updates b1 and b2 916 | 917 | RedBlackSOR(); 918 | 919 | current.utmp = current.u + 
current.du; 920 | current.vtmp = current.v + current.dv; 921 | } 922 | 923 | beta = betaSave; 924 | 925 | if (padding!=0) 926 | { 927 | int width = pyramid.I0[scale].cols; 928 | int height = pyramid.I0[scale].rows; 929 | 930 | current.utmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.ufinal[scale]); 931 | current.vtmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.vfinal[scale]); 932 | } 933 | else { 934 | current.utmp.copyTo(pyramid.ufinal[scale]); 935 | current.vtmp.copyTo(pyramid.vfinal[scale]); 936 | } 937 | } 938 | 939 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const 940 | { 941 | cv::Mat a; 942 | m.download(a); 943 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone(); 944 | } 945 | 946 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const 947 | { 948 | cv::Mat a; 949 | m.download(a); 950 | cv::cvtColor(a, a, cv::COLOR_BGRA2BGR); 951 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone(); 952 | } 953 | 954 | void DeepFlowCuda::copyParamsToCuda() 955 | { 956 | int nbParams = 7; 957 | if (paramsCuda==nullptr && cudaMalloc(¶msCuda, nbParams*sizeof(float))!=cudaSuccess) 958 | { 959 | paramsCuda = nullptr; 960 | cout<<"cudaMalloc error"<. 19 | */ 20 | 21 | #ifndef DEEP_FLOW_CUDA_H 22 | #define DEEP_FLOW_CUDA_H 23 | 24 | #include 25 | #include // For template cv::cudev::GpuMat_ -> needs to be compiled with nvcc ! 
26 | #include // For cv::cuda::Filter 27 | #include 28 | 29 | #include "myvec3f.h" 30 | 31 | class DeepFlowCuda 32 | { 33 | // Member variables 34 | protected: 35 | int fixedPointIterations, sorIterations; 36 | float omega; // Update parameter in SOR iterations 37 | float alpha; // Weight of smoothing term 38 | float beta; // Weight of matching term 39 | float delta; // Weight of color constancy in data term 40 | float gamma; // Weight of gradient constancy in data term 41 | float zeta; // Regularization parameter (added to norms in data term) 42 | float epsilon; // Regularization parameter in Psi function 43 | float scaleFactor; // Scale factor between two succesive levels 44 | float sigma; // Standard deviation of presmoothing Gaussian filter 45 | int nbScales; // Number of scales (levels). Will be calculated from minSize, scaleFactor and size of input image 46 | int padding; 47 | int minSize; // Minimum width or height at the highest level (at the coarsest scale) 48 | float *paramsCuda; 49 | 50 | // Padded images and flows at current scale 51 | struct { 52 | cv::cudev::GpuMat_ I0, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz; 53 | cv::cudev::GpuMat_ A11, A12, A22, b1, b2; 54 | cv::cudev::GpuMat_ smoothX, smoothY, luminance, smoothWeight; 55 | 56 | cv::cudev::GpuMat_ utmp, vtmp; // flow that is updated in each fixed point iteration 57 | cv::cudev::GpuMat_ du, dv; // flow increment, updated in each SOR iteration 58 | cv::cudev::GpuMat_ u, v; // red-black-buffer version of the input flow 59 | cv::cudev::GpuMat_ udesc, vdesc, descWeight; // Descriptor used in matching term 60 | 61 | int step; 62 | int stepColor; 63 | 64 | cv::Size size, sizePadded; 65 | } current; 66 | 67 | // Non-padded images and flows at all scales 68 | struct { 69 | std::vector > I0, I1; 70 | std::vector > uinit, vinit, ufinal, vfinal; 71 | std::vector > udesc, vdesc, descWeight; 72 | std::vector scale; 73 | } pyramid; 74 | 75 | // Derivative filters, and Gaussian filter 76 | std::shared_ptr filtg, 
filtx, filty, filtx5pt, filty5pt; 77 | 78 | // Member functions 79 | public: 80 | DeepFlowCuda(); 81 | ~DeepFlowCuda(); 82 | 83 | void computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow); 84 | 85 | int getFixedPointIterations() const { return fixedPointIterations; } 86 | void setFixedPointIterations(int val) { fixedPointIterations = val; } 87 | int getSorIterations() const { return sorIterations; } 88 | void setSorIterations(int val) { sorIterations = val; } 89 | float getOmega() const { return omega; } 90 | void setOmega(float val) { omega = val; } 91 | float getAlpha() const { return alpha; } 92 | void setAlpha(float val) { alpha = val; } 93 | float getBeta() const { return beta; } 94 | void setBeta(float val) { beta = val; } 95 | float getDelta() const { return delta; } 96 | void setDelta(float val) { delta = val; } 97 | float getGamma() const { return gamma; } 98 | void setGamma(float val) { gamma = val; } 99 | 100 | protected: 101 | bool createFilters(); 102 | 103 | void prepareBuffers(int scale); 104 | 105 | void computeDescWeight(); 106 | 107 | void computeDataTerm(); 108 | void computeMatchingTerm(); 109 | 110 | // The local smoothness weight is not described in the initial DeepFlow paper 111 | // [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 112 | // Instead it is mentioned in section 4.3 of the extended IJCV paper 113 | // [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. 
IJCV 2016] 114 | // See ref [Wedel et al 2009] or [Xu et al 2012] 115 | void computeSmoothnessWeight(); 116 | 117 | void computeSmoothnessTerm(); 118 | 119 | void RedBlackSOR(); 120 | 121 | void computeOneLevel(int); 122 | 123 | // Remove padding and move to CPU 124 | cv::Mat toCPU(const cv::cudev::GpuMat_ &) const ; 125 | cv::Mat toCPU(const cv::cudev::GpuMat_ &) const; 126 | 127 | void copyParamsToCuda(); 128 | }; 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /main.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2019 Julien Mille 3 | 4 | This file is part of DeepFlowCUDA. 5 | 6 | DeepFlowCUDA is free software: you can redistribute 7 | it and/or modify it under the terms of the GNU Lesser General Public License 8 | as published by the Free Software Foundation, either version 3 of the License, 9 | or (at your option) any later version. 10 | 11 | DeepFlowCUDA is distributed in the hope that it will 12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 14 | General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License, 17 | and a copy of the GNU Lesser General Public License, along with 18 | DeepFlowCUDA. If not, see . 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include // For cv::imread 26 | #include // For cv::readOpticalFlow, cv::writeOpticalFlow 27 | #include "deepflowcuda.h" 28 | 29 | using namespace std; 30 | 31 | float L2Distance(const cv::Mat &flow1, const cv::Mat &flow2, cv::Mat &dist) 32 | { 33 | assert(!flow1.empty() && flow1.type()==CV_32FC2 && flow2.type()==CV_32FC2 && flow1.size()==flow2.size()); 34 | 35 | dist.create(flow1.size(), CV_32F); 36 | 37 | float avgDist = 0; 38 | float d; 39 | 40 | for (int y=0; y(y,x); 44 | cv::Point2f b = flow2.at(y,x); 45 | d = (a-b).dot(a-b); 46 | avgDist += d; 47 | dist.at(y,x) = d; 48 | } 49 | avgDist /= flow1.cols*flow1.rows; 50 | return avgDist; 51 | } 52 | 53 | int main(int argc, char ** argv) 54 | { 55 | cv::Mat img0, img1, gtflow, flow, flowRefined; 56 | 57 | string dataDir = "./data/"; 58 | string outputDir = "./"; 59 | string outputPath = outputDir + "refinedflow_0001.flo"; 60 | 61 | img0 = cv::imread(dataDir + "frame_0001.jpg", cv::IMREAD_COLOR); 62 | if (img0.data==nullptr) 63 | { 64 | cout<<"Failed to read first image"<. 
19 | */ 20 | 21 | #ifndef MY_VEC3F_H 22 | #define MY_VEC3F_H 23 | 24 | class MyVec3f 25 | { 26 | public: 27 | float x, y, z; 28 | __device__ MyVec3f() {} 29 | __device__ MyVec3f(float a, float b, float c):x(a),y(b),z(c) {} 30 | __device__ float dot(const MyVec3f &v) const {return x*v.x+y*v.y+z*v.z;} 31 | __device__ float norm2() const {return x*x+y*y+z*z;} 32 | __device__ float norm() const {return sqrt(x*x+y*y+z*z);} 33 | __device__ float l1norm() const {return fabs(x)+fabs(y)+fabs(z);} 34 | __device__ float sum() const {return x+y+z;} 35 | 36 | __device__ MyVec3f operator +(const MyVec3f &v) const 37 | { 38 | MyVec3f s; 39 | s.x = x+v.x; 40 | s.y = y+v.y; 41 | s.z = z+v.z; 42 | return s; 43 | } 44 | 45 | __device__ MyVec3f operator -(const MyVec3f &v) const 46 | { 47 | MyVec3f d; 48 | d.x = x-v.x; 49 | d.y = y-v.y; 50 | d.z = z-v.z; 51 | return d; 52 | } 53 | 54 | __device__ MyVec3f operator *(float f) const 55 | { 56 | MyVec3f v; 57 | v.x = x*f; 58 | v.y = y*f; 59 | v.z = z*f; 60 | return v; 61 | } 62 | 63 | // Element-wise product 64 | __device__ MyVec3f operator *(const MyVec3f &v) const 65 | { 66 | MyVec3f s; 67 | s.x = x*v.x; 68 | s.y = y*v.y; 69 | s.z = z*v.z; 70 | return s; 71 | } 72 | 73 | // Element-wise division 74 | __device__ MyVec3f operator /(const MyVec3f &v) const 75 | { 76 | MyVec3f s; 77 | s.x = x/v.x; 78 | s.y = y/v.y; 79 | s.z = z/v.z; 80 | return s; 81 | } 82 | }; 83 | 84 | #endif --------------------------------------------------------------------------------