├── Makefile
├── README.md
├── README.txt
├── data
├── dirtyflow_0001.flo
├── frame_0001.jpg
├── frame_0002.jpg
└── gtflow_0001.flo
├── deepflowcuda.cu
├── deepflowcuda.h
├── main.cu
└── myvec3f.h
/Makefile:
--------------------------------------------------------------------------------
1 | NVCC = nvcc
2 | NVCCFLAGS = -c -g -O3
3 | LD = g++
4 | LDFLAGS = -O3
5 | FINAL_TARGET = deepflow
6 | CUDA_DIR = /usr/local/cuda-9.2
7 | OPENCV_DIR = /usr/local/opencv-4.1.0-build
8 | INCLUDE_DIR = -I$(CUDA_DIR)/include -I$(OPENCV_DIR)/include/opencv4
9 | LIB_DIR = -L$(CUDA_DIR)/lib64 -L$(OPENCV_DIR)/lib
10 | LIBS = -lopencv_cudawarping -lopencv_cudafilters -lopencv_cudaimgproc -lopencv_cudaarithm -lopencv_cudalegacy -lopencv_video -lopencv_imgproc -lopencv_imgcodecs -lopencv_core -lcudart
11 |
12 | default: $(FINAL_TARGET) # default goal: build the deepflow binary
13 |
14 | $(FINAL_TARGET): main.o deepflowcuda.o # link the two objects with g++ against OpenCV/CUDA libs
15 | $(LD) $+ -o $@ $(LDFLAGS) $(LIB_DIR) $(LIBS)
16 |
17 | %.o: %.cu # pattern rule: compile every CUDA source with nvcc (-c, no link)
18 | $(NVCC) $(NVCCFLAGS) $(INCLUDE_DIR) $< -o $@
19 |
20 | clean: # remove objects and the final binary
21 | rm -f *.o $(FINAL_TARGET)
22 |
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DeepFlowCUDA
2 | A C++ port of Philippe Weinzaepfel's C DeepFlow library, using OpenCV/CUDA
3 |
4 | Reference papers are:
5 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013]
6 | [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. IJCV 2016]
7 |
8 | Provided test images are taken from the MPI Sintel dataset:
9 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012]
10 |
11 | See README.txt for requirements and build instructions.
12 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | This is a C++/OpenCV/CUDA port of the DeepFlow C library, associated with the following paper:
2 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013]
3 | See http://lear.inrialpes.fr/src/deepflow/
4 |
5 | The library is provided with two images and the corresponding ground truth flow from the MPI Sintel dataset:
6 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012]
7 | See http://sintel.is.tue.mpg.de/
8 |
9 | It should be used as a refinement step, as described for example in the FlowNet paper,
10 | [A. Dosovitskiy, P. Fischer, E. Ilg, P. Häusser, C. Hazirbas, V. Golkov, P. van der Smagt, D. Cremers and T. Brox. FlowNet: Learning Optical Flow with Convolutional Networks. ICCV 2015]
11 | Thus, the input is made up of two images and a coarse optical flow, typically output from a deep neural network. If no input flow is provided, the initial field is set to zero everywhere. As in the OpenCV modules variationalrefinement and cudaoptflow, the red-black successive overrelaxation method (SOR) is used to solve the linear systems. See the paper by Brox et al:
12 | [T. Brox, A. Bruhn, N. Papenberg, J. Weickert. High Accuracy Optical Flow Estimation Based on a Theory for Warping. ECCV 2004]
13 |
14 | The DeepFlowCUDA class uses the cv::cudev::GpuMat_ template class, so you'll need OpenCV with additional opencv_contrib modules built (see https://github.com/opencv/opencv_contrib)
15 |
16 | Requirements:
17 | - OpenCV with opencv_contrib modules (need CUDA modules)
18 | - CUDA build environment (nvcc should be in your PATH)
19 |
20 | The Makefile assumes that CUDA is installed in /usr/local/cuda-9.2 and OpenCV in /usr/local/opencv-4.1.0-build
21 | Just edit the paths in the Makefile and run
22 | > make
23 | > ./deepflow
24 |
25 | Please report any bug to julien.mille@insa-cvl.fr
26 | Thanks!
27 |
28 | Copyright 2019 Julien Mille
29 |
--------------------------------------------------------------------------------
/data/dirtyflow_0001.flo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/dirtyflow_0001.flo
--------------------------------------------------------------------------------
/data/frame_0001.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0001.jpg
--------------------------------------------------------------------------------
/data/frame_0002.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0002.jpg
--------------------------------------------------------------------------------
/data/gtflow_0001.flo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/gtflow_0001.flo
--------------------------------------------------------------------------------
/deepflowcuda.cu:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2019 Julien Mille
3 |
4 | This file is part of DeepFlowCUDA.
5 |
6 | DeepFlowCUDA is free software: you can redistribute
7 | it and/or modify it under the terms of the GNU Lesser General Public License
8 | as published by the Free Software Foundation, either version 3 of the License,
9 | or (at your option) any later version.
10 |
11 | DeepFlowCUDA is distributed in the hope that it will
12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
14 | General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License,
17 | and a copy of the GNU Lesser General Public License, along with
18 | DeepFlowCUDA. If not, see <http://www.gnu.org/licenses/>.
19 | */
20 |
21 | #include "deepflowcuda.h"
22 | #include // for pixelwise operations on cv::cudev::GpuMat_
23 | #include // for cv::cuda::resize
24 | #include
25 |
26 | using namespace std;
27 |
28 | // Maximum number of CUDA threads per block, per dimension, for 2D thread blocks
29 | // It is equivalent to 32^2=1024 threads per 1D block
30 | #define THREADS_PER_BLOCK_2D 32
31 |
32 | // Rounded up division, to compute number of thread blocks when launching CUDA kernels
33 | int divUp(int a, int b) // ceil(a/b) for positive operands: grid-size helper for kernel launches
34 | {
35 | return (a + b - 1)/b;
36 | }
37 |
38 | DeepFlowCuda::DeepFlowCuda() // set DeepFlow defaults and upload the parameter array to the GPU
39 | {
40 | // We use the same setting as DeepFlow, except for the scale factor
41 | fixedPointIterations = 5; // outer (non-linear) iterations per pyramid level
42 | sorIterations = 25; // red-black SOR sweeps per fixed-point iteration
43 | alpha = 1.0; // smoothness term weight (used in smoothnessTermKernel)
44 | beta = 32.0; // matching term weight (used in matchingTermKernel; 0 disables the term)
45 | delta = 0.1; // color-constancy weight (used in dataTermKernel)
46 | gamma = 0.7; // gradient-constancy weight (used in dataTermKernel)
47 | omega = 1.6; // SOR over-relaxation factor (params[4])
48 | zeta = 0.1; // zeta^2 regularizes the gradient normalizations in dataTermKernel
49 | epsilon = 0.01; // epsilon^2 regularizes the robust sqrt penalties
50 | scaleFactor = 0.5; // pyramid downscaling ratio between levels
51 | sigma = 0; // 0.65; pre-smoothing Gaussian std (filtg in createFilters); 0 disables it
52 | minSize = 10; // presumably the minimum pyramid image dimension -- TODO confirm (not used in this chunk)
53 |
54 | paramsCuda = nullptr;
55 | copyParamsToCuda(); // presumably allocates paramsCuda on the device and copies the 7 params -- confirm
56 |
57 | current.step = 0;
58 | current.stepColor = 0;
59 |
60 | padding = 1;
61 |
62 | assert(createFilters()); // NOTE(review): the call is inside assert(), so it vanishes under -DNDEBUG and no filter would be created
63 | }
64 |
65 | DeepFlowCuda::~DeepFlowCuda() // release the device-side parameter array, if any
66 | {
67 | if (paramsCuda!=nullptr)
68 | cudaFree(paramsCuda);
69 | }
70 |
71 | bool DeepFlowCuda::createFilters() // build the CUDA derivative/Gaussian filters; returns false if any creation fails
72 | {
73 | cv::Mat deriv, deriv5pt;
74 | cv::Mat o = cv::Mat::ones(1, 1, CV_32F); // 1x1 identity kernel for the non-derived dimension
75 |
76 | // CUDA filters for centered finite differences
77 | deriv.create(1, 3, CV_32F);
78 | deriv.at(0,0) = -0.5; // NOTE(review): template argument of cv::Mat::at (presumably <float>) was stripped from this dump
79 | deriv.at(0,1) = 0.0;
80 | deriv.at(0,2) = 0.5;
81 |
82 | filtx = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv, o, cv::Point(-1, -1), cv::BORDER_REPLICATE);
83 | if (filtx==nullptr)
84 | return false;
85 |
86 | filty = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv, cv::Point(-1, -1), cv::BORDER_REPLICATE);
87 | if (filty==nullptr)
88 | return false;
89 |
90 | // CUDA filters for finite differences with 5-point stencil
91 | deriv5pt.create(1, 5, CV_32F);
92 | deriv5pt.at(0,0) = 1.0/12; // classic 5-point derivative stencil [1 -8 0 8 -1]/12
93 | deriv5pt.at(0,1) = -8.0/12;
94 | deriv5pt.at(0,2) = 0;
95 | deriv5pt.at(0,3) = 8.0/12;
96 | deriv5pt.at(0,4) = -1.0/12;
97 |
98 | filtx5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv5pt, o, cv::Point(-1, -1), cv::BORDER_REPLICATE);
99 | if (filtx5pt==nullptr)
100 | return false;
101 |
102 | filty5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv5pt, cv::Point(-1, -1), cv::BORDER_REPLICATE);
103 | if (filty5pt==nullptr)
104 | return false;
105 |
106 | if (sigma!=0) // optional image pre-smoothing filter, disabled when sigma==0 (the default)
107 | {
108 | filtg = cv::cuda::createGaussianFilter(CV_32FC3, CV_32FC3, cv::Size(0,0), sigma, sigma, cv::BORDER_REPLICATE);
109 | if (filtg==nullptr)
110 | return false;
111 | }
112 |
113 | return true;
114 | }
115 |
116 | __global__ void warpKernel(int width, int height, int padding, int step, int stepColor, const float3 *I, const float *u, const float *v, float3 *warpedI) // backward-warp image I by flow (u,v) with bilinear interpolation; one thread per (non-padded) pixel
117 | {
118 | int j = blockDim.x * blockIdx.x + threadIdx.x;
119 | int i = blockDim.y * blockIdx.y + threadIdx.y;
120 | if (i>=height || j>=width)
121 | return;
122 |
123 | int offset = (i+padding)*step + j+padding; // scalar (flow) offset; step/stepColor are in float elements
124 | MyVec3f *pWarpedI = (MyVec3f *)((float *)warpedI + ((i+padding)*stepColor + 3*(j+padding)));
125 |
126 | float x, y, xx, yy, dx, dy;
127 | int x1, x2, y1, y2;
128 |
129 | xx = j + u[offset]; // source position in I
130 | yy = i + v[offset];
131 | x = floor(xx);
132 | y = floor(yy);
133 | dx = xx-x; // bilinear interpolation weights
134 | dy = yy-y;
135 |
136 | x1 = (int)x;
137 | x2 = x1+1;
138 | y1 = (int)y;
139 | y2 = y1+1;
140 |
141 | if (x1<0) x1=0; else if (x1>=width) x1 = width-1; // clamp the four sample corners (replicate-border behavior)
142 | if (x2<0) x2=0; else if (x2>=width) x2 = width-1;
143 | if (y1<0) y1=0; else if (y1>=height) y1 = height-1;
144 | if (y2<0) y2=0; else if (y2>=height) y2 = height-1;
145 |
146 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I + ((y1+padding)*stepColor + 3*padding)); // row pointers; indexed by pixel below
147 | const MyVec3f *pI2 = (const MyVec3f *)((float *)I + ((y2+padding)*stepColor + 3*padding));
148 |
149 | *pWarpedI =
150 | pI1[x1]*(1.0f-dx)*(1.0f-dy) +
151 | pI1[x2]*dx*(1.0f-dy) +
152 | pI2[x1]*(1.0f-dx)*dy +
153 | pI2[x2]*dx*dy;
154 | }
155 |
156 | __global__ void averageKernel(int width, int height, int stepColor, const float3 *a, const float3 *b, float3 *c) // per-pixel c = (a + b)/2; no padding handling, caller passes full (padded) extent
157 | {
158 | int j = blockDim.x * blockIdx.x + threadIdx.x;
159 | int i = blockDim.y * blockIdx.y + threadIdx.y;
160 | if (i>=height || j>=width)
161 | return;
162 |
163 | int offset = i*stepColor + 3*j; // stepColor is in float elements
164 | const MyVec3f *pA = (const MyVec3f *)((float *)a + offset);
165 | const MyVec3f *pB = (const MyVec3f *)((float *)b + offset);
166 | MyVec3f *pC = (MyVec3f *)((float *)c + offset);
167 |
168 | *pC = (*pA+*pB)*0.5;
169 | }
170 |
171 | void DeepFlowCuda::prepareBuffers(int scale) // allocate per-level padded buffers, upscale the coarser flow, warp I1 and precompute all image derivatives
172 | {
173 | current.size = pyramid.I0[scale].size();
174 |
175 | current.sizePadded.width = current.size.width + 2*padding; // 1-pixel border so stencil kernels can read neighbors without bounds checks
176 | current.sizePadded.height = current.size.height + 2*padding;
177 |
178 | current.A11.create(current.sizePadded); // 2x2 system (A, b) assembled by the data/matching/smoothness terms
179 | current.A12.create(current.sizePadded);
180 | current.A22.create(current.sizePadded);
181 | current.b1.create(current.sizePadded);
182 | current.b2.create(current.sizePadded);
183 |
184 | current.luminance.create(current.sizePadded);
185 | current.smoothX.create(current.sizePadded);
186 | current.smoothY.create(current.sizePadded);
187 | current.smoothWeight.create(current.sizePadded);
188 |
189 | current.smoothX.setTo(0.0);
190 | current.smoothY.setTo(0.0);
191 | current.smoothWeight.setTo(0.0);
192 |
193 | if (scale!=nbScales-1) // not the coarsest level: init flow by upscaling the coarser result
194 | {
195 | cv::cuda::resize(pyramid.ufinal[scale+1], pyramid.uinit[scale], pyramid.I0[scale].size());
196 | cv::cuda::resize(pyramid.vfinal[scale+1], pyramid.vinit[scale], pyramid.I0[scale].size());
197 |
198 | cv::cuda::multiply(pyramid.uinit[scale], 1.0/scaleFactor, pyramid.uinit[scale]); // rescale displacement magnitudes to the finer grid
199 | cv::cuda::multiply(pyramid.vinit[scale], 1.0/scaleFactor, pyramid.vinit[scale]);
200 | }
201 |
202 | if (padding!=0)
203 | {
204 | cv::cuda::copyMakeBorder(pyramid.I0[scale], current.I0, padding, padding, padding, padding, cv::BORDER_REPLICATE);
205 | cv::cuda::copyMakeBorder(pyramid.uinit[scale], current.u, padding, padding, padding, padding, cv::BORDER_REPLICATE);
206 | cv::cuda::copyMakeBorder(pyramid.vinit[scale], current.v, padding, padding, padding, padding, cv::BORDER_REPLICATE);
207 |
208 | if (beta!=0) // matching term enabled: also pad the descriptor-match fields
209 | {
210 | cv::cuda::copyMakeBorder(pyramid.udesc[scale], current.udesc, padding, padding, padding, padding, cv::BORDER_REPLICATE);
211 | cv::cuda::copyMakeBorder(pyramid.vdesc[scale], current.vdesc, padding, padding, padding, padding, cv::BORDER_REPLICATE);
212 | cv::cuda::copyMakeBorder(pyramid.descWeight[scale], current.descWeight, padding, padding, padding, padding, cv::BORDER_REPLICATE);
213 | }
214 | }
215 | else {
216 | pyramid.I0[scale].copyTo(current.I0);
217 | pyramid.uinit[scale].copyTo(current.u);
218 | pyramid.vinit[scale].copyTo(current.v);
219 |
220 | if (beta!=0)
221 | {
222 | pyramid.udesc[scale].copyTo(current.udesc);
223 | pyramid.vdesc[scale].copyTo(current.vdesc);
224 | pyramid.descWeight[scale].copyTo(current.descWeight);
225 | }
226 | }
227 |
228 | current.step = current.A11.step/sizeof(float); // steps cached in float elements, not bytes
229 | current.stepColor = current.I0.step/sizeof(float);
230 |
231 | // Computing an average of the current and warped next frames (to compute the derivatives on) and temporal derivative Iz
232 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid;
233 |
234 | // Otherwise, pyramid.uinit[scale] and pyramid.vinit[scale] are already initialized
235 | cv::cudev::GpuMat_ gpuwarpedI, gpuaveragedI; // NOTE(review): template arguments (presumably <float3>) were stripped from this dump, as were the <<<...>>> launch configs below
236 |
237 | // Average everywhere
238 | gpuwarpedI.create(current.size);
239 |
240 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1);
241 | warpKernel<<>>(current.size.width, current.size.height, 0, pyramid.uinit[scale].step/sizeof(float), pyramid.I1[scale].step/sizeof(float),
242 | pyramid.I1[scale][0], pyramid.uinit[scale][0], pyramid.vinit[scale][0], gpuwarpedI[0]);
243 |
244 | if (padding!=0)
245 | cv::cuda::copyMakeBorder(gpuwarpedI, gpuwarpedI, padding, padding, padding, padding, cv::BORDER_REPLICATE);
246 |
247 | // Average everywhere
248 | gpuaveragedI.create(current.sizePadded);
249 |
250 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); // NOTE(review): y dimension uses current.size.height but the kernel iterates over sizePadded.height -- the bottom padded rows look uncovered; confirm and use sizePadded.height
251 | averageKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.stepColor, current.I0[0], gpuwarpedI[0], gpuaveragedI[0]);
252 | cv::cuda::subtract(gpuwarpedI, current.I0, current.Iz); // temporal derivative Iz = warped(I1) - I0
253 |
254 | filtx5pt->apply(gpuaveragedI, current.Ix); // first spatial derivatives of the averaged frame
255 | filty5pt->apply(gpuaveragedI, current.Iy);
256 |
257 | filtx5pt->apply(current.Iz, current.Ixz); // spatio-temporal derivatives
258 | filty5pt->apply(current.Iz, current.Iyz);
259 |
260 | filtx5pt->apply(current.Ix, current.Ixx); // second derivatives for the gradient-constancy term
261 | filty5pt->apply(current.Ix, current.Ixy);
262 | filty5pt->apply(current.Iy, current.Iyy);
263 |
264 | current.u.copyTo(current.utmp);
265 | current.v.copyTo(current.vtmp);
266 |
267 | current.du.create(current.sizePadded); // flow increments solved by SOR, start at zero
268 | current.dv.create(current.sizePadded);
269 | current.du.setTo(0.0);
270 | current.dv.setTo(0.0);
271 | }
272 |
273 | __global__ void structureTensorKernel(int width, int height, int step, int stepColor, const float3 *Ix, const float3 *Iy, float *stx2, float *stxy, float *sty2) // per-pixel structure tensor entries from color gradients: stx2=Ix.Ix, stxy=Ix.Iy, sty2=Iy.Iy (summed over channels)
274 | {
275 | int j = blockDim.x * blockIdx.x + threadIdx.x;
276 | int i = blockDim.y * blockIdx.y + threadIdx.y;
277 | if (i>=height || j>=width)
278 | return;
279 |
280 | int offset = i*step + j;
281 | int offsetColor = i*stepColor + 3*j;
282 | const MyVec3f *pIx = (const MyVec3f *)((float *)Ix + offsetColor);
283 | const MyVec3f *pIy = (const MyVec3f *)((float *)Iy + offsetColor);
284 |
285 | stx2[offset] = (*pIx).dot(*pIx);
286 | stxy[offset] = (*pIx).dot(*pIy);
287 | sty2[offset] = (*pIy).dot(*pIy);
288 | }
289 |
290 | __global__ void minEigenvalueKernel(int width, int height, int step, const float *stx2, const float *stxy, const float *sty2, float *minEigen) // smaller eigenvalue of the 2x2 structure tensor [stx2 stxy; stxy sty2] (cornerness measure)
291 | {
292 | int j = blockDim.x * blockIdx.x + threadIdx.x;
293 | int i = blockDim.y * blockIdx.y + threadIdx.y;
294 | if (i>=height || j>=width)
295 | return;
296 |
297 | int offset = i*step + j;
298 | const float *pstx2 = stx2 + offset;
299 | const float *pstxy = stxy + offset;
300 | const float *psty2 = sty2 + offset;
301 |
302 | float t = 0.5f * (*pstx2 + *psty2); // half trace
303 | float t2 = t*t + (*pstxy)*(*pstxy) - (*pstx2)*(*psty2); // (trace/2)^2 - det
304 | minEigen[offset] = t - (t2<=0.0f?0.0f:sqrtf(t2)); // may be negative due to floating points approximation
305 | }
306 |
307 | __global__ void matchingScoreKernel(int width, int height, int step, int stepColor, // per-pixel match weight: Gaussian of the color+gradient dissimilarity between I0 and the matched position in I1, scaled by corner strength
308 | const float3 *I0, const float3 *I1,
309 | const float3 *I0x, const float3 *I0y, const float3 *I1x, const float3 *I1y,
310 | const float *udesc, const float *vdesc, const float *minEigen, float *descWeight)
311 | {
312 | int j = blockDim.x * blockIdx.x + threadIdx.x;
313 | int i = blockDim.y * blockIdx.y + threadIdx.y;
314 | if (i>=height || j>=width)
315 | return;
316 |
317 | int offset = i*step + j;
318 | int offsetColor = i*stepColor + 3*j;
319 | const MyVec3f *pI0 = (const MyVec3f *)((float *)I0 + offsetColor);
320 | const MyVec3f *pI0x = (const MyVec3f *)((float *)I0x + offsetColor);
321 | const MyVec3f *pI0y = (const MyVec3f *)((float *)I0y + offsetColor);
322 |
323 | float xw, yw;
324 | xw = (int)(j + udesc[offset]); // NOTE(review): cast discards the fractional part of the matched position (floor toward zero), no bilinear sampling -- confirm intended
325 | yw = (int)(i + vdesc[offset]);
326 | if (xw<0) xw=0;
327 | else if (xw>width-1) xw = width-1;
328 | if (yw<0) yw=0;
329 | else if (yw>height-1) yw = height-1;
330 |
331 | int offsetColor2 = yw*stepColor + 3*xw; // xw/yw are floats holding integral values; implicit conversion back to int here
332 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I1 + offsetColor2);
333 | const MyVec3f *pI1x = (const MyVec3f *)((float *)I1x + offsetColor2);
334 | const MyVec3f *pI1y = (const MyVec3f *)((float *)I1y + offsetColor2);
335 |
336 | float gradWeight = 1.0; // relative weight of gradient dissimilarity vs color dissimilarity
337 | float flow_sigma_score = 50.0f; // std of the Gaussian scoring function
338 | float mul_coef = 10.0f;
339 |
340 | float flowscore = (*pI0 - *pI1).l1norm() + gradWeight * ((*pI0x - *pI1x).l1norm() + (*pI0y - *pI1y).l1norm());
341 | float t2 = minEigen[offset];
342 |
343 | float t = 1.0f/(flow_sigma_score*sqrtf(2.0f*M_PI)); // Gaussian normalization constant
344 | float sigmascore2 = -0.5f/(flow_sigma_score*flow_sigma_score);
345 |
346 | t2 = t2<=0.0?0.0:sqrtf(t2); // sqrt of min eigenvalue, clamped at zero
347 | descWeight[offset] = mul_coef * t2 * t * expf( flowscore*flowscore*sigmascore2 );
348 | if (descWeight[offset]<0.0) // may be negative due to floating points approximation
349 | descWeight[offset] = 0.0;
350 | }
351 |
352 | void DeepFlowCuda::computeDescWeight() // compute per-pixel confidence weights for the descriptor matches at the finest scale
353 | {
354 | // Input : pyramid.I0[0], pyramid.I1[0]
355 | // Output : pyramid.descWeight[0]
356 | cv::cudev::GpuMat_ I0x, I0y; // NOTE(review): template args (presumably <float3>/<float>) and <<<...>>> launch configs were stripped from this dump
357 |
358 | // Structure tensor
359 | cv::cudev::GpuMat_ stx2, stxy, sty2, minEigen;
360 |
361 | filtx->apply(pyramid.I0[0], I0x); // centered 3-point derivatives of I0
362 | filty->apply(pyramid.I0[0], I0y);
363 |
364 | stx2.create(I0x.size());
365 | stxy.create(I0x.size());
366 | sty2.create(I0x.size());
367 |
368 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(I0x.cols, threadsPerBlock.x), divUp(I0x.rows, threadsPerBlock.y), 1);
369 |
370 | // No padding here
371 | structureTensorKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float),
372 | I0x[0], I0y[0], stx2[0], stxy[0], sty2[0]);
373 |
374 | // Smooth structure tensor
375 | shared_ptr fg = cv::cuda::createGaussianFilter(CV_32F, CV_32F, cv::Size(0,0), 3.0, 3.0, cv::BORDER_REPLICATE);
376 | fg->apply(stx2, stx2);
377 | fg->apply(stxy, stxy);
378 | fg->apply(sty2, sty2);
379 |
380 | minEigen.create(I0x.size());
381 | pyramid.descWeight[0].create(I0x.size());
382 |
383 | minEigenvalueKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), stx2[0], stxy[0], sty2[0], minEigen[0]);
384 |
385 | cv::cudev::GpuMat_ I0x5pt, I0y5pt, I1x5pt, I1y5pt;
386 |
387 | filtx5pt->apply(pyramid.I0[0], I0x5pt); // 5-point derivatives of both frames for the gradient dissimilarity
388 | filty5pt->apply(pyramid.I0[0], I0y5pt);
389 | filtx5pt->apply(pyramid.I1[0], I1x5pt);
390 | filty5pt->apply(pyramid.I1[0], I1y5pt);
391 |
392 | matchingScoreKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float),
393 | pyramid.I0[0][0], pyramid.I1[0][0],
394 | I0x5pt[0], I0y5pt[0], I1x5pt[0], I1y5pt[0],
395 | pyramid.udesc[0][0], pyramid.vdesc[0][0], minEigen[0], pyramid.descWeight[0][0]);
396 | }
397 |
398 | // Computed on all pixels (including padding)
399 | __global__ void luminanceKernel(int width, int height, int step, int stepColor, const float3 *I, float *lum) // Rec.601 luma per pixel; applying the red coefficient to .z suggests BGR channel order -- confirm
400 | {
401 | int j = blockDim.x * blockIdx.x + threadIdx.x;
402 | int i = blockDim.y * blockIdx.y + threadIdx.y;
403 | if (i>=height || j>=width)
404 | return;
405 |
406 | int offsetI = i*stepColor + 3*j;
407 | int offset = i*step+j;
408 |
409 | const MyVec3f *pI = (const MyVec3f *)((const float *)I + offsetI);
410 | lum[offset] = 0.299f*pI->z + 0.587f*pI->y + 0.114f*pI->x;
411 | }
412 |
413 | __global__ void smoothnessWeightKernel(int width, int height, int padding, int step, const float *lum, float *smoothWeight, float coef) // edge-aware weight 0.5*exp(-coef*|grad lum|); runs on interior pixels only, reading neighbors from the padded border
414 | {
415 | int j = blockDim.x * blockIdx.x + threadIdx.x;
416 | int i = blockDim.y * blockIdx.y + threadIdx.y;
417 | if (i>=height || j>=width)
418 | return;
419 |
420 | int offset = (i+padding)*step + padding + j;
421 | float lumx = (lum[offset+1]-lum[offset-1])*0.5; // centered differences
422 | float lumy = (lum[offset+step]-lum[offset-step])*0.5;
423 | smoothWeight[offset] = 0.5f*expf(-coef*sqrtf(lumx*lumx+lumy*lumy));
424 | }
425 |
426 | void DeepFlowCuda::computeSmoothnessWeight() // luminance over the padded extent, then the edge-aware weight over the interior
427 | {
428 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid;
429 |
430 | // Luminance everywhere (including padding)
431 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.sizePadded.height, threadsPerBlock.y), 1);
432 | luminanceKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.step, current.stepColor, current.I0[0], current.luminance[0]);
433 |
434 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1);
435 | smoothnessWeightKernel<<>>(current.size.width, current.size.height, padding, current.step, current.luminance[0], current.smoothWeight[0], 5.0/255.0);
436 | }
437 |
438 | __global__ void dataTermKernel( // assemble the per-pixel 2x2 system (A11,A12,A22,b1,b2) for the data term: robust color constancy (delta) + gradient constancy (gamma), linearized at the current increment (du,dv)
439 | const float *params, int width, int height, int padding, int step, int stepColor,
440 | const float3 *Ix, const float3 *Iy, const float3 *Iz,
441 | const float3 *Ixx, const float3 *Ixy, const float3 *Iyy,
442 | const float3 *Ixz, const float3 *Iyz,
443 | float *A11, float *A12, float *A22, float *b1, float *b2,
444 | const float *du, const float *dv)
445 | {
446 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon};
447 | float delta = params[2], gamma = params[3], zeta = params[5], epsilon = params[6];
448 |
449 | float zeta_squared = zeta * zeta; // regularizer for the gradient normalizations
450 | float epsilon_squared = epsilon * epsilon; // regularizer for the robust sqrt penalty
451 |
452 | const MyVec3f *pIx, *pIy, *pIz;
453 | const MyVec3f *pIxx, *pIxy, *pIyy, *pIxz, *pIyz;
454 | const float *pdU, *pdV;
455 | float *pa11, *pa12, *pa22, *pb1, *pb2;
456 |
457 | int j = blockDim.x * blockIdx.x + threadIdx.x;
458 | int i = blockDim.y * blockIdx.y + threadIdx.y;
459 | if (i>=height || j>=width)
460 | return;
461 |
462 | // stepColor is in number of float elements!
463 | int offsetColor = (i+padding)*stepColor + 3*(padding+j);
464 | pIx = (const MyVec3f *)((float *)Ix + offsetColor);
465 | pIy = (const MyVec3f *)((float *)Iy + offsetColor);
466 | pIz = (const MyVec3f *)((float *)Iz + offsetColor);
467 | pIxx = (const MyVec3f *)((float *)Ixx + offsetColor);
468 | pIxy = (const MyVec3f *)((float *)Ixy + offsetColor);
469 | pIyy = (const MyVec3f *)((float *)Iyy + offsetColor);
470 | pIxz = (const MyVec3f *)((float *)Ixz + offsetColor);
471 | pIyz = (const MyVec3f *)((float *)Iyz + offsetColor);
472 |
473 | int offset = (i+padding)*step + padding+j;
474 | pa11 = A11 + offset;
475 | pa12 = A12 + offset;
476 | pa22 = A22 + offset;
477 | pb1 = b1 + offset;
478 | pb2 = b2 + offset;
479 | pdU = du + offset;
480 | pdV = dv + offset;
481 |
482 | *pa11 = 0; // data term overwrites; matching/smoothness terms accumulate afterwards
483 | *pa12 = 0;
484 | *pa22 = 0;
485 | *pb1 = 0;
486 | *pb2 = 0;
487 |
488 | // Color constancy
489 | if (delta!=0.0)
490 | {
491 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared);
492 | float hdover3 = delta*0.5f/3.0f; // half delta, averaged over the 3 channels
493 | float mask = 1.0;
494 |
495 | MyVec3f ngradI = *pIx*(*pIx) + *pIy*(*pIy) + dnorm; // per-channel gradient-norm normalization
496 |
497 | MyVec3f Ik1z = *pIz + *pIx*(*pdU) + *pIy*(*pdV); // linearized residual at the current increment
498 | float tmp = mask*hdover3/sqrt((Ik1z*Ik1z/ngradI).sum()+epsilon_squared); // robust (Charbonnier-like) weight
499 | MyVec3f ti = MyVec3f(tmp, tmp, tmp)/ngradI;
500 |
501 | *pa11 += (ti*(*pIx)*(*pIx)).sum();
502 | *pa12 += (ti*(*pIx)*(*pIy)).sum();
503 | *pa22 += (ti*(*pIy)*(*pIy)).sum();
504 | *pb1 -= (ti*(*pIx)*(*pIz)).sum();
505 | *pb2 -= (ti*(*pIy)*(*pIz)).sum();
506 | }
507 |
508 | // Gradient constancy
509 | if (gamma!=0)
510 | {
511 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared);
512 | float hgover3 = gamma*0.5f/3.0f;
513 | float mask = 1.0;
514 |
515 | MyVec3f nx = *pIxx*(*pIxx) + *pIxy*(*pIxy) + dnorm; // normalizations for the x- and y-gradient residuals
516 | MyVec3f ny = *pIyy*(*pIyy) + *pIxy*(*pIxy) + dnorm;
517 |
518 | MyVec3f tmpx = *pIxz + *pIxx*(*pdU) + *pIxy*(*pdV); // linearized gradient residuals
519 | MyVec3f tmpy = *pIyz + *pIxy*(*pdU) + *pIyy*(*pdV);
520 |
521 | float tmp = mask*hgover3/sqrt((tmpx*tmpx/nx).sum() + (tmpy*tmpy/ny).sum() + epsilon_squared);
522 |
523 | MyVec3f tix = MyVec3f(tmp, tmp, tmp)/nx;
524 | MyVec3f tiy = MyVec3f(tmp, tmp, tmp)/ny;
525 |
526 | *pa11 += (tix*(*pIxx)*(*pIxx) + tiy*(*pIxy)*(*pIxy)).sum();
527 | *pa12 += (tix*(*pIxx)*(*pIxy) + tiy*(*pIxy)*(*pIyy)).sum();
528 | *pa22 += (tix*(*pIxy)*(*pIxy) + tiy*(*pIyy)*(*pIyy)).sum();
529 |
530 | *pb1 -= (tix*(*pIxx)*(*pIxz) + tiy*(*pIxy)*(*pIyz)).sum();
531 | *pb2 -= (tix*(*pIxy)*(*pIxz) + tiy*(*pIyy)*(*pIyz)).sum();
532 | }
533 | }
534 |
535 | void DeepFlowCuda::computeDataTerm() // launch dataTermKernel over the interior pixels of the current level
536 | {
537 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1);
538 |
539 | dataTermKernel<<>>(
540 | paramsCuda, current.size.width, current.size.height, padding, current.step, current.stepColor,
541 | current.Ix[0], current.Iy[0], current.Iz[0],
542 | current.Ixx[0], current.Ixy[0], current.Iyy[0], current.Ixz[0], current.Iyz[0],
543 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0],
544 | current.du[0], current.dv[0]);
545 | }
546 |
547 | __global__ void matchingTermKernel( // add the robust descriptor-matching penalty (weighted by beta and descWeight) to the diagonal of A and to b
548 | const float *params, int width, int height, int padding, int step,
549 | float *A11, float *A22, float *b1, float *b2,
550 | const float *u, const float *v, const float *utmp, const float *vtmp, const float *udesc, const float *vdesc, const float *descWeight)
551 | {
552 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon};
553 | float beta = params[1], epsilon = params[6];
554 |
555 | float epsilon_squared = epsilon*epsilon;
556 |
557 | int j = blockDim.x * blockIdx.x + threadIdx.x;
558 | int i = blockDim.y * blockIdx.y + threadIdx.y;
559 | if (i>=height || j>=width)
560 | return;
561 |
562 | int offset = (i+padding)*step + padding+j;
563 |
564 | const float *pudesc = udesc + offset;
565 | const float *pvdesc = vdesc + offset;
566 |
567 | float tmpx = utmp[offset] - *pudesc; // deviation of the current flow estimate from the match
568 | float tmpy = vtmp[offset] - *pvdesc;
569 | float tmp = 0.5*descWeight[offset]*beta/sqrt(tmpx*tmpx+tmpy*tmpy+epsilon_squared); // robust weight
570 | A11[offset] += tmp;
571 | A22[offset] += tmp;
572 | b1[offset] -= tmp*(u[offset] - *pudesc);
573 | b2[offset] -= tmp*(v[offset] - *pvdesc);
574 | }
575 |
576 | void DeepFlowCuda::computeMatchingTerm() // launch matchingTermKernel over the interior pixels of the current level
577 | {
578 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1);
579 |
580 | matchingTermKernel<<>>(
581 | paramsCuda, current.size.width, current.size.height, padding, current.step,
582 | current.A11[0], current.A22[0], current.b1[0], current.b2[0],
583 | current.u[0], current.v[0], current.utmp[0], current.vtmp[0], current.udesc[0], current.vdesc[0], current.descWeight[0]);
584 | }
585 |
586 | __global__ void smoothnessTermKernel( // robust edge-aware smoothness coefficients on the half-grid: smoothX between (i,j)-(i,j+1), smoothY between (i,j)-(i+1,j)
587 | const float *params, int width, int height, int padding, int step,
588 | float *smoothX, float *smoothY, const float *smoothWeight,
589 | const float *utmp, const float *vtmp)
590 | {
591 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon};
592 | float alpha = params[0], epsilon = params[6];
593 |
594 | float epsilon_smooth = epsilon*epsilon; // 0.001f*0.001f;
595 |
596 | int j = blockDim.x * blockIdx.x + threadIdx.x;
597 | int i = blockDim.y * blockIdx.y + threadIdx.y;
598 | if (i>=height || j>=width)
599 | return;
600 |
601 | int offset = (i+padding)*step + padding + j;
602 |
603 | float *psmoothX = smoothX + offset;
604 | float *psmoothY = smoothY + offset;
605 | const float *psmoothWeight = smoothWeight + offset;
606 | const float *pu = utmp + offset;
607 | const float *pv = vtmp + offset;
608 |
609 | float ux1 = pu[1]-pu[0]; // forward differences on the half-grid
610 | float vx1 = pv[1]-pv[0];
611 | float uy1 = pu[step]-pu[0];
612 | float vy1 = pv[step]-pv[0];
613 |
614 | float ux2 = (pu[1]-pu[-1])*0.5; // centered differences at the pixel
615 | float vx2 = (pv[1]-pv[-1])*0.5;
616 | float uy2 = (pu[step]-pu[-step])*0.5;
617 | float vy2 = (pv[step]-pv[-step])*0.5;
618 |
619 | float tmpu = 0.5*(uy2 + (pu[step+1]-pu[-step+1])*0.5); // cross derivative averaged onto the x half-grid point
620 | float uxsq = ux1*ux1 + tmpu*tmpu;
621 | float tmpv = 0.5*(vy2 + (pv[step+1]-pv[-step+1])*0.5);
622 | float vxsq = vx1*vx1 + tmpv*tmpv;
623 |
624 | *psmoothX = alpha*0.5*(psmoothWeight[0]+psmoothWeight[1])/sqrt(uxsq+vxsq+epsilon_smooth);
625 |
626 | tmpu = 0.5*(ux2 + (pu[step+1]-pu[step-1])*0.5);
627 | float uysq = uy1*uy1 + tmpu*tmpu;
628 | tmpv = 0.5*(vx2 + (pv[step+1]-pv[step-1])*0.5);
629 | float vysq = vy1*vy1 + tmpv*tmpv;
630 |
631 | *psmoothY = alpha*0.5*(psmoothWeight[0]+psmoothWeight[step])/sqrt(uysq+vysq+epsilon_smooth);
632 | }
633 |
634 | __global__ void applySmoothKernel( // add the weighted flow divergence (discrete div(smooth * grad)) of the current flow to the right-hand side b
635 | int width, int height, int padding, int step,
636 | float *b1, float *b2,
637 | const float *smoothX, const float *smoothY,
638 | const float *u, const float *v
639 | )
640 | {
641 | int j = blockDim.x * blockIdx.x + threadIdx.x;
642 | int i = blockDim.y * blockIdx.y + threadIdx.y;
643 | if (i>=height || j>=width)
644 | return;
645 |
646 | int offset = (i+padding)*step + padding + j;
647 | const float *pu = u + offset;
648 | const float *pv = v + offset;
649 | const float *psx = smoothX + offset;
650 | const float *psy = smoothY + offset;
651 |
652 | // V1: half-grid coefficients (psx[-1], psy[-step] are the left/upper half-grid weights)
653 | b1[offset] += -psx[-1]*(pu[0]-pu[-1]) + psx[0]*(pu[1]-pu[0]) -psy[-step]*(pu[0]-pu[-step]) + psy[0]*(pu[step]-pu[0]);
654 | b2[offset] += -psx[-1]*(pv[0]-pv[-1]) + psx[0]*(pv[1]-pv[0]) -psy[-step]*(pv[0]-pv[-step]) + psy[0]*(pv[step]-pv[0]);
655 |
656 | // V2
657 | // b1[offset] += smoothX[offset]*(pu[1]-2*pu[0]+pu[-1]) + smoothY[offset]*(pu[step]-2*pu[0]+pu[-step]);
658 | // b2[offset] += smoothX[offset]*(pv[1]-2*pv[0]+pv[-1]) + smoothY[offset]*(pv[step]-2*pv[0]+pv[-step]);
659 | }
660 |
661 | void DeepFlowCuda::computeSmoothnessTerm() // recompute smoothness coefficients from utmp/vtmp, then fold them into b via applySmoothKernel
662 | {
663 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1);
664 |
665 | smoothnessTermKernel<<>>(
666 | paramsCuda, current.size.width, current.size.height, padding, current.step,
667 | current.smoothX[0], current.smoothY[0], current.smoothWeight[0],
668 | current.utmp[0], current.vtmp[0]);
669 |
670 | applySmoothKernel<<>>(
671 | current.size.width, current.size.height, padding, current.step,
672 | current.b1[0], current.b2[0], current.smoothX[0], current.smoothY[0], current.u[0], current.v[0]);
673 | }
674 |
675 | __global__ void RedBlackSORKernel(
676 | const float *params, int width, int height, int padding, int step, bool redpass,
677 | const float *a11, const float *a12, const float *a22, const float *b1, const float *b2,
678 | const float *smoothX, const float *smoothY,
679 | float *du, float *dv)
680 | {
681 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon};
682 | float omega = params[4];
683 |
684 | int halfWidth = width/2 + width%2;
685 | int widthRow; // Width of current row
686 |
687 | int j = blockDim.x * blockIdx.x + threadIdx.x;
688 | int i = blockDim.y * blockIdx.y + threadIdx.y;
689 | if (i>=height)
690 | return;
691 |
692 | if (width%2==0)
693 | widthRow = halfWidth;
694 | else {
695 | if (i%2==0)
696 | {
697 | if (redpass) widthRow = halfWidth;
698 | else widthRow = halfWidth-1;
699 | }
700 | else {
701 | if (redpass) widthRow = halfWidth-1;
702 | else widthRow = halfWidth;
703 | }
704 | }
705 | if (j>=widthRow)
706 | return;
707 |
708 | int offset = (i+padding)*step + padding + j*2;
709 | if ((redpass && i%2==1) || (!redpass && i%2==0))
710 | offset++;
711 |
712 | float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2,det;
713 |
714 | sigma_u = 0.0f;
715 | sigma_v = 0.0f;
716 | sum_dpsis = 0.0f;
717 |
718 | if (i>0) {
719 | sigma_u -= smoothY[offset-step] * du[offset-step];
720 | sigma_v -= smoothY[offset-step] * dv[offset-step];
721 | sum_dpsis += smoothY[offset-step];
722 | }
723 |
724 | if(j>0){
725 | sigma_u -= smoothX[offset-1] * du[offset-1];
726 | sigma_v -= smoothX[offset-1] * dv[offset-1];
727 | sum_dpsis += smoothX[offset-1];
728 | }
729 | if(i>>(
760 | paramsCuda, current.size.width, current.size.height, padding, current.step, true,
761 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0],
762 | current.smoothX[0], current.smoothY[0],
763 | current.du[0], current.dv[0]);
764 |
765 | RedBlackSORKernel<<>>(
766 | paramsCuda, current.size.width, current.size.height, padding, current.step, false,
767 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0],
768 | current.smoothX[0], current.smoothY[0],
769 | current.du[0], current.dv[0]);
770 | }
771 | }
772 |
773 | void DeepFlowCuda::computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow)
774 | {
775 | assert(!I0.empty() && I0.type()==CV_32FC3);
776 | assert(I1.size()==I0.size() && I1.type()==CV_32FC3);
777 |
778 | if (flow.empty() || flow.size()!=I0.size() || flow.type()!=CV_32FC2)
779 | {
780 | if (beta!=0)
781 | {
782 | cout<<"No correct input flow was provided but weight of matching term is non-zero. Setting it to zero..."<=minSize && heightTmp>=minSize)
798 | {
799 | widthTmp*=scaleFactor;
800 | heightTmp*=scaleFactor;
801 | nbScales++;
802 | }
803 | if (nbScales==0)
804 | nbScales = 1;
805 |
806 | cout<<"Nb scales = "<apply(pyramid.I0[0], pyramid.I0[0]);
833 | filtg->apply(pyramid.I1[0], pyramid.I1[0]);
834 | }
835 |
836 | pyramid.scale[0] = 1.0;
837 |
838 | if (beta!=0)
839 | {
840 | pyramid.udesc[0].upload(uv[0]);
841 | pyramid.vdesc[0].upload(uv[1]);
842 | computeDescWeight(); // Computes pyramid.descWeight[0]
843 | }
844 |
845 | cout<<"Scale 0:"< uinitTmp, vinitTmp;
874 | uinitTmp.upload(uv[0]);
875 | vinitTmp.upload(uv[1]);
876 |
877 | cv::cuda::resize(uinitTmp, pyramid.uinit[nbScales-1], pyramid.I0[nbScales-1].size());
878 | cv::cuda::resize(vinitTmp, pyramid.vinit[nbScales-1], pyramid.I0[nbScales-1].size());
879 | cv::cuda::multiply(pyramid.uinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.uinit[nbScales-1]);
880 | cv::cuda::multiply(pyramid.vinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.vinit[nbScales-1]);
881 | }
882 |
883 | for (int k=nbScales-1; k>=0; k--)
884 | computeOneLevel(k);
885 |
886 | // At the end, we have ufinal and vfinal in pyramid.ufinal[0]
887 | pyramid.ufinal[0].download(uv[0]);
888 | pyramid.vfinal[0].download(uv[1]);
889 |
890 | cv::merge(uv, 2, flow);
891 | }
892 |
// Runs the variational optimization for one pyramid level `scale`
// (0 = finest, nbScales-1 = coarsest). Each outer fixed-point iteration
// rebuilds the linearized system (data term, optional matching and
// smoothness terms) around the current flow, solves for the increment with
// red-black SOR, and updates the working flow (utmp, vtmp). The result,
// with padding removed, is stored in pyramid.ufinal/vfinal[scale].
void DeepFlowCuda::computeOneLevel(int scale)
{
    prepareBuffers(scale); // Sets up the padded per-level buffers in `current`

    float betaSave = beta; // beta is rescaled per level below; restored at the end
    float bk = 0.45; // Exponent of the per-level decay of the matching weight

    // Down-weight the matching term toward fine scales:
    // beta * (scale/(nbScales-1))^bk equals 0 at the finest level (scale==0)
    // and the full beta at the coarsest (scale==nbScales-1).
    if (bk>0.0f && nbScales>1)
    {
        beta = betaSave * pow((float)scale/(float)(nbScales-1), bk);
        copyParamsToCuda(); // Push the modified beta to the device-side parameter array
    }

    computeSmoothnessWeight();

    for (int i = 0; i < fixedPointIterations; i++)
    {
        computeDataTerm(); // Initializes A11, A12, A22, b1 and b2
        if (beta!=0)
            computeMatchingTerm();

        if (alpha!=0)
            computeSmoothnessTerm(); // Updates b1 and b2

        RedBlackSOR(); // Solves for the flow increment (du, dv)

        // New flow estimate used to re-linearize in the next fixed-point iteration.
        current.utmp = current.u + current.du;
        current.vtmp = current.v + current.dv;
    }

    beta = betaSave;

    // Crop the padding border (if any) before storing this level's final flow.
    if (padding!=0)
    {
        int width = pyramid.I0[scale].cols;
        int height = pyramid.I0[scale].rows;

        current.utmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.ufinal[scale]);
        current.vtmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.vfinal[scale]);
    }
    else {
        current.utmp.copyTo(pyramid.ufinal[scale]);
        current.vtmp.copyTo(pyramid.vfinal[scale]);
    }
}
938 |
939 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const
940 | {
941 | cv::Mat a;
942 | m.download(a);
943 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone();
944 | }
945 |
946 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const
947 | {
948 | cv::Mat a;
949 | m.download(a);
950 | cv::cvtColor(a, a, cv::COLOR_BGRA2BGR);
951 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone();
952 | }
953 |
954 | void DeepFlowCuda::copyParamsToCuda()
955 | {
956 | int nbParams = 7;
957 | if (paramsCuda==nullptr && cudaMalloc(¶msCuda, nbParams*sizeof(float))!=cudaSuccess)
958 | {
959 | paramsCuda = nullptr;
960 | cout<<"cudaMalloc error"<.
19 | */
20 |
21 | #ifndef DEEP_FLOW_CUDA_H
22 | #define DEEP_FLOW_CUDA_H
23 |
24 | #include
25 | #include // For template cv::cudev::GpuMat_ -> needs to be compiled with nvcc !
26 | #include // For cv::cuda::Filter
27 | #include
28 |
29 | #include "myvec3f.h"
30 |
// CUDA port of the DeepFlow variational optical-flow method
// (Weinzaepfel et al., ICCV 2013; Revaud et al., IJCV 2016): builds a
// multi-scale pyramid, then at each level runs fixed-point iterations that
// solve a linearized system with red-black SOR.
//
// NOTE(review): every template argument list in this class (GpuMat_<...>,
// std::vector<...>, std::shared_ptr<...>) was lost when the file was
// extracted (angle-bracket contents stripped). The declarations below are
// kept byte-for-byte; the arguments must be restored from the original
// source before this compiles — presumably GpuMat_<float> for the flow and
// coefficient buffers, std::shared_ptr<cv::cuda::Filter> for the filters
// (see the comment on the filter member), std::vector<float> for the scale
// list. TODO confirm the element type of the color image buffers.
class DeepFlowCuda
{
// Member variables
protected:
    int fixedPointIterations, sorIterations;
    float omega; // Update parameter in SOR iterations
    float alpha; // Weight of smoothing term
    float beta; // Weight of matching term
    float delta; // Weight of color constancy in data term
    float gamma; // Weight of gradient constancy in data term
    float zeta; // Regularization parameter (added to norms in data term)
    float epsilon; // Regularization parameter in Psi function
    float scaleFactor; // Scale factor between two successive levels
    float sigma; // Standard deviation of presmoothing Gaussian filter
    int nbScales; // Number of scales (levels). Will be calculated from minSize, scaleFactor and size of input image
    int padding; // Border size added around each level's buffers
    int minSize; // Minimum width or height at the highest level (at the coarsest scale)
    float *paramsCuda; // Device-side array of the 7 solver parameters (see copyParamsToCuda)

    // Padded images and flows at current scale
    struct {
        cv::cudev::GpuMat_ I0, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz; // Image and its spatial/temporal derivatives
        cv::cudev::GpuMat_ A11, A12, A22, b1, b2; // Per-pixel coefficients of the linearized system
        cv::cudev::GpuMat_ smoothX, smoothY, luminance, smoothWeight;

        cv::cudev::GpuMat_ utmp, vtmp; // flow that is updated in each fixed point iteration
        cv::cudev::GpuMat_ du, dv; // flow increment, updated in each SOR iteration
        cv::cudev::GpuMat_ u, v; // red-black-buffer version of the input flow
        cv::cudev::GpuMat_ udesc, vdesc, descWeight; // Descriptor used in matching term

        int step; // Row stride of the padded single-channel buffers
        int stepColor; // Row stride of the padded color buffers

        cv::Size size, sizePadded; // Level size without / with the padding border
    } current;

    // Non-padded images and flows at all scales
    struct {
        std::vector > I0, I1;
        std::vector > uinit, vinit, ufinal, vfinal;
        std::vector > udesc, vdesc, descWeight;
        std::vector scale; // Multiplicative factor from level k back to full resolution (scale[0] == 1.0)
    } pyramid;

    // Derivative filters, and Gaussian filter
    std::shared_ptr filtg, filtx, filty, filtx5pt, filty5pt;

// Member functions
public:
    DeepFlowCuda();
    ~DeepFlowCuda();

    // Computes the flow from I0 to I1 (both CV_32FC3); `flow` (CV_32FC2) may
    // carry an initial matching-based flow in, and receives the result.
    void computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow);

    int getFixedPointIterations() const { return fixedPointIterations; }
    void setFixedPointIterations(int val) { fixedPointIterations = val; }
    int getSorIterations() const { return sorIterations; }
    void setSorIterations(int val) { sorIterations = val; }
    float getOmega() const { return omega; }
    void setOmega(float val) { omega = val; }
    float getAlpha() const { return alpha; }
    void setAlpha(float val) { alpha = val; }
    float getBeta() const { return beta; }
    void setBeta(float val) { beta = val; }
    float getDelta() const { return delta; }
    void setDelta(float val) { delta = val; }
    float getGamma() const { return gamma; }
    void setGamma(float val) { gamma = val; }

protected:
    bool createFilters();

    // Allocates/initializes the padded per-level buffers in `current`.
    void prepareBuffers(int scale);

    void computeDescWeight();

    void computeDataTerm();
    void computeMatchingTerm();

    // The local smoothness weight is not described in the initial DeepFlow paper
    // [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013]
    // Instead it is mentioned in section 4.3 of the extended IJCV paper
    // [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. IJCV 2016]
    // See ref [Wedel et al 2009] or [Xu et al 2012]
    void computeSmoothnessWeight();

    void computeSmoothnessTerm();

    void RedBlackSOR();

    // Runs all fixed-point iterations at one pyramid level.
    void computeOneLevel(int);

    // Remove padding and move to CPU
    cv::Mat toCPU(const cv::cudev::GpuMat_ &) const ;
    cv::Mat toCPU(const cv::cudev::GpuMat_ &) const;

    // Uploads {alpha, beta, delta, gamma, omega, zeta, epsilon} to paramsCuda.
    void copyParamsToCuda();
};
129 |
130 | #endif
131 |
--------------------------------------------------------------------------------
/main.cu:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright 2019 Julien Mille
3 |
4 | This file is part of DeepFlowCUDA.
5 |
6 | DeepFlowCUDA is free software: you can redistribute
7 | it and/or modify it under the terms of the GNU Lesser General Public License
8 | as published by the Free Software Foundation, either version 3 of the License,
9 | or (at your option) any later version.
10 |
11 | DeepFlowCUDA is distributed in the hope that it will
12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
14 | General Public License for more details.
15 |
16 | You should have received a copy of the GNU General Public License,
17 | and a copy of the GNU Lesser General Public License, along with
18 | DeepFlowCUDA. If not, see .
19 | */
20 |
21 | #include
22 | #include
23 | #include
24 |
25 | #include // For cv::imread
26 | #include // For cv::readOpticalFlow, cv::writeOpticalFlow
27 | #include "deepflowcuda.h"
28 |
29 | using namespace std;
30 |
31 | float L2Distance(const cv::Mat &flow1, const cv::Mat &flow2, cv::Mat &dist)
32 | {
33 | assert(!flow1.empty() && flow1.type()==CV_32FC2 && flow2.type()==CV_32FC2 && flow1.size()==flow2.size());
34 |
35 | dist.create(flow1.size(), CV_32F);
36 |
37 | float avgDist = 0;
38 | float d;
39 |
40 | for (int y=0; y(y,x);
44 | cv::Point2f b = flow2.at(y,x);
45 | d = (a-b).dot(a-b);
46 | avgDist += d;
47 | dist.at(y,x) = d;
48 | }
49 | avgDist /= flow1.cols*flow1.rows;
50 | return avgDist;
51 | }
52 |
53 | int main(int argc, char ** argv)
54 | {
55 | cv::Mat img0, img1, gtflow, flow, flowRefined;
56 |
57 | string dataDir = "./data/";
58 | string outputDir = "./";
59 | string outputPath = outputDir + "refinedflow_0001.flo";
60 |
61 | img0 = cv::imread(dataDir + "frame_0001.jpg", cv::IMREAD_COLOR);
62 | if (img0.data==nullptr)
63 | {
64 | cout<<"Failed to read first image"<.
19 | */
20 |
21 | #ifndef MY_VEC3F_H
22 | #define MY_VEC3F_H
23 |
24 | class MyVec3f
25 | {
26 | public:
27 | float x, y, z;
28 | __device__ MyVec3f() {}
29 | __device__ MyVec3f(float a, float b, float c):x(a),y(b),z(c) {}
30 | __device__ float dot(const MyVec3f &v) const {return x*v.x+y*v.y+z*v.z;}
31 | __device__ float norm2() const {return x*x+y*y+z*z;}
32 | __device__ float norm() const {return sqrt(x*x+y*y+z*z);}
33 | __device__ float l1norm() const {return fabs(x)+fabs(y)+fabs(z);}
34 | __device__ float sum() const {return x+y+z;}
35 |
36 | __device__ MyVec3f operator +(const MyVec3f &v) const
37 | {
38 | MyVec3f s;
39 | s.x = x+v.x;
40 | s.y = y+v.y;
41 | s.z = z+v.z;
42 | return s;
43 | }
44 |
45 | __device__ MyVec3f operator -(const MyVec3f &v) const
46 | {
47 | MyVec3f d;
48 | d.x = x-v.x;
49 | d.y = y-v.y;
50 | d.z = z-v.z;
51 | return d;
52 | }
53 |
54 | __device__ MyVec3f operator *(float f) const
55 | {
56 | MyVec3f v;
57 | v.x = x*f;
58 | v.y = y*f;
59 | v.z = z*f;
60 | return v;
61 | }
62 |
63 | // Element-wise product
64 | __device__ MyVec3f operator *(const MyVec3f &v) const
65 | {
66 | MyVec3f s;
67 | s.x = x*v.x;
68 | s.y = y*v.y;
69 | s.z = z*v.z;
70 | return s;
71 | }
72 |
73 | // Element-wise division
74 | __device__ MyVec3f operator /(const MyVec3f &v) const
75 | {
76 | MyVec3f s;
77 | s.x = x/v.x;
78 | s.y = y/v.y;
79 | s.z = z/v.z;
80 | return s;
81 | }
82 | };
83 |
84 | #endif
--------------------------------------------------------------------------------