├── Makefile ├── README.md ├── README.txt ├── data ├── dirtyflow_0001.flo ├── frame_0001.jpg ├── frame_0002.jpg └── gtflow_0001.flo ├── deepflowcuda.cu ├── deepflowcuda.h ├── main.cu └── myvec3f.h /Makefile: -------------------------------------------------------------------------------- 1 | NVCC = nvcc 2 | NVCCFLAGS = -c -g -O3 3 | LD = g++ 4 | LDFLAGS = -O3 5 | FINAL_TARGET = deepflow 6 | CUDA_DIR = /usr/local/cuda-9.2 7 | OPENCV_DIR = /usr/local/opencv-4.1.0-build 8 | INCLUDE_DIR = -I$(CUDA_DIR)/include -I$(OPENCV_DIR)/include/opencv4 9 | LIB_DIR = -L$(CUDA_DIR)/lib64 -L$(OPENCV_DIR)/lib 10 | LIBS = -lopencv_cudawarping -lopencv_cudafilters -lopencv_cudaimgproc -lopencv_cudaarithm -lopencv_cudalegacy -lopencv_video -lopencv_imgproc -lopencv_imgcodecs -lopencv_core -lcudart 11 | 12 | default: $(FINAL_TARGET) 13 | 14 | $(FINAL_TARGET): main.o deepflowcuda.o 15 | $(LD) $+ -o $@ $(LDFLAGS) $(LIB_DIR) $(LIBS) 16 | 17 | %.o: %.cu 18 | $(NVCC) $(NVCCFLAGS) $(INCLUDE_DIR) $< -o $@ 19 | 20 | clean: 21 | rm -f *.o $(FINAL_TARGET) 22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepFlowCUDA 2 | A C++ port of Philippe Weinzaepfel's C DeepFlow library, using OpenCV/CUDA 3 | 4 | Reference papers are: 5 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 6 | [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. IJCV 2016] 7 | 8 | Provided test images are taken from the MPI Sintel dataset: 9 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012] 10 | 11 | See README.txt for requirements and build instructions. 
12 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | This is a C++/OpenCV/CUDA port of the DeepFlow C library, associated to the following paper: 2 | [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 3 | See http://lear.inrialpes.fr/src/deepflow/ 4 | 5 | The library is provided with two images and the corresponding ground truth flow from the MPI Sintel dataset: 6 | [D.J. Butler, J. Wulff, G.B. Stanley and M.J. Black. A naturalistic open source movie for optical flow evaluation. ECCV 2012] 7 | See http://sintel.is.tue.mpg.de/ 8 | 9 | It should be used as a refinement step, as described for example in the FlowNet paper, 10 | [A. Dosovitskiy, P. Fischer, E. Ilg, P. Häusser, C. Hazirbas, V. Golkov, P. van der Smagt, D. Cremers and T. Brox. FlowNet: Learning Optical Flow with Convolutional Networks. ICCV 2015] 11 | Thus, the input is made up of two images and a coarse optical flow, typically output from a deep neural network. If no input flow is provided, the initial field is set to zero everywhere. As in the OpenCV modules variationalrefinement and cudaoptflow, the red-black successive overrelaxation method (SOR) is used to solve the linear systems. See the paper by Brox et al: 12 | [T. Brox, A. Bruhn, N. Papenberg, J. Weickert. High Accuracy Optical Flow Estimation Based on a Theory for Warping. 
ECCV 2004] 13 | 14 | The DeepFlowCUDA class uses the cv::cudev::GpuMat_ template class, so you'll need OpenCV with additional opencv_contrib modules built (see https://github.com/opencv/opencv_contrib) 15 | 16 | Requirements: 17 | - OpenCV with opencv_contrib modules (need CUDA modules) 18 | - CUDA build environment (nvcc should be in your PATH) 19 | 20 | The Makefile assumes that CUDA is installed in /usr/local/cuda-9.2 and OpenCV in /usr/local/opencv-4.1.0-build 21 | Just edit the paths in the Makefile and run 22 | > make 23 | > ./deepflow 24 | 25 | Please report any bug to julien.mille@insa-cvl.fr 26 | Thanks! 27 | 28 | Copyright 2019 Julien Mille 29 | -------------------------------------------------------------------------------- /data/dirtyflow_0001.flo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/dirtyflow_0001.flo -------------------------------------------------------------------------------- /data/frame_0001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0001.jpg -------------------------------------------------------------------------------- /data/frame_0002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/frame_0002.jpg -------------------------------------------------------------------------------- /data/gtflow_0001.flo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/julien-mille/DeepFlowCUDA/06027ab56770f08a6fbf1a3c76f69125d1e109d3/data/gtflow_0001.flo -------------------------------------------------------------------------------- /deepflowcuda.cu: 
-------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2019 Julien Mille 3 | 4 | This file is part of DeepFlowCUDA. 5 | 6 | DeepFlowCUDA is free software: you can redistribute 7 | it and/or modify it under the terms of the GNU Lesser General Public License 8 | as published by the Free Software Foundation, either version 3 of the License, 9 | or (at your option) any later version. 10 | 11 | DeepFlowCUDA is distributed in the hope that it will 12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 14 | General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License, 17 | and a copy of the GNU Lesser General Public License, along with 18 | DeepFlowCUDA. If not, see . 19 | */ 20 | 21 | #include "deepflowcuda.h" 22 | #include // for pixelwise operations on cv::cudev::GpuMat_ 23 | #include // for cv::cuda::resize 24 | #include 25 | 26 | using namespace std; 27 | 28 | // Maximum number of CUDA threads per block, per dimension, for 2D thread blocks 29 | // It is equivalent to 32^2=1014 threads per 1D block 30 | #define THREADS_PER_BLOCK_2D 32 31 | 32 | // Rounded up division, to compute number of thread blocks when launching CUDA kernels 33 | int divUp(int a, int b) 34 | { 35 | return (a + b - 1)/b; 36 | } 37 | 38 | DeepFlowCuda::DeepFlowCuda() 39 | { 40 | // We use the same setting as DeepFlow, except for the scale factor 41 | fixedPointIterations = 5; 42 | sorIterations = 25; 43 | alpha = 1.0; 44 | beta = 32.0; 45 | delta = 0.1; 46 | gamma = 0.7; 47 | omega = 1.6; 48 | zeta = 0.1; 49 | epsilon = 0.01; 50 | scaleFactor = 0.5; 51 | sigma = 0; // 0.65; 52 | minSize = 10; 53 | 54 | paramsCuda = nullptr; 55 | copyParamsToCuda(); 56 | 57 | current.step = 0; 58 | current.stepColor = 0; 59 | 60 | padding = 1; 61 | 62 | assert(createFilters()); 63 | } 64 | 65 | 
DeepFlowCuda::~DeepFlowCuda() 66 | { 67 | if (paramsCuda!=nullptr) 68 | cudaFree(paramsCuda); 69 | } 70 | 71 | bool DeepFlowCuda::createFilters() 72 | { 73 | cv::Mat deriv, deriv5pt; 74 | cv::Mat o = cv::Mat::ones(1, 1, CV_32F); 75 | 76 | // CUDA filters for centered finite differences 77 | deriv.create(1, 3, CV_32F); 78 | deriv.at(0,0) = -0.5; 79 | deriv.at(0,1) = 0.0; 80 | deriv.at(0,2) = 0.5; 81 | 82 | filtx = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv, o, cv::Point(-1, -1), cv::BORDER_REPLICATE); 83 | if (filtx==nullptr) 84 | return false; 85 | 86 | filty = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv, cv::Point(-1, -1), cv::BORDER_REPLICATE); 87 | if (filty==nullptr) 88 | return false; 89 | 90 | // CUDA filters for finite differences with 5-point stencil 91 | deriv5pt.create(1, 5, CV_32F); 92 | deriv5pt.at(0,0) = 1.0/12; 93 | deriv5pt.at(0,1) = -8.0/12; 94 | deriv5pt.at(0,2) = 0; 95 | deriv5pt.at(0,3) = 8.0/12; 96 | deriv5pt.at(0,4) = -1.0/12; 97 | 98 | filtx5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, deriv5pt, o, cv::Point(-1, -1), cv::BORDER_REPLICATE); 99 | if (filtx5pt==nullptr) 100 | return false; 101 | 102 | filty5pt = cv::cuda::createSeparableLinearFilter(CV_32FC3, CV_32FC3, o, deriv5pt, cv::Point(-1, -1), cv::BORDER_REPLICATE); 103 | if (filty5pt==nullptr) 104 | return false; 105 | 106 | if (sigma!=0) 107 | { 108 | filtg = cv::cuda::createGaussianFilter(CV_32FC3, CV_32FC3, cv::Size(0,0), sigma, sigma, cv::BORDER_REPLICATE); 109 | if (filtg==nullptr) 110 | return false; 111 | } 112 | 113 | return true; 114 | } 115 | 116 | __global__ void warpKernel(int width, int height, int padding, int step, int stepColor, const float3 *I, const float *u, const float *v, float3 *warpedI) 117 | { 118 | int j = blockDim.x * blockIdx.x + threadIdx.x; 119 | int i = blockDim.y * blockIdx.y + threadIdx.y; 120 | if (i>=height || j>=width) 121 | return; 122 | 123 | int offset = (i+padding)*step + j+padding; 124 | 
MyVec3f *pWarpedI = (MyVec3f *)((float *)warpedI + ((i+padding)*stepColor + 3*(j+padding))); 125 | 126 | float x, y, xx, yy, dx, dy; 127 | int x1, x2, y1, y2; 128 | 129 | xx = j + u[offset]; 130 | yy = i + v[offset]; 131 | x = floor(xx); 132 | y = floor(yy); 133 | dx = xx-x; 134 | dy = yy-y; 135 | 136 | x1 = (int)x; 137 | x2 = x1+1; 138 | y1 = (int)y; 139 | y2 = y1+1; 140 | 141 | if (x1<0) x1=0; else if (x1>=width) x1 = width-1; 142 | if (x2<0) x2=0; else if (x2>=width) x2 = width-1; 143 | if (y1<0) y1=0; else if (y1>=height) y1 = height-1; 144 | if (y2<0) y2=0; else if (y2>=height) y2 = height-1; 145 | 146 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I + ((y1+padding)*stepColor + 3*padding)); 147 | const MyVec3f *pI2 = (const MyVec3f *)((float *)I + ((y2+padding)*stepColor + 3*padding)); 148 | 149 | *pWarpedI = 150 | pI1[x1]*(1.0f-dx)*(1.0f-dy) + 151 | pI1[x2]*dx*(1.0f-dy) + 152 | pI2[x1]*(1.0f-dx)*dy + 153 | pI2[x2]*dx*dy; 154 | } 155 | 156 | __global__ void averageKernel(int width, int height, int stepColor, const float3 *a, const float3 *b, float3 *c) 157 | { 158 | int j = blockDim.x * blockIdx.x + threadIdx.x; 159 | int i = blockDim.y * blockIdx.y + threadIdx.y; 160 | if (i>=height || j>=width) 161 | return; 162 | 163 | int offset = i*stepColor + 3*j; 164 | const MyVec3f *pA = (const MyVec3f *)((float *)a + offset); 165 | const MyVec3f *pB = (const MyVec3f *)((float *)b + offset); 166 | MyVec3f *pC = (MyVec3f *)((float *)c + offset); 167 | 168 | *pC = (*pA+*pB)*0.5; 169 | } 170 | 171 | void DeepFlowCuda::prepareBuffers(int scale) 172 | { 173 | current.size = pyramid.I0[scale].size(); 174 | 175 | current.sizePadded.width = current.size.width + 2*padding; 176 | current.sizePadded.height = current.size.height + 2*padding; 177 | 178 | current.A11.create(current.sizePadded); 179 | current.A12.create(current.sizePadded); 180 | current.A22.create(current.sizePadded); 181 | current.b1.create(current.sizePadded); 182 | current.b2.create(current.sizePadded); 183 | 
184 | current.luminance.create(current.sizePadded); 185 | current.smoothX.create(current.sizePadded); 186 | current.smoothY.create(current.sizePadded); 187 | current.smoothWeight.create(current.sizePadded); 188 | 189 | current.smoothX.setTo(0.0); 190 | current.smoothY.setTo(0.0); 191 | current.smoothWeight.setTo(0.0); 192 | 193 | if (scale!=nbScales-1) 194 | { 195 | cv::cuda::resize(pyramid.ufinal[scale+1], pyramid.uinit[scale], pyramid.I0[scale].size()); 196 | cv::cuda::resize(pyramid.vfinal[scale+1], pyramid.vinit[scale], pyramid.I0[scale].size()); 197 | 198 | cv::cuda::multiply(pyramid.uinit[scale], 1.0/scaleFactor, pyramid.uinit[scale]); 199 | cv::cuda::multiply(pyramid.vinit[scale], 1.0/scaleFactor, pyramid.vinit[scale]); 200 | } 201 | 202 | if (padding!=0) 203 | { 204 | cv::cuda::copyMakeBorder(pyramid.I0[scale], current.I0, padding, padding, padding, padding, cv::BORDER_REPLICATE); 205 | cv::cuda::copyMakeBorder(pyramid.uinit[scale], current.u, padding, padding, padding, padding, cv::BORDER_REPLICATE); 206 | cv::cuda::copyMakeBorder(pyramid.vinit[scale], current.v, padding, padding, padding, padding, cv::BORDER_REPLICATE); 207 | 208 | if (beta!=0) 209 | { 210 | cv::cuda::copyMakeBorder(pyramid.udesc[scale], current.udesc, padding, padding, padding, padding, cv::BORDER_REPLICATE); 211 | cv::cuda::copyMakeBorder(pyramid.vdesc[scale], current.vdesc, padding, padding, padding, padding, cv::BORDER_REPLICATE); 212 | cv::cuda::copyMakeBorder(pyramid.descWeight[scale], current.descWeight, padding, padding, padding, padding, cv::BORDER_REPLICATE); 213 | } 214 | } 215 | else { 216 | pyramid.I0[scale].copyTo(current.I0); 217 | pyramid.uinit[scale].copyTo(current.u); 218 | pyramid.vinit[scale].copyTo(current.v); 219 | 220 | if (beta!=0) 221 | { 222 | pyramid.udesc[scale].copyTo(current.udesc); 223 | pyramid.vdesc[scale].copyTo(current.vdesc); 224 | pyramid.descWeight[scale].copyTo(current.descWeight); 225 | } 226 | } 227 | 228 | current.step = 
current.A11.step/sizeof(float); 229 | current.stepColor = current.I0.step/sizeof(float); 230 | 231 | // Computing an average of the current and warped next frames (to compute the derivatives on) and temporal derivative Iz 232 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid; 233 | 234 | // Otherwise, pyramid.uinit[scale] and pyramid.vinit[scale] are already initialized 235 | cv::cudev::GpuMat_ gpuwarpedI, gpuaveragedI; 236 | 237 | // Average everywhere 238 | gpuwarpedI.create(current.size); 239 | 240 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 241 | warpKernel<<>>(current.size.width, current.size.height, 0, pyramid.uinit[scale].step/sizeof(float), pyramid.I1[scale].step/sizeof(float), 242 | pyramid.I1[scale][0], pyramid.uinit[scale][0], pyramid.vinit[scale][0], gpuwarpedI[0]); 243 | 244 | if (padding!=0) 245 | cv::cuda::copyMakeBorder(gpuwarpedI, gpuwarpedI, padding, padding, padding, padding, cv::BORDER_REPLICATE); 246 | 247 | // Average everywhere 248 | gpuaveragedI.create(current.sizePadded); 249 | 250 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 251 | averageKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.stepColor, current.I0[0], gpuwarpedI[0], gpuaveragedI[0]); 252 | cv::cuda::subtract(gpuwarpedI, current.I0, current.Iz); 253 | 254 | filtx5pt->apply(gpuaveragedI, current.Ix); 255 | filty5pt->apply(gpuaveragedI, current.Iy); 256 | 257 | filtx5pt->apply(current.Iz, current.Ixz); 258 | filty5pt->apply(current.Iz, current.Iyz); 259 | 260 | filtx5pt->apply(current.Ix, current.Ixx); 261 | filty5pt->apply(current.Ix, current.Ixy); 262 | filty5pt->apply(current.Iy, current.Iyy); 263 | 264 | current.u.copyTo(current.utmp); 265 | current.v.copyTo(current.vtmp); 266 | 267 | current.du.create(current.sizePadded); 268 | current.dv.create(current.sizePadded); 
269 | current.du.setTo(0.0); 270 | current.dv.setTo(0.0); 271 | } 272 | 273 | __global__ void structureTensorKernel(int width, int height, int step, int stepColor, const float3 *Ix, const float3 *Iy, float *stx2, float *stxy, float *sty2) 274 | { 275 | int j = blockDim.x * blockIdx.x + threadIdx.x; 276 | int i = blockDim.y * blockIdx.y + threadIdx.y; 277 | if (i>=height || j>=width) 278 | return; 279 | 280 | int offset = i*step + j; 281 | int offsetColor = i*stepColor + 3*j; 282 | const MyVec3f *pIx = (const MyVec3f *)((float *)Ix + offsetColor); 283 | const MyVec3f *pIy = (const MyVec3f *)((float *)Iy + offsetColor); 284 | 285 | stx2[offset] = (*pIx).dot(*pIx); 286 | stxy[offset] = (*pIx).dot(*pIy); 287 | sty2[offset] = (*pIy).dot(*pIy); 288 | } 289 | 290 | __global__ void minEigenvalueKernel(int width, int height, int step, const float *stx2, const float *stxy, const float *sty2, float *minEigen) 291 | { 292 | int j = blockDim.x * blockIdx.x + threadIdx.x; 293 | int i = blockDim.y * blockIdx.y + threadIdx.y; 294 | if (i>=height || j>=width) 295 | return; 296 | 297 | int offset = i*step + j; 298 | const float *pstx2 = stx2 + offset; 299 | const float *pstxy = stxy + offset; 300 | const float *psty2 = sty2 + offset; 301 | 302 | float t = 0.5f * (*pstx2 + *psty2); 303 | float t2 = t*t + (*pstxy)*(*pstxy) - (*pstx2)*(*psty2); 304 | minEigen[offset] = t - (t2<=0.0f?0.0f:sqrtf(t2)); // may be negative due to floating points approximation 305 | } 306 | 307 | __global__ void matchingScoreKernel(int width, int height, int step, int stepColor, 308 | const float3 *I0, const float3 *I1, 309 | const float3 *I0x, const float3 *I0y, const float3 *I1x, const float3 *I1y, 310 | const float *udesc, const float *vdesc, const float *minEigen, float *descWeight) 311 | { 312 | int j = blockDim.x * blockIdx.x + threadIdx.x; 313 | int i = blockDim.y * blockIdx.y + threadIdx.y; 314 | if (i>=height || j>=width) 315 | return; 316 | 317 | int offset = i*step + j; 318 | int offsetColor = 
i*stepColor + 3*j; 319 | const MyVec3f *pI0 = (const MyVec3f *)((float *)I0 + offsetColor); 320 | const MyVec3f *pI0x = (const MyVec3f *)((float *)I0x + offsetColor); 321 | const MyVec3f *pI0y = (const MyVec3f *)((float *)I0y + offsetColor); 322 | 323 | float xw, yw; 324 | xw = (int)(j + udesc[offset]); 325 | yw = (int)(i + vdesc[offset]); 326 | if (xw<0) xw=0; 327 | else if (xw>width-1) xw = width-1; 328 | if (yw<0) yw=0; 329 | else if (yw>height-1) yw = height-1; 330 | 331 | int offsetColor2 = yw*stepColor + 3*xw; 332 | const MyVec3f *pI1 = (const MyVec3f *)((float *)I1 + offsetColor2); 333 | const MyVec3f *pI1x = (const MyVec3f *)((float *)I1x + offsetColor2); 334 | const MyVec3f *pI1y = (const MyVec3f *)((float *)I1y + offsetColor2); 335 | 336 | float gradWeight = 1.0; 337 | float flow_sigma_score = 50.0f; 338 | float mul_coef = 10.0f; 339 | 340 | float flowscore = (*pI0 - *pI1).l1norm() + gradWeight * ((*pI0x - *pI1x).l1norm() + (*pI0y - *pI1y).l1norm()); 341 | float t2 = minEigen[offset]; 342 | 343 | float t = 1.0f/(flow_sigma_score*sqrtf(2.0f*M_PI)); 344 | float sigmascore2 = -0.5f/(flow_sigma_score*flow_sigma_score); 345 | 346 | t2 = t2<=0.0?0.0:sqrtf(t2); 347 | descWeight[offset] = mul_coef * t2 * t * expf( flowscore*flowscore*sigmascore2 ); 348 | if (descWeight[offset]<0.0) // may be negative due to floating points approximation 349 | descWeight[offset] = 0.0; 350 | } 351 | 352 | void DeepFlowCuda::computeDescWeight() 353 | { 354 | // Input : pyramid.I0[0], pyramid.I1[0] 355 | // Output : pyramid.descWeight[0] 356 | cv::cudev::GpuMat_ I0x, I0y; 357 | 358 | // Structure tensor 359 | cv::cudev::GpuMat_ stx2, stxy, sty2, minEigen; 360 | 361 | filtx->apply(pyramid.I0[0], I0x); 362 | filty->apply(pyramid.I0[0], I0y); 363 | 364 | stx2.create(I0x.size()); 365 | stxy.create(I0x.size()); 366 | sty2.create(I0x.size()); 367 | 368 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(I0x.cols, threadsPerBlock.x), divUp(I0x.rows, 
threadsPerBlock.y), 1); 369 | 370 | // No padding here 371 | structureTensorKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float), 372 | I0x[0], I0y[0], stx2[0], stxy[0], sty2[0]); 373 | 374 | // Smooth structure tensor 375 | shared_ptr fg = cv::cuda::createGaussianFilter(CV_32F, CV_32F, cv::Size(0,0), 3.0, 3.0, cv::BORDER_REPLICATE); 376 | fg->apply(stx2, stx2); 377 | fg->apply(stxy, stxy); 378 | fg->apply(sty2, sty2); 379 | 380 | minEigen.create(I0x.size()); 381 | pyramid.descWeight[0].create(I0x.size()); 382 | 383 | minEigenvalueKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), stx2[0], stxy[0], sty2[0], minEigen[0]); 384 | 385 | cv::cudev::GpuMat_ I0x5pt, I0y5pt, I1x5pt, I1y5pt; 386 | 387 | filtx5pt->apply(pyramid.I0[0], I0x5pt); 388 | filty5pt->apply(pyramid.I0[0], I0y5pt); 389 | filtx5pt->apply(pyramid.I1[0], I1x5pt); 390 | filty5pt->apply(pyramid.I1[0], I1y5pt); 391 | 392 | matchingScoreKernel<<>>(I0x.cols, I0x.rows, stx2.step/sizeof(float), I0x.step/sizeof(float), 393 | pyramid.I0[0][0], pyramid.I1[0][0], 394 | I0x5pt[0], I0y5pt[0], I1x5pt[0], I1y5pt[0], 395 | pyramid.udesc[0][0], pyramid.vdesc[0][0], minEigen[0], pyramid.descWeight[0][0]); 396 | } 397 | 398 | // Computed on all pixels (including padding) 399 | __global__ void luminanceKernel(int width, int height, int step, int stepColor, const float3 *I, float *lum) 400 | { 401 | int j = blockDim.x * blockIdx.x + threadIdx.x; 402 | int i = blockDim.y * blockIdx.y + threadIdx.y; 403 | if (i>=height || j>=width) 404 | return; 405 | 406 | int offsetI = i*stepColor + 3*j; 407 | int offset = i*step+j; 408 | 409 | const MyVec3f *pI = (const MyVec3f *)((const float *)I + offsetI); 410 | lum[offset] = 0.299f*pI->z + 0.587f*pI->y + 0.114f*pI->x; 411 | } 412 | 413 | __global__ void smoothnessWeightKernel(int width, int height, int padding, int step, const float *lum, float *smoothWeight, float coef) 414 | { 415 | int j = blockDim.x * blockIdx.x + threadIdx.x; 416 | int i = blockDim.y 
* blockIdx.y + threadIdx.y; 417 | if (i>=height || j>=width) 418 | return; 419 | 420 | int offset = (i+padding)*step + padding + j; 421 | float lumx = (lum[offset+1]-lum[offset-1])*0.5; 422 | float lumy = (lum[offset+step]-lum[offset-step])*0.5; 423 | smoothWeight[offset] = 0.5f*expf(-coef*sqrtf(lumx*lumx+lumy*lumy)); 424 | } 425 | 426 | void DeepFlowCuda::computeSmoothnessWeight() 427 | { 428 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid; 429 | 430 | // Luminance everywhere (including padding) 431 | blocksPerGrid = dim3(divUp(current.sizePadded.width, threadsPerBlock.x), divUp(current.sizePadded.height, threadsPerBlock.y), 1); 432 | luminanceKernel<<>>(current.sizePadded.width, current.sizePadded.height, current.step, current.stepColor, current.I0[0], current.luminance[0]); 433 | 434 | blocksPerGrid = dim3(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 435 | smoothnessWeightKernel<<>>(current.size.width, current.size.height, padding, current.step, current.luminance[0], current.smoothWeight[0], 5.0/255.0); 436 | } 437 | 438 | __global__ void dataTermKernel( 439 | const float *params, int width, int height, int padding, int step, int stepColor, 440 | const float3 *Ix, const float3 *Iy, const float3 *Iz, 441 | const float3 *Ixx, const float3 *Ixy, const float3 *Iyy, 442 | const float3 *Ixz, const float3 *Iyz, 443 | float *A11, float *A12, float *A22, float *b1, float *b2, 444 | const float *du, const float *dv) 445 | { 446 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 447 | float delta = params[2], gamma = params[3], zeta = params[5], epsilon = params[6]; 448 | 449 | float zeta_squared = zeta * zeta; 450 | float epsilon_squared = epsilon * epsilon; 451 | 452 | const MyVec3f *pIx, *pIy, *pIz; 453 | const MyVec3f *pIxx, *pIxy, *pIyy, *pIxz, *pIyz; 454 | const float *pdU, *pdV; 455 | float *pa11, *pa12, *pa22, *pb1, *pb2; 456 | 457 | int j = blockDim.x * 
blockIdx.x + threadIdx.x; 458 | int i = blockDim.y * blockIdx.y + threadIdx.y; 459 | if (i>=height || j>=width) 460 | return; 461 | 462 | // stepColor is in number of float elements! 463 | int offsetColor = (i+padding)*stepColor + 3*(padding+j); 464 | pIx = (const MyVec3f *)((float *)Ix + offsetColor); 465 | pIy = (const MyVec3f *)((float *)Iy + offsetColor); 466 | pIz = (const MyVec3f *)((float *)Iz + offsetColor); 467 | pIxx = (const MyVec3f *)((float *)Ixx + offsetColor); 468 | pIxy = (const MyVec3f *)((float *)Ixy + offsetColor); 469 | pIyy = (const MyVec3f *)((float *)Iyy + offsetColor); 470 | pIxz = (const MyVec3f *)((float *)Ixz + offsetColor); 471 | pIyz = (const MyVec3f *)((float *)Iyz + offsetColor); 472 | 473 | int offset = (i+padding)*step + padding+j; 474 | pa11 = A11 + offset; 475 | pa12 = A12 + offset; 476 | pa22 = A22 + offset; 477 | pb1 = b1 + offset; 478 | pb2 = b2 + offset; 479 | pdU = du + offset; 480 | pdV = dv + offset; 481 | 482 | *pa11 = 0; 483 | *pa12 = 0; 484 | *pa22 = 0; 485 | *pb1 = 0; 486 | *pb2 = 0; 487 | 488 | // Color constancy 489 | if (delta!=0.0) 490 | { 491 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared); 492 | float hdover3 = delta*0.5f/3.0f; 493 | float mask = 1.0; 494 | 495 | MyVec3f ngradI = *pIx*(*pIx) + *pIy*(*pIy) + dnorm; 496 | 497 | MyVec3f Ik1z = *pIz + *pIx*(*pdU) + *pIy*(*pdV); 498 | float tmp = mask*hdover3/sqrt((Ik1z*Ik1z/ngradI).sum()+epsilon_squared); 499 | MyVec3f ti = MyVec3f(tmp, tmp, tmp)/ngradI; 500 | 501 | *pa11 += (ti*(*pIx)*(*pIx)).sum(); 502 | *pa12 += (ti*(*pIx)*(*pIy)).sum(); 503 | *pa22 += (ti*(*pIy)*(*pIy)).sum(); 504 | *pb1 -= (ti*(*pIx)*(*pIz)).sum(); 505 | *pb2 -= (ti*(*pIy)*(*pIz)).sum(); 506 | } 507 | 508 | // Gradient constancy 509 | if (gamma!=0) 510 | { 511 | MyVec3f dnorm(zeta_squared, zeta_squared, zeta_squared); 512 | float hgover3 = gamma*0.5f/3.0f; 513 | float mask = 1.0; 514 | 515 | MyVec3f nx = *pIxx*(*pIxx) + *pIxy*(*pIxy) + dnorm; 516 | MyVec3f ny = *pIyy*(*pIyy) + 
*pIxy*(*pIxy) + dnorm; 517 | 518 | MyVec3f tmpx = *pIxz + *pIxx*(*pdU) + *pIxy*(*pdV); 519 | MyVec3f tmpy = *pIyz + *pIxy*(*pdU) + *pIyy*(*pdV); 520 | 521 | float tmp = mask*hgover3/sqrt((tmpx*tmpx/nx).sum() + (tmpy*tmpy/ny).sum() + epsilon_squared); 522 | 523 | MyVec3f tix = MyVec3f(tmp, tmp, tmp)/nx; 524 | MyVec3f tiy = MyVec3f(tmp, tmp, tmp)/ny; 525 | 526 | *pa11 += (tix*(*pIxx)*(*pIxx) + tiy*(*pIxy)*(*pIxy)).sum(); 527 | *pa12 += (tix*(*pIxx)*(*pIxy) + tiy*(*pIxy)*(*pIyy)).sum(); 528 | *pa22 += (tix*(*pIxy)*(*pIxy) + tiy*(*pIyy)*(*pIyy)).sum(); 529 | 530 | *pb1 -= (tix*(*pIxx)*(*pIxz) + tiy*(*pIxy)*(*pIyz)).sum(); 531 | *pb2 -= (tix*(*pIxy)*(*pIxz) + tiy*(*pIyy)*(*pIyz)).sum(); 532 | } 533 | } 534 | 535 | void DeepFlowCuda::computeDataTerm() 536 | { 537 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 538 | 539 | dataTermKernel<<>>( 540 | paramsCuda, current.size.width, current.size.height, padding, current.step, current.stepColor, 541 | current.Ix[0], current.Iy[0], current.Iz[0], 542 | current.Ixx[0], current.Ixy[0], current.Iyy[0], current.Ixz[0], current.Iyz[0], 543 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 544 | current.du[0], current.dv[0]); 545 | } 546 | 547 | __global__ void matchingTermKernel( 548 | const float *params, int width, int height, int padding, int step, 549 | float *A11, float *A22, float *b1, float *b2, 550 | const float *u, const float *v, const float *utmp, const float *vtmp, const float *udesc, const float *vdesc, const float *descWeight) 551 | { 552 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 553 | float beta = params[1], epsilon = params[6]; 554 | 555 | float epsilon_squared = epsilon*epsilon; 556 | 557 | int j = blockDim.x * blockIdx.x + threadIdx.x; 558 | int i = blockDim.y * blockIdx.y + threadIdx.y; 559 | if (i>=height || j>=width) 
560 | return; 561 | 562 | int offset = (i+padding)*step + padding+j; 563 | 564 | const float *pudesc = udesc + offset; 565 | const float *pvdesc = vdesc + offset; 566 | 567 | float tmpx = utmp[offset] - *pudesc; 568 | float tmpy = vtmp[offset] - *pvdesc; 569 | float tmp = 0.5*descWeight[offset]*beta/sqrt(tmpx*tmpx+tmpy*tmpy+epsilon_squared); 570 | A11[offset] += tmp; 571 | A22[offset] += tmp; 572 | b1[offset] -= tmp*(u[offset] - *pudesc); 573 | b2[offset] -= tmp*(v[offset] - *pvdesc); 574 | } 575 | 576 | void DeepFlowCuda::computeMatchingTerm() 577 | { 578 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 579 | 580 | matchingTermKernel<<>>( 581 | paramsCuda, current.size.width, current.size.height, padding, current.step, 582 | current.A11[0], current.A22[0], current.b1[0], current.b2[0], 583 | current.u[0], current.v[0], current.utmp[0], current.vtmp[0], current.udesc[0], current.vdesc[0], current.descWeight[0]); 584 | } 585 | 586 | __global__ void smoothnessTermKernel( 587 | const float *params, int width, int height, int padding, int step, 588 | float *smoothX, float *smoothY, const float *smoothWeight, 589 | const float *utmp, const float *vtmp) 590 | { 591 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 592 | float alpha = params[0], epsilon = params[6]; 593 | 594 | float epsilon_smooth = epsilon*epsilon; // 0.001f*0.001f; 595 | 596 | int j = blockDim.x * blockIdx.x + threadIdx.x; 597 | int i = blockDim.y * blockIdx.y + threadIdx.y; 598 | if (i>=height || j>=width) 599 | return; 600 | 601 | int offset = (i+padding)*step + padding + j; 602 | 603 | float *psmoothX = smoothX + offset; 604 | float *psmoothY = smoothY + offset; 605 | const float *psmoothWeight = smoothWeight + offset; 606 | const float *pu = utmp + offset; 607 | const float *pv = vtmp + offset; 608 | 609 | float ux1 = pu[1]-pu[0]; 610 | float 
vx1 = pv[1]-pv[0]; 611 | float uy1 = pu[step]-pu[0]; 612 | float vy1 = pv[step]-pv[0]; 613 | 614 | float ux2 = (pu[1]-pu[-1])*0.5; 615 | float vx2 = (pv[1]-pv[-1])*0.5; 616 | float uy2 = (pu[step]-pu[-step])*0.5; 617 | float vy2 = (pv[step]-pv[-step])*0.5; 618 | 619 | float tmpu = 0.5*(uy2 + (pu[step+1]-pu[-step+1])*0.5); 620 | float uxsq = ux1*ux1 + tmpu*tmpu; 621 | float tmpv = 0.5*(vy2 + (pv[step+1]-pv[-step+1])*0.5); 622 | float vxsq = vx1*vx1 + tmpv*tmpv; 623 | 624 | *psmoothX = alpha*0.5*(psmoothWeight[0]+psmoothWeight[1])/sqrt(uxsq+vxsq+epsilon_smooth); 625 | 626 | tmpu = 0.5*(ux2 + (pu[step+1]-pu[step-1])*0.5); 627 | float uysq = uy1*uy1 + tmpu*tmpu; 628 | tmpv = 0.5*(vx2 + (pv[step+1]-pv[step-1])*0.5); 629 | float vysq = vy1*vy1 + tmpv*tmpv; 630 | 631 | *psmoothY = alpha*0.5*(psmoothWeight[0]+psmoothWeight[step])/sqrt(uysq+vysq+epsilon_smooth); 632 | } 633 | 634 | __global__ void applySmoothKernel( 635 | int width, int height, int padding, int step, 636 | float *b1, float *b2, 637 | const float *smoothX, const float *smoothY, 638 | const float *u, const float *v 639 | ) 640 | { 641 | int j = blockDim.x * blockIdx.x + threadIdx.x; 642 | int i = blockDim.y * blockIdx.y + threadIdx.y; 643 | if (i>=height || j>=width) 644 | return; 645 | 646 | int offset = (i+padding)*step + padding + j; 647 | const float *pu = u + offset; 648 | const float *pv = v + offset; 649 | const float *psx = smoothX + offset; 650 | const float *psy = smoothY + offset; 651 | 652 | // V1 653 | b1[offset] += -psx[-1]*(pu[0]-pu[-1]) + psx[0]*(pu[1]-pu[0]) -psy[-step]*(pu[0]-pu[-step]) + psy[0]*(pu[step]-pu[0]); 654 | b2[offset] += -psx[-1]*(pv[0]-pv[-1]) + psx[0]*(pv[1]-pv[0]) -psy[-step]*(pv[0]-pv[-step]) + psy[0]*(pv[step]-pv[0]); 655 | 656 | // V2 657 | // b1[offset] += smoothX[offset]*(pu[1]-2*pu[0]+pu[-1]) + smoothY[offset]*(pu[step]-2*pu[0]+pu[-step]); 658 | // b2[offset] += smoothX[offset]*(pv[1]-2*pv[0]+pv[-1]) + smoothY[offset]*(pv[step]-2*pv[0]+pv[-step]); 659 | } 660 | 661 | 
void DeepFlowCuda::computeSmoothnessTerm() 662 | { 663 | dim3 threadsPerBlock(THREADS_PER_BLOCK_2D, THREADS_PER_BLOCK_2D, 1), blocksPerGrid(divUp(current.size.width, threadsPerBlock.x), divUp(current.size.height, threadsPerBlock.y), 1); 664 | 665 | smoothnessTermKernel<<>>( 666 | paramsCuda, current.size.width, current.size.height, padding, current.step, 667 | current.smoothX[0], current.smoothY[0], current.smoothWeight[0], 668 | current.utmp[0], current.vtmp[0]); 669 | 670 | applySmoothKernel<<>>( 671 | current.size.width, current.size.height, padding, current.step, 672 | current.b1[0], current.b2[0], current.smoothX[0], current.smoothY[0], current.u[0], current.v[0]); 673 | } 674 | 675 | __global__ void RedBlackSORKernel( 676 | const float *params, int width, int height, int padding, int step, bool redpass, 677 | const float *a11, const float *a12, const float *a22, const float *b1, const float *b2, 678 | const float *smoothX, const float *smoothY, 679 | float *du, float *dv) 680 | { 681 | // float params[7] = {alpha, beta, delta, gamma, omega, zeta, epsilon}; 682 | float omega = params[4]; 683 | 684 | int halfWidth = width/2 + width%2; 685 | int widthRow; // Width of current row 686 | 687 | int j = blockDim.x * blockIdx.x + threadIdx.x; 688 | int i = blockDim.y * blockIdx.y + threadIdx.y; 689 | if (i>=height) 690 | return; 691 | 692 | if (width%2==0) 693 | widthRow = halfWidth; 694 | else { 695 | if (i%2==0) 696 | { 697 | if (redpass) widthRow = halfWidth; 698 | else widthRow = halfWidth-1; 699 | } 700 | else { 701 | if (redpass) widthRow = halfWidth-1; 702 | else widthRow = halfWidth; 703 | } 704 | } 705 | if (j>=widthRow) 706 | return; 707 | 708 | int offset = (i+padding)*step + padding + j*2; 709 | if ((redpass && i%2==1) || (!redpass && i%2==0)) 710 | offset++; 711 | 712 | float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2,det; 713 | 714 | sigma_u = 0.0f; 715 | sigma_v = 0.0f; 716 | sum_dpsis = 0.0f; 717 | 718 | if (i>0) { 719 | sigma_u -= 
smoothY[offset-step] * du[offset-step]; 720 | sigma_v -= smoothY[offset-step] * dv[offset-step]; 721 | sum_dpsis += smoothY[offset-step]; 722 | } 723 | 724 | if(j>0){ 725 | sigma_u -= smoothX[offset-1] * du[offset-1]; 726 | sigma_v -= smoothX[offset-1] * dv[offset-1]; 727 | sum_dpsis += smoothX[offset-1]; 728 | } 729 | if(i>>( 760 | paramsCuda, current.size.width, current.size.height, padding, current.step, true, 761 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 762 | current.smoothX[0], current.smoothY[0], 763 | current.du[0], current.dv[0]); 764 | 765 | RedBlackSORKernel<<>>( 766 | paramsCuda, current.size.width, current.size.height, padding, current.step, false, 767 | current.A11[0], current.A12[0], current.A22[0], current.b1[0], current.b2[0], 768 | current.smoothX[0], current.smoothY[0], 769 | current.du[0], current.dv[0]); 770 | } 771 | } 772 | 773 | void DeepFlowCuda::computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow) 774 | { 775 | assert(!I0.empty() && I0.type()==CV_32FC3); 776 | assert(I1.size()==I0.size() && I1.type()==CV_32FC3); 777 | 778 | if (flow.empty() || flow.size()!=I0.size() || flow.type()!=CV_32FC2) 779 | { 780 | if (beta!=0) 781 | { 782 | cout<<"No correct input flow was provided but weight of matching term is non-zero. 
Setting it to zero..."<=minSize && heightTmp>=minSize) 798 | { 799 | widthTmp*=scaleFactor; 800 | heightTmp*=scaleFactor; 801 | nbScales++; 802 | } 803 | if (nbScales==0) 804 | nbScales = 1; 805 | 806 | cout<<"Nb scales = "<apply(pyramid.I0[0], pyramid.I0[0]); 833 | filtg->apply(pyramid.I1[0], pyramid.I1[0]); 834 | } 835 | 836 | pyramid.scale[0] = 1.0; 837 | 838 | if (beta!=0) 839 | { 840 | pyramid.udesc[0].upload(uv[0]); 841 | pyramid.vdesc[0].upload(uv[1]); 842 | computeDescWeight(); // Computes pyramid.descWeight[0] 843 | } 844 | 845 | cout<<"Scale 0:"< uinitTmp, vinitTmp; 874 | uinitTmp.upload(uv[0]); 875 | vinitTmp.upload(uv[1]); 876 | 877 | cv::cuda::resize(uinitTmp, pyramid.uinit[nbScales-1], pyramid.I0[nbScales-1].size()); 878 | cv::cuda::resize(vinitTmp, pyramid.vinit[nbScales-1], pyramid.I0[nbScales-1].size()); 879 | cv::cuda::multiply(pyramid.uinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.uinit[nbScales-1]); 880 | cv::cuda::multiply(pyramid.vinit[nbScales-1], pyramid.scale[nbScales-1], pyramid.vinit[nbScales-1]); 881 | } 882 | 883 | for (int k=nbScales-1; k>=0; k--) 884 | computeOneLevel(k); 885 | 886 | // At the end, we have ufinal and vfinal in pyramid.ufinal[0] 887 | pyramid.ufinal[0].download(uv[0]); 888 | pyramid.vfinal[0].download(uv[1]); 889 | 890 | cv::merge(uv, 2, flow); 891 | } 892 | 893 | void DeepFlowCuda::computeOneLevel(int scale) 894 | { 895 | prepareBuffers(scale); 896 | 897 | float betaSave = beta; 898 | float bk = 0.45; 899 | 900 | if (bk>0.0f && nbScales>1) 901 | { 902 | beta = betaSave * pow((float)scale/(float)(nbScales-1), bk); 903 | copyParamsToCuda(); 904 | } 905 | 906 | computeSmoothnessWeight(); 907 | 908 | for (int i = 0; i < fixedPointIterations; i++) 909 | { 910 | computeDataTerm(); // Initializes A11, A12, A22, b1 and b2 911 | if (beta!=0) 912 | computeMatchingTerm(); 913 | 914 | if (alpha!=0) 915 | computeSmoothnessTerm(); // Updates b1 and b2 916 | 917 | RedBlackSOR(); 918 | 919 | current.utmp = current.u + 
current.du; 920 | current.vtmp = current.v + current.dv; 921 | } 922 | 923 | beta = betaSave; 924 | 925 | if (padding!=0) 926 | { 927 | int width = pyramid.I0[scale].cols; 928 | int height = pyramid.I0[scale].rows; 929 | 930 | current.utmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.ufinal[scale]); 931 | current.vtmp(cv::Rect(padding, padding, width, height)).copyTo(pyramid.vfinal[scale]); 932 | } 933 | else { 934 | current.utmp.copyTo(pyramid.ufinal[scale]); 935 | current.vtmp.copyTo(pyramid.vfinal[scale]); 936 | } 937 | } 938 | 939 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const 940 | { 941 | cv::Mat a; 942 | m.download(a); 943 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone(); 944 | } 945 | 946 | cv::Mat DeepFlowCuda::toCPU(const cv::cudev::GpuMat_ &m) const 947 | { 948 | cv::Mat a; 949 | m.download(a); 950 | cv::cvtColor(a, a, cv::COLOR_BGRA2BGR); 951 | return a(cv::Rect(padding, padding, m.cols-2*padding, m.rows-2*padding)).clone(); 952 | } 953 | 954 | void DeepFlowCuda::copyParamsToCuda() 955 | { 956 | int nbParams = 7; 957 | if (paramsCuda==nullptr && cudaMalloc(¶msCuda, nbParams*sizeof(float))!=cudaSuccess) 958 | { 959 | paramsCuda = nullptr; 960 | cout<<"cudaMalloc error"<. 19 | */ 20 | 21 | #ifndef DEEP_FLOW_CUDA_H 22 | #define DEEP_FLOW_CUDA_H 23 | 24 | #include 25 | #include // For template cv::cudev::GpuMat_ -> needs to be compiled with nvcc ! 
26 | #include // For cv::cuda::Filter 27 | #include 28 | 29 | #include "myvec3f.h" 30 | 31 | class DeepFlowCuda 32 | { 33 | // Member variables 34 | protected: 35 | int fixedPointIterations, sorIterations; 36 | float omega; // Update parameter in SOR iterations 37 | float alpha; // Weight of smoothing term 38 | float beta; // Weight of matching term 39 | float delta; // Weight of color constancy in data term 40 | float gamma; // Weight of gradient constancy in data term 41 | float zeta; // Regularization parameter (added to norms in data term) 42 | float epsilon; // Regularization parameter in Psi function 43 | float scaleFactor; // Scale factor between two succesive levels 44 | float sigma; // Standard deviation of presmoothing Gaussian filter 45 | int nbScales; // Number of scales (levels). Will be calculated from minSize, scaleFactor and size of input image 46 | int padding; 47 | int minSize; // Minimum width or height at the highest level (at the coarsest scale) 48 | float *paramsCuda; 49 | 50 | // Padded images and flows at current scale 51 | struct { 52 | cv::cudev::GpuMat_ I0, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz; 53 | cv::cudev::GpuMat_ A11, A12, A22, b1, b2; 54 | cv::cudev::GpuMat_ smoothX, smoothY, luminance, smoothWeight; 55 | 56 | cv::cudev::GpuMat_ utmp, vtmp; // flow that is updated in each fixed point iteration 57 | cv::cudev::GpuMat_ du, dv; // flow increment, updated in each SOR iteration 58 | cv::cudev::GpuMat_ u, v; // red-black-buffer version of the input flow 59 | cv::cudev::GpuMat_ udesc, vdesc, descWeight; // Descriptor used in matching term 60 | 61 | int step; 62 | int stepColor; 63 | 64 | cv::Size size, sizePadded; 65 | } current; 66 | 67 | // Non-padded images and flows at all scales 68 | struct { 69 | std::vector > I0, I1; 70 | std::vector > uinit, vinit, ufinal, vfinal; 71 | std::vector > udesc, vdesc, descWeight; 72 | std::vector scale; 73 | } pyramid; 74 | 75 | // Derivative filters, and Gaussian filter 76 | std::shared_ptr filtg, 
filtx, filty, filtx5pt, filty5pt; 77 | 78 | // Member functions 79 | public: 80 | DeepFlowCuda(); 81 | ~DeepFlowCuda(); 82 | 83 | void computeFlow(const cv::Mat &I0, const cv::Mat &I1, cv::Mat &flow); 84 | 85 | int getFixedPointIterations() const { return fixedPointIterations; } 86 | void setFixedPointIterations(int val) { fixedPointIterations = val; } 87 | int getSorIterations() const { return sorIterations; } 88 | void setSorIterations(int val) { sorIterations = val; } 89 | float getOmega() const { return omega; } 90 | void setOmega(float val) { omega = val; } 91 | float getAlpha() const { return alpha; } 92 | void setAlpha(float val) { alpha = val; } 93 | float getBeta() const { return beta; } 94 | void setBeta(float val) { beta = val; } 95 | float getDelta() const { return delta; } 96 | void setDelta(float val) { delta = val; } 97 | float getGamma() const { return gamma; } 98 | void setGamma(float val) { gamma = val; } 99 | 100 | protected: 101 | bool createFilters(); 102 | 103 | void prepareBuffers(int scale); 104 | 105 | void computeDescWeight(); 106 | 107 | void computeDataTerm(); 108 | void computeMatchingTerm(); 109 | 110 | // The local smoothness weight is not described in the initial DeepFlow paper 111 | // [P. Weinzaepfel, J. Revaud, Z. Harchaoui and C. Schmid. DeepFlow: Large displacement optical flow with deep matching. ICCV 2013] 112 | // Instead it is mentioned in section 4.3 of the extended IJCV paper 113 | // [J. Revaud, P. Weinzaepfel, Z. Harchaoui and C. Schmid. DeepMatching: hierarchical deformable dense matching. 
IJCV 2016] 114 | // See ref [Wedel et al 2009] or [Xu et al 2012] 115 | void computeSmoothnessWeight(); 116 | 117 | void computeSmoothnessTerm(); 118 | 119 | void RedBlackSOR(); 120 | 121 | void computeOneLevel(int); 122 | 123 | // Remove padding and move to CPU 124 | cv::Mat toCPU(const cv::cudev::GpuMat_ &) const ; 125 | cv::Mat toCPU(const cv::cudev::GpuMat_ &) const; 126 | 127 | void copyParamsToCuda(); 128 | }; 129 | 130 | #endif 131 | -------------------------------------------------------------------------------- /main.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright 2019 Julien Mille 3 | 4 | This file is part of DeepFlowCUDA. 5 | 6 | DeepFlowCUDA is free software: you can redistribute 7 | it and/or modify it under the terms of the GNU Lesser General Public License 8 | as published by the Free Software Foundation, either version 3 of the License, 9 | or (at your option) any later version. 10 | 11 | DeepFlowCUDA is distributed in the hope that it will 12 | be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser 14 | General Public License for more details. 15 | 16 | You should have received a copy of the GNU General Public License, 17 | and a copy of the GNU Lesser General Public License, along with 18 | DeepFlowCUDA. If not, see . 
19 | */ 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | #include // For cv::imread 26 | #include // For cv::readOpticalFlow, cv::writeOpticalFlow 27 | #include "deepflowcuda.h" 28 | 29 | using namespace std; 30 | 31 | float L2Distance(const cv::Mat &flow1, const cv::Mat &flow2, cv::Mat &dist) 32 | { 33 | assert(!flow1.empty() && flow1.type()==CV_32FC2 && flow2.type()==CV_32FC2 && flow1.size()==flow2.size()); 34 | 35 | dist.create(flow1.size(), CV_32F); 36 | 37 | float avgDist = 0; 38 | float d; 39 | 40 | for (int y=0; y(y,x); 44 | cv::Point2f b = flow2.at(y,x); 45 | d = (a-b).dot(a-b); 46 | avgDist += d; 47 | dist.at(y,x) = d; 48 | } 49 | avgDist /= flow1.cols*flow1.rows; 50 | return avgDist; 51 | } 52 | 53 | int main(int argc, char ** argv) 54 | { 55 | cv::Mat img0, img1, gtflow, flow, flowRefined; 56 | 57 | string dataDir = "./data/"; 58 | string outputDir = "./"; 59 | string outputPath = outputDir + "refinedflow_0001.flo"; 60 | 61 | img0 = cv::imread(dataDir + "frame_0001.jpg", cv::IMREAD_COLOR); 62 | if (img0.data==nullptr) 63 | { 64 | cout<<"Failed to read first image"<. 
19 | */ 20 | 21 | #ifndef MY_VEC3F_H 22 | #define MY_VEC3F_H 23 | 24 | class MyVec3f 25 | { 26 | public: 27 | float x, y, z; 28 | __device__ MyVec3f() {} 29 | __device__ MyVec3f(float a, float b, float c):x(a),y(b),z(c) {} 30 | __device__ float dot(const MyVec3f &v) const {return x*v.x+y*v.y+z*v.z;} 31 | __device__ float norm2() const {return x*x+y*y+z*z;} 32 | __device__ float norm() const {return sqrt(x*x+y*y+z*z);} 33 | __device__ float l1norm() const {return fabs(x)+fabs(y)+fabs(z);} 34 | __device__ float sum() const {return x+y+z;} 35 | 36 | __device__ MyVec3f operator +(const MyVec3f &v) const 37 | { 38 | MyVec3f s; 39 | s.x = x+v.x; 40 | s.y = y+v.y; 41 | s.z = z+v.z; 42 | return s; 43 | } 44 | 45 | __device__ MyVec3f operator -(const MyVec3f &v) const 46 | { 47 | MyVec3f d; 48 | d.x = x-v.x; 49 | d.y = y-v.y; 50 | d.z = z-v.z; 51 | return d; 52 | } 53 | 54 | __device__ MyVec3f operator *(float f) const 55 | { 56 | MyVec3f v; 57 | v.x = x*f; 58 | v.y = y*f; 59 | v.z = z*f; 60 | return v; 61 | } 62 | 63 | // Element-wise product 64 | __device__ MyVec3f operator *(const MyVec3f &v) const 65 | { 66 | MyVec3f s; 67 | s.x = x*v.x; 68 | s.y = y*v.y; 69 | s.z = z*v.z; 70 | return s; 71 | } 72 | 73 | // Element-wise division 74 | __device__ MyVec3f operator /(const MyVec3f &v) const 75 | { 76 | MyVec3f s; 77 | s.x = x/v.x; 78 | s.y = y/v.y; 79 | s.z = z/v.z; 80 | return s; 81 | } 82 | }; 83 | 84 | #endif --------------------------------------------------------------------------------