├── .gitattributes ├── 1-fps.mp4 ├── 5-fps.mp4 ├── LICENSE ├── README.md ├── alt_cuda_corr ├── correlation.cpp ├── correlation_kernel.cu └── setup.py ├── core ├── __pycache__ │ ├── corr.cpython-37.pyc │ ├── extractor.cpython-37.pyc │ ├── raft.cpython-37.pyc │ └── update.cpython-37.pyc ├── corr.py ├── datasets.py ├── extractor.py ├── raft.py ├── update.py └── utils │ ├── __pycache__ │ ├── flow_viz.cpython-37.pyc │ └── utils.cpython-37.pyc │ ├── augmentor.py │ ├── flow_viz.py │ ├── frame_utils.py │ └── utils.py ├── dependencies.py ├── frcnn.py ├── main.py ├── out.mp4 ├── output-1-fps.mp4 ├── output.avi ├── output.mp4 ├── raft-models ├── raft-chairs.pth ├── raft-kitti.pth ├── raft-sintel.pth ├── raft-small.pth └── raft-things.pth └── vehicle.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /1-fps.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/1-fps.mp4 -------------------------------------------------------------------------------- /5-fps.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/5-fps.mp4 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Real Time Vehicle Detection and Tracking 2 | Trajectory extraction from traffic surveillance videos 3 | -------------------------------------------------------------------------------- /alt_cuda_corr/correlation.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <vector> 3 | 4 | // CUDA forward declarations 5 | std::vector<torch::Tensor> corr_cuda_forward( 6 | torch::Tensor fmap1, 7 | torch::Tensor fmap2, 8 | torch::Tensor coords, 9 | int radius); 10 | 11 | std::vector<torch::Tensor> corr_cuda_backward( 12 | torch::Tensor fmap1, 13 | torch::Tensor fmap2, 14 | torch::Tensor coords, 15 | torch::Tensor corr_grad, 16 | int radius); 17 | 18 | // C++ interface 19 | #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") 20 | #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") 21 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 22 | 23 | std::vector<torch::Tensor> corr_forward( 24 | torch::Tensor fmap1, 25 | torch::Tensor fmap2, 26 | torch::Tensor coords, 27 | int radius) { 28 | CHECK_INPUT(fmap1); 29 | CHECK_INPUT(fmap2); 30 | CHECK_INPUT(coords); 31 | 32 | return corr_cuda_forward(fmap1, fmap2, coords, radius); 33 | } 34 | 35 | 36 | std::vector<torch::Tensor> corr_backward( 37 | torch::Tensor fmap1, 38 | torch::Tensor fmap2, 39 | torch::Tensor coords, 40 | torch::Tensor corr_grad, 41 | int radius) { 42 | CHECK_INPUT(fmap1); 43 | CHECK_INPUT(fmap2); 44 | CHECK_INPUT(coords); 45 | CHECK_INPUT(corr_grad); 46 | 47 | return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); 48 | } 49 | 50 | 51 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 52 | m.def("forward", &corr_forward, "CORR forward"); 53 | m.def("backward", &corr_backward, "CORR backward"); 54 | } -------------------------------------------------------------------------------- /alt_cuda_corr/correlation_kernel.cu: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <cuda.h> 3 | #include <cuda_runtime.h> 4 | #include <vector> 5 | 6 | 7 | #define BLOCK_H 4 8 | #define BLOCK_W 8 9 | #define BLOCK_HW BLOCK_H * BLOCK_W 10 | #define CHANNEL_STRIDE 32 11 | 12 | 13 | __forceinline__ __device__ 14 | bool within_bounds(int h, int w, int H, int W) { 15 | return h >= 0 && h < H && w >= 0 && w < W; 16 | } 17 | 18 | template <typename scalar_t> 19 | __global__ void corr_forward_kernel( 20 | const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> fmap1, 21 | const torch::PackedTensorAccessor32<scalar_t,4,torch::RestrictPtrTraits> fmap2, 22 | const torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> coords, 23 | torch::PackedTensorAccessor32<scalar_t,5,torch::RestrictPtrTraits> corr, 24 | int r) 25 | { 26 | const int b = blockIdx.x; 27 | const int h0 = blockIdx.y * blockDim.x; 28 | const int w0 = blockIdx.z * blockDim.y; 29 | const int tid = threadIdx.x * blockDim.y + threadIdx.y; 30 | 31 | const int H1 = fmap1.size(1); 32 | const int W1 = fmap1.size(2); 33 | const int H2 = fmap2.size(1); 34 | const int W2 = fmap2.size(2); 35 | const int N = coords.size(1); 36 | const int C = fmap1.size(3); 37 | 38 | __shared__ scalar_t f1[CHANNEL_STRIDE][BLOCK_HW+1]; 39 | __shared__ scalar_t f2[CHANNEL_STRIDE][BLOCK_HW+1]; 40 | __shared__ scalar_t x2s[BLOCK_HW]; 41 | __shared__ scalar_t y2s[BLOCK_HW]; 42 | 43 | for (int c=0; c(floor(y2s[k1]))-r+iy; 76 | int w2 = static_cast<int>(floor(x2s[k1]))-r+ix; 77 | int c2 = tid % CHANNEL_STRIDE; 78 | 79 | auto fptr = fmap2[b][h2][w2]; 80 | if (within_bounds(h2, w2, H2, W2)) 81 | f2[c2][k1] =
fptr[c+c2]; 82 | else 83 | f2[c2][k1] = 0.0; 84 | } 85 | 86 | __syncthreads(); 87 | 88 | scalar_t s = 0.0; 89 | for (int k=0; k 0 && ix > 0 && within_bounds(h1, w1, H1, W1)) 105 | *(corr_ptr + ix_nw) += nw; 106 | 107 | if (iy > 0 && ix < rd && within_bounds(h1, w1, H1, W1)) 108 | *(corr_ptr + ix_ne) += ne; 109 | 110 | if (iy < rd && ix > 0 && within_bounds(h1, w1, H1, W1)) 111 | *(corr_ptr + ix_sw) += sw; 112 | 113 | if (iy < rd && ix < rd && within_bounds(h1, w1, H1, W1)) 114 | *(corr_ptr + ix_se) += se; 115 | } 116 | } 117 | } 118 | } 119 | } 120 | 121 | 122 | template 123 | __global__ void corr_backward_kernel( 124 | const torch::PackedTensorAccessor32 fmap1, 125 | const torch::PackedTensorAccessor32 fmap2, 126 | const torch::PackedTensorAccessor32 coords, 127 | const torch::PackedTensorAccessor32 corr_grad, 128 | torch::PackedTensorAccessor32 fmap1_grad, 129 | torch::PackedTensorAccessor32 fmap2_grad, 130 | torch::PackedTensorAccessor32 coords_grad, 131 | int r) 132 | { 133 | 134 | const int b = blockIdx.x; 135 | const int h0 = blockIdx.y * blockDim.x; 136 | const int w0 = blockIdx.z * blockDim.y; 137 | const int tid = threadIdx.x * blockDim.y + threadIdx.y; 138 | 139 | const int H1 = fmap1.size(1); 140 | const int W1 = fmap1.size(2); 141 | const int H2 = fmap2.size(1); 142 | const int W2 = fmap2.size(2); 143 | const int N = coords.size(1); 144 | const int C = fmap1.size(3); 145 | 146 | __shared__ scalar_t f1[CHANNEL_STRIDE][BLOCK_HW+1]; 147 | __shared__ scalar_t f2[CHANNEL_STRIDE][BLOCK_HW+1]; 148 | 149 | __shared__ scalar_t f1_grad[CHANNEL_STRIDE][BLOCK_HW+1]; 150 | __shared__ scalar_t f2_grad[CHANNEL_STRIDE][BLOCK_HW+1]; 151 | 152 | __shared__ scalar_t x2s[BLOCK_HW]; 153 | __shared__ scalar_t y2s[BLOCK_HW]; 154 | 155 | for (int c=0; c(floor(y2s[k1]))-r+iy; 190 | int w2 = static_cast(floor(x2s[k1]))-r+ix; 191 | int c2 = tid % CHANNEL_STRIDE; 192 | 193 | auto fptr = fmap2[b][h2][w2]; 194 | if (within_bounds(h2, w2, H2, W2)) 195 | f2[c2][k1] = fptr[c+c2]; 196 | else 197 | f2[c2][k1] = 0.0; 198 | 199 | f2_grad[c2][k1] = 0.0; 200 | } 201 | 202 | __syncthreads(); 203 | 204 | const scalar_t* grad_ptr = &corr_grad[b][n][0][h1][w1]; 205 | scalar_t g = 0.0; 206 | 207 | int ix_nw = H1*W1*((iy-1) + rd*(ix-1)); 208 | int ix_ne = H1*W1*((iy-1) + rd*ix); 209 | int ix_sw = H1*W1*(iy + rd*(ix-1)); 210 | int ix_se = H1*W1*(iy + rd*ix); 211 | 212 | if (iy > 0 && ix > 0 && within_bounds(h1, w1, H1, W1)) 213 | g += *(grad_ptr + ix_nw) * dy * dx; 214 | 215 | if (iy > 0 && ix < rd && within_bounds(h1, w1, H1, W1)) 216 | g += *(grad_ptr + ix_ne) * dy * (1-dx); 217 | 218 | if (iy < rd && ix > 0 && within_bounds(h1, w1, H1, W1)) 219 | g += *(grad_ptr + ix_sw) * (1-dy) * dx; 220 | 221 | if (iy < rd && ix < rd && within_bounds(h1, w1, H1, W1)) 222 | g += *(grad_ptr + ix_se) * (1-dy) * (1-dx); 223 | 224 | for (int k=0; k(floor(y2s[k1]))-r+iy; 232 | int w2 = static_cast(floor(x2s[k1]))-r+ix; 233 | int c2 = tid % CHANNEL_STRIDE; 234 | 235 | scalar_t* fptr = &fmap2_grad[b][h2][w2][0]; 236 | if (within_bounds(h2, w2, H2, W2)) 237 | atomicAdd(fptr+c+c2, f2_grad[c2][k1]); 238 | } 239 | } 240 | } 241 | } 242 | __syncthreads(); 243 | 244 | 245 | for (int k=0; k corr_cuda_forward( 261 | torch::Tensor fmap1, 262 | torch::Tensor fmap2, 263 | torch::Tensor coords, 264 | int radius) 265 | { 266 | const auto B = coords.size(0); 267 | const auto N = coords.size(1); 268 | const auto H = coords.size(2); 269 | const auto W = coords.size(3); 270 | 271 | const auto rd = 2 * radius + 1; 272 | auto opts = fmap1.options(); 273 | auto 
corr = torch::zeros({B, N, rd*rd, H, W}, opts); 274 | 275 | const dim3 blocks(B, (H+BLOCK_H-1)/BLOCK_H, (W+BLOCK_W-1)/BLOCK_W); 276 | const dim3 threads(BLOCK_H, BLOCK_W); 277 | 278 | corr_forward_kernel<<>>( 279 | fmap1.packed_accessor32(), 280 | fmap2.packed_accessor32(), 281 | coords.packed_accessor32(), 282 | corr.packed_accessor32(), 283 | radius); 284 | 285 | return {corr}; 286 | } 287 | 288 | std::vector corr_cuda_backward( 289 | torch::Tensor fmap1, 290 | torch::Tensor fmap2, 291 | torch::Tensor coords, 292 | torch::Tensor corr_grad, 293 | int radius) 294 | { 295 | const auto B = coords.size(0); 296 | const auto N = coords.size(1); 297 | 298 | const auto H1 = fmap1.size(1); 299 | const auto W1 = fmap1.size(2); 300 | const auto H2 = fmap2.size(1); 301 | const auto W2 = fmap2.size(2); 302 | const auto C = fmap1.size(3); 303 | 304 | auto opts = fmap1.options(); 305 | auto fmap1_grad = torch::zeros({B, H1, W1, C}, opts); 306 | auto fmap2_grad = torch::zeros({B, H2, W2, C}, opts); 307 | auto coords_grad = torch::zeros({B, N, H1, W1, 2}, opts); 308 | 309 | const dim3 blocks(B, (H1+BLOCK_H-1)/BLOCK_H, (W1+BLOCK_W-1)/BLOCK_W); 310 | const dim3 threads(BLOCK_H, BLOCK_W); 311 | 312 | 313 | corr_backward_kernel<<>>( 314 | fmap1.packed_accessor32(), 315 | fmap2.packed_accessor32(), 316 | coords.packed_accessor32(), 317 | corr_grad.packed_accessor32(), 318 | fmap1_grad.packed_accessor32(), 319 | fmap2_grad.packed_accessor32(), 320 | coords_grad.packed_accessor32(), 321 | radius); 322 | 323 | return {fmap1_grad, fmap2_grad, coords_grad}; 324 | } -------------------------------------------------------------------------------- /alt_cuda_corr/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import BuildExtension, CUDAExtension 3 | 4 | 5 | setup( 6 | name='correlation', 7 | ext_modules=[ 8 | CUDAExtension('alt_cuda_corr', 9 | sources=['correlation.cpp', 'correlation_kernel.cu'], 10 | extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), 11 | ], 12 | cmdclass={ 13 | 'build_ext': BuildExtension 14 | }) 15 | 16 | -------------------------------------------------------------------------------- /core/__pycache__/corr.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/__pycache__/corr.cpython-37.pyc -------------------------------------------------------------------------------- /core/__pycache__/extractor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/__pycache__/extractor.cpython-37.pyc -------------------------------------------------------------------------------- /core/__pycache__/raft.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/__pycache__/raft.cpython-37.pyc -------------------------------------------------------------------------------- /core/__pycache__/update.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/__pycache__/update.cpython-37.pyc -------------------------------------------------------------------------------- /core/corr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from utils.utils import bilinear_sampler, coords_grid 4 | 5 | try: 6 | import alt_cuda_corr 7 | except: 8 | # alt_cuda_corr is not compiled 9 | pass 10 | 11 | 12 | class CorrBlock: 13 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 14 | self.num_levels = num_levels 15 | self.radius = radius 16 | self.corr_pyramid = [] 17 | 18 | # all pairs correlation 19 | corr = CorrBlock.corr(fmap1, fmap2) 20 | 21 | batch, h1, w1, dim, h2, w2 = corr.shape 22 | corr = corr.reshape(batch*h1*w1, dim, h2, w2) 23 | 24 | self.corr_pyramid.append(corr) 25 | for i in range(self.num_levels-1): 26 | corr = F.avg_pool2d(corr, 2, stride=2) 27 | self.corr_pyramid.append(corr) 28 | 29 | def __call__(self, coords): 30 | r = self.radius 31 | coords = coords.permute(0, 2, 3, 1) 32 | batch, h1, w1, _ = coords.shape 33 | 34 | out_pyramid = [] 35 | for i in range(self.num_levels): 36 | corr = self.corr_pyramid[i] 37 | dx = torch.linspace(-r, r, 2*r+1) 38 | dy = torch.linspace(-r, r, 2*r+1) 39 | delta = torch.stack(torch.meshgrid(dy, dx), axis=-1).to(coords.device) 40 | 41 | centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i 42 | delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2) 43 | coords_lvl = centroid_lvl + delta_lvl 44 | 45 | corr = bilinear_sampler(corr, coords_lvl) 46 | corr = corr.view(batch, h1, w1, -1) 47 | out_pyramid.append(corr) 48 | 49 | out = torch.cat(out_pyramid, dim=-1) 50 | return out.permute(0, 3, 1, 2).contiguous().float() 51 | 52 | @staticmethod 53 | def corr(fmap1, fmap2): 54 | batch, dim, ht, wd = fmap1.shape 55 | fmap1 = fmap1.view(batch, dim, ht*wd) 56 | fmap2 = fmap2.view(batch, dim, ht*wd) 57 | 58 | corr = torch.matmul(fmap1.transpose(1,2), fmap2) 59 | corr = corr.view(batch, ht, wd, 1, ht, wd) 60 | return corr / torch.sqrt(torch.tensor(dim).float()) 61 | 62 | 63 | class AlternateCorrBlock: 64 | def __init__(self, fmap1, fmap2, num_levels=4, radius=4): 65 | self.num_levels = num_levels 66 | self.radius = radius 67 | 68 | self.pyramid = [(fmap1, fmap2)] 69 | for i in range(self.num_levels): 70 | fmap1 = F.avg_pool2d(fmap1, 2, stride=2) 71 | fmap2 = F.avg_pool2d(fmap2, 2, stride=2) 72 | self.pyramid.append((fmap1, fmap2)) 73 | 74 | def __call__(self, coords): 75 | coords = coords.permute(0, 2, 3, 1) 76 | B, H, W, _ = coords.shape 77 | dim = self.pyramid[0][0].shape[1] 78 | 79 | corr_list = [] 80 | for i in range(self.num_levels): 81 | r = self.radius 82 | fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() 83 | fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() 84 | 85 | coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() 86 | corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) 87 | corr_list.append(corr.squeeze(1)) 88 | 89 | corr = torch.stack(corr_list, dim=1) 90 | corr = corr.reshape(B, -1, H, W) 91 | return corr / torch.sqrt(torch.tensor(dim).float()) 92 | -------------------------------------------------------------------------------- /core/datasets.py: -------------------------------------------------------------------------------- 1 | # Data loading based on https://github.com/NVIDIA/flownet2-pytorch 2 | 3 | import numpy as np 4 | import torch 5 | 
import torch.utils.data as data 6 | import torch.nn.functional as F 7 | 8 | import os 9 | import math 10 | import random 11 | from glob import glob 12 | import os.path as osp 13 | 14 | from utils import frame_utils 15 | from utils.augmentor import FlowAugmentor, SparseFlowAugmentor 16 | 17 | 18 | class FlowDataset(data.Dataset): 19 | def __init__(self, aug_params=None, sparse=False): 20 | self.augmentor = None 21 | self.sparse = sparse 22 | if aug_params is not None: 23 | if sparse: 24 | self.augmentor = SparseFlowAugmentor(**aug_params) 25 | else: 26 | self.augmentor = FlowAugmentor(**aug_params) 27 | 28 | self.is_test = False 29 | self.init_seed = False 30 | self.flow_list = [] 31 | self.image_list = [] 32 | self.extra_info = [] 33 | 34 | def __getitem__(self, index): 35 | 36 | if self.is_test: 37 | img1 = frame_utils.read_gen(self.image_list[index][0]) 38 | img2 = frame_utils.read_gen(self.image_list[index][1]) 39 | img1 = np.array(img1).astype(np.uint8)[..., :3] 40 | img2 = np.array(img2).astype(np.uint8)[..., :3] 41 | img1 = torch.from_numpy(img1).permute(2, 0, 1).float() 42 | img2 = torch.from_numpy(img2).permute(2, 0, 1).float() 43 | return img1, img2, self.extra_info[index] 44 | 45 | if not self.init_seed: 46 | worker_info = torch.utils.data.get_worker_info() 47 | if worker_info is not None: 48 | torch.manual_seed(worker_info.id) 49 | np.random.seed(worker_info.id) 50 | random.seed(worker_info.id) 51 | self.init_seed = True 52 | 53 | index = index % len(self.image_list) 54 | valid = None 55 | if self.sparse: 56 | flow, valid = frame_utils.readFlowKITTI(self.flow_list[index]) 57 | else: 58 | flow = frame_utils.read_gen(self.flow_list[index]) 59 | 60 | img1 = frame_utils.read_gen(self.image_list[index][0]) 61 | img2 = frame_utils.read_gen(self.image_list[index][1]) 62 | 63 | flow = np.array(flow).astype(np.float32) 64 | img1 = np.array(img1).astype(np.uint8) 65 | img2 = np.array(img2).astype(np.uint8) 66 | 67 | # grayscale images 68 | if len(img1.shape) == 2: 69 | img1 = np.tile(img1[...,None], (1, 1, 3)) 70 | img2 = np.tile(img2[...,None], (1, 1, 3)) 71 | else: 72 | img1 = img1[..., :3] 73 | img2 = img2[..., :3] 74 | 75 | if self.augmentor is not None: 76 | if self.sparse: 77 | img1, img2, flow, valid = self.augmentor(img1, img2, flow, valid) 78 | else: 79 | img1, img2, flow = self.augmentor(img1, img2, flow) 80 | 81 | img1 = torch.from_numpy(img1).permute(2, 0, 1).float() 82 | img2 = torch.from_numpy(img2).permute(2, 0, 1).float() 83 | flow = torch.from_numpy(flow).permute(2, 0, 1).float() 84 | 85 | if valid is not None: 86 | valid = torch.from_numpy(valid) 87 | else: 88 | valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000) 89 | 90 | return img1, img2, flow, valid.float() 91 | 92 | 93 | def __rmul__(self, v): 94 | self.flow_list = v * self.flow_list 95 | self.image_list = v * self.image_list 96 | return self 97 | 98 | def __len__(self): 99 | return len(self.image_list) 100 | 101 | 102 | class MpiSintel(FlowDataset): 103 | def __init__(self, aug_params=None, split='training', root='datasets/Sintel', dstype='clean'): 104 | super(MpiSintel, self).__init__(aug_params) 105 | flow_root = osp.join(root, split, 'flow') 106 | image_root = osp.join(root, split, dstype) 107 | 108 | if split == 'test': 109 | self.is_test = True 110 | 111 | for scene in os.listdir(image_root): 112 | image_list = sorted(glob(osp.join(image_root, scene, '*.png'))) 113 | for i in range(len(image_list)-1): 114 | self.image_list += [ [image_list[i], image_list[i+1]] ] 115 | self.extra_info += [ (scene, i) ] # 
scene and frame_id 116 | 117 | if split != 'test': 118 | self.flow_list += sorted(glob(osp.join(flow_root, scene, '*.flo'))) 119 | 120 | 121 | class FlyingChairs(FlowDataset): 122 | def __init__(self, aug_params=None, split='train', root='datasets/FlyingChairs_release/data'): 123 | super(FlyingChairs, self).__init__(aug_params) 124 | 125 | images = sorted(glob(osp.join(root, '*.ppm'))) 126 | flows = sorted(glob(osp.join(root, '*.flo'))) 127 | assert (len(images)//2 == len(flows)) 128 | 129 | split_list = np.loadtxt('chairs_split.txt', dtype=np.int32) 130 | for i in range(len(flows)): 131 | xid = split_list[i] 132 | if (split=='training' and xid==1) or (split=='validation' and xid==2): 133 | self.flow_list += [ flows[i] ] 134 | self.image_list += [ [images[2*i], images[2*i+1]] ] 135 | 136 | 137 | class FlyingThings3D(FlowDataset): 138 | def __init__(self, aug_params=None, root='datasets/FlyingThings3D', dstype='frames_cleanpass'): 139 | super(FlyingThings3D, self).__init__(aug_params) 140 | 141 | for cam in ['left']: 142 | for direction in ['into_future', 'into_past']: 143 | image_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*'))) 144 | image_dirs = sorted([osp.join(f, cam) for f in image_dirs]) 145 | 146 | flow_dirs = sorted(glob(osp.join(root, 'optical_flow/TRAIN/*/*'))) 147 | flow_dirs = sorted([osp.join(f, direction, cam) for f in flow_dirs]) 148 | 149 | for idir, fdir in zip(image_dirs, flow_dirs): 150 | images = sorted(glob(osp.join(idir, '*.png')) ) 151 | flows = sorted(glob(osp.join(fdir, '*.pfm')) ) 152 | for i in range(len(flows)-1): 153 | if direction == 'into_future': 154 | self.image_list += [ [images[i], images[i+1]] ] 155 | self.flow_list += [ flows[i] ] 156 | elif direction == 'into_past': 157 | self.image_list += [ [images[i+1], images[i]] ] 158 | self.flow_list += [ flows[i+1] ] 159 | 160 | 161 | class KITTI(FlowDataset): 162 | def __init__(self, aug_params=None, split='training', root='datasets/KITTI'): 163 | super(KITTI, self).__init__(aug_params, sparse=True) 164 | if split == 'testing': 165 | self.is_test = True 166 | 167 | root = osp.join(root, split) 168 | images1 = sorted(glob(osp.join(root, 'image_2/*_10.png'))) 169 | images2 = sorted(glob(osp.join(root, 'image_2/*_11.png'))) 170 | 171 | for img1, img2 in zip(images1, images2): 172 | frame_id = img1.split('/')[-1] 173 | self.extra_info += [ [frame_id] ] 174 | self.image_list += [ [img1, img2] ] 175 | 176 | if split == 'training': 177 | self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png'))) 178 | 179 | 180 | class HD1K(FlowDataset): 181 | def __init__(self, aug_params=None, root='datasets/HD1k'): 182 | super(HD1K, self).__init__(aug_params, sparse=True) 183 | 184 | seq_ix = 0 185 | while 1: 186 | flows = sorted(glob(os.path.join(root, 'hd1k_flow_gt', 'flow_occ/%06d_*.png' % seq_ix))) 187 | images = sorted(glob(os.path.join(root, 'hd1k_input', 'image_2/%06d_*.png' % seq_ix))) 188 | 189 | if len(flows) == 0: 190 | break 191 | 192 | for i in range(len(flows)-1): 193 | self.flow_list += [flows[i]] 194 | self.image_list += [ [images[i], images[i+1]] ] 195 | 196 | seq_ix += 1 197 | 198 | 199 | def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'): 200 | """ Create the data loader for the corresponding training set """ 201 | 202 | if args.stage == 'chairs': 203 | aug_params = {'crop_size': args.image_size, 'min_scale': -0.1, 'max_scale': 1.0, 'do_flip': True} 204 | train_dataset = FlyingChairs(aug_params, split='training') 205 | 206 | elif args.stage == 'things': 207 | aug_params = {'crop_size': 
args.image_size, 'min_scale': -0.4, 'max_scale': 0.8, 'do_flip': True} 208 | clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass') 209 | final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass') 210 | train_dataset = clean_dataset + final_dataset 211 | 212 | elif args.stage == 'sintel': 213 | aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.6, 'do_flip': True} 214 | things = FlyingThings3D(aug_params, dstype='frames_cleanpass') 215 | sintel_clean = MpiSintel(aug_params, split='training', dstype='clean') 216 | sintel_final = MpiSintel(aug_params, split='training', dstype='final') 217 | 218 | if TRAIN_DS == 'C+T+K+S+H': 219 | kitti = KITTI({'crop_size': args.image_size, 'min_scale': -0.3, 'max_scale': 0.5, 'do_flip': True}) 220 | hd1k = HD1K({'crop_size': args.image_size, 'min_scale': -0.5, 'max_scale': 0.2, 'do_flip': True}) 221 | train_dataset = 100*sintel_clean + 100*sintel_final + 200*kitti + 5*hd1k + things 222 | 223 | elif TRAIN_DS == 'C+T+K/S': 224 | train_dataset = 100*sintel_clean + 100*sintel_final + things 225 | 226 | elif args.stage == 'kitti': 227 | aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.4, 'do_flip': False} 228 | train_dataset = KITTI(aug_params, split='training') 229 | 230 | train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, 231 | pin_memory=False, shuffle=True, num_workers=4, drop_last=True) 232 | 233 | print('Training with %d image pairs' % len(train_dataset)) 234 | return train_loader 235 | 236 | -------------------------------------------------------------------------------- /core/extractor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class ResidualBlock(nn.Module): 7 | def __init__(self, in_planes, planes, norm_fn='group', stride=1): 8 | super(ResidualBlock, self).__init__() 9 | 10 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride) 11 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) 12 | self.relu = nn.ReLU(inplace=True) 13 | 14 | num_groups = planes // 8 15 | 16 | if norm_fn == 'group': 17 | self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 18 | self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 19 | if not stride == 1: 20 | self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 21 | 22 | elif norm_fn == 'batch': 23 | self.norm1 = nn.BatchNorm2d(planes) 24 | self.norm2 = nn.BatchNorm2d(planes) 25 | if not stride == 1: 26 | self.norm3 = nn.BatchNorm2d(planes) 27 | 28 | elif norm_fn == 'instance': 29 | self.norm1 = nn.InstanceNorm2d(planes) 30 | self.norm2 = nn.InstanceNorm2d(planes) 31 | if not stride == 1: 32 | self.norm3 = nn.InstanceNorm2d(planes) 33 | 34 | elif norm_fn == 'none': 35 | self.norm1 = nn.Sequential() 36 | self.norm2 = nn.Sequential() 37 | if not stride == 1: 38 | self.norm3 = nn.Sequential() 39 | 40 | if stride == 1: 41 | self.downsample = None 42 | 43 | else: 44 | self.downsample = nn.Sequential( 45 | nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3) 46 | 47 | 48 | def forward(self, x): 49 | y = x 50 | y = self.relu(self.norm1(self.conv1(y))) 51 | y = self.relu(self.norm2(self.conv2(y))) 52 | 53 | if self.downsample is not None: 54 | x = self.downsample(x) 55 | 56 | return self.relu(x+y) 57 | 58 | 59 | 60 | class BottleneckBlock(nn.Module): 61 | def __init__(self, in_planes, planes, 
norm_fn='group', stride=1): 62 | super(BottleneckBlock, self).__init__() 63 | 64 | self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0) 65 | self.conv2 = nn.Conv2d(planes//4, planes//4, kernel_size=3, padding=1, stride=stride) 66 | self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0) 67 | self.relu = nn.ReLU(inplace=True) 68 | 69 | num_groups = planes // 8 70 | 71 | if norm_fn == 'group': 72 | self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) 73 | self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) 74 | self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 75 | if not stride == 1: 76 | self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) 77 | 78 | elif norm_fn == 'batch': 79 | self.norm1 = nn.BatchNorm2d(planes//4) 80 | self.norm2 = nn.BatchNorm2d(planes//4) 81 | self.norm3 = nn.BatchNorm2d(planes) 82 | if not stride == 1: 83 | self.norm4 = nn.BatchNorm2d(planes) 84 | 85 | elif norm_fn == 'instance': 86 | self.norm1 = nn.InstanceNorm2d(planes//4) 87 | self.norm2 = nn.InstanceNorm2d(planes//4) 88 | self.norm3 = nn.InstanceNorm2d(planes) 89 | if not stride == 1: 90 | self.norm4 = nn.InstanceNorm2d(planes) 91 | 92 | elif norm_fn == 'none': 93 | self.norm1 = nn.Sequential() 94 | self.norm2 = nn.Sequential() 95 | self.norm3 = nn.Sequential() 96 | if not stride == 1: 97 | self.norm4 = nn.Sequential() 98 | 99 | if stride == 1: 100 | self.downsample = None 101 | 102 | else: 103 | self.downsample = nn.Sequential( 104 | nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4) 105 | 106 | 107 | def forward(self, x): 108 | y = x 109 | y = self.relu(self.norm1(self.conv1(y))) 110 | y = self.relu(self.norm2(self.conv2(y))) 111 | y = self.relu(self.norm3(self.conv3(y))) 112 | 113 | if self.downsample is not None: 114 | x = self.downsample(x) 115 | 116 | return self.relu(x+y) 117 | 118 | class BasicEncoder(nn.Module): 119 | def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): 120 | super(BasicEncoder, self).__init__() 121 | self.norm_fn = norm_fn 122 | 123 | if self.norm_fn == 'group': 124 | self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) 125 | 126 | elif self.norm_fn == 'batch': 127 | self.norm1 = nn.BatchNorm2d(64) 128 | 129 | elif self.norm_fn == 'instance': 130 | self.norm1 = nn.InstanceNorm2d(64) 131 | 132 | elif self.norm_fn == 'none': 133 | self.norm1 = nn.Sequential() 134 | 135 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) 136 | self.relu1 = nn.ReLU(inplace=True) 137 | 138 | self.in_planes = 64 139 | self.layer1 = self._make_layer(64, stride=1) 140 | self.layer2 = self._make_layer(96, stride=2) 141 | self.layer3 = self._make_layer(128, stride=2) 142 | 143 | # output convolution 144 | self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) 145 | 146 | self.dropout = None 147 | if dropout > 0: 148 | self.dropout = nn.Dropout2d(p=dropout) 149 | 150 | for m in self.modules(): 151 | if isinstance(m, nn.Conv2d): 152 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 153 | elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): 154 | if m.weight is not None: 155 | nn.init.constant_(m.weight, 1) 156 | if m.bias is not None: 157 | nn.init.constant_(m.bias, 0) 158 | 159 | def _make_layer(self, dim, stride=1): 160 | layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) 161 | layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) 162 | layers = (layer1, layer2) 163 | 164 | self.in_planes 
= dim 165 | return nn.Sequential(*layers) 166 | 167 | 168 | def forward(self, x): 169 | 170 | # if input is list, combine batch dimension 171 | is_list = isinstance(x, tuple) or isinstance(x, list) 172 | if is_list: 173 | batch_dim = x[0].shape[0] 174 | x = torch.cat(x, dim=0) 175 | 176 | x = self.conv1(x) 177 | x = self.norm1(x) 178 | x = self.relu1(x) 179 | 180 | x = self.layer1(x) 181 | x = self.layer2(x) 182 | x = self.layer3(x) 183 | 184 | x = self.conv2(x) 185 | 186 | if self.training and self.dropout is not None: 187 | x = self.dropout(x) 188 | 189 | if is_list: 190 | x = torch.split(x, [batch_dim, batch_dim], dim=0) 191 | 192 | return x 193 | 194 | 195 | class SmallEncoder(nn.Module): 196 | def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): 197 | super(SmallEncoder, self).__init__() 198 | self.norm_fn = norm_fn 199 | 200 | if self.norm_fn == 'group': 201 | self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) 202 | 203 | elif self.norm_fn == 'batch': 204 | self.norm1 = nn.BatchNorm2d(32) 205 | 206 | elif self.norm_fn == 'instance': 207 | self.norm1 = nn.InstanceNorm2d(32) 208 | 209 | elif self.norm_fn == 'none': 210 | self.norm1 = nn.Sequential() 211 | 212 | self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) 213 | self.relu1 = nn.ReLU(inplace=True) 214 | 215 | self.in_planes = 32 216 | self.layer1 = self._make_layer(32, stride=1) 217 | self.layer2 = self._make_layer(64, stride=2) 218 | self.layer3 = self._make_layer(96, stride=2) 219 | 220 | self.dropout = None 221 | if dropout > 0: 222 | self.dropout = nn.Dropout2d(p=dropout) 223 | 224 | self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) 225 | 226 | for m in self.modules(): 227 | if isinstance(m, nn.Conv2d): 228 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 229 | elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): 230 | if m.weight is not None: 231 | nn.init.constant_(m.weight, 1) 232 | if m.bias is not None: 233 | nn.init.constant_(m.bias, 0) 234 | 235 | def _make_layer(self, dim, stride=1): 236 | layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride) 237 | layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1) 238 | layers = (layer1, layer2) 239 | 240 | self.in_planes = dim 241 | return nn.Sequential(*layers) 242 | 243 | 244 | def forward(self, x): 245 | 246 | # if input is list, combine batch dimension 247 | is_list = isinstance(x, tuple) or isinstance(x, list) 248 | if is_list: 249 | batch_dim = x[0].shape[0] 250 | x = torch.cat(x, dim=0) 251 | 252 | x = self.conv1(x) 253 | x = self.norm1(x) 254 | x = self.relu1(x) 255 | 256 | x = self.layer1(x) 257 | x = self.layer2(x) 258 | x = self.layer3(x) 259 | x = self.conv2(x) 260 | 261 | if self.training and self.dropout is not None: 262 | x = self.dropout(x) 263 | 264 | if is_list: 265 | x = torch.split(x, [batch_dim, batch_dim], dim=0) 266 | 267 | return x 268 | -------------------------------------------------------------------------------- /core/raft.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | from update import BasicUpdateBlock, SmallUpdateBlock 7 | from extractor import BasicEncoder, SmallEncoder 8 | from corr import CorrBlock, AlternateCorrBlock 9 | from utils.utils import bilinear_sampler, coords_grid, upflow8 10 | 11 | try: 12 | autocast = torch.cuda.amp.autocast 13 | except: 14 | # dummy autocast for PyTorch < 1.6 15 | 
class autocast: 16 | def __init__(self, enabled): 17 | pass 18 | def __enter__(self): 19 | pass 20 | def __exit__(self, *args): 21 | pass 22 | 23 | 24 | class RAFT(nn.Module): 25 | def __init__(self, args): 26 | super(RAFT, self).__init__() 27 | self.args = args 28 | 29 | if args.small: 30 | self.hidden_dim = hdim = 96 31 | self.context_dim = cdim = 64 32 | args.corr_levels = 4 33 | args.corr_radius = 3 34 | 35 | else: 36 | self.hidden_dim = hdim = 128 37 | self.context_dim = cdim = 128 38 | args.corr_levels = 4 39 | args.corr_radius = 4 40 | 41 | if 'dropout' not in self.args: 42 | self.args.dropout = 0 43 | 44 | if 'alternate_corr' not in self.args: 45 | self.args.alternate_corr = False 46 | 47 | # feature network, context network, and update block 48 | if args.small: 49 | self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout) 50 | self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout) 51 | self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) 52 | 53 | else: 54 | self.fnet = BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout) 55 | self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout) 56 | self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) 57 | 58 | def freeze_bn(self): 59 | for m in self.modules(): 60 | if isinstance(m, nn.BatchNorm2d): 61 | m.eval() 62 | 63 | def initialize_flow(self, img): 64 | """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" 65 | N, C, H, W = img.shape 66 | coords0 = coords_grid(N, H//8, W//8).to(img.device) 67 | coords1 = coords_grid(N, H//8, W//8).to(img.device) 68 | 69 | # optical flow computed as difference: flow = coords1 - coords0 70 | return coords0, coords1 71 | 72 | def upsample_flow(self, flow, mask): 73 | """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ 74 | N, _, H, W = flow.shape 75 | mask = mask.view(N, 1, 9, 8, 8, H, W) 76 | mask = torch.softmax(mask, dim=2) 77 | 78 | up_flow = F.unfold(8 * flow, [3,3], padding=1) 79 | up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) 80 | 81 | up_flow = torch.sum(mask * up_flow, dim=2) 82 | up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) 83 | return up_flow.reshape(N, 2, 8*H, 8*W) 84 | 85 | 86 | def forward(self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False): 87 | """ Estimate optical flow between pair of frames """ 88 | 89 | image1 = 2 * (image1 / 255.0) - 1.0 90 | image2 = 2 * (image2 / 255.0) - 1.0 91 | 92 | image1 = image1.contiguous() 93 | image2 = image2.contiguous() 94 | 95 | hdim = self.hidden_dim 96 | cdim = self.context_dim 97 | 98 | # run the feature network 99 | with autocast(enabled=self.args.mixed_precision): 100 | fmap1, fmap2 = self.fnet([image1, image2]) 101 | 102 | fmap1 = fmap1.float() 103 | fmap2 = fmap2.float() 104 | if self.args.alternate_corr: 105 | corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius) 106 | else: 107 | corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) 108 | 109 | # run the context network 110 | with autocast(enabled=self.args.mixed_precision): 111 | cnet = self.cnet(image1) 112 | net, inp = torch.split(cnet, [hdim, cdim], dim=1) 113 | net = torch.tanh(net) 114 | inp = torch.relu(inp) 115 | 116 | coords0, coords1 = self.initialize_flow(image1) 117 | 118 | if flow_init is not None: 119 | coords1 = coords1 + flow_init 120 | 121 | flow_predictions = [] 122 | for itr in range(iters): 123 | coords1 = coords1.detach() 124 | 
corr = corr_fn(coords1) # index correlation volume 125 | 126 | flow = coords1 - coords0 127 | with autocast(enabled=self.args.mixed_precision): 128 | net, up_mask, delta_flow = self.update_block(net, inp, corr, flow) 129 | 130 | # F(t+1) = F(t) + \Delta(t) 131 | coords1 = coords1 + delta_flow 132 | 133 | # upsample predictions 134 | if up_mask is None: 135 | flow_up = upflow8(coords1 - coords0) 136 | else: 137 | flow_up = self.upsample_flow(coords1 - coords0, up_mask) 138 | 139 | flow_predictions.append(flow_up) 140 | 141 | if test_mode: 142 | return coords1 - coords0, flow_up 143 | 144 | return flow_predictions 145 | -------------------------------------------------------------------------------- /core/update.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class FlowHead(nn.Module): 7 | def __init__(self, input_dim=128, hidden_dim=256): 8 | super(FlowHead, self).__init__() 9 | self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) 10 | self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) 11 | self.relu = nn.ReLU(inplace=True) 12 | 13 | def forward(self, x): 14 | return self.conv2(self.relu(self.conv1(x))) 15 | 16 | class ConvGRU(nn.Module): 17 | def __init__(self, hidden_dim=128, input_dim=192+128): 18 | super(ConvGRU, self).__init__() 19 | self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) 20 | self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) 21 | self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) 22 | 23 | def forward(self, h, x): 24 | hx = torch.cat([h, x], dim=1) 25 | 26 | z = torch.sigmoid(self.convz(hx)) 27 | r = torch.sigmoid(self.convr(hx)) 28 | q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1))) 29 | 30 | h = (1-z) * h + z * q 31 | return h 32 | 33 | class SepConvGRU(nn.Module): 34 | def __init__(self, hidden_dim=128, input_dim=192+128): 35 | super(SepConvGRU, self).__init__() 36 | self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) 37 | self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) 38 | self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) 39 | 40 | self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) 41 | self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) 42 | self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) 43 | 44 | 45 | def forward(self, h, x): 46 | # horizontal 47 | hx = torch.cat([h, x], dim=1) 48 | z = torch.sigmoid(self.convz1(hx)) 49 | r = torch.sigmoid(self.convr1(hx)) 50 | q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) 51 | h = (1-z) * h + z * q 52 | 53 | # vertical 54 | hx = torch.cat([h, x], dim=1) 55 | z = torch.sigmoid(self.convz2(hx)) 56 | r = torch.sigmoid(self.convr2(hx)) 57 | q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) 58 | h = (1-z) * h + z * q 59 | 60 | return h 61 | 62 | class SmallMotionEncoder(nn.Module): 63 | def __init__(self, args): 64 | super(SmallMotionEncoder, self).__init__() 65 | cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 66 | self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) 67 | self.convf1 = nn.Conv2d(2, 64, 7, padding=3) 68 | self.convf2 = nn.Conv2d(64, 32, 3, padding=1) 69 | self.conv = nn.Conv2d(128, 80, 3, padding=1) 70 | 71 | def forward(self, flow, corr): 72 | cor = F.relu(self.convc1(corr)) 73 | flo = F.relu(self.convf1(flow)) 74 | 
flo = F.relu(self.convf2(flo)) 75 | cor_flo = torch.cat([cor, flo], dim=1) 76 | out = F.relu(self.conv(cor_flo)) 77 | return torch.cat([out, flow], dim=1) 78 | 79 | class BasicMotionEncoder(nn.Module): 80 | def __init__(self, args): 81 | super(BasicMotionEncoder, self).__init__() 82 | cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 83 | self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) 84 | self.convc2 = nn.Conv2d(256, 192, 3, padding=1) 85 | self.convf1 = nn.Conv2d(2, 128, 7, padding=3) 86 | self.convf2 = nn.Conv2d(128, 64, 3, padding=1) 87 | self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1) 88 | 89 | def forward(self, flow, corr): 90 | cor = F.relu(self.convc1(corr)) 91 | cor = F.relu(self.convc2(cor)) 92 | flo = F.relu(self.convf1(flow)) 93 | flo = F.relu(self.convf2(flo)) 94 | 95 | cor_flo = torch.cat([cor, flo], dim=1) 96 | out = F.relu(self.conv(cor_flo)) 97 | return torch.cat([out, flow], dim=1) 98 | 99 | class SmallUpdateBlock(nn.Module): 100 | def __init__(self, args, hidden_dim=96): 101 | super(SmallUpdateBlock, self).__init__() 102 | self.encoder = SmallMotionEncoder(args) 103 | self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64) 104 | self.flow_head = FlowHead(hidden_dim, hidden_dim=128) 105 | 106 | def forward(self, net, inp, corr, flow): 107 | motion_features = self.encoder(flow, corr) 108 | inp = torch.cat([inp, motion_features], dim=1) 109 | net = self.gru(net, inp) 110 | delta_flow = self.flow_head(net) 111 | 112 | return net, None, delta_flow 113 | 114 | class BasicUpdateBlock(nn.Module): 115 | def __init__(self, args, hidden_dim=128, input_dim=128): 116 | super(BasicUpdateBlock, self).__init__() 117 | self.args = args 118 | self.encoder = BasicMotionEncoder(args) 119 | self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim) 120 | self.flow_head = FlowHead(hidden_dim, hidden_dim=256) 121 | 122 | self.mask = nn.Sequential( 123 | nn.Conv2d(128, 256, 3, padding=1), 124 | nn.ReLU(inplace=True), 125 | nn.Conv2d(256, 64*9, 1, padding=0)) 126 | 127 | def forward(self, net, inp, corr, flow, upsample=True): 128 | motion_features = self.encoder(flow, corr) 129 | inp = torch.cat([inp, motion_features], dim=1) 130 | 131 | net = self.gru(net, inp) 132 | delta_flow = self.flow_head(net) 133 | 134 | # scale mask to balance gradients 135 | mask = .25 * self.mask(net) 136 | return net, mask, delta_flow 137 | 138 | 139 | 140 | -------------------------------------------------------------------------------- /core/utils/__pycache__/flow_viz.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/utils/__pycache__/flow_viz.cpython-37.pyc -------------------------------------------------------------------------------- /core/utils/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/core/utils/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /core/utils/augmentor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | from PIL import Image 5 | 6 | import cv2 7 | cv2.setNumThreads(0) 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | import torch 11 | from 
torchvision.transforms import ColorJitter 12 | import torch.nn.functional as F 13 | 14 | 15 | class FlowAugmentor: 16 | def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): 17 | 18 | # spatial augmentation params 19 | self.crop_size = crop_size 20 | self.min_scale = min_scale 21 | self.max_scale = max_scale 22 | self.spatial_aug_prob = 0.8 23 | self.stretch_prob = 0.8 24 | self.max_stretch = 0.2 25 | 26 | # flip augmentation params 27 | self.do_flip = do_flip 28 | self.h_flip_prob = 0.5 29 | self.v_flip_prob = 0.1 30 | 31 | # photometric augmentation params 32 | self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14) 33 | self.asymmetric_color_aug_prob = 0.2 34 | self.eraser_aug_prob = 0.5 35 | 36 | def color_transform(self, img1, img2): 37 | """ Photometric augmentation """ 38 | 39 | # asymmetric 40 | if np.random.rand() < self.asymmetric_color_aug_prob: 41 | img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) 42 | img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) 43 | 44 | # symmetric 45 | else: 46 | image_stack = np.concatenate([img1, img2], axis=0) 47 | image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) 48 | img1, img2 = np.split(image_stack, 2, axis=0) 49 | 50 | return img1, img2 51 | 52 | def eraser_transform(self, img1, img2, bounds=[50, 100]): 53 | """ Occlusion augmentation """ 54 | 55 | ht, wd = img1.shape[:2] 56 | if np.random.rand() < self.eraser_aug_prob: 57 | mean_color = np.mean(img2.reshape(-1, 3), axis=0) 58 | for _ in range(np.random.randint(1, 3)): 59 | x0 = np.random.randint(0, wd) 60 | y0 = np.random.randint(0, ht) 61 | dx = np.random.randint(bounds[0], bounds[1]) 62 | dy = np.random.randint(bounds[0], bounds[1]) 63 | img2[y0:y0+dy, x0:x0+dx, :] = mean_color 64 | 65 | return img1, img2 66 | 67 | def spatial_transform(self, img1, img2, flow): 68 | # randomly sample scale 69 | ht, wd = img1.shape[:2] 70 | min_scale = np.maximum( 71 | (self.crop_size[0] + 8) / float(ht), 72 | (self.crop_size[1] + 8) / float(wd)) 73 | 74 | scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) 75 | scale_x = scale 76 | scale_y = scale 77 | if np.random.rand() < self.stretch_prob: 78 | scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) 79 | scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) 80 | 81 | scale_x = np.clip(scale_x, min_scale, None) 82 | scale_y = np.clip(scale_y, min_scale, None) 83 | 84 | if np.random.rand() < self.spatial_aug_prob: 85 | # rescale the images 86 | img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 87 | img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 88 | flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 89 | flow = flow * [scale_x, scale_y] 90 | 91 | if self.do_flip: 92 | if np.random.rand() < self.h_flip_prob: # h-flip 93 | img1 = img1[:, ::-1] 94 | img2 = img2[:, ::-1] 95 | flow = flow[:, ::-1] * [-1.0, 1.0] 96 | 97 | if np.random.rand() < self.v_flip_prob: # v-flip 98 | img1 = img1[::-1, :] 99 | img2 = img2[::-1, :] 100 | flow = flow[::-1, :] * [1.0, -1.0] 101 | 102 | y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) 103 | x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) 104 | 105 | img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 106 | img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 107 | flow = flow[y0:y0+self.crop_size[0], 
x0:x0+self.crop_size[1]] 108 | 109 | return img1, img2, flow 110 | 111 | def __call__(self, img1, img2, flow): 112 | img1, img2 = self.color_transform(img1, img2) 113 | img1, img2 = self.eraser_transform(img1, img2) 114 | img1, img2, flow = self.spatial_transform(img1, img2, flow) 115 | 116 | img1 = np.ascontiguousarray(img1) 117 | img2 = np.ascontiguousarray(img2) 118 | flow = np.ascontiguousarray(flow) 119 | 120 | return img1, img2, flow 121 | 122 | class SparseFlowAugmentor: 123 | def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False): 124 | # spatial augmentation params 125 | self.crop_size = crop_size 126 | self.min_scale = min_scale 127 | self.max_scale = max_scale 128 | self.spatial_aug_prob = 0.8 129 | self.stretch_prob = 0.8 130 | self.max_stretch = 0.2 131 | 132 | # flip augmentation params 133 | self.do_flip = do_flip 134 | self.h_flip_prob = 0.5 135 | self.v_flip_prob = 0.1 136 | 137 | # photometric augmentation params 138 | self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14) 139 | self.asymmetric_color_aug_prob = 0.2 140 | self.eraser_aug_prob = 0.5 141 | 142 | def color_transform(self, img1, img2): 143 | image_stack = np.concatenate([img1, img2], axis=0) 144 | image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) 145 | img1, img2 = np.split(image_stack, 2, axis=0) 146 | return img1, img2 147 | 148 | def eraser_transform(self, img1, img2): 149 | ht, wd = img1.shape[:2] 150 | if np.random.rand() < self.eraser_aug_prob: 151 | mean_color = np.mean(img2.reshape(-1, 3), axis=0) 152 | for _ in range(np.random.randint(1, 3)): 153 | x0 = np.random.randint(0, wd) 154 | y0 = np.random.randint(0, ht) 155 | dx = np.random.randint(50, 100) 156 | dy = np.random.randint(50, 100) 157 | img2[y0:y0+dy, x0:x0+dx, :] = mean_color 158 | 159 | return img1, img2 160 | 161 | def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): 162 | ht, wd = flow.shape[:2] 163 | coords = np.meshgrid(np.arange(wd), np.arange(ht)) 164 | coords = np.stack(coords, axis=-1) 165 | 166 | coords = coords.reshape(-1, 2).astype(np.float32) 167 | flow = flow.reshape(-1, 2).astype(np.float32) 168 | valid = valid.reshape(-1).astype(np.float32) 169 | 170 | coords0 = coords[valid>=1] 171 | flow0 = flow[valid>=1] 172 | 173 | ht1 = int(round(ht * fy)) 174 | wd1 = int(round(wd * fx)) 175 | 176 | coords1 = coords0 * [fx, fy] 177 | flow1 = flow0 * [fx, fy] 178 | 179 | xx = np.round(coords1[:,0]).astype(np.int32) 180 | yy = np.round(coords1[:,1]).astype(np.int32) 181 | 182 | v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) 183 | xx = xx[v] 184 | yy = yy[v] 185 | flow1 = flow1[v] 186 | 187 | flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) 188 | valid_img = np.zeros([ht1, wd1], dtype=np.int32) 189 | 190 | flow_img[yy, xx] = flow1 191 | valid_img[yy, xx] = 1 192 | 193 | return flow_img, valid_img 194 | 195 | def spatial_transform(self, img1, img2, flow, valid): 196 | # randomly sample scale 197 | 198 | ht, wd = img1.shape[:2] 199 | min_scale = np.maximum( 200 | (self.crop_size[0] + 1) / float(ht), 201 | (self.crop_size[1] + 1) / float(wd)) 202 | 203 | scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) 204 | scale_x = np.clip(scale, min_scale, None) 205 | scale_y = np.clip(scale, min_scale, None) 206 | 207 | if np.random.rand() < self.spatial_aug_prob: 208 | # rescale the images 209 | img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) 210 | img2 = cv2.resize(img2, None, fx=scale_x, 
fy=scale_y, interpolation=cv2.INTER_LINEAR) 211 | flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y) 212 | 213 | if self.do_flip: 214 | if np.random.rand() < 0.5: # h-flip 215 | img1 = img1[:, ::-1] 216 | img2 = img2[:, ::-1] 217 | flow = flow[:, ::-1] * [-1.0, 1.0] 218 | valid = valid[:, ::-1] 219 | 220 | margin_y = 20 221 | margin_x = 50 222 | 223 | y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) 224 | x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x) 225 | 226 | y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) 227 | x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) 228 | 229 | img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 230 | img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 231 | flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 232 | valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] 233 | return img1, img2, flow, valid 234 | 235 | 236 | def __call__(self, img1, img2, flow, valid): 237 | img1, img2 = self.color_transform(img1, img2) 238 | img1, img2 = self.eraser_transform(img1, img2) 239 | img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid) 240 | 241 | img1 = np.ascontiguousarray(img1) 242 | img2 = np.ascontiguousarray(img2) 243 | flow = np.ascontiguousarray(flow) 244 | valid = np.ascontiguousarray(valid) 245 | 246 | return img1, img2, flow, valid 247 | -------------------------------------------------------------------------------- /core/utils/flow_viz.py: -------------------------------------------------------------------------------- 1 | # Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization 2 | 3 | 4 | # MIT License 5 | # 6 | # Copyright (c) 2018 Tom Runia 7 | # 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to conditions. 14 | # 15 | # Author: Tom Runia 16 | # Date Created: 2018-08-03 17 | 18 | import numpy as np 19 | 20 | def make_colorwheel(): 21 | """ 22 | Generates a color wheel for optical flow visualization as presented in: 23 | Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) 24 | URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf 25 | 26 | Code follows the original C++ source code of Daniel Scharstein. 27 | Code follows the Matlab source code of Deqing Sun. 
28 | 29 | Returns: 30 | np.ndarray: Color wheel 31 | """ 32 | 33 | RY = 15 34 | YG = 6 35 | GC = 4 36 | CB = 11 37 | BM = 13 38 | MR = 6 39 | 40 | ncols = RY + YG + GC + CB + BM + MR 41 | colorwheel = np.zeros((ncols, 3)) 42 | col = 0 43 | 44 | # RY 45 | colorwheel[0:RY, 0] = 255 46 | colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY) 47 | col = col+RY 48 | # YG 49 | colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG) 50 | colorwheel[col:col+YG, 1] = 255 51 | col = col+YG 52 | # GC 53 | colorwheel[col:col+GC, 1] = 255 54 | colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC) 55 | col = col+GC 56 | # CB 57 | colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB) 58 | colorwheel[col:col+CB, 2] = 255 59 | col = col+CB 60 | # BM 61 | colorwheel[col:col+BM, 2] = 255 62 | colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM) 63 | col = col+BM 64 | # MR 65 | colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR) 66 | colorwheel[col:col+MR, 0] = 255 67 | return colorwheel 68 | 69 | 70 | def flow_uv_to_colors(u, v, convert_to_bgr=False): 71 | """ 72 | Applies the flow color wheel to (possibly clipped) flow components u and v. 73 | 74 | According to the C++ source code of Daniel Scharstein 75 | According to the Matlab source code of Deqing Sun 76 | 77 | Args: 78 | u (np.ndarray): Input horizontal flow of shape [H,W] 79 | v (np.ndarray): Input vertical flow of shape [H,W] 80 | convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 81 | 82 | Returns: 83 | np.ndarray: Flow visualization image of shape [H,W,3] 84 | """ 85 | flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8) 86 | colorwheel = make_colorwheel() # shape [55x3] 87 | ncols = colorwheel.shape[0] 88 | rad = np.sqrt(np.square(u) + np.square(v)) 89 | a = np.arctan2(-v, -u)/np.pi 90 | fk = (a+1) / 2*(ncols-1) 91 | k0 = np.floor(fk).astype(np.int32) 92 | k1 = k0 + 1 93 | k1[k1 == ncols] = 0 94 | f = fk - k0 95 | for i in range(colorwheel.shape[1]): 96 | tmp = colorwheel[:,i] 97 | col0 = tmp[k0] / 255.0 98 | col1 = tmp[k1] / 255.0 99 | col = (1-f)*col0 + f*col1 100 | idx = (rad <= 1) 101 | col[idx] = 1 - rad[idx] * (1-col[idx]) 102 | col[~idx] = col[~idx] * 0.75 # out of range 103 | # Note the 2-i => BGR instead of RGB 104 | ch_idx = 2-i if convert_to_bgr else i 105 | flow_image[:,:,ch_idx] = np.floor(255 * col) 106 | return flow_image 107 | 108 | 109 | def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False): 110 | """ 111 | Expects a two dimensional flow image of shape. 112 | 113 | Args: 114 | flow_uv (np.ndarray): Flow UV image of shape [H,W,2] 115 | clip_flow (float, optional): Clip maximum of flow values. Defaults to None. 116 | convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 
117 | 118 | Returns: 119 | np.ndarray: Flow visualization image of shape [H,W,3] 120 | """ 121 | assert flow_uv.ndim == 3, 'input flow must have three dimensions' 122 | assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]' 123 | if clip_flow is not None: 124 | flow_uv = np.clip(flow_uv, 0, clip_flow) 125 | u = flow_uv[:,:,0] 126 | v = flow_uv[:,:,1] 127 | rad = np.sqrt(np.square(u) + np.square(v)) 128 | rad_max = np.max(rad) 129 | epsilon = 1e-5 130 | u = u / (rad_max + epsilon) 131 | v = v / (rad_max + epsilon) 132 | return flow_uv_to_colors(u, v, convert_to_bgr) -------------------------------------------------------------------------------- /core/utils/frame_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | from os.path import * 4 | import re 5 | 6 | import cv2 7 | cv2.setNumThreads(0) 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | TAG_CHAR = np.array([202021.25], np.float32) 11 | 12 | def readFlow(fn): 13 | """ Read .flo file in Middlebury format""" 14 | # Code adapted from: 15 | # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy 16 | 17 | # WARNING: this will work on little-endian architectures (eg Intel x86) only! 18 | # print 'fn = %s'%(fn) 19 | with open(fn, 'rb') as f: 20 | magic = np.fromfile(f, np.float32, count=1) 21 | if 202021.25 != magic: 22 | print('Magic number incorrect. Invalid .flo file') 23 | return None 24 | else: 25 | w = np.fromfile(f, np.int32, count=1) 26 | h = np.fromfile(f, np.int32, count=1) 27 | # print 'Reading %d x %d flo file\n' % (w, h) 28 | data = np.fromfile(f, np.float32, count=2*int(w)*int(h)) 29 | # Reshape data into 3D array (columns, rows, bands) 30 | # The reshape here is for visualization, the original code is (w,h,2) 31 | return np.resize(data, (int(h), int(w), 2)) 32 | 33 | def readPFM(file): 34 | file = open(file, 'rb') 35 | 36 | color = None 37 | width = None 38 | height = None 39 | scale = None 40 | endian = None 41 | 42 | header = file.readline().rstrip() 43 | if header == b'PF': 44 | color = True 45 | elif header == b'Pf': 46 | color = False 47 | else: 48 | raise Exception('Not a PFM file.') 49 | 50 | dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) 51 | if dim_match: 52 | width, height = map(int, dim_match.groups()) 53 | else: 54 | raise Exception('Malformed PFM header.') 55 | 56 | scale = float(file.readline().rstrip()) 57 | if scale < 0: # little-endian 58 | endian = '<' 59 | scale = -scale 60 | else: 61 | endian = '>' # big-endian 62 | 63 | data = np.fromfile(file, endian + 'f') 64 | shape = (height, width, 3) if color else (height, width) 65 | 66 | data = np.reshape(data, shape) 67 | data = np.flipud(data) 68 | return data 69 | 70 | def writeFlow(filename,uv,v=None): 71 | """ Write optical flow to file. 72 | 73 | If v is None, uv is assumed to contain both u and v channels, 74 | stacked in depth. 75 | Original code by Deqing Sun, adapted from Daniel Scharstein. 
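    A minimal usage sketch (assuming u and v are H x W float32 arrays):

        writeFlow('flow.flo', np.stack([u, v], axis=-1))
        # equivalently: writeFlow('flow.flo', u, v)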
76 | """ 77 | nBands = 2 78 | 79 | if v is None: 80 | assert(uv.ndim == 3) 81 | assert(uv.shape[2] == 2) 82 | u = uv[:,:,0] 83 | v = uv[:,:,1] 84 | else: 85 | u = uv 86 | 87 | assert(u.shape == v.shape) 88 | height,width = u.shape 89 | f = open(filename,'wb') 90 | # write the header 91 | f.write(TAG_CHAR) 92 | np.array(width).astype(np.int32).tofile(f) 93 | np.array(height).astype(np.int32).tofile(f) 94 | # arrange into matrix form 95 | tmp = np.zeros((height, width*nBands)) 96 | tmp[:,np.arange(width)*2] = u 97 | tmp[:,np.arange(width)*2 + 1] = v 98 | tmp.astype(np.float32).tofile(f) 99 | f.close() 100 | 101 | 102 | def readFlowKITTI(filename): 103 | flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) 104 | flow = flow[:,:,::-1].astype(np.float32) 105 | flow, valid = flow[:, :, :2], flow[:, :, 2] 106 | flow = (flow - 2**15) / 64.0 107 | return flow, valid 108 | 109 | def readDispKITTI(filename): 110 | disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 111 | valid = disp > 0.0 112 | flow = np.stack([-disp, np.zeros_like(disp)], -1) 113 | return flow, valid 114 | 115 | 116 | def writeFlowKITTI(filename, uv): 117 | uv = 64.0 * uv + 2**15 118 | valid = np.ones([uv.shape[0], uv.shape[1], 1]) 119 | uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) 120 | cv2.imwrite(filename, uv[..., ::-1]) 121 | 122 | 123 | def read_gen(file_name, pil=False): 124 | ext = splitext(file_name)[-1] 125 | if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': 126 | return Image.open(file_name) 127 | elif ext == '.bin' or ext == '.raw': 128 | return np.load(file_name) 129 | elif ext == '.flo': 130 | return readFlow(file_name).astype(np.float32) 131 | elif ext == '.pfm': 132 | flow = readPFM(file_name).astype(np.float32) 133 | if len(flow.shape) == 2: 134 | return flow 135 | else: 136 | return flow[:, :, :-1] 137 | return [] -------------------------------------------------------------------------------- /core/utils/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | import numpy as np 4 | from scipy import interpolate 5 | 6 | 7 | class InputPadder: 8 | """ Pads images such that dimensions are divisible by 8 """ 9 | def __init__(self, dims, mode='sintel'): 10 | self.ht, self.wd = dims[-2:] 11 | pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 12 | pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 13 | if mode == 'sintel': 14 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] 15 | else: 16 | self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] 17 | 18 | def pad(self, *inputs): 19 | return [F.pad(x, self._pad, mode='replicate') for x in inputs] 20 | 21 | def unpad(self,x): 22 | ht, wd = x.shape[-2:] 23 | c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] 24 | return x[..., c[0]:c[1], c[2]:c[3]] 25 | 26 | def forward_interpolate(flow): 27 | flow = flow.detach().cpu().numpy() 28 | dx, dy = flow[0], flow[1] 29 | 30 | ht, wd = dx.shape 31 | x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) 32 | 33 | x1 = x0 + dx 34 | y1 = y0 + dy 35 | 36 | x1 = x1.reshape(-1) 37 | y1 = y1.reshape(-1) 38 | dx = dx.reshape(-1) 39 | dy = dy.reshape(-1) 40 | 41 | valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) 42 | x1 = x1[valid] 43 | y1 = y1[valid] 44 | dx = dx[valid] 45 | dy = dy[valid] 46 | 47 | flow_x = interpolate.griddata( 48 | (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) 49 | 50 | flow_y = interpolate.griddata( 51 | (x1, y1), dy, (x0, y0), 
method='nearest', fill_value=0) 52 | 53 | flow = np.stack([flow_x, flow_y], axis=0) 54 | return torch.from_numpy(flow).float() 55 | 56 | 57 | def bilinear_sampler(img, coords, mode='bilinear', mask=False): 58 | """ Wrapper for grid_sample, uses pixel coordinates """ 59 | H, W = img.shape[-2:] 60 | xgrid, ygrid = coords.split([1,1], dim=-1) 61 | xgrid = 2*xgrid/(W-1) - 1 62 | ygrid = 2*ygrid/(H-1) - 1 63 | 64 | grid = torch.cat([xgrid, ygrid], dim=-1) 65 | img = F.grid_sample(img, grid, align_corners=True) 66 | 67 | if mask: 68 | mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) 69 | return img, mask.float() 70 | 71 | return img 72 | 73 | 74 | def coords_grid(batch, ht, wd): 75 | coords = torch.meshgrid(torch.arange(ht), torch.arange(wd)) 76 | coords = torch.stack(coords[::-1], dim=0).float() 77 | return coords[None].repeat(batch, 1, 1, 1) 78 | 79 | 80 | def upflow8(flow, mode='bilinear'): 81 | new_size = (8 * flow.shape[2], 8 * flow.shape[3]) 82 | return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) 83 | -------------------------------------------------------------------------------- /dependencies.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | import os 4 | import argparse 5 | import glob 6 | import time 7 | import cv2 as cv 8 | import cv2 9 | import numpy as np 10 | import torch 11 | import torchvision.models as models 12 | import torchvision.transforms as transforms 13 | import torchvision 14 | from raft import RAFT 15 | from utils import flow_viz 16 | from utils.utils import InputPadder 17 | from PIL import Image 18 | 19 | # detector class names 20 | coco_names = [ 21 | '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 22 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', 23 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 24 | 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 25 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 26 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 27 | 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 28 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 29 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 30 | 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 31 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 32 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' 33 | ] 34 | 35 | ID = 0 36 | FRAME_NUMBER = 0 37 | 38 | # utillity fuction for no-max-suppression, that filters ambiguous bounding boxes 39 | def nms(dets, confidence, thresh): 40 | x1 = dets[:, 0].detach().cpu().numpy() 41 | y1 = dets[:, 1].detach().cpu().numpy() 42 | x2 = dets[:, 2].detach().cpu().numpy() 43 | y2 = dets[:, 3].detach().cpu().numpy() 44 | scores = confidence.detach().cpu().numpy() 45 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 46 | order = scores.argsort()[::-1] 47 | 48 | keep = [] 49 | while order.size > 0: 50 | j = order[0] 51 | keep.append(j) 52 | xx1 = np.maximum(x1[j], x1[order[1:]]) 53 | yy1 = np.maximum(y1[j], y1[order[1:]]) 54 | xx2 = np.minimum(x2[j], x2[order[1:]]) 55 | yy2 = np.minimum(y2[j], y2[order[1:]]) 56 | 57 | w = np.maximum(0.0, xx2 - xx1 + 1) 58 | h = np.maximum(0.0, yy2 - yy1 + 1) 59 
| inter = w * h 60 | ovr = inter / (areas[j] + areas[order[1:]] - inter) 61 | 62 | inds = np.where(ovr <= thresh)[0] 63 | order = order[inds + 1] 64 | 65 | return keep 66 | 67 | # Calculate model metrics 68 | def evaluate(ground_truth, inference): 69 | return true_positive, true_negative, false_positive, false_negative 70 | -------------------------------------------------------------------------------- /frcnn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('core') 3 | import os 4 | import argparse 5 | import glob 6 | import time 7 | import cv2 as cv 8 | import cv2 9 | import numpy as np 10 | import torch 11 | import torchvision.models as models 12 | import torchvision.transforms as transforms 13 | import torchvision 14 | from raft import RAFT 15 | from utils import flow_viz 16 | from utils.utils import InputPadder 17 | from PIL import Image 18 | 19 | # utillity fuction for no-max-suppression, that filters ambiguous bounding boxes 20 | def nms(dets, confidence, thresh): 21 | x1 = dets[:, 0].detach().cpu().numpy() 22 | y1 = dets[:, 1].detach().cpu().numpy() 23 | x2 = dets[:, 2].detach().cpu().numpy() 24 | y2 = dets[:, 3].detach().cpu().numpy() 25 | scores = confidence.detach().cpu().numpy() 26 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 27 | order = scores.argsort()[::-1] 28 | 29 | keep = [] 30 | while order.size > 0: 31 | j = order[0] 32 | keep.append(j) 33 | xx1 = np.maximum(x1[j], x1[order[1:]]) 34 | yy1 = np.maximum(y1[j], y1[order[1:]]) 35 | xx2 = np.minimum(x2[j], x2[order[1:]]) 36 | yy2 = np.minimum(y2[j], y2[order[1:]]) 37 | 38 | w = np.maximum(0.0, xx2 - xx1 + 1) 39 | h = np.maximum(0.0, yy2 - yy1 + 1) 40 | inter = w * h 41 | ovr = inter / (areas[j] + areas[order[1:]] - inter) 42 | 43 | inds = np.where(ovr <= thresh)[0] 44 | order = order[inds + 1] 45 | 46 | return keep 47 | 48 | # detector class names 49 | coco_names = [ 50 | '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 51 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', 52 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 53 | 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 54 | 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 55 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 56 | 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 57 | 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 58 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 59 | 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 60 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 61 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush' 62 | ] 63 | 64 | # tresholding for bouding box selection 65 | SCORE_THRESHOLD = 0.7 66 | IOU_THRESHOLD = 0.5 67 | 68 | COLORS = np.random.uniform(0, 255, size=(len(coco_names), 3)) 69 | # input form 70 | # python frcnn.py --model=models/raft-things.pth --path=demo-frames 71 | parser = argparse.ArgumentParser() 72 | parser.add_argument('--model', help="restore checkpoint") 73 | parser.add_argument('--path', help="dataset for evaluation") 74 | parser.add_argument('--small', action='store_true', help='use small model') 75 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 
76 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 77 | args = parser.parse_args() 78 | # Define GPU usage 79 | DEVICE = 'cuda' 80 | # Set input video 81 | cap = cv.VideoCapture("1-fps.mp4") 82 | # get and drop first Frame 83 | ret, current_img = cap.read() 84 | count = 1 85 | ############################################################################### 86 | # FASTER REGION CONVOLUTIONAL NETWORK 87 | # load faster r-cnn to gpu 88 | frcnn = models.detection.fasterrcnn_resnet50_fpn(pretrained=True) 89 | frcnn.to(DEVICE) 90 | # set model to inference mode 91 | frcnn.eval() 92 | # set transformation to prepare image for network input 93 | transform = transforms.Compose([transforms.ToTensor()]) 94 | ############################################################################### 95 | # Recurrent All-Pairs Field Transforms for Optical Flow 96 | raft = torch.nn.DataParallel(RAFT(args)) 97 | raft.load_state_dict(torch.load(args.model)) 98 | raft = raft.module 99 | raft.to(DEVICE) 100 | raft.eval() 101 | ############################################################################### 102 | #ouput video setup 103 | #h, w = current_img.shape[:2] 104 | #fourcc = cv2.VideoWriter_fourcc(*"mp4v") 105 | #out = cv2.VideoWriter("output.mp4", fourcc, 5.0, (2*w, 2*h)) 106 | ############################################################################### 107 | # Run until video is finished 108 | while(cap.isOpened()): 109 | count = count +1 110 | start = time.time() 111 | # get frame 112 | ret, frame = cap.read() 113 | frame_1 = frame.copy() 114 | # convert image to torch tensor 115 | frcnn_img = transform(frame) 116 | # send input data to GPU 117 | frcnn_img = frcnn_img.to(DEVICE) 118 | # process inference 119 | detections = frcnn([frcnn_img]) 120 | 121 | boxes = detections[0]['boxes'] 122 | confidences = detections[0]['scores'] 123 | class_id = detections[0]['labels'] 124 | 125 | bbox = frame 126 | 127 | idxs = nms(boxes,confidences, IOU_THRESHOLD) 128 | 129 | for i in idxs: 130 | if confidences[i] > SCORE_THRESHOLD: 131 | if class_id[i] in [2,3,4,6,8]: 132 | color = COLORS[5] 133 | cv2.rectangle(bbox, (int(boxes[i][0]), int(boxes[i][1])), (int(boxes[i][2]), int(boxes[i][3])), color, 2) 134 | #cv2.imwrite('detection{:06d}'.format(count) + '.png',image) 135 | 136 | ############################################################################ 137 | past_img = current_img 138 | current_img = frame_1 139 | 140 | raft_img_1 = np.array(cv2.medianBlur(past_img,5)).astype(np.uint8) 141 | raft_img_1 = torch.from_numpy(raft_img_1).permute(2, 0, 1).float() 142 | raft_img_1 = raft_img_1[None].to(DEVICE) 143 | 144 | raft_img_2 = np.array(cv2.medianBlur(current_img,5)).astype(np.uint8) 145 | raft_img_2 = torch.from_numpy(raft_img_2).permute(2, 0, 1).float() 146 | raft_img_2 = raft_img_2[None].to(DEVICE) 147 | 148 | #padder = InputPadder(raft_img_1.shape) 149 | #raft_img_2, raft_img_1 = padder.pad(raft_img_2, raft_img_1) 150 | flow_low, flow_up = raft(raft_img_2, raft_img_1, iters=5, test_mode=True) 151 | 152 | flow_up = flow_up[0].permute(1,2,0).detach().cpu().numpy() 153 | flow_up = flow_viz.flow_to_image(flow_up) 154 | #cv2.imwrite('{:06d}'.format(count) + '.png',flow_up) 155 | 156 | merge = cv2.addWeighted(bbox, 1, flow_up, .5, 0) 157 | merge_img1 = np.concatenate((frame_1,bbox), axis = 0) 158 | merge_img2 = np.concatenate((flow_up, merge), axis = 0) 159 | merge_img = np.concatenate((merge_img1,merge_img2), axis = 1) 160 | cv2.imshow('image',merge_img) 161 | 
cv2.waitKey(1) 162 | #out.write(merge_img) 163 | #cv2.imwrite('{:06d}'.format(count) + '.png',merge_img) 164 | print('elapsed time: {}'.format(time.time()-start)) 165 | 166 | cap.release() 167 | cv2.destroyAllWindows() 168 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Get dependencies 2 | import sys 3 | import dependencies 4 | sys.path.append('yolo') 5 | sys.path.append('core') 6 | import math 7 | import glob 8 | import os 9 | import time 10 | import cv2 11 | import numpy as np 12 | from PIL import Image 13 | import torch 14 | import torchvision.models as models 15 | import torchvision.transforms as transforms 16 | from raft import RAFT 17 | from utils import flow_viz 18 | from utils.utils import InputPadder 19 | from inference import post_process 20 | import argparse 21 | from model import YoloNetV3 22 | import matplotlib.pyplot as plt 23 | from datetime import datetime 24 | #------------------------------------------------------------------------------- 25 | # Parameters 26 | #------------------------------------------------------------------------------- 27 | # Set Input type - image, video 28 | media_type = 'image' 29 | # Location of image folder or video file 30 | location = 'dataset/01/' 31 | # Set xml to true if ground truth data is in xml and False to txt file 32 | xml = False 33 | # Export results in video file 34 | video_out = True 35 | # Show Result in a pop up window frame by frame 36 | preview_result = False 37 | # Calculate and output metrics 38 | metrics_out = True 39 | # Initialize regions of no interest 40 | regions = [] 41 | #--------------------------------------------------------------- 42 | # Global Variables 43 | #--------------------------------------------------------------- 44 | # Detector IoU treshold 45 | IOU_THRESHOLD = 0.4 46 | # Metrics IoU Treshold 47 | EVAL_TRESHOLD = 0.5 48 | # Select GPU as target 49 | DEVICE = 'cuda' 50 | 51 | #--------------------------------------------------------------- 52 | # Input Image Sequence Handler Class 53 | #--------------------------------------------------------------- 54 | class InputData: 55 | # Initializer handling both image and video as input data 56 | def __init__(self, media_type, location): 57 | self.type = media_type 58 | if self.type == 'image': 59 | # get image file list and sort by filename 60 | self.images = glob.glob(os.path.join(location, '*.png')) 61 | self.images = sorted(self.images) 62 | 63 | if self.type == 'video': 64 | # Start video object 65 | self.images = cv2.VideoCapture(location) 66 | # Helper function to get next frame in sequence of iamges or video 67 | def get_next_frame(self): 68 | try: 69 | if self.type == 'image': 70 | imfile = self.images[FRAME_NUMBER] 71 | image = np.array(Image.open(imfile)).astype(np.uint8) 72 | #image = cv2.resize(image, (320,240), interpolation = cv2.INTER_AREA) 73 | return image 74 | 75 | if self.type == 'video': 76 | _, image = self.images.read() 77 | #image = cv2.resize(image, (320,240), interpolation = cv2.INTER_AREA) 78 | return image 79 | # In case reach end of sequence 80 | except: 81 | return False 82 | 83 | 84 | #--------------------------------------------------------------- 85 | #--------------------------------------------------------------- 86 | #--------------------------------------------------------------- 87 | class Vehicle: 88 | def __init__(self,frame_number, detection, flow): 89 | global ID 90 | global FRAME_NUMBER 91 
| self.x, self.y, self.w, self.h = detection 92 | self.x_dot, self.y_dot = flow 93 | self.first_frame = FRAME_NUMBER 94 | self.gt_id = [] 95 | self.last_seen = 0 96 | self.veh_id = ID 97 | ID = ID + 1 98 | 99 | def update_full(self, frame_number, detection, flow): 100 | x, y, self.w, self.h = detection 101 | self.last_seen = 0 102 | self.x_dot = x - self.x 103 | self.y_dot = y - self.y 104 | self.x = x 105 | self.y = y 106 | 107 | def update_partial(self, flow): 108 | self.last_seen += 1 109 | #self.x_dot = self.x_dot *0.1 110 | #self.y_dot = self.y_dot *0.1 +flow[1] *0.9 111 | 112 | def predict(self): 113 | self.x = self.x + self.x_dot 114 | self.y = self.y + self.y_dot 115 | 116 | def bounds(self): 117 | for region in regions: 118 | if iou(region, [self.x, self.y, self.x+self.w, self.y+self.h])>0.7: 119 | return False 120 | if self.last_seen > 5: 121 | return False 122 | if self.x < IMG_X_MAX and self.x > 0 and self.y > 0 and self.y < IMG_Y_MAX: 123 | return True 124 | else: 125 | return False 126 | 127 | def check_id(self, gt_id): 128 | if self.gt_id == []: 129 | self.gt_id.append(gt_id) 130 | return 0 131 | 132 | if gt_id in self.gt_id: 133 | return 0 134 | else: 135 | self.gt_id.append(gt_id) 136 | return 1 137 | 138 | 139 | #--------------------------------------------------------------- 140 | #--------------------------------------------------------------- 141 | #--------------------------------------------------------------- 142 | class Frame: 143 | def __init__(self, detection, flow , vehicles): 144 | global FRAME_NUMBER 145 | self.frame_number = FRAME_NUMBER 146 | self.bounding_boxes = detection 147 | self.optical_flow = flow 148 | self.prior_vehicles = vehicles 149 | self.measurement = {} 150 | self.update_veh = [] 151 | self.predict_veh = [] 152 | FRAME_NUMBER = FRAME_NUMBER + 1 153 | 154 | def match(self): 155 | iou_matrix = np.zeros((len(self.bounding_boxes),len(self.prior_vehicles))) 156 | i = 0 157 | j = 0 158 | for box in self.bounding_boxes: 159 | for vehicle in self.prior_vehicles: 160 | vehicle_box = [vehicle.x, vehicle.y, vehicle.w + vehicle.x, vehicle.h + vehicle.y] 161 | detection_box = [box[0], box[1], box[0]+box[2], box[1]+box[3]] 162 | iou_matrix[i][j] = iou(detection_box , vehicle_box) 163 | j += 1 164 | i += 1 165 | j = 0 166 | 167 | full = 0 168 | initialize = 0 169 | partial = 0 170 | 171 | for i in range(len(self.bounding_boxes)): 172 | if max(iou_matrix[i]) > 0.5: 173 | full += 1 174 | idx = int(np.where(iou_matrix[i] == iou_matrix[i].max())[0][0]) 175 | self.measurement[self.prior_vehicles[idx].veh_id] = ['full',self.bounding_boxes[i], self.average_flow(self.bounding_boxes[i]), self.prior_vehicles[idx]] 176 | else: 177 | initialize += 1 178 | vehicle = Vehicle(self.frame_number, self.bounding_boxes[i], self.average_flow(self.bounding_boxes[i])) 179 | self.measurement[vehicle.veh_id] = ['initialize', 0, 0, vehicle] 180 | 181 | for vehicle in self.prior_vehicles: 182 | if vehicle.veh_id not in self.measurement: 183 | partial += 1 184 | bbox = [vehicle.x, vehicle.y, vehicle.w, vehicle.h] 185 | self.measurement[vehicle.veh_id] = ['partial', 0, self.average_flow(bbox), vehicle] 186 | 187 | #print('Full: ' + str(full) + ' Partial: ' + str(partial) + ' New: ' + str(initialize)) 188 | 189 | def update(self): 190 | for item in self.measurement.values(): 191 | if item[0] == 'full': 192 | item[3].update_full(self.frame_number, item[1], item[2]) 193 | if item[0] == 'partial': 194 | item[3].update_partial(item[2]) 195 | if item[3].bounds(): 196 | 
self.update_veh.append(item[3]) 197 | 198 | def predict(self): 199 | for vehicle in self.update_veh: 200 | vehicle.predict() 201 | if vehicle.bounds(): 202 | self.predict_veh.append(vehicle) 203 | 204 | def average_flow(self, bbox): 205 | x, y, w, h = bbox 206 | direction_x = np.average(self.optical_flow[int(y) : int(y +h) , int(x) : int(x+w), 0]) 207 | direction_y = np.average(self.optical_flow[int(y) : int(y +h) , int(x):int(x+w) , 1]) 208 | return [direction_x, direction_y] 209 | 210 | def get_bbox(self): 211 | bbox = [] 212 | for vehicle in self.update_veh: 213 | bbox.append([int(vehicle.x), int(vehicle.y), int(vehicle.x + vehicle.w), int(vehicle.y + vehicle.h)]) 214 | return bbox 215 | 216 | #--------------------------------------------------------------- 217 | #--------------------------------------------------------------- 218 | #--------------------------------------------------------------- 219 | class Detector: 220 | def __init__(self, type): 221 | self.type = type 222 | if type == 'frcnn': 223 | # load faster r-cnn 224 | self.detector = models.detection.fasterrcnn_resnet50_fpn(pretrained=True) 225 | # send model to gpu 226 | self.detector.to(DEVICE) 227 | # set model to inference mode 228 | self.detector.eval() 229 | # set transformation to prepare image for network input 230 | self.transform = transforms.Compose([transforms.ToTensor()]) 231 | if type == 'yolo': 232 | weight_path = 'weights/yolov3_original.pt' 233 | # load faster r-cnn 234 | self.detector = YoloNetV3(nms=False) 235 | # load weights 236 | self.detector.load_state_dict(torch.load(weight_path)) 237 | # send model to gpu 238 | self.detector.to(DEVICE) 239 | # set model to inference mode 240 | self.detector.eval() 241 | # set transformation to prepare image for network input 242 | self.transform = transforms.Compose([transforms.ToTensor()]) 243 | if type == 'sinet': 244 | print('SINet') 245 | 246 | def inference(self, image): 247 | if self.type == 'frcnn': 248 | # convert image to torch tensor 249 | input = self.transform(image) 250 | # send input data to GPU 251 | input = input.to(DEVICE) 252 | # process inference and get detections 253 | detections = self.detector([input]) 254 | boxes = detections[0]['boxes'] 255 | confidence = detections[0]['scores'] 256 | class_id = detections[0]['labels'] 257 | self.result = self.filter_detection(boxes, confidence, class_id) 258 | 259 | if self.type == 'yolo': 260 | # convert image to torch tensor 261 | im = Image.fromarray(image) 262 | input = self.transform(im.resize((IMG_X_MAX,IMG_X_MAX),Image.ANTIALIAS)) 263 | input = input.unsqueeze(0) 264 | # send input data to GPU 265 | input = input.to(DEVICE) 266 | # process inference and get detections 267 | with torch.no_grad(): 268 | detections = self.detector(input) 269 | detections = post_process(detections, True, SCORE_THRESHOLD, IOU_THRESHOLD) 270 | for detection in detections: 271 | detection[..., :4] = untransform_bboxes(detection[..., :4]) 272 | cxcywh_to_xywh(detection) 273 | boxes = detections[0][..., :4] 274 | self.result = boxes.detach().cpu().numpy() 275 | 276 | if self.type == 'sinet': 277 | # convert image to torch tensor 278 | input = self.transform(image) 279 | # send input data to GPU 280 | input = input.to(DEVICE) 281 | # process inference and get detections 282 | detections = self.detector([input]) 283 | boxes = detections[0]['boxes'] 284 | confidence = detections[0]['scores'] 285 | class_id = detections[0]['labels'] 286 | 287 | 288 | def filter_detection(self, detections, confidence, class_id): 289 | x1 = 
detections[:, 0].detach().cpu().numpy() 290 | y1 = detections[:, 1].detach().cpu().numpy() 291 | x2 = detections[:, 2].detach().cpu().numpy() 292 | y2 = detections[:, 3].detach().cpu().numpy() 293 | scores = confidence.detach().cpu().numpy() 294 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 295 | order = scores.argsort()[::-1] 296 | 297 | keep = [] 298 | while order.size > 0: 299 | j = order[0] 300 | keep.append(j) 301 | xx1 = np.maximum(x1[j], x1[order[1:]]) 302 | yy1 = np.maximum(y1[j], y1[order[1:]]) 303 | xx2 = np.minimum(x2[j], x2[order[1:]]) 304 | yy2 = np.minimum(y2[j], y2[order[1:]]) 305 | 306 | w = np.maximum(0.0, xx2 - xx1 + 1) 307 | h = np.maximum(0.0, yy2 - yy1 + 1) 308 | inter = w * h 309 | ovr = inter / (areas[j] + areas[order[1:]] - inter) 310 | 311 | inds = np.where(ovr <= IOU_THRESHOLD)[0] 312 | order = order[inds + 1] 313 | filter = [] 314 | for i in keep: 315 | if confidence[i] >= SCORE_THRESHOLD: 316 | if class_id[i] in [2,3,4,6, 7, 8]: 317 | filter.append([int(x1[i]), int(y1[i]), int(x2[i]-x1[i]), int(y2[i]-y1[i])]) 318 | return filter 319 | 320 | #--------------------------------------------------------------- 321 | #--------------------------------------------------------------- 322 | #--------------------------------------------------------------- 323 | class OpticalFlow: 324 | def __init__(self, type): 325 | self.type = type 326 | if type == 'farneback': 327 | self.type = 'farneback' 328 | 329 | if type == 'raft': 330 | parser = argparse.ArgumentParser() 331 | parser.add_argument('--model', nargs='?', const='raft-models/raft-things.pth', type=str, help="restore checkpoint") 332 | parser.add_argument('--path', nargs='?', const='frames', type=int, help="dataset for evaluation") 333 | parser.add_argument('--small', action='store_true', help='use small model') 334 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 335 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 336 | args = parser.parse_args() 337 | args.model = 'raft-models/raft-things.pth' 338 | self.flow_model = torch.nn.DataParallel(RAFT(args)) 339 | self.flow_model.load_state_dict(torch.load(args.model)) 340 | self.flow_model = self.flow_model.module 341 | self.flow_model.to(DEVICE) 342 | self.flow_model.eval() 343 | 344 | if type == 'flownet': 345 | 346 | print('Flownet') 347 | 348 | 349 | def inference(self,image1,image2): 350 | if self.type == 'farneback': 351 | self.mask = np.zeros_like(image1) 352 | gray1 = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY) 353 | gray2 = cv2.cvtColor(image2, cv2.COLOR_BGR2GRAY) 354 | flow = cv2.calcOpticalFlowFarneback(gray1, gray2, flow=None, 355 | pyr_scale=0.5, levels=10, winsize=15, 356 | iterations=10, poly_n=7, poly_sigma=1.5, 357 | flags=cv2.OPTFLOW_FARNEBACK_GAUSSIAN) 358 | self.result = flow 359 | 360 | if self.type == 'raft': 361 | image1 = torch.from_numpy(image1).permute(2, 0, 1).float() 362 | image1 = image1[None].to(DEVICE) 363 | 364 | image2 = torch.from_numpy(image2).permute(2, 0, 1).float() 365 | image2 = image2[None].to(DEVICE) 366 | 367 | padder = InputPadder(image2.shape) 368 | image1, image2 = padder.pad(image1, image2) 369 | _, flow_up = self.flow_model(image1, image2, iters=5, test_mode=True) 370 | self.result = flow_up[0].permute(1,2,0).detach().cpu().numpy() 371 | 372 | if self.type == 'flownet': 373 | self.flow = True 374 | 375 | def toimage(self): 376 | if self.type == 'raft': 377 | image = flow_viz.flow_to_image(self.result) 378 | return image 379 | if 
self.type == 'farneback': 380 | magnitude, angle = cv2.cartToPolar(self.result[..., 0], self.result[..., 1]) 381 | mask = self.mask 382 | mask[..., 1] = 255 383 | mask[..., 0] = angle * 180 / np.pi / 2 384 | mask[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX) 385 | image = cv2.cvtColor(mask, cv2.COLOR_HSV2BGR) 386 | return image 387 | 388 | 389 | #--------------------------------------------------------------- 390 | # Helper functions 391 | #--------------------------------------------------------------- 392 | # draw vehicle bounding box on input image 393 | def draw_bbox(image, bboxes): 394 | copy = np.copy(image) 395 | for bbox in bboxes: 396 | cv2.rectangle(copy, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), np.random.uniform(0, 255) , 2) 397 | return copy 398 | 399 | # create optical flow + detection mask 400 | def flow_mask(flow, bboxes): 401 | image = flow.toimage() 402 | mask = np.full(image.shape[:2], 0, dtype=np.uint8) 403 | for bbox in bboxes: 404 | cv2.rectangle(mask, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255,255,255), -1) 405 | image = cv2.bitwise_or(np.array(image).astype(np.uint8), np.array(image).astype(np.uint8), mask=mask) 406 | return image 407 | 408 | # create side by side images 409 | def visuzalization(input, bbox, flow, expected): 410 | bbox_img = cv2.resize(draw_evaluation(input, expected, bbox), (IMG_X_MAX, IMG_Y_MAX), interpolation = cv2.INTER_AREA) 411 | flow_visualization = cv2.resize(flow.toimage(), (IMG_X_MAX, IMG_Y_MAX), interpolation = cv2.INTER_AREA) 412 | mask = cv2.resize(flow_mask(flow, bbox), (IMG_X_MAX, IMG_Y_MAX), interpolation = cv2.INTER_AREA) 413 | 414 | img1 = np.concatenate((input,bbox_img), axis = 1) 415 | img2 = np.concatenate((flow_visualization, mask), axis = 1) 416 | merge_img = np.concatenate((img1,img2), axis = 1) 417 | return merge_img 418 | 419 | # update output image window 420 | def display_result(image): 421 | cv2.imshow('image',image) 422 | cv2.waitKey(0) 423 | 424 | def iou(boxA, boxB): 425 | # determine the (x, y)-coordinates of the intersection rectangle 426 | xA = max(boxA[0], boxB[0]) 427 | yA = max(boxA[1], boxB[1]) 428 | xB = min(boxA[2], boxB[2]) 429 | yB = min(boxA[3], boxB[3]) 430 | 431 | # compute the area of intersection rectangle 432 | interArea = abs(max((xB - xA, 0)) * max((yB - yA), 0)) 433 | if interArea == 0: 434 | return 0 435 | # compute the area of both the prediction and ground-truth 436 | # rectangles 437 | boxAArea = abs((boxA[2] - boxA[0]) * (boxA[3] - boxA[1])) 438 | boxBArea = abs((boxB[2] - boxB[0]) * (boxB[3] - boxB[1])) 439 | 440 | # compute the intersection over union by taking the intersection 441 | # area and dividing it by the sum of prediction + ground-truth 442 | # areas - the interesection area 443 | iou = interArea / float(boxAArea + boxBArea - interArea) 444 | 445 | # return the intersection over union value 446 | return iou 447 | 448 | def evaluate(gt , result): 449 | 450 | if len(result) == 0: 451 | return [0,len(gt),0, 0] 452 | 453 | iou_result = np.zeros((len(gt),len(result)), dtype=float) 454 | for i in range(len(gt)): 455 | for j in range(len(result)): 456 | box1 = [gt[i][2], gt[i][3], gt[i][4] , gt[i][5]] 457 | box2 = [result[j].x, result[j].y, result[j].w +result[j].x, result[j].h+result[j].y] 458 | iou_result[i,j] = iou(box1, box2) 459 | tp = 0 460 | fn = 0 461 | fp = 0 462 | ids = 0 463 | 464 | for i in range(len(gt)): 465 | if max(iou_result[i,:]) >= EVAL_TRESHOLD: 466 | tp += 1 467 | idx = np.where(iou_result[i] == 
iou_result[i].max()) 468 | ids += result[int(idx[0][0])].check_id(gt[i][0]) 469 | else: 470 | fn += 1 471 | 472 | for j in range(len(result)): 473 | if max(iou_result[:,j]) < EVAL_TRESHOLD: 474 | fp += 1 475 | 476 | return [tp, fn, fp, ids] 477 | 478 | def draw_evaluation(input, gt, result): 479 | iou_result = np.zeros((len(gt),len(result)), dtype=float) 480 | for i in range(len(gt)): 481 | for j in range(len(result)): 482 | box1 = [gt[i][2], gt[i][3], gt[i][4] , gt[i][5] ] 483 | box2 = [result[j][0], result[j][1], result[j][2] , result[j][3]] 484 | iou_result[i,j] = iou(box1, box2) 485 | copy = np.copy(input) 486 | 487 | for i in range(len(gt)): 488 | if max(iou_result[i,:]) >= EVAL_TRESHOLD: 489 | idx = int(np.where(iou_result[i] == iou_result[i].max())[0][0]) 490 | cv2.rectangle(copy, (int(result[idx][0]), int(result[idx][1])), (int(result[idx][2]), int(result[idx][3])), (255,0,0) , 2) 491 | else: 492 | cv2.rectangle(copy, (int(gt[i][2]), int(gt[i][3])), (int(gt[i][4]), int(gt[i][5])), (0,255,0), 2) 493 | 494 | for j in range(len(result)): 495 | if max(iou_result[:,j]) < EVAL_TRESHOLD: 496 | cv2.rectangle(copy, (int(result[j][0]), int(result[j][1])), (int(result[j][2]), int(result[j][3])), (0,0,255), 2) 497 | return copy 498 | 499 | 500 | def untransform_bboxes(bboxes): 501 | """transform the bounding box from the scaled image back to the unscaled image.""" 502 | x = bboxes[..., 0] 503 | y = bboxes[..., 1] 504 | w = bboxes[..., 2] 505 | h = bboxes[..., 3] 506 | # x, y, w, h = bbs 507 | x /= 1 508 | y /= IMG_X_MAX/IMG_Y_MAX 509 | w /= 1 510 | h /= IMG_X_MAX/IMG_Y_MAX 511 | return bboxes 512 | 513 | def cxcywh_to_xywh(bbox): 514 | bbox[..., 0] -= bbox[..., 2] / 2 515 | bbox[..., 1] -= bbox[..., 3] / 2 516 | return bbox 517 | 518 | if metrics_out: 519 | now = datetime.now() 520 | now = now.strftime('%Y%m%d_%H-%M') 521 | result_text = open(location + now + '.txt','w') 522 | line = 'DETECTOR' + ','+'FLOW' + ',' + 'SCORE_THRESHOLD' + ',' + 'PRECISION' + ',' + 'RECALL' + ',' + 'MOTA' + ',' + 'FPS' + '\n' 523 | result_text.writelines((line)) 524 | # open csv file 525 | 526 | results = [] 527 | for detector_type in ['frcnn', 'yolo']: 528 | for flow_type in ['raft', 'farneback']: 529 | metrics_all = [] 530 | print('---------------------------------------------------------------') 531 | print('Detector: ' + detector_type) 532 | print('Flow: ' + flow_type) 533 | print('---------------------------------------------------------------') 534 | for SCORE_THRESHOLD in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]: 535 | performance = [0,0,0,0] 536 | print('Score Treshold: ' + str(SCORE_THRESHOLD)) 537 | if detector_type == 'yolo' and SCORE_THRESHOLD ==0: 538 | SCORE_THRESHOLD = 0.001 539 | print('---------------------------------------------------------------') 540 | ID = 1 541 | FRAME_NUMBER = 2 542 | #------------------------------------------------------------------------------- 543 | # INITIALIZE INPUT DATA -------------------------------------------------------- 544 | #------------------------------------------------------------------------------- 545 | input = InputData(media_type, location) 546 | current_frame = input.get_next_frame() 547 | 548 | # INITIALIZE DETECTOR 549 | detector = Detector(detector_type) 550 | detector.inference(current_frame) 551 | inital_veh = [] 552 | 553 | for detection in detector.result: 554 | vehicle = Vehicle(FRAME_NUMBER, detection, [0 ,0]) 555 | inital_veh.append(vehicle) 556 | # INITIALIZE Optical Flow 557 | flow = OpticalFlow(flow_type) 558 | 
#------------------------------------------------------------------------------- 559 | if video_out: 560 | h, w = current_frame.shape[:2] 561 | fourcc = cv2.VideoWriter_fourcc(*"mp4v") 562 | name = detector_type+'-'+flow_type+ '-' + str(SCORE_THRESHOLD) + '.mp4' 563 | out = cv2.VideoWriter(os.path.join(location, name), fourcc, 5.0, (4*w, h)) 564 | #------------------------------------------------------------------------------- 565 | 566 | #------------------------------------------------------------------------------- 567 | if metrics_out: 568 | if xml: 569 | import xml.etree.ElementTree as ET 570 | root = ET.parse(os.path.join(location, 'gt.xml')).getroot() 571 | gt = [] 572 | for frame in root.findall('frame'): 573 | frame_id = frame.get('num') 574 | vehicles = frame.find('target_list') 575 | for vehicle in vehicles: 576 | veh_id = vehicle.get('id') 577 | x = vehicle.find('box').get('left') 578 | y = vehicle.find('box').get('top') 579 | w = vehicle.find('box').get('width') 580 | h = vehicle.find('box').get('height') 581 | gt.append([int(frame_id), int(veh_id), float(x), float(y), float(w)+float(x), float(h)+float(y)]) 582 | for region in root.find('ignored_region').findall('box'): 583 | regions.append([float(region.get('left')),float(region.get('top')),float(region.get('left'))+float(region.get('width')),float(region.get('top'))+float(region.get('height'))]) 584 | else: 585 | gt_text = open(location + 'gt.txt','r') 586 | gt_text = gt_text.readlines() 587 | gt = [] 588 | for line in gt_text: 589 | data = line.split(',') 590 | gt.append([int(data[0]), int(data[1]), float(data[2]), float(data[3]), float(data[4])+ float(data[2]), float(data[5]) + float(data[3])]) 591 | 592 | # open csv file 593 | #------------------------------------------------------------------------------- 594 | if metrics_out: 595 | times = [] 596 | #------------------------------------------------------------------------------- 597 | # MAIN LOOP 598 | #------------------------------------------------------------------------------- 599 | while(current_frame is not False): 600 | #print('Frame No: ' + str(FRAME_NUMBER) + ' Veh. 
No.: ' + str(ID)) 601 | # read Image 602 | if metrics_out: 603 | start = time.time() 604 | # get image pair 605 | previous_frame = current_frame 606 | current_frame = input.get_next_frame() 607 | # check if reached end frame 608 | if current_frame is False: 609 | break 610 | IMG_Y_MAX, IMG_X_MAX, _ = current_frame.shape 611 | # run detection 612 | detector.inference(current_frame) 613 | # run flow 614 | flow.inference(current_frame, previous_frame) 615 | # create frame 616 | if FRAME_NUMBER == 2: # first pair 617 | frame = Frame(detector.result, flow.result, inital_veh) 618 | else: 619 | frame.predict() 620 | frame = Frame(detector.result, flow.result, frame.predict_veh) 621 | # match 622 | frame.match() 623 | # update 624 | frame.update() 625 | 626 | ############################################################################################################################# 627 | # LOGGING RESULTS 628 | ############################################################################################################################# 629 | 630 | 631 | if metrics_out: 632 | processing_time = time.time()-start 633 | times.append(processing_time) 634 | #print('elapsed time: {}'.format(processing_time)) 635 | 636 | expected = [item for item in gt if item[0] == FRAME_NUMBER] 637 | #result = [] 638 | #for vehicle in frame.update_veh: 639 | # result.append([vehicle.veh_id , vehicle.x , vehicle.y, vehicle.w +vehicle.x , vehicle.h+vehicle.y]) 640 | tp, fn , fp, ids = evaluate(expected , frame.update_veh) 641 | performance[0] += tp 642 | performance[1] += fn 643 | performance[2] += fp 644 | performance[3] += ids 645 | #print(str(FRAME_NUMBER) + ' | TP : ' + str(tp) + ' | FN : ' + str(fn) + ' | FP : ' + str(fp)) 646 | 647 | if video_out: 648 | expected = [item for item in gt if item[0] == FRAME_NUMBER] 649 | image = visuzalization(current_frame, frame.get_bbox(), flow, expected) 650 | out.write(image) 651 | 652 | if preview_result: 653 | expected = [item for item in gt if item[0] == FRAME_NUMBER] 654 | image = visuzalization(current_frame, frame.get_bbox(), flow, expected) 655 | display_result(image) 656 | 657 | ################################################################################################################################### 658 | # HANDLING CLOSURES 659 | ################################################################################################################################### 660 | 661 | if media_type == 'video': 662 | input.images.release() 663 | 664 | if preview_result: 665 | cv2.destroyAllWindows() 666 | 667 | 668 | if metrics_out: 669 | #print('RESULTS @ IoU Treshold ' + str(EVAL_TRESHOLD)) 670 | try: 671 | precision = performance[0] / (performance[0] + performance[2]) 672 | except: 673 | precision = 0.0 674 | 675 | try: 676 | recall = performance[0] / (performance[0] + performance[1]) 677 | except: 678 | recall = 0.0 679 | 680 | try: 681 | mota = 1 - (performance[1] + performance[2] + performance[3])/ (performance[0] + performance[1]) 682 | except: 683 | mota = 0.0 684 | 685 | 686 | metrics_all.append([recall,precision,mota]) 687 | 688 | 689 | average = 1/np.average(times) 690 | 691 | print ('Average Frames Per Second: ' + str(average)) 692 | print ('Precision: ' + str(precision)) 693 | print ('Recall: ' + str(recall)) 694 | print ('MOTA: ' + str(mota)) 695 | print ('###############################################################') 696 | line = detector_type + ','+ flow_type + ',' + str(SCORE_THRESHOLD) + ',' + str(precision) + ',' + str(recall) + ',' + str(mota) + ',' + 
str(average) + '\n' 697 | result_text.writelines((line)) 698 | 699 | if metrics_out: 700 | axis = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] 701 | metrics_all.sort() 702 | metrics_all = np.array(metrics_all) 703 | corrected_precision = np.interp(axis,metrics_all[:,0],metrics_all[:,1] ) 704 | AP = np.average(corrected_precision) 705 | print ('Average Precision: ' + str(AP)) 706 | print ('###############################################################') 707 | print ('###############################################################') 708 | 709 | import matplotlib.pyplot as plt 710 | plt.scatter(axis, corrected_precision) 711 | plt.plot(axis,corrected_precision) 712 | plt.title(detector_type + ' - ' + flow_type) 713 | plt.xlabel("recall") 714 | plt.ylabel("Precision") 715 | plt.show() 716 | 717 | 718 | if metrics_out: 719 | result_text.close() 720 | -------------------------------------------------------------------------------- /out.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/out.mp4 -------------------------------------------------------------------------------- /output-1-fps.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/output-1-fps.mp4 -------------------------------------------------------------------------------- /output.avi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/output.avi -------------------------------------------------------------------------------- /output.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/output.mp4 -------------------------------------------------------------------------------- /raft-models/raft-chairs.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/raft-models/raft-chairs.pth -------------------------------------------------------------------------------- /raft-models/raft-kitti.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/raft-models/raft-kitti.pth -------------------------------------------------------------------------------- /raft-models/raft-sintel.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/raft-models/raft-sintel.pth -------------------------------------------------------------------------------- /raft-models/raft-small.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/raft-models/raft-small.pth -------------------------------------------------------------------------------- 
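The .pth files in raft-models/ are the pretrained RAFT checkpoints that frcnn.py, main.py, and vehicle.py load at start-up. A minimal loading sketch, mirroring those scripts (the Namespace below carries only the flags their argparse blocks expose; any further attributes RAFT expects depend on core/raft.py, which is assumed to follow the upstream RAFT implementation):

    import sys
    sys.path.append('core')
    import argparse
    import torch
    from raft import RAFT

    # flags defined by the argparse blocks in frcnn.py / main.py / vehicle.py
    args = argparse.Namespace(small=False, mixed_precision=False, alternate_corr=False)

    # the scripts wrap RAFT in DataParallel before loading, so the checkpoint
    # keys carry the 'module.' prefix
    raft = torch.nn.DataParallel(RAFT(args))
    raft.load_state_dict(torch.load('raft-models/raft-things.pth'))
    raft = raft.module.to('cuda').eval()   # unwrap and switch to inference mode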
/raft-models/raft-things.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/apbraga/Real-Time-Vehicle-Detection-and-Tracking/85db5cc948575ac1fdcad0de90c215c3386c8ae1/raft-models/raft-things.pth -------------------------------------------------------------------------------- /vehicle.py: -------------------------------------------------------------------------------- 1 | import dependencies 2 | import glob 3 | import os 4 | import cv2 5 | import numpy as np 6 | from PIL import Image 7 | import torch 8 | import torchvision.models as models 9 | import torchvision.transforms as transforms 10 | from raft import RAFT 11 | from utils import flow_viz 12 | from utils.utils import InputPadder 13 | import argparse 14 | from utils import flow_viz 15 | #--------------------------------------------------------------- 16 | #GLOBAL VARIABLES----------------------------------------------- 17 | #--------------------------------------------------------------- 18 | ID = 0 19 | FRAME_NUMBER = 1 20 | SCORE_THRESHOLD = 0.5 21 | IOU_THRESHOLD = 0.5 22 | DEVICE = 'cuda' 23 | #--------------------------------------------------------------- 24 | #--------------------------------------------------------------- 25 | #--------------------------------------------------------------- 26 | class InputData: 27 | def __init__(self, media_type, location): 28 | self.type = media_type 29 | if self.type == 'image': 30 | self.images = glob.glob(os.path.join(location)) 31 | self.images = sorted(self.images) 32 | 33 | if self.type == 'video': 34 | self.images = cv2.VideoCapture(location) 35 | 36 | def get_next_frame(self): 37 | try: 38 | if self.type == 'image': 39 | imfile = self.images[0] 40 | image = np.array(Image.open(imfile)).astype(np.uint8) 41 | image = cv2.resize(image, (320,240), interpolation = cv2.INTER_AREA) 42 | return image 43 | 44 | if self.type == 'video': 45 | _, image = self.images.read() 46 | image = cv2.resize(image, (320,240), interpolation = cv2.INTER_AREA) 47 | return image 48 | except: 49 | return False 50 | 51 | #--------------------------------------------------------------- 52 | #--------------------------------------------------------------- 53 | #--------------------------------------------------------------- 54 | class Vehicle: 55 | def __init__(self,frame_number, detection, flow): 56 | global ID 57 | global FRAME_NUMBER 58 | self.x, self.y, self.w, self.h = detection 59 | self.x_dot, self.y_dot = flow 60 | self.first_frame = FRAME_NUMBER 61 | self.last_seen = FRAME_NUMBER 62 | self.veh_id = ID 63 | ID = ID + 1 64 | 65 | def update_full(self, frame_number, detection, flow): 66 | self.last_seen = FRAME_NUMBER 67 | 68 | def update_partial(self, flow): 69 | self.x, self.y 70 | self.x_dot, self.y_dot 71 | 72 | def predict(self): 73 | self.x = self.x + self.x_dot 74 | self.y = self.y + self.y_dot 75 | 76 | def bounds(self): 77 | return True 78 | #--------------------------------------------------------------- 79 | #--------------------------------------------------------------- 80 | #--------------------------------------------------------------- 81 | class Frame: 82 | def __init__(self, detection, flow , vehicles): 83 | global FRAME_NUMBER 84 | self.frame_number = FRAME_NUMBER 85 | self.bounding_boxes = detection 86 | self.optical_flow = flow 87 | self.prior_vehicles = vehicles 88 | self.measurement = {} 89 | self.update_veh = [] 90 | self.predict_veh = [] 91 | FRAME_NUMBER = FRAME_NUMBER + 1 92 | 93 | def match(self): 94 | error = 
np.zeros((len(self.bounding_boxes),len(self.prior_vehicles))) 95 | for i in range(len(self.bounding_boxes)): 96 | for j in range(len(self.prior_vehicles)): 97 | error[i][j] = np.sqrt((self.bounding_boxes[i][0]-self.prior_vehicles[j].x)**2+(self.bounding_boxes[i][1]-self.prior_vehicles[j].y)**2) 98 | 99 | for i in range(len(self.bounding_boxes)): 100 | if min(error[i]) < 1.0: 101 | idx = int(np.where(error[i] == error[i].min())[0]) 102 | self.measurement[self.prior_vehicles[idx].veh_id] = ['full',self.bounding_boxes[i], self.average_flow(self.bounding_boxes[i]), self.prior_vehicles[idx]] 103 | else: 104 | vehicle = Vehicle(self.frame_number, self.bounding_boxes[i], self.average_flow(self.bounding_boxes[i])) 105 | self.measurement[vehicle.veh_id] = ['initialize', 0, 0, vehicle] 106 | 107 | for vehicle in self.prior_vehicles: 108 | if vehicle.last_seen - self.frame_number > 10: 109 | if vehicle.veh_id not in self.measurement: 110 | bbox = [vehicle.x, vehicle.y, vehicle.w, vehicle.h] 111 | self.measurement[vehicle.veh_id] = ['partial', 0, self.average_flow(bbox), vehicle] 112 | 113 | def update(self): 114 | for item in self.measurement.values(): 115 | if item[0] == 'full': 116 | item[3].update_full(self.frame_number, item[1], item[2]) 117 | if item[0] == 'partial': 118 | item[3].update_partial(item[2]) 119 | self.update_veh.append(item[3]) 120 | 121 | def predict(self): 122 | for vehicle in self.update_veh: 123 | vehicle.predict() 124 | if vehicle.bounds(): 125 | self.predict_veh.append(vehicle) 126 | 127 | def average_flow(self, bbox): 128 | x, y, w, h = bbox 129 | magnitude = np.average(self.optical_flow[0][0][x:x+w , y : y +h]) 130 | direction = np.average(self.optical_flow[0][1][x:x+w , y : y +h]) 131 | return [magnitude, direction] 132 | 133 | #--------------------------------------------------------------- 134 | #--------------------------------------------------------------- 135 | #--------------------------------------------------------------- 136 | class Detector: 137 | def __init__(self, type): 138 | self.type = type 139 | if type == 'frcnn': 140 | # load faster r-cnn 141 | self.detector = models.detection.fasterrcnn_resnet50_fpn(pretrained=True) 142 | # send model to gpu 143 | self.detector.to(DEVICE) 144 | # set model to inference mode 145 | self.detector.eval() 146 | # set transformation to prepare image for network input 147 | self.transform = transforms.Compose([transforms.ToTensor()]) 148 | if type == 'yolo': 149 | return True 150 | if type == 'sinet': 151 | return True 152 | 153 | def inference(self, image): 154 | if self.type == 'frcnn': 155 | # convert image to torch tensor 156 | image = self.transform(image) 157 | # send input data to GPU 158 | image = image.to(DEVICE) 159 | # process inference and get detections 160 | detections = self.detector([image]) 161 | boxes = detections[0]['boxes'] 162 | confidence = detections[0]['scores'] 163 | class_id = detections[0]['labels'] 164 | 165 | 166 | 167 | if self.type == 'yolo': 168 | detections, confidence, class_id = self.detector([image]) 169 | detections.detach().cpu().numpy() 170 | confidence.detach().cpu().numpy() 171 | class_id.detach().cpu().numpy() 172 | 173 | if self.type == 'sinet': 174 | detections, confidence, class_id = self.detector([image]) 175 | detections.detach().cpu().numpy() 176 | confidence.detach().cpu().numpy() 177 | class_id.detach().cpu().numpy() 178 | 179 | 180 | self.result = self.filter_detection(boxes, confidence, class_id) 181 | 182 | def filter_detection(self, detections, confidence, class_id): 183 | 
x1 = detections[:, 0].detach().cpu().numpy() 184 | y1 = detections[:, 1].detach().cpu().numpy() 185 | x2 = detections[:, 2].detach().cpu().numpy() 186 | y2 = detections[:, 3].detach().cpu().numpy() 187 | scores = confidence.detach().cpu().numpy() 188 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 189 | order = scores.argsort()[::-1] 190 | 191 | keep = [] 192 | while order.size > 0: 193 | j = order[0] 194 | keep.append(j) 195 | xx1 = np.maximum(x1[j], x1[order[1:]]) 196 | yy1 = np.maximum(y1[j], y1[order[1:]]) 197 | xx2 = np.minimum(x2[j], x2[order[1:]]) 198 | yy2 = np.minimum(y2[j], y2[order[1:]]) 199 | 200 | w = np.maximum(0.0, xx2 - xx1 + 1) 201 | h = np.maximum(0.0, yy2 - yy1 + 1) 202 | inter = w * h 203 | ovr = inter / (areas[j] + areas[order[1:]] - inter) 204 | 205 | inds = np.where(ovr <= IOU_THRESHOLD)[0] 206 | order = order[inds + 1] 207 | filter = [] 208 | for i in keep: 209 | if confidence[i] > SCORE_THRESHOLD: 210 | if class_id[i] in [2,3,4,6,8]: 211 | filter.append([int(x1[i]), int(y1[i]), int(x2[i]-x1[i]), int(y2[i]-y1[i])]) 212 | return filter 213 | 214 | #--------------------------------------------------------------- 215 | #--------------------------------------------------------------- 216 | #--------------------------------------------------------------- 217 | class OpticalFlow: 218 | def __init__(self, type): 219 | self.type = type 220 | if type == 'farneback': 221 | return True 222 | 223 | if type == 'raft': 224 | parser = argparse.ArgumentParser() 225 | parser.add_argument('--model', nargs='?', const='raft-models/raft-things.pth', type=str, help="restore checkpoint") 226 | parser.add_argument('--path', nargs='?', const='frames', type=int, help="dataset for evaluation") 227 | parser.add_argument('--small', action='store_true', help='use small model') 228 | parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') 229 | parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') 230 | args = parser.parse_args() 231 | self.flow_model = torch.nn.DataParallel(RAFT(args)) 232 | self.flow_model.load_state_dict(torch.load(args.model)) 233 | self.flow_model = self.flow_model.module 234 | self.flow_model.to(DEVICE) 235 | self.flow_model.eval() 236 | 237 | if type == 'flownet': 238 | return True 239 | 240 | 241 | def inference(self,image1,image2): 242 | if self.type == 'farneback': 243 | self.flow = True 244 | if self.type == 'raft': 245 | image1 = torch.from_numpy(image1).permute(2, 0, 1).float() 246 | image1 = image1[None].to(DEVICE) 247 | 248 | image2 = torch.from_numpy(image2).permute(2, 0, 1).float() 249 | image2 = image2[None].to(DEVICE) 250 | 251 | padder = InputPadder(image2.shape) 252 | image1, image2 = padder.pad(image1, image2) 253 | _, flow_up = self.flow_model(image1, image2, iters=5, test_mode=True) 254 | self.result = flow_up.detach().cpu().numpy() 255 | 256 | if self.type == 'flownet': 257 | self.flow = True 258 | 259 | 260 | 261 | def toimage(self): 262 | image = flow_viz.flow_to_image(self.flow) 263 | return image 264 | #--------------------------------------------------------------- 265 | #--------------------------------------------------------------- 266 | #--------------------------------------------------------------- 267 | 268 | 269 | def draw_bbox(image, bboxes): 270 | for bbox in bboxes: 271 | cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), np.random.uniform(0, 255) , 2) 272 | return image 273 | 274 | def flow_mask(flow, bboxes): 275 | image = 
flow.toimage() 276 | mask = np.full(image.shape[:2], 0, dtype=np.uint8) 277 | for bbox in bboxes: 278 | cv2.rectangle(mask, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (255,255,255), -1) 279 | image = cv2.bitwise_or(np.array(image).astype(np.uint8), np.array(image).astype(np.uint8), mask=mask) 280 | return image 281 | 282 | def visuzalization(input, bbox, flow): 283 | bbox = draw_bbox(input, bbox) 284 | flow_visualization = flow.toimage() 285 | mask = flow_mask(flow, bbox) 286 | 287 | img1 = np.concatenate((input,bbox), axis = 1) 288 | img2 = np.concatenate((flow_visualization, mask), axis = 1) 289 | merge_img = np.concatenate((img1,img2), axis = 1) 290 | return merge_img 291 | 292 | def display_result(image): 293 | cv2.imshow('image',image) 294 | cv2.waitKey(1) 295 | --------------------------------------------------------------------------------
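vehicle.py defines the building blocks (InputData, Vehicle, Frame, Detector, OpticalFlow) and the drawing helpers but, unlike main.py, ships no driver loop. A minimal sketch of how those pieces fit together, mirroring the main loop in main.py, is shown below; the input clip and the frcnn/raft pairing are assumptions, and OpticalFlow('raft') in this file reads its checkpoint path from the command line, e.g. python vehicle.py --model raft-models/raft-things.pth:

    if __name__ == '__main__':
        data = InputData('video', '1-fps.mp4')       # one of the clips bundled in the repo root
        detector = Detector('frcnn')
        flow = OpticalFlow('raft')                   # parses --model from sys.argv

        current = data.get_next_frame()
        detector.inference(current)
        tracked = [Vehicle(FRAME_NUMBER, det, [0, 0]) for det in detector.result]

        frame = None
        while True:
            previous, current = current, data.get_next_frame()
            if current is False:                     # end of the sequence
                break
            detector.inference(current)              # per-frame detections
            flow.inference(current, previous)        # dense optical flow between the pair
            if frame is None:                        # first frame pair
                frame = Frame(detector.result, flow.result, tracked)
            else:
                frame.predict()                      # propagate surviving tracks
                frame = Frame(detector.result, flow.result, frame.predict_veh)
            frame.match()                            # associate detections with prior vehicles
            frame.update()                           # refresh matched tracks, spawn new ones

This follows main.py's control flow only; vehicle.py's own Vehicle and Frame methods are simpler stubs, so the behaviour will differ from the full main.py pipeline.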