├── .gitattributes ├── README.md ├── common.py ├── dataset ├── DataLoader.py ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── audio_processing_kg.cpython-36.pyc │ ├── audio_processing_tf.cpython-36.pyc │ ├── cdiscount_feature_dataset.cpython-36.pyc │ ├── cdiscount_feature_set_dataset.cpython-36.pyc │ ├── cdiscount_image_dataset.cpython-36.pyc │ ├── reader.cpython-35.pyc │ ├── reader.cpython-36.pyc │ ├── sampler.cpython-35.pyc │ ├── sampler.cpython-36.pyc │ ├── transform.cpython-35.pyc │ └── transform.cpython-36.pyc ├── process.py ├── reader.py ├── sampler.py └── transform.py ├── model ├── __init__.py ├── configuration.py ├── dlrt.py └── yolo-voc.cfg ├── train.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for Visual Object Tracking in Videos 2 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # edit settings here 2 | ROOT_DIR ='' 3 | 4 | 5 | 6 | DATA_DIR = '' 7 | RESULTS_DIR = ROOT_DIR + '/results' 8 | 9 | ##--------------------------------------------------------------------- 10 | import os 11 | import copy 12 | from datetime import datetime 13 | PROJECT_PATH = os.path.dirname(os.path.realpath(__file__)) 14 | IDENTIFIER = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 15 | 16 | #numerical libs 17 | import math 18 | import numpy as np 19 | import random 20 | import PIL 21 | from PIL import Image # import jpg in python 22 | import cv2 23 | 24 | import matplotlib 25 | matplotlib.use('TkAgg') 26 | #matplotlib.use('Qt4Agg') 27 | #matplotlib.use('Qt5Agg') 28 | 29 | 30 | # torch libs 31 | import torch 32 | import torchvision.transforms as transforms 33 | from torch.utils.data.dataset import Dataset 34 | from torch.utils.data import DataLoader 35 | from torch.utils.data.sampler import * 36 | 37 | import torch 38 | import torch.nn as nn 39 | import torch.nn.functional as F 40 | from torch.autograd import Variable 41 | import torch.optim as optim 42 | from torch.optim import lr_scheduler 43 | from torch.nn.parallel.data_parallel import data_parallel 44 | from torch.utils.data import Dataset 45 | import torchvision 46 | 47 | 48 | # std libs 49 | import collections 50 | import numbers 51 | import inspect 52 | import shutil 53 | from timeit import default_timer as timer 54 | from __future__ import print_function, division 55 | 56 | 57 | 58 | import csv 59 | import pandas as pd 60 | import pickle 61 | import glob 62 | import sys 63 | from distutils.dir_util import copy_tree 64 | import time 65 | import matplotlib.pyplot as plt 66 | 67 | import skimage 68 | import skimage.color 69 | from skimage import io, transform 70 | # import all images from a folder, see the dataloader 71 | from skimage.io import imread_collection, imread, concatenate_images 72 | from scipy import ndimage 73 | from shapely.geometry import Polygon # for the Polygon 74 | 75 | 76 | 77 | #--------------------------------------------------------------------------------- 78 | print('@%s: ' % os.path.basename(__file__)) 79 | 80 | if 1: 81 | SEED=35202#1510302253 #int(time.time()) # 82 | random.seed(SEED) 83 | 
np.random.seed(SEED) 84 | torch.manual_seed(SEED) 85 | torch.cuda.manual_seed_all(SEED) 86 | print ('\tset random seed') 87 | print ('\t\tSEED=%d'%SEED) 88 | 89 | if 1: 90 | torch.backends.cudnn.benchmark = True ##uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms. - 91 | torch.backends.cudnn.enabled = True 92 | print ('\tset cuda environment') 93 | #print ('\t\ttorch.__version__ =', torch.__version__) 94 | #print ('\t\ttorch.version.cuda =', torch.version.cuda) 95 | print ('\t\ttorch.backends.cudnn.version() =', torch.backends.cudnn.version()) 96 | try: 97 | print ('\t\tos[\'CUDA_VISIBLE_DEVICES\'] =',os.environ['CUDA_VISIBLE_DEVICES']) 98 | NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) 99 | except Exception: 100 | print ('\t\tos[\'CUDA_VISIBLE_DEVICES\'] =','None') 101 | NUM_CUDA_DEVICES = 1 102 | 103 | print ('\t\ttorch.cuda.device_count() =', torch.cuda.device_count()) 104 | print ('\t\ttorch.cuda.current_device() =', torch.cuda.current_device()) 105 | 106 | 107 | print('') 108 | 109 | #--------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /dataset/DataLoader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 18 13:44:42 2018 4 | 5 | @author: Einmal 6 | """ 7 | 8 | # Build a dataset loader according to 9 | # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 10 | from __future__ import print_function, division 11 | 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import torch.optim as optim 16 | from torch.optim import lr_scheduler 17 | from torch.autograd import Variable 18 | from skimage import io, transform 19 | import torchvision 20 | from torch.utils.data import Dataset 21 | import numpy as np 22 | import matplotlib.pyplot as plt 23 | import matplotlib.patches 24 | import time 25 | import os 26 | import copy 27 | from PIL import Image # import jpg in python 28 | from skimage.io import imread_collection, imread, concatenate_images # import all images from a folder, see the dataloader 29 | 30 | 31 | 32 | 33 | #%% 34 | 35 | # Load the list of all videos 36 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 37 | 38 | # Name of a video can be accessed by e.g. vid_list[0][5] 39 | print( vid_list[0][33]) 40 | 41 | # Nr. 
of videos available 42 | n = vid_list.shape[0] 43 | 44 | 45 | #%% Test drawing 46 | 47 | # Load the gt boxes, which are apparently stored in coordinates of 4 points 48 | gt = pd.read_csv('F:/vot2017/ants1/groundtruth.txt', header = None) 49 | 50 | 51 | # Transform dataset into array 52 | # Not neccessary dt.iloc[x] does it 53 | 54 | #im = Image.open('F:/vot2017/ants1/00000092.jpg') 55 | 56 | # Draws a rectangle given the coordinates of all four corners in one array 57 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 58 | def draw_gt(im, coords): 59 | """ Arguments: 60 | im = image 61 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 62 | """ 63 | plt.imshow(im) 64 | 65 | Xs = coords[::2] # Save Xcoords 66 | Ys = coords[1::2] # Save Ycoords 67 | for i in range(4): 68 | if i < 3: 69 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 70 | elif i == 3: 71 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 72 | plt.show() 73 | 74 | 75 | #draw_gt(im, gt.iloc[91]) 76 | #Check 77 | 78 | #%% Recalculate x,y,w,h to the four corner coordinates 79 | 80 | def calc_coords(gt_wh): 81 | gt_new = np.zeros((gt_wh.shape[0],8)) 82 | print(gt_new.shape) 83 | print(gt_wh.shape) 84 | print(gt_wh[:,0]) 85 | gt_new[:,0] = gt_wh[:,0] # x1 86 | gt_new[:,1] = gt_wh[:,1] # y1 87 | gt_new[:,2] = gt_wh[:,0] + gt_wh[:,2] # x2 88 | gt_new[:,3] = gt_wh[:,1] # y2 89 | gt_new[:,4] = gt_wh[:,0] + gt_wh[:,2] # x3 90 | gt_new[:,5] = gt_wh[:,1] + gt_wh[:,3] # y3 91 | gt_new[:,6] = gt_wh[:,0] # x4 92 | gt_new[:,7] = gt_wh[:,1] + gt_wh[:,3] # y4 93 | 94 | return gt_new 95 | #%% Dataset class 96 | 97 | '''Output is the imagesequence in an np.array format and the gt aswell.''' 98 | 99 | class VOT2017_dataset(Dataset): 100 | """This is the VOT2017 dataset""" 101 | def __init__(self, csv_file, root_dir, transform = None): 102 | """ Arguments: 103 | csv_file(string): Path to list file, where all videos are listed 104 | root_dir(string): Directory with all the videos 105 | transform(callable, optional): Will transform on a sample(for pytorch I guess) 106 | """ 107 | self.vot_list = pd.read_csv(csv_file, header = None) 108 | self.root_dir = root_dir 109 | self.transform = transform 110 | 111 | # Returns the nr of videos available 112 | def __len__(self): 113 | return len(self.vot_list) 114 | 115 | # Return the complete video sequence 116 | def __getitem__(self, vid_idx): 117 | """ Arguments: 118 | vid_idx(int): Video Index to be fetched form the video list 119 | """ 120 | vid_name_path = os.path.join(self.root_dir, 121 | self.vot_list.iloc[vid_idx,0], 122 | '*.jpg') 123 | 124 | gt = pd.read_csv(os.path.join(self.root_dir, 125 | self.vot_list.iloc[vid_idx,0], 126 | 'groundtruth.txt'), header = None) 127 | 128 | im_seq = imread_collection(vid_name_path) 129 | 130 | # Image collection to np.array 131 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 132 | 133 | # Also convert the gt to np.array 134 | gt = gt.values 135 | 136 | if gt.shape[1] == 4: 137 | gt = calc_coords(gt) 138 | 139 | 140 | sample = {'Video': images, 'gt': gt} 141 | 142 | # Cant tell yet what this is for 143 | if self.transform: 144 | sample = self.transform(sample) 145 | return sample 146 | 147 | 148 | #%% Test the dataset class 149 | 150 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 151 | root_dir= 'F:/vot2017/') 152 | 153 | # E.g. 
load the second video of the vid_list 154 | sample = test[14] 155 | 156 | # Simply draw a single video - here the idx refers to the image in the sequence 157 | draw_gt(sample['Video'][0], sample['gt'][0]) 158 | 159 | 160 | #%% Just for information - Find the smallest sized video 161 | 162 | Vids = vid_list.shape[0] 163 | 164 | Size = np.zeros((Vids, 2)) 165 | 166 | for i in range(Vids): 167 | im = Image.open(os.path.join('F:/vot2017/', 168 | vid_list.iloc[i,0], 169 | '00000001.jpg')) 170 | Size[i,0] = im.size[0] 171 | Size[i,1] = im.size[1] 172 | 173 | # Smallest size ist 320 x240 174 | #Histogram of image sizes 175 | plt.hist(Size) 176 | 177 | 178 | 179 | 180 | #%% Transforms - Rescale/Resize 181 | 182 | # Rescaling of an image so that we can feed it with the same size into a network 183 | # Also the groundtruth boxes have to be rescaled accordingly! 184 | # Problem atm rescale the whole Video!! - so far only with for loop 185 | 186 | class Rescale(object): 187 | ''' 188 | Rescale the image in a sample to a given size 189 | 190 | Arguments: output_size(tuple): Desired output size 191 | idx(int) : For now idx of the image to be resized 192 | ''' 193 | 194 | # Check if output_size is a tuple, 195 | # maybe also assert if it isnt bigger than the smallest image? 196 | def __init__(self, output_size): 197 | assert isinstance(output_size,(tuple)) 198 | self.output_size = output_size 199 | 200 | def __call__(self, sample): 201 | # Split the sample in video and gt 202 | images, gt = sample['Video'], sample['gt'] 203 | nr = len(images) # Save the amount of images to iterate over 204 | print(nr) 205 | # Save heigth and width of video 206 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 207 | 208 | new_h, new_w = self.output_size 209 | 210 | 211 | # I dont like this part due to the for loop.! 212 | # Initialize the resized image sequence array 213 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 214 | # Iterate over all images and resize them to the given scale. 215 | for i in range(nr): 216 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 217 | 218 | 219 | # Here the groundtruth boxes are rescaled aswell 220 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 221 | 222 | return {'Video': img, 'gt': gt_new} 223 | 224 | 225 | 226 | #%% Test rescaling 227 | 228 | scale = Rescale((220,280)) 229 | 230 | 231 | transformed_sample = scale(sample) 232 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 233 | 234 | # Check 235 | 236 | #%% Transforms - ToTensor 237 | 238 | # Transform the loaded image collection to Tensors 239 | 240 | class ToTensor(object): 241 | '''Convert sample to tensor''' 242 | def __call__(self, sample): 243 | # Load the sample and split it 244 | images, gt = sample['Video'], sample['gt'] 245 | 246 | # swap color axis because 247 | # numpy image: H x W x C 248 | # torch image: C X H X W 249 | # How does this relate to videos/imagesequences? 
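        # For an image sequence the array is (N, H, W, C); PyTorch convolutions
        # expect channels before the spatial dims, so each frame becomes C x H x W
        # while the frame axis N stays first, hence transpose((0, 3, 1, 2)).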
250 | images = images.transpose((0,3,1,2)) 251 | 252 | return {'Video': torch.from_numpy(images), 253 | 'gt': torch.from_numpy(gt)} 254 | 255 | #%% Test the ToTensor 256 | 257 | tens = ToTensor() 258 | tens(sample) 259 | 260 | # Still have to test this 261 | ======= 262 | # -*- coding: utf-8 -*- 263 | """ 264 | Created on Sun Feb 18 13:44:42 2018 265 | 266 | @author: Einmal 267 | """ 268 | 269 | # Build a dataset loader according to 270 | # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 271 | from __future__ import print_function, division 272 | 273 | import pandas as pd 274 | import torch 275 | import torch.nn as nn 276 | import torch.optim as optim 277 | from torch.optim import lr_scheduler 278 | from torch.autograd import Variable 279 | from skimage import io, transform 280 | import torchvision 281 | from torch.utils.data import Dataset 282 | import numpy as np 283 | import matplotlib.pyplot as plt 284 | import matplotlib.patches 285 | import time 286 | import os 287 | import glob 288 | import copy 289 | from PIL import Image # import jpg in python 290 | from skimage.io import imread_collection, imread, concatenate_images # import all images from a folder, see the dataloader 291 | import shapely 292 | 293 | #%% 294 | 295 | # Load the list of all videos 296 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 297 | 298 | # Name of a video can be accessed by e.g. vid_list[0][5] 299 | print( vid_list[0][33]) 300 | 301 | # Nr. of videos available 302 | n = vid_list.shape[0] 303 | 304 | 305 | #%% Test 306 | 307 | test =os.path.join('F:/vot2017/', 308 | vid_list.iloc[0,0]) 309 | 310 | included_extension = ['jpg'] 311 | file_names = [fn for fn in os.listdir(test) 312 | if any(fn.endswith(ext) for ext in included_extension)] 313 | #%% Is rectangular 314 | 315 | #for i in range(vid_list.shape[0]): 316 | gt = pd.read_csv(os.path.join('F:/vot2017/', 317 | vid_list.iloc[3,0], 318 | 'groundtruth.txt'), header = None) 319 | gt = gt.values 320 | Xs = gt[:,::2] # Save Xcoords 321 | Ys = gt[:,1::2] # Save Ycoords 322 | print(Xs) 323 | #%% Test drawing 324 | 325 | # Load the gt boxes, which are apparently stored in coordinates of 4 points 326 | gt = pd.read_csv('F:/vot2017/ball1/groundtruth.txt', header = None) 327 | gt = gt.values 328 | 329 | # Transform dataset into array 330 | # Not neccessary dt.iloc[x] does it 331 | 332 | im = Image.open('F:/vot2017/ball1/00000095.jpg') 333 | 334 | # Draws a rectangle given the coordinates of all four corners in one array 335 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 336 | def draw_gt(im, coords): 337 | """ Arguments: 338 | im = image 339 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 340 | """ 341 | plt.imshow(im) 342 | Xs = coords[::2] # Save Xcoords 343 | Ys = coords[1::2] # Save Ycoords 344 | for i in range(4): 345 | if i < 3: 346 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 347 | elif i == 3: 348 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 349 | plt.show() 350 | 351 | 352 | draw_gt(im, gt[94]) 353 | #Check 354 | 355 | 356 | 357 | #%% Dataset class 358 | 359 | '''Output is the imagesequence in an np.array format and the gt aswell.''' 360 | 361 | class VOT2017_dataset(Dataset): 362 | """This is the VOT2017 dataset""" 363 | def __init__(self, csv_file, root_dir, transform = None): 364 | """ Arguments: 365 | csv_file(string): Path to list file, where all videos are listed 366 | root_dir(string): Directory with all the 
videos 367 | transform(callable, optional): Will transform on a sample(for pytorch I guess) 368 | 369 | """ 370 | self.vot_list = pd.read_csv(csv_file, header = None) 371 | self.root_dir = root_dir 372 | self.transform = transform 373 | 374 | 375 | # Return the complete video sequence 376 | def __getitem__(self, vid_idx, T = 10): 377 | """ Arguments: 378 | vid_idx(int): Video Index to be fetched form the video list 379 | T(int): Nr of Images in sequence - default == 10 380 | """ 381 | gt = pd.read_csv(os.path.join(self.root_dir, 382 | self.vot_list.iloc[vid_idx,0], 383 | 'groundtruth.txt'), header = None) 384 | 385 | vid_name_path = os.path.join(self.root_dir, 386 | self.vot_list.iloc[vid_idx,0], 387 | '*.jpg') 388 | 389 | file_names = glob.glob(vid_name_path) 390 | 391 | rand_start = np.random.randint(0, len(file_names)-T+1) 392 | 393 | file_names = file_names[rand_start:(rand_start+T-1)] 394 | 395 | im_seq = imread_collection(file_names) 396 | 397 | # Image collection to np.array 398 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 399 | 400 | # Also convert the gt to np.array 401 | gt = gt.values 402 | gt = gt[rand_start:(rand_start+T-1),:] 403 | 404 | sample = {'Video': images, 'gt': gt} 405 | 406 | # Cant tell yet what this is for 407 | if self.transform: 408 | sample = self.transform(sample) 409 | return sample 410 | 411 | 412 | #%% Test the dataset class 413 | 414 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 415 | root_dir= 'F:/vot2017/') 416 | 417 | # E.g. load a of the vid_list 418 | sample = test.__getitem__(0, T = 20) 419 | 420 | # Simply draw a single video - here the idx refers to the image in the sequence 421 | draw_gt(sample['Video'][10], sample['gt'][10]) 422 | 423 | #%% define loss/reward functions; given the coordinates of all corners 424 | 425 | def loss_v1(pred, gt): 426 | r = - np.mean(np.absolute(pred-gt)) - np.max(np.absolute(pred-gt)) 427 | return r 428 | 429 | 430 | # Calculate the reward given all the for corners x1,y1,x2,y2,x3,y3,x4,y4 431 | def loss_v2(pred, gt): 432 | #reorder the coord in tuples for the polygon 433 | pred_re = [(pred[0],pred[1]),(pred[2],pred[3]), (pred[4],pred[5]),(pred[6],pred[7])] 434 | gt_re = [(gt[0],gt[1]),(gt[2],gt[3]),(gt[4],gt[5]),(gt[6],gt[7])] 435 | 436 | pred_poly = Polygon(pred_re) 437 | gt_poly = Polygon(gt_re) 438 | # Reward == Intersection/total area 439 | r = pred_poly.intersection(gt_poly).area/(pred_poly.area + gt_poly.area - pred_poly.intersection(gt_poly).area) 440 | return r 441 | #%% Test reward functions 442 | 443 | test_1 = np.array((0,0,0,1,1,1,1,0)) 444 | test_2 = np.array((0.5,0,0.5,1,1.5,1,1.5,0)) 445 | 446 | print(loss_v1(test_1, test_2)) 447 | print(loss_v2(test_1, test_2)) 448 | 449 | 450 | #%% Just for information - Find the smallest sized video 451 | 452 | Vids = vid_list.shape[0] 453 | 454 | Size = np.zeros((Vids, 2)) 455 | 456 | for i in range(Vids): 457 | im = Image.open(os.path.join('F:/vot2017/', 458 | vid_list.iloc[i,0], 459 | '00000001.jpg')) 460 | Size[i,0] = im.size[0] 461 | Size[i,1] = im.size[1] 462 | 463 | # Smallest size ist 320 x240 464 | #Histogram of image sizes 465 | plt.hist(Size) 466 | 467 | 468 | 469 | 470 | #%% Transforms - Rescale/Resize 471 | 472 | # Rescaling of an image so that we can feed it with the same size into a network 473 | # Also the groundtruth boxes have to be rescaled accordingly! 474 | # Problem atm rescale the whole Video!! 
- so far only with for loop 475 | 476 | class Rescale(object): 477 | ''' 478 | Rescale the image in a sample to a given size 479 | 480 | Arguments: output_size(tuple): Desired output size 481 | idx(int) : For now idx of the image to be resized 482 | ''' 483 | 484 | # Check if output_size is a tuple, 485 | # maybe also assert if it isnt bigger than the smallest image? 486 | def __init__(self, output_size): 487 | assert isinstance(output_size,(tuple)) 488 | self.output_size = output_size 489 | 490 | def __call__(self, sample): 491 | # Split the sample in video and gt 492 | images, gt = sample['Video'], sample['gt'] 493 | nr = len(images) # Save the amount of images to iterate over 494 | print(nr) 495 | # Save heigth and width of video 496 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 497 | 498 | new_h, new_w = self.output_size 499 | 500 | 501 | # I dont like this part due to the for loop.! 502 | # Initialize the resized image sequence array 503 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 504 | # Iterate over all images and resize them to the given scale. 505 | for i in range(nr): 506 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 507 | 508 | 509 | # Here the groundtruth boxes are rescaled aswell 510 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 511 | 512 | return {'Video': img, 'gt': gt_new} 513 | 514 | 515 | 516 | #%% Test rescaling 517 | 518 | scale = Rescale((220,280)) 519 | 520 | 521 | transformed_sample = scale(sample) 522 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 523 | 524 | # Check 525 | 526 | #%% Transforms - ToTensor 527 | 528 | # Transform the loaded image collection to Tensors 529 | 530 | class ToTensor(object): 531 | '''Convert sample to tensor''' 532 | def __call__(self, sample): 533 | # Load the sample and split it 534 | images, gt = sample['Video'], sample['gt'] 535 | 536 | # swap color axis because 537 | # numpy image: H x W x C 538 | # torch image: C X H X W 539 | # How does this relate to videos/imagesequences? 
540 | images = images.transpose((0,3,1,2)) 541 | 542 | return {'Video': torch.from_numpy(images), 543 | 'gt': torch.from_numpy(gt)} 544 | 545 | #%% Test the ToTensor 546 | 547 | 548 | rescale = Rescale() 549 | rescale(sample) 550 | tens = ToTensor() 551 | tens(sample) 552 | 553 | # Apparently not enoguh RAM on my machine here to fully check that it works :D 554 | >>>>>>> b297fdf775d26e7df2deb91721412a875e22dd77 555 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/audio_processing_kg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/audio_processing_kg.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/audio_processing_tf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/audio_processing_tf.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_feature_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_feature_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_feature_set_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_feature_set_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_image_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_image_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/reader.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/reader.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/reader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/reader.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/sampler.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/sampler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/sampler.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/transform.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/transform.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/transform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/transform.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/process.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | from utility.file import * 3 | from utility.draw import * 4 | 5 | from dataset.reader import * 6 | 7 | def multi_mask_to_overlay(multi_mask): 8 | overlay = skimage.color.label2rgb(multi_mask, bg_label=0, bg_color=(0, 0, 0))*255 9 | overlay = overlay.astype(np.uint8) 10 | return overlay 11 | 12 | def thresh_to_inner_contour(thresh): 13 | thresh_pad = np.lib.pad(thresh, ((1, 1), (1, 1)), 'reflect') 14 | contour = thresh_pad[1:-1,1:-1] & ( 15 | (thresh_pad[1:-1,1:-1] != thresh_pad[:-2,1:-1]) \ 16 | | (thresh_pad[1:-1,1:-1] != thresh_pad[2:,1:-1]) \ 17 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,:-2]) \ 18 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,2:]) 19 | ) 20 | return contour 21 | 22 | 23 | 24 | #extra processing 25 | def 
run_make_annotation(): 26 | 27 | split = 'train1_ids_all_670' 28 | ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#') 29 | 30 | num_ids = len(ids) 31 | for i in range(num_ids): 32 | id = ids[i] 33 | image_files = glob.glob(DATA_DIR + '/image/' + id + '/images/*.png') 34 | assert(len(image_files)==1) 35 | image_file=image_files[0] 36 | print(id) 37 | 38 | #----clear old ----------------------------- 39 | if 1: 40 | for f in ['one_mask.png','one_countour_mask.png','one_countour_image.png','one_countour.png', 41 | 'overlap.png', 'one_center.png','/masks.npy', '/labels.npy', 42 | '/countour_on_image.png', '/cut_mask.png', '/label.npy', '/mask.png','/overlay.png', 43 | '/multi.npy','/multi.png', 44 | '/instance.npy','/instance.png', 45 | '/multi_instance.npy','/multi_instance.png', 46 | ]: 47 | file = DATA_DIR + '/image/' + id + '/' + f 48 | if os.path.exists(file): 49 | os.remove(file) 50 | #----clear old ----------------------------- 51 | 52 | 53 | #image 54 | image = cv2.imread(image_file,cv2.IMREAD_COLOR) 55 | 56 | H,W,C = image.shape 57 | multi_mask = np.zeros((H,W), np.int32) 58 | mask = np.zeros((H,W), np.uint8) 59 | countour = np.zeros((H,W), np.uint8) 60 | 61 | 62 | 63 | 64 | mask_files = glob.glob(DATA_DIR + '/image/' + id + '/masks/*.png') 65 | mask_files.sort() 66 | count = len(mask_files) 67 | for i in range(count): 68 | mask_file = mask_files[i] 69 | thresh = cv2.imread(mask_file,cv2.IMREAD_GRAYSCALE) 70 | thresh = thresh >128 71 | index = np.where(thresh==True) 72 | 73 | multi_mask[thresh]= i+1 74 | mask = np.logical_or(mask,thresh) 75 | countour = np.logical_or(countour, thresh_to_inner_contour(thresh) ) 76 | 77 | 78 | 79 | ## save and show ------------------------------------------- 80 | countour_on_image = image.copy() 81 | countour_on_image = countour[:,:,np.newaxis]*np.array((0,255,0)) + (1-countour[:,:,np.newaxis])*countour_on_image 82 | 83 | countour_overlay = countour*255 84 | mask_overlay = mask*255 85 | multi_mask_overlay = multi_mask_to_overlay(multi_mask) 86 | 87 | 88 | image_show('image',image) 89 | image_show('mask', mask_overlay) 90 | image_show('multi_mask',multi_mask_overlay) 91 | image_show('countour',countour_overlay) 92 | image_show('countour_on_image',countour_on_image) 93 | 94 | 95 | 96 | np.save(DATA_DIR + '/image/' + id + '/multi_mask.npy', multi_mask) 97 | cv2.imwrite(DATA_DIR + '/image/' + id + '/multi_mask.png',multi_mask_overlay) 98 | cv2.imwrite(DATA_DIR + '/image/' + id + '/mask.png',mask_overlay) 99 | cv2.imwrite(DATA_DIR + '/image/' + id + '/countour.png',countour_overlay) 100 | cv2.imwrite(DATA_DIR + '/image/' + id + '/countour_on_image.png',countour_on_image) 101 | 102 | cv2.waitKey(1) 103 | 104 | 105 | 106 | 107 | 108 | # main ################################################################# 109 | if __name__ == '__main__': 110 | print( '%s: calling main function ... 
' % os.path.basename(__file__)) 111 | 112 | run_make_annotation() 113 | 114 | print( 'sucess!') 115 | -------------------------------------------------------------------------------- /dataset/reader.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | from dataset.transform import * 4 | from dataset.sampler import * 5 | from utility.file import * 6 | from utility.draw import * 7 | 8 | 9 | #data reader ---------------------------------------------------------------- 10 | class ScienceDataset(Dataset): 11 | 12 | def __init__(self, split, transform=None, mode='train'): 13 | super(ScienceDataset, self).__init__() 14 | start = timer() 15 | 16 | self.split = split 17 | self.transform = transform 18 | self.mode = mode 19 | 20 | #read split 21 | ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#') 22 | 23 | #save 24 | self.ids = ids 25 | 26 | #print 27 | print('\ttime = %0.2f min'%((timer() - start) / 60)) 28 | print('\tnum_ids = %d'%(len(self.ids))) 29 | print('') 30 | 31 | 32 | def __getitem__(self, index): 33 | id = self.ids[index] 34 | image_id = id.split('/')[-1] 35 | image = cv2.imread(DATA_DIR + '/image/' + id + '/images/' + image_id +'.png', cv2.IMREAD_COLOR) 36 | 37 | if self.mode in ['train']: 38 | multi_mask = np.load( DATA_DIR + '/image/' + id + '/multi_mask.npy')#.astype(int32) 39 | 40 | if self.transform is not None: 41 | return self.transform(image, multi_mask, index) 42 | else: 43 | return input, multi_mask, index 44 | 45 | if self.mode in ['test']: 46 | if self.transform is not None: 47 | return self.transform(image,index) 48 | else: 49 | return image, index 50 | 51 | def __len__(self): 52 | return len(self.ids) 53 | # draw ---------------------------------------------------------------- 54 | def multi_mask_to_overlay(multi_mask): 55 | overlay = skimage.color.label2rgb(multi_mask, bg_label=0, bg_color=(0, 0, 0))*255 56 | overlay = overlay.astype(np.uint8) 57 | return overlay 58 | 59 | 60 | # modifier ---------------------------------------------------------------- 61 | def thresh_to_inner_contour(thresh): 62 | thresh_pad = np.lib.pad(thresh, ((1, 1), (1, 1)), 'reflect') 63 | contour = thresh_pad[1:-1,1:-1] & ( 64 | (thresh_pad[1:-1,1:-1] != thresh_pad[:-2,1:-1]) \ 65 | | (thresh_pad[1:-1,1:-1] != thresh_pad[2:,1:-1]) \ 66 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,:-2]) \ 67 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,2:]) 68 | ) 69 | return contour 70 | 71 | 72 | def multi_mask_to_annotation(multi_mask): 73 | H,W = multi_mask.shape[:2] 74 | count = multi_mask.max() 75 | 76 | box = [] 77 | label = [] 78 | instance = [] 79 | for i in range(count): 80 | thresh = (multi_mask==(i+1)) 81 | if thresh.sum()>1: 82 | # filter small, etc 83 | 84 | y,x = np.where(thresh) 85 | y0 = y.min() 86 | y1 = y.max() 87 | x0 = x.min() 88 | x1 = x.max() 89 | w = (x1-x0)+1 90 | h = (y1-y0)+1 91 | 92 | #f = int(0.3*min(w,h)) 93 | border = int(0.3*(w+h)/2) 94 | x0 = x0-border 95 | x1 = x1+border 96 | y0 = y0-border 97 | y1 = y1+border 98 | 99 | #clip 100 | x0 = max(0,x0) 101 | y0 = max(0,y0) 102 | x1 = min(W-1,x1) 103 | y1 = min(H-1,y1) 104 | 105 | # filter small 106 | box.append([x0,y0,x1,y1]) 107 | label.append(1) # now assume one class 108 | instance.append(thresh.astype(np.float32)) 109 | 110 | if box!=[]: 111 | box = np.array(box,np.float32) 112 | label = np.array(label,np.float32) 113 | instance = np.array(instance,np.float32) 114 | else: 115 | box = None 116 | label = None 117 | instance = None 118 | 119 | return 
box, label, instance 120 | 121 | 122 | 123 | 124 | 125 | # check ##################################################################################3 126 | def run_check_dataset_reader(): 127 | 128 | def augment(image, multi_mask, index): 129 | box, label, instance = multi_mask_to_annotation(multi_mask) 130 | 131 | #for display 132 | multi_mask = multi_mask/multi_mask.max() *255 133 | count = len(instance) 134 | 135 | instance_gray = instance.copy() 136 | instance =[] 137 | for i in range(count): 138 | instance.append( 139 | cv2.cvtColor((instance_gray[i]*255).astype(np.uint8),cv2.COLOR_GRAY2BGR) 140 | ) 141 | instance = np.array(instance) 142 | return image, multi_mask, box, label, instance, index 143 | 144 | 145 | dataset = ScienceDataset( 146 | 'train1_ids_gray_only1_500', mode='train', 147 | transform = augment, 148 | ) 149 | sampler = SequentialSampler(dataset) 150 | #sampler = RandomSampler(dataset) 151 | 152 | 153 | for n in iter(sampler): 154 | #for n in range(10): 155 | #n=0 156 | #while 1: 157 | image, multi_mask, box, label, instance, index = dataset[n] 158 | image_show('image',image) 159 | image_show('multi_mask',multi_mask) 160 | count = len(instance) 161 | for i in range(count): 162 | x0,y0,x1,y1 = box[i] 163 | cv2.rectangle(instance[i],(x0,y0),(x1,y1),(0,0,255),1) 164 | 165 | image_show('instance[i]',instance[i]) 166 | print('label[i], box[i] : ', label[i], box[i]) 167 | 168 | cv2.waitKey(1) 169 | 170 | 171 | 172 | 173 | 174 | # main ################################################################# 175 | if __name__ == '__main__': 176 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 177 | 178 | run_check_dataset_reader() 179 | 180 | print( 'sucess!') 181 | -------------------------------------------------------------------------------- /dataset/sampler.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | # common tool for dataset 3 | 4 | #sampler ----------------------------------------------- 5 | 6 | class ConstantSampler(Sampler): 7 | def __init__(self, data, list): 8 | self.num_samples = len(list) 9 | self.list = list 10 | 11 | def __iter__(self): 12 | #print ('\tcalling Sampler:__iter__') 13 | return iter(self.list) 14 | 15 | def __len__(self): 16 | #print ('\tcalling Sampler:__len__') 17 | return self.num_samples 18 | 19 | 20 | # see trorch/utils/data/sampler.py 21 | class FixLengthRandomSampler(Sampler): 22 | def __init__(self, data, length=None): 23 | self.len_data = len(data) 24 | self.length = length or self.len_data 25 | 26 | def __iter__(self): 27 | #print ('\tcalling Sampler:__iter__') 28 | 29 | l=[] 30 | while 1: 31 | ll = list(range(self.len_data)) 32 | random.shuffle(ll) 33 | l = l + ll 34 | if len(l)>=self.length: break 35 | 36 | l= l[:self.length] 37 | return iter(l) 38 | 39 | 40 | def __len__(self): 41 | #print ('\tcalling Sampler:__len__') 42 | return self.length 43 | -------------------------------------------------------------------------------- /dataset/transform.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | 4 | ## for debug 5 | def dummy_transform(image): 6 | print ('\tdummy_transform') 7 | return image 8 | 9 | # kaggle science bowl-2 : ------------------------------------------------------- 10 | 11 | def resize_to_factor2(image, mask, factor=16): 12 | 13 | H,W = image.shape[:2] 14 | h = (H//factor)*factor 15 | w = (W //factor)*factor 16 | return fix_resize_transform2(image, mask, w, h) 17 | 18 | 19 
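# Example: with factor=16 a 300x500 (HxW) image is resized to 288x496
# (300//16*16 = 288, 500//16*16 = 496), presumably so that the feature maps
# after repeated stride-2 downsampling to 1/16 resolution have integer sizes.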
| 20 | def fix_resize_transform2(image, mask, w, h): 21 | H,W = image.shape[:2] 22 | if (H,W) != (h,w): 23 | image = cv2.resize(image,(w,h)) 24 | 25 | mask = mask.astype(np.float32) 26 | mask = cv2.resize(mask,(w,h)) 27 | mask = mask.astype(np.int32) 28 | return image, mask 29 | 30 | 31 | 32 | 33 | def fix_crop_transform2(image, mask, x,y,w,h): 34 | 35 | H,W = image.shape[:2] 36 | assert(H>=h) 37 | assert(W >=w) 38 | 39 | if (x==-1 & y==-1): 40 | x=(W-w)//2 41 | y=(H-h)//2 42 | 43 | if (x,y,w,h) != (0,0,W,H): 44 | image = image[y:y+h, x:x+w] 45 | mask = mask[y:y+h, x:x+w] 46 | 47 | return image, mask 48 | 49 | 50 | def random_crop_transform2(image, mask, w,h): 51 | H,W = image.shape[:2] 52 | 53 | if H!=h: 54 | y = np.random.choice(H-h) 55 | else: 56 | y=0 57 | 58 | if W!=w: 59 | x = np.random.choice(W-w) 60 | else: 61 | x=0 62 | 63 | return fix_crop_transform2(image, mask, x,y,w,h) 64 | 65 | 66 | def resize_to_factor(image, factor=16): 67 | height,width = image.shape[:2] 68 | h = (height//factor)*factor 69 | w = (width //factor)*factor 70 | return fix_resize_transform(image, w, h) 71 | 72 | 73 | def fix_resize_transform(image, w, h): 74 | height,width = image.shape[:2] 75 | if (height,width) != (h,w): 76 | image = cv2.resize(image,(w,h)) 77 | return image 78 | 79 | # main ################################################################# 80 | if __name__ == '__main__': 81 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 82 | 83 | print('\nsucess!') -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/model/__init__.py -------------------------------------------------------------------------------- /model/configuration.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | import configparser 3 | 4 | 5 | class Configuration(object): 6 | 7 | def __init__(self): 8 | super(Configuration, self).__init__() 9 | self.version='configuration version \'mask-rcnn-resnet50-fpn, kaggle\'' 10 | 11 | #net 12 | self.num_classes = 2 #include background class 13 | 14 | #rpn 15 | self.rpn_num_heads = 4 16 | self.rpn_num_bases = 3 17 | self.rpn_base_sizes = [ 8, 16, 32, 64 ] #diameter 18 | self.rpn_base_apsect_ratios = [1, 0.5, 2] 19 | self.rpn_strides = [ 1, 2, 4, 8 ] 20 | 21 | 22 | self.rpn_train_batch_size = 256 # rpn target 256 23 | self.rpn_train_fg_fraction = 0.5 24 | self.rpn_train_bg_thresh_high = 0.3 25 | self.rpn_train_fg_thresh_low = 0.7 26 | 27 | self.rpn_train_nms_threshold = 0.7 # rpn nms 28 | self.rpn_train_nms_min_size = [ 64, 32, 16, 8, 4] # not using: -1 29 | self.rpn_train_nms_pre_top_n = 5000 #12000 30 | self.rpn_train_nms_post_top_n = 1000 #2000 31 | 32 | self.rpn_test_nms_threshold = 0.7 33 | self.rpn_test_nms_min_size = [ 64, 32, 16, 8, 4] # not using: -1 34 | self.rpn_test_nms_pre_top_n = 5000 35 | self.rpn_test_nms_post_top_n = 1000 36 | 37 | #crop 38 | self.pool_size = 16 39 | self.rcnn_select_size_thresholds = [ 40 | [ 0, 8],#'stride 1': 41 | [ 8, 16],#'stride 2': 42 | [ 16, 32],#'stride 4': 43 | [ 32, 1e8],#'stride 8': 44 | ] 45 | 46 | #rcnn 47 | self.rcnn_train_batch_size = 256 # rcnn target 48 | self.rcnn_train_fg_fraction = 0.25 49 | self.rcnn_train_bg_thresh_high = 0.5 50 | self.rcnn_train_bg_thresh_low = 0.0 51 | 
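        # Assumed Faster/Mask R-CNN convention: proposals with IoU >= fg_thresh_low
        # are sampled as foreground, those with IoU in [bg_thresh_low, bg_thresh_high)
        # as background; proposals falling between the two bands are ignored.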
self.rcnn_train_fg_thresh_low = 0.5 52 | # self.rcnn_train_delta_norm_stds = (0.1, 0.1, 0.2, 0.2) #(1, 1, 1, 1) # 53 | 54 | self.rcnn_train_nms_pre_threshold = 0.05 # set low 0.05 to make roc curve. 55 | self.rcnn_train_nms_post_overlap_threshold = 0.8 56 | self.rcnn_train_nms_max_per_image = 256 57 | 58 | self.rcnn_test_nms_pre_threshold = 0.1 59 | self.rcnn_test_nms_post_overlap_threshold = 0.5 60 | self.rcnn_test_nms_max_per_image = 512 61 | 62 | #mask 63 | self.mask_size = 16 64 | self.mask_train_fg_thresh_low = 0.5 65 | 66 | self.mask_test_nms_threshold = 0.9 67 | self.mask_test_threshold = 0.5 68 | 69 | 70 | 71 | 72 | def __repr__(self): 73 | raise NotImplementedError 74 | 75 | def save(self, file): 76 | raise NotImplementedError 77 | 78 | def load(self, file): 79 | raise NotImplementedError 80 | 81 | 82 | 83 | 84 | # main ################################################################# 85 | if __name__ == '__main__': 86 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 87 | 88 | os.makedirs('/root/share/project/ellen-object-detect/results/xxx/',exist_ok=True) 89 | file='/root/share/project/ellen-object-detect/results/xxx/configure' 90 | 91 | cfg = Configuration() 92 | cfg.save(file) 93 | cfg.load(file) 94 | cfg.save('/root/share/project/ellen-object-detect/results/xxx/configure1') 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /model/dlrt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn import init 7 | 8 | 9 | 10 | 11 | 12 | class DLRTnet(nn.Module): 13 | """DLRTnet is based on https://arxiv.org/abs/1701.08936 14 | # write something about the architecture 15 | 16 | """ 17 | def __init__(self, *args): 18 | """args: 19 | - a 20 | - b 21 | - c 22 | """ 23 | 24 | @staticmethod 25 | def weight_init(m): 26 | """ call this for weight initialisation 27 | """ 28 | if isinstance(m, nn.Conv2d): 29 | init.xavier_normal(m.weight) 30 | init.constant(m.bias, 0) 31 | 32 | 33 | def reset_params(self): 34 | """ call this for weight reset in each of the modules 35 | # we might not need this 36 | """ 37 | for i, m in enumerate(self.modules()): 38 | self.weight_init(m) 39 | 40 | 41 | def forward(self, x): 42 | 43 | return x -------------------------------------------------------------------------------- /model/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=40000,60000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | 
[convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | from utils import * 3 | 4 | # ------------------------------------------------------------------------------------ 5 | from model import DLRTnet 6 | from utils import VOT2017Dataset 7 | 8 | def 
train_augment(image, multi_mask, index): 9 | pass 10 | 11 | 12 | 13 | 14 | def valid_augment(image, multi_mask, index): 15 | pass 16 | 17 | 18 | 19 | def train_collate(batch): 20 | 21 | batch_size = len(batch) 22 | #for b in range(batch_size): print (batch[b][0].size()) 23 | inputs = torch.stack([batch[b][0]for b in range(batch_size)], 0) 24 | boxes = [batch[b][1]for b in range(batch_size)] 25 | labels = [batch[b][2]for b in range(batch_size)] 26 | instances = [batch[b][3]for b in range(batch_size)] 27 | indices = [batch[b][4]for b in range(batch_size)] 28 | 29 | return [inputs, boxes, labels, instances, indices] 30 | 31 | ### draw ######################################################### 32 | 33 | def draw(): 34 | 35 | pass 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | ### training ############################################################## 44 | def evaluate(net, test_loader): 45 | 46 | test_num = 0 47 | test_loss = np.zeros(6,np.float32) 48 | test_acc = 0 49 | for i, (inputs, boxes, labels, instances, indices) in enumerate(test_loader, 0): 50 | inputs = Variable(inputs,volatile=True).cuda() 51 | 52 | net(inputs, boxes, labels, instances ) 53 | loss = net.loss(inputs, boxes, labels, instances) 54 | 55 | # acc = dice_loss(masks, labels) #todo 56 | 57 | batch_size = len(indices) 58 | test_acc += 0 #batch_size*acc[0][0] 59 | test_loss += batch_size*np.array(( 60 | loss .cpu().data.numpy()[0], 61 | net.rpn_cls_loss.cpu().data.numpy()[0], 62 | net.rpn_reg_loss.cpu().data.numpy()[0], 63 | net.rcnn_cls_loss.cpu().data.numpy()[0], 64 | net.rcnn_reg_loss.cpu().data.numpy()[0], 65 | net.mask_cls_loss.cpu().data.numpy()[0], 66 | )) 67 | test_num += batch_size 68 | 69 | assert(test_num == len(test_loader.sampler)) 70 | test_acc = test_acc/test_num 71 | test_loss = test_loss/test_num 72 | return test_loss, test_acc 73 | 74 | 75 | 76 | #-------------------------------------------------------------- 77 | def run_train(): 78 | 79 | out_dir = RESULTS_DIR + '/mask-rcnn-gray-011a-debug' 80 | initial_checkpoint = \ 81 | RESULTS_DIR + '/mask-rcnn-gray-011a-debug/checkpoint/00072200_model.pth' 82 | # 83 | 84 | 85 | pretrain_file = None #imagenet pretrain 86 | ## setup ----------------- 87 | os.makedirs(out_dir +'/checkpoint', exist_ok=True) 88 | os.makedirs(out_dir +'/train', exist_ok=True) 89 | os.makedirs(out_dir +'/backup', exist_ok=True) 90 | backup_project_as_zip(PROJECT_PATH, out_dir +'/backup/code.train.%s.zip'%IDENTIFIER) 91 | 92 | log = Logger() 93 | log.open(out_dir+'/log.train.txt',mode='a') 94 | log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64)) 95 | log.write('** some experiment setting **\n') 96 | log.write('\tSEED = %u\n' % SEED) 97 | log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH) 98 | log.write('\tout_dir = %s\n' % out_dir) 99 | log.write('\n') 100 | 101 | 102 | ## net ---------------------- 103 | log.write('** net setting **\n') 104 | cfg = Configuration() 105 | net = MaskRcnnNet(cfg).cuda() 106 | 107 | if initial_checkpoint is not None: 108 | log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint) 109 | net.load_state_dict(torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)) 110 | 111 | elif pretrain_file is not None: 112 | log.write('\tpretrained_file = %s\n' % pretrain_file) 113 | #load_pretrain_file(net, pretrain_file) 114 | 115 | 116 | log.write('%s\n\n'%(type(net))) 117 | log.write('\n') 118 | 119 | 120 | 121 | ## optimiser ---------------------------------- 122 | iter_accum = 1 123 | batch_size = 4 ##NUM_CUDA_DEVICES*512 #256//iter_accum #512 
#2*288//iter_accum 124 | 125 | num_iters = 1000 *1000 126 | iter_smooth = 20 127 | iter_log = 50 128 | iter_valid = 100 129 | iter_save = [0, num_iters-1]\ 130 | + list(range(0,num_iters,100))#1*1000 131 | 132 | 133 | LR = None #LR = StepLR([ (0, 0.01), (200, 0.001), (300, -1)]) 134 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), 135 | lr=0.001/iter_accum, momentum=0.9, weight_decay=0.0001) 136 | 137 | start_iter = 0 138 | start_epoch= 0. 139 | if initial_checkpoint is not None: 140 | checkpoint = torch.load(initial_checkpoint.replace('_model.pth','_optimizer.pth')) 141 | start_iter = checkpoint['iter' ] 142 | start_epoch = checkpoint['epoch'] 143 | #optimizer.load_state_dict(checkpoint['optimizer']) 144 | 145 | 146 | ## dataset ---------------------------------------- 147 | log.write('** dataset setting **\n') 148 | 149 | train_dataset = XXXXXDataset( 150 | #'train1_ids_gray_only1_500', mode='train', 151 | 'valid1_ids_gray_only1_43', mode='train', 152 | transform = train_augment) 153 | train_loader = DataLoader( 154 | train_dataset, 155 | sampler = RandomSampler(train_dataset), 156 | #sampler = ConstantSampler(train_dataset,list(range(16))), 157 | batch_size = batch_size, 158 | drop_last = True, 159 | num_workers = 4, 160 | pin_memory = True, 161 | collate_fn = train_collate) 162 | 163 | 164 | valid_dataset = ScienceDataset( 165 | 'valid1_ids_gray_only1_43', mode='train', 166 | #'debug1_ids_gray_only1_10', mode='train', 167 | transform = valid_augment) 168 | valid_loader = DataLoader( 169 | valid_dataset, 170 | sampler = SequentialSampler(valid_dataset), 171 | batch_size = batch_size, 172 | drop_last = False, 173 | num_workers = 4, 174 | pin_memory = True, 175 | collate_fn = train_collate) 176 | 177 | log.write('\ttrain_dataset.split = %s\n'%(train_dataset.split)) 178 | log.write('\tvalid_dataset.split = %s\n'%(valid_dataset.split)) 179 | log.write('\tlen(train_dataset) = %d\n'%(len(train_dataset))) 180 | log.write('\tlen(valid_dataset) = %d\n'%(len(valid_dataset))) 181 | log.write('\tlen(train_loader) = %d\n'%(len(train_loader))) 182 | log.write('\tlen(valid_loader) = %d\n'%(len(valid_loader))) 183 | log.write('\tbatch_size = %d\n'%(batch_size)) 184 | log.write('\titer_accum = %d\n'%(iter_accum)) 185 | log.write('\tbatch_size*iter_accum = %d\n'%(batch_size*iter_accum)) 186 | log.write('\n') 187 | 188 | #log.write(inspect.getsource(train_augment)+'\n') 189 | #log.write(inspect.getsource(valid_augment)+'\n') 190 | #log.write('\n') 191 | 192 | if 0: # 193 | for inputs, truth_boxes, truth_labels, truth_instances, indices in valid_loader: 194 | 195 | batch_size, C,H,W = inputs.size() 196 | print(batch_size) 197 | 198 | images = inputs.cpu().numpy() 199 | for b in range(batch_size): 200 | image = (images[b].transpose((1,2,0))*255) 201 | image = np.clip(image.astype(np.float32)*3,0,255) 202 | 203 | image1 = image.copy() 204 | 205 | truth_box = truth_boxes[b] 206 | truth_label = truth_labels[b] 207 | truth_instance = truth_instances[b] 208 | if truth_box is not None: 209 | for box,label,instance in zip(truth_box,truth_label,truth_instance): 210 | x0,y0,x1,y1 = box.astype(np.int32) 211 | cv2.rectangle(image,(x0,y0),(x1,y1),(0,0,255),1) 212 | print(label) 213 | 214 | thresh = instance>0.5 215 | contour = thresh_to_inner_contour(thresh) 216 | contour = contour.astype(np.float32) *0.5 217 | 218 | image1 = contour[:,:,np.newaxis]*np.array((0,255,0)) + (1-contour[:,:,np.newaxis])*image1 219 | 220 | 221 | print('') 222 | 223 | 224 | image_show('image',image) 225 | 
image_show('image1',image1) 226 | cv2.waitKey(0) 227 | 228 | 229 | 230 | ## start training here! ############################################## 231 | log.write('** start training here! **\n') 232 | log.write(' optimizer=%s\n'%str(optimizer) ) 233 | log.write(' momentum=%f\n'% optimizer.param_groups[0]['momentum']) 234 | log.write(' LR=%s\n\n'%str(LR) ) 235 | 236 | log.write(' images_per_epoch = %d\n\n'%len(train_dataset)) 237 | log.write(' rate iter epoch num | valid_loss | train_loss | batch_loss | time \n') 238 | log.write('------------------------------------------------------------------------------------------------------------------------------------------------------------------\n') 239 | 240 | train_loss = np.zeros(6,np.float32) 241 | train_acc = 0.0 242 | valid_loss = np.zeros(6,np.float32) 243 | valid_acc = 0.0 244 | batch_loss = np.zeros(6,np.float32) 245 | batch_acc = 0.0 246 | rate = 0 247 | 248 | start = timer() 249 | j = 0 250 | i = 0 251 | 252 | 253 | for i in range(n_epochs): # loop over the dataset multiple times 254 | sum_train_loss = np.zeros(6, np.float32) 255 | sum_train_acc = 0.0 256 | sum = 0 257 | 258 | net.set_mode('train') 259 | optimizer.zero_grad() 260 | for inputs, truth_boxes, truth_labels, truth_instances, indices in train_loader: 261 | batch_size = len(indices) 262 | i = j/iter_accum + start_iter 263 | epoch = (i-start_iter)*batch_size*iter_accum/len(train_dataset) + start_epoch 264 | num_products = epoch*len(train_dataset) 265 | 266 | if i % iter_valid==0: 267 | net.set_mode('valid') 268 | valid_loss, valid_acc = evaluate(net, valid_loader) 269 | net.set_mode('train') 270 | 271 | print('\r',end='',flush=True) 272 | log.write('%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s\n' % (\ 273 | rate, i/1000, epoch, num_products/1000000, 274 | valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc, 275 | train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc, 276 | batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc, 277 | time_to_str((timer() - start)/60))) 278 | time.sleep(0.01) 279 | 280 | #if 1: 281 | if i in iter_save: 282 | torch.save(net.state_dict(),out_dir +'/checkpoint/%08d_model.pth'%(i)) 283 | torch.save({ 284 | 'optimizer': optimizer.state_dict(), 285 | 'iter' : i, 286 | 'epoch' : epoch, 287 | }, out_dir +'/checkpoint/%08d_optimizer.pth'%(i)) 288 | 289 | 290 | 291 | # learning rate schduler ------------- 292 | if LR is not None: 293 | lr = LR.get_rate(i) 294 | if lr<0 : break 295 | adjust_learning_rate(optimizer, lr/iter_accum) 296 | rate = get_learning_rate(optimizer)[0]*iter_accum 297 | 298 | 299 | 300 | 301 | # one iteration update ------------- 302 | inputs = Variable(inputs).cuda() 303 | net( inputs, truth_boxes, truth_labels, truth_instances ) 304 | loss = net.loss( inputs, truth_boxes, truth_labels, truth_instances ) 305 | 306 | 307 | if 1: # 308 | debug_and_draw(net, inputs, truth_boxes, truth_labels, truth_instances, mode='test') 309 | 310 | # masks = (probs>0.5).float() 311 | # acc = dice_loss(masks, labels) 312 | 313 | 314 | # accumulated update 315 | loss.backward() 316 | if j%iter_accum == 0: 317 | #torch.nn.utils.clip_grad_norm(net.parameters(), 1) 318 | optimizer.step() 319 | optimizer.zero_grad() 320 | 321 | 322 | # print statistics ------------ 323 | batch_acc = 0 #acc[0][0] 324 | batch_loss = 
np.array((
325 |                 loss.cpu().data.numpy()[0],
326 |                 net.rpn_cls_loss.cpu().data.numpy()[0],
327 |                 net.rpn_reg_loss.cpu().data.numpy()[0],
328 |                 net.rcnn_cls_loss.cpu().data.numpy()[0],
329 |                 net.rcnn_reg_loss.cpu().data.numpy()[0],
330 |                 net.mask_cls_loss.cpu().data.numpy()[0],
331 |             ))
332 |             sum_train_loss += batch_loss
333 |             sum_train_acc  += batch_acc
334 |             sum += 1
335 |             if i % iter_smooth == 0:
336 |                 train_loss = sum_train_loss / sum
337 |                 train_acc  = sum_train_acc  / sum
338 |                 sum_train_loss = np.zeros(6, np.float32)
339 |                 sum_train_acc  = 0.
340 |                 sum = 0
341 | 
342 | 
343 |             print('\r%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s %d,%d,%s' % (\
344 |                          rate, i/1000, epoch, num_products/1000000,
345 |                          valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc,
346 |                          train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc,
347 |                          batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc,
348 |                          time_to_str((timer() - start)/60), i, j, str(inputs.size())), end='', flush=True)
349 |             j = j + 1
350 | 
351 | 
352 | 
353 |         pass  #-- end of one data loader --
354 |     pass  #-- end of all iterations --
355 | 
356 | 
357 |     if 1:  # save the last checkpoint
358 |         torch.save(net.state_dict(), out_dir + '/checkpoint/%d_model.pth' % (i))
359 |         torch.save({
360 |             'optimizer': optimizer.state_dict(),
361 |             'iter' : i,
362 |             'epoch': epoch,
363 |         }, out_dir + '/checkpoint/%d_optimizer.pth' % (i))
364 | 
365 |     log.write('\n')
366 | 
367 | def train(model, criterion, optimizer, n_epochs, T):
368 |     # Train the model on the VOT2017 video sequences
369 |     vot_data = VOT2017_dataset(csv_file='F:/vot2017/list.txt',
370 |                                root_dir='F:/vot2017/')
371 |     train_loader = DataLoader(vot_data)  # iterating over this yields one video at a time
372 |     # (each item is the dict returned by VOT2017_dataset:
373 |     #  {'Video': image stack, 'gt': ground-truth boxes})
374 | 
375 |     for epoch in range(n_epochs):
376 | 
377 |         for i, video in enumerate(train_loader):
378 |             # video is a dict holding the image sequence and its ground truth
379 |             image_stack = video['Video']  # shape (nr. of images, h, w, RGB)
380 |             masks = video['gt']
381 |             current_pos_of_t = 0  # start from the beginning of the video
382 |             for t in range(T):
383 |                 # take a T-image sub-sequence out of the video
384 |                 if current_pos_of_t + T > image_stack.shape[0]:
385 |                     # skip incomplete windows so we don't get an index error (crude handling of the sequence end)
386 |                     continue
387 |                 image_stack_temp = image_stack[current_pos_of_t: current_pos_of_t + T, :, :, :]
388 |                 masks_temp = masks[current_pos_of_t: current_pos_of_t + T, :, :]  # check FORMATS!
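                # A cleaner (hypothetical) way to walk the video in fixed-length windows is to
                # pre-compute the start indices instead of relying on `continue`, e.g.:
                #     for s in range(0, image_stack.shape[0] - T + 1, T):
                #         image_stack_temp = image_stack[s:s + T]
                #         masks_temp      = masks[s:s + T]
                # This visits every complete window and drops at most the last T-1 frames.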
389 |                 current_pos_of_t += T  # next iteration, take the next T-image sequence
390 |                 reward_list = []
391 |                 b_t_list = []
392 |                 for image, mask in zip(image_stack_temp, masks_temp):
393 |                     # iterate over the first dimension of image_stack_temp (the number of images);
394 |                     # image is one particular frame of the sequence, mask its corresponding
395 |                     # ground-truth annotation
396 |                     image = Variable(image.view(-1, sequence_length, input_size))
397 |                     mask = Variable(mask)
398 | 
399 |                     # Forward + Backward + Optimize
400 |                     optimizer.zero_grad()  # reset gradients
401 |                     outputs = DLRTnet(image)  # observation network
402 |                     out, hidden = LSTM(outputs, hidden)  # the LSTM gets the output of DLRTnet and its previous hidden state
403 |                     # GaussianLayer takes the hidden state and samples N locations from the last 4 / 8 numbers
404 |                     l_t = GaussianLayer(hidden)  # l_t contains N sampled locations
405 | 
406 |                     reward = 0
407 |                     for mask_ in l_t:
408 |                         reward += LOSSFUNCTION(mask_, mask)  # loss/reward function 1 from the paper
409 |                     reward_list.append(reward)   # this list contains r1, r2, r3, ..., rT
410 |                     b_t_list.append(reward / N)  # this contains b1, b2, ..., bT
411 | 
412 |                     # Compute gradient
413 |                     image_reward = np.asarray(reward_list)
414 |                     image_base = np.asarray(b_t_list)
415 | 
416 | 
417 |                     loss.backward()  # the crucial step here is to implement the backward pass of GaussianLayer using reward_list
418 |                     optimizer.step()
419 | 
420 |             if (i+1) % 100 == 0:
421 |                 print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
422 |                       % (epoch+1, n_epochs, i+1, len(vot_data)//batch_size, loss.data[0]))
423 | 
424 | 
425 | 
426 | 
427 | # main #################################################################
428 | if __name__ == '__main__':
429 |     print('%s: calling main function ... ' % os.path.basename(__file__))
430 | 
431 |     run_train()
432 | 
433 |     print('\nsuccess!')
434 | 
435 | 
436 | 
437 | # ffmpeg -f image2 -pattern_type glob -r 33 -i "iterations/*.png" -c:v libx264 iterations.mp4
438 | #
439 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | from common import *
5 | 
6 | #%% Convert x,y,w,h ground-truth boxes to the four corner coordinates
7 | 
8 | def calc_coords(gt_wh):
9 |     gt_new = np.zeros((gt_wh.shape[0], 8))
10 |     print(gt_new.shape)
11 |     print(gt_wh.shape)
12 |     print(gt_wh[:,0])
13 |     gt_new[:,0] = gt_wh[:,0]              # x1
14 |     gt_new[:,1] = gt_wh[:,1]              # y1
15 |     gt_new[:,2] = gt_wh[:,0] + gt_wh[:,2] # x2
16 |     gt_new[:,3] = gt_wh[:,1]              # y2
17 |     gt_new[:,4] = gt_wh[:,0] + gt_wh[:,2] # x3
18 |     gt_new[:,5] = gt_wh[:,1] + gt_wh[:,3] # y3
19 |     gt_new[:,6] = gt_wh[:,0]              # x4
20 |     gt_new[:,7] = gt_wh[:,1] + gt_wh[:,3] # y4
21 | 
22 |     return gt_new
23 | #%% Dataset class
24 | 
25 | '''Output is the image sequence as an np.array, and the ground truth as well.'''
26 | 
27 | class VOT2017_dataset(Dataset):
28 |     """This is the VOT2017 dataset"""
29 |     def __init__(self, csv_file, root_dir, transform=None):
30 |         """ Arguments:
31 |                 csv_file(string): Path to the list file where all videos are listed
32 |                 root_dir(string): Directory with all the videos
33 |                 transform(callable, optional): Transform to be applied to a sample
34 |         """
35 |         self.vot_list = pd.read_csv(csv_file, header=None)
36 |         self.root_dir = root_dir
37 |         self.transform = transform
38 | 
39 |     # Returns the nr. of videos available
40 |     def __len__(self):
41 |         return len(self.vot_list)
42 | 
43 |     # Return the complete video sequence
44 |     def __getitem__(self, vid_idx):
45
| """ Arguments: 46 | vid_idx(int): Video Index to be fetched form the video list 47 | """ 48 | vid_name_path = os.path.join(self.root_dir, 49 | self.vot_list.iloc[vid_idx,0], 50 | '*.jpg') 51 | 52 | gt = pd.read_csv(os.path.join(self.root_dir, 53 | self.vot_list.iloc[vid_idx,0], 54 | 'groundtruth.txt'), header = None) 55 | 56 | im_seq = imread_collection(vid_name_path) 57 | 58 | # Image collection to np.array 59 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 60 | 61 | # Also convert the gt to np.array 62 | gt = gt.values 63 | 64 | if gt.shape[1] == 4: 65 | gt = calc_coords(gt) 66 | 67 | 68 | sample = {'Video': images, 'gt': gt} 69 | 70 | # Cant tell yet what this is for 71 | if self.transform: 72 | sample = self.transform(sample) 73 | return sample 74 | 75 | 76 | #%% 77 | 78 | # Draws a rectangle given the coordinates of all four corners in one array 79 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 80 | def draw_gt(im, coords): 81 | """ Arguments: 82 | im = image 83 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 84 | """ 85 | plt.imshow(im) 86 | Xs = coords[::2] # Save Xcoords 87 | Ys = coords[1::2] # Save Ycoords 88 | for i in range(4): 89 | if i < 3: 90 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 91 | elif i == 3: 92 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 93 | plt.show() 94 | 95 | 96 | #%% Transforms - Rescale/Resize 97 | 98 | # Rescaling of an image so that we can feed it with the same size into a network 99 | # Also the groundtruth boxes have to be rescaled accordingly! 100 | # Problem atm rescale the whole Video!! - so far only with for loop 101 | 102 | class Rescale(object): 103 | ''' 104 | Rescale the image in a sample to a given size 105 | 106 | Arguments: output_size(tuple): Desired output size 107 | idx(int) : For now idx of the image to be resized 108 | ''' 109 | 110 | # Check if output_size is a tuple, 111 | # maybe also assert if it isnt bigger than the smallest image? 112 | def __init__(self, output_size): 113 | assert isinstance(output_size,(tuple)) 114 | self.output_size = output_size 115 | 116 | def __call__(self, sample): 117 | # Split the sample in video and gt 118 | images, gt = sample['Video'], sample['gt'] 119 | nr = len(images) # Save the amount of images to iterate over 120 | print(nr) 121 | # Save heigth and widthim of video 122 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 123 | 124 | new_h, new_w = self.output_size 125 | 126 | 127 | # I dont like this part due to the for loop.! 128 | # Initialize the resized image sequence array 129 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 130 | # Iterate over all images and resize them to the given scale. 
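        # Note: skimage.transform.resize returns a float image rescaled to [0, 1], so the
        # resized stack `img` no longer holds uint8 pixel values. The ground-truth corners
        # are rescaled right after this loop by multiplying every x-coordinate by new_w/w
        # and every y-coordinate by new_h/h.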
131 | for i in range(nr): 132 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 133 | 134 | 135 | # Here the groundtruth boxes are rescaled aswell 136 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 137 | 138 | return {'Video': img, 'gt': gt_new} 139 | 140 | #%% Transforms - ToTensor 141 | 142 | # Transform the loaded image collection to Tensors 143 | 144 | class ToTensor(object): 145 | '''Convert sample to tensor''' 146 | def __call__(self, sample): 147 | # Load the sample and split it 148 | images, gt = sample['Video'], sample['gt'] 149 | 150 | # swap color axis because 151 | # numpy image: H x W x C 152 | # torch image: C X H X W 153 | # How does this relate to videos/imagesequences? 154 | images = images.transpose((0,3,1,2)) 155 | 156 | return {'Video': torch.from_numpy(images), 157 | 'gt': torch.from_numpy(gt)} 158 | 159 | #%% Just some test - care the directories 160 | 161 | # Load the list of all videos 162 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 163 | 164 | # Name of a video can be accessed by e.g. vid_list[0][5] 165 | print( vid_list[0][33]) 166 | 167 | # Nr. of videos available 168 | n = vid_list.shape[0] 169 | 170 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 171 | root_dir= 'F:/vot2017/') 172 | 173 | # E.g. load the second video of the vid_list 174 | sample = test[2] 175 | 176 | # Simply draw a single video - here the idx refers to the image in the sequence 177 | draw_gt(sample['Video'][0], sample['gt'][0]) 178 | 179 | # Test rescaling 180 | scale = Rescale((220,280)) 181 | transformed_sample = scale(sample) 182 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 183 | 184 | # Test the ToTensor - does not work on Nils Laptop 185 | tens = ToTensor() 186 | tens(sample) 187 | 188 | #%% Just for information - Find the smallest sized video 189 | 190 | Vids = vid_list.shape[0] 191 | 192 | Size = np.zeros((Vids, 2)) 193 | 194 | for i in range(Vids): 195 | im = Image.open(os.path.join('F:/vot2017/', 196 | vid_list.iloc[i,0], 197 | '00000001.jpg')) 198 | Size[i,0] = im.size[0] 199 | Size[i,1] = im.size[1] 200 | 201 | # Smallest size ist 320 x240 202 | 203 | #%% define loss/reward functions; given the coordinates of all corners 204 | 205 | def reward_v1(pred, gt): 206 | r = - np.mean(np.absolute(pred-gt)) - np.max(np.absolute(pred-gt)) 207 | return r 208 | 209 | 210 | # Calculate the reward given all the for corners x1,y1,x2,y2,x3,y3,x4,y4 211 | def reward_v2(pred, gt): 212 | #reorder the coord in tuples for the polygon 213 | pred_re = [(pred[0],pred[1]),(pred[2],pred[3]), (pred[4],pred[5]),(pred[6],pred[7])] 214 | gt_re = [(gt[0],gt[1]),(gt[2],gt[3]),(gt[4],gt[5]),(gt[6],gt[7])] 215 | 216 | pred_poly = Polygon(pred_re) 217 | gt_poly = Polygon(gt_re) 218 | # Reward == Intersection/total area 219 | r = pred_poly.intersection(gt_poly).area/(pred_poly.area + gt_poly.area - pred_poly.intersection(gt_poly).area) 220 | return r 221 | 222 | #%% Test reward functions 223 | 224 | test_1 = np.array((0,0,0,1,1,1,1,0)) 225 | test_2 = np.array((0.5,0,0.5,1,1.5,1,1.5,0)) 226 | 227 | print(reward_v1(test_1, test_2)) 228 | print(reward_v2(test_1, test_2)) 229 | 230 | 231 | #%% 232 | 233 | class GaussianLayer(torch.autograd.Function): 234 | '''Implement custom gaussian layer here''' 235 | 236 | 237 | @staticmethod 238 | def forward(self, inp, sigma, N): 239 | ''' 240 | We receive a Tensor input(take last 8 nr) from that we will draw outputs from a gaussian 241 | distribution, centered 
around our inputs, with sigma * identity as the covariance matrix;
242 |         N is the number of samples we draw
243 |         '''
244 |         cov = np.identity(8) * sigma                               # covariance matrix
245 |         output = np.random.multivariate_normal(inp[-8:], cov, N)   # sample from the Gaussian
246 |         output = torch.from_numpy(output)                          # to tensor
247 |         return output
248 | 
249 |     @staticmethod
250 |     def backward(self, grad):
251 |         '''
252 |         Given grad as a tensor (8 values)
253 |         '''
254 |         self.grad = grad
255 |         return grad
256 | 
257 | 
258 | #%%
259 | 
260 | def calc_grad(pred_l, output, R_t, b_t, N, sigma):
261 |     '''
262 |     We receive the sampled locations pred_l (8 x N x T) and the predicted location output (8 values),
263 |     as well as our baseline b_t (T) and the cumulative rewards R_t (N x T);
264 |     from these we compute the policy-gradient estimate.
265 |     '''
266 |     b_t = b_t.reshape((1, -1))              # row vector; broadcasts against the N x T rewards
267 |     output = output.reshape((8, 1, 1))      # column; broadcasts against pred_l (8 x N x T)
268 |     diff = R_t - b_t                        # N x T
269 |     # score function of the Gaussian policy: d/d mu  log N(x | mu, sigma^2 I) = (x - mu) / sigma^2
270 |     ln_pi = (pred_l - output) / sigma**2    # 8 x N x T
271 |     product = ln_pi * diff                  # 8 x N x T
272 |     grad_G = 1/N * np.sum(product, axis=(1, 2))  # average over the N samples, sum over time -> 8 values
273 |     grad_G = torch.from_numpy(grad_G)       # to torch
274 |     return grad_G
275 | 
276 | 
277 | 
278 | 
279 | #%%
280 | 
281 | # data reader ----------------------------------------------------------------
282 | class XXXXXDataset(Dataset):
283 | 
284 |     def __init__(self, split, transform=None, mode='train'):
285 |         super(XXXXXDataset, self).__init__()
286 |         start = timer()
287 | 
288 |         self.split = split
289 |         self.transform = transform
290 |         self.mode = mode
291 | 
292 |         # read split
293 |         ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#')
294 | 
295 |         # save
296 |         self.ids = ids
297 | 
298 |         # print
299 |         print('\ttime = %0.2f min'%((timer() - start) / 60))
300 |         print('\tnum_ids = %d'%(len(self.ids)))
301 |         print('')
302 | 
303 | 
304 |     def __getitem__(self, index):
305 |         id = self.ids[index]
306 |         image_id = id.split('/')[-1]
307 |         image = cv2.imread(DATA_DIR + '/image/' + id + '/images/' + image_id + '.png', cv2.IMREAD_COLOR)
308 | 
309 |         if self.mode in ['train']:
310 |             multi_mask = np.load(DATA_DIR + '/image/' + id + '/multi_mask.npy')  #.astype(int32)
311 | 
312 |             if self.transform is not None:
313 |                 return self.transform(image, multi_mask, index)
314 |             else:
315 |                 return image, multi_mask, index
316 | 
317 |         if self.mode in ['test']:
318 |             if self.transform is not None:
319 |                 return self.transform(image, index)
320 |             else:
321 |                 return image, index
322 | 
323 |     def __len__(self):
324 |         return len(self.ids)
--------------------------------------------------------------------------------
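The utilities above stop short of wiring the Gaussian sampling, the IoU reward and the baseline into a single gradient step. Below is a minimal numpy-only sketch of that REINFORCE-style estimator, under the same assumptions as calc_grad (Gaussian policy with fixed sigma, mean sample reward as baseline). iou_reward mirrors reward_v2 above, and reinforce_grad_sketch is an illustrative name, not a function from this repository.

import numpy as np
from shapely.geometry import Polygon

def iou_reward(pred, gt):
    # same idea as reward_v2 above: intersection over union of the two quadrilaterals
    p = Polygon([(pred[0], pred[1]), (pred[2], pred[3]), (pred[4], pred[5]), (pred[6], pred[7])])
    g = Polygon([(gt[0], gt[1]), (gt[2], gt[3]), (gt[4], gt[5]), (gt[6], gt[7])])
    inter = p.intersection(g).area
    return inter / (p.area + g.area - inter)

def reinforce_grad_sketch(pred_loc, gt, sigma=0.05, N=16, seed=0):
    """Score-function (REINFORCE) estimate of d E[reward] / d pred_loc,
    using the mean sample reward as the baseline."""
    rng = np.random.RandomState(seed)
    samples = rng.multivariate_normal(pred_loc, np.identity(8) * sigma**2, size=N)  # N x 8
    rewards = np.array([iou_reward(s, gt) for s in samples])                        # N
    baseline = rewards.mean()
    # d/d mu of log N(x | mu, sigma^2 I) = (x - mu) / sigma^2
    score = (samples - pred_loc) / sigma**2                                         # N x 8
    return ((rewards - baseline)[:, None] * score).mean(axis=0)                     # 8 values

# Example with the two unit boxes from the reward tests above:
# grad = reinforce_grad_sketch(np.array((0, 0, 0, 1, 1, 1, 1, 0), dtype=float),
#                              np.array((0.5, 0, 0.5, 1, 1.5, 1, 1.5, 0), dtype=float))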