├── .gitattributes ├── README.md ├── common.py ├── dataset ├── DataLoader.py ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── __init__.cpython-36.pyc │ ├── audio_processing_kg.cpython-36.pyc │ ├── audio_processing_tf.cpython-36.pyc │ ├── cdiscount_feature_dataset.cpython-36.pyc │ ├── cdiscount_feature_set_dataset.cpython-36.pyc │ ├── cdiscount_image_dataset.cpython-36.pyc │ ├── reader.cpython-35.pyc │ ├── reader.cpython-36.pyc │ ├── sampler.cpython-35.pyc │ ├── sampler.cpython-36.pyc │ ├── transform.cpython-35.pyc │ └── transform.cpython-36.pyc ├── process.py ├── reader.py ├── sampler.py └── transform.py ├── model ├── __init__.py ├── configuration.py ├── dlrt.py └── yolo-voc.cfg ├── train.py └── utils.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deep Reinforcement Learning for Visual Object Tracking in Videos 2 | -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | # edit settings here 2 | ROOT_DIR ='' 3 | 4 | 5 | 6 | DATA_DIR = '' 7 | RESULTS_DIR = ROOT_DIR + '/results' 8 | 9 | ##--------------------------------------------------------------------- 10 | import os 11 | import copy 12 | from datetime import datetime 13 | PROJECT_PATH = os.path.dirname(os.path.realpath(__file__)) 14 | IDENTIFIER = datetime.now().strftime('%Y-%m-%d_%H-%M-%S') 15 | 16 | #numerical libs 17 | import math 18 | import numpy as np 19 | import random 20 | import PIL 21 | from PIL import Image # import jpg in python 22 | import cv2 23 | 24 | import matplotlib 25 | matplotlib.use('TkAgg') 26 | #matplotlib.use('Qt4Agg') 27 | #matplotlib.use('Qt5Agg') 28 | 29 | 30 | # torch libs 31 | import torch 32 | import torchvision.transforms as transforms 33 | from torch.utils.data.dataset import Dataset 34 | from torch.utils.data import DataLoader 35 | from torch.utils.data.sampler import * 36 | 37 | import torch 38 | import torch.nn as nn 39 | import torch.nn.functional as F 40 | from torch.autograd import Variable 41 | import torch.optim as optim 42 | from torch.optim import lr_scheduler 43 | from torch.nn.parallel.data_parallel import data_parallel 44 | from torch.utils.data import Dataset 45 | import torchvision 46 | 47 | 48 | # std libs 49 | import collections 50 | import numbers 51 | import inspect 52 | import shutil 53 | from timeit import default_timer as timer 54 | from __future__ import print_function, division 55 | 56 | 57 | 58 | import csv 59 | import pandas as pd 60 | import pickle 61 | import glob 62 | import sys 63 | from distutils.dir_util import copy_tree 64 | import time 65 | import matplotlib.pyplot as plt 66 | 67 | import skimage 68 | import skimage.color 69 | from skimage import io, transform 70 | # import all images from a folder, see the dataloader 71 | from skimage.io import imread_collection, imread, concatenate_images 72 | from scipy import ndimage 73 | from shapely.geometry import Polygon # for the Polygon 74 | 75 | 76 | 77 | #--------------------------------------------------------------------------------- 78 | print('@%s: ' % os.path.basename(__file__)) 79 | 80 | if 1: 81 | SEED=35202#1510302253 #int(time.time()) # 82 | random.seed(SEED) 83 | 
np.random.seed(SEED) 84 | torch.manual_seed(SEED) 85 | torch.cuda.manual_seed_all(SEED) 86 | print ('\tset random seed') 87 | print ('\t\tSEED=%d'%SEED) 88 | 89 | if 1: 90 | torch.backends.cudnn.benchmark = True ##uses the inbuilt cudnn auto-tuner to find the fastest convolution algorithms. - 91 | torch.backends.cudnn.enabled = True 92 | print ('\tset cuda environment') 93 | #print ('\t\ttorch.__version__ =', torch.__version__) 94 | #print ('\t\ttorch.version.cuda =', torch.version.cuda) 95 | print ('\t\ttorch.backends.cudnn.version() =', torch.backends.cudnn.version()) 96 | try: 97 | print ('\t\tos[\'CUDA_VISIBLE_DEVICES\'] =',os.environ['CUDA_VISIBLE_DEVICES']) 98 | NUM_CUDA_DEVICES = len(os.environ['CUDA_VISIBLE_DEVICES'].split(',')) 99 | except Exception: 100 | print ('\t\tos[\'CUDA_VISIBLE_DEVICES\'] =','None') 101 | NUM_CUDA_DEVICES = 1 102 | 103 | print ('\t\ttorch.cuda.device_count() =', torch.cuda.device_count()) 104 | print ('\t\ttorch.cuda.current_device() =', torch.cuda.current_device()) 105 | 106 | 107 | print('') 108 | 109 | #--------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /dataset/DataLoader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Feb 18 13:44:42 2018 4 | 5 | @author: Einmal 6 | """ 7 | 8 | # Build a dataset loader according to 9 | # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 10 | from __future__ import print_function, division 11 | 12 | import pandas as pd 13 | import torch 14 | import torch.nn as nn 15 | import torch.optim as optim 16 | from torch.optim import lr_scheduler 17 | from torch.autograd import Variable 18 | from skimage import io, transform 19 | import torchvision 20 | from torch.utils.data import Dataset 21 | import numpy as np 22 | import matplotlib.pyplot as plt 23 | import matplotlib.patches 24 | import time 25 | import os 26 | import copy 27 | from PIL import Image # import jpg in python 28 | from skimage.io import imread_collection, imread, concatenate_images # import all images from a folder, see the dataloader 29 | 30 | 31 | 32 | 33 | #%% 34 | 35 | # Load the list of all videos 36 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 37 | 38 | # Name of a video can be accessed by e.g. vid_list[0][5] 39 | print( vid_list[0][33]) 40 | 41 | # Nr. 
of videos available 42 | n = vid_list.shape[0] 43 | 44 | 45 | #%% Test drawing 46 | 47 | # Load the gt boxes, which are apparently stored in coordinates of 4 points 48 | gt = pd.read_csv('F:/vot2017/ants1/groundtruth.txt', header = None) 49 | 50 | 51 | # Transform dataset into array 52 | # Not neccessary dt.iloc[x] does it 53 | 54 | #im = Image.open('F:/vot2017/ants1/00000092.jpg') 55 | 56 | # Draws a rectangle given the coordinates of all four corners in one array 57 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 58 | def draw_gt(im, coords): 59 | """ Arguments: 60 | im = image 61 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 62 | """ 63 | plt.imshow(im) 64 | 65 | Xs = coords[::2] # Save Xcoords 66 | Ys = coords[1::2] # Save Ycoords 67 | for i in range(4): 68 | if i < 3: 69 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 70 | elif i == 3: 71 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 72 | plt.show() 73 | 74 | 75 | #draw_gt(im, gt.iloc[91]) 76 | #Check 77 | 78 | #%% Recalculate x,y,w,h to the four corner coordinates 79 | 80 | def calc_coords(gt_wh): 81 | gt_new = np.zeros((gt_wh.shape[0],8)) 82 | print(gt_new.shape) 83 | print(gt_wh.shape) 84 | print(gt_wh[:,0]) 85 | gt_new[:,0] = gt_wh[:,0] # x1 86 | gt_new[:,1] = gt_wh[:,1] # y1 87 | gt_new[:,2] = gt_wh[:,0] + gt_wh[:,2] # x2 88 | gt_new[:,3] = gt_wh[:,1] # y2 89 | gt_new[:,4] = gt_wh[:,0] + gt_wh[:,2] # x3 90 | gt_new[:,5] = gt_wh[:,1] + gt_wh[:,3] # y3 91 | gt_new[:,6] = gt_wh[:,0] # x4 92 | gt_new[:,7] = gt_wh[:,1] + gt_wh[:,3] # y4 93 | 94 | return gt_new 95 | #%% Dataset class 96 | 97 | '''Output is the imagesequence in an np.array format and the gt aswell.''' 98 | 99 | class VOT2017_dataset(Dataset): 100 | """This is the VOT2017 dataset""" 101 | def __init__(self, csv_file, root_dir, transform = None): 102 | """ Arguments: 103 | csv_file(string): Path to list file, where all videos are listed 104 | root_dir(string): Directory with all the videos 105 | transform(callable, optional): Will transform on a sample(for pytorch I guess) 106 | """ 107 | self.vot_list = pd.read_csv(csv_file, header = None) 108 | self.root_dir = root_dir 109 | self.transform = transform 110 | 111 | # Returns the nr of videos available 112 | def __len__(self): 113 | return len(self.vot_list) 114 | 115 | # Return the complete video sequence 116 | def __getitem__(self, vid_idx): 117 | """ Arguments: 118 | vid_idx(int): Video Index to be fetched form the video list 119 | """ 120 | vid_name_path = os.path.join(self.root_dir, 121 | self.vot_list.iloc[vid_idx,0], 122 | '*.jpg') 123 | 124 | gt = pd.read_csv(os.path.join(self.root_dir, 125 | self.vot_list.iloc[vid_idx,0], 126 | 'groundtruth.txt'), header = None) 127 | 128 | im_seq = imread_collection(vid_name_path) 129 | 130 | # Image collection to np.array 131 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 132 | 133 | # Also convert the gt to np.array 134 | gt = gt.values 135 | 136 | if gt.shape[1] == 4: 137 | gt = calc_coords(gt) 138 | 139 | 140 | sample = {'Video': images, 'gt': gt} 141 | 142 | # Cant tell yet what this is for 143 | if self.transform: 144 | sample = self.transform(sample) 145 | return sample 146 | 147 | 148 | #%% Test the dataset class 149 | 150 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 151 | root_dir= 'F:/vot2017/') 152 | 153 | # E.g. 
load the second video of the vid_list 154 | sample = test[14] 155 | 156 | # Simply draw a single video - here the idx refers to the image in the sequence 157 | draw_gt(sample['Video'][0], sample['gt'][0]) 158 | 159 | 160 | #%% Just for information - Find the smallest sized video 161 | 162 | Vids = vid_list.shape[0] 163 | 164 | Size = np.zeros((Vids, 2)) 165 | 166 | for i in range(Vids): 167 | im = Image.open(os.path.join('F:/vot2017/', 168 | vid_list.iloc[i,0], 169 | '00000001.jpg')) 170 | Size[i,0] = im.size[0] 171 | Size[i,1] = im.size[1] 172 | 173 | # Smallest size ist 320 x240 174 | #Histogram of image sizes 175 | plt.hist(Size) 176 | 177 | 178 | 179 | 180 | #%% Transforms - Rescale/Resize 181 | 182 | # Rescaling of an image so that we can feed it with the same size into a network 183 | # Also the groundtruth boxes have to be rescaled accordingly! 184 | # Problem atm rescale the whole Video!! - so far only with for loop 185 | 186 | class Rescale(object): 187 | ''' 188 | Rescale the image in a sample to a given size 189 | 190 | Arguments: output_size(tuple): Desired output size 191 | idx(int) : For now idx of the image to be resized 192 | ''' 193 | 194 | # Check if output_size is a tuple, 195 | # maybe also assert if it isnt bigger than the smallest image? 196 | def __init__(self, output_size): 197 | assert isinstance(output_size,(tuple)) 198 | self.output_size = output_size 199 | 200 | def __call__(self, sample): 201 | # Split the sample in video and gt 202 | images, gt = sample['Video'], sample['gt'] 203 | nr = len(images) # Save the amount of images to iterate over 204 | print(nr) 205 | # Save heigth and width of video 206 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 207 | 208 | new_h, new_w = self.output_size 209 | 210 | 211 | # I dont like this part due to the for loop.! 212 | # Initialize the resized image sequence array 213 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 214 | # Iterate over all images and resize them to the given scale. 215 | for i in range(nr): 216 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 217 | 218 | 219 | # Here the groundtruth boxes are rescaled aswell 220 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 221 | 222 | return {'Video': img, 'gt': gt_new} 223 | 224 | 225 | 226 | #%% Test rescaling 227 | 228 | scale = Rescale((220,280)) 229 | 230 | 231 | transformed_sample = scale(sample) 232 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 233 | 234 | # Check 235 | 236 | #%% Transforms - ToTensor 237 | 238 | # Transform the loaded image collection to Tensors 239 | 240 | class ToTensor(object): 241 | '''Convert sample to tensor''' 242 | def __call__(self, sample): 243 | # Load the sample and split it 244 | images, gt = sample['Video'], sample['gt'] 245 | 246 | # swap color axis because 247 | # numpy image: H x W x C 248 | # torch image: C X H X W 249 | # How does this relate to videos/imagesequences? 
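        # For an image sequence the array is (N, H, W, C); PyTorch convolutions
        # expect channels before the spatial dims, so each frame becomes C x H x W
        # while the frame axis N stays first, hence transpose((0, 3, 1, 2)).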
250 | images = images.transpose((0,3,1,2)) 251 | 252 | return {'Video': torch.from_numpy(images), 253 | 'gt': torch.from_numpy(gt)} 254 | 255 | #%% Test the ToTensor 256 | 257 | tens = ToTensor() 258 | tens(sample) 259 | 260 | # Still have to test this 261 | ======= 262 | # -*- coding: utf-8 -*- 263 | """ 264 | Created on Sun Feb 18 13:44:42 2018 265 | 266 | @author: Einmal 267 | """ 268 | 269 | # Build a dataset loader according to 270 | # http://pytorch.org/tutorials/beginner/data_loading_tutorial.html 271 | from __future__ import print_function, division 272 | 273 | import pandas as pd 274 | import torch 275 | import torch.nn as nn 276 | import torch.optim as optim 277 | from torch.optim import lr_scheduler 278 | from torch.autograd import Variable 279 | from skimage import io, transform 280 | import torchvision 281 | from torch.utils.data import Dataset 282 | import numpy as np 283 | import matplotlib.pyplot as plt 284 | import matplotlib.patches 285 | import time 286 | import os 287 | import glob 288 | import copy 289 | from PIL import Image # import jpg in python 290 | from skimage.io import imread_collection, imread, concatenate_images # import all images from a folder, see the dataloader 291 | import shapely 292 | 293 | #%% 294 | 295 | # Load the list of all videos 296 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 297 | 298 | # Name of a video can be accessed by e.g. vid_list[0][5] 299 | print( vid_list[0][33]) 300 | 301 | # Nr. of videos available 302 | n = vid_list.shape[0] 303 | 304 | 305 | #%% Test 306 | 307 | test =os.path.join('F:/vot2017/', 308 | vid_list.iloc[0,0]) 309 | 310 | included_extension = ['jpg'] 311 | file_names = [fn for fn in os.listdir(test) 312 | if any(fn.endswith(ext) for ext in included_extension)] 313 | #%% Is rectangular 314 | 315 | #for i in range(vid_list.shape[0]): 316 | gt = pd.read_csv(os.path.join('F:/vot2017/', 317 | vid_list.iloc[3,0], 318 | 'groundtruth.txt'), header = None) 319 | gt = gt.values 320 | Xs = gt[:,::2] # Save Xcoords 321 | Ys = gt[:,1::2] # Save Ycoords 322 | print(Xs) 323 | #%% Test drawing 324 | 325 | # Load the gt boxes, which are apparently stored in coordinates of 4 points 326 | gt = pd.read_csv('F:/vot2017/ball1/groundtruth.txt', header = None) 327 | gt = gt.values 328 | 329 | # Transform dataset into array 330 | # Not neccessary dt.iloc[x] does it 331 | 332 | im = Image.open('F:/vot2017/ball1/00000095.jpg') 333 | 334 | # Draws a rectangle given the coordinates of all four corners in one array 335 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 336 | def draw_gt(im, coords): 337 | """ Arguments: 338 | im = image 339 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 340 | """ 341 | plt.imshow(im) 342 | Xs = coords[::2] # Save Xcoords 343 | Ys = coords[1::2] # Save Ycoords 344 | for i in range(4): 345 | if i < 3: 346 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 347 | elif i == 3: 348 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 349 | plt.show() 350 | 351 | 352 | draw_gt(im, gt[94]) 353 | #Check 354 | 355 | 356 | 357 | #%% Dataset class 358 | 359 | '''Output is the imagesequence in an np.array format and the gt aswell.''' 360 | 361 | class VOT2017_dataset(Dataset): 362 | """This is the VOT2017 dataset""" 363 | def __init__(self, csv_file, root_dir, transform = None): 364 | """ Arguments: 365 | csv_file(string): Path to list file, where all videos are listed 366 | root_dir(string): Directory with all the 
videos 367 | transform(callable, optional): Will transform on a sample(for pytorch I guess) 368 | 369 | """ 370 | self.vot_list = pd.read_csv(csv_file, header = None) 371 | self.root_dir = root_dir 372 | self.transform = transform 373 | 374 | 375 | # Return the complete video sequence 376 | def __getitem__(self, vid_idx, T = 10): 377 | """ Arguments: 378 | vid_idx(int): Video Index to be fetched form the video list 379 | T(int): Nr of Images in sequence - default == 10 380 | """ 381 | gt = pd.read_csv(os.path.join(self.root_dir, 382 | self.vot_list.iloc[vid_idx,0], 383 | 'groundtruth.txt'), header = None) 384 | 385 | vid_name_path = os.path.join(self.root_dir, 386 | self.vot_list.iloc[vid_idx,0], 387 | '*.jpg') 388 | 389 | file_names = glob.glob(vid_name_path) 390 | 391 | rand_start = np.random.randint(0, len(file_names)-T+1) 392 | 393 | file_names = file_names[rand_start:(rand_start+T-1)] 394 | 395 | im_seq = imread_collection(file_names) 396 | 397 | # Image collection to np.array 398 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 399 | 400 | # Also convert the gt to np.array 401 | gt = gt.values 402 | gt = gt[rand_start:(rand_start+T-1),:] 403 | 404 | sample = {'Video': images, 'gt': gt} 405 | 406 | # Cant tell yet what this is for 407 | if self.transform: 408 | sample = self.transform(sample) 409 | return sample 410 | 411 | 412 | #%% Test the dataset class 413 | 414 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 415 | root_dir= 'F:/vot2017/') 416 | 417 | # E.g. load a of the vid_list 418 | sample = test.__getitem__(0, T = 20) 419 | 420 | # Simply draw a single video - here the idx refers to the image in the sequence 421 | draw_gt(sample['Video'][10], sample['gt'][10]) 422 | 423 | #%% define loss/reward functions; given the coordinates of all corners 424 | 425 | def loss_v1(pred, gt): 426 | r = - np.mean(np.absolute(pred-gt)) - np.max(np.absolute(pred-gt)) 427 | return r 428 | 429 | 430 | # Calculate the reward given all the for corners x1,y1,x2,y2,x3,y3,x4,y4 431 | def loss_v2(pred, gt): 432 | #reorder the coord in tuples for the polygon 433 | pred_re = [(pred[0],pred[1]),(pred[2],pred[3]), (pred[4],pred[5]),(pred[6],pred[7])] 434 | gt_re = [(gt[0],gt[1]),(gt[2],gt[3]),(gt[4],gt[5]),(gt[6],gt[7])] 435 | 436 | pred_poly = Polygon(pred_re) 437 | gt_poly = Polygon(gt_re) 438 | # Reward == Intersection/total area 439 | r = pred_poly.intersection(gt_poly).area/(pred_poly.area + gt_poly.area - pred_poly.intersection(gt_poly).area) 440 | return r 441 | #%% Test reward functions 442 | 443 | test_1 = np.array((0,0,0,1,1,1,1,0)) 444 | test_2 = np.array((0.5,0,0.5,1,1.5,1,1.5,0)) 445 | 446 | print(loss_v1(test_1, test_2)) 447 | print(loss_v2(test_1, test_2)) 448 | 449 | 450 | #%% Just for information - Find the smallest sized video 451 | 452 | Vids = vid_list.shape[0] 453 | 454 | Size = np.zeros((Vids, 2)) 455 | 456 | for i in range(Vids): 457 | im = Image.open(os.path.join('F:/vot2017/', 458 | vid_list.iloc[i,0], 459 | '00000001.jpg')) 460 | Size[i,0] = im.size[0] 461 | Size[i,1] = im.size[1] 462 | 463 | # Smallest size ist 320 x240 464 | #Histogram of image sizes 465 | plt.hist(Size) 466 | 467 | 468 | 469 | 470 | #%% Transforms - Rescale/Resize 471 | 472 | # Rescaling of an image so that we can feed it with the same size into a network 473 | # Also the groundtruth boxes have to be rescaled accordingly! 474 | # Problem atm rescale the whole Video!! 
- so far only with for loop 475 | 476 | class Rescale(object): 477 | ''' 478 | Rescale the image in a sample to a given size 479 | 480 | Arguments: output_size(tuple): Desired output size 481 | idx(int) : For now idx of the image to be resized 482 | ''' 483 | 484 | # Check if output_size is a tuple, 485 | # maybe also assert if it isnt bigger than the smallest image? 486 | def __init__(self, output_size): 487 | assert isinstance(output_size,(tuple)) 488 | self.output_size = output_size 489 | 490 | def __call__(self, sample): 491 | # Split the sample in video and gt 492 | images, gt = sample['Video'], sample['gt'] 493 | nr = len(images) # Save the amount of images to iterate over 494 | print(nr) 495 | # Save heigth and width of video 496 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 497 | 498 | new_h, new_w = self.output_size 499 | 500 | 501 | # I dont like this part due to the for loop.! 502 | # Initialize the resized image sequence array 503 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 504 | # Iterate over all images and resize them to the given scale. 505 | for i in range(nr): 506 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 507 | 508 | 509 | # Here the groundtruth boxes are rescaled aswell 510 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 511 | 512 | return {'Video': img, 'gt': gt_new} 513 | 514 | 515 | 516 | #%% Test rescaling 517 | 518 | scale = Rescale((220,280)) 519 | 520 | 521 | transformed_sample = scale(sample) 522 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 523 | 524 | # Check 525 | 526 | #%% Transforms - ToTensor 527 | 528 | # Transform the loaded image collection to Tensors 529 | 530 | class ToTensor(object): 531 | '''Convert sample to tensor''' 532 | def __call__(self, sample): 533 | # Load the sample and split it 534 | images, gt = sample['Video'], sample['gt'] 535 | 536 | # swap color axis because 537 | # numpy image: H x W x C 538 | # torch image: C X H X W 539 | # How does this relate to videos/imagesequences? 
540 | images = images.transpose((0,3,1,2)) 541 | 542 | return {'Video': torch.from_numpy(images), 543 | 'gt': torch.from_numpy(gt)} 544 | 545 | #%% Test the ToTensor 546 | 547 | 548 | rescale = Rescale() 549 | rescale(sample) 550 | tens = ToTensor() 551 | tens(sample) 552 | 553 | # Apparently not enoguh RAM on my machine here to fully check that it works :D 554 | >>>>>>> b297fdf775d26e7df2deb91721412a875e22dd77 555 | -------------------------------------------------------------------------------- /dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__init__.py -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/audio_processing_kg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/audio_processing_kg.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/audio_processing_tf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/audio_processing_tf.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_feature_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_feature_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_feature_set_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_feature_set_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/cdiscount_image_dataset.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/cdiscount_image_dataset.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/reader.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/reader.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/reader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/reader.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/sampler.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/sampler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/sampler.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/transform.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/transform.cpython-35.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/transform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/dataset/__pycache__/transform.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/process.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | from utility.file import * 3 | from utility.draw import * 4 | 5 | from dataset.reader import * 6 | 7 | def multi_mask_to_overlay(multi_mask): 8 | overlay = skimage.color.label2rgb(multi_mask, bg_label=0, bg_color=(0, 0, 0))*255 9 | overlay = overlay.astype(np.uint8) 10 | return overlay 11 | 12 | def thresh_to_inner_contour(thresh): 13 | thresh_pad = np.lib.pad(thresh, ((1, 1), (1, 1)), 'reflect') 14 | contour = thresh_pad[1:-1,1:-1] & ( 15 | (thresh_pad[1:-1,1:-1] != thresh_pad[:-2,1:-1]) \ 16 | | (thresh_pad[1:-1,1:-1] != thresh_pad[2:,1:-1]) \ 17 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,:-2]) \ 18 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,2:]) 19 | ) 20 | return contour 21 | 22 | 23 | 24 | #extra processing 25 | def 
run_make_annotation(): 26 | 27 | split = 'train1_ids_all_670' 28 | ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#') 29 | 30 | num_ids = len(ids) 31 | for i in range(num_ids): 32 | id = ids[i] 33 | image_files = glob.glob(DATA_DIR + '/image/' + id + '/images/*.png') 34 | assert(len(image_files)==1) 35 | image_file=image_files[0] 36 | print(id) 37 | 38 | #----clear old ----------------------------- 39 | if 1: 40 | for f in ['one_mask.png','one_countour_mask.png','one_countour_image.png','one_countour.png', 41 | 'overlap.png', 'one_center.png','/masks.npy', '/labels.npy', 42 | '/countour_on_image.png', '/cut_mask.png', '/label.npy', '/mask.png','/overlay.png', 43 | '/multi.npy','/multi.png', 44 | '/instance.npy','/instance.png', 45 | '/multi_instance.npy','/multi_instance.png', 46 | ]: 47 | file = DATA_DIR + '/image/' + id + '/' + f 48 | if os.path.exists(file): 49 | os.remove(file) 50 | #----clear old ----------------------------- 51 | 52 | 53 | #image 54 | image = cv2.imread(image_file,cv2.IMREAD_COLOR) 55 | 56 | H,W,C = image.shape 57 | multi_mask = np.zeros((H,W), np.int32) 58 | mask = np.zeros((H,W), np.uint8) 59 | countour = np.zeros((H,W), np.uint8) 60 | 61 | 62 | 63 | 64 | mask_files = glob.glob(DATA_DIR + '/image/' + id + '/masks/*.png') 65 | mask_files.sort() 66 | count = len(mask_files) 67 | for i in range(count): 68 | mask_file = mask_files[i] 69 | thresh = cv2.imread(mask_file,cv2.IMREAD_GRAYSCALE) 70 | thresh = thresh >128 71 | index = np.where(thresh==True) 72 | 73 | multi_mask[thresh]= i+1 74 | mask = np.logical_or(mask,thresh) 75 | countour = np.logical_or(countour, thresh_to_inner_contour(thresh) ) 76 | 77 | 78 | 79 | ## save and show ------------------------------------------- 80 | countour_on_image = image.copy() 81 | countour_on_image = countour[:,:,np.newaxis]*np.array((0,255,0)) + (1-countour[:,:,np.newaxis])*countour_on_image 82 | 83 | countour_overlay = countour*255 84 | mask_overlay = mask*255 85 | multi_mask_overlay = multi_mask_to_overlay(multi_mask) 86 | 87 | 88 | image_show('image',image) 89 | image_show('mask', mask_overlay) 90 | image_show('multi_mask',multi_mask_overlay) 91 | image_show('countour',countour_overlay) 92 | image_show('countour_on_image',countour_on_image) 93 | 94 | 95 | 96 | np.save(DATA_DIR + '/image/' + id + '/multi_mask.npy', multi_mask) 97 | cv2.imwrite(DATA_DIR + '/image/' + id + '/multi_mask.png',multi_mask_overlay) 98 | cv2.imwrite(DATA_DIR + '/image/' + id + '/mask.png',mask_overlay) 99 | cv2.imwrite(DATA_DIR + '/image/' + id + '/countour.png',countour_overlay) 100 | cv2.imwrite(DATA_DIR + '/image/' + id + '/countour_on_image.png',countour_on_image) 101 | 102 | cv2.waitKey(1) 103 | 104 | 105 | 106 | 107 | 108 | # main ################################################################# 109 | if __name__ == '__main__': 110 | print( '%s: calling main function ... 
' % os.path.basename(__file__)) 111 | 112 | run_make_annotation() 113 | 114 | print( 'sucess!') 115 | -------------------------------------------------------------------------------- /dataset/reader.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | from dataset.transform import * 4 | from dataset.sampler import * 5 | from utility.file import * 6 | from utility.draw import * 7 | 8 | 9 | #data reader ---------------------------------------------------------------- 10 | class ScienceDataset(Dataset): 11 | 12 | def __init__(self, split, transform=None, mode='train'): 13 | super(ScienceDataset, self).__init__() 14 | start = timer() 15 | 16 | self.split = split 17 | self.transform = transform 18 | self.mode = mode 19 | 20 | #read split 21 | ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#') 22 | 23 | #save 24 | self.ids = ids 25 | 26 | #print 27 | print('\ttime = %0.2f min'%((timer() - start) / 60)) 28 | print('\tnum_ids = %d'%(len(self.ids))) 29 | print('') 30 | 31 | 32 | def __getitem__(self, index): 33 | id = self.ids[index] 34 | image_id = id.split('/')[-1] 35 | image = cv2.imread(DATA_DIR + '/image/' + id + '/images/' + image_id +'.png', cv2.IMREAD_COLOR) 36 | 37 | if self.mode in ['train']: 38 | multi_mask = np.load( DATA_DIR + '/image/' + id + '/multi_mask.npy')#.astype(int32) 39 | 40 | if self.transform is not None: 41 | return self.transform(image, multi_mask, index) 42 | else: 43 | return input, multi_mask, index 44 | 45 | if self.mode in ['test']: 46 | if self.transform is not None: 47 | return self.transform(image,index) 48 | else: 49 | return image, index 50 | 51 | def __len__(self): 52 | return len(self.ids) 53 | # draw ---------------------------------------------------------------- 54 | def multi_mask_to_overlay(multi_mask): 55 | overlay = skimage.color.label2rgb(multi_mask, bg_label=0, bg_color=(0, 0, 0))*255 56 | overlay = overlay.astype(np.uint8) 57 | return overlay 58 | 59 | 60 | # modifier ---------------------------------------------------------------- 61 | def thresh_to_inner_contour(thresh): 62 | thresh_pad = np.lib.pad(thresh, ((1, 1), (1, 1)), 'reflect') 63 | contour = thresh_pad[1:-1,1:-1] & ( 64 | (thresh_pad[1:-1,1:-1] != thresh_pad[:-2,1:-1]) \ 65 | | (thresh_pad[1:-1,1:-1] != thresh_pad[2:,1:-1]) \ 66 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,:-2]) \ 67 | | (thresh_pad[1:-1,1:-1] != thresh_pad[1:-1,2:]) 68 | ) 69 | return contour 70 | 71 | 72 | def multi_mask_to_annotation(multi_mask): 73 | H,W = multi_mask.shape[:2] 74 | count = multi_mask.max() 75 | 76 | box = [] 77 | label = [] 78 | instance = [] 79 | for i in range(count): 80 | thresh = (multi_mask==(i+1)) 81 | if thresh.sum()>1: 82 | # filter small, etc 83 | 84 | y,x = np.where(thresh) 85 | y0 = y.min() 86 | y1 = y.max() 87 | x0 = x.min() 88 | x1 = x.max() 89 | w = (x1-x0)+1 90 | h = (y1-y0)+1 91 | 92 | #f = int(0.3*min(w,h)) 93 | border = int(0.3*(w+h)/2) 94 | x0 = x0-border 95 | x1 = x1+border 96 | y0 = y0-border 97 | y1 = y1+border 98 | 99 | #clip 100 | x0 = max(0,x0) 101 | y0 = max(0,y0) 102 | x1 = min(W-1,x1) 103 | y1 = min(H-1,y1) 104 | 105 | # filter small 106 | box.append([x0,y0,x1,y1]) 107 | label.append(1) # now assume one class 108 | instance.append(thresh.astype(np.float32)) 109 | 110 | if box!=[]: 111 | box = np.array(box,np.float32) 112 | label = np.array(label,np.float32) 113 | instance = np.array(instance,np.float32) 114 | else: 115 | box = None 116 | label = None 117 | instance = None 118 | 119 | return 
box, label, instance 120 | 121 | 122 | 123 | 124 | 125 | # check ##################################################################################3 126 | def run_check_dataset_reader(): 127 | 128 | def augment(image, multi_mask, index): 129 | box, label, instance = multi_mask_to_annotation(multi_mask) 130 | 131 | #for display 132 | multi_mask = multi_mask/multi_mask.max() *255 133 | count = len(instance) 134 | 135 | instance_gray = instance.copy() 136 | instance =[] 137 | for i in range(count): 138 | instance.append( 139 | cv2.cvtColor((instance_gray[i]*255).astype(np.uint8),cv2.COLOR_GRAY2BGR) 140 | ) 141 | instance = np.array(instance) 142 | return image, multi_mask, box, label, instance, index 143 | 144 | 145 | dataset = ScienceDataset( 146 | 'train1_ids_gray_only1_500', mode='train', 147 | transform = augment, 148 | ) 149 | sampler = SequentialSampler(dataset) 150 | #sampler = RandomSampler(dataset) 151 | 152 | 153 | for n in iter(sampler): 154 | #for n in range(10): 155 | #n=0 156 | #while 1: 157 | image, multi_mask, box, label, instance, index = dataset[n] 158 | image_show('image',image) 159 | image_show('multi_mask',multi_mask) 160 | count = len(instance) 161 | for i in range(count): 162 | x0,y0,x1,y1 = box[i] 163 | cv2.rectangle(instance[i],(x0,y0),(x1,y1),(0,0,255),1) 164 | 165 | image_show('instance[i]',instance[i]) 166 | print('label[i], box[i] : ', label[i], box[i]) 167 | 168 | cv2.waitKey(1) 169 | 170 | 171 | 172 | 173 | 174 | # main ################################################################# 175 | if __name__ == '__main__': 176 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 177 | 178 | run_check_dataset_reader() 179 | 180 | print( 'sucess!') 181 | -------------------------------------------------------------------------------- /dataset/sampler.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | # common tool for dataset 3 | 4 | #sampler ----------------------------------------------- 5 | 6 | class ConstantSampler(Sampler): 7 | def __init__(self, data, list): 8 | self.num_samples = len(list) 9 | self.list = list 10 | 11 | def __iter__(self): 12 | #print ('\tcalling Sampler:__iter__') 13 | return iter(self.list) 14 | 15 | def __len__(self): 16 | #print ('\tcalling Sampler:__len__') 17 | return self.num_samples 18 | 19 | 20 | # see trorch/utils/data/sampler.py 21 | class FixLengthRandomSampler(Sampler): 22 | def __init__(self, data, length=None): 23 | self.len_data = len(data) 24 | self.length = length or self.len_data 25 | 26 | def __iter__(self): 27 | #print ('\tcalling Sampler:__iter__') 28 | 29 | l=[] 30 | while 1: 31 | ll = list(range(self.len_data)) 32 | random.shuffle(ll) 33 | l = l + ll 34 | if len(l)>=self.length: break 35 | 36 | l= l[:self.length] 37 | return iter(l) 38 | 39 | 40 | def __len__(self): 41 | #print ('\tcalling Sampler:__len__') 42 | return self.length 43 | -------------------------------------------------------------------------------- /dataset/transform.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | 3 | 4 | ## for debug 5 | def dummy_transform(image): 6 | print ('\tdummy_transform') 7 | return image 8 | 9 | # kaggle science bowl-2 : ------------------------------------------------------- 10 | 11 | def resize_to_factor2(image, mask, factor=16): 12 | 13 | H,W = image.shape[:2] 14 | h = (H//factor)*factor 15 | w = (W //factor)*factor 16 | return fix_resize_transform2(image, mask, w, h) 17 | 18 | 19 
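# Example: with factor=16 a 300x500 (HxW) image is resized to 288x496
# (300//16*16 = 288, 500//16*16 = 496), presumably so that the feature maps
# after repeated stride-2 downsampling to 1/16 resolution have integer sizes.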
| 20 | def fix_resize_transform2(image, mask, w, h): 21 | H,W = image.shape[:2] 22 | if (H,W) != (h,w): 23 | image = cv2.resize(image,(w,h)) 24 | 25 | mask = mask.astype(np.float32) 26 | mask = cv2.resize(mask,(w,h)) 27 | mask = mask.astype(np.int32) 28 | return image, mask 29 | 30 | 31 | 32 | 33 | def fix_crop_transform2(image, mask, x,y,w,h): 34 | 35 | H,W = image.shape[:2] 36 | assert(H>=h) 37 | assert(W >=w) 38 | 39 | if (x==-1 & y==-1): 40 | x=(W-w)//2 41 | y=(H-h)//2 42 | 43 | if (x,y,w,h) != (0,0,W,H): 44 | image = image[y:y+h, x:x+w] 45 | mask = mask[y:y+h, x:x+w] 46 | 47 | return image, mask 48 | 49 | 50 | def random_crop_transform2(image, mask, w,h): 51 | H,W = image.shape[:2] 52 | 53 | if H!=h: 54 | y = np.random.choice(H-h) 55 | else: 56 | y=0 57 | 58 | if W!=w: 59 | x = np.random.choice(W-w) 60 | else: 61 | x=0 62 | 63 | return fix_crop_transform2(image, mask, x,y,w,h) 64 | 65 | 66 | def resize_to_factor(image, factor=16): 67 | height,width = image.shape[:2] 68 | h = (height//factor)*factor 69 | w = (width //factor)*factor 70 | return fix_resize_transform(image, w, h) 71 | 72 | 73 | def fix_resize_transform(image, w, h): 74 | height,width = image.shape[:2] 75 | if (height,width) != (h,w): 76 | image = cv2.resize(image,(w,h)) 77 | return image 78 | 79 | # main ################################################################# 80 | if __name__ == '__main__': 81 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 82 | 83 | print('\nsucess!') -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fgabel/Deep-Reinforcement-Learning-for-Visual-Object-Tracking-in-Videos/96c81fa86b5e0f935881fb8d9e6bc85eae74337c/model/__init__.py -------------------------------------------------------------------------------- /model/configuration.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | import configparser 3 | 4 | 5 | class Configuration(object): 6 | 7 | def __init__(self): 8 | super(Configuration, self).__init__() 9 | self.version='configuration version \'mask-rcnn-resnet50-fpn, kaggle\'' 10 | 11 | #net 12 | self.num_classes = 2 #include background class 13 | 14 | #rpn 15 | self.rpn_num_heads = 4 16 | self.rpn_num_bases = 3 17 | self.rpn_base_sizes = [ 8, 16, 32, 64 ] #diameter 18 | self.rpn_base_apsect_ratios = [1, 0.5, 2] 19 | self.rpn_strides = [ 1, 2, 4, 8 ] 20 | 21 | 22 | self.rpn_train_batch_size = 256 # rpn target 256 23 | self.rpn_train_fg_fraction = 0.5 24 | self.rpn_train_bg_thresh_high = 0.3 25 | self.rpn_train_fg_thresh_low = 0.7 26 | 27 | self.rpn_train_nms_threshold = 0.7 # rpn nms 28 | self.rpn_train_nms_min_size = [ 64, 32, 16, 8, 4] # not using: -1 29 | self.rpn_train_nms_pre_top_n = 5000 #12000 30 | self.rpn_train_nms_post_top_n = 1000 #2000 31 | 32 | self.rpn_test_nms_threshold = 0.7 33 | self.rpn_test_nms_min_size = [ 64, 32, 16, 8, 4] # not using: -1 34 | self.rpn_test_nms_pre_top_n = 5000 35 | self.rpn_test_nms_post_top_n = 1000 36 | 37 | #crop 38 | self.pool_size = 16 39 | self.rcnn_select_size_thresholds = [ 40 | [ 0, 8],#'stride 1': 41 | [ 8, 16],#'stride 2': 42 | [ 16, 32],#'stride 4': 43 | [ 32, 1e8],#'stride 8': 44 | ] 45 | 46 | #rcnn 47 | self.rcnn_train_batch_size = 256 # rcnn target 48 | self.rcnn_train_fg_fraction = 0.25 49 | self.rcnn_train_bg_thresh_high = 0.5 50 | self.rcnn_train_bg_thresh_low = 0.0 51 | 
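        # Assumed Faster/Mask R-CNN convention: proposals with IoU >= fg_thresh_low
        # are sampled as foreground, those with IoU in [bg_thresh_low, bg_thresh_high)
        # as background; proposals falling between the two bands are ignored.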
self.rcnn_train_fg_thresh_low = 0.5 52 | # self.rcnn_train_delta_norm_stds = (0.1, 0.1, 0.2, 0.2) #(1, 1, 1, 1) # 53 | 54 | self.rcnn_train_nms_pre_threshold = 0.05 # set low 0.05 to make roc curve. 55 | self.rcnn_train_nms_post_overlap_threshold = 0.8 56 | self.rcnn_train_nms_max_per_image = 256 57 | 58 | self.rcnn_test_nms_pre_threshold = 0.1 59 | self.rcnn_test_nms_post_overlap_threshold = 0.5 60 | self.rcnn_test_nms_max_per_image = 512 61 | 62 | #mask 63 | self.mask_size = 16 64 | self.mask_train_fg_thresh_low = 0.5 65 | 66 | self.mask_test_nms_threshold = 0.9 67 | self.mask_test_threshold = 0.5 68 | 69 | 70 | 71 | 72 | def __repr__(self): 73 | raise NotImplementedError 74 | 75 | def save(self, file): 76 | raise NotImplementedError 77 | 78 | def load(self, file): 79 | raise NotImplementedError 80 | 81 | 82 | 83 | 84 | # main ################################################################# 85 | if __name__ == '__main__': 86 | print( '%s: calling main function ... ' % os.path.basename(__file__)) 87 | 88 | os.makedirs('/root/share/project/ellen-object-detect/results/xxx/',exist_ok=True) 89 | file='/root/share/project/ellen-object-detect/results/xxx/configure' 90 | 91 | cfg = Configuration() 92 | cfg.save(file) 93 | cfg.load(file) 94 | cfg.save('/root/share/project/ellen-object-detect/results/xxx/configure1') 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /model/dlrt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.nn import init 7 | 8 | 9 | 10 | 11 | 12 | class DLRTnet(nn.Module): 13 | """DLRTnet is based on https://arxiv.org/abs/1701.08936 14 | # write something about the architecture 15 | 16 | """ 17 | def __init__(self, *args): 18 | """args: 19 | - a 20 | - b 21 | - c 22 | """ 23 | 24 | @staticmethod 25 | def weight_init(m): 26 | """ call this for weight initialisation 27 | """ 28 | if isinstance(m, nn.Conv2d): 29 | init.xavier_normal(m.weight) 30 | init.constant(m.bias, 0) 31 | 32 | 33 | def reset_params(self): 34 | """ call this for weight reset in each of the modules 35 | # we might not need this 36 | """ 37 | for i, m in enumerate(self.modules()): 38 | self.weight_init(m) 39 | 40 | 41 | def forward(self, x): 42 | 43 | return x -------------------------------------------------------------------------------- /model/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=40000,60000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | 
[convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from common import * 2 | from utils import * 3 | 4 | # ------------------------------------------------------------------------------------ 5 | from model import DLRTnet 6 | from utils import VOT2017Dataset 7 | 8 | def 
train_augment(image, multi_mask, index): 9 | pass 10 | 11 | 12 | 13 | 14 | def valid_augment(image, multi_mask, index): 15 | pass 16 | 17 | 18 | 19 | def train_collate(batch): 20 | 21 | batch_size = len(batch) 22 | #for b in range(batch_size): print (batch[b][0].size()) 23 | inputs = torch.stack([batch[b][0]for b in range(batch_size)], 0) 24 | boxes = [batch[b][1]for b in range(batch_size)] 25 | labels = [batch[b][2]for b in range(batch_size)] 26 | instances = [batch[b][3]for b in range(batch_size)] 27 | indices = [batch[b][4]for b in range(batch_size)] 28 | 29 | return [inputs, boxes, labels, instances, indices] 30 | 31 | ### draw ######################################################### 32 | 33 | def draw(): 34 | 35 | pass 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | ### training ############################################################## 44 | def evaluate(net, test_loader): 45 | 46 | test_num = 0 47 | test_loss = np.zeros(6,np.float32) 48 | test_acc = 0 49 | for i, (inputs, boxes, labels, instances, indices) in enumerate(test_loader, 0): 50 | inputs = Variable(inputs,volatile=True).cuda() 51 | 52 | net(inputs, boxes, labels, instances ) 53 | loss = net.loss(inputs, boxes, labels, instances) 54 | 55 | # acc = dice_loss(masks, labels) #todo 56 | 57 | batch_size = len(indices) 58 | test_acc += 0 #batch_size*acc[0][0] 59 | test_loss += batch_size*np.array(( 60 | loss .cpu().data.numpy()[0], 61 | net.rpn_cls_loss.cpu().data.numpy()[0], 62 | net.rpn_reg_loss.cpu().data.numpy()[0], 63 | net.rcnn_cls_loss.cpu().data.numpy()[0], 64 | net.rcnn_reg_loss.cpu().data.numpy()[0], 65 | net.mask_cls_loss.cpu().data.numpy()[0], 66 | )) 67 | test_num += batch_size 68 | 69 | assert(test_num == len(test_loader.sampler)) 70 | test_acc = test_acc/test_num 71 | test_loss = test_loss/test_num 72 | return test_loss, test_acc 73 | 74 | 75 | 76 | #-------------------------------------------------------------- 77 | def run_train(): 78 | 79 | out_dir = RESULTS_DIR + '/mask-rcnn-gray-011a-debug' 80 | initial_checkpoint = \ 81 | RESULTS_DIR + '/mask-rcnn-gray-011a-debug/checkpoint/00072200_model.pth' 82 | # 83 | 84 | 85 | pretrain_file = None #imagenet pretrain 86 | ## setup ----------------- 87 | os.makedirs(out_dir +'/checkpoint', exist_ok=True) 88 | os.makedirs(out_dir +'/train', exist_ok=True) 89 | os.makedirs(out_dir +'/backup', exist_ok=True) 90 | backup_project_as_zip(PROJECT_PATH, out_dir +'/backup/code.train.%s.zip'%IDENTIFIER) 91 | 92 | log = Logger() 93 | log.open(out_dir+'/log.train.txt',mode='a') 94 | log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64)) 95 | log.write('** some experiment setting **\n') 96 | log.write('\tSEED = %u\n' % SEED) 97 | log.write('\tPROJECT_PATH = %s\n' % PROJECT_PATH) 98 | log.write('\tout_dir = %s\n' % out_dir) 99 | log.write('\n') 100 | 101 | 102 | ## net ---------------------- 103 | log.write('** net setting **\n') 104 | cfg = Configuration() 105 | net = MaskRcnnNet(cfg).cuda() 106 | 107 | if initial_checkpoint is not None: 108 | log.write('\tinitial_checkpoint = %s\n' % initial_checkpoint) 109 | net.load_state_dict(torch.load(initial_checkpoint, map_location=lambda storage, loc: storage)) 110 | 111 | elif pretrain_file is not None: 112 | log.write('\tpretrained_file = %s\n' % pretrain_file) 113 | #load_pretrain_file(net, pretrain_file) 114 | 115 | 116 | log.write('%s\n\n'%(type(net))) 117 | log.write('\n') 118 | 119 | 120 | 121 | ## optimiser ---------------------------------- 122 | iter_accum = 1 123 | batch_size = 4 ##NUM_CUDA_DEVICES*512 #256//iter_accum #512 
#2*288//iter_accum 124 | 125 | num_iters = 1000 *1000 126 | iter_smooth = 20 127 | iter_log = 50 128 | iter_valid = 100 129 | iter_save = [0, num_iters-1]\ 130 | + list(range(0,num_iters,100))#1*1000 131 | 132 | 133 | LR = None #LR = StepLR([ (0, 0.01), (200, 0.001), (300, -1)]) 134 | optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), 135 | lr=0.001/iter_accum, momentum=0.9, weight_decay=0.0001) 136 | 137 | start_iter = 0 138 | start_epoch= 0. 139 | if initial_checkpoint is not None: 140 | checkpoint = torch.load(initial_checkpoint.replace('_model.pth','_optimizer.pth')) 141 | start_iter = checkpoint['iter' ] 142 | start_epoch = checkpoint['epoch'] 143 | #optimizer.load_state_dict(checkpoint['optimizer']) 144 | 145 | 146 | ## dataset ---------------------------------------- 147 | log.write('** dataset setting **\n') 148 | 149 | train_dataset = XXXXXDataset( 150 | #'train1_ids_gray_only1_500', mode='train', 151 | 'valid1_ids_gray_only1_43', mode='train', 152 | transform = train_augment) 153 | train_loader = DataLoader( 154 | train_dataset, 155 | sampler = RandomSampler(train_dataset), 156 | #sampler = ConstantSampler(train_dataset,list(range(16))), 157 | batch_size = batch_size, 158 | drop_last = True, 159 | num_workers = 4, 160 | pin_memory = True, 161 | collate_fn = train_collate) 162 | 163 | 164 | valid_dataset = ScienceDataset( 165 | 'valid1_ids_gray_only1_43', mode='train', 166 | #'debug1_ids_gray_only1_10', mode='train', 167 | transform = valid_augment) 168 | valid_loader = DataLoader( 169 | valid_dataset, 170 | sampler = SequentialSampler(valid_dataset), 171 | batch_size = batch_size, 172 | drop_last = False, 173 | num_workers = 4, 174 | pin_memory = True, 175 | collate_fn = train_collate) 176 | 177 | log.write('\ttrain_dataset.split = %s\n'%(train_dataset.split)) 178 | log.write('\tvalid_dataset.split = %s\n'%(valid_dataset.split)) 179 | log.write('\tlen(train_dataset) = %d\n'%(len(train_dataset))) 180 | log.write('\tlen(valid_dataset) = %d\n'%(len(valid_dataset))) 181 | log.write('\tlen(train_loader) = %d\n'%(len(train_loader))) 182 | log.write('\tlen(valid_loader) = %d\n'%(len(valid_loader))) 183 | log.write('\tbatch_size = %d\n'%(batch_size)) 184 | log.write('\titer_accum = %d\n'%(iter_accum)) 185 | log.write('\tbatch_size*iter_accum = %d\n'%(batch_size*iter_accum)) 186 | log.write('\n') 187 | 188 | #log.write(inspect.getsource(train_augment)+'\n') 189 | #log.write(inspect.getsource(valid_augment)+'\n') 190 | #log.write('\n') 191 | 192 | if 0: # 193 | for inputs, truth_boxes, truth_labels, truth_instances, indices in valid_loader: 194 | 195 | batch_size, C,H,W = inputs.size() 196 | print(batch_size) 197 | 198 | images = inputs.cpu().numpy() 199 | for b in range(batch_size): 200 | image = (images[b].transpose((1,2,0))*255) 201 | image = np.clip(image.astype(np.float32)*3,0,255) 202 | 203 | image1 = image.copy() 204 | 205 | truth_box = truth_boxes[b] 206 | truth_label = truth_labels[b] 207 | truth_instance = truth_instances[b] 208 | if truth_box is not None: 209 | for box,label,instance in zip(truth_box,truth_label,truth_instance): 210 | x0,y0,x1,y1 = box.astype(np.int32) 211 | cv2.rectangle(image,(x0,y0),(x1,y1),(0,0,255),1) 212 | print(label) 213 | 214 | thresh = instance>0.5 215 | contour = thresh_to_inner_contour(thresh) 216 | contour = contour.astype(np.float32) *0.5 217 | 218 | image1 = contour[:,:,np.newaxis]*np.array((0,255,0)) + (1-contour[:,:,np.newaxis])*image1 219 | 220 | 221 | print('') 222 | 223 | 224 | image_show('image',image) 225 | 
image_show('image1',image1) 226 | cv2.waitKey(0) 227 | 228 | 229 | 230 | ## start training here! ############################################## 231 | log.write('** start training here! **\n') 232 | log.write(' optimizer=%s\n'%str(optimizer) ) 233 | log.write(' momentum=%f\n'% optimizer.param_groups[0]['momentum']) 234 | log.write(' LR=%s\n\n'%str(LR) ) 235 | 236 | log.write(' images_per_epoch = %d\n\n'%len(train_dataset)) 237 | log.write(' rate iter epoch num | valid_loss | train_loss | batch_loss | time \n') 238 | log.write('------------------------------------------------------------------------------------------------------------------------------------------------------------------\n') 239 | 240 | train_loss = np.zeros(6,np.float32) 241 | train_acc = 0.0 242 | valid_loss = np.zeros(6,np.float32) 243 | valid_acc = 0.0 244 | batch_loss = np.zeros(6,np.float32) 245 | batch_acc = 0.0 246 | rate = 0 247 | 248 | start = timer() 249 | j = 0 250 | i = 0 251 | 252 | 253 | for i in range(n_epochs): # loop over the dataset multiple times 254 | sum_train_loss = np.zeros(6, np.float32) 255 | sum_train_acc = 0.0 256 | sum = 0 257 | 258 | net.set_mode('train') 259 | optimizer.zero_grad() 260 | for inputs, truth_boxes, truth_labels, truth_instances, indices in train_loader: 261 | batch_size = len(indices) 262 | i = j/iter_accum + start_iter 263 | epoch = (i-start_iter)*batch_size*iter_accum/len(train_dataset) + start_epoch 264 | num_products = epoch*len(train_dataset) 265 | 266 | if i % iter_valid==0: 267 | net.set_mode('valid') 268 | valid_loss, valid_acc = evaluate(net, valid_loader) 269 | net.set_mode('train') 270 | 271 | print('\r',end='',flush=True) 272 | log.write('%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s\n' % (\ 273 | rate, i/1000, epoch, num_products/1000000, 274 | valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc, 275 | train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc, 276 | batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc, 277 | time_to_str((timer() - start)/60))) 278 | time.sleep(0.01) 279 | 280 | #if 1: 281 | if i in iter_save: 282 | torch.save(net.state_dict(),out_dir +'/checkpoint/%08d_model.pth'%(i)) 283 | torch.save({ 284 | 'optimizer': optimizer.state_dict(), 285 | 'iter' : i, 286 | 'epoch' : epoch, 287 | }, out_dir +'/checkpoint/%08d_optimizer.pth'%(i)) 288 | 289 | 290 | 291 | # learning rate schduler ------------- 292 | if LR is not None: 293 | lr = LR.get_rate(i) 294 | if lr<0 : break 295 | adjust_learning_rate(optimizer, lr/iter_accum) 296 | rate = get_learning_rate(optimizer)[0]*iter_accum 297 | 298 | 299 | 300 | 301 | # one iteration update ------------- 302 | inputs = Variable(inputs).cuda() 303 | net( inputs, truth_boxes, truth_labels, truth_instances ) 304 | loss = net.loss( inputs, truth_boxes, truth_labels, truth_instances ) 305 | 306 | 307 | if 1: # 308 | debug_and_draw(net, inputs, truth_boxes, truth_labels, truth_instances, mode='test') 309 | 310 | # masks = (probs>0.5).float() 311 | # acc = dice_loss(masks, labels) 312 | 313 | 314 | # accumulated update 315 | loss.backward() 316 | if j%iter_accum == 0: 317 | #torch.nn.utils.clip_grad_norm(net.parameters(), 1) 318 | optimizer.step() 319 | optimizer.zero_grad() 320 | 321 | 322 | # print statistics ------------ 323 | batch_acc = 0 #acc[0][0] 324 | batch_loss = 
np.array((
325 |                 loss.cpu().data.numpy()[0],
326 |                 net.rpn_cls_loss.cpu().data.numpy()[0],
327 |                 net.rpn_reg_loss.cpu().data.numpy()[0],
328 |                 net.rcnn_cls_loss.cpu().data.numpy()[0],
329 |                 net.rcnn_reg_loss.cpu().data.numpy()[0],
330 |                 net.mask_cls_loss.cpu().data.numpy()[0],
331 |             ))
332 |             sum_train_loss += batch_loss
333 |             sum_train_acc  += batch_acc
334 |             sum += 1
335 |             if i % iter_smooth == 0:
336 |                 train_loss = sum_train_loss / sum
337 |                 train_acc  = sum_train_acc  / sum
338 |                 sum_train_loss = np.zeros(6, np.float32)
339 |                 sum_train_acc  = 0.
340 |                 sum = 0
341 | 
342 | 
343 |             print('\r%0.4f %5.1f k %6.2f %4.1f m | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %0.3f %0.2f %0.2f %0.2f %0.2f %0.2f | %s %d,%d,%s' % (\
344 |                          rate, i/1000, epoch, num_products/1000000,
345 |                          valid_loss[0], valid_loss[1], valid_loss[2], valid_loss[3], valid_loss[4], valid_loss[5], #valid_acc,
346 |                          train_loss[0], train_loss[1], train_loss[2], train_loss[3], train_loss[4], train_loss[5], #train_acc,
347 |                          batch_loss[0], batch_loss[1], batch_loss[2], batch_loss[3], batch_loss[4], batch_loss[5], #batch_acc,
348 |                          time_to_str((timer() - start)/60), i, j, str(inputs.size())), end='', flush=True)
349 |             j = j + 1
350 | 
351 | 
352 | 
353 |         pass  #-- end of one data loader --
354 |     pass  #-- end of all iterations --
355 | 
356 | 
357 |     if 1:  # save the last checkpoint
358 |         torch.save(net.state_dict(), out_dir + '/checkpoint/%d_model.pth' % (i))
359 |         torch.save({
360 |             'optimizer': optimizer.state_dict(),
361 |             'iter' : i,
362 |             'epoch': epoch,
363 |         }, out_dir + '/checkpoint/%d_optimizer.pth' % (i))
364 | 
365 |     log.write('\n')
366 | 
367 | def train(model, criterion, optimizer, n_epochs, T):
368 |     # Train the model on the VOT2017 video sequences
369 |     vot_data = VOT2017_dataset(csv_file='F:/vot2017/list.txt',
370 |                                root_dir='F:/vot2017/')
371 |     train_loader = DataLoader(vot_data)  # iterating over this yields one video at a time
372 |     # (each item is the dict returned by VOT2017_dataset:
373 |     #  {'Video': image stack, 'gt': ground-truth boxes})
374 | 
375 |     for epoch in range(n_epochs):
376 | 
377 |         for i, video in enumerate(train_loader):
378 |             # video is a dict holding the image sequence and its ground truth
379 |             image_stack = video['Video']  # shape (nr. of images, h, w, RGB)
380 |             masks = video['gt']
381 |             current_pos_of_t = 0  # start from the beginning of the video
382 |             for t in range(T):
383 |                 # take a T-image sub-sequence out of the video
384 |                 if current_pos_of_t + T > image_stack.shape[0]:
385 |                     # skip incomplete windows so we don't get an index error (crude handling of the sequence end)
386 |                     continue
387 |                 image_stack_temp = image_stack[current_pos_of_t: current_pos_of_t + T, :, :, :]
388 |                 masks_temp = masks[current_pos_of_t: current_pos_of_t + T, :, :]  # check FORMATS!
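                # A cleaner (hypothetical) way to walk the video in fixed-length windows is to
                # pre-compute the start indices instead of relying on `continue`, e.g.:
                #     for s in range(0, image_stack.shape[0] - T + 1, T):
                #         image_stack_temp = image_stack[s:s + T]
                #         masks_temp      = masks[s:s + T]
                # This visits every complete window and drops at most the last T-1 frames.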
389 |                 current_pos_of_t += T  # next iteration, take the next T-image sequence
390 |                 reward_list = []
391 |                 b_t_list = []
392 |                 for image, mask in zip(image_stack_temp, masks_temp):
393 |                     # iterate over the first dimension of image_stack_temp (the number of images);
394 |                     # image is one particular frame of the sequence, mask its corresponding
395 |                     # ground-truth annotation
396 |                     image = Variable(image.view(-1, sequence_length, input_size))
397 |                     mask = Variable(mask)
398 | 
399 |                     # Forward + Backward + Optimize
400 |                     optimizer.zero_grad()  # reset gradients
401 |                     outputs = DLRTnet(image)  # observation network
402 |                     out, hidden = LSTM(outputs, hidden)  # the LSTM gets the output of DLRTnet and its previous hidden state
403 |                     # GaussianLayer takes the hidden state and samples N locations from the last 4 / 8 numbers
404 |                     l_t = GaussianLayer(hidden)  # l_t contains N sampled locations
405 | 
406 |                     reward = 0
407 |                     for mask_ in l_t:
408 |                         reward += LOSSFUNCTION(mask_, mask)  # loss/reward function 1 from the paper
409 |                     reward_list.append(reward)   # this list contains r1, r2, r3, ..., rT
410 |                     b_t_list.append(reward / N)  # this contains b1, b2, ..., bT
411 | 
412 |                     # Compute gradient
413 |                     image_reward = np.asarray(reward_list)
414 |                     image_base = np.asarray(b_t_list)
415 | 
416 | 
417 |                     loss.backward()  # the crucial step here is to implement the backward pass of GaussianLayer using reward_list
418 |                     optimizer.step()
419 | 
420 |             if (i+1) % 100 == 0:
421 |                 print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
422 |                       % (epoch+1, n_epochs, i+1, len(vot_data)//batch_size, loss.data[0]))
423 | 
424 | 
425 | 
426 | 
427 | # main #################################################################
428 | if __name__ == '__main__':
429 |     print('%s: calling main function ... ' % os.path.basename(__file__))
430 | 
431 |     run_train()
432 | 
433 |     print('\nsuccess!')
434 | 
435 | 
436 | 
437 | # ffmpeg -f image2 -pattern_type glob -r 33 -i "iterations/*.png" -c:v libx264 iterations.mp4
438 | #
439 | 
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | 
4 | from common import *
5 | 
6 | #%% Convert x,y,w,h ground-truth boxes to the four corner coordinates
7 | 
8 | def calc_coords(gt_wh):
9 |     gt_new = np.zeros((gt_wh.shape[0], 8))
10 |     print(gt_new.shape)
11 |     print(gt_wh.shape)
12 |     print(gt_wh[:,0])
13 |     gt_new[:,0] = gt_wh[:,0]              # x1
14 |     gt_new[:,1] = gt_wh[:,1]              # y1
15 |     gt_new[:,2] = gt_wh[:,0] + gt_wh[:,2] # x2
16 |     gt_new[:,3] = gt_wh[:,1]              # y2
17 |     gt_new[:,4] = gt_wh[:,0] + gt_wh[:,2] # x3
18 |     gt_new[:,5] = gt_wh[:,1] + gt_wh[:,3] # y3
19 |     gt_new[:,6] = gt_wh[:,0]              # x4
20 |     gt_new[:,7] = gt_wh[:,1] + gt_wh[:,3] # y4
21 | 
22 |     return gt_new
23 | #%% Dataset class
24 | 
25 | '''Output is the image sequence as an np.array, and the ground truth as well.'''
26 | 
27 | class VOT2017_dataset(Dataset):
28 |     """This is the VOT2017 dataset"""
29 |     def __init__(self, csv_file, root_dir, transform=None):
30 |         """ Arguments:
31 |                 csv_file(string): Path to the list file where all videos are listed
32 |                 root_dir(string): Directory with all the videos
33 |                 transform(callable, optional): Transform to be applied to a sample
34 |         """
35 |         self.vot_list = pd.read_csv(csv_file, header=None)
36 |         self.root_dir = root_dir
37 |         self.transform = transform
38 | 
39 |     # Returns the nr. of videos available
40 |     def __len__(self):
41 |         return len(self.vot_list)
42 | 
43 |     # Return the complete video sequence
44 |     def __getitem__(self, vid_idx):
45
| """ Arguments: 46 | vid_idx(int): Video Index to be fetched form the video list 47 | """ 48 | vid_name_path = os.path.join(self.root_dir, 49 | self.vot_list.iloc[vid_idx,0], 50 | '*.jpg') 51 | 52 | gt = pd.read_csv(os.path.join(self.root_dir, 53 | self.vot_list.iloc[vid_idx,0], 54 | 'groundtruth.txt'), header = None) 55 | 56 | im_seq = imread_collection(vid_name_path) 57 | 58 | # Image collection to np.array 59 | images = concatenate_images(im_seq) # Shape(Nr. of images, h, w, RGB) 60 | 61 | # Also convert the gt to np.array 62 | gt = gt.values 63 | 64 | if gt.shape[1] == 4: 65 | gt = calc_coords(gt) 66 | 67 | 68 | sample = {'Video': images, 'gt': gt} 69 | 70 | # Cant tell yet what this is for 71 | if self.transform: 72 | sample = self.transform(sample) 73 | return sample 74 | 75 | 76 | #%% 77 | 78 | # Draws a rectangle given the coordinates of all four corners in one array 79 | # Where the order is upper-left, upper-right, lower-rigth, lower-left 80 | def draw_gt(im, coords): 81 | """ Arguments: 82 | im = image 83 | coords = coords of all corners as in ground truth files(u.l,u.r,l.r,l.l)(u=upper,l = lower) 84 | """ 85 | plt.imshow(im) 86 | Xs = coords[::2] # Save Xcoords 87 | Ys = coords[1::2] # Save Ycoords 88 | for i in range(4): 89 | if i < 3: 90 | plt.plot([Xs[i],Xs[i+1]],[Ys[i],Ys[i+1]],'k-', color = 'r',lw=1) 91 | elif i == 3: 92 | plt.plot([Xs[i],Xs[0]],[Ys[i],Ys[0]],'k-', color ='r', lw=1) 93 | plt.show() 94 | 95 | 96 | #%% Transforms - Rescale/Resize 97 | 98 | # Rescaling of an image so that we can feed it with the same size into a network 99 | # Also the groundtruth boxes have to be rescaled accordingly! 100 | # Problem atm rescale the whole Video!! - so far only with for loop 101 | 102 | class Rescale(object): 103 | ''' 104 | Rescale the image in a sample to a given size 105 | 106 | Arguments: output_size(tuple): Desired output size 107 | idx(int) : For now idx of the image to be resized 108 | ''' 109 | 110 | # Check if output_size is a tuple, 111 | # maybe also assert if it isnt bigger than the smallest image? 112 | def __init__(self, output_size): 113 | assert isinstance(output_size,(tuple)) 114 | self.output_size = output_size 115 | 116 | def __call__(self, sample): 117 | # Split the sample in video and gt 118 | images, gt = sample['Video'], sample['gt'] 119 | nr = len(images) # Save the amount of images to iterate over 120 | print(nr) 121 | # Save heigth and widthim of video 122 | h, w = images.shape[1:3] # heigth and width are the 2nd and 3rd entry 123 | 124 | new_h, new_w = self.output_size 125 | 126 | 127 | # I dont like this part due to the for loop.! 128 | # Initialize the resized image sequence array 129 | img = np.zeros((nr,new_h,new_w, images.shape[3])) 130 | # Iterate over all images and resize them to the given scale. 
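        # Note: skimage.transform.resize returns a float image rescaled to [0, 1], so the
        # resized stack `img` no longer holds uint8 pixel values. The ground-truth corners
        # are rescaled right after this loop by multiplying every x-coordinate by new_w/w
        # and every y-coordinate by new_h/h.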
131 | for i in range(nr): 132 | img[i,:,:,:] = transform.resize(images[i,:,:,:], (new_h, new_w)) 133 | 134 | 135 | # Here the groundtruth boxes are rescaled aswell 136 | gt_new = gt*np.array((new_w/w, new_h/h, new_w/w,new_h/h, new_w/w,new_h/h, new_w/w, new_h/h)) 137 | 138 | return {'Video': img, 'gt': gt_new} 139 | 140 | #%% Transforms - ToTensor 141 | 142 | # Transform the loaded image collection to Tensors 143 | 144 | class ToTensor(object): 145 | '''Convert sample to tensor''' 146 | def __call__(self, sample): 147 | # Load the sample and split it 148 | images, gt = sample['Video'], sample['gt'] 149 | 150 | # swap color axis because 151 | # numpy image: H x W x C 152 | # torch image: C X H X W 153 | # How does this relate to videos/imagesequences? 154 | images = images.transpose((0,3,1,2)) 155 | 156 | return {'Video': torch.from_numpy(images), 157 | 'gt': torch.from_numpy(gt)} 158 | 159 | #%% Just some test - care the directories 160 | 161 | # Load the list of all videos 162 | vid_list = pd.read_csv('F:/vot2017/list.txt', header = None) 163 | 164 | # Name of a video can be accessed by e.g. vid_list[0][5] 165 | print( vid_list[0][33]) 166 | 167 | # Nr. of videos available 168 | n = vid_list.shape[0] 169 | 170 | test = VOT2017_dataset(csv_file= 'F:/vot2017/list.txt', 171 | root_dir= 'F:/vot2017/') 172 | 173 | # E.g. load the second video of the vid_list 174 | sample = test[2] 175 | 176 | # Simply draw a single video - here the idx refers to the image in the sequence 177 | draw_gt(sample['Video'][0], sample['gt'][0]) 178 | 179 | # Test rescaling 180 | scale = Rescale((220,280)) 181 | transformed_sample = scale(sample) 182 | draw_gt(transformed_sample['Video'][100], transformed_sample['gt'][100]) 183 | 184 | # Test the ToTensor - does not work on Nils Laptop 185 | tens = ToTensor() 186 | tens(sample) 187 | 188 | #%% Just for information - Find the smallest sized video 189 | 190 | Vids = vid_list.shape[0] 191 | 192 | Size = np.zeros((Vids, 2)) 193 | 194 | for i in range(Vids): 195 | im = Image.open(os.path.join('F:/vot2017/', 196 | vid_list.iloc[i,0], 197 | '00000001.jpg')) 198 | Size[i,0] = im.size[0] 199 | Size[i,1] = im.size[1] 200 | 201 | # Smallest size ist 320 x240 202 | 203 | #%% define loss/reward functions; given the coordinates of all corners 204 | 205 | def reward_v1(pred, gt): 206 | r = - np.mean(np.absolute(pred-gt)) - np.max(np.absolute(pred-gt)) 207 | return r 208 | 209 | 210 | # Calculate the reward given all the for corners x1,y1,x2,y2,x3,y3,x4,y4 211 | def reward_v2(pred, gt): 212 | #reorder the coord in tuples for the polygon 213 | pred_re = [(pred[0],pred[1]),(pred[2],pred[3]), (pred[4],pred[5]),(pred[6],pred[7])] 214 | gt_re = [(gt[0],gt[1]),(gt[2],gt[3]),(gt[4],gt[5]),(gt[6],gt[7])] 215 | 216 | pred_poly = Polygon(pred_re) 217 | gt_poly = Polygon(gt_re) 218 | # Reward == Intersection/total area 219 | r = pred_poly.intersection(gt_poly).area/(pred_poly.area + gt_poly.area - pred_poly.intersection(gt_poly).area) 220 | return r 221 | 222 | #%% Test reward functions 223 | 224 | test_1 = np.array((0,0,0,1,1,1,1,0)) 225 | test_2 = np.array((0.5,0,0.5,1,1.5,1,1.5,0)) 226 | 227 | print(reward_v1(test_1, test_2)) 228 | print(reward_v2(test_1, test_2)) 229 | 230 | 231 | #%% 232 | 233 | class GaussianLayer(torch.autograd.Function): 234 | '''Implement custom gaussian layer here''' 235 | 236 | 237 | @staticmethod 238 | def forward(self, inp, sigma, N): 239 | ''' 240 | We receive a Tensor input(take last 8 nr) from that we will draw outputs from a gaussian 241 | distribution, centered 
around our inputs, with sigma * identity as the covariance matrix;
242 |         N is the number of samples we draw
243 |         '''
244 |         cov = np.identity(8) * sigma                               # covariance matrix
245 |         output = np.random.multivariate_normal(inp[-8:], cov, N)   # sample from the Gaussian
246 |         output = torch.from_numpy(output)                          # to tensor
247 |         return output
248 | 
249 |     @staticmethod
250 |     def backward(self, grad):
251 |         '''
252 |         Given grad as a tensor (8 values)
253 |         '''
254 |         self.grad = grad
255 |         return grad
256 | 
257 | 
258 | #%%
259 | 
260 | def calc_grad(pred_l, output, R_t, b_t, N, sigma):
261 |     '''
262 |     We receive the sampled locations pred_l (8 x N x T) and the predicted location output (8 values),
263 |     as well as our baseline b_t (T) and the cumulative rewards R_t (N x T);
264 |     from these we compute the policy-gradient estimate.
265 |     '''
266 |     b_t = b_t.reshape((1, -1))              # row vector; broadcasts against the N x T rewards
267 |     output = output.reshape((8, 1, 1))      # column; broadcasts against pred_l (8 x N x T)
268 |     diff = R_t - b_t                        # N x T
269 |     # score function of the Gaussian policy: d/d mu  log N(x | mu, sigma^2 I) = (x - mu) / sigma^2
270 |     ln_pi = (pred_l - output) / sigma**2    # 8 x N x T
271 |     product = ln_pi * diff                  # 8 x N x T
272 |     grad_G = 1/N * np.sum(product, axis=(1, 2))  # average over the N samples, sum over time -> 8 values
273 |     grad_G = torch.from_numpy(grad_G)       # to torch
274 |     return grad_G
275 | 
276 | 
277 | 
278 | 
279 | #%%
280 | 
281 | # data reader ----------------------------------------------------------------
282 | class XXXXXDataset(Dataset):
283 | 
284 |     def __init__(self, split, transform=None, mode='train'):
285 |         super(XXXXXDataset, self).__init__()
286 |         start = timer()
287 | 
288 |         self.split = split
289 |         self.transform = transform
290 |         self.mode = mode
291 | 
292 |         # read split
293 |         ids = read_list_from_file(DATA_DIR + '/split/' + split, comment='#')
294 | 
295 |         # save
296 |         self.ids = ids
297 | 
298 |         # print
299 |         print('\ttime = %0.2f min'%((timer() - start) / 60))
300 |         print('\tnum_ids = %d'%(len(self.ids)))
301 |         print('')
302 | 
303 | 
304 |     def __getitem__(self, index):
305 |         id = self.ids[index]
306 |         image_id = id.split('/')[-1]
307 |         image = cv2.imread(DATA_DIR + '/image/' + id + '/images/' + image_id + '.png', cv2.IMREAD_COLOR)
308 | 
309 |         if self.mode in ['train']:
310 |             multi_mask = np.load(DATA_DIR + '/image/' + id + '/multi_mask.npy')  #.astype(int32)
311 | 
312 |             if self.transform is not None:
313 |                 return self.transform(image, multi_mask, index)
314 |             else:
315 |                 return image, multi_mask, index
316 | 
317 |         if self.mode in ['test']:
318 |             if self.transform is not None:
319 |                 return self.transform(image, index)
320 |             else:
321 |                 return image, index
322 | 
323 |     def __len__(self):
324 |         return len(self.ids)
--------------------------------------------------------------------------------
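The utilities above stop short of wiring the Gaussian sampling, the IoU reward and the baseline into a single gradient step. Below is a minimal numpy-only sketch of that REINFORCE-style estimator, under the same assumptions as calc_grad (Gaussian policy with fixed sigma, mean sample reward as baseline). iou_reward mirrors reward_v2 above, and reinforce_grad_sketch is an illustrative name, not a function from this repository.

import numpy as np
from shapely.geometry import Polygon

def iou_reward(pred, gt):
    # same idea as reward_v2 above: intersection over union of the two quadrilaterals
    p = Polygon([(pred[0], pred[1]), (pred[2], pred[3]), (pred[4], pred[5]), (pred[6], pred[7])])
    g = Polygon([(gt[0], gt[1]), (gt[2], gt[3]), (gt[4], gt[5]), (gt[6], gt[7])])
    inter = p.intersection(g).area
    return inter / (p.area + g.area - inter)

def reinforce_grad_sketch(pred_loc, gt, sigma=0.05, N=16, seed=0):
    """Score-function (REINFORCE) estimate of d E[reward] / d pred_loc,
    using the mean sample reward as the baseline."""
    rng = np.random.RandomState(seed)
    samples = rng.multivariate_normal(pred_loc, np.identity(8) * sigma**2, size=N)  # N x 8
    rewards = np.array([iou_reward(s, gt) for s in samples])                        # N
    baseline = rewards.mean()
    # d/d mu of log N(x | mu, sigma^2 I) = (x - mu) / sigma^2
    score = (samples - pred_loc) / sigma**2                                         # N x 8
    return ((rewards - baseline)[:, None] * score).mean(axis=0)                     # 8 values

# Example with the two unit boxes from the reward tests above:
# grad = reinforce_grad_sketch(np.array((0, 0, 0, 1, 1, 1, 1, 0), dtype=float),
#                              np.array((0.5, 0, 0.5, 1, 1.5, 1, 1.5, 0), dtype=float))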