├── Incremental-Learning-Main.ipynb ├── README.md ├── database_stats.pkl └── funs.py /README.md: -------------------------------------------------------------------------------- 1 | # incremental_learning 2 | Initial code for the paper [Incremental Learning through Deep Adaptation](https://arxiv.org/abs/1705.04228), by Amir Rosenfeld, John Tsotsos 3 | 4 | 5 | This work is now superseded by "Efficient parametrization of multi-domain deep neural networks" by S. Rebuffi, H. Bilen and A. Vedaldi. I recommend using that repo instead. It can be found [here](https://github.com/srebuffi/residual_adapters) 6 | 7 | Note that this is a very early commit. It cannot work straight out of the box due to absolute paths, etc. 8 | The main function of interest here is "makeItControlled" in the .ipynb file, which adds controller modules to a new model so that it re-uses the modules of an old model. 9 | Everything else is boilerplate: training, testing, and experiments. 10 | Please report any issues, comments, or suggestions as they come up. 11 | 12 | -------------------------------------------------------------------------------- /database_stats.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosenfeldamir/incremental_learning/0d47970f919c3ae2eb0494170cddb650beb8e6e2/database_stats.pkl -------------------------------------------------------------------------------- /funs.py: -------------------------------------------------------------------------------- 1 | 2 | import progressbar 3 | import sys 4 | import argparse 5 | import torch 6 | from copy import deepcopy 7 | import numpy as np 8 | import torch.nn as nn 9 | from torch.nn import init 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | import torchvision 13 | from torchvision import datasets, transforms 14 | from torch.autograd import Variable 15 | import torchvision.models as models 16 | from torch.utils.data import Dataset, TensorDataset, DataLoader 17 | import torch.utils.data as data 18 | 19 | from tensorboard_logger import configure, log_value, Logger 20 | 21 | import itertools 22 | from itertools import izip 23 | from matplotlib import pyplot as plt 24 | import os,os.path 25 | import glob 26 | from time import time 27 | import shutil 28 | import hickle as pickle 29 | from PIL import Image 30 | import collections 31 | import math 32 | 33 | batch_size = 128 34 | base_lr = .1 35 | lr_drop_freq=10 36 | criterion = nn.CrossEntropyLoss() 37 | num_workers = 0 38 | from os.path import expanduser 39 | homeDir = expanduser('~') 40 | sys.path.append(os.path.join(homeDir,'YellowFin_Pytorch/tuner_utils/')) # yellowfin :-) 41 | from yellowfin import YFOptimizer 42 | 43 | def matVar(size=(1,3,64,64),cuda=False): 44 | v = Variable(torch.randn(size)) 45 | if cuda: 46 | v = v.cuda() 47 | return v 48 | 49 | 56 | def adjust_learning_rate(optimizer, epoch, base_lr, lr_drop_freq = 100, gamma=0.1): 57 | """Sets the learning rate to the initial LR decayed by gamma every lr_drop_freq epochs""" 58 | if (epoch + 1) % lr_drop_freq == 0: # Note this works only for continuous mode (not stopping+loading) 59 | if type(optimizer) is YFOptimizer: 60 | optimizer.set_lr_factor(optimizer.get_lr_factor() * gamma) 61 | else: 62 | for param_group
in optimizer.param_groups: 63 | param_group['lr'] = param_group['lr'] * gamma 64 | 65 | def train(model,epoch,optimizer,maxIters=np.inf,targetTranslator=None,train_loader=None, criterion=None,criterion2 = None,disableBatchNorm=False,cuda=True, balancing_factor = 0.0, logger=None): 66 | T0 = time() 67 | if not disableBatchNorm: 68 | model.train() 69 | else: 70 | model.eval() 71 | 72 | nBatches = 0 73 | running_loss = 0.0 74 | running_loss2 = 0.0 75 | losses = [] 76 | nSamples=0 77 | maxIters = min(maxIters,len(train_loader)) 78 | startTime = time() 79 | for batch_idx, (data, target) in enumerate(train_loader): 80 | 81 | target = target.long().squeeze() 82 | if targetTranslator is not None: 83 | target2 = targetTranslator(target.clone()) 84 | if cuda: 85 | target2 = Variable(target2.cuda()) 86 | if cuda: 87 | data, target = data.cuda(), target.cuda() 88 | data, target = Variable(data), Variable(target) 89 | 90 | optimizer.zero_grad() 91 | 92 | output = model(data) 93 | if type(output) is tuple: 94 | gates = output[1] 95 | output = output[0] 96 | 97 | 98 | 99 | loss = criterion(output, target) 100 | if criterion2 is not None and balancing_factor > 0: 101 | loss2 = criterion2(gates) 102 | loss += balancing_factor * loss2 103 | else: 104 | loss2 = 0 105 | 106 | 107 | 108 | loss.backward() 109 | optimizer.step() 110 | losses.append(loss.data[0]) 111 | running_loss += loss.data[0] 112 | if criterion2 is not None and balancing_factor > 0: 113 | running_loss2 += loss2.data[0]/balancing_factor 114 | else: 115 | running_loss2 = -1 116 | 117 | 118 | 119 | 120 | nBatches += 1 121 | nSamples += len(data) 122 | if batch_idx % 5 == 0 and time()-T0 > .1: 123 | T0 = time() 124 | elapsedTime = time()-startTime 125 | S = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tAvg Loss: {:.6f}\tAvg Loss 2: {:.6f} ({:.2f} imgs/sec)'.format(epoch, batch_idx * len(data), 126 | len(train_loader.dataset), 127 | 100. 
* batch_idx / len(train_loader), 128 | running_loss/nBatches,running_loss2/(nBatches), 129 | nSamples/elapsedTime) 130 | if logger is not None: 131 | logger.log_value('training loss',loss.data[0],batch_idx + epoch * maxIters) 132 | print '\r{}'.format(S), 133 | if batch_idx >= maxIters: 134 | break 135 | 136 | if logger is not None: 137 | if hasattr(optimizer,'param_groups'): 138 | 139 | for param_group in optimizer.param_groups: 140 | cur_lr = param_group['lr'] 141 | logger.log_value('learning rate',cur_lr,epoch) 142 | 143 | if hasattr(optimizer,'get_lr_factor'): 144 | logger.log_value('learning rate',optimizer.get_lr_factor(),epoch) 145 | 146 | return losses 147 | 148 | def test(model,epoch,targetTranslator=None,test_loader=None,prev_acc=0,alpha=None,criterion=None, maxIters=np.inf,cuda=True, logger=None): 149 | assert (criterion is not None) 151 | model.eval() 152 | test_loss = 0 153 | correct = 0 154 | nSamples = 0 155 | maxIters = min(maxIters,len(test_loader)) 156 | for batch_idx, (data, target) in enumerate(test_loader): 157 | target = target.long().squeeze() 158 | if targetTranslator is not None: 159 | target2 = targetTranslator(target.clone()) 160 | target2 = Variable(target2.cuda()) 162 | if cuda: 163 | data, target = data.cuda(), target.cuda() 164 | 165 | data, target = Variable(data), Variable(target) 166 | if alpha is not None: 167 | output = model(data,alpha) 168 | else: 170 | output = model(data) 171 | if type(output) is tuple: 172 | gates = output[1] 173 | output = output[0] 174 | cur_test_loss = criterion(output, target).data[0] 175 | test_loss += cur_test_loss 176 | 177 | pred = output.data.max(1)[1] # get the index of the max log-probability 178 | correct += pred.eq(target.data).cpu().sum() 179 | nSamples+=len(data) 180 | if batch_idx >= maxIters: 181 | break 182 | 183 | test_loss /= len(test_loader) # the loss function already averages over batch size 184 | if logger is not None: 185 | logger.log_value('test loss',test_loss,epoch) 186 | cur_acc = 100. * correct / nSamples 188 | P = '({}) :Test set: Avg. loss: {:.4f}, Acc: {}/{} ({:.1f}%)'.format(epoch, 189 | test_loss, correct, nSamples, cur_acc) 190 | if logger is not None: 191 | logger.log_value('test accuracy',cur_acc,epoch) 192 | 193 | print '\r{}'.format(P), 194 | return 100. * correct / nSamples 195 | 
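For orientation, here is a minimal sketch of how `train` and `test` are meant to be wired together. This driver is not code from the repo; `model`, `train_loader`, and `test_loader` are placeholders for any CUDA-resident classifier and torchvision-style loaders:

```python
# assumes: model is a classifier, loaders yield (image batch, integer labels)
model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD([p for p in model.parameters() if p.requires_grad],
                      lr=base_lr, momentum=0.9)
for epoch in range(10):
    adjust_learning_rate(optimizer, epoch, base_lr, lr_drop_freq=10, gamma=0.1)
    train(model, epoch, optimizer, train_loader=train_loader, criterion=criterion)
    acc = test(model, epoch, test_loader=test_loader, criterion=criterion)
```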
196 | def checkModelConsistency(newModel,oldModel): 197 | for a_fine,a_orig in zip(newModel, oldModel): 198 | tt = type(a_orig) 199 | if tt is nn.Conv2d: 200 | print '*', 201 | w_fine = a_fine.w.transpose(0,1).contiguous().view(a_fine.s) 202 | w_orig = a_orig.weight 203 | assert (w_fine-w_orig).data.abs().sum() == 0 204 | #checkModelConsistency(f_fine_m,model_10.features.children()) 205 | 206 | 207 | def save_checkpoint(state, is_best, epoch, modelDir): 208 | """Saves a checkpoint to disk""" 209 | checkPointPath = '{}/{}'.format(modelDir,str(epoch).zfill(4)) 210 | torch.save(state, checkPointPath) 211 | if is_best: 212 | shutil.copyfile(checkPointPath, '{}/{}'.format(modelDir,'best')) 213 | 214 | def defaultCallBacks(): 215 | return {'trainEpochStart':[],'trainEpochEnd':[],'testEpochStart':[],'testEpochEnd':[]}
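The `trainAndTest` driver below accepts a `callbacks` dict in the shape returned by `defaultCallBacks`; each hook is invoked as `callback(model, optimizer, epoch)`. A small sketch of registering one (the `log_epoch` hook is hypothetical, not part of the repo):

```python
callbacks = defaultCallBacks()
def log_epoch(model, optimizer, epoch):
    # runs after every training epoch
    print 'finished epoch', epoch
callbacks['trainEpochEnd'].append(log_epoch)
# then pass callbacks=callbacks to trainAndTest below
```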
216 | 217 | def trainAndTest(model,optimizer=None,modelDir=None,epochs=5,targetTranslator=None,model_save_freq=20, 218 | train_loader=None,test_loader=None,stopIfPerfect=True, criterion=nn.CrossEntropyLoss(), 219 | criterion2 = None, adjust_learning_rate=adjust_learning_rate, maxIters=np.inf,base_lr=base_lr, 220 | lr_drop_freq=lr_drop_freq,disableBatchNorm=False,cuda=True,balancing_factor=0.0,logger=None, 221 | callbacks=defaultCallBacks(),gamma=.1): 222 | 223 | last_epoch = 0 224 | corrects = [] 225 | best_acc = 0 226 | needToSave = modelDir is not None and model_save_freq > 0 227 | all_accuracies = [] 228 | if needToSave: 229 | 230 | if not os.path.isdir(modelDir): 231 | os.makedirs(modelDir) 232 | 233 | 234 | g = list(sorted(glob.glob(os.path.join(modelDir,'*')))) 235 | g = [g_ for g_ in g if not 'best' in g_] 236 | 237 | g_new = [] 238 | for gg in g: # fixing file names to be zero padded 239 | g1,g2 = os.path.split(gg) 240 | newName = '/'.join([g1,g2.zfill(4)]) 241 | if gg != newName: 242 | print 'moving' 243 | print gg,'to' 244 | print newName 245 | shutil.move(gg,newName) 246 | g_new.append(newName) 247 | g = list(sorted(g_new)) 248 | 249 | if len(g) > 0: 250 | lastCheckpoint = g[-1] 251 | # load the last checkpoint 252 | print 'loading from', lastCheckpoint 253 | 254 | checkpoint = torch.load(lastCheckpoint) 255 | last_epoch = checkpoint['epoch'] 256 | best_acc = checkpoint['best_acc'] 257 | all_accuracies = checkpoint.get('all_accuracies',all_accuracies) 258 | model.load_state_dict(checkpoint['state_dict']) 259 | print("=> loaded checkpoint '{}'".format(lastCheckpoint)) 260 | 261 | 262 | all_losses = [] 263 | hasCallBacks = callbacks is not None 264 | 265 | for epoch in range(last_epoch, epochs): 266 | 267 | if hasCallBacks: 268 | for callback in callbacks['trainEpochStart']: 269 | callback(model,optimizer,epoch) 270 | 271 | if adjust_learning_rate is not None: 272 | adjust_learning_rate(optimizer,epoch,base_lr,lr_drop_freq,gamma) 273 | losses = train(model=model,epoch=epoch,optimizer=optimizer,targetTranslator=targetTranslator, 274 | train_loader=train_loader,criterion=criterion,criterion2 = criterion2, maxIters=maxIters,disableBatchNorm=disableBatchNorm,cuda=cuda, 275 | balancing_factor=balancing_factor,logger=logger) 276 | 277 | if hasCallBacks: 278 | for callback in callbacks['trainEpochEnd']: 279 | callback(model,optimizer,epoch) 280 | 281 | all_losses.extend(losses) 282 | print 283 | if hasCallBacks: 284 | for callback in callbacks['testEpochStart']: 285 | callback(model,optimizer,epoch) 286 | 287 | cur_acc = test(model,epoch,targetTranslator=targetTranslator,test_loader=test_loader, 288 | prev_acc=best_acc,criterion=criterion, maxIters=maxIters,cuda=cuda,logger=logger) 289 | if hasCallBacks: 290 | for callback in callbacks['testEpochEnd']: 291 | callback(model,optimizer,epoch) 292 | all_accuracies.append(cur_acc) 293 | corrects.append(cur_acc) 294 | print 295 | 296 | 297 | if needToSave and (epoch % model_save_freq == 0 or epoch == epochs-1): 298 | print 'saving model...', 300 | if cur_acc > best_acc: 301 | best_acc = cur_acc 302 | is_best = True 303 | else: 304 | is_best = False 305 | save_checkpoint({ 306 | 'epoch': epoch + 1, 307 | 'all_losses':all_losses, 308 | 'all_accuracies':all_accuracies, 309 | 'last_epoch_losses':losses, 310 | 'state_dict': model.state_dict(), 311 | 'best_acc': best_acc, 312 | 'cur_acc': cur_acc 313 | }, is_best, epoch, modelDir) 314 | #if cur_acc>=99.5: 315 | # break 316 | 317 | return corrects 318 | 319 | 320 | def imshow(img): 321 | #img = img / 2 + 0.5 # unnormalize 322 | npimg = img.numpy() 323 | npimg = npimg-npimg.min() 324 | npimg = npimg/npimg.max() 325 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 326 | 327 | def init_params(net): 328 | '''Init layer parameters.''' 329 | for m in net.modules(): 330 | if isinstance(m, nn.Conv2d): 331 | init.kaiming_normal(m.weight, mode='fan_out') 332 | if m.bias is not None: 333 | init.constant(m.bias, 0) 334 | elif isinstance(m, nn.BatchNorm2d) and m.affine: 335 | init.constant(m.weight, 1) 336 | init.constant(m.bias, 0) 337 | #elif isinstance(m, nn.Linear): 338 | # init.normal(m.weight, std=1e-3) 339 | # if m.bias: 340 | # init.constant(m.bias, 0) 341 | 342 | 343 | 344 | class VGG_backcomp(nn.Module): 345 | def __init__(self, features, fc_size=512,num_classes=1000,dropout=True,fullyconv=False): 346 | super(VGG_backcomp, self).__init__() 347 | self.features = features 348 | self.fullyconv = fullyconv 349 | if not fullyconv: 350 | 351 | if dropout: 352 | 353 | self.classifier = nn.Sequential( 354 | nn.Linear(fc_size, 512), 355 | nn.ReLU(True), 356 | nn.Dropout(), 357 | nn.Linear(512, num_classes), 358 | ) 359 | else: 360 | self.classifier = nn.Sequential( 361 | nn.Linear(fc_size, 512), 362 | nn.ReLU(True), 363 | nn.Linear(512, num_classes), 364 | ) 365 | else: 366 | self.classifier = nn.Sequential(nn.Linear(512,num_classes)) # keep just the final classification layer 367 | 368 | def forward(self, x): 369 | 370 | x = self.features(x) 372 | x = x.view(x.size(0), -1) 373 | x = self.classifier(x) 374 | return x 375 | class VGG(nn.Module): 376 | def __init__(self, features, fc_size=512,num_classes=1000,dropout=True,fullyconv=False): 377 | super(VGG, self).__init__() 378 | self.features = features 379 | self.fullyconv = fullyconv 380 | if not fullyconv: 381 | 382 | if dropout: 383 | 384 | self.classifier = nn.Sequential( 385 | nn.Linear(fc_size, 512), 386 | nn.ReLU(True), 387 | nn.Dropout(), 388 | nn.Linear(512, num_classes), 389 | ) 390 | else: 391 | self.classifier = nn.Sequential( 392 | nn.Linear(fc_size, 512), 393 | nn.ReLU(True), 394 | nn.Linear(512, num_classes), 395 | ) 396 | else: 397 | self.classifier = nn.Sequential(nn.Conv2d(512,num_classes,2,2)) # keep just the final classification layer
398 | init_params(self) 399 | def forward(self, x): 400 | 401 | x = self.features(x) 403 | if not self.fullyconv: 404 | x = x.view(x.size(0), -1) 405 | x = self.classifier(x) 406 | if self.fullyconv: 407 | x = x.view(x.size(0), -1) 408 | return x,None 409 | 412 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] 413 | 414 | class AlphaNet(nn.Module): 415 | def __init__(self, features, classifier, otherClassifier=None): 416 | super(AlphaNet, self).__init__() 417 | if type(features) is list: 418 | self.features = nn.Sequential(*features) 419 | else: 420 | self.features = features 421 | self.classifier = classifier 422 | self.otherClassifier = otherClassifier 423 | self.outputSize = None 424 | def getControlParams(self): 425 | # return parameters of all layers, except convolutional. 426 | params = [] 427 | for q in self.features: 428 | q_type = type(q) 429 | if q_type is nn.Conv2d or q_type is nn.BatchNorm2d: 430 | continue 431 | if q_type is controlledConv: # probably nothing else 432 | params.extend(list(q.parameters())) # includes the controller matrix and its bias parameter 433 | 434 | 435 | params.extend(list(self.classifier.parameters())) 436 | return params 437 | 438 | 439 | def extendToSize(self,x): 440 | S = self.outputSize 441 | if S is not None: 442 | s = x.size() 443 | assert s[1] <= S, 'output larger than required output size' 444 | if s[1] < S: 445 | XX = Variable(torch.zeros(s[0],S).cuda()) 446 | XX[:,:s[1]] = x 447 | x = XX 448 | return x 449 | 450 | 451 | def forward(self, x, alpha=None): 452 | for f in self.features: 453 | if type(f) is controlledConv: 454 | x = f(x,alpha) 455 | else: 456 | x = f(x) 457 | 458 | x = x.view(x.size(0), -1) 459 | 460 | if alpha is None: 461 | x = self.classifier(x) 462 | else: 463 | assert self.otherClassifier is not None, 'cannot use alpha without other classifier' 464 | 465 | x1 = self.classifier(x) 466 | x2 = self.otherClassifier(x) 467 | 468 | # pad the smaller output to the larger of the two classifiers' output sizes 469 | if self.outputSize is None: 470 | print 'automatically determining maximal output size...' 
471 | self.outputSize = max(x1.size()[1],x2.size()[1]) 472 | #print 'sizes before:',x1.size(),x2.size() 473 | x1 = self.extendToSize(x1) 474 | x2 = self.extendToSize(x2) 475 | #print 'sizes after:',x1.size(),x2.size() 476 | 477 | myAlpha = alpha.expand_as(x1) 478 | x = myAlpha * x1 + (1-myAlpha) * x2 479 | return x 480 | 481 | def replaceLastLayer(model,num_outputs): 482 | mod = list(model.children()) 483 | mod.pop() 484 | mod.append(torch.nn.Linear(512, num_outputs)) 485 | model = torch.nn.Sequential(*mod) 486 | return model 487 | def freezeBatchNormLayers(model): 488 | if hasattr(model,'features'): 489 | for p in model.features.children(): 490 | 491 | if type(p) is nn.BatchNorm2d: 492 | print '.', 493 | for q in p.parameters(): 494 | 495 | q.requires_grad = False 496 | for p in model.classifier.children(): 497 | if type(p) is nn.BatchNorm2d: 498 | print '.', 499 | for q in p.parameters(): 500 | 501 | q.requires_grad = False 502 | else: 503 | for p in model.children(): 504 | if type(p) is nn.BatchNorm2d: 505 | print '.', 506 | for q in p.parameters(): 507 | 508 | q.requires_grad = False 509 | def ton(V): 510 | if type(V) is not Variable: 511 | return V.cpu().numpy() 512 | else: 513 | return V.data.cpu().numpy() 514 | def showmat(M): 515 | if type(M) is not np.ndarray: 516 | M = ton(M) 517 | plt.matshow(M) 518 | 519 | def countModelParameters(model,need_require_grad=True): 520 | return sum([p.data.nelement() for p in model.parameters() if p.requires_grad or not need_require_grad]) 521 | 522 | normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]], 523 | std=[x/255.0 for x in [63.0, 62.1, 66.7]]) 524 | cuda=True 525 | kwargs = {'num_workers': num_workers, 'pin_memory': False} 526 | 527 | 528 | def quickTest(model,test_loader,alpha=None,maxSamples=100000): 529 | 530 | #criterion = nn.CrossEntropyLoss() 531 | model.eval() 532 | test_loss = 0 533 | correct = 0 534 | nPoints = 0 535 | for idx, (data, target) in enumerate(test_loader): 536 | target = target.long().squeeze() 537 | nPoints += len(target) 538 | data, target = Variable(data.cuda()), Variable(target.cuda()) 539 | if alpha is not None: 540 | output = model(data,alpha) 541 | else: 542 | output = model(data) 543 | pred = output.data.max(1)[1] # get the index of the max log-probability 544 | correct += pred.eq(target.data).cpu().sum() 545 | if nPoints >= maxSamples: 546 | break 547 | cur_acc = 100. * correct / nPoints 548 | #if prev_acc < cur_acc: 549 | P = 'Test set: Acc: {}/{} ({:.1f}%)'.format(correct, nPoints, cur_acc) 550 | print '\r{}'.format(P), 551 | return cur_acc 552 | 553 | 554 | 555 | 556 | 557 | # Initialize from scratch. 
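The next section implements the controller modules from the paper (the README's `makeItControlled` assembles networks out of them). The idea: keep the old layer's filters frozen, and learn only a small matrix that linearly recombines them into filters for the new task. A standalone, shape-level sketch of that reparametrization, independent of the `controlledConv` class defined below:

```python
# A conv with 128 output filters over 64 input channels has 64*128*9 = 73,728
# weights; the controller adds only 128*128 = 16,384 new parameters.
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)        # stands in for an old, frozen layer
s = conv.weight.size()                                     # (128, 64, 3, 3)
w = conv.weight.view(s[0], -1).t()                         # flattened filters, (576, 128), kept frozen
controller = nn.Linear(s[0], s[0], bias=False)             # the only new parameters
new_w = controller(w).t().contiguous().view(s)             # recombined filters
out = F.conv2d(matVar((1, 64, 32, 32)), new_w, padding=1)  # use them as conv weights
```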
558 | 559 | from numpy.linalg import lstsq 560 | 561 | class conv2d_bn(nn.Module): 562 | def __init__(self, conv, bn): 563 | super(conv2d_bn, self).__init__() 564 | self.conv = conv 565 | self.bn = bn 566 | def forward(self,x): 567 | return self.bn(self.conv(x)) 568 | 569 | 570 | class controlledConv(nn.Module): 571 | def __init__(self, conv, X = None,bias = None, sparse = False, diagonal=False): 572 | super(controlledConv, self).__init__() 573 | self.padding = conv.padding 574 | self.stride = conv.stride 575 | self.dilation = conv.dilation 576 | self.conv = conv 577 | # Copy the weights as a constant from the original convolution -- 578 | # just to make sure it doesn't change 579 | s = conv.weight.size() 581 | self.s = list(s) 582 | w = Variable(torch.Tensor(s).copy_(conv.weight.data)) 583 | w = w.view(s[0],-1).transpose(0,1) 585 | self.w = w.detach().cuda() 586 | 587 | self.my_bn = None 589 | R = s[0] 590 | if X is None: 591 | X = torch.eye(R) # initialize the controller to the identity, i.e., start by reproducing the original filters 592 | L = nn.Linear(X.size()[1],X.size()[0],bias=False) 593 | L.weight.data = X 597 | self.L = L 598 | self.s[0] = L.weight.size()[0] 599 | hasBias = bias is not None 600 | if hasBias: 601 | s_bias = self.s[0] 602 | self.conv_bias = Variable(torch.Tensor(conv.bias.data.size()).copy_(conv.bias.data)) 603 | self.conv_bias = self.conv_bias.detach().cuda() 605 | # the learnable bias starts as a copy of (the first s_bias entries of) the original bias 606 | self.bias = nn.Parameter(conv.bias.data[:s_bias].clone()) 607 | else: 608 | self.bias = None 611 | 612 | for p in conv.parameters(): 613 | p.requires_grad = False 614 | 615 | def setConvLearnable(self,T): 616 | for p in self.conv.parameters(): 617 | p.requires_grad = T 618 | 619 | def set_bn(self,bn): 620 | my_bn = nn.BatchNorm2d(bn.num_features,affine=bn.affine) 621 | bn.eval() 622 | my_bn.load_state_dict(bn.state_dict()) 623 | my_bn.train() 624 | self.my_bn = my_bn 625 | self.old_bn = bn 626 | 627 | def forward(self,x, alpha = None): 628 | # recompute the effective weights by recombining the frozen original weights 630 | s = self.s 631 | w = self.w 632 | if alpha is not None: 634 | alpha1 = alpha.expand_as(w) 635 | newWeights = alpha1 * self.L(w) + (1-alpha1) * w 636 | if self.bias is not None: 637 | alpha2 = alpha.squeeze().expand_as(self.bias) 638 | bias = alpha2 * self.bias + (1-alpha2) * self.conv_bias 639 | else: 640 | bias = None 641 | else: 642 | newWeights = self.L(w) 643 | bias = self.bias 644 | newWeights = newWeights.transpose(0,1).contiguous() 645 | newWeights = newWeights.view(s) 646 | 649 | x = F.conv2d(x,newWeights,bias,stride=self.stride,padding=self.padding,dilation=self.dilation) 650 | 651 | # apply the batch normalization... 
652 | if self.my_bn is not None: 653 | x_bn = self.my_bn(x) 654 | if alpha is not None: 655 | alpha3 = alpha.expand_as(x) 656 | x = alpha3 * x_bn + (1-alpha3) * self.old_bn(x) 657 | else: 658 | x = x_bn 659 | return x 660 | 661 | def checkApproximation(net1,net2): 662 | a_orig = list(net1.features.children()) 663 | a_fine = list(net2.features.children()) 664 | abs_errors = [] 665 | 666 | bar = progressbar.ProgressBar(max_value=len(a_fine)-1) 667 | for i,(orig,fine) in bar(enumerate(izip(a_orig,a_fine))): 668 | if type(orig) is nn.BatchNorm2d: 669 | # make sure the batch-norm layers are unchanged 670 | ss1 = orig.state_dict() 671 | ss2 = fine.state_dict() 672 | assert ((ss1['running_mean']-ss2['running_mean']).sum()==0 and \ 673 | (ss1['running_var']-ss2['running_var']).sum()==0), \ 674 | 'found mismatch between batch norm on layer {}'.format(i) 675 | 676 | continue 677 | 678 | if type(orig) is not nn.Conv2d: 679 | continue 680 | s1 = orig.weight.size() 681 | nOrigParams = np.prod(s1) 682 | nNewParams = s1[0]*(1+s1[0]) 683 | w1 = orig.weight.view(s1[0],-1) # Old weights 684 | s2 = fine.weight.size() 685 | w2 = fine.weight.view(s1[0],-1) # new weights 686 | A = ton(w1).T 688 | B = ton(w2).T 690 | X,residuals,rank,s = lstsq(A,B) # least-squares approximation 691 | cur_mean_error = np.abs((A.dot(X)-B)).mean() 692 | abs_errors.append(cur_mean_error) 693 | return abs_errors 694 | 707 | initializationTypes = ['linear_approx','random','diagonal'] 708 | def makeControlledConv(orig,fine,initializationType='linear_approx'): 709 | assert initializationType in initializationTypes,'Unknown initialization type for controlledConv: {}'.format(initializationType) 710 | s1 = orig.weight.size() 711 | s2 = fine.weight.size() 712 | nOrigParams = np.prod(s1) 713 | nNewParams = s2[0]*(1+s1[0]) 714 | 715 | print s1,s2 716 | 717 | w1 = orig.weight.view(s1[0],-1) # Old weights 719 | w2 = fine.weight.view(s2[0],-1) # new weights 720 | A = ton(w1).T 721 | B = ton(w2).T 722 | if initializationType == 'linear_approx': 723 | X,residuals,rank,s = lstsq(A,B) # least-squares approximation 724 | 725 | elif initializationType == 'random': 726 | X = torch.zeros(s1[0],s2[0]) 728 | init.xavier_uniform(X) 729 | X = X.numpy() 730 | elif initializationType == 'diagonal': 731 | # s1[0] must be a multiple of s2[0] 732 | assert s1[0] % s2[0] == 0 733 | 734 | X = [torch.eye(s2[0])]* (s1[0] / s2[0]) 735 | X = torch.cat(X) 736 | X = X.numpy() 737 | 738 | else: 739 | raise Exception('This code should not be reached.') 740 | 741 | m = controlledConv(orig,torch.Tensor(X.T),fine.bias) 742 | 743 | return m,A,B,X 744 | 
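The `'linear_approx'` branch above initializes the controller in closed form: it solves for the matrix X that best reconstructs the fine-tuned filters as linear combinations of the original ones. A tiny self-contained check of that step, with random matrices standing in for real flattened weights:

```python
# A: original filters as columns, B: fine-tuned filters; solve min_X ||A.X - B||
# so that the controlled conv starts out close to the fine-tuned network.
A = np.random.randn(576, 128)   # (C_in*k*k, C_out) layout, as in makeControlledConv
B = np.random.randn(576, 128)
X, residuals, rank, sv = lstsq(A, B)
print 'mean abs reconstruction error:', np.abs(A.dot(X) - B).mean()
```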
745 | def makeControllerNetwork(net_orig,net_fine, initializationType='linear_approx', verbose = True, trackValues = True): 746 | """ Given two sequential networks net_orig and net_fine with the same structure, 747 | reformulate net_fine so that it is compactly represented by re-using the weights of net_orig. 748 | Params : 749 | net_orig - the original network 750 | net_fine - the network to be approximated 751 | initializationType - one of 'linear_approx', 'random', 'diagonal' (default: 'linear_approx') 752 | 753 | verbose - whether to track and print the layer-wise error for some random input, stemming 754 | from the linear approximations. 755 | """ 756 | a_fine = list(net_fine.features.children()) 757 | for p in net_fine.parameters(): 758 | p.requires_grad=False 759 | 760 | a_orig = list(net_orig.features.children()) 761 | 762 | v = Variable(torch.randn(1,3,64,64)) 763 | v = v.cpu() 764 | 765 | value_fine = v.cuda() 766 | value_new = v.cuda() 767 | 768 | s_fine_vs_new = [] 769 | s_controlled_vs_fine = [] 770 | errors = [] 771 | newChildren = [] 772 | oldChildren = [] 773 | types = [] 774 | 776 | bar = progressbar.ProgressBar(max_value=len(a_fine)) 777 | 778 | for i,(orig,fine) in bar(enumerate(izip(a_orig,a_fine))): 779 | wasBN = False 781 | tt = type(fine) 782 | tt_str = str(tt) 783 | types.append(tt_str.split('.')[-1][:-2]) 784 | if tt is nn.Conv2d: 787 | m,A,B,X = makeControlledConv(orig,fine,initializationType) 790 | m.cuda() 791 | elif tt is nn.BatchNorm2d: 792 | 793 | wasBN = True 794 | m.set_bn(orig) # attach this BN to the controlledConv built in the previous iteration 795 | m = orig # keep the original BN for the value tracking below 798 | else: 799 | m = fine 804 | 805 | value_fine_before = value_fine 806 | value_new_before = value_new 807 | 808 | oldChildren.append(fine) 809 | if not wasBN: 810 | newChildren.append(m) 811 | if trackValues: 812 | value_fine = fine(value_fine) 813 | value_new = m(value_new) 814 | curdiff = (value_fine-value_new).data.abs().mean() 815 | if verbose: 816 | print 'diff:',curdiff 817 | 818 | s_fine_vs_new.append(curdiff) 819 | 820 | return newChildren,oldChildren,s_fine_vs_new,types 821 | 822 | def scalarVar(s): 823 | return Variable(torch.ones(1).cuda() * s) 824 | 825 | def extractFeats(model,loader): 826 | # Extract all top-layer features once. 827 | cats = [] 828 | feats = [] 829 | for i,(a,b) in enumerate(loader): 830 | print i, 831 | a = Variable(a.cuda()) 832 | feats.append(ton(model(a))) 833 | cats.append(b.numpy()) 834 | feats = np.vstack(feats) 835 | cats = list(itertools.chain.from_iterable(cats)) 836 | return feats,cats 837 | def makeFeatLoader(model,loader,batch_size): 838 | feats,cats = extractFeats(model,loader) 839 | return DataLoader(TensorDataset(torch.Tensor(feats),torch.Tensor(cats)),batch_size=batch_size,shuffle=True) 840 | 841 | class shifterNet(nn.Module): 842 | def __init__(self, decider,shiftable): 843 | super(shifterNet, self).__init__() 844 | self.decider = decider 845 | self.shiftable = shiftable 846 | def forward(self, x): 847 | my_alpha = F.softmax(self.decider(x))[:,1:] 848 | my_alpha[my_alpha < .5] = 0 849 | my_alpha[my_alpha >= .5] = 1 850 | return self.shiftable(x,my_alpha) 851 | 852 | ''' 853 | class Scale(object): # This is a copy from the torchvision repository, it's just a version conflict 854 | 855 | """Rescales the input PIL.Image to the given 'size'. 856 | If 'size' is a 2-element tuple or list in the order of (width, height), it will be the exact size to scale to. 857 | If 'size' is a number, it will indicate the size of the smaller edge. 
858 | For example, if height > width, then image will be 859 | rescaled to (size * height / width, size) 860 | size: size of the exactly size or the smaller edge 861 | interpolation: Default: PIL.Image.BILINEAR 862 | """ 863 | 864 | def __init__(self, size, interpolation=Image.BILINEAR): 865 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 866 | self.size = size 867 | self.interpolation = interpolation 868 | 869 | def __call__(self, img): 870 | if isinstance(self.size, int): 871 | w, h = img.size 872 | if (w <= h and w == self.size) or (h <= w and h == self.size): 873 | return img 874 | if w < h: 875 | ow = self.size 876 | oh = int(self.size * h / w) 877 | return img.resize((ow, oh), self.interpolation) 878 | else: 879 | oh = self.size 880 | ow = int(self.size * w / h) 881 | return img.resize((ow, oh), self.interpolation) 882 | else: 883 | return img.resize(self.size, self.interpolation) 884 | ''' 885 | def getTrainableParams(model): 886 | if type(model) is list: 887 | return [p for p in model if p.requires_grad] 888 | else: 889 | return [p for p in model.parameters() if p.requires_grad] 890 | def makeTrainable(model,toggle): 891 | for p in model.parameters(): 892 | p.requires_grad = toggle 893 | if hasattr(model,'features'): 894 | for q in model.features: 895 | q.train() 896 | 897 | from PIL import Image,ImageOps 898 | import numbers 899 | 900 | class RandomCrop(object): 901 | """Crops the given PIL.Image at a random location to have a region of 902 | the given size. size can be a tuple (target_height, target_width) 903 | or an integer, in which case the target will be of a square shape (size, size) 904 | """ 905 | 906 | def __init__(self, size, padding=0, fill = 0): 907 | if isinstance(size, numbers.Number): 908 | self.size = (int(size), int(size)) 909 | else: 910 | self.size = size 911 | self.padding = padding 912 | self.fill = fill 913 | 914 | def __call__(self, img): 915 | if self.padding > 0: 916 | img = ImageOps.expand(img, border=self.padding, fill=self.fill) 917 | 918 | w, h = img.size 919 | th, tw = self.size 920 | if w == tw and h == th: 921 | return img 922 | 923 | x1 = random.randint(0, w - tw) 924 | y1 = random.randint(0, h - th) 925 | return img.crop((x1, y1, x1 + tw, y1 + th)) 926 | 927 | # Make a relatively lightweight model for the baselines 928 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] 929 | # Various configuration parameters 930 | 931 | baseDataDir = os.path.expanduser('~/data_transfer/') 932 | modelsBaseDir = os.path.expanduser('~/models') 933 | all_datasets = {} 934 | all_datasets['caltech256'] = {'trainDir': 'Caltech256/train/', 935 | 'testDir': 'Caltech256/test/', 936 | 'nClasses':257} 937 | all_datasets['omniglot'] = {'trainDir': 'omniglot/python/train/', 938 | 'testDir': 'omniglot/python/test/', 939 | 'nClasses':1623} 940 | all_datasets['daimler'] = {'trainDir': 'daimler/all_train/', 941 | 'testDir': 'daimler/all_test', 942 | 'nClasses':2} 943 | all_datasets['sketch'] = {'trainDir': 'sketch_train', 944 | 'testDir': 'sketch_test', 945 | 'nClasses':250} 946 | all_datasets['GTSR'] = {'trainDir': 'GTSR/Final_Training/', 947 | 'testDir': 'GTSR/Final_Test/', 948 | 'nClasses':43} 949 | all_datasets['CIFAR-10'] = {'trainDir': 'cifar-10/train/', 950 | 'testDir': 'cifar-10/test/', 951 | 'nClasses':10} 952 | all_datasets['CIFAR-100'] = {'trainDir': 'cifar-100/train/', 953 | 'testDir': 'cifar-100/test/', 954 | 'nClasses':100} 955 | 956 | all_datasets['SVHN'] = {'trainDir': 'svhn/train/', 957 
| 'testDir': 'svhn/test/', 958 | 'nClasses':10} 959 | all_datasets['plankton'] = {'trainDir': 'plankton_train', 960 | 'testDir': 'plankton_test', 961 | 'nClasses':121} 962 | all_datasets['CUB'] = {'trainDir': 'CUB/train', 963 | 'testDir': 'CUB/test', 964 | 'nClasses':200} 965 | all_datasets['mnist'] = {'trainDir': 'mnist/train', 966 | 'testDir': 'mnist/test', 967 | 'nClasses':10} 968 | 969 | all_datasets_extra = {} 970 | for k in all_datasets.keys(): 971 | all_datasets_extra[k] = {} 972 | #all_datasets_extra['sketch'] = {'crop_fill':1} 973 | all_datasets_extra['SVHN'] = {'augment_flip':False} 974 | all_datasets_extra['omniglot'] = {'augment_flip':False} 975 | 976 | dataset_stats = pickle.load(os.path.join(baseDataDir,'database_stats.pkl')) # per-dataset (mean, variance); the file ships as database_stats.pkl at the repo root -- adjust the path as needed 977 | 978 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] # B 979 | 980 | big_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] # D 981 | 982 | cuda=True 983 | lr_drop_freq = 10 984 | base_lr = 1e-3 985 | adjust_learning_rate = None 986 | 987 | import random 988 | def makeNet(name,bigNet=False,fullyconv=False,batch_norm=True): 989 | nClasses = all_datasets[name]['nClasses'] 990 | my_cfg = cfg 991 | if bigNet: 992 | my_cfg = big_cfg 993 | model = VGG(make_layers(my_cfg,batch_norm=batch_norm,fullyconv=fullyconv),fc_size=2048, num_classes= nClasses,fullyconv=fullyconv) 994 | return model 995 | ''' 996 | class RandomHorizontalFlip(object): 997 | """Randomly horizontally flips the given PIL.Image or np.ndarray with a probability of 0.5 998 | """ 999 | def __call__(self, img): 1000 | if random.random() < 0.5: 1001 | if isinstance(img, np.ndarray): 1002 | return np.fliplr(img) 1003 | else: 1004 | return img.transpose(Image.FLIP_LEFT_RIGHT) 1005 | return img 1006 | 1007 | 1008 | ''' 1009 | def makeLoaders2(name, stats = None): 1010 | """ 1011 | Quick and easy loaders, with default values. 1012 | """ 1013 | if stats is None: 1014 | stats = dataset_stats[name] 1015 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1016 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1017 | augment_flip = all_datasets_extra[name].get("augment_flip", False) 1018 | augment_crop = all_datasets_extra[name].get("augment_crop", False) 1019 | crop_fill = all_datasets_extra[name].get("crop_fill", 1020 | tuple( (255*dataset_stats[name][0]).astype(np.uint8))) 1021 | 1022 | print augment_flip,augment_crop,crop_fill 1023 | 1024 | train_loader,test_loader = makeLoaders(trainDir,testDir,stats,augment_flip=augment_flip, 1025 | augment_crop=augment_crop,crop_fill=crop_fill) 1026 | 1027 | return train_loader,test_loader 1028 | 
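A usage sketch for the helpers above (it assumes the CIFAR-10 directories exist under `baseDataDir` as configured in `all_datasets`, and that `dataset_stats` loaded successfully):

```python
model = makeNet('CIFAR-10').cuda()
train_loader, test_loader = makeLoaders2('CIFAR-10')
images, labels = next(iter(train_loader))   # a (128, 3, 64, 64) batch
```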
1029 | def makeLoaders(train_dir,test_dir,stats,augment_flip=False, augment_crop = False, 1031 | crop_fill = 0): 1032 | # remove mean and divide by std (the computed stats hold the variance, hence the sqrt) 1034 | normalize = transforms.Normalize(np.asarray(stats[0]), np.asarray(stats[1])**.5) 1035 | # random crop for jittering at train time. 1036 | transform_list = [transforms.Scale((64,64))] # assumes a torchvision version whose Scale accepts a (w, h) tuple; see the commented-out copy above 1037 | if augment_crop: 1038 | transform_list.append(RandomCrop(64,8,fill = crop_fill)) 1039 | if augment_flip: 1040 | transform_list.append(transforms.RandomHorizontalFlip()) 1041 | 1042 | transform_list.extend([transforms.ToTensor(), normalize]) 1043 | transform_train=transforms.Compose(transform_list) 1044 | 1045 | db_train = DataLoader(dataset=datasets.ImageFolder(root = train_dir, transform=transform_train), 1046 | batch_size=128, shuffle=True,**kwargs) 1047 | transform_test=transforms.Compose([transforms.Scale((64,64)), transforms.ToTensor(), normalize]) 1048 | db_test = DataLoader(dataset=datasets.ImageFolder(root = test_dir, transform=transform_test), 1049 | batch_size=128, shuffle=True) 1050 | return db_train,db_test 1051 | 1052 | 1053 | def freezeAllButLastLayer(model): 1054 | for p in model.features.parameters(): 1055 | p.requires_grad = False 1056 | children = list(model.classifier.children()) 1057 | for p in children[:-1]: 1058 | for q in p.parameters(): 1059 | q.requires_grad = False 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | def make_layers(cfg_1, batch_norm=False,instance_norm = False, affine=False,fullyconv=False): 1067 | cfg = list(cfg_1) # copy it to make sure it's not modified 1068 | if batch_norm and instance_norm: 1069 | raise Exception('cannot use both batch and instance normalization') # note: instance_norm is accepted but no InstanceNorm layer is added below 1070 | layers = [] 1071 | in_channels = 3 1072 | if fullyconv: 1073 | cfg.append(512) 1075 | for i,v in enumerate(cfg): 1076 | if v == 'M': 1077 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 1078 | else: 1079 | 1080 | my_kernel_size = 3 # hacky! 1081 | my_padding = 1 1082 | if fullyconv and i == len(cfg)-1: 1084 | my_kernel_size = 2 1085 | my_padding = 0 1086 | 1087 | conv2d = nn.Conv2d(in_channels, v, kernel_size=my_kernel_size, padding=my_padding) 1089 | init.kaiming_uniform(conv2d.weight) 1090 | if batch_norm: 1091 | layers +=[ conv2d, nn.BatchNorm2d(v, affine=affine), nn.ReLU(inplace=True)] 1092 | else: 1093 | layers += [conv2d, nn.ReLU(inplace=True)] 1094 | in_channels = v 1095 | return nn.Sequential(*layers) 1096 | 1097 | 1098 | def makeModelDirName(name,modelsBaseDir = modelsBaseDir, sfx='',baseNetwork=None): 1099 | modelDir = os.path.join(modelsBaseDir,'baseline_'+name+sfx) 1100 | if baseNetwork is not None: 1101 | modelDir += '_from_'+baseNetwork['name'] 1102 | if baseNetwork['onlyLastLayer']: 1103 | modelDir+='_last' 1104 | return modelDir 1105 | 1106 | def doTrainingStuff(name,modelsBaseDir = modelsBaseDir, maxIters=np.inf, override=False,differentStats = None, 1107 | augment_flip=True,augment_crop = False, bigNet=False, 1108 | base_lr = 1e-3, baseNetwork=None,epochs=50,sfx='',batch_norm=True, 1109 | instance_norm=False, affine=False,cuda=True,lr_drop_freq=lr_drop_freq,optimizer=None, 1110 | adjust_learning_rate=adjust_learning_rate,disableBatchNorm=False,fullyconv=False): 1111 | 1112 | 1113 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1114 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1115 | 1116 | modelDir = makeModelDirName(name,modelsBaseDir = modelsBaseDir,sfx=sfx,baseNetwork=baseNetwork) 1117 | 1118 | if override: 1119 | if not os.path.isdir(modelDir): 1120 | print '{} : nothing to override - creating anew'.format(name) 1121 | else: 1122 | print 'warning - moving model for {} to backup at {}_BAK'.format(name,modelDir) 1123 | shutil.move(modelDir,modelDir+'_BAK') 1124 | 1125 | 
if differentStats is not None: 1126 | print 'using different stats...' 1127 | stats = differentStats 1128 | else: 1129 | print 1130 | stats = dataset_stats[name] 1131 | 1132 | 1133 | augment_flip = all_datasets_extra[name].get("augment_flip", augment_flip) 1134 | augment_crop = all_datasets_extra[name].get("augment_crop", augment_crop) 1135 | crop_fill = all_datasets_extra[name].get("crop_fill", 1136 | tuple( (255*dataset_stats[name][0]).astype(np.uint8))) 1137 | 1138 | train_loader,test_loader = makeLoaders(trainDir,testDir,stats,augment_flip=augment_flip, 1139 | augment_crop=augment_crop,crop_fill=crop_fill) 1140 | #disableBatchNorm = False 1141 | 1142 | if baseNetwork is None: 1143 | print 'training network from scratch...' 1144 | #model = makeNet(name,bigNet) 1145 | 1146 | nClasses = all_datasets[name]['nClasses'] 1147 | my_cfg = cfg 1148 | if bigNet: 1149 | my_cfg = big_cfg 1150 | model = VGG(make_layers(my_cfg, batch_norm=batch_norm, instance_norm=instance_norm, affine=affine,fullyconv=fullyconv), 1151 | fc_size=2048, num_classes = nClasses,fullyconv=fullyconv) 1152 | 1153 | else: # fine tune from an existing network. 1154 | 1155 | print 'fine tuning network from',baseNetwork['name'] 1156 | epochs = baseNetwork.get('max_ft_epochs',epochs) 1157 | toContinue = baseNetwork.get('toContinue',False) 1158 | model = baseNetwork['net'] 1159 | if not toContinue: 1160 | mod = list(model.classifier.children()) 1161 | mod.pop() 1162 | nClasses = all_datasets[name]['nClasses'] 1163 | mod.append(torch.nn.Linear(512, nClasses)) 1164 | model.classifier = nn.Sequential(*mod) 1165 | control = baseNetwork.get('control',False) 1166 | if control: 1167 | raise NotImplementedError('Still need to link this to the controlling module') 1168 | if baseNetwork['onlyLastLayer']: 1169 | freezeAllButLastLayer(model) 1170 | 1171 | #disableBatchNorm = baseNetwork.get('disableBatchNorm',True) 1172 | 1173 | 1174 | # TODO - should we disallow the batch-norm layers to change from now? 
#'onlyLastLayer':True,'max_ft_epochs':max_ft_epochs}) 1176 | params = [p for p in model.parameters() if p.requires_grad] 1177 | if cuda: 1178 | model.cuda() 1179 | 1180 | if optimizer is None: 1181 | optimizer = optim.Adam(params = params) 1182 | elif type(optimizer) is str: 1183 | if optimizer=='sgd': 1184 | optimizer = optim.SGD(lr=base_lr,momentum=.9,weight_decay=0.0001,params = params) 1185 | elif optimizer=='rmsprop': 1186 | optimizer = optim.RMSprop(lr=base_lr,momentum=.9,weight_decay=0.0001,params = params) 1187 | else: 1188 | raise Exception('unexpected optimizer') 1189 | 1190 | 1191 | trainAndTest(model= model, modelDir = modelDir, epochs=epochs, targetTranslator=None, model_save_freq=5, 1192 | train_loader=train_loader, test_loader=test_loader, stopIfPerfect=True, optimizer=optimizer, 1193 | criterion =nn.CrossEntropyLoss(), adjust_learning_rate=adjust_learning_rate, 1194 | maxIters=maxIters,base_lr=base_lr, lr_drop_freq=lr_drop_freq, disableBatchNorm=disableBatchNorm, 1195 | cuda=cuda) 1196 | return model 1197 | def loadLastCheckpoint(model,modelDir,removeBest=False,verbose=False, onlyPerf = False): 1198 | g = list(sorted(glob.glob(os.path.join(modelDir,'*')))) 1199 | 1200 | 1201 | if verbose: 1202 | print 'number of saved checkpoints:',len(g) 1203 | 1204 | if removeBest: 1205 | g = [a for a in g if 'best' not in a] 1206 | 1207 | if len(g) > 0: 1208 | lastCheckpoint = g[-1] 1209 | if verbose: 1210 | print 'last checkpoint:',lastCheckpoint 1211 | 1214 | T = torch.load(lastCheckpoint) 1215 | if not onlyPerf: 1216 | if type(T) is dict: 1217 | model.load_state_dict(T['state_dict']) 1218 | else: 1219 | model.load_state_dict(T) 1220 | model.cuda() 1221 | 1222 | if verbose and type(T) is dict: 1223 | print 'loaded with accuracy of', T['best_acc'] 1224 | 1225 | return T,model 1226 | else: 1227 | raise Exception('No checkpoint found for {}'.format(modelDir)) 1228 | 1229 | def testNet(name,maxSamples=500,modelDir=None): 1230 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1231 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1232 | 1233 | if modelDir is None: 1234 | print 'reverting to default model dir.' 1235 | modelDir = os.path.join(modelsBaseDir,'baseline_'+name) 1236 | train_loader,test_loader = makeLoaders(trainDir,testDir,dataset_stats[name]) 1237 | 1238 | 1239 | print 'testing dataset:',name 1240 | 1242 | model = makeNet(name) 1243 | checkpoint = loadLastCheckpoint(model,modelDir) 1244 | return quickTest(model,test_loader,maxSamples=maxSamples) 1245 | 1246 | def loadNet(name,verbose=False,isalpha=False): 1247 | """ 1248 | Loads the default network for this dataset name. 1249 | """ 1250 | if verbose: 1251 | print 'loading',name 1252 | model = makeNet(name) 1253 | modelDir = makeModelDirName(name) 1254 | checkpoint = loadLastCheckpoint(model,modelDir,verbose=verbose) 1255 | return model,checkpoint 
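A sketch of loading a trained baseline by dataset name and sanity-checking it (this assumes `doTrainingStuff` previously wrote checkpoints under `modelsBaseDir`):

```python
model, checkpoint = loadNet('CIFAR-10', verbose=True)
_, test_loader = makeLoaders2('CIFAR-10')
acc = quickTest(model, test_loader, maxSamples=1000)
```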
1256 | 1257 | # functions to "concatenate" the feature parts of neural networks. 1258 | 1259 | # concatenate convolutions 1260 | 1261 | def concatConv(c1,c2): 1262 | newWeights = torch.cat([c1.weight,c2.weight]) # assumes c1 and c2 have identical shapes 1263 | newBias = torch.cat([c1.bias,c2.bias]) 1264 | s = c1.weight.size() 1265 | c3 = nn.Conv2d(s[1],2*s[0],s[2],stride = c1.stride, padding=c1.padding) 1266 | c3.weight.data = newWeights.data 1267 | c3.bias.data = newBias.data 1268 | return c3 1269 | 1270 | 1271 | def concatBN(bn1,bn2): 1272 | s = bn1.num_features 1273 | bn3 = nn.BatchNorm2d(s*2) 1274 | bn3.running_mean = torch.cat([bn1.running_mean,bn2.running_mean]) 1275 | bn3.running_var = torch.cat([bn1.running_var,bn2.running_var]) # note: learned affine parameters, if any, are not concatenated here 1276 | return bn3 1277 | 1278 | 1279 | # good! 1280 | def concatNets(net1,net2): # concatenet :-) 1281 | newFeatures = [] 1282 | for f1,f2 in zip(net1.features,net2.features): 1283 | tt1,tt2 = type(f1),type(f2) 1284 | assert tt1==tt2,'cannot concatenate networks with different structures' 1285 | if tt1 is nn.Conv2d: 1286 | newFeatures.append(concatConv(f1,f2)) 1287 | elif tt1 is nn.BatchNorm2d: 1288 | newFeatures.append(concatBN(f1,f2)) 1289 | elif tt1 in [nn.MaxPool2d, nn.ReLU]: 1290 | newFeatures.append(f1) 1291 | else: 1292 | raise Exception("Don't know how to \"concatenate\" modules of types {},{}".format(tt1,tt2)) 1293 | return newFeatures 1294 | 1295 | '''ResNet in PyTorch. 1296 | BasicBlock and Bottleneck modules are from the original ResNet paper: 1297 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 1298 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 1299 | PreActBlock and PreActBottleneck modules are from the later paper: 1300 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 1301 | Identity Mappings in Deep Residual Networks. arXiv:1603.05027 1302 | ''' 1303 | import torch 1304 | import torch.nn as nn 1305 | import torch.nn.functional as F 1306 | 1307 | from torch.autograd import Variable 1308 | 1309 | def conv3x3(in_planes, out_planes, stride=1): 1310 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 1311 | 1312 | 1313 | class BasicBlock(nn.Module): 1314 | expansion = 1 1315 | 1316 | def __init__(self, in_planes, planes, stride=1): 1317 | super(BasicBlock, self).__init__() 1318 | self.conv1 = conv3x3(in_planes, planes, stride) 1319 | self.bn1 = nn.BatchNorm2d(planes) 1320 | self.conv2 = conv3x3(planes, planes) 1321 | self.bn2 = nn.BatchNorm2d(planes) 1322 | 1323 | self.shortcut = nn.Sequential() 1324 | if stride != 1 or in_planes != self.expansion*planes: 1325 | self.shortcut = nn.Sequential( 1326 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 1327 | nn.BatchNorm2d(self.expansion*planes) 1328 | ) 1329 | 1330 | def forward(self, x): 1331 | out = F.relu(self.bn1(self.conv1(x))) 1332 | out = self.bn2(self.conv2(out)) 1333 | out += self.shortcut(x) 1334 | out = F.relu(out) 1335 | return out 1336 | 1337 | 1338 | class PreActBlock(nn.Module): 1339 | '''Pre-activation version of the BasicBlock.''' 1340 | expansion = 1 1341 | 1342 | def __init__(self, in_planes, planes, stride=1): 1343 | super(PreActBlock, self).__init__() 1344 | self.bn1 = nn.BatchNorm2d(in_planes) 1345 | self.conv1 = conv3x3(in_planes, planes, stride) 1346 | self.bn2 = nn.BatchNorm2d(planes) 1347 | self.conv2 = conv3x3(planes, planes) 1348 | 1349 | self.shortcut = nn.Sequential() 1350 | if stride != 1 or in_planes != self.expansion*planes: 1351 | self.shortcut = nn.Sequential( 1352 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) 1353 | ) 1354 | 1355 | def forward(self, x): 
1356 | out = F.relu(self.bn1(x)) 1357 | shortcut = self.shortcut(out) 1358 | out = self.conv1(out) 1359 | out = self.conv2(F.relu(self.bn2(out))) 1360 | out += shortcut 1361 | return out 1362 | 1363 | 1364 | class Bottleneck(nn.Module): 1365 | expansion = 4 1366 | 1367 | def __init__(self, in_planes, planes, stride=1): 1368 | super(Bottleneck, self).__init__() 1369 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 1370 | self.bn1 = nn.BatchNorm2d(planes) 1371 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 1372 | self.bn2 = nn.BatchNorm2d(planes) 1373 | self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) 1374 | self.bn3 = nn.BatchNorm2d(self.expansion*planes) 1375 | 1376 | self.shortcut = nn.Sequential() 1377 | if stride != 1 or in_planes != self.expansion*planes: 1378 | self.shortcut = nn.Sequential( 1379 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 1380 | nn.BatchNorm2d(self.expansion*planes) 1381 | ) 1382 | 1383 | def forward(self, x): 1384 | out = F.relu(self.bn1(self.conv1(x))) 1385 | out = F.relu(self.bn2(self.conv2(out))) 1386 | out = self.bn3(self.conv3(out)) 1387 | out += self.shortcut(x) 1388 | out = F.relu(out) 1389 | return out 1390 | 1391 | class PreActBottleneck(nn.Module): 1392 | '''Pre-activation version of the original Bottleneck module.''' 1393 | expansion = 4 1394 | def __init__(self, in_planes, planes, stride=1): 1395 | super(PreActBottleneck, self).__init__() 1396 | self.bn1 = nn.BatchNorm2d(in_planes) 1397 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 1398 | self.bn2 = nn.BatchNorm2d(planes) 1399 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 1400 | self.bn3 = nn.BatchNorm2d(planes) 1401 | self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) 1402 | 1403 | self.shortcut = nn.Sequential() 1404 | if stride != 1 or in_planes != self.expansion*planes: 1405 | self.shortcut = nn.Sequential( 1406 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) 1407 | ) 1408 | 1409 | def forward(self, x): 1410 | out = F.relu(self.bn1(x)) 1411 | shortcut = self.shortcut(out) 1412 | out = self.conv1(out) 1413 | out = self.conv2(F.relu(self.bn2(out))) 1414 | out = self.conv3(F.relu(self.bn3(out))) 1415 | out += shortcut 1416 | return out 1417 | 1418 | def default_loader(path): 1419 | return Image.open(path).convert('RGB') 1420 | 1421 | def default_flist_reader(flist): 1422 | """ 1423 | flist format: impath label\nimpath label\n ... (same as caffe's filelist) 1424 | """ 1425 | imlist = [] 1426 | with open(flist, 'r') as rf: 1427 | for line in rf.readlines(): 1428 | impath, imlabel = line.strip().split() 1429 | imlist.append( (impath, int(imlabel)) ) 1430 | 1431 | return imlist 1432 | 1433 | class ImageFilelist(data.Dataset): 1434 | def __init__(self, root, flist, transform=None, target_transform=None, 1435 | flist_reader=default_flist_reader, loader=default_loader): 1436 | self.root = root 1437 | self.imlist = flist_reader(flist) 1438 | self.transform = transform 1439 | self.target_transform = target_transform 1440 | self.loader = loader 1441 | 1442 | def __getitem__(self, index): 1443 | impath, target = self.imlist[index] 1444 | img = self.loader(os.path.join(self.root,impath)) 1445 | if self.transform is not None: 1446 | img = self.transform(img) 1447 | if self.target_transform is not None: 1448 | target = 
self.target_transform(target) 1449 | 1450 | return img, target 1451 | 1452 | def __len__(self): 1453 | return len(self.imlist) 1454 | --------------------------------------------------------------------------------
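Finally, a usage sketch for `ImageFilelist`, driven by a file list in the format `default_flist_reader` expects (the directory and file names here are hypothetical):

```python
# /data/mydataset/train.txt contains lines like:  images/cat_001.jpg 0
dataset = ImageFilelist(root='/data/mydataset',
                        flist='/data/mydataset/train.txt',
                        transform=transforms.Compose([transforms.Scale((64, 64)),
                                                      transforms.ToTensor(),
                                                      normalize]))
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                    num_workers=num_workers)
```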