├── Incremental-Learning-Main.ipynb ├── README.md ├── database_stats.pkl └── funs.py /README.md: -------------------------------------------------------------------------------- 1 | # incremental_learning 2 | Initial code for the paper [Incremental Learning through Deep Adaptation](https://arxiv.org/abs/1705.04228), by Amir Rosenfeld, John Tsotsos 3 | 4 | 5 | This work is now superseded by "Efficient parametrization of multi-domain deep neural networks" by S. Rebuffi, H. Bilen and A. Vedaldi. I recommend using that repo instead. It can be found [here](https://github.com/srebuffi/residual_adapters) 6 | 7 | Note that this is a very early commit. It cannot work straight out of the box due to absolute paths, etc. 8 | The main function of interest here is "makeItControlled" in the .ipynb file, which adds controller modules to a new model so that it re-uses the modules of an old model. 9 | Everything else is boilerplate: training, testing, and experiments. 10 | Please report any issues, comments, or suggestions as they come up. 11 | 12 | -------------------------------------------------------------------------------- /database_stats.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rosenfeldamir/incremental_learning/0d47970f919c3ae2eb0494170cddb650beb8e6e2/database_stats.pkl -------------------------------------------------------------------------------- /funs.py: -------------------------------------------------------------------------------- 1 | 2 | import progressbar 3 | import sys 4 | import argparse 5 | import torch 6 | from copy import deepcopy 7 | import numpy as np 8 | import torch.nn as nn 9 | from torch.nn import init 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | import torchvision 13 | from torchvision import datasets, transforms 14 | from torch.autograd import Variable 15 | import torchvision.models as models 16 | from torch.utils.data import Dataset, TensorDataset, DataLoader 17 | import torch.utils.data as data 18 | 19 | from tensorboard_logger import configure, log_value, Logger 20 | 21 | import itertools 22 | from itertools import izip 23 | from matplotlib import pyplot as plt 24 | import os,os.path 25 | import glob 26 | from time import time 27 | import shutil 28 | import hickle as pickle 29 | from PIL import Image 30 | import collections 31 | import math 32 | 33 | batch_size = 128 34 | base_lr = .1 35 | lr_drop_freq=10 36 | criterion = nn.CrossEntropyLoss() 37 | num_workers = 0 38 | from os.path import expanduser 39 | homeDir = expanduser('~') 40 | sys.path.append(os.path.join(homeDir,'YellowFin_Pytorch/tuner_utils/')) # yellowfin :-) 41 | from yellowfin import YFOptimizer 42 | 43 | def matVar(size=(1,3,64,64),cuda=False): 44 | v = Variable(torch.randn(size)) 45 | if cuda: 46 | v = v.cuda() 47 | return v 48 | 49 | 56 | def adjust_learning_rate(optimizer, epoch, base_lr, lr_drop_freq = 100, gamma=0.1): 57 | """Sets the learning rate to the initial LR decayed by gamma every lr_drop_freq epochs""" 58 | if (epoch + 1) % lr_drop_freq == 0: # Note this works only for continuous mode (not stopping+loading) 59 | if type(optimizer) is YFOptimizer: 60 | optimizer.set_lr_factor(optimizer.get_lr_factor() * gamma) 61 | else: 62 | for param_group
in optimizer.param_groups: 63 | param_group['lr'] = param_group['lr'] * gamma 64 | 65 | def train(model,epoch,optimizer,maxIters=np.inf,targetTranslator=None,train_loader=None, criterion=None,criterion2 = None,disableBatchNorm=False,cuda=True, balancing_factor = 0.0, logger=None): 66 | T0 = time() 67 | if not disableBatchNorm: 68 | model.train() 69 | else: 70 | model.eval() 71 | 72 | nBatches = 0 73 | running_loss = 0.0 74 | running_loss2 = 0.0 75 | losses = [] 76 | nSamples=0 77 | maxIters = min(maxIters,len(train_loader)) 78 | startTime = time() 79 | for batch_idx, (data, target) in enumerate(train_loader): 80 | 81 | target = target.long().squeeze() 82 | if targetTranslator is not None: 83 | target2 = targetTranslator(target.clone()) 84 | if cuda: 85 | target2 = Variable(target2.cuda()) 86 | if cuda: 87 | data, target = data.cuda(), target.cuda() 88 | data, target = Variable(data), Variable(target) 89 | 90 | optimizer.zero_grad() 91 | 92 | output = model(data) 93 | if type(output) is tuple: 94 | gates = output[1] 95 | output = output[0] 96 | 97 | 98 | 99 | loss = criterion(output, target) 100 | if criterion2 is not None and balancing_factor > 0: 101 | loss2 = criterion2(gates) 102 | loss += balancing_factor * loss2 103 | else: 104 | loss2 = 0 105 | 106 | 107 | 108 | loss.backward() 109 | optimizer.step() 110 | losses.append(loss.data[0]) 111 | running_loss += loss.data[0] 112 | if criterion2 is not None and balancing_factor > 0: 113 | running_loss2 += loss2.data[0]/balancing_factor 114 | else: 115 | running_loss2 = -1 116 | 117 | 118 | 119 | 120 | nBatches += 1 121 | nSamples += len(data) 122 | if batch_idx % 5 == 0 and time()-T0 > .1: 123 | T0 = time() 124 | elapsedTime = time()-startTime 125 | S = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tAvg Loss: {:.6f}\tAvg Loss 2: {:.6f} ({:.2f} imgs/sec)'.format(epoch, batch_idx * len(data), 126 | len(train_loader.dataset), 127 | 100. 
* batch_idx / len(train_loader), 128 | running_loss/nBatches,running_loss2/(nBatches), 129 | nSamples/elapsedTime) 130 | if logger is not None: 131 | logger.log_value('training loss',loss.data[0],batch_idx + epoch * maxIters) 132 | print '\r{}'.format(S), 133 | if batch_idx >= maxIters: 134 | break 135 | 136 | if logger is not None: 137 | if hasattr(optimizer,'param_groups'): 138 | 139 | for param_group in optimizer.param_groups: 140 | cur_lr = param_group['lr'] 141 | logger.log_value('learning rate',cur_lr,epoch) 142 | 143 | if hasattr(optimizer,'get_lr_factor'): 144 | logger.log_value('learning rate',optimizer.get_lr_factor(),epoch) 145 | 146 | return losses 147 | 148 | def test(model,epoch,targetTranslator=None,test_loader=None,prev_acc=0,alpha=None,criterion=None, maxIters=np.inf,cuda=True, logger=None): 149 | assert (criterion is not None) 151 | model.eval() 152 | test_loss = 0 153 | correct = 0 154 | nSamples = 0 155 | maxIters = min(maxIters,len(test_loader)) 156 | for batch_idx, (data, target) in enumerate(test_loader): 157 | target = target.long().squeeze() 158 | if targetTranslator is not None: 159 | target2 = targetTranslator(target.clone()) 160 | target2 = Variable(target2.cuda()) 162 | if cuda: 163 | data, target = data.cuda(), target.cuda() 164 | 165 | data, target = Variable(data), Variable(target) 166 | if alpha is not None: 167 | output = model(data,alpha) 168 | else: 170 | output = model(data) 171 | if type(output) is tuple: 172 | gates = output[1] 173 | output = output[0] 174 | cur_test_loss = criterion(output, target).data[0] 175 | test_loss += cur_test_loss 176 | 177 | pred = output.data.max(1)[1] # get the index of the max log-probability 178 | correct += pred.eq(target.data).cpu().sum() 179 | nSamples+=len(data) 180 | if batch_idx >= maxIters: 181 | break 182 | 183 | test_loss /= len(test_loader) # the loss function already averages over batch size 184 | if logger is not None: 185 | logger.log_value('test loss',test_loss,epoch) 186 | cur_acc = 100. * correct / nSamples 188 | P = '({}) :Test set: Avg. loss: {:.4f}, Acc: {}/{} ({:.1f}%)'.format(epoch, 189 | test_loss, correct, nSamples, cur_acc) 190 | if logger is not None: 191 | logger.log_value('test accuracy',cur_acc,epoch) 192 | 193 | print '\r{}'.format(P), 194 | return 100. * correct / nSamples 195 | 
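For orientation, here is a minimal sketch of how `train` and `test` are meant to be wired together. This driver is not code from the repo; `model`, `train_loader`, and `test_loader` are placeholders for any CUDA-resident classifier and torchvision-style loaders:

```python
# assumes: model is a classifier, loaders yield (image batch, integer labels)
model = model.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD([p for p in model.parameters() if p.requires_grad],
                      lr=base_lr, momentum=0.9)
for epoch in range(10):
    adjust_learning_rate(optimizer, epoch, base_lr, lr_drop_freq=10, gamma=0.1)
    train(model, epoch, optimizer, train_loader=train_loader, criterion=criterion)
    acc = test(model, epoch, test_loader=test_loader, criterion=criterion)
```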
196 | def checkModelConsistency(newModel,oldModel): 197 | for a_fine,a_orig in zip(newModel, oldModel): 198 | tt = type(a_orig) 199 | if tt is nn.Conv2d: 200 | print '*', 201 | w_fine = a_fine.w.transpose(0,1).contiguous().view(a_fine.s) 202 | w_orig = a_orig.weight 203 | assert (w_fine-w_orig).data.abs().sum() == 0 204 | #checkModelConsistency(f_fine_m,model_10.features.children()) 205 | 206 | 207 | def save_checkpoint(state, is_best, epoch, modelDir): 208 | """Saves a checkpoint to disk""" 209 | checkPointPath = '{}/{}'.format(modelDir,str(epoch).zfill(4)) 210 | torch.save(state, checkPointPath) 211 | if is_best: 212 | shutil.copyfile(checkPointPath, '{}/{}'.format(modelDir,'best')) 213 | 214 | def defaultCallBacks(): 215 | return {'trainEpochStart':[],'trainEpochEnd':[],'testEpochStart':[],'testEpochEnd':[]}
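The `trainAndTest` driver below accepts a `callbacks` dict in the shape returned by `defaultCallBacks`; each hook is invoked as `callback(model, optimizer, epoch)`. A small sketch of registering one (the `log_epoch` hook is hypothetical, not part of the repo):

```python
callbacks = defaultCallBacks()
def log_epoch(model, optimizer, epoch):
    # runs after every training epoch
    print 'finished epoch', epoch
callbacks['trainEpochEnd'].append(log_epoch)
# then pass callbacks=callbacks to trainAndTest below
```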
216 | 217 | def trainAndTest(model,optimizer=None,modelDir=None,epochs=5,targetTranslator=None,model_save_freq=20, 218 | train_loader=None,test_loader=None,stopIfPerfect=True, criterion=nn.CrossEntropyLoss(), 219 | criterion2 = None, adjust_learning_rate=adjust_learning_rate, maxIters=np.inf,base_lr=base_lr, 220 | lr_drop_freq=lr_drop_freq,disableBatchNorm=False,cuda=True,balancing_factor=0.0,logger=None, 221 | callbacks=defaultCallBacks(),gamma=.1): 222 | 223 | last_epoch = 0 224 | corrects = [] 225 | best_acc = 0 226 | needToSave = modelDir is not None and model_save_freq > 0 227 | all_accuracies = [] 228 | if needToSave: 229 | 230 | if not os.path.isdir(modelDir): 231 | os.makedirs(modelDir) 232 | 233 | 234 | g = list(sorted(glob.glob(os.path.join(modelDir,'*')))) 235 | g = [g_ for g_ in g if not 'best' in g_] 236 | 237 | g_new = [] 238 | for gg in g: # fixing file names to be zero padded 239 | g1,g2 = os.path.split(gg) 240 | newName = '/'.join([g1,g2.zfill(4)]) 241 | if gg != newName: 242 | print 'moving' 243 | print gg,'to' 244 | print newName 245 | shutil.move(gg,newName) 246 | g_new.append(newName) 247 | g = list(sorted(g_new)) 248 | 249 | if len(g) > 0: 250 | lastCheckpoint = g[-1] 251 | # load the last checkpoint 252 | print 'loading from', lastCheckpoint 253 | 254 | checkpoint = torch.load(lastCheckpoint) 255 | last_epoch = checkpoint['epoch'] 256 | best_acc = checkpoint['best_acc'] 257 | all_accuracies = checkpoint.get('all_accuracies',all_accuracies) 258 | model.load_state_dict(checkpoint['state_dict']) 259 | print("=> loaded checkpoint '{}'".format(lastCheckpoint)) 260 | 261 | 262 | all_losses = [] 263 | hasCallBacks = callbacks is not None 264 | 265 | for epoch in range(last_epoch, epochs): 266 | 267 | if hasCallBacks: 268 | for callback in callbacks['trainEpochStart']: 269 | callback(model,optimizer,epoch) 270 | 271 | if adjust_learning_rate is not None: 272 | adjust_learning_rate(optimizer,epoch,base_lr,lr_drop_freq,gamma) 273 | losses = train(model=model,epoch=epoch,optimizer=optimizer,targetTranslator=targetTranslator, 274 | train_loader=train_loader,criterion=criterion,criterion2 = criterion2, maxIters=maxIters,disableBatchNorm=disableBatchNorm,cuda=cuda, 275 | balancing_factor=balancing_factor,logger=logger) 276 | 277 | if hasCallBacks: 278 | for callback in callbacks['trainEpochEnd']: 279 | callback(model,optimizer,epoch) 280 | 281 | all_losses.extend(losses) 282 | print 283 | if hasCallBacks: 284 | for callback in callbacks['testEpochStart']: 285 | callback(model,optimizer,epoch) 286 | 287 | cur_acc = test(model,epoch,targetTranslator=targetTranslator,test_loader=test_loader, 288 | prev_acc=best_acc,criterion=criterion, maxIters=maxIters,cuda=cuda,logger=logger) 289 | if hasCallBacks: 290 | for callback in callbacks['testEpochEnd']: 291 | callback(model,optimizer,epoch) 292 | all_accuracies.append(cur_acc) 293 | corrects.append(cur_acc) 294 | print 295 | 296 | 297 | if needToSave and (epoch % model_save_freq == 0 or epoch == epochs-1): 298 | print 'saving model...', 300 | if cur_acc > best_acc: 301 | best_acc = cur_acc 302 | is_best = True 303 | else: 304 | is_best = False 305 | save_checkpoint({ 306 | 'epoch': epoch + 1, 307 | 'all_losses':all_losses, 308 | 'all_accuracies':all_accuracies, 309 | 'last_epoch_losses':losses, 310 | 'state_dict': model.state_dict(), 311 | 'best_acc': best_acc, 312 | 'cur_acc': cur_acc 313 | }, is_best, epoch, modelDir) 314 | #if cur_acc>=99.5: 315 | # break 316 | 317 | return corrects 318 | 319 | 320 | def imshow(img): 321 | #img = img / 2 + 0.5 # unnormalize 322 | npimg = img.numpy() 323 | npimg = npimg-npimg.min() 324 | npimg = npimg/npimg.max() 325 | plt.imshow(np.transpose(npimg, (1, 2, 0))) 326 | 327 | def init_params(net): 328 | '''Init layer parameters.''' 329 | for m in net.modules(): 330 | if isinstance(m, nn.Conv2d): 331 | init.kaiming_normal(m.weight, mode='fan_out') 332 | if m.bias is not None: 333 | init.constant(m.bias, 0) 334 | elif isinstance(m, nn.BatchNorm2d) and m.affine: 335 | init.constant(m.weight, 1) 336 | init.constant(m.bias, 0) 337 | #elif isinstance(m, nn.Linear): 338 | # init.normal(m.weight, std=1e-3) 339 | # if m.bias: 340 | # init.constant(m.bias, 0) 341 | 342 | 343 | 344 | class VGG_backcomp(nn.Module): 345 | def __init__(self, features, fc_size=512,num_classes=1000,dropout=True,fullyconv=False): 346 | super(VGG_backcomp, self).__init__() 347 | self.features = features 348 | self.fullyconv = fullyconv 349 | if not fullyconv: 350 | 351 | if dropout: 352 | 353 | self.classifier = nn.Sequential( 354 | nn.Linear(fc_size, 512), 355 | nn.ReLU(True), 356 | nn.Dropout(), 357 | nn.Linear(512, num_classes), 358 | ) 359 | else: 360 | self.classifier = nn.Sequential( 361 | nn.Linear(fc_size, 512), 362 | nn.ReLU(True), 363 | nn.Linear(512, num_classes), 364 | ) 365 | else: 366 | self.classifier = nn.Sequential(nn.Linear(512,num_classes)) # keep just the final classification layer 367 | 368 | def forward(self, x): 369 | 370 | x = self.features(x) 372 | x = x.view(x.size(0), -1) 373 | x = self.classifier(x) 374 | return x 375 | class VGG(nn.Module): 376 | def __init__(self, features, fc_size=512,num_classes=1000,dropout=True,fullyconv=False): 377 | super(VGG, self).__init__() 378 | self.features = features 379 | self.fullyconv = fullyconv 380 | if not fullyconv: 381 | 382 | if dropout: 383 | 384 | self.classifier = nn.Sequential( 385 | nn.Linear(fc_size, 512), 386 | nn.ReLU(True), 387 | nn.Dropout(), 388 | nn.Linear(512, num_classes), 389 | ) 390 | else: 391 | self.classifier = nn.Sequential( 392 | nn.Linear(fc_size, 512), 393 | nn.ReLU(True), 394 | nn.Linear(512, num_classes), 395 | ) 396 | else: 397 | self.classifier = nn.Sequential(nn.Conv2d(512,num_classes,2,2)) # keep just the final classification layer
398 | init_params(self) 399 | def forward(self, x): 400 | 401 | x = self.features(x) 403 | if not self.fullyconv: 404 | x = x.view(x.size(0), -1) 405 | x = self.classifier(x) 406 | if self.fullyconv: 407 | x = x.view(x.size(0), -1) 408 | return x,None 409 | 412 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] 413 | 414 | class AlphaNet(nn.Module): 415 | def __init__(self, features, classifier, otherClassifier=None): 416 | super(AlphaNet, self).__init__() 417 | if type(features) is list: 418 | self.features = nn.Sequential(*features) 419 | else: 420 | self.features = features 421 | self.classifier = classifier 422 | self.otherClassifier = otherClassifier 423 | self.outputSize = None 424 | def getControlParams(self): 425 | # return parameters of all layers, except convolutional. 426 | params = [] 427 | for q in self.features: 428 | q_type = type(q) 429 | if q_type is nn.Conv2d or q_type is nn.BatchNorm2d: 430 | continue 431 | if q_type is controlledConv: # probably nothing else 432 | params.extend(list(q.parameters())) # includes the controller matrix and its bias parameter 433 | 434 | 435 | params.extend(list(self.classifier.parameters())) 436 | return params 437 | 438 | 439 | def extendToSize(self,x): 440 | S = self.outputSize 441 | if S is not None: 442 | s = x.size() 443 | assert s[1] <= S, 'output larger than required output size' 444 | if s[1] < S: 445 | XX = Variable(torch.zeros(s[0],S).cuda()) 446 | XX[:,:s[1]] = x 447 | x = XX 448 | return x 449 | 450 | 451 | def forward(self, x, alpha=None): 452 | for f in self.features: 453 | if type(f) is controlledConv: 454 | x = f(x,alpha) 455 | else: 456 | x = f(x) 457 | 458 | x = x.view(x.size(0), -1) 459 | 460 | if alpha is None: 461 | x = self.classifier(x) 462 | else: 463 | assert self.otherClassifier is not None, 'cannot use alpha without other classifier' 464 | 465 | x1 = self.classifier(x) 466 | x2 = self.otherClassifier(x) 467 | 468 | # pad the smaller output to the larger of the two classifiers' output sizes 469 | if self.outputSize is None: 470 | print 'automatically determining maximal output size...' 
471 | self.outputSize = max(x1.size()[1],x2.size()[1]) 472 | #print 'sizes before:',x1.size(),x2.size() 473 | x1 = self.extendToSize(x1) 474 | x2 = self.extendToSize(x2) 475 | #print 'sizes after:',x1.size(),x2.size() 476 | 477 | myAlpha = alpha.expand_as(x1) 478 | x = myAlpha * x1 + (1-myAlpha) * x2 479 | return x 480 | 481 | def replaceLastLayer(model,num_outputs): 482 | mod = list(model.children()) 483 | mod.pop() 484 | mod.append(torch.nn.Linear(512, num_outputs)) 485 | model = torch.nn.Sequential(*mod) 486 | return model 487 | def freezeBatchNormLayers(model): 488 | if hasattr(model,'features'): 489 | for p in model.features.children(): 490 | 491 | if type(p) is nn.BatchNorm2d: 492 | print '.', 493 | for q in p.parameters(): 494 | 495 | q.requires_grad = False 496 | for p in model.classifier.children(): 497 | if type(p) is nn.BatchNorm2d: 498 | print '.', 499 | for q in p.parameters(): 500 | 501 | q.requires_grad = False 502 | else: 503 | for p in model.children(): 504 | if type(p) is nn.BatchNorm2d: 505 | print '.', 506 | for q in p.parameters(): 507 | 508 | q.requires_grad = False 509 | def ton(V): 510 | if type(V) is not Variable: 511 | return V.cpu().numpy() 512 | else: 513 | return V.data.cpu().numpy() 514 | def showmat(M): 515 | if type(M) is not np.ndarray: 516 | M = ton(M) 517 | plt.matshow(M) 518 | 519 | def countModelParameters(model,need_require_grad=True): 520 | return sum([p.data.nelement() for p in model.parameters() if p.requires_grad or not need_require_grad]) 521 | 522 | normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]], 523 | std=[x/255.0 for x in [63.0, 62.1, 66.7]]) 524 | cuda=True 525 | kwargs = {'num_workers': num_workers, 'pin_memory': False} 526 | 527 | 528 | def quickTest(model,test_loader,alpha=None,maxSamples=100000): 529 | 530 | #criterion = nn.CrossEntropyLoss() 531 | model.eval() 532 | test_loss = 0 533 | correct = 0 534 | nPoints = 0 535 | for idx, (data, target) in enumerate(test_loader): 536 | target = target.long().squeeze() 537 | nPoints += len(target) 538 | data, target = Variable(data.cuda()), Variable(target.cuda()) 539 | if alpha is not None: 540 | output = model(data,alpha) 541 | else: 542 | output = model(data) 543 | pred = output.data.max(1)[1] # get the index of the max log-probability 544 | correct += pred.eq(target.data).cpu().sum() 545 | if nPoints >= maxSamples: 546 | break 547 | cur_acc = 100. * correct / nPoints 548 | #if prev_acc < cur_acc: 549 | P = 'Test set: Acc: {}/{} ({:.1f}%)'.format(correct, nPoints, cur_acc) 550 | print '\r{}'.format(P), 551 | return cur_acc 552 | 553 | 554 | 555 | 556 | 557 | # Initialize from scratch. 
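The next section implements the controller modules from the paper (the README's `makeItControlled` assembles networks out of them). The idea: keep the old layer's filters frozen, and learn only a small matrix that linearly recombines them into filters for the new task. A standalone, shape-level sketch of that reparametrization, independent of the `controlledConv` class defined below:

```python
# A conv with 128 output filters over 64 input channels has 64*128*9 = 73,728
# weights; the controller adds only 128*128 = 16,384 new parameters.
conv = nn.Conv2d(64, 128, kernel_size=3, padding=1)        # stands in for an old, frozen layer
s = conv.weight.size()                                     # (128, 64, 3, 3)
w = conv.weight.view(s[0], -1).t()                         # flattened filters, (576, 128), kept frozen
controller = nn.Linear(s[0], s[0], bias=False)             # the only new parameters
new_w = controller(w).t().contiguous().view(s)             # recombined filters
out = F.conv2d(matVar((1, 64, 32, 32)), new_w, padding=1)  # use them as conv weights
```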
558 | 559 | from numpy.linalg import lstsq 560 | 561 | class conv2d_bn(nn.Module): 562 | def __init__(self, conv, bn): 563 | super(conv2d_bn, self).__init__() 564 | self.conv = conv 565 | self.bn = bn 566 | def forward(self,x): 567 | return self.bn(self.conv(x)) 568 | 569 | 570 | class controlledConv(nn.Module): 571 | def __init__(self, conv, X = None,bias = None, sparse = False, diagonal=False): 572 | super(controlledConv, self).__init__() 573 | self.padding = conv.padding 574 | self.stride = conv.stride 575 | self.dilation = conv.dilation 576 | self.conv = conv 577 | # Copy the weights as a constant from the original convolution -- 578 | # just to make sure it doesn't change 579 | s = conv.weight.size() 581 | self.s = list(s) 582 | w = Variable(torch.Tensor(s).copy_(conv.weight.data)) 583 | w = w.view(s[0],-1).transpose(0,1) 585 | self.w = w.detach().cuda() 586 | 587 | self.my_bn = None 589 | R = s[0] 590 | if X is None: 591 | X = torch.eye(R) # initialize the controller to the identity, i.e., start by reproducing the original filters 592 | L = nn.Linear(X.size()[1],X.size()[0],bias=False) 593 | L.weight.data = X 597 | self.L = L 598 | self.s[0] = L.weight.size()[0] 599 | hasBias = bias is not None 600 | if hasBias: 601 | s_bias = self.s[0] 602 | self.conv_bias = Variable(torch.Tensor(conv.bias.data.size()).copy_(conv.bias.data)) 603 | self.conv_bias = self.conv_bias.detach().cuda() 605 | # the learnable bias starts as a copy of (the first s_bias entries of) the original bias 606 | self.bias = nn.Parameter(conv.bias.data[:s_bias].clone()) 607 | else: 608 | self.bias = None 611 | 612 | for p in conv.parameters(): 613 | p.requires_grad = False 614 | 615 | def setConvLearnable(self,T): 616 | for p in self.conv.parameters(): 617 | p.requires_grad = T 618 | 619 | def set_bn(self,bn): 620 | my_bn = nn.BatchNorm2d(bn.num_features,affine=bn.affine) 621 | bn.eval() 622 | my_bn.load_state_dict(bn.state_dict()) 623 | my_bn.train() 624 | self.my_bn = my_bn 625 | self.old_bn = bn 626 | 627 | def forward(self,x, alpha = None): 628 | # recompute the effective weights by recombining the frozen original weights 630 | s = self.s 631 | w = self.w 632 | if alpha is not None: 634 | alpha1 = alpha.expand_as(w) 635 | newWeights = alpha1 * self.L(w) + (1-alpha1) * w 636 | if self.bias is not None: 637 | alpha2 = alpha.squeeze().expand_as(self.bias) 638 | bias = alpha2 * self.bias + (1-alpha2) * self.conv_bias 639 | else: 640 | bias = None 641 | else: 642 | newWeights = self.L(w) 643 | bias = self.bias 644 | newWeights = newWeights.transpose(0,1).contiguous() 645 | newWeights = newWeights.view(s) 646 | 649 | x = F.conv2d(x,newWeights,bias,stride=self.stride,padding=self.padding,dilation=self.dilation) 650 | 651 | # apply the batch normalization... 
652 | if self.my_bn is not None: 653 | x_bn = self.my_bn(x) 654 | if alpha is not None: 655 | alpha3 = alpha.expand_as(x) 656 | x = alpha3 * x_bn + (1-alpha3) * self.old_bn(x) 657 | else: 658 | x = x_bn 659 | return x 660 | 661 | def checkApproximation(net1,net2): 662 | a_orig = list(net1.features.children()) 663 | a_fine = list(net2.features.children()) 664 | abs_errors = [] 665 | 666 | bar = progressbar.ProgressBar(max_value=len(a_fine)-1) 667 | for i,(orig,fine) in bar(enumerate(izip(a_orig,a_fine))): 668 | if type(orig) is nn.BatchNorm2d: 669 | # make sure the batch-norm layers are unchanged 670 | ss1 = orig.state_dict() 671 | ss2 = fine.state_dict() 672 | assert ((ss1['running_mean']-ss2['running_mean']).sum()==0 and \ 673 | (ss1['running_var']-ss2['running_var']).sum()==0), \ 674 | 'found mismatch between batch norm on layer {}'.format(i) 675 | 676 | continue 677 | 678 | if type(orig) is not nn.Conv2d: 679 | continue 680 | s1 = orig.weight.size() 681 | nOrigParams = np.prod(s1) 682 | nNewParams = s1[0]*(1+s1[0]) 683 | w1 = orig.weight.view(s1[0],-1) # Old weights 684 | s2 = fine.weight.size() 685 | w2 = fine.weight.view(s1[0],-1) # new weights 686 | A = ton(w1).T 688 | B = ton(w2).T 690 | X,residuals,rank,s = lstsq(A,B) # least-squares approximation 691 | cur_mean_error = np.abs((A.dot(X)-B)).mean() 692 | abs_errors.append(cur_mean_error) 693 | return abs_errors 694 | 707 | initializationTypes = ['linear_approx','random','diagonal'] 708 | def makeControlledConv(orig,fine,initializationType='linear_approx'): 709 | assert initializationType in initializationTypes,'Unknown initialization type for controlledConv: {}'.format(initializationType) 710 | s1 = orig.weight.size() 711 | s2 = fine.weight.size() 712 | nOrigParams = np.prod(s1) 713 | nNewParams = s2[0]*(1+s1[0]) 714 | 715 | print s1,s2 716 | 717 | w1 = orig.weight.view(s1[0],-1) # Old weights 719 | w2 = fine.weight.view(s2[0],-1) # new weights 720 | A = ton(w1).T 721 | B = ton(w2).T 722 | if initializationType == 'linear_approx': 723 | X,residuals,rank,s = lstsq(A,B) # least-squares approximation 724 | 725 | elif initializationType == 'random': 726 | X = torch.zeros(s1[0],s2[0]) 728 | init.xavier_uniform(X) 729 | X = X.numpy() 730 | elif initializationType == 'diagonal': 731 | # s1[0] must be a multiple of s2[0] 732 | assert s1[0] % s2[0] == 0 733 | 734 | X = [torch.eye(s2[0])]* (s1[0] / s2[0]) 735 | X = torch.cat(X) 736 | X = X.numpy() 737 | 738 | else: 739 | raise Exception('This code should not be reached.') 740 | 741 | m = controlledConv(orig,torch.Tensor(X.T),fine.bias) 742 | 743 | return m,A,B,X 744 | 
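The `'linear_approx'` branch above initializes the controller in closed form: it solves for the matrix X that best reconstructs the fine-tuned filters as linear combinations of the original ones. A tiny self-contained check of that step, with random matrices standing in for real flattened weights:

```python
# A: original filters as columns, B: fine-tuned filters; solve min_X ||A.X - B||
# so that the controlled conv starts out close to the fine-tuned network.
A = np.random.randn(576, 128)   # (C_in*k*k, C_out) layout, as in makeControlledConv
B = np.random.randn(576, 128)
X, residuals, rank, sv = lstsq(A, B)
print 'mean abs reconstruction error:', np.abs(A.dot(X) - B).mean()
```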
745 | def makeControllerNetwork(net_orig,net_fine, initializationType='linear_approx', verbose = True, trackValues = True): 746 | """ Given two sequential networks net_orig and net_fine with the same structure, 747 | reformulate net_fine so that it is compactly represented by re-using the weights of net_orig. 748 | Params : 749 | net_orig - the original network 750 | net_fine - the network to be approximated 751 | initializationType - one of 'linear_approx', 'random', 'diagonal' (default: 'linear_approx') 752 | 753 | verbose - whether to track and print the layer-wise error for some random input, stemming 754 | from the linear approximations. 755 | """ 756 | a_fine = list(net_fine.features.children()) 757 | for p in net_fine.parameters(): 758 | p.requires_grad=False 759 | 760 | a_orig = list(net_orig.features.children()) 761 | 762 | v = Variable(torch.randn(1,3,64,64)) 763 | v = v.cpu() 764 | 765 | value_fine = v.cuda() 766 | value_new = v.cuda() 767 | 768 | s_fine_vs_new = [] 769 | s_controlled_vs_fine = [] 770 | errors = [] 771 | newChildren = [] 772 | oldChildren = [] 773 | types = [] 774 | 776 | bar = progressbar.ProgressBar(max_value=len(a_fine)) 777 | 778 | for i,(orig,fine) in bar(enumerate(izip(a_orig,a_fine))): 779 | wasBN = False 781 | tt = type(fine) 782 | tt_str = str(tt) 783 | types.append(tt_str.split('.')[-1][:-2]) 784 | if tt is nn.Conv2d: 787 | m,A,B,X = makeControlledConv(orig,fine,initializationType) 790 | m.cuda() 791 | elif tt is nn.BatchNorm2d: 792 | 793 | wasBN = True 794 | m.set_bn(orig) # attach this BN to the controlledConv built in the previous iteration 795 | m = orig # keep the original BN for the value tracking below 798 | else: 799 | m = fine 804 | 805 | value_fine_before = value_fine 806 | value_new_before = value_new 807 | 808 | oldChildren.append(fine) 809 | if not wasBN: 810 | newChildren.append(m) 811 | if trackValues: 812 | value_fine = fine(value_fine) 813 | value_new = m(value_new) 814 | curdiff = (value_fine-value_new).data.abs().mean() 815 | if verbose: 816 | print 'diff:',curdiff 817 | 818 | s_fine_vs_new.append(curdiff) 819 | 820 | return newChildren,oldChildren,s_fine_vs_new,types 821 | 822 | def scalarVar(s): 823 | return Variable(torch.ones(1).cuda() * s) 824 | 825 | def extractFeats(model,loader): 826 | # Extract all top-layer features once. 827 | cats = [] 828 | feats = [] 829 | for i,(a,b) in enumerate(loader): 830 | print i, 831 | a = Variable(a.cuda()) 832 | feats.append(ton(model(a))) 833 | cats.append(b.numpy()) 834 | feats = np.vstack(feats) 835 | cats = list(itertools.chain.from_iterable(cats)) 836 | return feats,cats 837 | def makeFeatLoader(model,loader,batch_size): 838 | feats,cats = extractFeats(model,loader) 839 | return DataLoader(TensorDataset(torch.Tensor(feats),torch.Tensor(cats)),batch_size=batch_size,shuffle=True) 840 | 841 | class shifterNet(nn.Module): 842 | def __init__(self, decider,shiftable): 843 | super(shifterNet, self).__init__() 844 | self.decider = decider 845 | self.shiftable = shiftable 846 | def forward(self, x): 847 | my_alpha = F.softmax(self.decider(x))[:,1:] 848 | my_alpha[my_alpha < .5] = 0 849 | my_alpha[my_alpha >= .5] = 1 850 | return self.shiftable(x,my_alpha) 851 | 852 | ''' 853 | class Scale(object): # This is a copy from the torchvision repository, it's just a version conflict 854 | 855 | """Rescales the input PIL.Image to the given 'size'. 856 | If 'size' is a 2-element tuple or list in the order of (width, height), it will be the exact size to scale to. 857 | If 'size' is a number, it will indicate the size of the smaller edge. 
858 | For example, if height > width, then image will be 859 | rescaled to (size * height / width, size) 860 | size: size of the exactly size or the smaller edge 861 | interpolation: Default: PIL.Image.BILINEAR 862 | """ 863 | 864 | def __init__(self, size, interpolation=Image.BILINEAR): 865 | assert isinstance(size, int) or (isinstance(size, collections.Iterable) and len(size) == 2) 866 | self.size = size 867 | self.interpolation = interpolation 868 | 869 | def __call__(self, img): 870 | if isinstance(self.size, int): 871 | w, h = img.size 872 | if (w <= h and w == self.size) or (h <= w and h == self.size): 873 | return img 874 | if w < h: 875 | ow = self.size 876 | oh = int(self.size * h / w) 877 | return img.resize((ow, oh), self.interpolation) 878 | else: 879 | oh = self.size 880 | ow = int(self.size * w / h) 881 | return img.resize((ow, oh), self.interpolation) 882 | else: 883 | return img.resize(self.size, self.interpolation) 884 | ''' 885 | def getTrainableParams(model): 886 | if type(model) is list: 887 | return [p for p in model if p.requires_grad] 888 | else: 889 | return [p for p in model.parameters() if p.requires_grad] 890 | def makeTrainable(model,toggle): 891 | for p in model.parameters(): 892 | p.requires_grad = toggle 893 | if hasattr(model,'features'): 894 | for q in model.features: 895 | q.train() 896 | 897 | from PIL import Image,ImageOps 898 | import numbers 899 | 900 | class RandomCrop(object): 901 | """Crops the given PIL.Image at a random location to have a region of 902 | the given size. size can be a tuple (target_height, target_width) 903 | or an integer, in which case the target will be of a square shape (size, size) 904 | """ 905 | 906 | def __init__(self, size, padding=0, fill = 0): 907 | if isinstance(size, numbers.Number): 908 | self.size = (int(size), int(size)) 909 | else: 910 | self.size = size 911 | self.padding = padding 912 | self.fill = fill 913 | 914 | def __call__(self, img): 915 | if self.padding > 0: 916 | img = ImageOps.expand(img, border=self.padding, fill=self.fill) 917 | 918 | w, h = img.size 919 | th, tw = self.size 920 | if w == tw and h == th: 921 | return img 922 | 923 | x1 = random.randint(0, w - tw) 924 | y1 = random.randint(0, h - th) 925 | return img.crop((x1, y1, x1 + tw, y1 + th)) 926 | 927 | # Make a relatively lightweight model for the baselines 928 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] 929 | # Various configuration parameters 930 | 931 | baseDataDir = os.path.expanduser('~/data_transfer/') 932 | modelsBaseDir = os.path.expanduser('~/models') 933 | all_datasets = {} 934 | all_datasets['caltech256'] = {'trainDir': 'Caltech256/train/', 935 | 'testDir': 'Caltech256/test/', 936 | 'nClasses':257} 937 | all_datasets['omniglot'] = {'trainDir': 'omniglot/python/train/', 938 | 'testDir': 'omniglot/python/test/', 939 | 'nClasses':1623} 940 | all_datasets['daimler'] = {'trainDir': 'daimler/all_train/', 941 | 'testDir': 'daimler/all_test', 942 | 'nClasses':2} 943 | all_datasets['sketch'] = {'trainDir': 'sketch_train', 944 | 'testDir': 'sketch_test', 945 | 'nClasses':250} 946 | all_datasets['GTSR'] = {'trainDir': 'GTSR/Final_Training/', 947 | 'testDir': 'GTSR/Final_Test/', 948 | 'nClasses':43} 949 | all_datasets['CIFAR-10'] = {'trainDir': 'cifar-10/train/', 950 | 'testDir': 'cifar-10/test/', 951 | 'nClasses':10} 952 | all_datasets['CIFAR-100'] = {'trainDir': 'cifar-100/train/', 953 | 'testDir': 'cifar-100/test/', 954 | 'nClasses':100} 955 | 956 | all_datasets['SVHN'] = {'trainDir': 'svhn/train/', 957 
| 'testDir': 'svhn/test/', 958 | 'nClasses':10} 959 | all_datasets['plankton'] = {'trainDir': 'plankton_train', 960 | 'testDir': 'plankton_test', 961 | 'nClasses':121} 962 | all_datasets['CUB'] = {'trainDir': 'CUB/train', 963 | 'testDir': 'CUB/test', 964 | 'nClasses':200} 965 | all_datasets['mnist'] = {'trainDir': 'mnist/train', 966 | 'testDir': 'mnist/test', 967 | 'nClasses':10} 968 | 969 | all_datasets_extra = {} 970 | for k in all_datasets.keys(): 971 | all_datasets_extra[k] = {} 972 | #all_datasets_extra['sketch'] = {'crop_fill':1} 973 | all_datasets_extra['SVHN'] = {'augment_flip':False} 974 | all_datasets_extra['omniglot'] = {'augment_flip':False} 975 | 976 | dataset_stats = pickle.load(os.path.join(baseDataDir,'database_stats.pkl')) # per-dataset (mean, variance); the file ships as database_stats.pkl at the repo root -- adjust the path as needed 977 | 978 | cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'] # B 979 | 980 | big_cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'] # D 981 | 982 | cuda=True 983 | lr_drop_freq = 10 984 | base_lr = 1e-3 985 | adjust_learning_rate = None 986 | 987 | import random 988 | def makeNet(name,bigNet=False,fullyconv=False,batch_norm=True): 989 | nClasses = all_datasets[name]['nClasses'] 990 | my_cfg = cfg 991 | if bigNet: 992 | my_cfg = big_cfg 993 | model = VGG(make_layers(my_cfg,batch_norm=batch_norm,fullyconv=fullyconv),fc_size=2048, num_classes= nClasses,fullyconv=fullyconv) 994 | return model 995 | ''' 996 | class RandomHorizontalFlip(object): 997 | """Randomly horizontally flips the given PIL.Image or np.ndarray with a probability of 0.5 998 | """ 999 | def __call__(self, img): 1000 | if random.random() < 0.5: 1001 | if isinstance(img, np.ndarray): 1002 | return np.fliplr(img) 1003 | else: 1004 | return img.transpose(Image.FLIP_LEFT_RIGHT) 1005 | return img 1006 | 1007 | 1008 | ''' 1009 | def makeLoaders2(name, stats = None): 1010 | """ 1011 | Quick and easy loaders, with default values. 1012 | """ 1013 | if stats is None: 1014 | stats = dataset_stats[name] 1015 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1016 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1017 | augment_flip = all_datasets_extra[name].get("augment_flip", False) 1018 | augment_crop = all_datasets_extra[name].get("augment_crop", False) 1019 | crop_fill = all_datasets_extra[name].get("crop_fill", 1020 | tuple( (255*dataset_stats[name][0]).astype(np.uint8))) 1021 | 1022 | print augment_flip,augment_crop,crop_fill 1023 | 1024 | train_loader,test_loader = makeLoaders(trainDir,testDir,stats,augment_flip=augment_flip, 1025 | augment_crop=augment_crop,crop_fill=crop_fill) 1026 | 1027 | return train_loader,test_loader 1028 | 
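A usage sketch for the helpers above (it assumes the CIFAR-10 directories exist under `baseDataDir` as configured in `all_datasets`, and that `dataset_stats` loaded successfully):

```python
model = makeNet('CIFAR-10').cuda()
train_loader, test_loader = makeLoaders2('CIFAR-10')
images, labels = next(iter(train_loader))   # a (128, 3, 64, 64) batch
```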
1029 | def makeLoaders(train_dir,test_dir,stats,augment_flip=False, augment_crop = False, 1031 | crop_fill = 0): 1032 | # remove mean and divide by std (the computed stats hold the variance, hence the sqrt) 1034 | normalize = transforms.Normalize(np.asarray(stats[0]), np.asarray(stats[1])**.5) 1035 | # random crop for jittering at train time. 1036 | transform_list = [transforms.Scale((64,64))] # assumes a torchvision version whose Scale accepts a (w, h) tuple; see the commented-out copy above 1037 | if augment_crop: 1038 | transform_list.append(RandomCrop(64,8,fill = crop_fill)) 1039 | if augment_flip: 1040 | transform_list.append(transforms.RandomHorizontalFlip()) 1041 | 1042 | transform_list.extend([transforms.ToTensor(), normalize]) 1043 | transform_train=transforms.Compose(transform_list) 1044 | 1045 | db_train = DataLoader(dataset=datasets.ImageFolder(root = train_dir, transform=transform_train), 1046 | batch_size=128, shuffle=True,**kwargs) 1047 | transform_test=transforms.Compose([transforms.Scale((64,64)), transforms.ToTensor(), normalize]) 1048 | db_test = DataLoader(dataset=datasets.ImageFolder(root = test_dir, transform=transform_test), 1049 | batch_size=128, shuffle=True) 1050 | return db_train,db_test 1051 | 1052 | 1053 | def freezeAllButLastLayer(model): 1054 | for p in model.features.parameters(): 1055 | p.requires_grad = False 1056 | children = list(model.classifier.children()) 1057 | for p in children[:-1]: 1058 | for q in p.parameters(): 1059 | q.requires_grad = False 1060 | 1061 | 1062 | 1063 | 1064 | 1065 | def make_layers(cfg_1, batch_norm=False,instance_norm = False, affine=False,fullyconv=False): 1067 | cfg = list(cfg_1) # copy it to make sure it's not modified 1068 | if batch_norm and instance_norm: 1069 | raise Exception('cannot use both batch and instance normalization') # note: instance_norm is accepted but no InstanceNorm layer is added below 1070 | layers = [] 1071 | in_channels = 3 1072 | if fullyconv: 1073 | cfg.append(512) 1075 | for i,v in enumerate(cfg): 1076 | if v == 'M': 1077 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 1078 | else: 1079 | 1080 | my_kernel_size = 3 # hacky! 1081 | my_padding = 1 1082 | if fullyconv and i == len(cfg)-1: 1084 | my_kernel_size = 2 1085 | my_padding = 0 1086 | 1087 | conv2d = nn.Conv2d(in_channels, v, kernel_size=my_kernel_size, padding=my_padding) 1089 | init.kaiming_uniform(conv2d.weight) 1090 | if batch_norm: 1091 | layers +=[ conv2d, nn.BatchNorm2d(v, affine=affine), nn.ReLU(inplace=True)] 1092 | else: 1093 | layers += [conv2d, nn.ReLU(inplace=True)] 1094 | in_channels = v 1095 | return nn.Sequential(*layers) 1096 | 1097 | 1098 | def makeModelDirName(name,modelsBaseDir = modelsBaseDir, sfx='',baseNetwork=None): 1099 | modelDir = os.path.join(modelsBaseDir,'baseline_'+name+sfx) 1100 | if baseNetwork is not None: 1101 | modelDir += '_from_'+baseNetwork['name'] 1102 | if baseNetwork['onlyLastLayer']: 1103 | modelDir+='_last' 1104 | return modelDir 1105 | 1106 | def doTrainingStuff(name,modelsBaseDir = modelsBaseDir, maxIters=np.inf, override=False,differentStats = None, 1107 | augment_flip=True,augment_crop = False, bigNet=False, 1108 | base_lr = 1e-3, baseNetwork=None,epochs=50,sfx='',batch_norm=True, 1109 | instance_norm=False, affine=False,cuda=True,lr_drop_freq=lr_drop_freq,optimizer=None, 1110 | adjust_learning_rate=adjust_learning_rate,disableBatchNorm=False,fullyconv=False): 1111 | 1112 | 1113 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1114 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1115 | 1116 | modelDir = makeModelDirName(name,modelsBaseDir = modelsBaseDir,sfx=sfx,baseNetwork=baseNetwork) 1117 | 1118 | if override: 1119 | if not os.path.isdir(modelDir): 1120 | print '{} : nothing to override - creating anew'.format(name) 1121 | else: 1122 | print 'warning - moving model for {} to backup at {}_BAK'.format(name,modelDir) 1123 | shutil.move(modelDir,modelDir+'_BAK') 1124 | 1125 | 
if differentStats is not None: 1126 | print 'using different stats...' 1127 | stats = differentStats 1128 | else: 1129 | print 1130 | stats = dataset_stats[name] 1131 | 1132 | 1133 | augment_flip = all_datasets_extra[name].get("augment_flip", augment_flip) 1134 | augment_crop = all_datasets_extra[name].get("augment_crop", augment_crop) 1135 | crop_fill = all_datasets_extra[name].get("crop_fill", 1136 | tuple( (255*dataset_stats[name][0]).astype(np.uint8))) 1137 | 1138 | train_loader,test_loader = makeLoaders(trainDir,testDir,stats,augment_flip=augment_flip, 1139 | augment_crop=augment_crop,crop_fill=crop_fill) 1140 | #disableBatchNorm = False 1141 | 1142 | if baseNetwork is None: 1143 | print 'training network from scratch...' 1144 | #model = makeNet(name,bigNet) 1145 | 1146 | nClasses = all_datasets[name]['nClasses'] 1147 | my_cfg = cfg 1148 | if bigNet: 1149 | my_cfg = big_cfg 1150 | model = VGG(make_layers(my_cfg, batch_norm=batch_norm, instance_norm=instance_norm, affine=affine,fullyconv=fullyconv), 1151 | fc_size=2048, num_classes = nClasses,fullyconv=fullyconv) 1152 | 1153 | else: # fine tune from an existing network. 1154 | 1155 | print 'fine tuning network from',baseNetwork['name'] 1156 | epochs = baseNetwork.get('max_ft_epochs',epochs) 1157 | toContinue = baseNetwork.get('toContinue',False) 1158 | model = baseNetwork['net'] 1159 | if not toContinue: 1160 | mod = list(model.classifier.children()) 1161 | mod.pop() 1162 | nClasses = all_datasets[name]['nClasses'] 1163 | mod.append(torch.nn.Linear(512, nClasses)) 1164 | model.classifier = nn.Sequential(*mod) 1165 | control = baseNetwork.get('control',False) 1166 | if control: 1167 | raise NotImplementedError('Still need to link this to the controlling module') 1168 | if baseNetwork['onlyLastLayer']: 1169 | freezeAllButLastLayer(model) 1170 | 1171 | #disableBatchNorm = baseNetwork.get('disableBatchNorm',True) 1172 | 1173 | 1174 | # TODO - should we disallow the batch-norm layers to change from now? 
#'onlyLastLayer':True,'max_ft_epochs':max_ft_epochs}) 1176 | params = [p for p in model.parameters() if p.requires_grad] 1177 | if cuda: 1178 | model.cuda() 1179 | 1180 | if optimizer is None: 1181 | optimizer = optim.Adam(params = params) 1182 | elif type(optimizer) is str: 1183 | if optimizer=='sgd': 1184 | optimizer = optim.SGD(lr=base_lr,momentum=.9,weight_decay=0.0001,params = params) 1185 | elif optimizer=='rmsprop': 1186 | optimizer = optim.RMSprop(lr=base_lr,momentum=.9,weight_decay=0.0001,params = params) 1187 | else: 1188 | raise Exception('unexpected optimizer') 1189 | 1190 | 1191 | trainAndTest(model= model, modelDir = modelDir, epochs=epochs, targetTranslator=None, model_save_freq=5, 1192 | train_loader=train_loader, test_loader=test_loader, stopIfPerfect=True, optimizer=optimizer, 1193 | criterion =nn.CrossEntropyLoss(), adjust_learning_rate=adjust_learning_rate, 1194 | maxIters=maxIters,base_lr=base_lr, lr_drop_freq=lr_drop_freq, disableBatchNorm=disableBatchNorm, 1195 | cuda=cuda) 1196 | return model 1197 | def loadLastCheckpoint(model,modelDir,removeBest=False,verbose=False, onlyPerf = False): 1198 | g = list(sorted(glob.glob(os.path.join(modelDir,'*')))) 1199 | 1200 | 1201 | if verbose: 1202 | print 'number of saved checkpoints:',len(g) 1203 | 1204 | if removeBest: 1205 | g = [a for a in g if 'best' not in a] 1206 | 1207 | if len(g) > 0: 1208 | lastCheckpoint = g[-1] 1209 | if verbose: 1210 | print 'last checkpoint:',lastCheckpoint 1211 | 1214 | T = torch.load(lastCheckpoint) 1215 | if not onlyPerf: 1216 | if type(T) is dict: 1217 | model.load_state_dict(T['state_dict']) 1218 | else: 1219 | model.load_state_dict(T) 1220 | model.cuda() 1221 | 1222 | if verbose and type(T) is dict: 1223 | print 'loaded with accuracy of', T['best_acc'] 1224 | 1225 | return T,model 1226 | else: 1227 | raise Exception('No checkpoint found for {}'.format(modelDir)) 1228 | 1229 | def testNet(name,maxSamples=500,modelDir=None): 1230 | trainDir = os.path.join(baseDataDir,all_datasets[name]['trainDir']) 1231 | testDir = os.path.join(baseDataDir,all_datasets[name]['testDir']) 1232 | 1233 | if modelDir is None: 1234 | print 'reverting to default model dir.' 1235 | modelDir = os.path.join(modelsBaseDir,'baseline_'+name) 1236 | train_loader,test_loader = makeLoaders(trainDir,testDir,dataset_stats[name]) 1237 | 1238 | 1239 | print 'testing dataset:',name 1240 | 1242 | model = makeNet(name) 1243 | checkpoint = loadLastCheckpoint(model,modelDir) 1244 | return quickTest(model,test_loader,maxSamples=maxSamples) 1245 | 1246 | def loadNet(name,verbose=False,isalpha=False): 1247 | """ 1248 | Loads the default network for this dataset name. 1249 | """ 1250 | if verbose: 1251 | print 'loading',name 1252 | model = makeNet(name) 1253 | modelDir = makeModelDirName(name) 1254 | checkpoint = loadLastCheckpoint(model,modelDir,verbose=verbose) 1255 | return model,checkpoint 
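A sketch of loading a trained baseline by dataset name and sanity-checking it (this assumes `doTrainingStuff` previously wrote checkpoints under `modelsBaseDir`):

```python
model, checkpoint = loadNet('CIFAR-10', verbose=True)
_, test_loader = makeLoaders2('CIFAR-10')
acc = quickTest(model, test_loader, maxSamples=1000)
```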
1256 | 1257 | # functions to "concatenate" the feature parts of neural networks. 1258 | 1259 | # concatenate convolutions 1260 | 1261 | def concatConv(c1,c2): 1262 | newWeights = torch.cat([c1.weight,c2.weight]) # assumes c1 and c2 have identical shapes 1263 | newBias = torch.cat([c1.bias,c2.bias]) 1264 | s = c1.weight.size() 1265 | c3 = nn.Conv2d(s[1],2*s[0],s[2],stride = c1.stride, padding=c1.padding) 1266 | c3.weight.data = newWeights.data 1267 | c3.bias.data = newBias.data 1268 | return c3 1269 | 1270 | 1271 | def concatBN(bn1,bn2): 1272 | s = bn1.num_features 1273 | bn3 = nn.BatchNorm2d(s*2) 1274 | bn3.running_mean = torch.cat([bn1.running_mean,bn2.running_mean]) 1275 | bn3.running_var = torch.cat([bn1.running_var,bn2.running_var]) # note: learned affine parameters, if any, are not concatenated here 1276 | return bn3 1277 | 1278 | 1279 | # good! 1280 | def concatNets(net1,net2): # concatenet :-) 1281 | newFeatures = [] 1282 | for f1,f2 in zip(net1.features,net2.features): 1283 | tt1,tt2 = type(f1),type(f2) 1284 | assert tt1==tt2,'cannot concatenate networks with different structures' 1285 | if tt1 is nn.Conv2d: 1286 | newFeatures.append(concatConv(f1,f2)) 1287 | elif tt1 is nn.BatchNorm2d: 1288 | newFeatures.append(concatBN(f1,f2)) 1289 | elif tt1 in [nn.MaxPool2d, nn.ReLU]: 1290 | newFeatures.append(f1) 1291 | else: 1292 | raise Exception("Don't know how to \"concatenate\" modules of types {},{}".format(tt1,tt2)) 1293 | return newFeatures 1294 | 1295 | '''ResNet in PyTorch. 1296 | BasicBlock and Bottleneck modules are from the original ResNet paper: 1297 | [1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 1298 | Deep Residual Learning for Image Recognition. arXiv:1512.03385 1299 | PreActBlock and PreActBottleneck modules are from the later paper: 1300 | [2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun 1301 | Identity Mappings in Deep Residual Networks. arXiv:1603.05027 1302 | ''' 1303 | import torch 1304 | import torch.nn as nn 1305 | import torch.nn.functional as F 1306 | 1307 | from torch.autograd import Variable 1308 | 1309 | def conv3x3(in_planes, out_planes, stride=1): 1310 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 1311 | 1312 | 1313 | class BasicBlock(nn.Module): 1314 | expansion = 1 1315 | 1316 | def __init__(self, in_planes, planes, stride=1): 1317 | super(BasicBlock, self).__init__() 1318 | self.conv1 = conv3x3(in_planes, planes, stride) 1319 | self.bn1 = nn.BatchNorm2d(planes) 1320 | self.conv2 = conv3x3(planes, planes) 1321 | self.bn2 = nn.BatchNorm2d(planes) 1322 | 1323 | self.shortcut = nn.Sequential() 1324 | if stride != 1 or in_planes != self.expansion*planes: 1325 | self.shortcut = nn.Sequential( 1326 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 1327 | nn.BatchNorm2d(self.expansion*planes) 1328 | ) 1329 | 1330 | def forward(self, x): 1331 | out = F.relu(self.bn1(self.conv1(x))) 1332 | out = self.bn2(self.conv2(out)) 1333 | out += self.shortcut(x) 1334 | out = F.relu(out) 1335 | return out 1336 | 1337 | 1338 | class PreActBlock(nn.Module): 1339 | '''Pre-activation version of the BasicBlock.''' 1340 | expansion = 1 1341 | 1342 | def __init__(self, in_planes, planes, stride=1): 1343 | super(PreActBlock, self).__init__() 1344 | self.bn1 = nn.BatchNorm2d(in_planes) 1345 | self.conv1 = conv3x3(in_planes, planes, stride) 1346 | self.bn2 = nn.BatchNorm2d(planes) 1347 | self.conv2 = conv3x3(planes, planes) 1348 | 1349 | self.shortcut = nn.Sequential() 1350 | if stride != 1 or in_planes != self.expansion*planes: 1351 | self.shortcut = nn.Sequential( 1352 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) 1353 | ) 1354 | 1355 | def forward(self, x): 
1356 | out = F.relu(self.bn1(x)) 1357 | shortcut = self.shortcut(out) 1358 | out = self.conv1(out) 1359 | out = self.conv2(F.relu(self.bn2(out))) 1360 | out += shortcut 1361 | return out 1362 | 1363 | 1364 | class Bottleneck(nn.Module): 1365 | expansion = 4 1366 | 1367 | def __init__(self, in_planes, planes, stride=1): 1368 | super(Bottleneck, self).__init__() 1369 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 1370 | self.bn1 = nn.BatchNorm2d(planes) 1371 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 1372 | self.bn2 = nn.BatchNorm2d(planes) 1373 | self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) 1374 | self.bn3 = nn.BatchNorm2d(self.expansion*planes) 1375 | 1376 | self.shortcut = nn.Sequential() 1377 | if stride != 1 or in_planes != self.expansion*planes: 1378 | self.shortcut = nn.Sequential( 1379 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), 1380 | nn.BatchNorm2d(self.expansion*planes) 1381 | ) 1382 | 1383 | def forward(self, x): 1384 | out = F.relu(self.bn1(self.conv1(x))) 1385 | out = F.relu(self.bn2(self.conv2(out))) 1386 | out = self.bn3(self.conv3(out)) 1387 | out += self.shortcut(x) 1388 | out = F.relu(out) 1389 | return out 1390 | 1391 | class PreActBottleneck(nn.Module): 1392 | '''Pre-activation version of the original Bottleneck module.''' 1393 | expansion = 4 1394 | def __init__(self, in_planes, planes, stride=1): 1395 | super(PreActBottleneck, self).__init__() 1396 | self.bn1 = nn.BatchNorm2d(in_planes) 1397 | self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) 1398 | self.bn2 = nn.BatchNorm2d(planes) 1399 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 1400 | self.bn3 = nn.BatchNorm2d(planes) 1401 | self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) 1402 | 1403 | self.shortcut = nn.Sequential() 1404 | if stride != 1 or in_planes != self.expansion*planes: 1405 | self.shortcut = nn.Sequential( 1406 | nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) 1407 | ) 1408 | 1409 | def forward(self, x): 1410 | out = F.relu(self.bn1(x)) 1411 | shortcut = self.shortcut(out) 1412 | out = self.conv1(out) 1413 | out = self.conv2(F.relu(self.bn2(out))) 1414 | out = self.conv3(F.relu(self.bn3(out))) 1415 | out += shortcut 1416 | return out 1417 | 1418 | def default_loader(path): 1419 | return Image.open(path).convert('RGB') 1420 | 1421 | def default_flist_reader(flist): 1422 | """ 1423 | flist format: impath label\nimpath label\n ... (same as caffe's filelist) 1424 | """ 1425 | imlist = [] 1426 | with open(flist, 'r') as rf: 1427 | for line in rf.readlines(): 1428 | impath, imlabel = line.strip().split() 1429 | imlist.append( (impath, int(imlabel)) ) 1430 | 1431 | return imlist 1432 | 1433 | class ImageFilelist(data.Dataset): 1434 | def __init__(self, root, flist, transform=None, target_transform=None, 1435 | flist_reader=default_flist_reader, loader=default_loader): 1436 | self.root = root 1437 | self.imlist = flist_reader(flist) 1438 | self.transform = transform 1439 | self.target_transform = target_transform 1440 | self.loader = loader 1441 | 1442 | def __getitem__(self, index): 1443 | impath, target = self.imlist[index] 1444 | img = self.loader(os.path.join(self.root,impath)) 1445 | if self.transform is not None: 1446 | img = self.transform(img) 1447 | if self.target_transform is not None: 1448 | target = 
self.target_transform(target) 1449 | 1450 | return img, target 1451 | 1452 | def __len__(self): 1453 | return len(self.imlist) 1454 | --------------------------------------------------------------------------------
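Finally, a usage sketch for `ImageFilelist`, driven by a file list in the format `default_flist_reader` expects (the directory and file names here are hypothetical):

```python
# /data/mydataset/train.txt contains lines like:  images/cat_001.jpg 0
dataset = ImageFilelist(root='/data/mydataset',
                        flist='/data/mydataset/train.txt',
                        transform=transforms.Compose([transforms.Scale((64, 64)),
                                                      transforms.ToTensor(),
                                                      normalize]))
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                    num_workers=num_workers)
```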