├── advrush ├── hessianflow │ ├── optimizer │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── absa.cpython-36.pyc │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── baseline.cpython-36.pyc │ │ │ ├── optm_utils.cpython-36.pyc │ │ │ └── progressbar.cpython-36.pyc │ │ ├── baseline.py │ │ ├── progressbar.py │ │ ├── optm_utils.py │ │ └── absa.py │ ├── __pycache__ │ │ ├── eigen.cpython-36.pyc │ │ ├── utils.cpython-36.pyc │ │ └── __init__.cpython-36.pyc │ ├── __init__.py │ ├── utils.py │ └── eigen.py ├── visualize.py ├── genotypes.py ├── operations.py ├── architect.py ├── trades.py ├── utils.py ├── model_search.py ├── model.py ├── adv_train.py ├── train_search.py └── regularizer.py ├── README.md ├── eval ├── genotypes.py ├── operations.py ├── utils.py ├── model.py └── pgd_attack.py └── LICENSE /advrush/hessianflow/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .baseline import baseline 2 | from .absa import absa 3 | -------------------------------------------------------------------------------- /advrush/hessianflow/__pycache__/eigen.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/__pycache__/eigen.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Hessian tool for neural networks based on pytorch 0.4.1 3 | """ 4 | 5 | name = 'Hessian Flow' 6 | 7 | from .eigen import * 8 | -------------------------------------------------------------------------------- /advrush/hessianflow/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/__pycache__/absa.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/optimizer/__pycache__/absa.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/optimizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/__pycache__/baseline.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/optimizer/__pycache__/baseline.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/__pycache__/optm_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/optimizer/__pycache__/optm_utils.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/__pycache__/progressbar.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nutellamok/advrush/HEAD/advrush/hessianflow/optimizer/__pycache__/progressbar.cpython-36.pyc -------------------------------------------------------------------------------- /advrush/visualize.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import genotypes 3 | from graphviz import Digraph 4 | 5 | 6 | def plot(genotype, filename): 7 | g = Digraph( 8 | format='pdf', 9 | edge_attr=dict(fontsize='20', fontname="times"), 10 | node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"), 11 | engine='dot') 12 | g.body.extend(['rankdir=LR']) 13 | 14 | g.node("c_{k-2}", fillcolor='darkseagreen2') 15 | g.node("c_{k-1}", fillcolor='darkseagreen2') 16 | assert len(genotype) % 2 == 0 17 | steps = len(genotype) // 2 18 | 19 | for i in range(steps): 20 | g.node(str(i), fillcolor='lightblue') 21 | 22 | for i in range(steps): 23 | for k in [2*i, 2*i + 1]: 24 | op, j = genotype[k] 25 | if j == 0: 26 | u = "c_{k-2}" 27 | elif j == 1: 28 | u = "c_{k-1}" 29 | else: 30 | u = str(j-2) 31 | v = str(i) 32 | g.edge(u, v, label=op, fillcolor="gray") 33 | 34 | g.node("c_{k}", fillcolor='palegoldenrod') 35 | for i in range(steps): 36 | g.edge(str(i), "c_{k}", fillcolor="gray") 37 | 38 | g.render(filename, view=True) 39 | 40 | 41 | if __name__ == '__main__': 42 | if len(sys.argv) != 2: 43 | print("usage:\n python {} ARCH_NAME".format(sys.argv[0])) 44 | sys.exit(1) 45 | 46 | genotype_name = sys.argv[1] 47 | try: 48 | genotype = eval('genotypes.{}'.format(genotype_name)) 49 | except AttributeError: 50 | print("{} is not specified in genotypes.py".format(genotype_name)) 51 | sys.exit(1) 52 | 53 | plot(genotype.normal, "advrush_normal") 54 | plot(genotype.reduce, "advrush_reduction") 55 | 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AdvRush 2 | Official Code for [AdvRush: Searching for Adversarially Robust Neural Architectures](https://openaccess.thecvf.com/content/ICCV2021/html/Mok_AdvRush_Searching_for_Adversarially_Robust_Neural_Architectures_ICCV_2021_paper.html) (ICCV '21) 3 | 4 | ## Environmental Set-up 5 | ``` 6 | Python == 3.6.12, PyTorch == 1.2.0, torchvision == 0.4.0 7 | ``` 8 | 9 | ## AdvRush Search Process 10 | ``` 11 | cd advrush && python train_search.py --batch_size 32 --gpu 0 --epochs 60 --a_gamma 0.01 --a_warmup_epochs 50 --w_warmup_epochs 60 --loss_hessian loss_cure 12 | ``` 13 | 14 | ## Adversarial Training 15 | ``` 16 | cd advrush && python adv_train.py --batch_size 64 --gpu 0 --epochs 200 --adv_loss pgd --arch ADVRUSH 17 | ``` 18 | 19 | ## Evaluation under PGD Attack 20 | Prior to the evaluation process, add all necessary checkpoint files (preferably in the form of .pth.tar) to the /eval/checkpoints folder. 21 | To conduct white-box attacks, 22 | ``` 23 | cd eval && 24 | python pgd_attack.py --white-box-attack True --test-batch-size 10 --arch [arch_name] --checkpoint [./checkpoints/file_name.pth.tar] --data_type [cifar10/svhn] 25 | ``` 26 | 27 | To conduct black-box attacks, 28 | ``` 29 | cd eval && 30 | python pgd_attack.py --test-batch-size 10 --target_arch [target_arch] --target_checkpoint [./checkpoints/target_file.pth.tar] --source_arch [source_arch] --source_checkpoint [./checkpoints/source_file.pth.tar] --data_type cifar10 31 | ``` 32 | 33 | ## References 34 | 35 | DARTS: Differentiable Architecture Search [ICLR '19] [code](https://github.com/quark0/darts) [paper](https://arxiv.org/abs/1806.09055) 36 | 37 | Robustness via Curvature Regularization, and Vice Versa [CVPR '19] [code](https://github.com/F-Salehi/CURE_robustness) [paper](https://openaccess.thecvf.com/content_CVPR_2019/papers/Moosavi-Dezfooli_Robustness_via_Curvature_Regularization_and_Vice_Versa_CVPR_2019_paper.pdf) 38 | 39 | Tradeoff-inspired Adversarial Defense via Surrogate-loss Minimization [ICML '19] [code](https://github.com/yaodongyu/TRADES) [paper](https://arxiv.org/pdf/1901.08573.pdf) 40 | -------------------------------------------------------------------------------- /advrush/hessianflow/utils.py: -------------------------------------------------------------------------------- 1 | #* 2 | # @file Different utility functions 3 | # Copyright (c) Zhewei Yao, Amir Gholami 4 | # All rights reserved. 5 | # This file is part of HessianFlow library. 6 | # 7 | # HessianFlow is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # HessianFlow is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with HessianFlow. If not, see . 19 | #* 20 | 21 | import torch 22 | import math 23 | from torch.autograd import Variable 24 | import numpy as np 25 | 26 | 27 | def group_product(xs, ys): 28 | """ 29 | the inner product of two lists of variables xs,ys 30 | :param xs: 31 | :param ys: 32 | :return: 33 | """ 34 | return sum([torch.sum(x * y) for (x, y) in zip(xs, ys)]) 35 | 36 | def group_add(params, update, alpha=1): 37 | """ 38 | params = params + update*alpha 39 | :param params: list of variable 40 | :param update: list of data 41 | :return: 42 | """ 43 | for i,p in enumerate(params): 44 | params[i].data.add_(update[i] * alpha) 45 | return params 46 | 47 | def normalization(v): 48 | """ 49 | normalization of a list of vectors 50 | return: normalized vectors v 51 | """ 52 | s = group_product(v,v) 53 | s = s ** 0.5 54 | s = s.cpu().item() 55 | v = [vi / (s + 1e-6) for vi in v] 56 | return v 57 | 58 | 59 | def get_params_grad(model): 60 | """ 61 | get model parameters and corresponding gradients 62 | """ 63 | params = [] 64 | grads = [] 65 | for param in model.parameters(): 66 | params.append(param) 67 | if param.grad is None: 68 | continue 69 | grads.append(param.grad + 0.) 70 | return params, grads 71 | 72 | def hessian_vector_product(gradsH, params, v): 73 | """ 74 | compute the hessian vector product of Hv, where 75 | gradsH is the gradient at the current point, 76 | params is the corresponding variables, 77 | v is the vector. 78 | """ 79 | hv = torch.autograd.grad(gradsH, params, grad_outputs = v, only_inputs = True, retain_graph = True) 80 | return hv 81 | 82 | -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/baseline.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torchvision import datasets, transforms 8 | from torch.autograd import Variable 9 | 10 | from .progressbar import progress_bar 11 | from .optm_utils import exp_lr_scheduler, test 12 | 13 | # import hessianflow 14 | 15 | def baseline(model, train_loader, test_loader, criterion, optimizer, epochs, lr_decay_epoch, 16 | lr_decay_ratio, batch_size = 128, max_large_ratio = 1, cuda = True): 17 | """ 18 | baseline method training, i,e, vanilla training schedule 19 | """ 20 | 21 | inner_loop = 0 22 | num_updates = 0 23 | large_ratio = max_large_ratio 24 | # assert that shuffle is set for train_loader 25 | # assert and explain large ratio 26 | # assert that the train_loader is always set with a small batch size if not print error/warning telling 27 | # the user to instead use large_ratio 28 | for epoch in range(1, epochs + 1): 29 | print('\nCurrent Epoch: ', epoch) 30 | print('\nTraining') 31 | train_loss = 0. 32 | total_num = 0. 33 | correct = 0. 34 | 35 | for batch_idx, (data, target) in enumerate(train_loader): 36 | if target.size(0) < 128: 37 | continue 38 | model.train() 39 | # gather input and target for large batch training 40 | inner_loop += 1 41 | # get small model update 42 | if cuda: 43 | data, target = data.cuda(), target.cuda() 44 | output = model(data) 45 | loss = criterion(output, target)/float(large_ratio) 46 | loss.backward() 47 | train_loss += loss.item()*target.size(0)*float(large_ratio) 48 | total_num += target.size(0) 49 | _, predicted = output.max(1) 50 | correct += predicted.eq(target).sum().item() 51 | 52 | if inner_loop % large_ratio == 0: 53 | num_updates += 1 54 | optimizer.step() 55 | inner_loop = 0 56 | optimizer.zero_grad() 57 | 58 | progress_bar(batch_idx, len(train_loader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 59 | % (train_loss / total_num, 60 | 100. * correct / total_num, correct, total_num)) 61 | 62 | if epoch in lr_decay_epoch: 63 | exp_lr_scheduler(optimizer, decay_ratio=lr_decay_ratio) 64 | 65 | test(model, test_loader) 66 | return model, num_updates 67 | -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/progressbar.py: -------------------------------------------------------------------------------- 1 | ''' 2 | The progress_bar is from: 3 | https://github.com/noahgolmant/skeletor/blob/master/skeletor/utils.py 4 | ''' 5 | 6 | import os 7 | import sys 8 | import time 9 | import math 10 | 11 | ######## fancy progress bar 12 | try: 13 | _, term_width = os.popen('stty size', 'r').read().split() 14 | except: 15 | term_width = 100 16 | term_width = int(term_width) 17 | 18 | 19 | TOTAL_BAR_LENGTH = 65. 20 | last_time = time.time() 21 | begin_time = last_time 22 | def progress_bar(current, total, msg=None): 23 | global last_time, begin_time 24 | if current == 0: 25 | begin_time = time.time() # Reset for new bar. 26 | 27 | cur_len = int(TOTAL_BAR_LENGTH*current/total) 28 | rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 29 | 30 | sys.stdout.write(' [') 31 | for i in range(cur_len): 32 | sys.stdout.write('=') 33 | sys.stdout.write('>') 34 | for i in range(rest_len): 35 | sys.stdout.write('.') 36 | sys.stdout.write(']') 37 | 38 | cur_time = time.time() 39 | step_time = cur_time - last_time 40 | last_time = cur_time 41 | tot_time = cur_time - begin_time 42 | 43 | L = [] 44 | L.append(' Step: %s' % format_time(step_time)) 45 | L.append(' | Tot: %s' % format_time(tot_time)) 46 | if msg: 47 | L.append(' | ' + msg) 48 | 49 | msg = ''.join(L) 50 | sys.stdout.write(msg) 51 | for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): 52 | sys.stdout.write(' ') 53 | 54 | # Go back to the center of the bar. 55 | for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2): 56 | sys.stdout.write('\b') 57 | sys.stdout.write(' %d/%d ' % (current+1, total)) 58 | 59 | if current < total-1: 60 | sys.stdout.write('\r') 61 | else: 62 | sys.stdout.write('\n') 63 | sys.stdout.flush() 64 | 65 | def format_time(seconds): 66 | days = int(seconds / 3600/24) 67 | seconds = seconds - days*3600*24 68 | hours = int(seconds / 3600) 69 | seconds = seconds - hours*3600 70 | minutes = int(seconds / 60) 71 | seconds = seconds - minutes*60 72 | secondsf = int(seconds) 73 | seconds = seconds - secondsf 74 | millis = int(seconds*1000) 75 | 76 | f = '' 77 | i = 1 78 | if days > 0: 79 | f += str(days) + 'D' 80 | i += 1 81 | if hours > 0 and i <= 2: 82 | f += str(hours) + 'h' 83 | i += 1 84 | if minutes > 0 and i <= 2: 85 | f += str(minutes) + 'm' 86 | i += 1 87 | if secondsf > 0 and i <= 2: 88 | f += str(secondsf) + 's' 89 | i += 1 90 | if millis > 0 and i <= 2: 91 | f += str(millis) + 'ms' 92 | i += 1 93 | if f == '': 94 | f = '0ms' 95 | return f 96 | -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/optm_utils.py: -------------------------------------------------------------------------------- 1 | 2 | #* 3 | # @file optm_utils.py different utility functions 4 | # This file is part of HessianFlow library. 5 | # 6 | # HessianFlow is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | # 11 | # HessianFlow is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with HessianFlow. If not, see . 18 | #* 19 | from __future__ import print_function 20 | import numpy as np 21 | import torch 22 | import torch.nn as nn 23 | import torch.nn.functional as F 24 | import torch.optim as optim 25 | from torchvision import datasets, transforms 26 | from torch.autograd import Variable 27 | from .progressbar import progress_bar 28 | 29 | 30 | def fgsm(model, data, target, eps, cuda = True): 31 | """Generate an adversarial pertubation using the fast gradient sign method. 32 | 33 | Args: 34 | data: input image to perturb 35 | """ 36 | model.eval() 37 | if cuda: 38 | data, target = data.cuda(), target.cuda() 39 | data.requires_grad = True 40 | model.zero_grad() 41 | output = model(data) 42 | loss = F.cross_entropy(output, target) 43 | loss.backward(create_graph = False) 44 | pertubation = eps * torch.sign(data.grad.data) 45 | x_fgsm = data.data + pertubation 46 | X_adv = torch.clamp(x_fgsm, torch.min(data.data), torch.max(data.data)) 47 | 48 | return X_adv.cpu() 49 | 50 | def exp_lr_scheduler(optimizer, decay_ratio = 0.1): 51 | """ 52 | Decay learning rate by a factor of lr_decay 53 | """ 54 | for param_group in optimizer.param_groups: 55 | param_group['lr'] *= decay_ratio 56 | return optimizer 57 | 58 | 59 | def test(model, test_loader): 60 | """ 61 | Evaluation the performance of model on test_loader 62 | """ 63 | print('\nTesting') 64 | model.eval() 65 | correct = 0 66 | total = 0 67 | with torch.no_grad(): 68 | for batch_idx, (inputs, targets) in enumerate(test_loader): 69 | inputs, targets = inputs.cuda(), targets.cuda() 70 | outputs = model(inputs) 71 | _, predicted = outputs.max(1) 72 | total += targets.size(0) 73 | correct += predicted.eq(targets).sum().item() 74 | 75 | progress_bar(batch_idx, len(test_loader), 'Acc: %.3f%% (%d/%d)' 76 | % (100. * correct/total, correct, total)) 77 | 78 | return correct * 100 / total 79 | -------------------------------------------------------------------------------- /advrush/hessianflow/eigen.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | from torch.autograd import Variable 4 | import numpy as np 5 | 6 | from .utils import * 7 | 8 | 9 | def get_eigen(model, inputs, targets, criterion, cuda = True, maxIter = 50, tol = 1e-3): 10 | """ 11 | compute the top eigenvalues of model parameters and 12 | the corresponding eigenvectors. 13 | """ 14 | if cuda: 15 | inputs, targets = inputs.cuda(), targets.cuda() 16 | device = 'cuda' 17 | else: 18 | device = 'cpu' 19 | # change the model to evaluation mode, otherwise the batch Normalization Layer will change. 20 | # If you call this functino during training, remember to change the mode back to training mode. 21 | model.eval() 22 | 23 | outputs = model(inputs) 24 | loss = criterion(outputs, targets) 25 | loss.backward(create_graph = True) 26 | 27 | params, gradsH = get_params_grad(model) 28 | v = [torch.randn(p.size()).to(device) for p in params] 29 | v = normalization(v) 30 | 31 | eigenvalue = None 32 | 33 | for i in range(maxIter): 34 | model.zero_grad() 35 | Hv = hessian_vector_product(gradsH, params, v) 36 | eigenvalue_tmp = group_product(Hv, v).cpu().item() 37 | v = normalization(Hv) 38 | if eigenvalue == None: 39 | eigenvalue = eigenvalue_tmp 40 | else: 41 | if abs(eigenvalue-eigenvalue_tmp)/abs(eigenvalue) < tol: 42 | return eigenvalue_tmp, v 43 | else: 44 | eigenvalue = eigenvalue_tmp 45 | return eigenvalue, v 46 | 47 | def get_eigen_full_dataset(model, dataloader, criterion, cuda = True, maxIter = 50, tol = 1e-3): 48 | """ 49 | compute the top eigenvalues of model parameters and 50 | the corresponding eigenvectors with a full dataset. 51 | Notice, this is very expensive. 52 | """ 53 | if cuda: 54 | device = 'cuda' 55 | else: 56 | device = 'cpu' 57 | # change the model to evaluation mode, otherwise the batch Normalization Layer will change. 58 | # If you call this functino during training, remember to change the mode back to training mode. 59 | model.eval() 60 | 61 | 62 | params,_ = get_params_grad(model) 63 | v = [torch.randn(p.size()).to(device) for p in params] 64 | v = normalization(v) 65 | 66 | batch_size = None 67 | eigenvalue = None 68 | 69 | for i in range(maxIter): 70 | THv = [torch.zeros(p.size()).to(device) for p in params] 71 | counter = 0 72 | for inputs, targets in dataloader: 73 | 74 | if batch_size == None: 75 | batch_size = targets.size(0) 76 | 77 | if targets.size(0) < batch_size: 78 | continue 79 | 80 | model.zero_grad() 81 | outputs = model(inputs.to(device)) 82 | loss = criterion(outputs, targets.to(device)) 83 | loss.backward(create_graph=True) 84 | 85 | params, gradsH = get_params_grad(model) 86 | Hv = torch.autograd.grad(gradsH, params, grad_outputs = v, only_inputs = True, retain_graph = False) 87 | 88 | THv = [THv1 + Hv1 + 0. for THv1, Hv1 in zip(THv, Hv)] 89 | counter += 1 90 | 91 | eigenvalue_tmp =group_product(THv,v).cpu().item() / float(counter) 92 | v = normalization(THv) 93 | 94 | if eigenvalue == None: 95 | eigenvalue = eigenvalue_tmp 96 | else: 97 | if abs(eigenvalue-eigenvalue_tmp)/abs(eigenvalue) < tol: 98 | return eigenvalue_tmp, v 99 | else: 100 | eigenvalue = eigenvalue_tmp 101 | 102 | return eigenvalue, v 103 | -------------------------------------------------------------------------------- /advrush/genotypes.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') 4 | 5 | PRIMITIVES = [ 6 | 'none', 7 | 'max_pool_3x3', 8 | 'avg_pool_3x3', 9 | 'skip_connect', 10 | 'sep_conv_3x3', 11 | 'sep_conv_5x5', 12 | 'dil_conv_3x3', 13 | 'dil_conv_5x5' 14 | ] 15 | 16 | NASNet = Genotype( 17 | normal = [ 18 | ('sep_conv_5x5', 1), 19 | ('sep_conv_3x3', 0), 20 | ('sep_conv_5x5', 0), 21 | ('sep_conv_3x3', 0), 22 | ('avg_pool_3x3', 1), 23 | ('skip_connect', 0), 24 | ('avg_pool_3x3', 0), 25 | ('avg_pool_3x3', 0), 26 | ('sep_conv_3x3', 1), 27 | ('skip_connect', 1), 28 | ], 29 | normal_concat = [2, 3, 4, 5, 6], 30 | reduce = [ 31 | ('sep_conv_5x5', 1), 32 | ('sep_conv_7x7', 0), 33 | ('max_pool_3x3', 1), 34 | ('sep_conv_7x7', 0), 35 | ('avg_pool_3x3', 1), 36 | ('sep_conv_5x5', 0), 37 | ('skip_connect', 3), 38 | ('avg_pool_3x3', 2), 39 | ('sep_conv_3x3', 2), 40 | ('max_pool_3x3', 1), 41 | ], 42 | reduce_concat = [4, 5, 6], 43 | ) 44 | 45 | AmoebaNet = Genotype( 46 | normal = [ 47 | ('avg_pool_3x3', 0), 48 | ('max_pool_3x3', 1), 49 | ('sep_conv_3x3', 0), 50 | ('sep_conv_5x5', 2), 51 | ('sep_conv_3x3', 0), 52 | ('avg_pool_3x3', 3), 53 | ('sep_conv_3x3', 1), 54 | ('skip_connect', 1), 55 | ('skip_connect', 0), 56 | ('avg_pool_3x3', 1), 57 | ], 58 | normal_concat = [4, 5, 6], 59 | reduce = [ 60 | ('avg_pool_3x3', 0), 61 | ('sep_conv_3x3', 1), 62 | ('max_pool_3x3', 0), 63 | ('sep_conv_7x7', 2), 64 | ('sep_conv_7x7', 0), 65 | ('avg_pool_3x3', 1), 66 | ('max_pool_3x3', 0), 67 | ('max_pool_3x3', 1), 68 | ('conv_7x1_1x7', 0), 69 | ('sep_conv_3x3', 5), 70 | ], 71 | reduce_concat = [3, 4, 6] 72 | ) 73 | 74 | ADVRUSH = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_5x5', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1)], normal_concat=range(2, 6), reduce=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('dil_conv_3x3', 2), ('skip_connect', 0), ('avg_pool_3x3', 1), ('skip_connect', 0), ('skip_connect', 2)], reduce_concat=range(2, 6)) 75 | 76 | DARTS_V1 = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 0), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('avg_pool_3x3', 0)], reduce_concat=[2, 3, 4, 5]) 77 | DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5]) 78 | 79 | DARTS = DARTS_V2 80 | 81 | PDARTS = Genotype(normal=[('skip_connect', 0), ('dil_conv_3x3', 1), ('skip_connect', 0),('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 3), ('sep_conv_3x3',0), ('dil_conv_5x5', 4)], normal_concat=range(2, 6), reduce=[('avg_pool_3x3', 0), ('sep_conv_5x5', 1), ('sep_conv_3x3', 0), ('dil_conv_5x5', 2), ('max_pool_3x3', 0), ('dil_conv_3x3', 1), ('dil_conv_3x3', 1), ('dil_conv_5x5', 3)], reduce_concat=range(2, 6)) 82 | 83 | RACL = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_5x5', 0), ('skip_connect', 1), ('sep_conv_3x3', 0), ('skip_connect', 3), ('sep_conv_3x3', 3), ('skip_connect', 4)], normal_concat=[2, 3, 4, 5], reduce=[('sep_conv_3x3',0), ('sep_conv_5x5', 1), ('avg_pool_3x3', 0), ('dil_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_5x5',1), ('sep_conv_3x3', 2), ('dil_conv_3x3', 3)], reduce_concat=[2, 3, 4, 5]) 84 | -------------------------------------------------------------------------------- /eval/genotypes.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') 4 | 5 | PRIMITIVES = [ 6 | 'none', 7 | 'max_pool_3x3', 8 | 'avg_pool_3x3', 9 | 'skip_connect', 10 | 'sep_conv_3x3', 11 | 'sep_conv_5x5', 12 | 'dil_conv_3x3', 13 | 'dil_conv_5x5' 14 | ] 15 | 16 | NASNet = Genotype( 17 | normal = [ 18 | ('sep_conv_5x5', 1), 19 | ('sep_conv_3x3', 0), 20 | ('sep_conv_5x5', 0), 21 | ('sep_conv_3x3', 0), 22 | ('avg_pool_3x3', 1), 23 | ('skip_connect', 0), 24 | ('avg_pool_3x3', 0), 25 | ('avg_pool_3x3', 0), 26 | ('sep_conv_3x3', 1), 27 | ('skip_connect', 1), 28 | ], 29 | normal_concat = [2, 3, 4, 5, 6], 30 | reduce = [ 31 | ('sep_conv_5x5', 1), 32 | ('sep_conv_7x7', 0), 33 | ('max_pool_3x3', 1), 34 | ('sep_conv_7x7', 0), 35 | ('avg_pool_3x3', 1), 36 | ('sep_conv_5x5', 0), 37 | ('skip_connect', 3), 38 | ('avg_pool_3x3', 2), 39 | ('sep_conv_3x3', 2), 40 | ('max_pool_3x3', 1), 41 | ], 42 | reduce_concat = [4, 5, 6], 43 | ) 44 | 45 | AmoebaNet = Genotype( 46 | normal = [ 47 | ('avg_pool_3x3', 0), 48 | ('max_pool_3x3', 1), 49 | ('sep_conv_3x3', 0), 50 | ('sep_conv_5x5', 2), 51 | ('sep_conv_3x3', 0), 52 | ('avg_pool_3x3', 3), 53 | ('sep_conv_3x3', 1), 54 | ('skip_connect', 1), 55 | ('skip_connect', 0), 56 | ('avg_pool_3x3', 1), 57 | ], 58 | normal_concat = [4, 5, 6], 59 | reduce = [ 60 | ('avg_pool_3x3', 0), 61 | ('sep_conv_3x3', 1), 62 | ('max_pool_3x3', 0), 63 | ('sep_conv_7x7', 2), 64 | ('sep_conv_7x7', 0), 65 | ('avg_pool_3x3', 1), 66 | ('max_pool_3x3', 0), 67 | ('max_pool_3x3', 1), 68 | ('conv_7x1_1x7', 0), 69 | ('sep_conv_3x3', 5), 70 | ], 71 | reduce_concat = [3, 4, 6] 72 | ) 73 | 74 | ADVRUSH = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_5x5', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1)], normal_concat=range(2, 6), reduce=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('dil_conv_3x3', 2), ('skip_connect', 0), ('avg_pool_3x3', 1), ('skip_connect', 0), ('skip_connect', 2)], reduce_concat=range(2, 6)) 75 | 76 | DARTS_V1 = Genotype(normal=[('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 0), ('sep_conv_3x3', 1), ('skip_connect', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('skip_connect', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 0), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('avg_pool_3x3', 0)], reduce_concat=[2, 3, 4, 5]) 77 | DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), ('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5]) 78 | 79 | DARTS = DARTS_V2 80 | 81 | PDARTS = Genotype(normal=[('skip_connect', 0), ('dil_conv_3x3', 1), ('skip_connect', 0),('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('sep_conv_3x3', 3), ('sep_conv_3x3',0), ('dil_conv_5x5', 4)], normal_concat=range(2, 6), reduce=[('avg_pool_3x3', 0), ('sep_conv_5x5', 1), ('sep_conv_3x3', 0), ('dil_conv_5x5', 2), ('max_pool_3x3', 0), ('dil_conv_3x3', 1), ('dil_conv_3x3', 1), ('dil_conv_5x5', 3)], reduce_concat=range(2, 6)) 82 | 83 | RACL = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_5x5', 0), ('skip_connect', 1), ('sep_conv_3x3', 0), ('skip_connect', 3), ('sep_conv_3x3', 3), ('skip_connect', 4)], normal_concat=[2, 3, 4, 5], reduce=[('sep_conv_3x3',0), ('sep_conv_5x5', 1), ('avg_pool_3x3', 0), ('dil_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_5x5',1), ('sep_conv_3x3', 2), ('dil_conv_3x3', 3)], reduce_concat=[2, 3, 4, 5]) 84 | 85 | 86 | -------------------------------------------------------------------------------- /eval/operations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | OPS = { 5 | 'none' : lambda C, stride, affine: Zero(stride), 6 | 'avg_pool_3x3' : lambda C, stride, affine: nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False), 7 | 'max_pool_3x3' : lambda C, stride, affine: nn.MaxPool2d(3, stride=stride, padding=1), 8 | 'skip_connect' : lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine), 9 | 'sep_conv_3x3' : lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine), 10 | 'sep_conv_5x5' : lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine), 11 | 'sep_conv_7x7' : lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine), 12 | 'dil_conv_3x3' : lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine), 13 | 'dil_conv_5x5' : lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine), 14 | 'conv_7x1_1x7' : lambda C, stride, affine: nn.Sequential( 15 | nn.ReLU(inplace=False), 16 | nn.Conv2d(C, C, (1,7), stride=(1, stride), padding=(0, 3), bias=False), 17 | nn.Conv2d(C, C, (7,1), stride=(stride, 1), padding=(3, 0), bias=False), 18 | nn.BatchNorm2d(C, affine=affine) 19 | ), 20 | } 21 | 22 | class ReLUConvBN(nn.Module): 23 | 24 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 25 | super(ReLUConvBN, self).__init__() 26 | self.op = nn.Sequential( 27 | nn.ReLU(inplace=False), 28 | nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False), 29 | nn.BatchNorm2d(C_out, affine=affine) 30 | ) 31 | 32 | def forward(self, x): 33 | return self.op(x) 34 | 35 | class DilConv(nn.Module): 36 | 37 | def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): 38 | super(DilConv, self).__init__() 39 | self.op = nn.Sequential( 40 | nn.ReLU(inplace=False), 41 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False), 42 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 43 | nn.BatchNorm2d(C_out, affine=affine), 44 | ) 45 | 46 | def forward(self, x): 47 | return self.op(x) 48 | 49 | 50 | class SepConv(nn.Module): 51 | 52 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 53 | super(SepConv, self).__init__() 54 | self.op = nn.Sequential( 55 | nn.ReLU(inplace=False), 56 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False), 57 | nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False), 58 | nn.BatchNorm2d(C_in, affine=affine), 59 | nn.ReLU(inplace=False), 60 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False), 61 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 62 | nn.BatchNorm2d(C_out, affine=affine), 63 | ) 64 | 65 | def forward(self, x): 66 | return self.op(x) 67 | 68 | 69 | class Identity(nn.Module): 70 | 71 | def __init__(self): 72 | super(Identity, self).__init__() 73 | 74 | def forward(self, x): 75 | return x 76 | 77 | 78 | class Zero(nn.Module): 79 | 80 | def __init__(self, stride): 81 | super(Zero, self).__init__() 82 | self.stride = stride 83 | 84 | def forward(self, x): 85 | if self.stride == 1: 86 | return x.mul(0.) 87 | return x[:,:,::self.stride,::self.stride].mul(0.) 88 | 89 | 90 | class FactorizedReduce(nn.Module): 91 | 92 | def __init__(self, C_in, C_out, affine=True): 93 | super(FactorizedReduce, self).__init__() 94 | assert C_out % 2 == 0 95 | self.relu = nn.ReLU(inplace=False) 96 | self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 97 | self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 98 | self.bn = nn.BatchNorm2d(C_out, affine=affine) 99 | 100 | def forward(self, x): 101 | x = self.relu(x) 102 | out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1) 103 | out = self.bn(out) 104 | return out 105 | 106 | -------------------------------------------------------------------------------- /advrush/operations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | OPS = { 5 | 'none' : lambda C, stride, affine: Zero(stride), 6 | 'avg_pool_3x3' : lambda C, stride, affine: nn.AvgPool2d(3, stride=stride, padding=1, count_include_pad=False), 7 | 'max_pool_3x3' : lambda C, stride, affine: nn.MaxPool2d(3, stride=stride, padding=1), 8 | 'skip_connect' : lambda C, stride, affine: Identity() if stride == 1 else FactorizedReduce(C, C, affine=affine), 9 | 'sep_conv_3x3' : lambda C, stride, affine: SepConv(C, C, 3, stride, 1, affine=affine), 10 | 'sep_conv_5x5' : lambda C, stride, affine: SepConv(C, C, 5, stride, 2, affine=affine), 11 | 'sep_conv_7x7' : lambda C, stride, affine: SepConv(C, C, 7, stride, 3, affine=affine), 12 | 'dil_conv_3x3' : lambda C, stride, affine: DilConv(C, C, 3, stride, 2, 2, affine=affine), 13 | 'dil_conv_5x5' : lambda C, stride, affine: DilConv(C, C, 5, stride, 4, 2, affine=affine), 14 | 'conv_7x1_1x7' : lambda C, stride, affine: nn.Sequential( 15 | nn.ReLU(inplace=False), 16 | nn.Conv2d(C, C, (1,7), stride=(1, stride), padding=(0, 3), bias=False), 17 | nn.Conv2d(C, C, (7,1), stride=(stride, 1), padding=(3, 0), bias=False), 18 | nn.BatchNorm2d(C, affine=affine) 19 | ), 20 | } 21 | 22 | class ReLUConvBN(nn.Module): 23 | 24 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 25 | super(ReLUConvBN, self).__init__() 26 | self.op = nn.Sequential( 27 | nn.ReLU(inplace=False), 28 | nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False), 29 | nn.BatchNorm2d(C_out, affine=affine) 30 | ) 31 | 32 | def forward(self, x): 33 | return self.op(x) 34 | 35 | class DilConv(nn.Module): 36 | 37 | def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): 38 | super(DilConv, self).__init__() 39 | self.op = nn.Sequential( 40 | nn.ReLU(inplace=False), 41 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False), 42 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 43 | nn.BatchNorm2d(C_out, affine=affine), 44 | ) 45 | 46 | def forward(self, x): 47 | return self.op(x) 48 | 49 | 50 | class SepConv(nn.Module): 51 | 52 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 53 | super(SepConv, self).__init__() 54 | self.op = nn.Sequential( 55 | nn.ReLU(inplace=False), 56 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False), 57 | nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False), 58 | nn.BatchNorm2d(C_in, affine=affine), 59 | nn.ReLU(inplace=False), 60 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False), 61 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 62 | nn.BatchNorm2d(C_out, affine=affine), 63 | ) 64 | 65 | def forward(self, x): 66 | return self.op(x) 67 | 68 | 69 | class Identity(nn.Module): 70 | 71 | def __init__(self): 72 | super(Identity, self).__init__() 73 | 74 | def forward(self, x): 75 | return x 76 | 77 | 78 | class Zero(nn.Module): 79 | 80 | def __init__(self, stride): 81 | super(Zero, self).__init__() 82 | self.stride = stride 83 | 84 | def forward(self, x): 85 | if self.stride == 1: 86 | return x.mul(0.) 87 | return x[:,:,::self.stride,::self.stride].mul(0.) 88 | 89 | 90 | class FactorizedReduce(nn.Module): 91 | 92 | def __init__(self, C_in, C_out, affine=True): 93 | super(FactorizedReduce, self).__init__() 94 | assert C_out % 2 == 0 95 | self.relu = nn.ReLU(inplace=False) 96 | self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 97 | self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 98 | self.bn = nn.BatchNorm2d(C_out, affine=affine) 99 | 100 | def forward(self, x): 101 | x = self.relu(x) 102 | out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1) 103 | out = self.bn(out) 104 | return out 105 | 106 | -------------------------------------------------------------------------------- /eval/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import shutil 5 | import torchvision.transforms as transforms 6 | import torchvision.datasets as dset 7 | from torch.autograd import Variable 8 | 9 | 10 | class AvgrageMeter(object): 11 | 12 | def __init__(self): 13 | self.reset() 14 | 15 | def reset(self): 16 | self.avg = 0 17 | self.sum = 0 18 | self.cnt = 0 19 | 20 | def update(self, val, n=1): 21 | self.sum += val * n 22 | self.cnt += n 23 | self.avg = self.sum / self.cnt 24 | 25 | 26 | def accuracy(output, target, topk=(1,)): 27 | maxk = max(topk) 28 | batch_size = target.size(0) 29 | 30 | _, pred = output.topk(maxk, 1, True, True) 31 | pred = pred.t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | 34 | res = [] 35 | for k in topk: 36 | correct_k = correct[:k].view(-1).float().sum(0) 37 | res.append(correct_k.mul_(100.0/batch_size)) 38 | return res 39 | 40 | 41 | class Cutout(object): 42 | def __init__(self, length): 43 | self.length = length 44 | 45 | def __call__(self, img): 46 | h, w = img.size(1), img.size(2) 47 | mask = np.ones((h, w), np.float32) 48 | y = np.random.randint(h) 49 | x = np.random.randint(w) 50 | 51 | y1 = np.clip(y - self.length // 2, 0, h) 52 | y2 = np.clip(y + self.length // 2, 0, h) 53 | x1 = np.clip(x - self.length // 2, 0, w) 54 | x2 = np.clip(x + self.length // 2, 0, w) 55 | 56 | mask[y1: y2, x1: x2] = 0. 57 | mask = torch.from_numpy(mask) 58 | mask = mask.expand_as(img) 59 | img *= mask 60 | return img 61 | 62 | 63 | def _data_transforms_cifar10(args): 64 | CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124] 65 | CIFAR_STD = [0.24703233, 0.24348505, 0.26158768] 66 | 67 | train_transform = transforms.Compose([ 68 | transforms.RandomCrop(32, padding=4), 69 | transforms.RandomHorizontalFlip(), 70 | transforms.ToTensor(), 71 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 72 | ]) 73 | if args.cutout: 74 | train_transform.transforms.append(Cutout(args.cutout_length)) 75 | 76 | valid_transform = transforms.Compose([ 77 | transforms.ToTensor(), 78 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 79 | ]) 80 | return train_transform, valid_transform 81 | 82 | def _data_transforms_cifar10_eval(args): 83 | train_transform = transforms.Compose([ 84 | transforms.RandomCrop(32, padding=4), 85 | transforms.RandomHorizontalFlip(), 86 | transforms.ToTensor(), 87 | ]) 88 | if args.cutout: 89 | train_transform.transforms.append(Cutout(args.cutout_length)) 90 | 91 | valid_transform = transforms.Compose([ 92 | transforms.ToTensor() 93 | ]) 94 | return train_transform, valid_transform 95 | 96 | def _data_imagenet(args): 97 | traindir = os.path.join(args.data, 'train') 98 | #validdir = os.path.join(args.data, 'val') 99 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 100 | train_data = dset.ImageFolder( 101 | traindir, 102 | transforms.Compose([ 103 | transforms.RandomResizedCrop(224), 104 | transforms.RandomHorizontalFlip(), 105 | transforms.ToTensor(), 106 | normalize, 107 | ])) 108 | return train_data 109 | 110 | def count_parameters_in_MB(model): 111 | return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6 112 | 113 | 114 | def save_checkpoint(state, is_best, save, epoch): 115 | filename = os.path.join(save, 'checkpoint-epoch{}.pth.tar'.format(epoch)) 116 | torch.save(state, filename) 117 | if is_best: 118 | best_filename = os.path.join(save, 'model_best_epoch{}.pth.tar'.format(epoch)) 119 | shutil.copyfile(filename, best_filename) 120 | 121 | 122 | def save(model, model_path): 123 | torch.save(model.state_dict(), model_path) 124 | 125 | 126 | def load(model, model_path): 127 | model.load_state_dict(torch.load(model_path)) 128 | 129 | 130 | def drop_path(x, drop_prob): 131 | if drop_prob > 0.: 132 | keep_prob = 1.-drop_prob 133 | mask = Variable(torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob)) 134 | x.div_(keep_prob) 135 | x.mul_(mask) 136 | return x 137 | 138 | 139 | def create_exp_dir(path, scripts_to_save=None): 140 | if not os.path.exists(path): 141 | os.mkdir(path) 142 | print('Experiment dir : {}'.format(path)) 143 | 144 | if scripts_to_save is not None: 145 | os.mkdir(os.path.join(path, 'scripts')) 146 | for script in scripts_to_save: 147 | dst_file = os.path.join(path, 'scripts', os.path.basename(script)) 148 | shutil.copyfile(script, dst_file) 149 | 150 | -------------------------------------------------------------------------------- /advrush/architect.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | from regularizer import * 7 | import hessianflow as hf 8 | import hessianflow.optimizer.optm_utils as hf_optm_utils 9 | import hessianflow.optimizer.progressbar as hf_optm_pgb 10 | 11 | def _concat(xs): 12 | return torch.cat([x.view(-1) for x in xs]) 13 | 14 | 15 | class Architect(object): 16 | 17 | def __init__(self, model, args): 18 | self.network_momentum = args.momentum 19 | self.network_weight_decay = args.weight_decay 20 | self.model = model 21 | self.optimizer = torch.optim.Adam(self.model.arch_parameters(), 22 | lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay) 23 | 24 | def _compute_unrolled_model(self, input, target, eta, network_optimizer): 25 | logits, loss = self.model._loss(input, target) 26 | theta = _concat(self.model.parameters()).data 27 | try: 28 | moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum) 29 | except: 30 | moment = torch.zeros_like(theta) 31 | dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta 32 | unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment+dtheta)) 33 | return unrolled_model 34 | 35 | def step(self, input_train, target_train, epoch, warm_epoch, gamma, criterion, loss_hessian, valid_queue, input_valid, target_valid, eta, network_optimizer, unrolled, h): 36 | self.optimizer.zero_grad() 37 | if unrolled: 38 | self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer) 39 | else: 40 | regularizer = self._backward_step(epoch, warm_epoch, gamma, criterion, loss_hessian, valid_queue, input_valid, target_valid, h) 41 | self.optimizer.step() 42 | return regularizer 43 | 44 | def _backward_step(self, epoch, warm_epoch, gamma, criterion, loss_hessian, valid_queue, input_valid, target_valid, h): 45 | logits, loss = self.model._loss(input_valid, target_valid) 46 | if epoch < warm_epoch: 47 | loss = loss #criterion(logits, target) 48 | regularizer = torch.tensor(0, dtype=torch.float) 49 | else: 50 | if loss_hessian == 'loss_cure': 51 | reg = loss_cure(self.model, criterion, lambda_=4, device='cuda') 52 | regularizer, grad_norm = reg.regularizer(input_valid, target_valid, h=h) 53 | else: 54 | reg = loss_eigen(self.model, valid_queue, input_valid, target_valid, criterion, full_eigen=False, maxIter=10, tol=1e-2) 55 | regularizer, _ = reg.regularizer() 56 | loss += gamma * regularizer 57 | loss.backward() 58 | return regularizer 59 | 60 | def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer): 61 | unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer) 62 | unrolled_loss = unrolled_model._loss(input_valid, target_valid) 63 | 64 | unrolled_loss.backward() 65 | dalpha = [v.grad for v in unrolled_model.arch_parameters()] 66 | vector = [v.grad.data for v in unrolled_model.parameters()] 67 | implicit_grads = self._hessian_vector_product(vector, input_train, target_train) 68 | 69 | for g, ig in zip(dalpha, implicit_grads): 70 | g.data.sub_(eta, ig.data) 71 | 72 | for v, g in zip(self.model.arch_parameters(), dalpha): 73 | if v.grad is None: 74 | v.grad = Variable(g.data) 75 | else: 76 | v.grad.data.copy_(g.data) 77 | 78 | def _construct_model_from_theta(self, theta): 79 | model_new = self.model.new() 80 | model_dict = self.model.state_dict() 81 | 82 | params, offset = {}, 0 83 | for k, v in self.model.named_parameters(): 84 | v_length = np.prod(v.size()) 85 | params[k] = theta[offset: offset+v_length].view(v.size()) 86 | offset += v_length 87 | 88 | assert offset == len(theta) 89 | model_dict.update(params) 90 | model_new.load_state_dict(model_dict) 91 | return model_new.cuda() 92 | 93 | def _hessian_vector_product(self, vector, input, target, r=1e-2): 94 | R = r / _concat(vector).norm() 95 | for p, v in zip(self.model.parameters(), vector): 96 | p.data.add_(R, v) 97 | loss = self.model._loss(input, target) 98 | grads_p = torch.autograd.grad(loss, self.model.arch_parameters()) 99 | 100 | for p, v in zip(self.model.parameters(), vector): 101 | p.data.sub_(2*R, v) 102 | loss = self.model._loss(input, target) 103 | grads_n = torch.autograd.grad(loss, self.model.arch_parameters()) 104 | 105 | for p, v in zip(self.model.parameters(), vector): 106 | p.data.add_(R, v) 107 | 108 | return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)] 109 | 110 | -------------------------------------------------------------------------------- /advrush/trades.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import torch.optim as optim 6 | 7 | 8 | def squared_l2_norm(x): 9 | flattened = x.view(x.unsqueeze(0).shape[0], -1) 10 | return (flattened ** 2).sum(1) 11 | 12 | 13 | def l2_norm(x): 14 | return squared_l2_norm(x).sqrt() 15 | 16 | 17 | def trades_loss(model, 18 | x_natural, 19 | y, 20 | optimizer, 21 | step_size=0.003, 22 | epsilon=0.031, 23 | perturb_steps=10, 24 | beta=1.0, 25 | distance='l_inf'): 26 | # define KL-loss 27 | criterion_kl = nn.KLDivLoss(size_average=False) 28 | model.eval() 29 | batch_size = len(x_natural) 30 | # generate adversarial example 31 | x_adv = x_natural.detach() + 0.001 * torch.randn(x_natural.shape).cuda().detach() 32 | if distance == 'l_inf': 33 | for _ in range(perturb_steps): 34 | x_adv.requires_grad_() 35 | with torch.enable_grad(): 36 | adv_logits, _ = model(x_adv) 37 | clean_logits, _ = model(x_natural) 38 | loss_kl = criterion_kl(F.log_softmax(adv_logits, dim=1), #model(x_adv) 39 | F.softmax(clean_logits, dim=1)) #model(x_natural) 40 | grad = torch.autograd.grad(loss_kl, [x_adv])[0] 41 | x_adv = x_adv.detach() + step_size * torch.sign(grad.detach()) 42 | x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon) 43 | x_adv = torch.clamp(x_adv, 0.0, 1.0) 44 | elif distance == 'l_2': 45 | delta = 0.001 * torch.randn(x_natural.shape).cuda().detach() 46 | delta = Variable(delta.data, requires_grad=True) 47 | 48 | # Setup optimizers 49 | optimizer_delta = optim.SGD([delta], lr=epsilon / perturb_steps * 2) 50 | 51 | for _ in range(perturb_steps): 52 | adv = x_natural + delta 53 | 54 | # optimize 55 | optimizer_delta.zero_grad() 56 | with torch.enable_grad(): 57 | loss = (-1) * criterion_kl(F.log_softmax(model(adv), dim=1), 58 | F.softmax(model(x_natural), dim=1)) 59 | loss.backward() 60 | # renorming gradient 61 | grad_norms = delta.grad.view(batch_size, -1).norm(p=2, dim=1) 62 | delta.grad.div_(grad_norms.view(-1, 1, 1, 1)) 63 | # avoid nan or inf if gradient is 0 64 | if (grad_norms == 0).any(): 65 | delta.grad[grad_norms == 0] = torch.randn_like(delta.grad[grad_norms == 0]) 66 | optimizer_delta.step() 67 | 68 | # projection 69 | delta.data.add_(x_natural) 70 | delta.data.clamp_(0, 1).sub_(x_natural) 71 | delta.data.renorm_(p=2, dim=0, maxnorm=epsilon) 72 | x_adv = Variable(x_natural + delta, requires_grad=False) 73 | else: 74 | x_adv = torch.clamp(x_adv, 0.0, 1.0) 75 | model.train() 76 | 77 | x_adv = Variable(torch.clamp(x_adv, 0.0, 1.0), requires_grad=False) 78 | # zero gradient 79 | optimizer.zero_grad() 80 | # calculate robust loss 81 | clean_logits_new, _ = model(x_natural) 82 | adv_logits_new, _ = model(x_natural) 83 | loss_natural = F.cross_entropy(clean_logits_new, y) #model(x_natural) 84 | loss_robust = (1.0 / batch_size) * criterion_kl(F.log_softmax(adv_logits_new, dim=1), #model(x_adv) 85 | F.softmax(clean_logits_new, dim=1)) #model(x_natural) 86 | loss = loss_natural + beta * loss_robust 87 | return loss 88 | 89 | 90 | def madry_loss(model, 91 | x_natural, 92 | y, 93 | optimizer, 94 | step_size=0.003, 95 | epsilon=0.031, 96 | perturb_steps=10, 97 | distance='l_inf', 98 | ): 99 | # define KL-loss 100 | criterion_ce = torch.nn.CrossEntropyLoss(reduction='none') 101 | model.eval() 102 | batch_size = len(x_natural) 103 | 104 | # generate adversarial example 105 | x_adv = x_natural.detach() + 0.001 * torch.randn(x_natural.shape).cuda().detach() 106 | 107 | if distance == 'l_inf': 108 | for _ in range(perturb_steps): 109 | x_adv.requires_grad_() 110 | with torch.enable_grad(): 111 | logits, _ = model(x_adv) 112 | loss_ce = criterion_ce(logits, y).mean() 113 | grad = torch.autograd.grad(loss_ce, [x_adv])[0] 114 | x_adv = x_adv.detach() + step_size * torch.sign(grad.detach()) 115 | x_adv = torch.min(torch.max(x_adv, x_natural - epsilon), x_natural + epsilon) 116 | x_adv = torch.clamp(x_adv, 0.0, 1.0) 117 | else: 118 | x_adv = torch.clamp(x_adv, 0.0, 1.0) 119 | model.train() 120 | 121 | x_adv = Variable(torch.clamp(x_adv, 0.0, 1.0), requires_grad=False) 122 | # zero gradient 123 | optimizer.zero_grad() 124 | 125 | logits, _ = model(x_adv) 126 | loss = F.cross_entropy(logits, y) 127 | 128 | return loss 129 | -------------------------------------------------------------------------------- /advrush/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import torch 4 | import shutil 5 | import torchvision.transforms as transforms 6 | import torchvision.datasets as dset 7 | from torch.autograd import Variable 8 | 9 | 10 | class AvgrageMeter(object): 11 | 12 | def __init__(self): 13 | self.reset() 14 | 15 | def reset(self): 16 | self.avg = 0 17 | self.sum = 0 18 | self.cnt = 0 19 | 20 | def update(self, val, n=1): 21 | self.sum += val * n 22 | self.cnt += n 23 | self.avg = self.sum / self.cnt 24 | 25 | 26 | def accuracy(output, target, topk=(1,)): 27 | maxk = max(topk) 28 | batch_size = target.size(0) 29 | 30 | _, pred = output.topk(maxk, 1, True, True) 31 | pred = pred.t() 32 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 33 | 34 | res = [] 35 | for k in topk: 36 | correct_k = correct[:k].view(-1).float().sum(0) 37 | res.append(correct_k.mul_(100.0/batch_size)) 38 | return res 39 | 40 | 41 | class Cutout(object): 42 | def __init__(self, length): 43 | self.length = length 44 | 45 | def __call__(self, img): 46 | h, w = img.size(1), img.size(2) 47 | mask = np.ones((h, w), np.float32) 48 | y = np.random.randint(h) 49 | x = np.random.randint(w) 50 | 51 | y1 = np.clip(y - self.length // 2, 0, h) 52 | y2 = np.clip(y + self.length // 2, 0, h) 53 | x1 = np.clip(x - self.length // 2, 0, w) 54 | x2 = np.clip(x + self.length // 2, 0, w) 55 | 56 | mask[y1: y2, x1: x2] = 0. 57 | mask = torch.from_numpy(mask) 58 | mask = mask.expand_as(img) 59 | img *= mask 60 | return img 61 | 62 | 63 | def _data_transforms_cifar10(args): 64 | CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124] 65 | CIFAR_STD = [0.24703233, 0.24348505, 0.26158768] 66 | 67 | train_transform = transforms.Compose([ 68 | transforms.RandomCrop(32, padding=4), 69 | transforms.RandomHorizontalFlip(), 70 | transforms.ToTensor(), 71 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 72 | ]) 73 | if args.cutout: 74 | train_transform.transforms.append(Cutout(args.cutout_length)) 75 | 76 | valid_transform = transforms.Compose([ 77 | transforms.ToTensor(), 78 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 79 | ]) 80 | return train_transform, valid_transform 81 | 82 | def _data_transforms_cifar10_eval(args): 83 | train_transform = transforms.Compose([ 84 | transforms.RandomCrop(32, padding=4), 85 | transforms.RandomHorizontalFlip(), 86 | transforms.ToTensor(), 87 | ]) 88 | if args.cutout: 89 | train_transform.transforms.append(Cutout(args.cutout_length)) 90 | 91 | valid_transform = transforms.Compose([ 92 | transforms.ToTensor() 93 | ]) 94 | return train_transform, valid_transform 95 | 96 | def _data_transforms_cifar100(args): 97 | CIFAR_MEAN = [0.5071, 0.4867, 0.4408] 98 | CIFAR_STD = [0.2675, 0.2565, 0.2761] 99 | 100 | train_transform = transforms.Compose([ 101 | transforms.RandomCrop(32, padding=4), 102 | transforms.RandomHorizontalFlip(), 103 | transforms.ToTensor(), 104 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 105 | ]) 106 | if args.cutout: 107 | train_transform.transforms.append(Cutout(args.cutout_length)) 108 | 109 | valid_transform = transforms.Compose([ 110 | transforms.ToTensor(), 111 | transforms.Normalize(CIFAR_MEAN, CIFAR_STD), 112 | ]) 113 | return train_transform, valid_transform 114 | 115 | def _data_imagenet(args): 116 | traindir = os.path.join(args.data, 'train') 117 | #validdir = os.path.join(args.data, 'val') 118 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 119 | train_data = dset.ImageFolder( 120 | traindir, 121 | transforms.Compose([ 122 | transforms.RandomResizedCrop(224), 123 | transforms.RandomHorizontalFlip(), 124 | transforms.ToTensor(), 125 | normalize, 126 | ])) 127 | return train_data 128 | 129 | def count_parameters_in_MB(model): 130 | return np.sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name)/1e6 131 | 132 | 133 | def save_checkpoint(state, is_best, save, epoch): 134 | filename = os.path.join(save, 'checkpoint-epoch{}.pth.tar'.format(epoch)) 135 | torch.save(state, filename) 136 | if is_best: 137 | best_filename = os.path.join(save, 'model_best.pth.tar') 138 | shutil.copyfile(filename, best_filename) 139 | 140 | 141 | def save(model, model_path): 142 | torch.save(model.state_dict(), model_path) 143 | 144 | 145 | def load(model, model_path): 146 | model.load_state_dict(torch.load(model_path)) 147 | 148 | 149 | def drop_path(x, drop_prob): 150 | if drop_prob > 0.: 151 | keep_prob = 1.-drop_prob 152 | mask = Variable(torch.cuda.FloatTensor(x.size(0), 1, 1, 1).bernoulli_(keep_prob)) 153 | x.div_(keep_prob) 154 | x.mul_(mask) 155 | return x 156 | 157 | 158 | def create_exp_dir(path, scripts_to_save=None): 159 | if not os.path.exists(path): 160 | os.mkdir(path) 161 | print('Experiment dir : {}'.format(path)) 162 | 163 | if scripts_to_save is not None: 164 | os.mkdir(os.path.join(path, 'scripts')) 165 | for script in scripts_to_save: 166 | dst_file = os.path.join(path, 'scripts', os.path.basename(script)) 167 | shutil.copyfile(script, dst_file) 168 | 169 | -------------------------------------------------------------------------------- /advrush/model_search.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from operations import * 5 | from torch.autograd import Variable 6 | from genotypes import PRIMITIVES 7 | from genotypes import Genotype 8 | 9 | 10 | class MixedOp(nn.Module): 11 | 12 | def __init__(self, C, stride): 13 | super(MixedOp, self).__init__() 14 | self._ops = nn.ModuleList() 15 | for primitive in PRIMITIVES: 16 | op = OPS[primitive](C, stride, False) 17 | if 'pool' in primitive: 18 | op = nn.Sequential(op, nn.BatchNorm2d(C, affine=False)) 19 | self._ops.append(op) 20 | 21 | def forward(self, x, weights): 22 | return sum(w * op(x) for w, op in zip(weights, self._ops)) 23 | 24 | 25 | class Cell(nn.Module): 26 | 27 | def __init__(self, steps, multiplier, C_prev_prev, C_prev, C, reduction, reduction_prev): 28 | super(Cell, self).__init__() 29 | self.reduction = reduction 30 | 31 | if reduction_prev: 32 | self.preprocess0 = FactorizedReduce(C_prev_prev, C, affine=False) 33 | else: 34 | self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0, affine=False) 35 | self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0, affine=False) 36 | self._steps = steps 37 | self._multiplier = multiplier 38 | 39 | self._ops = nn.ModuleList() 40 | self._bns = nn.ModuleList() 41 | for i in range(self._steps): 42 | for j in range(2+i): 43 | stride = 2 if reduction and j < 2 else 1 44 | op = MixedOp(C, stride) 45 | self._ops.append(op) 46 | 47 | def forward(self, s0, s1, weights): 48 | s0 = self.preprocess0(s0) 49 | s1 = self.preprocess1(s1) 50 | 51 | states = [s0, s1] 52 | offset = 0 53 | for i in range(self._steps): 54 | s = sum(self._ops[offset+j](h, weights[offset+j]) for j, h in enumerate(states)) 55 | offset += len(states) 56 | states.append(s) 57 | 58 | return torch.cat(states[-self._multiplier:], dim=1) 59 | 60 | 61 | class Network(nn.Module): 62 | 63 | def __init__(self, C, num_classes, layers, criterion, steps=4, multiplier=4, stem_multiplier=3): 64 | super(Network, self).__init__() 65 | self._C = C 66 | self._num_classes = num_classes 67 | self._layers = layers 68 | self._criterion = criterion 69 | self._steps = steps 70 | self._multiplier = multiplier 71 | 72 | C_curr = stem_multiplier*C 73 | self.stem = nn.Sequential( 74 | nn.Conv2d(3, C_curr, 3, padding=1, bias=False), 75 | nn.BatchNorm2d(C_curr) 76 | ) 77 | 78 | C_prev_prev, C_prev, C_curr = C_curr, C_curr, C 79 | self.cells = nn.ModuleList() 80 | reduction_prev = False 81 | for i in range(layers): 82 | if i in [layers//3, 2*layers//3]: 83 | C_curr *= 2 84 | reduction = True 85 | else: 86 | reduction = False 87 | cell = Cell(steps, multiplier, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 88 | reduction_prev = reduction 89 | self.cells += [cell] 90 | C_prev_prev, C_prev = C_prev, multiplier*C_curr 91 | 92 | self.global_pooling = nn.AdaptiveAvgPool2d(1) 93 | self.classifier = nn.Linear(C_prev, num_classes) 94 | 95 | self._initialize_alphas() 96 | 97 | def new(self): 98 | model_new = Network(self._C, self._num_classes, self._layers, self._criterion).cuda() 99 | for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): 100 | x.data.copy_(y.data) 101 | return model_new 102 | 103 | def forward(self, input): 104 | s0 = s1 = self.stem(input) 105 | for i, cell in enumerate(self.cells): 106 | if cell.reduction: 107 | weights = F.softmax(self.alphas_reduce, dim=-1) 108 | else: 109 | weights = F.softmax(self.alphas_normal, dim=-1) 110 | s0, s1 = s1, cell(s0, s1, weights) 111 | out = self.global_pooling(s1) 112 | logits = self.classifier(out.view(out.size(0),-1)) 113 | return logits 114 | 115 | def _loss(self, input, target): 116 | logits = self(input) 117 | return logits, self._criterion(logits, target) 118 | 119 | def _initialize_alphas(self): 120 | k = sum(1 for i in range(self._steps) for n in range(2+i)) 121 | num_ops = len(PRIMITIVES) 122 | 123 | self.alphas_normal = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True) 124 | self.alphas_reduce = Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True) 125 | self._arch_parameters = [ 126 | self.alphas_normal, 127 | self.alphas_reduce, 128 | ] 129 | 130 | def arch_parameters(self): 131 | return self._arch_parameters 132 | 133 | def restore(self, alphas_normal, alphas_reduce): 134 | self.alphas_normal = alphas_normal 135 | self.alphas_reduce = alphas_reduce 136 | self.alphas_normal = Variable(self.alphas_normal, requires_grad=True) 137 | self.alphas_reduce = Variable(self.alphas_reduce, requires_grad=True) 138 | 139 | def genotype(self): 140 | 141 | def _parse(weights): 142 | gene = [] 143 | n = 2 144 | start = 0 145 | for i in range(self._steps): 146 | end = start + n 147 | W = weights[start:end].copy() 148 | edges = sorted(range(i + 2), key=lambda x: -max(W[x][k] for k in range(len(W[x])) if k != PRIMITIVES.index('none')))[:2] 149 | for j in edges: 150 | k_best = None 151 | for k in range(len(W[j])): 152 | if k != PRIMITIVES.index('none'): 153 | if k_best is None or W[j][k] > W[j][k_best]: 154 | k_best = k 155 | gene.append((PRIMITIVES[k_best], j)) 156 | start = end 157 | n += 1 158 | return gene 159 | 160 | gene_normal = _parse(F.softmax(self.alphas_normal, dim=-1).data.cpu().numpy()) 161 | gene_reduce = _parse(F.softmax(self.alphas_reduce, dim=-1).data.cpu().numpy()) 162 | 163 | concat = range(2+self._steps-self._multiplier, self._steps+2) 164 | genotype = Genotype( 165 | normal=gene_normal, normal_concat=concat, 166 | reduce=gene_reduce, reduce_concat=concat 167 | ) 168 | return genotype 169 | 170 | -------------------------------------------------------------------------------- /advrush/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from operations import * 4 | from torch.autograd import Variable 5 | from utils import drop_path 6 | 7 | 8 | class Cell(nn.Module): 9 | 10 | def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev): 11 | super(Cell, self).__init__() 12 | print(C_prev_prev, C_prev, C) 13 | 14 | if reduction_prev: 15 | self.preprocess0 = FactorizedReduce(C_prev_prev, C) 16 | else: 17 | self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0) 18 | self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0) 19 | 20 | if reduction: 21 | op_names, indices = zip(*genotype.reduce) 22 | concat = genotype.reduce_concat 23 | else: 24 | op_names, indices = zip(*genotype.normal) 25 | concat = genotype.normal_concat 26 | self._compile(C, op_names, indices, concat, reduction) 27 | 28 | def _compile(self, C, op_names, indices, concat, reduction): 29 | assert len(op_names) == len(indices) 30 | self._steps = len(op_names) // 2 31 | self._concat = concat 32 | self.multiplier = len(concat) 33 | 34 | self._ops = nn.ModuleList() 35 | for name, index in zip(op_names, indices): 36 | stride = 2 if reduction and index < 2 else 1 37 | op = OPS[name](C, stride, True) 38 | self._ops += [op] 39 | self._indices = indices 40 | 41 | def forward(self, s0, s1, drop_prob): 42 | s0 = self.preprocess0(s0) 43 | s1 = self.preprocess1(s1) 44 | 45 | states = [s0, s1] 46 | for i in range(self._steps): 47 | h1 = states[self._indices[2*i]] 48 | h2 = states[self._indices[2*i+1]] 49 | op1 = self._ops[2*i] 50 | op2 = self._ops[2*i+1] 51 | h1 = op1(h1) 52 | h2 = op2(h2) 53 | if self.training and drop_prob > 0.: 54 | if not isinstance(op1, Identity): 55 | h1 = drop_path(h1, drop_prob) 56 | if not isinstance(op2, Identity): 57 | h2 = drop_path(h2, drop_prob) 58 | s = h1 + h2 59 | states += [s] 60 | return torch.cat([states[i] for i in self._concat], dim=1) 61 | 62 | 63 | class AuxiliaryHeadCIFAR(nn.Module): 64 | 65 | def __init__(self, C, num_classes): 66 | """assuming input size 8x8""" 67 | super(AuxiliaryHeadCIFAR, self).__init__() 68 | self.features = nn.Sequential( 69 | nn.ReLU(inplace=True), 70 | nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), # image size = 2 x 2 71 | nn.Conv2d(C, 128, 1, bias=False), 72 | nn.BatchNorm2d(128), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(128, 768, 2, bias=False), 75 | nn.BatchNorm2d(768), 76 | nn.ReLU(inplace=True) 77 | ) 78 | self.classifier = nn.Linear(768, num_classes) 79 | 80 | def forward(self, x): 81 | x = self.features(x) 82 | x = self.classifier(x.view(x.size(0),-1)) 83 | return x 84 | 85 | 86 | class AuxiliaryHeadImageNet(nn.Module): 87 | 88 | def __init__(self, C, num_classes): 89 | """assuming input size 14x14""" 90 | super(AuxiliaryHeadImageNet, self).__init__() 91 | self.features = nn.Sequential( 92 | nn.ReLU(inplace=True), 93 | nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False), 94 | nn.Conv2d(C, 128, 1, bias=False), 95 | nn.BatchNorm2d(128), 96 | nn.ReLU(inplace=True), 97 | nn.Conv2d(128, 768, 2, bias=False), 98 | # NOTE: This batchnorm was omitted in my earlier implementation due to a typo. 99 | # Commenting it out for consistency with the experiments in the paper. 100 | # nn.BatchNorm2d(768), 101 | nn.ReLU(inplace=True) 102 | ) 103 | self.classifier = nn.Linear(768, num_classes) 104 | 105 | def forward(self, x): 106 | x = self.features(x) 107 | x = self.classifier(x.view(x.size(0),-1)) 108 | return x 109 | 110 | 111 | class NetworkCIFAR(nn.Module): 112 | 113 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 114 | super(NetworkCIFAR, self).__init__() 115 | self._layers = layers 116 | self._auxiliary = auxiliary 117 | 118 | stem_multiplier = 3 119 | C_curr = stem_multiplier*C 120 | self.stem = nn.Sequential( 121 | nn.Conv2d(3, C_curr, 3, padding=1, bias=False), 122 | nn.BatchNorm2d(C_curr) 123 | ) 124 | 125 | C_prev_prev, C_prev, C_curr = C_curr, C_curr, C 126 | self.cells = nn.ModuleList() 127 | reduction_prev = False 128 | for i in range(layers): 129 | if i in [layers//3, 2*layers//3]: 130 | C_curr *= 2 131 | reduction = True 132 | else: 133 | reduction = False 134 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 135 | reduction_prev = reduction 136 | self.cells += [cell] 137 | C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr 138 | if i == 2*layers//3: 139 | C_to_auxiliary = C_prev 140 | 141 | if auxiliary: 142 | self.auxiliary_head = AuxiliaryHeadCIFAR(C_to_auxiliary, num_classes) 143 | self.global_pooling = nn.AdaptiveAvgPool2d(1) 144 | self.classifier = nn.Linear(C_prev, num_classes) 145 | 146 | def forward(self, input): 147 | logits_aux = None 148 | s0 = s1 = self.stem(input) 149 | for i, cell in enumerate(self.cells): 150 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 151 | if i == 2*self._layers//3: 152 | if self._auxiliary and self.training: 153 | logits_aux = self.auxiliary_head(s1) 154 | out = self.global_pooling(s1) 155 | logits = self.classifier(out.view(out.size(0),-1)) 156 | return logits, logits_aux 157 | 158 | 159 | class NetworkImageNet(nn.Module): 160 | 161 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 162 | super(NetworkImageNet, self).__init__() 163 | self._layers = layers 164 | self._auxiliary = auxiliary 165 | 166 | self.stem0 = nn.Sequential( 167 | nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False), 168 | nn.BatchNorm2d(C // 2), 169 | nn.ReLU(inplace=True), 170 | nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False), 171 | nn.BatchNorm2d(C), 172 | ) 173 | 174 | self.stem1 = nn.Sequential( 175 | nn.ReLU(inplace=True), 176 | nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False), 177 | nn.BatchNorm2d(C), 178 | ) 179 | 180 | C_prev_prev, C_prev, C_curr = C, C, C 181 | 182 | self.cells = nn.ModuleList() 183 | reduction_prev = True 184 | for i in range(layers): 185 | if i in [layers // 3, 2 * layers // 3]: 186 | C_curr *= 2 187 | reduction = True 188 | else: 189 | reduction = False 190 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 191 | reduction_prev = reduction 192 | self.cells += [cell] 193 | C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr 194 | if i == 2 * layers // 3: 195 | C_to_auxiliary = C_prev 196 | 197 | if auxiliary: 198 | self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes) 199 | self.global_pooling = nn.AvgPool2d(7) 200 | self.classifier = nn.Linear(C_prev, num_classes) 201 | 202 | def forward(self, input): 203 | logits_aux = None 204 | s0 = self.stem0(input) 205 | s1 = self.stem1(s0) 206 | for i, cell in enumerate(self.cells): 207 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 208 | if i == 2 * self._layers // 3: 209 | if self._auxiliary and self.training: 210 | logits_aux = self.auxiliary_head(s1) 211 | out = self.global_pooling(s1) 212 | logits = self.classifier(out.view(out.size(0), -1)) 213 | return logits, logits_aux 214 | 215 | -------------------------------------------------------------------------------- /eval/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from operations import * 4 | from torch.autograd import Variable 5 | from utils import drop_path 6 | 7 | 8 | class Cell(nn.Module): 9 | 10 | def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev): 11 | super(Cell, self).__init__() 12 | print(C_prev_prev, C_prev, C) 13 | 14 | if reduction_prev: 15 | self.preprocess0 = FactorizedReduce(C_prev_prev, C) 16 | else: 17 | self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0) 18 | self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0) 19 | 20 | if reduction: 21 | op_names, indices = zip(*genotype.reduce) 22 | concat = genotype.reduce_concat 23 | else: 24 | op_names, indices = zip(*genotype.normal) 25 | concat = genotype.normal_concat 26 | self._compile(C, op_names, indices, concat, reduction) 27 | 28 | def _compile(self, C, op_names, indices, concat, reduction): 29 | assert len(op_names) == len(indices) 30 | self._steps = len(op_names) // 2 31 | self._concat = concat 32 | self.multiplier = len(concat) 33 | 34 | self._ops = nn.ModuleList() 35 | for name, index in zip(op_names, indices): 36 | stride = 2 if reduction and index < 2 else 1 37 | op = OPS[name](C, stride, True) 38 | self._ops += [op] 39 | self._indices = indices 40 | 41 | def forward(self, s0, s1, drop_prob): 42 | s0 = self.preprocess0(s0) 43 | s1 = self.preprocess1(s1) 44 | 45 | states = [s0, s1] 46 | for i in range(self._steps): 47 | h1 = states[self._indices[2*i]] 48 | h2 = states[self._indices[2*i+1]] 49 | op1 = self._ops[2*i] 50 | op2 = self._ops[2*i+1] 51 | h1 = op1(h1) 52 | h2 = op2(h2) 53 | if self.training and drop_prob > 0.: 54 | if not isinstance(op1, Identity): 55 | h1 = drop_path(h1, drop_prob) 56 | if not isinstance(op2, Identity): 57 | h2 = drop_path(h2, drop_prob) 58 | s = h1 + h2 59 | states += [s] 60 | return torch.cat([states[i] for i in self._concat], dim=1) 61 | 62 | 63 | class AuxiliaryHeadCIFAR(nn.Module): 64 | 65 | def __init__(self, C, num_classes): 66 | """assuming input size 8x8""" 67 | super(AuxiliaryHeadCIFAR, self).__init__() 68 | self.features = nn.Sequential( 69 | nn.ReLU(inplace=True), 70 | nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False), # image size = 2 x 2 71 | nn.Conv2d(C, 128, 1, bias=False), 72 | nn.BatchNorm2d(128), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(128, 768, 2, bias=False), 75 | nn.BatchNorm2d(768), 76 | nn.ReLU(inplace=True) 77 | ) 78 | self.classifier = nn.Linear(768, num_classes) 79 | 80 | def forward(self, x): 81 | x = self.features(x) 82 | x = self.classifier(x.view(x.size(0),-1)) 83 | return x 84 | 85 | 86 | class AuxiliaryHeadImageNet(nn.Module): 87 | 88 | def __init__(self, C, num_classes): 89 | """assuming input size 14x14""" 90 | super(AuxiliaryHeadImageNet, self).__init__() 91 | self.features = nn.Sequential( 92 | nn.ReLU(inplace=True), 93 | nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False), 94 | nn.Conv2d(C, 128, 1, bias=False), 95 | nn.BatchNorm2d(128), 96 | nn.ReLU(inplace=True), 97 | nn.Conv2d(128, 768, 2, bias=False), 98 | # NOTE: This batchnorm was omitted in my earlier implementation due to a typo. 99 | # Commenting it out for consistency with the experiments in the paper. 100 | # nn.BatchNorm2d(768), 101 | nn.ReLU(inplace=True) 102 | ) 103 | self.classifier = nn.Linear(768, num_classes) 104 | 105 | def forward(self, x): 106 | x = self.features(x) 107 | x = self.classifier(x.view(x.size(0),-1)) 108 | return x 109 | 110 | 111 | class NetworkCIFAR(nn.Module): 112 | 113 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 114 | super(NetworkCIFAR, self).__init__() 115 | self._layers = layers 116 | self._auxiliary = auxiliary 117 | 118 | stem_multiplier = 3 119 | C_curr = stem_multiplier*C 120 | self.stem = nn.Sequential( 121 | nn.Conv2d(3, C_curr, 3, padding=1, bias=False), 122 | nn.BatchNorm2d(C_curr) 123 | ) 124 | 125 | C_prev_prev, C_prev, C_curr = C_curr, C_curr, C 126 | self.cells = nn.ModuleList() 127 | reduction_prev = False 128 | for i in range(layers): 129 | if i in [layers//3, 2*layers//3]: 130 | C_curr *= 2 131 | reduction = True 132 | else: 133 | reduction = False 134 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 135 | reduction_prev = reduction 136 | self.cells += [cell] 137 | C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr 138 | if i == 2*layers//3: 139 | C_to_auxiliary = C_prev 140 | 141 | if auxiliary: 142 | self.auxiliary_head = AuxiliaryHeadCIFAR(C_to_auxiliary, num_classes) 143 | self.global_pooling = nn.AdaptiveAvgPool2d(1) 144 | self.classifier = nn.Linear(C_prev, num_classes) 145 | 146 | def forward(self, input): 147 | logits_aux = None 148 | s0 = s1 = self.stem(input) 149 | for i, cell in enumerate(self.cells): 150 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 151 | if i == 2*self._layers//3: 152 | if self._auxiliary and self.training: 153 | logits_aux = self.auxiliary_head(s1) 154 | out = self.global_pooling(s1) 155 | logits = self.classifier(out.view(out.size(0),-1)) 156 | return logits#, logits_aux 157 | 158 | 159 | class NetworkImageNet(nn.Module): 160 | 161 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 162 | super(NetworkImageNet, self).__init__() 163 | self._layers = layers 164 | self._auxiliary = auxiliary 165 | 166 | self.stem0 = nn.Sequential( 167 | nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False), 168 | nn.BatchNorm2d(C // 2), 169 | nn.ReLU(inplace=True), 170 | nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False), 171 | nn.BatchNorm2d(C), 172 | ) 173 | 174 | self.stem1 = nn.Sequential( 175 | nn.ReLU(inplace=True), 176 | nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False), 177 | nn.BatchNorm2d(C), 178 | ) 179 | 180 | C_prev_prev, C_prev, C_curr = C, C, C 181 | 182 | self.cells = nn.ModuleList() 183 | reduction_prev = True 184 | for i in range(layers): 185 | if i in [layers // 3, 2 * layers // 3]: 186 | C_curr *= 2 187 | reduction = True 188 | else: 189 | reduction = False 190 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 191 | reduction_prev = reduction 192 | self.cells += [cell] 193 | C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr 194 | if i == 2 * layers // 3: 195 | C_to_auxiliary = C_prev 196 | 197 | if auxiliary: 198 | self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes) 199 | self.global_pooling = nn.AvgPool2d(7) 200 | self.classifier = nn.Linear(C_prev, num_classes) 201 | 202 | def forward(self, input): 203 | logits_aux = None 204 | s0 = self.stem0(input) 205 | s1 = self.stem1(s0) 206 | for i, cell in enumerate(self.cells): 207 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 208 | if i == 2 * self._layers // 3: 209 | if self._auxiliary and self.training: 210 | logits_aux = self.auxiliary_head(s1) 211 | out = self.global_pooling(s1) 212 | logits = self.classifier(out.view(out.size(0), -1)) 213 | return logits, logits_aux 214 | 215 | -------------------------------------------------------------------------------- /advrush/hessianflow/optimizer/absa.py: -------------------------------------------------------------------------------- 1 | #* 2 | # @file ABSA training driver based on arxiv:1810.01021 3 | # Copyright (c) Zhewei Yao, Amir Gholami 4 | # All rights reserved. 5 | # This file is part of HessianFlow library. 6 | # 7 | # HessianFlow is free software: you can redistribute it and/or modify 8 | # it under the terms of the GNU General Public License as published by 9 | # the Free Software Foundation, either version 3 of the License, or 10 | # (at your option) any later version. 11 | # 12 | # HessianFlow is distributed in the hope that it will be useful, 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 | # GNU General Public License for more details. 16 | # 17 | # You should have received a copy of the GNU General Public License 18 | # along with HessianFlow. If not, see . 19 | #* 20 | 21 | from __future__ import print_function 22 | import numpy as np 23 | import torch 24 | import torch.nn as nn 25 | import torch.nn.functional as F 26 | import torch.optim as optim 27 | from torchvision import datasets, transforms 28 | from torch.autograd import Variable 29 | 30 | from .progressbar import progress_bar 31 | from .optm_utils import fgsm, exp_lr_scheduler, test 32 | 33 | import hessianflow 34 | from hessianflow.utils import get_params_grad, group_add 35 | from hessianflow.eigen import get_eigen 36 | from copy import deepcopy 37 | 38 | 39 | def get_lr(opt): 40 | """ 41 | get the learning rate 42 | """ 43 | for param_group in opt.param_groups: 44 | return param_group['lr'] 45 | 46 | def copy_update(opt, grad): 47 | """ 48 | used for optimizer update 49 | """ 50 | for group in opt.param_groups: 51 | weight_decay = group['weight_decay'] 52 | momentum = group['momentum'] 53 | dampening = group['dampening'] 54 | nesterov = group['nesterov'] 55 | 56 | for i,p in enumerate(group['params']): 57 | d_p = grad[i] 58 | if weight_decay != 0: 59 | d_p.add_(weight_decay, p.data) 60 | if momentum != 0: 61 | param_state = opt.state[p] 62 | if 'momentum_buffer' not in param_state: 63 | buf = param_state['momentum_buffer'] = torch.zeros_like(p.data) 64 | buf.mul_(momentum).add_(d_p) 65 | else: 66 | buf = param_state['momentum_buffer'] 67 | buf.mul_(momentum).add_(1 - dampening, d_p) 68 | if nesterov: 69 | d_p = d_p.add(momentum, buf) 70 | else: 71 | d_p = buf 72 | p.data.add_(-group['lr'], d_p) 73 | 74 | def absa(model, train_loader, hessian_loader, test_loader, criterion, optimizer, epochs, lr_decay_epoch, lr_decay_ratio, batch_size = 128, 75 | max_large_ratio = 1, adv_ratio = 0., eps = 0., duration = True, cuda = True, print_flag = False): 76 | """ 77 | adaptive batch size with adversarial training 78 | """ 79 | 80 | # initilization 81 | large_grad = [] 82 | inner_loop = 0 83 | large_ratio = 1 84 | max_eig = None 85 | decay_ratio = 2 86 | flag = True 87 | if max_large_ratio == 1: 88 | flag = False 89 | 90 | data_eigen = None 91 | target_eigen = None 92 | flag_data = True 93 | if duration == True: 94 | duration = 10 95 | else: 96 | duration = None 97 | 98 | cur_duration = 0 99 | num_updates = 0 100 | initial_lr = get_lr(optimizer) 101 | 102 | 103 | for epoch in range(1, epochs + 1): 104 | print('\nCurrent Epoch: %d' % epoch) 105 | print('\nTraining') 106 | train_loss = 0. 107 | total_num = 0 108 | correct = 0 109 | 110 | for batch_idx, (data, target) in enumerate(train_loader): 111 | if data.size()[0] < batch_size: 112 | continue 113 | # gather input and target for large batch training 114 | inner_loop += 1 115 | 116 | # save the data for eigen-computation 117 | if flag_data: 118 | data_eigen = data 119 | target_eigen = target 120 | #flag_data = False 121 | # get small model update 122 | # use adversarial training 123 | if adv_ratio > 1. / batch_size: 124 | adv_r = max(int(batch_size * adv_ratio), 1) 125 | model.eval() # set flag so that Batch Norm statistics would not be polluted with fgsm 126 | adv_data = fgsm(model, data[:adv_r], target[:adv_r], eps, cuda) 127 | model.train() # set flag to train for Batch Norm 128 | adv_data = torch.cat([adv_data, data[adv_r:]]) 129 | else: 130 | model.train() 131 | adv_data = data 132 | 133 | optimizer.zero_grad() 134 | if cuda: 135 | adv_data, target = adv_data.cuda(), target.cuda() 136 | 137 | output = model(adv_data) 138 | loss = criterion(output, target) / large_ratio 139 | total_num +=target.size(0) 140 | _, predicted = output.max(1) 141 | correct += predicted.eq(target).sum().item() 142 | 143 | train_loss += loss.item() * target.size(0) * float(large_ratio) 144 | loss.backward() 145 | _, small_grad= get_params_grad(model) 146 | if not large_grad: 147 | large_grad = deepcopy(small_grad) #[small_grad_ + 0. for small_grad_ in small_grad] 148 | else: 149 | large_grad = group_add(large_grad, small_grad) 150 | 151 | 152 | if inner_loop % large_ratio == 0: 153 | num_updates += 1 154 | copy_update(optimizer, large_grad) # todo: see if we can use deep copy to set optimizer.grad = large_grad 155 | large_grad = [] 156 | inner_loop = 0 157 | optimizer.zero_grad() 158 | 159 | progress_bar(batch_idx, len(train_loader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' 160 | % (train_loss / total_num, 161 | 100. * correct/total_num, correct, total_num)) 162 | 163 | ## compute eigenvalues and update large_ratio, adv_ratio etc 164 | if flag: 165 | for data, target in hessian_loader: 166 | data_eigen = data 167 | target_eigen = target 168 | break 169 | eig, _ = get_eigen(model, data_eigen, target_eigen, criterion, cuda = True, maxIter = 10, tol = 1e-2) 170 | cur_duration += 1 171 | 172 | if max_eig == None: 173 | max_eig = eig 174 | else: 175 | if eig <= max_eig/decay_ratio: 176 | # ensure the learning rate is not too crazy, espeacially for model without batch normalization 177 | max_eig = eig 178 | prev_ratio = large_ratio 179 | large_ratio = int(large_ratio*decay_ratio) 180 | adv_ratio /= decay_ratio 181 | if large_ratio >= max_large_ratio: 182 | large_ratio = max_large_ratio 183 | adv_ratio = 0. 184 | flag = False 185 | cur_duration = 0 186 | optimizer = exp_lr_scheduler(optimizer, decay_ratio = large_ratio/prev_ratio) 187 | if duration != None: # if it is around a quadratic bowl, increase batch size 188 | # ensure the learning rate is not too crazy, espeacially for model without batch normalization 189 | if cur_duration - duration > -0.5: 190 | prev_ratio = large_ratio 191 | large_ratio = int(large_ratio*decay_ratio) 192 | adv_ratio /= decay_ratio 193 | if large_ratio >= max_large_ratio: 194 | large_ratio = max_large_ratio 195 | adv_ratio = 0. 196 | flag = False 197 | cur_duration = 0 198 | optimizer = exp_lr_scheduler(optimizer, decay_ratio = large_ratio/prev_ratio) 199 | 200 | 201 | if epoch in lr_decay_epoch: 202 | optimizer = exp_lr_scheduler(optimizer, decay_ratio = lr_decay_ratio) 203 | 204 | if epoch >= epochs // 2: 205 | adv_ratio = 0. 206 | 207 | if print_flag: 208 | #print('\n Batch size %d' % (batch_size*large_ratio)) 209 | print('\n Eig %f Max Eig %f Batch size %d' % (eig, max_eig, batch_size * large_ratio)) 210 | 211 | test(model, test_loader) 212 | 213 | return model, num_updates 214 | -------------------------------------------------------------------------------- /advrush/adv_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import glob 5 | import numpy as np 6 | import torch 7 | import utils 8 | import logging 9 | import argparse 10 | import torch.nn as nn 11 | import genotypes 12 | import torch.utils 13 | import torchvision.datasets as dset 14 | import torch.backends.cudnn as cudnn 15 | 16 | from torch.autograd import Variable 17 | from model import NetworkCIFAR as Network 18 | from trades import trades_loss, madry_loss 19 | 20 | parser = argparse.ArgumentParser("cifar") 21 | parser.add_argument('--data', type=str, default='../data', help='location of the data corpus') 22 | parser.add_argument('--batch_size', type=int, default=64, help='batch size') #128 23 | parser.add_argument('--learning_rate', type=float, default=0.1, help='init learning rate') 24 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum') 25 | parser.add_argument('--weight_decay', type=float, default=1e-4, help='weight decay') 26 | parser.add_argument('--report_freq', type=float, default=50, help='report frequency') 27 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 28 | parser.add_argument('--epochs', type=int, default=200, help='num of training epochs') 29 | parser.add_argument('--epsilon', type=float, default=0.031, help='perturbation') 30 | parser.add_argument('--num_steps', type=int, default=7, help='perturb number of steps') 31 | parser.add_argument('--step_size', type=float, default=0.01, help='perturb step size') 32 | parser.add_argument('--beta', type=float, default=6.0, help='regularization in TRADES') 33 | parser.add_argument('--adv_loss', type=str, default='pgd', help='experiment name') 34 | parser.add_argument('--init_channels', type=int, default=36, help='num of init channels') 35 | parser.add_argument('--layers', type=int, default=20, help='total number of layers') 36 | parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model') 37 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 38 | parser.add_argument('--auxiliary_weight', type=float, default=0.4, help='weight for auxiliary loss') 39 | parser.add_argument('--cutout', action='store_true', default=False, help='use cutout') 40 | parser.add_argument('--cutout_length', type=int, default=16, help='cutout length') 41 | parser.add_argument('--drop_path_prob', type=float, default=0.0, help='drop path probability') 42 | parser.add_argument('--save', type=str, default='EXP', help='experiment name') 43 | parser.add_argument('--seed', type=int, default=0, help='random seed') 44 | parser.add_argument('--arch', type=str, default='ADVRUSH', help='which architecture to use') 45 | parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') 46 | 47 | args = parser.parse_args() 48 | 49 | args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) 50 | utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) 51 | 52 | log_format = '%(asctime)s %(message)s' 53 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 54 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 55 | fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) 56 | fh.setFormatter(logging.Formatter(log_format)) 57 | logging.getLogger().addHandler(fh) 58 | 59 | CIFAR_CLASSES = 10 60 | 61 | 62 | def main(): 63 | if not torch.cuda.is_available(): 64 | logging.info('no gpu device available') 65 | sys.exit(1) 66 | 67 | np.random.seed(args.seed) 68 | torch.cuda.set_device(args.gpu) 69 | cudnn.benchmark = True 70 | torch.manual_seed(args.seed) 71 | cudnn.enabled=True 72 | torch.cuda.manual_seed(args.seed) 73 | logging.info('gpu device = %d' % args.gpu) 74 | logging.info("args = %s", args) 75 | 76 | genotype = eval("genotypes.%s" % args.arch) 77 | model = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype) 78 | model = model.cuda() 79 | 80 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 81 | 82 | criterion = nn.CrossEntropyLoss() 83 | criterion = criterion.cuda() 84 | optimizer = torch.optim.SGD( 85 | model.parameters(), 86 | args.learning_rate, 87 | momentum=args.momentum, 88 | weight_decay=args.weight_decay 89 | ) 90 | 91 | train_transform, valid_transform = utils._data_transforms_cifar10_eval(args) 92 | train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform) 93 | valid_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=valid_transform) 94 | 95 | train_queue = torch.utils.data.DataLoader( 96 | train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=2) 97 | 98 | valid_queue = torch.utils.data.DataLoader( 99 | valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2) 100 | 101 | best_acc = 0.0 102 | for epoch in range(args.epochs): 103 | adjust_learning_rate(optimizer, epoch) 104 | model.drop_path_prob = args.drop_path_prob * epoch / args.epochs 105 | 106 | train_acc, train_obj = train(train_queue, model, criterion, optimizer) 107 | logging.info('epoch %d train_acc %f', epoch, train_acc) 108 | 109 | valid_acc, valid_obj = infer(valid_queue, model, criterion) 110 | if valid_acc > best_acc: 111 | best_acc = valid_acc 112 | utils.save_checkpoint({ 113 | 'epoch': epoch +1, 114 | 'state_dict': model.state_dict(), 115 | 'optimizer': optimizer.state_dict(), 116 | }, is_best=True, save=args.save, epoch=epoch) 117 | logging.info('epoch %d valid_acc %f, best_acc %f', epoch, valid_acc, best_acc) 118 | 119 | utils.save(model, os.path.join(args.save, 'weights.pt')) 120 | utils.save_checkpoint({ 121 | 'epoch': epoch + 1, 122 | 'state_dict': model.state_dict(), 123 | 'optimizer': optimizer.state_dict(), 124 | }, is_best=False, save=args.save, epoch=epoch) 125 | 126 | 127 | def train(train_queue, model, criterion, optimizer): 128 | objs = utils.AvgrageMeter() 129 | top1 = utils.AvgrageMeter() 130 | top5 = utils.AvgrageMeter() 131 | model.train() 132 | 133 | for step, (input, target) in enumerate(train_queue): 134 | input = Variable(input).cuda(non_blocking=True) 135 | target = Variable(target).cuda(non_blocking=True) 136 | 137 | optimizer.zero_grad() 138 | logits, logits_aux = model(input) 139 | if args.adv_loss == 'pgd': 140 | loss = madry_loss( 141 | model, 142 | input, 143 | target, 144 | optimizer, 145 | step_size = args.step_size, 146 | epsilon = args.epsilon, 147 | perturb_steps = args.num_steps) 148 | elif args.adv_loss == 'trades': 149 | loss = trades_loss(model, 150 | input, 151 | target, 152 | optimizer, 153 | step_size=args.step_size, 154 | epsilon=args.epsilon, 155 | perturb_steps=args.num_steps, 156 | beta=args.beta, 157 | distance='l_inf') 158 | #loss = criterion(logits, target) 159 | if args.auxiliary: 160 | loss_aux = criterion(logits_aux, target) 161 | loss += args.auxiliary_weight*loss_aux 162 | loss.backward() 163 | nn.utils.clip_grad_norm(model.parameters(), args.grad_clip) 164 | optimizer.step() 165 | 166 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 167 | n = input.size(0) 168 | objs.update(loss.data.item(), n) 169 | top1.update(prec1.data.item(), n) 170 | top5.update(prec5.data.item(), n) 171 | 172 | if step % args.report_freq == 0: 173 | logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 174 | 175 | return top1.avg, objs.avg 176 | 177 | 178 | def infer(valid_queue, model, criterion): 179 | objs = utils.AvgrageMeter() 180 | top1 = utils.AvgrageMeter() 181 | top5 = utils.AvgrageMeter() 182 | model.eval() 183 | 184 | with torch.no_grad(): 185 | for step, (input, target) in enumerate(valid_queue): 186 | input = Variable(input, requires_grad=False).cuda(non_blocking=True) 187 | target = Variable(target, requires_grad=False).cuda(non_blocking=True) 188 | 189 | logits, _ = model(input) 190 | loss = criterion(logits, target) 191 | 192 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 193 | n = input.size(0) 194 | objs.update(loss.data.item(), n) 195 | top1.update(prec1.data.item(), n) 196 | top5.update(prec5.data.item(), n) 197 | 198 | if step % args.report_freq == 0: 199 | logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 200 | 201 | return top1.avg, objs.avg 202 | 203 | def adjust_learning_rate(optimizer, epoch): 204 | """decrease the learning rate""" 205 | lr = args.learning_rate 206 | if epoch >= 99: 207 | lr = args.learning_rate * 0.1 208 | if epoch >= 149: 209 | lr = args.learning_rate * 0.01 210 | for param_group in optimizer.param_groups: 211 | param_group['lr'] = lr 212 | 213 | if __name__ == '__main__': 214 | main() 215 | 216 | -------------------------------------------------------------------------------- /eval/pgd_attack.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torchvision 8 | from torch.autograd import Variable 9 | import torch.optim as optim 10 | from torchvision import datasets, transforms 11 | from model import NetworkCIFAR as Network 12 | import genotypes 13 | 14 | parser = argparse.ArgumentParser(description='PyTorch CIFAR PGD Attack Evaluation') 15 | parser.add_argument('--test-batch-size', type=int, default=25, metavar='N', 16 | help='input batch size for testing (default: 200)') 17 | parser.add_argument('--no-cuda', action='store_true', default=False, 18 | help='disables CUDA training') 19 | parser.add_argument('--epsilon', default=0.031, 20 | help='perturbation') 21 | parser.add_argument('--num-steps', type=int, default=20, 22 | help='perturb number of steps') 23 | parser.add_argument('--step-size', default=0.01, 24 | help='perturb step size') 25 | parser.add_argument('--random', 26 | default=True, 27 | help='random initialization for PGD') 28 | parser.add_argument('--white-box-attack', default=False, 29 | help='whether perform white-box attack') 30 | parser.add_argument('--arch', type=str, default='ADVRUSH', help='which architecture to use') 31 | parser.add_argument('--init_channels', type=int, default=36, help='num of init channels') 32 | parser.add_argument('--layers', type=int, default=20, help='total number of layers') 33 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 34 | parser.add_argument('--drop_path_prob', type=float, default=0.0, help='drop path probability') 35 | parser.add_argument('--target_arch', type=str, default='ADVRUSH', help='which architecture to use') 36 | parser.add_argument('--source_arch', type=str, default='ADVRUSH', help='which architecture to use') 37 | parser.add_argument('--target_checkpoint', type=str, default='./', help='which architecture to use') 38 | parser.add_argument('--source_checkpoint', type=str, default='./', help='which architecture to use') 39 | parser.add_argument('--log_path', type=str, default='./', help='path to store log file') 40 | parser.add_argument('--checkpoint', type=str, default='./', help='which architecture to use') 41 | parser.add_argument('--data_type', type=str, default='cifar10', help='which dataset to use') 42 | 43 | args = parser.parse_args() 44 | 45 | # settings 46 | use_cuda = not args.no_cuda and torch.cuda.is_available() 47 | device = torch.device("cuda" if use_cuda else "cpu") 48 | kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {} 49 | 50 | # set up data loader 51 | if args.data_type == 'cifar10': 52 | transform_list = [transforms.ToTensor()] 53 | transform_test = transforms.Compose(transform_list) 54 | testset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test) 55 | elif args.data_type == 'cifar100': 56 | transform_list = [transforms.ToTensor()] 57 | transform_test = transforms.Compose(transform_list) 58 | testset = torchvision.datasets.CIFAR100(root='../data', train=False, download=True, transform=transform_test) 59 | elif args.data_type == 'svhn': 60 | transform_list = [transforms.ToTensor()] 61 | transform_test = transforms.Compose(transform_list) 62 | testset = torchvision.datasets.SVHN(root='../data', split='test', download=True, transform=transform_test) 63 | 64 | test_loader = torch.utils.data.DataLoader(testset, batch_size=args.test_batch_size, shuffle=False, **kwargs) 65 | 66 | 67 | def _pgd_whitebox(model, 68 | X, 69 | y, 70 | epsilon=args.epsilon, 71 | num_steps=args.num_steps, 72 | step_size=args.step_size): 73 | out = model(X) 74 | err = (out.data.max(1)[1] != y.data).float().sum() 75 | X_pgd = Variable(X.data, requires_grad=True) 76 | if args.random: 77 | random_noise = torch.FloatTensor(*X_pgd.shape).uniform_(-epsilon, epsilon).to(device) 78 | X_pgd = Variable(X_pgd.data + random_noise, requires_grad=True) 79 | 80 | for _ in range(num_steps): 81 | opt = optim.SGD([X_pgd], lr=1e-3) 82 | opt.zero_grad() 83 | 84 | with torch.enable_grad(): 85 | loss = nn.CrossEntropyLoss()(model(X_pgd), y) 86 | loss.backward() 87 | eta = step_size * X_pgd.grad.data.sign() 88 | X_pgd = Variable(X_pgd.data + eta, requires_grad=True) 89 | eta = torch.clamp(X_pgd.data - X.data, -epsilon, epsilon) 90 | X_pgd = Variable(X.data + eta, requires_grad=True) 91 | X_pgd = Variable(torch.clamp(X_pgd, 0, 1.0), requires_grad=True) 92 | err_pgd = (model(X_pgd).data.max(1)[1] != y.data).float().sum() 93 | print('err pgd (white-box): ', err_pgd) 94 | return err, err_pgd 95 | 96 | 97 | def _pgd_blackbox(model_target, 98 | model_source, 99 | X, 100 | y, 101 | epsilon=args.epsilon, 102 | num_steps=args.num_steps, 103 | step_size=args.step_size): 104 | out = model_target(X) 105 | err = (out.data.max(1)[1] != y.data).float().sum() 106 | X_pgd = Variable(X.data, requires_grad=True) 107 | if args.random: 108 | random_noise = torch.FloatTensor(*X_pgd.shape).uniform_(-epsilon, epsilon).to(device) 109 | X_pgd = Variable(X_pgd.data + random_noise, requires_grad=True) 110 | 111 | for _ in range(num_steps): 112 | opt = optim.SGD([X_pgd], lr=1e-3) 113 | opt.zero_grad() 114 | with torch.enable_grad(): 115 | loss = nn.CrossEntropyLoss()(model_source(X_pgd), y) 116 | loss.backward() 117 | eta = step_size * X_pgd.grad.data.sign() 118 | X_pgd = Variable(X_pgd.data + eta, requires_grad=True) 119 | eta = torch.clamp(X_pgd.data - X.data, -epsilon, epsilon) 120 | X_pgd = Variable(X.data + eta, requires_grad=True) 121 | X_pgd = Variable(torch.clamp(X_pgd, 0, 1.0), requires_grad=True) 122 | 123 | err_pgd = (model_target(X_pgd).data.max(1)[1] != y.data).float().sum() 124 | print('err pgd black-box: ', err_pgd) 125 | return err, err_pgd 126 | 127 | 128 | def eval_adv_test_whitebox(model, device, test_loader): 129 | """ 130 | evaluate model by white-box attack 131 | """ 132 | model.eval() 133 | robust_err_total = 0 134 | natural_err_total = 0 135 | 136 | for data, target in test_loader: 137 | data, target = data.to(device), target.to(device) 138 | # pgd attack 139 | X, y = Variable(data, requires_grad=True), Variable(target) 140 | err_natural, err_robust = _pgd_whitebox(model, X, y) 141 | robust_err_total += err_robust 142 | natural_err_total += err_natural 143 | print('natural_err_total: ', natural_err_total) 144 | print('robust_err_total: ', robust_err_total) 145 | 146 | 147 | def eval_adv_test_blackbox(model_target, model_source, device, test_loader): 148 | """ 149 | evaluate model by black-box attack 150 | """ 151 | model_target.eval() 152 | model_source.eval() 153 | robust_err_total = 0 154 | natural_err_total = 0 155 | 156 | for data, target in test_loader: 157 | data, target = data.to(device), target.to(device) 158 | # pgd attack 159 | X, y = Variable(data, requires_grad=True), Variable(target) 160 | err_natural, err_robust = _pgd_blackbox(model_target, model_source, X, y) 161 | robust_err_total += err_robust 162 | natural_err_total += err_natural 163 | print('natural_err_total: ', natural_err_total) 164 | print('robust_err_total: ', robust_err_total) 165 | 166 | def main(): 167 | 168 | 169 | if args.white_box_attack: 170 | # white-box attack 171 | print('pgd white-box attack') 172 | if args.data_type == 'cifar100': 173 | CIFAR_CLASSES = 100 174 | else: 175 | CIFAR_CLASSES = 10 176 | genotype = eval("genotypes.%s" % args.arch) 177 | model = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype) 178 | checkpoint = torch.load(args.checkpoint, map_location='cuda:0') 179 | model.load_state_dict(checkpoint['state_dict']) 180 | model.drop_path_prob = args.drop_path_prob 181 | model.cuda() 182 | eval_adv_test_whitebox(model, device, test_loader) 183 | 184 | else: 185 | # black-box attack 186 | CIFAR_CLASSES = 10 187 | print('pgd black-box attack') 188 | target_genotype = eval("genotypes.%s" % args.target_arch) 189 | source_genotype = eval("genotypes.%s" % args.source_arch) 190 | 191 | model_source = Network(args.init_channels,CIFAR_CLASSES, args.layers, args.auxiliary, source_genotype) 192 | source_checkpoint = torch.load(args.source_checkpoint) 193 | model_source.load_state_dict(source_checkpoint['state_dict']) 194 | model_source.drop_path_prob = args.drop_path_prob 195 | model_source.cuda() 196 | 197 | model_target = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, target_genotype) 198 | target_checkpoint = torch.load(args.target_checkpoint) 199 | model_target.load_state_dict(target_checkpoint['state_dict']) 200 | model_target.drop_path_prob = args.drop_path_prob 201 | model_target.cuda() 202 | 203 | eval_adv_test_blackbox(model_target, model_source, device, test_loader) 204 | 205 | 206 | if __name__ == '__main__': 207 | main() 208 | -------------------------------------------------------------------------------- /advrush/train_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import glob 5 | from random import shuffle 6 | import numpy as np 7 | import torch 8 | import utils 9 | import logging 10 | import argparse 11 | import torch.nn as nn 12 | import torch.utils 13 | import torch.nn.functional as F 14 | import torchvision.datasets as dset 15 | import torch.backends.cudnn as cudnn 16 | 17 | from torch.autograd import Variable 18 | from model_search import Network 19 | from architect import Architect 20 | from regularizer import * 21 | from tensorboardX import SummaryWriter 22 | import hessianflow as hf 23 | import hessianflow.optimizer.optm_utils as hf_optm_utils 24 | import hessianflow.optimizer.progressbar as hf_optm_pgb 25 | 26 | parser = argparse.ArgumentParser("cifar") 27 | parser.add_argument('--data', type=str, default='../data', help='location of the data corpus') 28 | parser.add_argument('--batch_size', type=int, default=64, help='batch size') 29 | parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate') 30 | parser.add_argument('--learning_rate_min', type=float, default=0.001, help='min learning rate') 31 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum') 32 | parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay') 33 | parser.add_argument('--report_freq', type=float, default=50, help='report frequency') 34 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 35 | parser.add_argument('--epochs', type=int, default=50, help='num of training epochs') 36 | parser.add_argument('--init_channels', type=int, default=16, help='num of init channels') 37 | parser.add_argument('--layers', type=int, default=8, help='total number of layers') 38 | parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model') 39 | parser.add_argument('--cutout', action='store_true', default=False, help='use cutout') 40 | parser.add_argument('--cutout_length', type=int, default=16, help='cutout length') 41 | parser.add_argument('--drop_path_prob', type=float, default=0.3, help='drop path probability') 42 | parser.add_argument('--save', type=str, default='EXP', help='experiment name') 43 | parser.add_argument('--seed', type=int, default=2, help='random seed') 44 | parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') 45 | parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data') 46 | parser.add_argument('--unrolled', action='store_true', default=False, help='use one-step unrolled validation loss') 47 | parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding') 48 | parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding') 49 | parser.add_argument('--a_gamma', type=float, default=0.01, help='a regularization strength') 50 | parser.add_argument('--w_gamma', type=float, default=1e-4, help='w regularization strength') 51 | parser.add_argument('--a_warmup_epochs', type=int, default=50, help='num of warm up epochs before using Hessian - architecture weight') 52 | parser.add_argument('--w_warmup_epochs', type=int, default=60, help='num of warm up epochs before using Hessian - model parameters') 53 | parser.add_argument('--loss_hessian', type=str, default='loss_cure', help='type of hessian loss to use, loss_eigen') 54 | 55 | args = parser.parse_args() 56 | 57 | args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) 58 | utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) 59 | 60 | log_format = '%(asctime)s %(message)s' 61 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 62 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 63 | fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) 64 | fh.setFormatter(logging.Formatter(log_format)) 65 | logging.getLogger().addHandler(fh) 66 | 67 | if not os.path.isdir(os.path.join(args.save, './log')): 68 | os.makedirs(os.path.join(args.save, './log')) 69 | tb_logger = SummaryWriter(os.path.join(args.save, './log')) 70 | 71 | CIFAR_CLASSES = 10 72 | 73 | 74 | def main(): 75 | if not torch.cuda.is_available(): 76 | logging.info('no gpu device available') 77 | sys.exit(1) 78 | 79 | np.random.seed(args.seed) 80 | torch.cuda.set_device(args.gpu) 81 | cudnn.benchmark = True 82 | torch.manual_seed(args.seed) 83 | cudnn.enabled=True 84 | torch.cuda.manual_seed(args.seed) 85 | logging.info('gpu device = %d' % args.gpu) 86 | logging.info("args = %s", args) 87 | 88 | criterion = nn.CrossEntropyLoss() 89 | criterion = criterion.cuda() 90 | model = Network(args.init_channels, CIFAR_CLASSES, args.layers, criterion) 91 | model = model.cuda() 92 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 93 | 94 | optimizer = torch.optim.SGD( 95 | model.parameters(), 96 | args.learning_rate, 97 | momentum=args.momentum, 98 | weight_decay=args.weight_decay) 99 | 100 | train_transform, valid_transform = utils._data_transforms_cifar10(args) 101 | train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform) 102 | 103 | train_queue = torch.utils.data.DataLoader( 104 | train_data, batch_size=args.batch_size, pin_memory=True, num_workers=2) 105 | 106 | valid_queue = torch.utils.data.DataLoader( 107 | train_data, batch_size=args.batch_size, pin_memory=True, num_workers=2) 108 | 109 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( 110 | optimizer, float(args.epochs), eta_min=args.learning_rate_min) 111 | 112 | architect = Architect(model, args) 113 | 114 | for epoch in range(args.epochs): 115 | scheduler.step() 116 | lr = scheduler.get_lr()[0] 117 | logging.info('epoch %d lr %e', epoch, lr) 118 | 119 | genotype = model.genotype() 120 | logging.info('genotype = %s', genotype) 121 | 122 | logging.info(F.softmax(model.alphas_normal, dim=-1)) 123 | logging.info(F.softmax(model.alphas_reduce, dim=-1)) 124 | h_all = np.array([0.0, 0.3, 0.6, 0.9, 1.2, 1.5]) 125 | h_all = np.append(h_all, [1.5]*int(args.epochs-6)) 126 | # training 127 | train_acc, train_obj, a_reg, w_reg = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, epoch, h=h_all[epoch]) 128 | logging.info('train_acc %f', train_acc) 129 | tb_logger.add_scalar('train_accuracy', train_acc, epoch) 130 | tb_logger.add_scalar('train_loss', train_obj, epoch) 131 | tb_logger.add_scalar('alpha_regularization', a_reg, epoch) 132 | tb_logger.add_scalar('weight_regularization', w_reg, epoch) 133 | 134 | # validation 135 | valid_acc, valid_obj = infer(valid_queue, model, criterion) 136 | logging.info('valid_acc %f', valid_acc) 137 | 138 | utils.save(model, os.path.join(args.save, 'weights.pt')) 139 | utils.save_checkpoint({ 140 | 'epoch': epoch + 1, 141 | 'model_optimizer': optimizer.state_dict(), 142 | 'arch_optimizer': architect.optimizer.state_dict(), 143 | 'model': model.state_dict(), 144 | 'scheduler': scheduler.state_dict(), 145 | 'alpha_normal': model.alphas_normal, 146 | 'alpha_reduce': model.alphas_reduce}, is_best=False, save=args.save, epoch=epoch) 147 | 148 | 149 | 150 | def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr, epoch, h): 151 | objs = utils.AvgrageMeter() 152 | a_regs = utils.AvgrageMeter() 153 | w_regs = utils.AvgrageMeter() 154 | top1 = utils.AvgrageMeter() 155 | top5 = utils.AvgrageMeter() 156 | 157 | for step, (input, target) in enumerate(train_queue): 158 | model.train() 159 | n = input.size(0) 160 | 161 | input = Variable(input, requires_grad=False).cuda(non_blocking=True) 162 | target = Variable(target, requires_grad=False).cuda(non_blocking=True) 163 | 164 | # get a random minibatch from the search queue with replacement 165 | input_search, target_search = next(iter(valid_queue)) 166 | input_search = Variable(input_search, requires_grad=False).cuda(non_blocking=True) 167 | target_search = Variable(target_search, requires_grad=False).cuda(non_blocking=True) 168 | 169 | a_regularizer = architect.step(input, target, epoch, args.a_warmup_epochs, args.a_gamma, criterion, args.loss_hessian, valid_queue, input_search, target_search, lr, optimizer, unrolled=args.unrolled, h=h) 170 | 171 | optimizer.zero_grad() 172 | logits = model(input) 173 | 174 | if epoch < args.w_warmup_epochs: 175 | loss = criterion(logits, target) 176 | w_regularizer = torch.tensor(0, dtype=torch.float) 177 | else: 178 | if args.loss_hessian == 'loss_cure': 179 | reg = loss_cure(model, criterion, lambda_=1, device='cuda') 180 | w_regularizer, grad_norm = reg.regularizer(input, target, h=h) 181 | else: 182 | reg = loss_eigen(model, train_queue, input, target, criterion, full_eigen=False, maxIter=10, tol=1e-2) 183 | regularizer, _ = reg.regularizer() 184 | 185 | loss = criterion(logits, target) + args.w_gamma * w_regularizer 186 | print(f'epoch={epoch} | step={step} | loss={loss} | w_reg={w_regularizer} | a_reg = {a_regularizer}') 187 | 188 | loss.backward() 189 | nn.utils.clip_grad_norm(model.parameters(), args.grad_clip) 190 | optimizer.step() 191 | 192 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 193 | objs.update(loss.data.item(), n) 194 | w_regs.update(w_regularizer.data.item(), n) 195 | a_regs.update(a_regularizer.data.item(), n) 196 | top1.update(prec1.data.item(), n) 197 | top5.update(prec5.data.item(), n) 198 | 199 | if step % args.report_freq == 0: 200 | logging.info('train %03d objs %e a_regs %e w_regs %e %f %f', step, objs.avg, a_regs.avg, w_regs.avg, top1.avg, top5.avg) 201 | 202 | return top1.avg, objs.avg, a_regs.avg, w_regs.avg 203 | 204 | 205 | def infer(valid_queue, model, criterion): 206 | objs = utils.AvgrageMeter() 207 | top1 = utils.AvgrageMeter() 208 | top5 = utils.AvgrageMeter() 209 | model.eval() 210 | 211 | with torch.no_grad(): 212 | for step, (input, target) in enumerate(valid_queue): 213 | input = Variable(input, requires_grad=False).cuda(non_blocking=True) 214 | target = Variable(target, requires_grad=False).cuda(non_blocking=True) 215 | 216 | logits = model(input) 217 | loss = criterion(logits, target) 218 | 219 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 220 | n = input.size(0) 221 | objs.update(loss.data.item(), n) 222 | top1.update(prec1.data.item(), n) 223 | top5.update(prec5.data.item(), n) 224 | 225 | if step % args.report_freq == 0: 226 | logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 227 | 228 | return top1.avg, objs.avg 229 | 230 | 231 | if __name__ == '__main__': 232 | main() 233 | 234 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /advrush/regularizer.py: -------------------------------------------------------------------------------- 1 | # import torch 2 | # import copy 3 | # import torch.nn as nn 4 | from torch.autograd.gradcheck import zero_gradients 5 | # from utils.utils import progress_bar 6 | # import numpy as np 7 | # import matplotlib.pyplot as plt 8 | # from utils.utils import pgd 9 | # import torchvision 10 | # import os 11 | import torch 12 | # import torch.nn as nn 13 | # from torch.autograd import grad 14 | # import torch.optim as optim 15 | # import torch.nn.functional as F 16 | # import torch.backends.cudnn as cudnn 17 | # from torch.optim.lr_scheduler import StepLR 18 | # from torch.distributions import uniform 19 | 20 | import hessianflow as hf 21 | import hessianflow.optimizer.optm_utils as hf_optm_utils 22 | import hessianflow.optimizer.progressbar as hf_optm_pgb 23 | 24 | class loss_cure(): 25 | def __init__(self, net, criterion, lambda_, device='cuda'): 26 | self.net = net 27 | self.criterion = criterion 28 | self.lambda_ = lambda_ 29 | self.device = device 30 | 31 | def _find_z(self, inputs, targets, h): 32 | 33 | inputs.requires_grad_() 34 | outputs = self.net.eval()(inputs) 35 | loss_z = self.criterion(outputs, targets) #self.net.eval()(inputs) 36 | 37 | loss_z.backward(torch.ones(targets.size(), dtype=torch.float).to(self.device)) #torch.ones(targets.size(), dtype=torch.float).to(self.device) 38 | grad = inputs.grad.data + 0.0 39 | norm_grad = grad.norm().item() 40 | z = torch.sign(grad).detach() + 0. 41 | z = 1. * (h) * (z + 1e-7) / (z.reshape(z.size(0), -1).norm(dim=1)[:, None, None, None] + 1e-7) 42 | inputs.grad.detach() 43 | inputs.grad.zero_() 44 | #zero_gradients(inputs) 45 | self.net.zero_grad() 46 | 47 | return z, norm_grad 48 | 49 | def regularizer(self, inputs, targets, h=3., lambda_=4): 50 | ''' 51 | Regularizer term in CURE 52 | ''' 53 | z, norm_grad = self._find_z(inputs, targets, h) 54 | 55 | inputs.requires_grad_() 56 | outputs_pos = self.net.eval()(inputs + z) 57 | outputs_orig = self.net.eval()(inputs) 58 | 59 | loss_pos = self.criterion(outputs_pos, targets) 60 | loss_orig = self.criterion(outputs_orig, targets) 61 | grad_diff = \ 62 | torch.autograd.grad((loss_pos - loss_orig), inputs, grad_outputs=torch.ones(targets.size()).to(self.device), 63 | create_graph=True)[0] 64 | reg = grad_diff.reshape(grad_diff.size(0), -1).norm(dim=1) 65 | self.net.zero_grad() 66 | 67 | return torch.sum(self.lambda_ * reg) / float(inputs.size(0)), norm_grad 68 | 69 | class loss_eigen(): 70 | def __init__(self, net, test_loader, input, target, criterion, full_eigen, maxIter=10, tol=1e-2): 71 | self.net = net 72 | self.test_loader = test_loader 73 | self.criterion = criterion 74 | self.full_eigen = full_eigen 75 | self.max_iter = maxIter 76 | self.tol = tol 77 | self.input = input 78 | self.target = target 79 | self.cuda = True 80 | 81 | def regularizer(self): 82 | if self.full_eigen: 83 | eigenvalue, eigenvector = hf.get_eigen_full_dataset(self.net, self.test_loader, self.criterion, self.max_iter, self.tol) 84 | else: 85 | eigenvalue, eigenvector= hf.get_eigen(self.net, self.input, self.target, self.criterion, self.cuda, self.max_iter, self.tol) 86 | 87 | return eigenvalue, eigenvector 88 | 89 | # class CURELearner(): 90 | # def __init__(self, net, trainloader, testloader, device='cuda', lambda_=4, 91 | # path='./checkpoint'): 92 | # ''' 93 | # CURE Class: Implementation of "Robustness via curvature regularization, and vice versa" 94 | # in https://arxiv.org/abs/1811.09716 95 | # ================================================ 96 | # Arguments: 97 | # 98 | # net: PyTorch nn 99 | # network structure 100 | # trainloader: PyTorch Dataloader 101 | # testloader: PyTorch Dataloader 102 | # device: 'cpu' or 'cuda' if GPU available 103 | # type of decide to move tensors 104 | # lambda_: float 105 | # power of regularization 106 | # path: string 107 | # path to save the best model 108 | # ''' 109 | # if not torch.cuda.is_available() and device == 'cuda': 110 | # raise ValueError("cuda is not available") 111 | # 112 | # self.net = net.to(device) 113 | # self.criterion = nn.CrossEntropyLoss() 114 | # self.device = device 115 | # self.lambda_ = lambda_ 116 | # self.trainloader, self.testloader = trainloader, testloader 117 | # self.path = path 118 | # self.test_acc_adv_best = 0 119 | # self.train_loss, self.train_acc, self.train_curv = [], [], [] 120 | # self.test_loss, self.test_acc_adv, self.test_acc_clean, self.test_curv = [], [], [], [] 121 | # 122 | # def set_optimizer(self, optim_alg='Adam', args={'lr': 1e-4}, scheduler=None, args_scheduler={}): 123 | # ''' 124 | # Setting the optimizer of the network 125 | # ================================================ 126 | # Arguments: 127 | # 128 | # optim_alg : string 129 | # Name of the optimizer 130 | # args: dict 131 | # Parameter of the optimizer 132 | # scheduler: optim.lr_scheduler 133 | # Learning rate scheduler 134 | # args_scheduler : dict 135 | # Parameters of the scheduler 136 | # ''' 137 | # self.optimizer = getattr(optim, optim_alg)(self.net.parameters(), **args) 138 | # if not scheduler: 139 | # self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=10 ** 6, gamma=1) 140 | # else: 141 | # self.scheduler = getattr(optim.lr_scheduler, scheduler)(self.optimizer, **args_scheduler) 142 | # 143 | # def train(self, h=[3], epochs=15): 144 | # ''' 145 | # Training the network 146 | # ================================================ 147 | # Arguemnets: 148 | # 149 | # h : list with length less than the number of epochs 150 | # Different h for different epochs of training, 151 | # can have a single number or a list of floats for each epoch 152 | # epochs : int 153 | # Number of epochs 154 | # ''' 155 | # if len(h) > epochs: 156 | # raise ValueError('Length of h should be less than number of epochs') 157 | # if len(h) == 1: 158 | # h_all = epochs * [h[0]] 159 | # else: 160 | # h_all = epochs * [1.0] 161 | # h_all[:len(h)] = list(h[:]) 162 | # h_all[len(h):] = (epochs - len(h)) * [h[-1]] 163 | # 164 | # for epoch, h_tmp in enumerate(h_all): 165 | # self._train(epoch, h=h_tmp) 166 | # self.test(epoch, h=h_tmp) 167 | # self.scheduler.step() 168 | # 169 | # def _train(self, epoch, h): 170 | # ''' 171 | # Training the model 172 | # ''' 173 | # print('\nEpoch: %d' % epoch) 174 | # train_loss, total = 0, 0 175 | # num_correct = 0 176 | # curv, curvature, norm_grad_sum = 0, 0, 0 177 | # for batch_idx, (inputs, targets) in enumerate(self.trainloader): 178 | # inputs, targets = inputs.to(self.device), targets.to(self.device) 179 | # self.optimizer.zero_grad() 180 | # total += targets.size(0) 181 | # outputs = self.net.train()(inputs) 182 | # 183 | # regularizer, grad_norm = self.regularizer(inputs, targets, h=h) 184 | # 185 | # curvature += regularizer.item() 186 | # neg_log_likelihood = self.criterion(outputs, targets) 187 | # loss = neg_log_likelihood + regularizer 188 | # loss.backward() 189 | # self.optimizer.step() 190 | # self.optimizer.zero_grad() 191 | # 192 | # train_loss += loss.item() 193 | # _, predicted = outputs.max(1) 194 | # outcome = predicted.data == targets 195 | # num_correct += outcome.sum().item() 196 | # 197 | # progress_bar(batch_idx, len(self.trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d) | curvature: %.3f ' % \ 198 | # (train_loss / (batch_idx + 1), 100. * num_correct / total, num_correct, total, 199 | # curvature / (batch_idx + 1))) 200 | # 201 | # self.train_loss.append(train_loss / (batch_idx + 1)) 202 | # self.train_acc.append(100. * num_correct / total) 203 | # self.train_curv.append(curvature / (batch_idx + 1)) 204 | # 205 | # def test(self, epoch, h, num_pgd_steps=20): 206 | # ''' 207 | # Testing the model 208 | # ''' 209 | # test_loss, adv_acc, total, curvature, clean_acc, grad_sum = 0, 0, 0, 0, 0, 0 210 | # 211 | # for batch_idx, (inputs, targets) in enumerate(self.testloader): 212 | # inputs, targets = inputs.to(self.device), targets.to(self.device) 213 | # outputs = self.net.eval()(inputs) 214 | # loss = self.criterion(outputs, targets) 215 | # test_loss += loss.item() 216 | # _, predicted = outputs.max(1) 217 | # clean_acc += predicted.eq(targets).sum().item() 218 | # total += targets.size(0) 219 | # 220 | # inputs_pert = inputs + 0. 221 | # eps = 5. / 255. * 8 222 | # r = pgd(inputs, self.net.eval(), epsilon=[eps], targets=targets, step_size=0.04, 223 | # num_steps=num_pgd_steps, epsil=eps) 224 | # 225 | # inputs_pert = inputs_pert + eps * torch.Tensor(r).to(self.device) 226 | # outputs = self.net(inputs_pert) 227 | # probs, predicted = outputs.max(1) 228 | # adv_acc += predicted.eq(targets).sum().item() 229 | # cur, norm_grad = self.regularizer(inputs, targets, h=h) 230 | # grad_sum += norm_grad 231 | # curvature += cur.item() 232 | # test_loss += cur.item() 233 | # 234 | # print( 235 | # f'epoch = {epoch}, adv_acc = {100. * adv_acc / total}, clean_acc = {100. * clean_acc / total}, loss = {test_loss / (batch_idx + 1)}', \ 236 | # f'curvature = {curvature / (batch_idx + 1)}') 237 | # 238 | # self.test_loss.append(test_loss / (batch_idx + 1)) 239 | # self.test_acc_adv.append(100. * adv_acc / total) 240 | # self.test_acc_clean.append(100. * clean_acc / total) 241 | # self.test_curv.append(curvature / (batch_idx + 1)) 242 | # if self.test_acc_adv[-1] > self.test_acc_adv_best: 243 | # self.test_acc_adv_best = self.test_acc_adv[-1] 244 | # print(f'Saving the best model to {self.path}') 245 | # self.save_model(self.path) 246 | # 247 | # return test_loss / (batch_idx + 1), 100. * adv_acc / total, 100. * clean_acc / total, curvature / ( 248 | # batch_idx + 1) 249 | # 250 | # def _find_z(self, inputs, targets, h): 251 | # ''' 252 | # Finding the direction in the regularizer 253 | # ''' 254 | # inputs.requires_grad_() 255 | # outputs = self.net.eval()(inputs) 256 | # loss_z = self.criterion(self.net.eval()(inputs), targets) 257 | # loss_z.backward(torch.ones(targets.size()).to(self.device)) 258 | # grad = inputs.grad.data + 0.0 259 | # norm_grad = grad.norm().item() 260 | # z = torch.sign(grad).detach() + 0. 261 | # z = 1. * (h) * (z + 1e-7) / (z.reshape(z.size(0), -1).norm(dim=1)[:, None, None, None] + 1e-7) 262 | # zero_gradients(inputs) 263 | # self.net.zero_grad() 264 | # 265 | # return z, norm_grad 266 | # 267 | # def regularizer(self, inputs, targets, h=3., lambda_=4): 268 | # ''' 269 | # Regularizer term in CURE 270 | # ''' 271 | # z, norm_grad = self._find_z(inputs, targets, h) 272 | # 273 | # inputs.requires_grad_() 274 | # outputs_pos = self.net.eval()(inputs + z) 275 | # outputs_orig = self.net.eval()(inputs) 276 | # 277 | # loss_pos = self.criterion(outputs_pos, targets) 278 | # loss_orig = self.criterion(outputs_orig, targets) 279 | # grad_diff = \ 280 | # torch.autograd.grad((loss_pos - loss_orig), inputs, grad_outputs=torch.ones(targets.size()).to(self.device), 281 | # create_graph=True)[0] 282 | # reg = grad_diff.reshape(grad_diff.size(0), -1).norm(dim=1) 283 | # self.net.zero_grad() 284 | # 285 | # return torch.sum(self.lambda_ * reg) / float(inputs.size(0)), norm_grad 286 | # 287 | # def save_model(self, path): 288 | # ''' 289 | # Saving the model 290 | # ================================================ 291 | # Arguments: 292 | # 293 | # path: string 294 | # path to save the model 295 | # ''' 296 | # 297 | # print('Saving...') 298 | # 299 | # state = { 300 | # 'net': self.net.state_dict(), 301 | # 'optimizer': self.optimizer.state_dict() 302 | # } 303 | # torch.save(state, path) 304 | # 305 | # def import_model(self, path): 306 | # ''' 307 | # Importing the pre-trained model 308 | # ''' 309 | # checkpoint = torch.load(path) 310 | # self.net.load_state_dict(checkpoint['net']) 311 | # 312 | # 313 | # 314 | # 315 | --------------------------------------------------------------------------------