├── .gitignore
├── Readme.md
├── proxylessnas
│   ├── __init__.py
│   ├── architect.py
│   ├── genotypes.py
│   ├── latency.csv
│   ├── latencyloss.py
│   ├── model.py
│   ├── model_search.py
│   ├── operations.py
│   ├── test.py
│   ├── test_imagenet.py
│   ├── train.py
│   ├── train_imagenet.py
│   ├── train_search.py
│   ├── utils.py
│   └── visualize.py
├── requirements.txt
└── test
    ├── __init__.py
    ├── test_latencyloss.py
    ├── test_operations.py
    └── test_utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 |
107 | ########################################
108 | # vscode setting
109 | .vscode/
110 |
111 |
112 | darts/
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
1 | # ProxylessNAS
2 |
3 | Base code: [DARTS](https://github.com/quark0/darts)
4 |
5 | ## Implementation Plan
6 | - [x] Add binarize operation after softmax (3.2 LEARNING BINARIZED PATH); a sketch of the presumed binarize helper appears just before architect.py below
7 | - [x] Add two-path selection for the gradient update (3.2.1 TRAINING BINARIZED ARCHITECTURE PARAMETERS)
8 | - [ ] Add latency loss (3.3.1 MAKING LATENCY DIFFERENTIABLE)
9 | - [x] Change search space (4.2 EXPERIMENTS ON IMAGENET)
10 | - [x] Add MBConv operation (4.2 EXPERIMENTS ON IMAGENET)
11 |
12 | - [ ] Update training code for the searched model
13 | - [ ] Update test code for the searched model
--------------------------------------------------------------------------------
/proxylessnas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kairos03/ProxylessNAS-Pytorch/c87b233aaffb9e38329cbb7d4fc5f5398b1312a8/proxylessnas/__init__.py
--------------------------------------------------------------------------------
/proxylessnas/architect.py:
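Note on the binarized path selection from the Readme checklist: architect.py below calls `utils.binarize(step, 2)`, but utils.py is not included in this dump. The following is only a minimal sketch of the presumed behavior, keeping the top-k paths of a softmaxed alpha row and zeroing the rest:

import torch

def binarize(probs, k=2):
    # hypothetical stand-in for the utils.binarize(step, 2) call used in
    # architect.py: keep the k highest-probability paths, zero out all others
    mask = torch.zeros_like(probs)
    _, idx = probs.topk(k)
    mask[idx] = 1.0
    return probs * mask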
-------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | import utils 8 | 9 | def _concat(xs): 10 | return torch.cat([x.view(-1) for x in xs]) 11 | 12 | 13 | class Architect(object): 14 | """ architect 15 | """ 16 | def __init__(self, model, args): 17 | self.network_momentum = args.momentum 18 | self.network_weight_decay = args.weight_decay 19 | self.model = model 20 | self.optimizer = torch.optim.Adam(self.model.arch_parameters(), 21 | lr=args.arch_learning_rate, betas=(0.5, 0.999), weight_decay=args.arch_weight_decay) 22 | 23 | def _compute_unrolled_model(self, input, target, eta, network_optimizer): 24 | # forward and calculate model loss 25 | loss = self.model._loss(input, target) 26 | 27 | # get all model parameters 28 | theta = _concat(self.model.parameters()).data 29 | try: 30 | moment = _concat(network_optimizer.state[v]['momentum_buffer'] for v in self.model.parameters()).mul_(self.network_momentum) 31 | except: 32 | moment = torch.zeros_like(theta) 33 | # calculate gradient and pass manual gradients to new model 34 | dtheta = _concat(torch.autograd.grad(loss, self.model.parameters())).data + self.network_weight_decay*theta 35 | unrolled_model = self._construct_model_from_theta(theta.sub(eta, moment+dtheta)) 36 | return unrolled_model 37 | 38 | def step(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer, unrolled): 39 | self.optimizer.zero_grad() 40 | if unrolled: 41 | self._backward_step_unrolled(input_train, target_train, input_valid, target_valid, eta, network_optimizer) 42 | else: 43 | self._backward_step(input_valid, target_valid) 44 | self.optimizer.step() 45 | 46 | def _backward_step(self, input_valid, target_valid): 47 | loss = self.model._loss(input_valid, target_valid) 48 | loss.backward() 49 | 50 | def _backward_step_unrolled(self, input_train, target_train, input_valid, target_valid, eta, network_optimizer): 51 | # unrolled model 52 | unrolled_model = self._compute_unrolled_model(input_train, target_train, eta, network_optimizer) 53 | # forward and calculate unrolled loss 54 | unrolled_loss = unrolled_model._loss(input_valid, target_valid) 55 | 56 | # compute gradient 57 | unrolled_loss.backward() 58 | dalpha = [v.grad for v in unrolled_model.arch_parameters()] # dalpha { L_val(w`, alpha)} 59 | vector = [v.grad.data for v in unrolled_model.parameters()] # dw` { L_val(w`, alpha) } 60 | implicit_grads = self._hessian_vector_product(vector, input_train, target_train) 61 | 62 | for g, ig in zip(dalpha, implicit_grads): 63 | g.data.sub_(eta, ig.data) 64 | 65 | for v, g in zip(self.model.arch_parameters(), dalpha): 66 | if v.grad is None: 67 | v.grad = Variable(g.data) 68 | else: 69 | v.grad.data.copy_(g.data) 70 | 71 | def _construct_model_from_theta(self, theta): 72 | # create new model 73 | model_new = self.model.new() 74 | model_dict = self.model.state_dict() 75 | 76 | # calculate new weight dict(params) 77 | params, offset = {}, 0 78 | for k, v in self.model.named_parameters(): 79 | v_length = np.prod(v.size()) 80 | params[k] = theta[offset: offset+v_length].view(v.size()) 81 | offset += v_length 82 | 83 | assert offset == len(theta) 84 | # update new param and load updated model 85 | model_dict.update(params) 86 | model_new.load_state_dict(model_dict) 87 | 88 | # mask alpha only select two path 89 | new_alpha = [] 90 | for alpha in self.model.arch_parameters(): 91 | 
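        # NOTE: _compute_unrolled_model above builds the one-step unrolled weights
        # w' = w - eta * (momentum_buffer + grad_w L_train(w, alpha) + weight_decay * w);
        # the loop below then re-binarizes each softmaxed alpha row so that only
        # two candidate paths stay active (paper section 3.2.1)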
new_step = [] 92 | for step in F.softmax(alpha, dim=-1).data: 93 | # select two path 94 | new_step.append(utils.binarize(step, 2)) 95 | new_alpha.append(Variable(torch.stack(new_step), requires_grad=True)) 96 | self.model._alphas_parameters = new_alpha 97 | 98 | return model_new.cuda() 99 | 100 | def _hessian_vector_product(self, vector, input, target, r=1e-2): 101 | R = r / _concat(vector).norm() 102 | for p, v in zip(self.model.parameters(), vector): 103 | p.data.add_(R, v) 104 | loss = self.model._loss(input, target) 105 | grads_p = torch.autograd.grad(loss, self.model.arch_parameters()) 106 | 107 | for p, v in zip(self.model.parameters(), vector): 108 | p.data.sub_(2*R, v) 109 | loss = self.model._loss(input, target) 110 | grads_n = torch.autograd.grad(loss, self.model.arch_parameters()) 111 | 112 | for p, v in zip(self.model.parameters(), vector): 113 | p.data.add_(R, v) 114 | 115 | return [(x-y).div_(2*R) for x, y in zip(grads_p, grads_n)] 116 | 117 | -------------------------------------------------------------------------------- /proxylessnas/genotypes.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Genotype = namedtuple('Genotype', 'normal normal_concat reduce reduce_concat') 4 | 5 | PRIMITIVES = [ 6 | 'none', 7 | 'identity', 8 | 'mbconv_3_3', 9 | 'mbconv_3_5', 10 | 'mbconv_3_7', 11 | 'mbconv_6_3', 12 | 'mbconv_6_5', 13 | 'mbconv_6_7', 14 | ] 15 | 16 | NASNet = Genotype( 17 | normal = [ 18 | ('sep_conv_5x5', 1), 19 | ('sep_conv_3x3', 0), 20 | ('sep_conv_5x5', 0), 21 | ('sep_conv_3x3', 0), 22 | ('avg_pool_3x3', 1), 23 | ('skip_connect', 0), 24 | ('avg_pool_3x3', 0), 25 | ('avg_pool_3x3', 0), 26 | ('sep_conv_3x3', 1), 27 | ('skip_connect', 1), 28 | ], 29 | normal_concat = [2, 3, 4, 5, 6], 30 | reduce = [ 31 | ('sep_conv_5x5', 1), 32 | ('sep_conv_7x7', 0), 33 | ('max_pool_3x3', 1), 34 | ('sep_conv_7x7', 0), 35 | ('avg_pool_3x3', 1), 36 | ('sep_conv_5x5', 0), 37 | ('skip_connect', 3), 38 | ('avg_pool_3x3', 2), 39 | ('sep_conv_3x3', 2), 40 | ('max_pool_3x3', 1), 41 | ], 42 | reduce_concat = [4, 5, 6], 43 | ) 44 | 45 | AmoebaNet = Genotype( 46 | normal = [ 47 | ('avg_pool_3x3', 0), 48 | ('max_pool_3x3', 1), 49 | ('sep_conv_3x3', 0), 50 | ('sep_conv_5x5', 2), 51 | ('sep_conv_3x3', 0), 52 | ('avg_pool_3x3', 3), 53 | ('sep_conv_3x3', 1), 54 | ('skip_connect', 1), 55 | ('skip_connect', 0), 56 | ('avg_pool_3x3', 1), 57 | ], 58 | normal_concat = [4, 5, 6], 59 | reduce = [ 60 | ('avg_pool_3x3', 0), 61 | ('sep_conv_3x3', 1), 62 | ('max_pool_3x3', 0), 63 | ('sep_conv_7x7', 2), 64 | ('sep_conv_7x7', 0), 65 | ('avg_pool_3x3', 1), 66 | ('max_pool_3x3', 0), 67 | ('max_pool_3x3', 1), 68 | ('conv_7x1_1x7', 0), 69 | ('sep_conv_3x3', 5), 70 | ], 71 | reduce_concat=[3, 4, 6] 72 | ) 73 | 74 | DARTS_V1 = Genotype( 75 | normal=[ 76 | ('sep_conv_3x3', 1), 77 | ('sep_conv_3x3', 0), 78 | ('skip_connect', 0), 79 | ('sep_conv_3x3', 1), 80 | ('skip_connect', 0), 81 | ('sep_conv_3x3', 1), 82 | ('sep_conv_3x3', 0), 83 | ('skip_connect', 2) 84 | ], 85 | normal_concat=[2, 3, 4, 5], 86 | reduce=[ 87 | ('max_pool_3x3', 0), 88 | ('max_pool_3x3', 1), 89 | ('skip_connect', 2), 90 | ('max_pool_3x3', 0), 91 | ('max_pool_3x3', 0), 92 | ('skip_connect', 2), 93 | ('skip_connect', 2), 94 | ('avg_pool_3x3', 0) 95 | ], 96 | reduce_concat=[2, 3, 4, 5] 97 | ) 98 | DARTS_V2 = Genotype(normal=[('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 0), ('sep_conv_3x3', 1), ('sep_conv_3x3', 1), ('skip_connect', 0), ('skip_connect', 0), 
('dil_conv_3x3', 2)], normal_concat=[2, 3, 4, 5], reduce=[('max_pool_3x3', 0), ('max_pool_3x3', 1), ('skip_connect', 2), ('max_pool_3x3', 1), ('max_pool_3x3', 0), ('skip_connect', 2), ('skip_connect', 2), ('max_pool_3x3', 1)], reduce_concat=[2, 3, 4, 5]) 99 | 100 | DARTS = DARTS_V2 101 | 102 | -------------------------------------------------------------------------------- /proxylessnas/latency.csv: -------------------------------------------------------------------------------- 1 | 112x112x16-56x56x24-expand:3-kernel:3-stride:2,5.608820512820512 2 | 112x112x16-56x56x24-expand:3-kernel:5-stride:2,6.978000000000001 3 | 112x112x16-56x56x24-expand:3-kernel:7-stride:2,8.544238709677419 4 | 112x112x16-56x56x24-expand:6-kernel:3-stride:2,11.297755952380951 5 | 112x112x16-56x56x24-expand:6-kernel:5-stride:2,13.576893081761007 6 | 112x112x16-56x56x24-expand:6-kernel:7-stride:2,16.887745222929933 7 | 112x112x32-112x112x16-expand:1-kernel:3-stride:1,4.035709677419355 8 | 112x112x32-112x112x16-expand:1-kernel:5-stride:1,6.9890061538461525 9 | 112x112x32-112x112x16-expand:1-kernel:7-stride:1,11.216767515923568 10 | 14x14x80-14x14x80-expand:3-kernel:3-stride:1,1.7122791762013732 11 | 14x14x80-14x14x80-expand:3-kernel:5-stride:1,1.9960465116279071 12 | 14x14x80-14x14x80-expand:3-kernel:7-stride:1,2.369415 13 | 14x14x80-14x14x80-expand:6-kernel:3-stride:1,3.3526117936117936 14 | 14x14x80-14x14x80-expand:6-kernel:5-stride:1,3.9504312796208527 15 | 14x14x80-14x14x80-expand:6-kernel:7-stride:1,4.742143902439024 16 | 14x14x80-14x14x96-expand:3-kernel:3-stride:1,1.8539716312056735 17 | 14x14x80-14x14x96-expand:3-kernel:5-stride:1,2.127169811320755 18 | 14x14x80-14x14x96-expand:3-kernel:7-stride:1,2.501621621621622 19 | 14x14x80-14x14x96-expand:6-kernel:3-stride:1,3.6193525641025635 20 | 14x14x80-14x14x96-expand:6-kernel:5-stride:1,4.20704761904762 21 | 14x14x80-14x14x96-expand:6-kernel:7-stride:1,5.019310344827586 22 | 14x14x96-14x14x96-expand:3-kernel:3-stride:1,2.383217821782178 23 | 14x14x96-14x14x96-expand:3-kernel:5-stride:1,2.7191749999999995 24 | 14x14x96-14x14x96-expand:3-kernel:7-stride:1,3.1634937027707815 25 | 14x14x96-14x14x96-expand:6-kernel:3-stride:1,4.714105263157895 26 | 14x14x96-14x14x96-expand:6-kernel:5-stride:1,5.440153664302599 27 | 14x14x96-14x14x96-expand:6-kernel:7-stride:1,6.4148737864077665 28 | 14x14x96-7x7x192-expand:3-kernel:3-stride:2,1.8193141025641024 29 | 14x14x96-7x7x192-expand:3-kernel:5-stride:2,1.9280621468926553 30 | 14x14x96-7x7x192-expand:3-kernel:7-stride:2,2.0634964539007092 31 | 14x14x96-7x7x192-expand:6-kernel:3-stride:2,3.565739726027397 32 | 14x14x96-7x7x192-expand:6-kernel:5-stride:2,3.80021568627451 33 | 14x14x96-7x7x192-expand:6-kernel:7-stride:2,4.100414772727273 34 | 28x28x40-14x14x80-expand:3-kernel:3-stride:2,1.3473525179856116 35 | 28x28x40-14x14x80-expand:3-kernel:5-stride:2,1.5405426829268296 36 | 28x28x40-14x14x80-expand:3-kernel:7-stride:2,1.8065283018867921 37 | 28x28x40-14x14x80-expand:6-kernel:3-stride:2,2.795662420382165 38 | 28x28x40-14x14x80-expand:6-kernel:5-stride:2,3.1278452380952384 39 | 28x28x40-14x14x80-expand:6-kernel:7-stride:2,3.5956111111111113 40 | 28x28x40-28x28x40-expand:3-kernel:3-stride:1,2.1105636363636364 41 | 28x28x40-28x28x40-expand:3-kernel:5-stride:1,2.8044000000000002 42 | 28x28x40-28x28x40-expand:3-kernel:7-stride:1,3.7657860576923077 43 | 28x28x40-28x28x40-expand:6-kernel:3-stride:1,4.09110447761194 44 | 28x28x40-28x28x40-expand:6-kernel:5-stride:1,5.358751807228915 45 | 
28x28x40-28x28x40-expand:6-kernel:7-stride:1,7.088029484029484
46 | 56x56x24-28x28x40-expand:3-kernel:3-stride:2,2.5136643835616437
47 | 56x56x24-28x28x40-expand:3-kernel:5-stride:2,3.015164473684211
48 | 56x56x24-28x28x40-expand:3-kernel:7-stride:2,3.7918
49 | 56x56x24-28x28x40-expand:6-kernel:3-stride:2,4.870601351351351
50 | 56x56x24-28x28x40-expand:6-kernel:5-stride:2,5.726016393442624
51 | 56x56x24-28x28x40-expand:6-kernel:7-stride:2,6.946482352941176
52 | 56x56x24-56x56x24-expand:3-kernel:3-stride:1,4.132448192771084
53 | 56x56x24-56x56x24-expand:3-kernel:5-stride:1,6.032904639175257
54 | 56x56x24-56x56x24-expand:3-kernel:7-stride:1,8.712114014251782
55 | 56x56x24-56x56x24-expand:6-kernel:3-stride:1,7.595687344913151
56 | 56x56x24-56x56x24-expand:6-kernel:5-stride:1,10.662506203473946
57 | 56x56x24-56x56x24-expand:6-kernel:7-stride:1,15.038211195928753
58 | 7x7x192-7x7x192-expand:3-kernel:3-stride:1,2.9128099999999995
59 | 7x7x192-7x7x192-expand:3-kernel:5-stride:1,3.0936337349397593
60 | 7x7x192-7x7x192-expand:3-kernel:7-stride:1,3.293690531177829
61 | 7x7x192-7x7x192-expand:6-kernel:3-stride:1,5.919768149882904
62 | 7x7x192-7x7x192-expand:6-kernel:5-stride:1,6.266471032745591
63 | 7x7x192-7x7x192-expand:6-kernel:7-stride:1,6.72846875
64 | 7x7x192-7x7x320-expand:3-kernel:3-stride:1,3.912773006134969
65 | 7x7x192-7x7x320-expand:3-kernel:5-stride:1,4.098578947368421
66 | 7x7x192-7x7x320-expand:3-kernel:7-stride:1,4.323613636363636
67 | 7x7x192-7x7x320-expand:6-kernel:3-stride:1,7.944133333333332
68 | 7x7x192-7x7x320-expand:6-kernel:5-stride:1,8.172687074829932
69 | 7x7x192-7x7x320-expand:6-kernel:7-stride:1,8.737869565217393
--------------------------------------------------------------------------------
/proxylessnas/latencyloss.py:
--------------------------------------------------------------------------------
1 | import csv
2 | try:
3 |     import importlib.resources as pkg_resources
4 | except ImportError:
5 |     import importlib_resources as pkg_resources
6 |
7 | import proxylessnas
8 |
9 | import torch
10 | import torch.nn as nn
11 | from proxylessnas.genotypes import PRIMITIVES
12 |
13 |
14 | class LatencyLoss(nn.Module):
15 |     def __init__(self, channels, steps, strides, input_size=56):
16 |         super(LatencyLoss, self).__init__()
17 |
18 |         self.channels = channels
19 |         self.steps = steps
20 |         self.strides = strides
21 |
22 |         self._calculate_feature_map_size(input_size)
23 |         self._load_latency()
24 |
25 |     def _load_latency(self):
26 |         # load predicted latency file
27 |         f = pkg_resources.open_text(proxylessnas, "latency.csv")
28 |         rdr = csv.reader(f)
29 |
30 |         self._latency = {}
31 |         for line in rdr:
32 |             self._latency[line[0]] = line[1]
33 |         f.close()
34 |
35 |     def _calculate_feature_map_size(self, input_size):
36 |         self.feature_maps = [input_size]
37 |         for s in self.strides[:-1]:
38 |             input_size = input_size // s
39 |             self.feature_maps.append(input_size)
40 |
41 |     def _predictor(self, inputs):
42 |         """predict latency
43 |         input example: mbconv_6_3_80_80_14_1
44 |         """
45 |         div = inputs.split('_')
46 |         if div[0] == 'identity' or div[0] == 'none':
47 |             div.insert(1, 0)  # insert fake exp_rate
48 |             div.insert(2, 0)  # insert fake ksize
49 |         op, exp_rate, ksize, C_in, C_out, size, stride = div
50 |         # zero-cost ops have no entry in the lookup table
51 |         if op == 'identity' or op == 'none':
52 |             return 0
53 |         out_size = int(size) // int(stride)
54 |         findstr = f'{size}x{size}x{C_in}-{out_size}x{out_size}x{C_out}-expand:{exp_rate}-kernel:{ksize}-stride:{stride}'
55 |         # e.g. 'mbconv_6_3_80_80_14_1' -> '14x14x80-14x14x80-expand:6-kernel:3-stride:1'
56 |         return float(self._latency.get(findstr))
57 |
58 |     def forward(self, alpha):
59 |         latency = 0
60 |
61 |         for i, a_cell in enumerate(alpha):
62 |             c_in = self.channels[i]
63 |             c_out = self.channels[i+1]
64 |             fm = self.feature_maps[i]
65 |             stride = self.strides[i]
66 |
67 |             for j, weights in enumerate(a_cell):
68 |                 # only the first op of a cell changes channels and resolution;
69 |                 # the remaining stacked ops run at stride 1 on c_out channels
70 |                 cin = c_in if j == 0 else c_out
71 |                 s = stride if j == 0 else 1
72 |                 size = fm if j == 0 else fm // stride
73 |                 latency += sum(w * self._predictor(f'{op}_{cin}_{c_out}_{size}_{s}') for w, op in zip(weights, PRIMITIVES))
74 |
75 |         return latency
--------------------------------------------------------------------------------
/proxylessnas/model.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from operations import *
4 | from torch.autograd import Variable
5 | from utils import drop_path
6 |
7 | # cell
8 | class Cell(nn.Module):
9 |
10 |     def __init__(self, genotype, C_prev_prev, C_prev, C, reduction, reduction_prev):
11 |         super(Cell, self).__init__()
12 |         print(C_prev_prev, C_prev, C)
13 |
14 |         # preprocess layer
15 |         if reduction_prev:
16 |             self.preprocess0 = FactorizedReduce(C_prev_prev, C)
17 |         else:
18 |             self.preprocess0 = ReLUConvBN(C_prev_prev, C, 1, 1, 0)
19 |         self.preprocess1 = ReLUConvBN(C_prev, C, 1, 1, 0)
20 |
21 |         # cell define by type
22 |         if reduction:
23 |             op_names, indices = zip(*genotype.reduce)
24 |             concat = genotype.reduce_concat
25 |         else:
26 |             op_names, indices = zip(*genotype.normal)
27 |             concat = genotype.normal_concat
28 |         self._compile(C, op_names, indices, concat, reduction)
29 |
30 |     def _compile(self, C, op_names, indices, concat, reduction):
31 |         assert len(op_names) == len(indices)
32 |         self._steps = len(op_names) // 2
33 |         self._concat = concat
34 |         self.multiplier = len(concat)
35 |
36 |         self._ops = nn.ModuleList()
37 |         for name, index in zip(op_names, indices):
38 |             stride = 2 if reduction and index < 2 else 1
39 |             op = OPS[name](C, stride, True)
40 |             self._ops += [op]
41 |         self._indices = indices
42 |
43 |     def forward(self, s0, s1, drop_prob):
44 |         s0 = self.preprocess0(s0)
45 |         s1 = self.preprocess1(s1)
46 |
47 |         states = [s0, s1]
48 |         for i in range(self._steps):
49 |             h1 = states[self._indices[2*i]]
50 |             h2 = states[self._indices[2*i+1]]
51 |             op1 = self._ops[2*i]
52 |             op2 = self._ops[2*i+1]
53 |             h1 = op1(h1)
54 |             h2 = op2(h2)
55 |             if self.training and drop_prob > 0.:
56 |                 if not isinstance(op1, Identity):
57 |                     h1 = drop_path(h1, drop_prob)
58 |                 if not isinstance(op2, Identity):
59 |                     h2 = drop_path(h2, drop_prob)
60 |             s = h1 + h2
61 |             states += [s]
62 |         return torch.cat([states[i] for i in self._concat], dim=1)
63 |
64 |
65 | class AuxiliaryHeadCIFAR(nn.Module):
66 |
67 |     def __init__(self, C, num_classes):
68 |         """assuming input size 8x8"""
69 |         super(AuxiliaryHeadCIFAR, self).__init__()
70 |         self.features = nn.Sequential(
71 |             nn.ReLU(inplace=True),
72 |             nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False),  # image size = 2 x 2
73 |             nn.Conv2d(C, 128, 1, bias=False),
74 |             nn.BatchNorm2d(128),
75 |             nn.ReLU(inplace=True),
76 |             nn.Conv2d(128, 768, 2, bias=False),
77 |             nn.BatchNorm2d(768),
78 |             nn.ReLU(inplace=True)
79 |         )
80 |         self.classifier = nn.Linear(768, num_classes)
81 |
82 |     def forward(self, x):
83 |         x = self.features(x)
84 |         x = self.classifier(x.view(x.size(0),-1))
85 |         return x
86 |
87 |
88 | class AuxiliaryHeadImageNet(nn.Module):
89 |
90 |     def __init__(self, C, num_classes):
91 |         """assuming input size 14x14"""
92 |         super(AuxiliaryHeadImageNet, self).__init__()
93 |         self.features = nn.Sequential(
94 |
nn.ReLU(inplace=True), 95 | nn.AvgPool2d(5, stride=2, padding=0, count_include_pad=False), 96 | nn.Conv2d(C, 128, 1, bias=False), 97 | nn.BatchNorm2d(128), 98 | nn.ReLU(inplace=True), 99 | nn.Conv2d(128, 768, 2, bias=False), 100 | # NOTE: This batchnorm was omitted in my earlier implementation due to a typo. 101 | # Commenting it out for consistency with the experiments in the paper. 102 | # nn.BatchNorm2d(768), 103 | nn.ReLU(inplace=True) 104 | ) 105 | self.classifier = nn.Linear(768, num_classes) 106 | 107 | def forward(self, x): 108 | x = self.features(x) 109 | x = self.classifier(x.view(x.size(0),-1)) 110 | return x 111 | 112 | 113 | class NetworkCIFAR(nn.Module): 114 | 115 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 116 | super(NetworkCIFAR, self).__init__() 117 | self._layers = layers 118 | self._auxiliary = auxiliary 119 | 120 | stem_multiplier = 3 121 | C_curr = stem_multiplier*C 122 | self.stem = nn.Sequential( 123 | nn.Conv2d(3, C_curr, 3, padding=1, bias=False), 124 | nn.BatchNorm2d(C_curr) 125 | ) 126 | 127 | C_prev_prev, C_prev, C_curr = C_curr, C_curr, C 128 | self.cells = nn.ModuleList() 129 | reduction_prev = False 130 | for i in range(layers): 131 | if i in [layers//3, 2*layers//3]: 132 | C_curr *= 2 133 | reduction = True 134 | else: 135 | reduction = False 136 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 137 | reduction_prev = reduction 138 | self.cells += [cell] 139 | C_prev_prev, C_prev = C_prev, cell.multiplier*C_curr 140 | if i == 2*layers//3: 141 | C_to_auxiliary = C_prev 142 | 143 | if auxiliary: 144 | self.auxiliary_head = AuxiliaryHeadCIFAR(C_to_auxiliary, num_classes) 145 | self.global_pooling = nn.AdaptiveAvgPool2d(1) 146 | self.classifier = nn.Linear(C_prev, num_classes) 147 | 148 | def forward(self, input): 149 | logits_aux = None 150 | s0 = s1 = self.stem(input) 151 | for i, cell in enumerate(self.cells): 152 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 153 | if i == 2*self._layers//3: 154 | if self._auxiliary and self.training: 155 | logits_aux = self.auxiliary_head(s1) 156 | out = self.global_pooling(s1) 157 | logits = self.classifier(out.view(out.size(0),-1)) 158 | return logits, logits_aux 159 | 160 | 161 | class NetworkImageNet(nn.Module): 162 | 163 | def __init__(self, C, num_classes, layers, auxiliary, genotype): 164 | super(NetworkImageNet, self).__init__() 165 | self._layers = layers 166 | self._auxiliary = auxiliary 167 | 168 | self.stem0 = nn.Sequential( 169 | nn.Conv2d(3, C // 2, kernel_size=3, stride=2, padding=1, bias=False), 170 | nn.BatchNorm2d(C // 2), 171 | nn.ReLU(inplace=True), 172 | nn.Conv2d(C // 2, C, 3, stride=2, padding=1, bias=False), 173 | nn.BatchNorm2d(C), 174 | ) 175 | 176 | self.stem1 = nn.Sequential( 177 | nn.ReLU(inplace=True), 178 | nn.Conv2d(C, C, 3, stride=2, padding=1, bias=False), 179 | nn.BatchNorm2d(C), 180 | ) 181 | 182 | C_prev_prev, C_prev, C_curr = C, C, C 183 | 184 | self.cells = nn.ModuleList() 185 | reduction_prev = True 186 | for i in range(layers): 187 | if i in [layers // 3, 2 * layers // 3]: 188 | C_curr *= 2 189 | reduction = True 190 | else: 191 | reduction = False 192 | cell = Cell(genotype, C_prev_prev, C_prev, C_curr, reduction, reduction_prev) 193 | reduction_prev = reduction 194 | self.cells += [cell] 195 | C_prev_prev, C_prev = C_prev, cell.multiplier * C_curr 196 | if i == 2 * layers // 3: 197 | C_to_auxiliary = C_prev 198 | 199 | if auxiliary: 200 | self.auxiliary_head = AuxiliaryHeadImageNet(C_to_auxiliary, num_classes) 201 | 
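        # for 224x224 inputs the two stems (stride 8 total) and the two
        # reduction cells leave a 7x7 feature map, hence the fixed 7x7
        # average pool below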
self.global_pooling = nn.AvgPool2d(7) 202 | self.classifier = nn.Linear(C_prev, num_classes) 203 | 204 | def forward(self, input): 205 | logits_aux = None 206 | s0 = self.stem0(input) 207 | s1 = self.stem1(s0) 208 | for i, cell in enumerate(self.cells): 209 | s0, s1 = s1, cell(s0, s1, self.drop_path_prob) 210 | if i == 2 * self._layers // 3: 211 | if self._auxiliary and self.training: 212 | logits_aux = self.auxiliary_head(s1) 213 | out = self.global_pooling(s1) 214 | logits = self.classifier(out.view(out.size(0), -1)) 215 | return logits, logits_aux 216 | 217 | -------------------------------------------------------------------------------- /proxylessnas/model_search.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from proxylessnas.operations import * 5 | from torch.autograd import Variable 6 | from proxylessnas.genotypes import PRIMITIVES, Genotype 7 | import proxylessnas.utils 8 | 9 | 10 | class MixedOp(nn.Module): 11 | """mixed operation 12 | """ 13 | def __init__(self, C_in, C_out, stride): 14 | super(MixedOp, self).__init__() 15 | self._ops = nn.ModuleList() 16 | for primitive in PRIMITIVES: 17 | if primitive == 'identity' and C_in != C_out: 18 | continue 19 | op = OPS[primitive](C_in, C_out, stride, False) 20 | self._ops.append(op) 21 | 22 | def forward(self, x, weights): 23 | # weighted sum for all operations weights is architacture weight(alpha) 24 | return sum(w * op(x) for w, op in zip(weights, self._ops)) 25 | 26 | 27 | class Cell(nn.Module): 28 | """Cell""" 29 | def __init__(self, C_in, C_out, steps, stride): 30 | super(Cell, self).__init__() 31 | 32 | self._steps = steps 33 | 34 | self._ops = nn.ModuleList() 35 | 36 | # stack layer 37 | self._ops.append(MixedOp(C_in, C_out, stride)) 38 | for i in range(1, self._steps): 39 | op = MixedOp(C_out, C_out, 1) 40 | self._ops.append(op) 41 | 42 | def forward(self, x, weights): 43 | for i, op in enumerate(self._ops): 44 | x = op(x, weights[i]) 45 | 46 | return x 47 | 48 | 49 | class Network(nn.Module): 50 | 51 | def __init__(self, C_list, steps_list, strides_list, num_classes, criterion): 52 | super(Network, self).__init__() 53 | self._C_list = C_list 54 | self._steps_list = steps_list 55 | self._strides_list = strides_list 56 | self._num_classes = num_classes # 1000 for Imagenet 57 | self._criterion = criterion 58 | 59 | # stem layer 60 | self.stem = nn.Sequential( 61 | nn.Conv2d(3, self._C_list[0], 3, stride=self._strides_list[0], padding=1, bias=False), 62 | nn.BatchNorm2d(self._C_list[0]), 63 | MBConv(self._C_list[0], self._C_list[1], 1, self._strides_list[1], 0, 1) 64 | ) 65 | 66 | # cells 67 | self.cells = list() 68 | for i in range(2, 8): 69 | cell = Cell(self._C_list[i-1], self._C_list[i], self._steps_list[i], self._strides_list[i]) 70 | self.cells.append(cell) 71 | self.cells = nn.Sequential(*self.cells) 72 | 73 | # postprocess 74 | self.post = ConvBNReLU(self._C_list[7], self._C_list[8], 1, 1, 0) 75 | self.global_pooling = nn.AdaptiveAvgPool2d(1) 76 | self.classifier = nn.Linear(self._C_list[8], num_classes) 77 | 78 | self._initialize_alphas() 79 | 80 | def new(self): 81 | model_new = Network(self._C_list, self._steps_list, self._strides_list, self._num_classes, self._criterion).cuda() 82 | for x, y in zip(model_new.arch_parameters(), self.arch_parameters()): 83 | x.data.copy_(y.data) 84 | return model_new 85 | 86 | def forward(self, x): 87 | x = self.stem(x) 88 | for i, cell in enumerate(self.cells): 89 | 
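            # each cell i owns a (steps x num_ops) alpha matrix; softmax over
            # the op axis turns each row into path probabilities, and MixedOp
            # computes the weighted sum sum_k w_k * op_k(x) with those weights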
alpha = F.softmax(self._alphas_parameters[i], dim=-1) 90 | x = cell(x, alpha) 91 | x = self.post(x) 92 | x = self.global_pooling(x) 93 | logits = self.classifier(x.view(x.size(0), -1)) 94 | return logits 95 | 96 | def _loss(self, input, target): 97 | logits = self(input) 98 | return self._criterion(logits, target) 99 | 100 | def _initialize_alphas(self): 101 | num_ops = len(PRIMITIVES) 102 | 103 | # init alpha param for each mixed op 104 | self._alphas_parameters = list() 105 | for k in self._steps_list[2:8]: 106 | self._alphas_parameters.append(Variable(1e-3*torch.randn(k, num_ops).cuda(), requires_grad=True)) 107 | 108 | def arch_parameters(self): 109 | return self._alphas_parameters 110 | 111 | def genotype(self): 112 | def _parse(weights): 113 | gene = [] 114 | for i in range(len(weights)): 115 | idx = torch.argmax(weights[i][1:]) + 1 # except zero operation 116 | best = PRIMITIVES[idx] 117 | gene.append(best) 118 | return gene 119 | 120 | genotype = list() 121 | for i in range(len(self.cells)): 122 | genotype.append(_parse(F.softmax(self._alphas_parameters[i], dim=-1).data.cpu())) 123 | 124 | return genotype 125 | -------------------------------------------------------------------------------- /proxylessnas/operations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | OPS = { 5 | 'none': lambda C_in, C_out, stride, affine: Zero(C_out, stride), 6 | 'identity': lambda C_in, C_out, stride, affine: Identity() if stride == 1 else FactorizedReduce(C_in, C_out, affine), 7 | 'mbconv_3_3': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 3, stride, 1, 3, affine), 8 | 'mbconv_3_5': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 5, stride, 2, 3, affine), 9 | 'mbconv_3_7': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 7, stride, 3, 3, affine), 10 | 'mbconv_6_3': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 3, stride, 1, 6, affine), 11 | 'mbconv_6_5': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 5, stride, 2, 6, affine), 12 | 'mbconv_6_7': lambda C_in, C_out, stride, affine: MBConv(C_in, C_out, 7, stride, 3, 6, affine), 13 | } 14 | 15 | def depthwise_conv(in_channels, kernel_size, stride, groups, affine): 16 | padding = kernel_size // 2 17 | return ConvBNReLU(in_channels, in_channels, kernel_size, stride, padding, groups, affine) 18 | 19 | 20 | class ConvBNReLU(nn.Module): 21 | def __init__(self, C_in, C_out, kernel_size, stride, padding, groups=1, affine=True, activation=True): 22 | super(ConvBNReLU, self).__init__() 23 | 24 | self.conv = nn.Conv2d(C_in, C_out, kernel_size, stride, padding, groups=groups, bias=False) 25 | self.bn = nn.BatchNorm2d(C_out, affine=affine) 26 | if activation: 27 | self.act = nn.ReLU6() 28 | 29 | def forward(self, x): 30 | x = self.conv(x) 31 | x = self.bn(x) 32 | if hasattr(self, 'act'): 33 | x = self.act(x) 34 | return x 35 | 36 | 37 | class MBConv(nn.Module): 38 | def __init__(self, C_in, C_out, kernel_size, stride, padding, expansion_factor, affine=True): 39 | super(MBConv, self).__init__() 40 | 41 | C_exp = C_in * expansion_factor 42 | self.res_connect = C_in == C_out and stride == 1 43 | 44 | self.op = nn.Sequential( 45 | ConvBNReLU(C_in, C_exp, 1, 1, 0, affine=affine), 46 | depthwise_conv(C_exp, kernel_size, stride, C_exp, affine=affine), 47 | ConvBNReLU(C_exp, C_out, 1, 1, 0, activation=False, affine=affine) 48 | ) 49 | 50 | def forward(self, x): 51 | if self.res_connect: 52 | return self.op(x) + x 53 | else: 54 | return 
self.op(x) 55 | 56 | 57 | class ReLUConvBN(nn.Module): 58 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 59 | super(ReLUConvBN, self).__init__() 60 | self.op = nn.Sequential( 61 | nn.ReLU(inplace=False), 62 | nn.Conv2d(C_in, C_out, kernel_size, stride=stride, padding=padding, bias=False), 63 | nn.BatchNorm2d(C_out, affine=affine) 64 | ) 65 | 66 | def forward(self, x): 67 | return self.op(x) 68 | 69 | 70 | class DilConv(nn.Module): 71 | 72 | def __init__(self, C_in, C_out, kernel_size, stride, padding, dilation, affine=True): 73 | super(DilConv, self).__init__() 74 | self.op = nn.Sequential( 75 | nn.ReLU(inplace=False), 76 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=C_in, bias=False), 77 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 78 | nn.BatchNorm2d(C_out, affine=affine), 79 | ) 80 | 81 | def forward(self, x): 82 | return self.op(x) 83 | 84 | 85 | class SepConv(nn.Module): 86 | 87 | def __init__(self, C_in, C_out, kernel_size, stride, padding, affine=True): 88 | super(SepConv, self).__init__() 89 | self.op = nn.Sequential( 90 | nn.ReLU(inplace=False), 91 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=stride, padding=padding, groups=C_in, bias=False), 92 | nn.Conv2d(C_in, C_in, kernel_size=1, padding=0, bias=False), 93 | nn.BatchNorm2d(C_in, affine=affine), 94 | nn.ReLU(inplace=False), 95 | nn.Conv2d(C_in, C_in, kernel_size=kernel_size, stride=1, padding=padding, groups=C_in, bias=False), 96 | nn.Conv2d(C_in, C_out, kernel_size=1, padding=0, bias=False), 97 | nn.BatchNorm2d(C_out, affine=affine), 98 | ) 99 | 100 | def forward(self, x): 101 | return self.op(x) 102 | 103 | 104 | class Identity(nn.Module): 105 | 106 | def __init__(self): 107 | super(Identity, self).__init__() 108 | 109 | def forward(self, x): 110 | return x 111 | 112 | 113 | class Zero(nn.Module): 114 | 115 | def __init__(self, C_out, stride): 116 | super(Zero, self).__init__() 117 | self.stride = stride 118 | self.C_out = C_out 119 | 120 | def forward(self, x): 121 | n, _, h, w = x.size() 122 | c = self.C_out 123 | h //= self.stride 124 | w //= self.stride 125 | if x.is_cuda: 126 | with torch.cuda.device(x.get_device()): 127 | padding = torch.cuda.FloatTensor(n, c, h, w).fill_(0) 128 | else: 129 | padding = torch.zeros(n, c, h, w) 130 | padding = torch.autograd.Variable(padding, requires_grad=False) 131 | return padding 132 | 133 | class FactorizedReduce(nn.Module): 134 | 135 | def __init__(self, C_in, C_out, affine=True): 136 | super(FactorizedReduce, self).__init__() 137 | assert C_out % 2 == 0 138 | self.relu = nn.ReLU(inplace=False) 139 | self.conv_1 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 140 | self.conv_2 = nn.Conv2d(C_in, C_out // 2, 1, stride=2, padding=0, bias=False) 141 | self.bn = nn.BatchNorm2d(C_out, affine=affine) 142 | 143 | def forward(self, x): 144 | x = self.relu(x) 145 | out = torch.cat([self.conv_1(x), self.conv_2(x[:,:,1:,1:])], dim=1) 146 | out = self.bn(out) 147 | return out 148 | 149 | -------------------------------------------------------------------------------- /proxylessnas/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import numpy as np 5 | import torch 6 | import utils 7 | import logging 8 | import argparse 9 | import torch.nn as nn 10 | import genotypes 11 | import torch.utils 12 | import torchvision.datasets as dset 13 | import torch.backends.cudnn as 
cudnn 14 | 15 | from torch.autograd import Variable 16 | from model import NetworkCIFAR as Network 17 | 18 | 19 | parser = argparse.ArgumentParser("cifar") 20 | parser.add_argument('--data', type=str, default='../data', help='location of the data corpus') 21 | parser.add_argument('--batch_size', type=int, default=96, help='batch size') 22 | parser.add_argument('--report_freq', type=float, default=50, help='report frequency') 23 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 24 | parser.add_argument('--init_channels', type=int, default=36, help='num of init channels') 25 | parser.add_argument('--layers', type=int, default=20, help='total number of layers') 26 | parser.add_argument('--model_path', type=str, default='EXP/model.pt', help='path of pretrained model') 27 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 28 | parser.add_argument('--cutout', action='store_true', default=False, help='use cutout') 29 | parser.add_argument('--cutout_length', type=int, default=16, help='cutout length') 30 | parser.add_argument('--drop_path_prob', type=float, default=0.2, help='drop path probability') 31 | parser.add_argument('--seed', type=int, default=0, help='random seed') 32 | parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use') 33 | args = parser.parse_args() 34 | 35 | log_format = '%(asctime)s %(message)s' 36 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 37 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 38 | 39 | CIFAR_CLASSES = 10 40 | 41 | 42 | def main(): 43 | if not torch.cuda.is_available(): 44 | logging.info('no gpu device available') 45 | sys.exit(1) 46 | 47 | np.random.seed(args.seed) 48 | torch.cuda.set_device(args.gpu) 49 | cudnn.benchmark = True 50 | torch.manual_seed(args.seed) 51 | cudnn.enabled=True 52 | torch.cuda.manual_seed(args.seed) 53 | logging.info('gpu device = %d' % args.gpu) 54 | logging.info("args = %s", args) 55 | 56 | genotype = eval("genotypes.%s" % args.arch) 57 | model = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype) 58 | model = model.cuda() 59 | utils.load(model, args.model_path) 60 | 61 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 62 | 63 | criterion = nn.CrossEntropyLoss() 64 | criterion = criterion.cuda() 65 | 66 | _, test_transform = utils._data_transforms_cifar10(args) 67 | test_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=test_transform) 68 | 69 | test_queue = torch.utils.data.DataLoader( 70 | test_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2) 71 | 72 | model.drop_path_prob = args.drop_path_prob 73 | test_acc, test_obj = infer(test_queue, model, criterion) 74 | logging.info('test_acc %f', test_acc) 75 | 76 | 77 | def infer(test_queue, model, criterion): 78 | objs = utils.AvgrageMeter() 79 | top1 = utils.AvgrageMeter() 80 | top5 = utils.AvgrageMeter() 81 | model.eval() 82 | 83 | for step, (input, target) in enumerate(test_queue): 84 | input = Variable(input, volatile=True).cuda() 85 | target = Variable(target, volatile=True).cuda(async=True) 86 | 87 | logits, _ = model(input) 88 | loss = criterion(logits, target) 89 | 90 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 91 | n = input.size(0) 92 | objs.update(loss.data[0], n) 93 | top1.update(prec1.data[0], n) 94 | top5.update(prec5.data[0], n) 95 | 96 | if step % args.report_freq == 0: 97 | logging.info('test %03d %e %f %f', step, objs.avg, 
top1.avg, top5.avg) 98 | 99 | return top1.avg, objs.avg 100 | 101 | 102 | if __name__ == '__main__': 103 | main() 104 | 105 | -------------------------------------------------------------------------------- /proxylessnas/test_imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import torch 5 | import utils 6 | import glob 7 | import random 8 | import logging 9 | import argparse 10 | import torch.nn as nn 11 | import genotypes 12 | import torch.utils 13 | import torchvision.datasets as dset 14 | import torchvision.transforms as transforms 15 | import torch.backends.cudnn as cudnn 16 | 17 | from torch.autograd import Variable 18 | from model import NetworkImageNet as Network 19 | 20 | 21 | parser = argparse.ArgumentParser("imagenet") 22 | parser.add_argument('--data', type=str, default='../data/imagenet/', help='location of the data corpus') 23 | parser.add_argument('--batch_size', type=int, default=128, help='batch size') 24 | parser.add_argument('--report_freq', type=float, default=100, help='report frequency') 25 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 26 | parser.add_argument('--init_channels', type=int, default=48, help='num of init channels') 27 | parser.add_argument('--layers', type=int, default=14, help='total number of layers') 28 | parser.add_argument('--model_path', type=str, default='EXP/model.pt', help='path of pretrained model') 29 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 30 | parser.add_argument('--drop_path_prob', type=float, default=0, help='drop path probability') 31 | parser.add_argument('--seed', type=int, default=0, help='random seed') 32 | parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use') 33 | args = parser.parse_args() 34 | 35 | log_format = '%(asctime)s %(message)s' 36 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 37 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 38 | 39 | CLASSES = 1000 40 | 41 | 42 | def main(): 43 | if not torch.cuda.is_available(): 44 | logging.info('no gpu device available') 45 | sys.exit(1) 46 | 47 | np.random.seed(args.seed) 48 | torch.cuda.set_device(args.gpu) 49 | cudnn.benchmark = True 50 | torch.manual_seed(args.seed) 51 | cudnn.enabled=True 52 | torch.cuda.manual_seed(args.seed) 53 | logging.info('gpu device = %d' % args.gpu) 54 | logging.info("args = %s", args) 55 | 56 | genotype = eval("genotypes.%s" % args.arch) 57 | model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype) 58 | model = model.cuda() 59 | model.load_state_dict(torch.load(args.model_path)['state_dict']) 60 | 61 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 62 | 63 | criterion = nn.CrossEntropyLoss() 64 | criterion = criterion.cuda() 65 | 66 | validdir = os.path.join(args.data, 'val') 67 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 68 | valid_data = dset.ImageFolder( 69 | validdir, 70 | transforms.Compose([ 71 | transforms.Resize(256), 72 | transforms.CenterCrop(224), 73 | transforms.ToTensor(), 74 | normalize, 75 | ])) 76 | 77 | valid_queue = torch.utils.data.DataLoader( 78 | valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=4) 79 | 80 | model.drop_path_prob = args.drop_path_prob 81 | valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, criterion) 82 | 
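    # utils.AvgrageMeter and utils.accuracy are not included in this dump;
    # accuracy presumably follows the standard top-k pattern, roughly:
    #   _, pred = logits.topk(maxk, 1, True, True)
    #   correct = pred.t().eq(target.view(1, -1).expand_as(pred.t()))
    #   prec_k = correct[:k].reshape(-1).float().sum(0).mul_(100.0 / batch_size)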
logging.info('valid_acc_top1 %f', valid_acc_top1) 83 | logging.info('valid_acc_top5 %f', valid_acc_top5) 84 | 85 | 86 | def infer(valid_queue, model, criterion): 87 | objs = utils.AvgrageMeter() 88 | top1 = utils.AvgrageMeter() 89 | top5 = utils.AvgrageMeter() 90 | model.eval() 91 | 92 | for step, (input, target) in enumerate(valid_queue): 93 | input = Variable(input, volatile=True).cuda() 94 | target = Variable(target, volatile=True).cuda(async=True) 95 | 96 | logits, _ = model(input) 97 | loss = criterion(logits, target) 98 | 99 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 100 | n = input.size(0) 101 | objs.update(loss.data[0], n) 102 | top1.update(prec1.data[0], n) 103 | top5.update(prec5.data[0], n) 104 | 105 | if step % args.report_freq == 0: 106 | logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 107 | 108 | return top1.avg, top5.avg, objs.avg 109 | 110 | 111 | if __name__ == '__main__': 112 | main() 113 | -------------------------------------------------------------------------------- /proxylessnas/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import glob 5 | import numpy as np 6 | import torch 7 | import utils 8 | import logging 9 | import argparse 10 | import torch.nn as nn 11 | import genotypes 12 | import torch.utils 13 | import torchvision.datasets as dset 14 | import torch.backends.cudnn as cudnn 15 | 16 | from torch.autograd import Variable 17 | from model import NetworkCIFAR as Network 18 | 19 | 20 | parser = argparse.ArgumentParser("cifar") 21 | parser.add_argument('--data', type=str, default='../data', help='location of the data corpus') 22 | parser.add_argument('--batch_size', type=int, default=96, help='batch size') 23 | parser.add_argument('--learning_rate', type=float, default=0.025, help='init learning rate') 24 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum') 25 | parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay') 26 | parser.add_argument('--report_freq', type=float, default=50, help='report frequency') 27 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 28 | parser.add_argument('--epochs', type=int, default=600, help='num of training epochs') 29 | parser.add_argument('--init_channels', type=int, default=36, help='num of init channels') 30 | parser.add_argument('--layers', type=int, default=20, help='total number of layers') 31 | parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model') 32 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 33 | parser.add_argument('--auxiliary_weight', type=float, default=0.4, help='weight for auxiliary loss') 34 | parser.add_argument('--cutout', action='store_true', default=False, help='use cutout') 35 | parser.add_argument('--cutout_length', type=int, default=16, help='cutout length') 36 | parser.add_argument('--drop_path_prob', type=float, default=0.2, help='drop path probability') 37 | parser.add_argument('--save', type=str, default='EXP', help='experiment name') 38 | parser.add_argument('--seed', type=int, default=0, help='random seed') 39 | parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use') 40 | parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping') 41 | args = parser.parse_args() 42 | 43 | args.save = 'eval-{}-{}'.format(args.save, 
time.strftime("%Y%m%d-%H%M%S")) 44 | utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) 45 | 46 | log_format = '%(asctime)s %(message)s' 47 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 48 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 49 | fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) 50 | fh.setFormatter(logging.Formatter(log_format)) 51 | logging.getLogger().addHandler(fh) 52 | 53 | CIFAR_CLASSES = 10 54 | 55 | 56 | def main(): 57 | if not torch.cuda.is_available(): 58 | logging.info('no gpu device available') 59 | sys.exit(1) 60 | 61 | np.random.seed(args.seed) 62 | torch.cuda.set_device(args.gpu) 63 | cudnn.benchmark = True 64 | torch.manual_seed(args.seed) 65 | cudnn.enabled=True 66 | torch.cuda.manual_seed(args.seed) 67 | logging.info('gpu device = %d' % args.gpu) 68 | logging.info("args = %s", args) 69 | 70 | genotype = eval("genotypes.%s" % args.arch) 71 | model = Network(args.init_channels, CIFAR_CLASSES, args.layers, args.auxiliary, genotype) 72 | model = model.cuda() 73 | 74 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 75 | 76 | criterion = nn.CrossEntropyLoss() 77 | criterion = criterion.cuda() 78 | optimizer = torch.optim.SGD( 79 | model.parameters(), 80 | args.learning_rate, 81 | momentum=args.momentum, 82 | weight_decay=args.weight_decay 83 | ) 84 | 85 | train_transform, valid_transform = utils._data_transforms_cifar10(args) 86 | train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform) 87 | valid_data = dset.CIFAR10(root=args.data, train=False, download=True, transform=valid_transform) 88 | 89 | train_queue = torch.utils.data.DataLoader( 90 | train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=2) 91 | 92 | valid_queue = torch.utils.data.DataLoader( 93 | valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=2) 94 | 95 | scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, float(args.epochs)) 96 | 97 | for epoch in range(args.epochs): 98 | scheduler.step() 99 | logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0]) 100 | model.drop_path_prob = args.drop_path_prob * epoch / args.epochs 101 | 102 | train_acc, train_obj = train(train_queue, model, criterion, optimizer) 103 | logging.info('train_acc %f', train_acc) 104 | 105 | valid_acc, valid_obj = infer(valid_queue, model, criterion) 106 | logging.info('valid_acc %f', valid_acc) 107 | 108 | utils.save(model, os.path.join(args.save, 'weights.pt')) 109 | 110 | 111 | def train(train_queue, model, criterion, optimizer): 112 | objs = utils.AvgrageMeter() 113 | top1 = utils.AvgrageMeter() 114 | top5 = utils.AvgrageMeter() 115 | model.train() 116 | 117 | for step, (input, target) in enumerate(train_queue): 118 | input = Variable(input).cuda() 119 | target = Variable(target).cuda(async=True) 120 | 121 | optimizer.zero_grad() 122 | logits, logits_aux = model(input) 123 | loss = criterion(logits, target) 124 | if args.auxiliary: 125 | loss_aux = criterion(logits_aux, target) 126 | loss += args.auxiliary_weight*loss_aux 127 | loss.backward() 128 | nn.utils.clip_grad_norm(model.parameters(), args.grad_clip) 129 | optimizer.step() 130 | 131 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 132 | n = input.size(0) 133 | objs.update(loss.data[0], n) 134 | top1.update(prec1.data[0], n) 135 | top5.update(prec5.data[0], n) 136 | 137 | if step % args.report_freq == 0: 138 | logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, 
top5.avg) 139 | 140 | return top1.avg, objs.avg 141 | 142 | 143 | def infer(valid_queue, model, criterion): 144 | objs = utils.AvgrageMeter() 145 | top1 = utils.AvgrageMeter() 146 | top5 = utils.AvgrageMeter() 147 | model.eval() 148 | 149 | for step, (input, target) in enumerate(valid_queue): 150 | input = Variable(input, volatile=True).cuda() 151 | target = Variable(target, volatile=True).cuda(async=True) 152 | 153 | logits, _ = model(input) 154 | loss = criterion(logits, target) 155 | 156 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 157 | n = input.size(0) 158 | objs.update(loss.data[0], n) 159 | top1.update(prec1.data[0], n) 160 | top5.update(prec5.data[0], n) 161 | 162 | if step % args.report_freq == 0: 163 | logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 164 | 165 | return top1.avg, objs.avg 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | 171 | -------------------------------------------------------------------------------- /proxylessnas/train_imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import numpy as np 4 | import time 5 | import torch 6 | import utils 7 | import glob 8 | import random 9 | import logging 10 | import argparse 11 | import torch.nn as nn 12 | import genotypes 13 | import torch.utils 14 | import torchvision.datasets as dset 15 | import torchvision.transforms as transforms 16 | import torch.backends.cudnn as cudnn 17 | 18 | from torch.autograd import Variable 19 | from model import NetworkImageNet as Network 20 | 21 | 22 | parser = argparse.ArgumentParser("imagenet") 23 | parser.add_argument('--data', type=str, default='../data/imagenet/', help='location of the data corpus') 24 | parser.add_argument('--batch_size', type=int, default=128, help='batch size') 25 | parser.add_argument('--learning_rate', type=float, default=0.1, help='init learning rate') 26 | parser.add_argument('--momentum', type=float, default=0.9, help='momentum') 27 | parser.add_argument('--weight_decay', type=float, default=3e-5, help='weight decay') 28 | parser.add_argument('--report_freq', type=float, default=100, help='report frequency') 29 | parser.add_argument('--gpu', type=int, default=0, help='gpu device id') 30 | parser.add_argument('--epochs', type=int, default=250, help='num of training epochs') 31 | parser.add_argument('--init_channels', type=int, default=48, help='num of init channels') 32 | parser.add_argument('--layers', type=int, default=14, help='total number of layers') 33 | parser.add_argument('--auxiliary', action='store_true', default=False, help='use auxiliary tower') 34 | parser.add_argument('--auxiliary_weight', type=float, default=0.4, help='weight for auxiliary loss') 35 | parser.add_argument('--drop_path_prob', type=float, default=0, help='drop path probability') 36 | parser.add_argument('--save', type=str, default='EXP', help='experiment name') 37 | parser.add_argument('--seed', type=int, default=0, help='random seed') 38 | parser.add_argument('--arch', type=str, default='DARTS', help='which architecture to use') 39 | parser.add_argument('--grad_clip', type=float, default=5., help='gradient clipping') 40 | parser.add_argument('--label_smooth', type=float, default=0.1, help='label smoothing') 41 | parser.add_argument('--gamma', type=float, default=0.97, help='learning rate decay') 42 | parser.add_argument('--decay_period', type=int, default=1, help='epochs between two learning rate decays') 43 | parser.add_argument('--parallel', 
action='store_true', default=False, help='data parallelism') 44 | args = parser.parse_args() 45 | 46 | args.save = 'eval-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S")) 47 | utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py')) 48 | 49 | log_format = '%(asctime)s %(message)s' 50 | logging.basicConfig(stream=sys.stdout, level=logging.INFO, 51 | format=log_format, datefmt='%m/%d %I:%M:%S %p') 52 | fh = logging.FileHandler(os.path.join(args.save, 'log.txt')) 53 | fh.setFormatter(logging.Formatter(log_format)) 54 | logging.getLogger().addHandler(fh) 55 | 56 | CLASSES = 1000 57 | 58 | 59 | class CrossEntropyLabelSmooth(nn.Module): 60 | 61 | def __init__(self, num_classes, epsilon): 62 | super(CrossEntropyLabelSmooth, self).__init__() 63 | self.num_classes = num_classes 64 | self.epsilon = epsilon 65 | self.logsoftmax = nn.LogSoftmax(dim=1) 66 | 67 | def forward(self, inputs, targets): 68 | log_probs = self.logsoftmax(inputs) 69 | targets = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1) 70 | targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes 71 | loss = (-targets * log_probs).mean(0).sum() 72 | return loss 73 | 74 | 75 | def main(): 76 | if not torch.cuda.is_available(): 77 | logging.info('no gpu device available') 78 | sys.exit(1) 79 | 80 | np.random.seed(args.seed) 81 | torch.cuda.set_device(args.gpu) 82 | cudnn.benchmark = True 83 | torch.manual_seed(args.seed) 84 | cudnn.enabled=True 85 | torch.cuda.manual_seed(args.seed) 86 | logging.info('gpu device = %d' % args.gpu) 87 | logging.info("args = %s", args) 88 | 89 | genotype = eval("genotypes.%s" % args.arch) 90 | model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype) 91 | if args.parallel: 92 | model = nn.DataParallel(model).cuda() 93 | else: 94 | model = model.cuda() 95 | 96 | logging.info("param size = %fMB", utils.count_parameters_in_MB(model)) 97 | 98 | criterion = nn.CrossEntropyLoss() 99 | criterion = criterion.cuda() 100 | criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth) 101 | criterion_smooth = criterion_smooth.cuda() 102 | 103 | optimizer = torch.optim.SGD( 104 | model.parameters(), 105 | args.learning_rate, 106 | momentum=args.momentum, 107 | weight_decay=args.weight_decay 108 | ) 109 | 110 | traindir = os.path.join(args.data, 'train') 111 | validdir = os.path.join(args.data, 'val') 112 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 113 | train_data = dset.ImageFolder( 114 | traindir, 115 | transforms.Compose([ 116 | transforms.RandomResizedCrop(224), 117 | transforms.RandomHorizontalFlip(), 118 | transforms.ColorJitter( 119 | brightness=0.4, 120 | contrast=0.4, 121 | saturation=0.4, 122 | hue=0.2), 123 | transforms.ToTensor(), 124 | normalize, 125 | ])) 126 | valid_data = dset.ImageFolder( 127 | validdir, 128 | transforms.Compose([ 129 | transforms.Resize(256), 130 | transforms.CenterCrop(224), 131 | transforms.ToTensor(), 132 | normalize, 133 | ])) 134 | 135 | train_queue = torch.utils.data.DataLoader( 136 | train_data, batch_size=args.batch_size, shuffle=True, pin_memory=True, num_workers=4) 137 | 138 | valid_queue = torch.utils.data.DataLoader( 139 | valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=4) 140 | 141 | scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.decay_period, gamma=args.gamma) 142 | 143 | best_acc_top1 = 0 144 | for epoch in range(args.epochs): 145 | scheduler.step() 146 | logging.info('epoch %d lr %e', epoch, 
scheduler.get_lr()[0]) 147 | model.drop_path_prob = args.drop_path_prob * epoch / args.epochs 148 | 149 | train_acc, train_obj = train(train_queue, model, criterion_smooth, optimizer) 150 | logging.info('train_acc %f', train_acc) 151 | 152 | valid_acc_top1, valid_acc_top5, valid_obj = infer(valid_queue, model, criterion) 153 | logging.info('valid_acc_top1 %f', valid_acc_top1) 154 | logging.info('valid_acc_top5 %f', valid_acc_top5) 155 | 156 | is_best = False 157 | if valid_acc_top1 > best_acc_top1: 158 | best_acc_top1 = valid_acc_top1 159 | is_best = True 160 | 161 | utils.save_checkpoint({ 162 | 'epoch': epoch + 1, 163 | 'state_dict': model.state_dict(), 164 | 'best_acc_top1': best_acc_top1, 165 | 'optimizer' : optimizer.state_dict(), 166 | }, is_best, args.save) 167 | 168 | 169 | def train(train_queue, model, criterion, optimizer): 170 | objs = utils.AvgrageMeter() 171 | top1 = utils.AvgrageMeter() 172 | top5 = utils.AvgrageMeter() 173 | model.train() 174 | 175 | for step, (input, target) in enumerate(train_queue): 176 | target = target.cuda(async=True) 177 | input = input.cuda() 178 | input = Variable(input) 179 | target = Variable(target) 180 | 181 | optimizer.zero_grad() 182 | logits, logits_aux = model(input) 183 | loss = criterion(logits, target) 184 | if args.auxiliary: 185 | loss_aux = criterion(logits_aux, target) 186 | loss += args.auxiliary_weight*loss_aux 187 | 188 | loss.backward() 189 | nn.utils.clip_grad_norm(model.parameters(), args.grad_clip) 190 | optimizer.step() 191 | 192 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 193 | n = input.size(0) 194 | objs.update(loss.data[0], n) 195 | top1.update(prec1.data[0], n) 196 | top5.update(prec5.data[0], n) 197 | 198 | if step % args.report_freq == 0: 199 | logging.info('train %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 200 | 201 | return top1.avg, objs.avg 202 | 203 | 204 | def infer(valid_queue, model, criterion): 205 | objs = utils.AvgrageMeter() 206 | top1 = utils.AvgrageMeter() 207 | top5 = utils.AvgrageMeter() 208 | model.eval() 209 | 210 | for step, (input, target) in enumerate(valid_queue): 211 | input = Variable(input, volatile=True).cuda() 212 | target = Variable(target, volatile=True).cuda(async=True) 213 | 214 | logits, _ = model(input) 215 | loss = criterion(logits, target) 216 | 217 | prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5)) 218 | n = input.size(0) 219 | objs.update(loss.data[0], n) 220 | top1.update(prec1.data[0], n) 221 | top5.update(prec5.data[0], n) 222 | 223 | if step % args.report_freq == 0: 224 | logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg) 225 | 226 | return top1.avg, top5.avg, objs.avg 227 | 228 | 229 | if __name__ == '__main__': 230 | main() 231 | -------------------------------------------------------------------------------- /proxylessnas/train_search.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | import glob 5 | import numpy as np 6 | import torch 7 | import utils 8 | import logging 9 | import argparse 10 | import torch.nn as nn 11 | import torch.utils 12 | import torch.nn.functional as F 13 | import torchvision.datasets as dset 14 | import torch.backends.cudnn as cudnn 15 | 16 | from torch.autograd import Variable 17 | from model_search import Network 18 | from architect import Architect 19 | 20 | # arguments 21 | parser = argparse.ArgumentParser("cifar") 22 | parser.add_argument('--data', type=str, default='../data', help='location of the data 
parser.add_argument('--batch_size', type=int, default=64, help='batch size')
parser.add_argument('--learning_rate', type=float, default=0.001, help='init learning rate')
parser.add_argument('--learning_rate_min', type=float, default=0.001, help='min learning rate')
parser.add_argument('--momentum', type=float, default=0.9, help='momentum')
parser.add_argument('--weight_decay', type=float, default=3e-4, help='weight decay')
parser.add_argument('--report_freq', type=float, default=50, help='report frequency')
parser.add_argument('--gpu', type=int, default=0, help='gpu device id')
parser.add_argument('--epochs', type=int, default=50, help='num of training epochs')
parser.add_argument('--init_channels', type=int, default=16, help='num of init channels')
parser.add_argument('--layers', type=int, default=8, help='total number of layers')
parser.add_argument('--model_path', type=str, default='saved_models', help='path to save the model')
parser.add_argument('--cutout', action='store_true', default=False, help='use cutout')
parser.add_argument('--cutout_length', type=int, default=16, help='cutout length')
parser.add_argument('--drop_path_prob', type=float, default=0.3, help='drop path probability')
parser.add_argument('--save', type=str, default='EXP', help='experiment name')
parser.add_argument('--seed', type=int, default=2, help='random seed')
parser.add_argument('--grad_clip', type=float, default=5, help='gradient clipping')
parser.add_argument('--train_portion', type=float, default=0.5, help='portion of training data')
parser.add_argument('--unrolled', action='store_true', default=False, help='use one-step unrolled validation loss')
parser.add_argument('--arch_learning_rate', type=float, default=3e-4, help='learning rate for arch encoding')
parser.add_argument('--arch_weight_decay', type=float, default=1e-3, help='weight decay for arch encoding')
args = parser.parse_args()

# create search result directory
args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
utils.create_exp_dir(args.save, scripts_to_save=glob.glob('*.py'))

# set logger
log_format = '%(asctime)s %(message)s'
logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
fh.setFormatter(logging.Formatter(log_format))
logging.getLogger().addHandler(fh)

# number of classes (ImageNet)
CLASSES = 1000
# model config: per-stage output channels, block repeats, and strides
channels = [32, 16, 24, 32, 64, 96, 160, 320, 1280]
steps = [1, 1, 2, 3, 4, 3, 3, 1, 1]
strides = [2, 1, 2, 2, 2, 1, 2, 1, 1]


# main
def main():
    # check gpu is available
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    # init seeds and cudnn
    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    # criterion, model, and optimizer for weight training
    criterion = nn.CrossEntropyLoss()  # TODO add latency loss
    criterion = criterion.cuda()
    model = Network(channels, steps, strides, CLASSES, criterion)
    model = model.cuda()
    logging.info("param size = %fMB", utils.count_parameters_in_MB(model))
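
    # NOTE (hedged sketch): the latency term from proxylessnas/latencyloss.py is
    # not wired in yet (see the TODO above and the Readme plan). Going by the
    # constructor and call used in test/test_latencyloss.py, it could plausibly
    # be combined with the cross-entropy like this; lambda_lat is a hypothetical
    # weighting hyperparameter, not an existing flag:
    #
    #     from latencyloss import LatnecyLoss
    #     latency_criterion = LatnecyLoss(channels[2:8], steps[2:8], strides[2:8]).cuda()
    #     ...
    #     loss = criterion(logits, target) + lambda_lat * latency_criterion(model.arch_parameters())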

    optimizer = torch.optim.SGD(
        model.parameters(),
        args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)

    # prepare datasets
    #train_transform, valid_transform = utils._data_transforms_cifar10(args)
    #train_data = dset.CIFAR10(root=args.data, train=True, download=True, transform=train_transform)

    # note: torchvision cannot download ImageNet automatically anymore;
    # the dataset must already be present under args.data
    train_transform, valid_transform = utils._data_transforms_imagenet(args)
    train_data = dset.ImageNet(root=args.data, split='train', transform=train_transform)
    valid_data = dset.ImageNet(root=args.data, split='val', transform=valid_transform)

    num_train = len(train_data)
    #indices = list(range(num_train))
    #split = int(np.floor(args.train_portion * num_train))

    # create dataloaders
    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size,
        #sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=2)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.batch_size,
        #sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=2)

    # cosine annealing learning rate scheduler
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.learning_rate_min)

    # architect handles the architecture parameter (alpha) updates
    architect = Architect(model, args)

    # training loop
    for epoch in range(args.epochs):
        # lr update
        scheduler.step()
        lr = scheduler.get_lr()[0]
        logging.info('epoch %d lr %e', epoch, lr)

        # log the current genotype and the softmaxed alphas
        genotype = model.genotype()
        logging.info('genotype = %s', genotype)

        for alpha in model.arch_parameters():
            print(F.softmax(alpha, dim=-1).data)

        # weight and alpha training
        train_acc, train_obj = train(train_queue, valid_queue, model, architect, criterion, optimizer, lr)
        logging.info('train_acc %f', train_acc)

        # validation
        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)

        utils.save(model, os.path.join(args.save, 'weights.pt'))


# training
def train(train_queue, valid_queue, model, architect, criterion, optimizer, lr):
    # init metrics
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    for step, (input, target) in enumerate(train_queue):
        # set model to train mode
        model.train()
        # batch size
        n = input.size(0)

        # move input and target to gpu
        input = input.cuda()
        target = target.cuda(non_blocking=True)

        # draw a minibatch from the validation set for the architecture update
        # (re-creating the iterator each step restarts the loader)
        input_search, target_search = next(iter(valid_queue))
        input_search = input_search.cuda()
        target_search = target_search.cuda(non_blocking=True)

        # model weight update
        optimizer.zero_grad()
        logits = model(input)
        loss = criterion(logits, target)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # architecture parameter (alpha) update
        architect.step(input, target, input_search, target_search, lr, optimizer, unrolled=args.unrolled)
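        # (unrolled=True uses the one-step unrolled validation loss of
        # second-order DARTS; False uses the cheaper first-order approximation)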

        # update metrics with trained model accuracy
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.item(), n)
        top1.update(prec1.item(), n)
        top5.update(prec5.item(), n)

        if step % args.report_freq == 0:
            logging.info('train %05d %e %f %f', step, objs.avg, top1.avg, top5.avg)

    return top1.avg, objs.avg


def infer(valid_queue, model, criterion):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()
    model.eval()

    with torch.no_grad():  # no gradients needed during evaluation
        for step, (input, target) in enumerate(valid_queue):
            input = input.cuda()
            target = target.cuda(non_blocking=True)

            logits = model(input)
            loss = criterion(logits, target)

            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            n = input.size(0)
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)

            if step % args.report_freq == 0:
                logging.info('valid %03d %e %f %f', step, objs.avg, top1.avg, top5.avg)

    return top1.avg, objs.avg


if __name__ == '__main__':
    main()
-------------------------------------------------------------------------------- /proxylessnas/utils.py: --------------------------------------------------------------------------------
import os
import numpy as np
import torch
import shutil
import torchvision.transforms as transforms


class AvgrageMeter(object):
    """Computes and stores a running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape (not view): correct[:k] is non-contiguous after the transpose
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


class Cutout(object):
    def __init__(self, length):
        self.length = length

    def __call__(self, img):
        h, w = img.size(1), img.size(2)
        mask = np.ones((h, w), np.float32)
        y = np.random.randint(h)
        x = np.random.randint(w)

        y1 = np.clip(y - self.length // 2, 0, h)
        y2 = np.clip(y + self.length // 2, 0, h)
        x1 = np.clip(x - self.length // 2, 0, w)
        x2 = np.clip(x + self.length // 2, 0, w)

        mask[y1: y2, x1: x2] = 0.
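        # the random square (clipped at the image border) is now zeroed out;
        # turn the 2-D mask into a tensor and broadcast it over all channels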
        mask = torch.from_numpy(mask)
        mask = mask.expand_as(img)
        img *= mask
        return img


def _data_transforms_cifar10(args):
    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    if args.cutout:
        train_transform.transforms.append(Cutout(args.cutout_length))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])
    return train_transform, valid_transform


def _data_transforms_imagenet(args):
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]

    train_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    if args.cutout:
        train_transform.transforms.append(Cutout(args.cutout_length))

    valid_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    return train_transform, valid_transform


def count_parameters_in_MB(model):
    return sum(np.prod(v.size()) for name, v in model.named_parameters() if "auxiliary" not in name) / 1e6


def save_checkpoint(state, is_best, save):
    filename = os.path.join(save, 'checkpoint.pth.tar')
    torch.save(state, filename)
    if is_best:
        best_filename = os.path.join(save, 'model_best.pth.tar')
        shutil.copyfile(filename, best_filename)


def save(model, model_path):
    torch.save(model.state_dict(), model_path)


def load(model, model_path):
    model.load_state_dict(torch.load(model_path))


def drop_path(x, drop_prob):
    if drop_prob > 0.:
        keep_prob = 1. - drop_prob
        # sample a per-example keep/drop mask on the same device as x
        mask = torch.empty(x.size(0), 1, 1, 1, device=x.device).bernoulli_(keep_prob)
        x.div_(keep_prob)
        x.mul_(mask)
    return x


def create_exp_dir(path, scripts_to_save=None):
    if not os.path.exists(path):
        os.mkdir(path)
    print('Experiment dir : {}'.format(path))

    if scripts_to_save is not None:
        os.mkdir(os.path.join(path, 'scripts'))
        for script in scripts_to_save:
            dst_file = os.path.join(path, 'scripts', os.path.basename(script))
            shutil.copyfile(script, dst_file)


def binarize(probability, select=1):
    # sample `select` paths according to the given probabilities and
    # return a one-hot (or multi-hot) mask over the candidate ops
    selected = torch.multinomial(probability, select)
    binarize_array = torch.zeros_like(probability)
    binarize_array[selected] = 1
    return binarize_array
-------------------------------------------------------------------------------- /proxylessnas/visualize.py: --------------------------------------------------------------------------------
import sys
import genotypes
from graphviz import Digraph


def plot(genotype, filename):
    g = Digraph(
        format='pdf',
        edge_attr=dict(fontsize='20', fontname="times"),
        node_attr=dict(style='filled', shape='rect', align='center', fontsize='20', height='0.5', width='0.5', penwidth='2', fontname="times"),
        engine='dot')
    g.body.extend(['rankdir=LR'])

    g.node("c_{k-2}", fillcolor='darkseagreen2')
    g.node("c_{k-1}", fillcolor='darkseagreen2')
    assert len(genotype) % 2 == 0
    steps = len(genotype) // 2

    for i in range(steps):
        g.node(str(i), fillcolor='lightblue')

    for i in range(steps):
        for k in [2 * i, 2 * i + 1]:
            op, j = genotype[k]
            if j == 0:
                u = "c_{k-2}"
            elif j == 1:
                u = "c_{k-1}"
            else:
                u = str(j - 2)
            v = str(i)
            g.edge(u, v, label=op, fillcolor="gray")

    g.node("c_{k}", fillcolor='palegoldenrod')
    for i in range(steps):
        g.edge(str(i), "c_{k}", fillcolor="gray")

    g.render(filename, view=True)


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("usage:\n python {} ARCH_NAME".format(sys.argv[0]))
        sys.exit(1)

    genotype_name = sys.argv[1]
    try:
        genotype = eval('genotypes.{}'.format(genotype_name))
    except AttributeError:
        print("{} is not specified in genotypes.py".format(genotype_name))
        sys.exit(1)

    plot(genotype.normal, "normal")
    plot(genotype.reduce, "reduction")
-------------------------------------------------------------------------------- /requirements.txt: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kairos03/ProxylessNAS-Pytorch/c87b233aaffb9e38329cbb7d4fc5f5398b1312a8/requirements.txt
-------------------------------------------------------------------------------- /test/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/kairos03/ProxylessNAS-Pytorch/c87b233aaffb9e38329cbb7d4fc5f5398b1312a8/test/__init__.py
-------------------------------------------------------------------------------- /test/test_latencyloss.py: --------------------------------------------------------------------------------
import unittest

import torch
from torch.autograd import Variable

from proxylessnas.latencyloss import *
from proxylessnas.model_search import Network


class test_latencyloss(unittest.TestCase):

    def setUp(self):
        # 14x14x80-14x14x80-expand:3-kernel:5
        self.channels = [32, 16, 24, 40, 80, 96, 192, 320, 1280]
        self.steps = [1, 1, 2, 3, 4, 3, 3, 1, 1]
        self.strides = [2, 1, 2, 2, 2, 1, 2, 1, 1]
        # LatnecyLoss: class name as defined in proxylessnas/latencyloss.py
        self.loss = LatnecyLoss(self.channels[2:8], self.steps[2:8], self.strides[2:8])

    def test_find_latency(self):
        self.assertEqual(self.loss._predictor('identity_3_5_80_80_14_1'), 0)
        self.assertEqual(self.loss._predictor('mbconv_3_5_80_80_14_1'), 1.9960465116279071)

    def test_calculate_feature_map_size(self):
        self.loss._calculate_feature_map_size(112)
        self.assertEqual(self.loss.feature_maps, [112, 56, 28, 14, 14, 7])

    def test_forward(self):
        num_ops = len(PRIMITIVES)
        # init alpha parameters for each mixed op
        self._alphas_parameters = list()
        for k in self.steps[2:8]:
            self._alphas_parameters.append(Variable(1e-3 * torch.randn(k, num_ops), requires_grad=True))

        self.loss.forward(self._alphas_parameters)
-------------------------------------------------------------------------------- /test/test_operations.py: --------------------------------------------------------------------------------
import unittest
import torch

from proxylessnas import operations as op


class test_operations(unittest.TestCase):

    def test_ConvBNReLU(self):
        x = torch.randn([1, 10, 8, 8])
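        # NCHW dummy input: batch of 1, 10 channels, 8x8 spatial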

        conv = op.ConvBNReLU(10, 20, 1, 1, 0)
        out = conv(x)
        self.assertEqual(out.shape, (1, 20, 8, 8))

        conv = op.ConvBNReLU(10, 10, 1, 1, 0)
        out = conv(x)
        self.assertEqual(out.shape, (1, 10, 8, 8))

        # stride 2 halves the spatial resolution
        conv = op.ConvBNReLU(10, 5, 3, 2, 1)
        out = conv(x)
        self.assertEqual(out.shape, (1, 5, 4, 4))

    def test_depthwise_conv(self):
        x = torch.randn([1, 10, 8, 8])

        conv = op.depthwise_conv(10, 1, 1, 10, True)
        out = conv(x)
        self.assertEqual(out.shape, (1, 10, 8, 8))

    def test_MBConv(self):
        x = torch.randn([1, 10, 8, 8])

        conv = op.MBConv(10, 10, 3, 1, 1, 3)
        out = conv(x)
        self.assertEqual(out.shape, (1, 10, 8, 8))

        conv = op.MBConv(10, 20, 3, 1, 1, 3)
        out = conv(x)
        self.assertEqual(out.shape, (1, 20, 8, 8))

        conv = op.MBConv(10, 20, 3, 2, 1, 6)
        out = conv(x)
        self.assertEqual(out.shape, (1, 20, 4, 4))
-------------------------------------------------------------------------------- /test/test_utils.py: --------------------------------------------------------------------------------
import unittest

import torch
import numpy as np

from proxylessnas import utils


class test_utils(unittest.TestCase):

    def test_binarize(self):
        p = torch.tensor([0.1, 0.4, 0.4, 0.1])
        b = utils.binarize(p, 1)
        self.assertEqual(len(b), len(p))
        self.assertEqual(sum(b), 1)

        # a degenerate distribution must always select the same path
        p = torch.tensor([1., 0., 0., 0.])
        for _ in range(100):
            b = utils.binarize(p)
            self.assertTrue(torch.equal(b, p))

        p = torch.tensor([0.1, 0.4, 0.4, 0.1])
        b = utils.binarize(p, 2)
        self.assertEqual(len(b), len(p))
        self.assertEqual(sum(b), 2)
--------------------------------------------------------------------------------