├── .gitignore ├── LICENSE ├── Layer.py ├── Model.py ├── README.md ├── architecture.py ├── controllers ├── ActorCriticLSTM.py ├── Autoregressive.py ├── AutoregressiveLayer.py ├── AutoregressiveParam.py ├── EncoderDecoder.py ├── LSTM.py └── __init__.py ├── datasets ├── __init__.py ├── caltech256.py ├── cifar10.py ├── cifar100.py ├── cifar10_old.py ├── imagenet.py ├── mnist.py └── svhn.py ├── experiments ├── ar_run_layer_clean.py ├── ar_run_param_clean.py ├── bd_run_layer_clean.py ├── ed_run_layer_general.py ├── resnet_actor_critic_layer.py ├── resnet_ar_run_layer_clean.py ├── resnet_ar_run_param_clean.py └── resnet_db_run_layer_clean.py ├── model ├── __init__.py ├── cifar_new.py ├── densenet.py ├── googlenet.py ├── lenet.py ├── mnistnet.py ├── mnistnetv2.py ├── resnet.py ├── ssd.py └── vgg.py ├── rl.py ├── run.py ├── test_model.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | parent_models/* 3 | *.pyc 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /Layer.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import torch 3 | 4 | 5 | class Layer(): 6 | def __init__(self, layer): 7 | self._layer = layer 8 | self.type = getLayerType(layer) 9 | k = getattr(layer, 'kernel_size', 0) 10 | k = k[0] if type(k) is tuple else k 11 | s = getattr(layer, 'stride', 0) 12 | s = s[0] if type(s) is tuple else s 13 | o = getattr(layer, 'out_channels', 0) 14 | o = getattr(layer, 'out_features', o) 15 | p = getattr(layer, 'padding', 0) 16 | p = p[0] if type(p) is tuple else p 17 | skipstart = layer.skipstart if hasattr(layer, 'skipstart') else 0 18 | skipend = layer.skipend if hasattr(layer, 'skipend') else 0 19 | self.k = k 20 | self.s = s 21 | self.o = o 22 | self.p = p 23 | self.skipstart = skipstart 24 | self.skipend = skipend 25 | 26 | def getRepresentation(self, skipSupport=False): 27 | rep = 
[self.type, self.k, self.s, self.o, self.p] 28 | if skipSupport: 29 | rep.extend([self.skipstart, self.skipend]) 30 | return rep 31 | 32 | def toTorchTensor(self, skipSupport=False): 33 | t = torch.Tensor(self.getRepresentation(skipSupport)) 34 | t = t.unsqueeze(0) 35 | t = t.unsqueeze(0) 36 | return t 37 | -------------------------------------------------------------------------------- /Model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | class Model(nn.Module): 4 | def __init__(self, features, classifier): 5 | super(Model, self).__init__() 6 | if features: 7 | self.features = features 8 | if classifier: 9 | self.classifier = classifier 10 | 11 | def forward(self, x): 12 | x = self.features(x) 13 | x = self.classifier(x.view(x.size(0), -1)) 14 | return x 15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # N2N: Network to Network Compression using Policy Gradient Reinforcement Learning (ICLR 2018) 2 | This is the code to run the model compression algorithm described in the paper. 3 | It currently supports trained models in pytorch. If you would like to use it with a model in another deep learning framework, it would have to be converted to pytorch first. 4 | [Link to ICLR paper](https://openreview.net/pdf?id=B1hcZZ-AW) 5 | 6 | ## Dependencies 7 | There are some dependencies for running this 8 | 1. [python](https://www.python.org/) >= 2.7 9 | 2. [pytorch](http://pytorch.org/) >= 0.2 10 | 3. [torchvision](http://pytorch.org/) >= 0.19 11 | 12 | ## How to run 13 | 1. Clone this repository using 14 | ``` 15 | git clone https://github.com/anubhavashok/N2N.git 16 | ``` 17 | 2. Download teacher models from the links below 18 | 19 | 3. 
Layer removal and Layer shrinkage instructions are described below 20 | Additional detailed instructions can be found in the help menu in run.py 21 | ### Removal 22 | Here is an example command to train the layer removal policy on the cifar10 dataset using the resnet-18 model 23 | ``` 24 | python run.py removal cifar10 teacherModels/resnet18_cifar10.net --cuda True 25 | ``` 26 | 27 | ### Shrinkage 28 | NOTE: To run shrinkage, specify both teacher model and reduced model from stage1 29 | ``` 30 | python run.py shrinkage cifar10 teacherModels/resnet18_cifar10.net --model Stage1_cifar10/reduced_model1.net --cuda True 31 | ``` 32 | 33 | ## Downloading models 34 | All models can be downloaded at this [link](https://drive.google.com/drive/folders/13MMFq2trB5oEVr6yXuysjHXAoSf65sV3?usp=sharing) 35 | ### Pre-trained teacher models 36 | The teacher models are to be specified to run.py to train. 37 | ### Pre-trained student models 38 | The pre-trained student models are given to show the performance of the models described in the paper. They can be tested using test\_model.py 39 | Test using 40 | ``` 41 | python test_model.py studentModels/resnet18_cifar10.net cifar10 42 | ``` 43 | ### Pre-trained policies 44 | The pre-trained policies are specified to run the transfer learning experiments 45 | 46 | 47 | 48 | ## Experiments folder 49 | The experiments folder contains various variants of layer removal and shrinkage that were tried for the actual paper. These were mainly experiments which require substantial modifications to the main code or were used on earlier iterations of the project. 50 | They have to be moved to the main folder before being run. 51 | The following describes each experiment 52 | 1. ar\_run\_layer\_clean.py - Layer removal using the Autoregressive controller 53 | 2. ar\_run\_param\_clean.py - Layer shrinkage for **Non-ResNet** convolutional models 54 | 3. 
bd\_run\_layer\_clean.py - Layer removal for **Non-ResNet** convolutional models using the bidirectional controller 55 | 4. ed\_run\_layer\_general.py - Layer removal for **Non-ResNet** convolutional models using the encoder-decoder controller 56 | 5. resnet\_actor\_critic\_layer.py - Layer removal using the Actor-Critic controller 57 | 6. resnet\_ar\_run\_layer\_clean.py - Layer removal for **ResNet** models using the Autoregressive controller 58 | 59 | ## Citing 60 | Please use the following bibtex to cite the paper: 61 | ``` 62 | @inproceedings{ 63 | ashok2018nn, 64 | title={N2N learning: Network to Network Compression via Policy Gradient Reinforcement Learning}, 65 | author={Anubhav Ashok and Nicholas Rhinehart and Fares Beainy and Kris M. Kitani}, 66 | booktitle={International Conference on Learning Representations}, 67 | year={2018}, 68 | url={https://openreview.net/pdf?id=B1hcZZ-AW}, 69 | } 70 | ``` 71 | -------------------------------------------------------------------------------- /architecture.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils import * 3 | from Layer import * 4 | from Model import * 5 | import copy 6 | import numpy as np 7 | 8 | 9 | LINEAR_THRESHOLD = 50000 10 | 11 | def applyActionsShrinkage(m, action, inp, lookup): 12 | #if m.fixed: 13 | # return resizeToFit(Layer(m), inp) 14 | # Get representation 15 | # Perform updates 16 | _, k, s, o, p = Layer(m).getRepresentation() 17 | k = max(int(k * lookup[action[1]]), 1) if m.fixed[1] == False else k 18 | s = max(int(s * lookup[action[2]]), 1) if m.fixed[2] == False else s 19 | o = max(int(o * lookup[action[3]]), 10) if m.fixed[3] == False else o 20 | p = int(p * lookup[action[4]]) if m.fixed[4] == False else p 21 | in_channels = inp.size(1) 22 | cn = m.__class__.__name__ 23 | if cn == 'Linear': 24 | in_channels = inp.view(inp.size(0), -1).size(1) 25 | if in_channels > LINEAR_THRESHOLD or in_channels < 10: 26 | print('Linear 
layer too large') 27 | return None 28 | return resizeLayer(m, in_channels, o, kernel_size=k, stride=s, padding=p) 29 | 30 | 31 | class Architecture: 32 | def __init__(self, mode, model, datasetInputTensor, datasetName, LINEAR_THRESHOLD=50000, baseline_acc=.5, lookup=None): 33 | self.mode = mode 34 | self.model = model 35 | self.datasetInputTensor = datasetInputTensor 36 | self.a = 0 37 | self.inp = None 38 | self.LINEAR_THRESHOLD = LINEAR_THRESHOLD 39 | self.datasetName = datasetName 40 | self.parentSize = numParams(model) 41 | self.baseline_acc = baseline_acc 42 | self.lookup = lookup 43 | 44 | def traverse_removal(self, parent, m, m_name, actions): 45 | classname = m.__class__.__name__ 46 | if classname in ['Sequential', 'BasicBlock', 'Bottleneck', 'ResNet', 'VGG', 'LeNet', 'mnist_model', 'Model', 'ResNetModifiable', 'BasicBlockModifiable']: 47 | child = createParentContainer(m) 48 | for i in m._modules.keys(): 49 | if i == 'shortcut': 50 | continue 51 | res = self.traverse_removal(child, m._modules[i], i, actions) 52 | if res == None: 53 | return None 54 | if classname not in ['ResNet', 'VGG', 'LeNet', 'mnist_model', 'Model', 'ResNetModifiable']: 55 | parent.add_module(m_name, child) 56 | else: 57 | return child 58 | else: 59 | # childless layers -> we can shrink/remove these 60 | if classname == 'Linear': 61 | self.inp = self.inp.view(self.inp.size(0), -1) 62 | if self.inp.size(1) > self.LINEAR_THRESHOLD: 63 | return None 64 | # perform removal/shrinkage 65 | # if self.mode == 'removal': 66 | # add if ok 67 | # else if self.mode == 'shrinkage': 68 | # shrink layer 69 | # add if ok 70 | # else 71 | if m.fixed or actions[self.a]: 72 | m = resizeToFit(Layer(m), self.inp).cuda() 73 | self.inp = m(self.inp) 74 | parent.add_module(m_name, m) 75 | self.a += 1 76 | return True 77 | 78 | def traverse_shrinkage(self, parent, m, m_name, actions): 79 | classname = m.__class__.__name__ 80 | if classname in ['Sequential', 'BasicBlock', 'Bottleneck', 'ResNet', 'VGG', 'LeNet', 
'Model', 'ResNetModifiable', 'BasicBlockModifiable']: 81 | # Change the number of input channels of the first conv of the shortcut layer 82 | oldInp = Variable(copy.deepcopy(self.inp.data)) 83 | child = createParentContainer(m) 84 | if classname in ['BasicBlock', 'BottleNeck', 'BasicBlockModifiable']: 85 | self.fixBlockLayers(m) 86 | m = self.processBlock(actions, m, self.lookup, self.inp.size(1)).cuda() 87 | self.inp = m.layers(self.inp.cuda()) 88 | child = m 89 | else: 90 | for i in m._modules.keys(): 91 | res = self.traverse_shrinkage(child, m._modules[i], i, actions) 92 | if res == None: 93 | return None 94 | # Change the number of output channels of the last conv of the shortcut layer 95 | if classname not in ['ResNet', 'VGG', 'LeNet', 'Model', 'ResNetModifiable']: 96 | child(oldInp) 97 | parent.add_module(m_name, child) 98 | return True 99 | else: 100 | return child 101 | else: 102 | if classname == 'Linear': 103 | self.inp = self.inp.view(self.inp.size(0), -1) 104 | #print(inp.size(1)) 105 | if self.inp.size(1) > LINEAR_THRESHOLD or self.inp.size(1) < 10: 106 | print('Linear layer too large') 107 | return None 108 | action = actions[self.a][:] 109 | m = applyActionsShrinkage(m, action, self.inp, self.lookup) 110 | if m == None: 111 | return None 112 | try: 113 | self.inp = m.cuda()(self.inp) 114 | except: 115 | print('Error in model, probably because of receptive field size') 116 | return None 117 | parent.add_module(m_name, m) 118 | self.a += 1 119 | return True 120 | 121 | def processBlock(self, actions, m, lookup, input_size): 122 | finalAction = actions[self.a+len(m.layers._modules)-1][3] 123 | finalActionUsed = False 124 | 125 | secondFinalAction = actions[self.a+len(m.layers._modules)-2][3] 126 | secondFinalActionUsed = False 127 | 128 | firstConv = False 129 | secondConv = False 130 | hasShortcut = False 131 | 132 | if '0' in m.layers._modules: 133 | firstConv = True 134 | 135 | if '3' in m.layers._modules: 136 | secondConv = True 137 | 138 | if 
hasattr(m, 'shortcut') and m.shortcut != None: 139 | hasShortcut = True 140 | o = input_size 141 | if firstConv: 142 | i = input_size#m.layers._modules['0'].in_channels 143 | k = m.layers._modules['0'].kernel_size 144 | s = m.layers._modules['0'].stride 145 | o = m.layers._modules['0'].out_channels 146 | if secondConv: 147 | o = max(int(o * self.lookup[finalAction]), 10) 148 | finalActionUsed = True 149 | elif hasShortcut: 150 | o = max(int(o * self.lookup[finalAction]), 10) 151 | si = i 152 | sk = m.shortcut._modules['0'].kernel_size 153 | ss = m.shortcut._modules['0'].stride 154 | sp = m.shortcut._modules['0'].padding 155 | m.shortcut._modules['0'] = resizeLayer(m.shortcut._modules['0'], si, o, sk, ss, sp).cuda() 156 | m.shortcut._modules['1'] = resizeLayer(m.shortcut._modules['1'], o, o).cuda() 157 | finalActionUsed = True 158 | else: 159 | # We want output to be same as input in the event of no shortcut and no secondConv 160 | o = i 161 | p = m.layers._modules['0'].padding 162 | m.layers._modules['0'] = resizeLayer(m.layers._modules['0'], i, o, k, s, p).cuda() 163 | if '1' in m.layers._modules: 164 | m.layers._modules['1'] = resizeLayer(m.layers._modules['1'], o, o).cuda() 165 | if secondConv: 166 | #i = m.layers._modules['3'].in_channels if not firstConv else m.layers._modules['0'].out_channels 167 | i = o 168 | k = m.layers._modules['3'].kernel_size 169 | s = m.layers._modules['3'].stride 170 | o = m.layers._modules['3'].out_channels 171 | if hasShortcut: 172 | o = max(int(o * self.lookup[secondFinalAction]), 10) 173 | si = m.layers._modules['0'].in_channels if firstConv else i 174 | sk = m.shortcut._modules['0'].kernel_size 175 | ss = m.shortcut._modules['0'].stride 176 | sp = m.shortcut._modules['0'].padding 177 | m.shortcut._modules['0'] = resizeLayer(m.shortcut._modules['0'], si, o, sk, ss, sp).cuda() 178 | m.shortcut._modules['1'] = resizeLayer(m.shortcut._modules['1'], o, o).cuda() 179 | secondFinalActionUsed = True 180 | else: 181 | o = 
m.layers._modules['0'].in_channels if firstConv else i 182 | p = m.layers._modules['3'].padding 183 | m.layers._modules['3'] = resizeLayer(m.layers._modules['3'], i, o, k, s, p).cuda() 184 | if '4' in m.layers._modules: 185 | m.layers._modules['4'] = resizeLayer(m.layers._modules['4'], o, o).cuda() 186 | 187 | # Void actions 188 | for _ in range(len(m.layers._modules)-2): 189 | # actions[a].detach() 190 | self.a += 1 191 | 192 | #if not secondFinalActionUsed: 193 | # actions[a].detach() 194 | self.a += 1 195 | 196 | #if not finalActionUsed: 197 | # actions[a].detach() 198 | self.a += 1 199 | return m 200 | 201 | 202 | def fixLayers(self, m): 203 | # TODO: Make this function generalize to most models 204 | # We basically want to make sure at least one fc layer exists 205 | # We also want to make sure that the stride layer for downsampling does not get removed 206 | layers = flattenModule(m) 207 | # Initialize 208 | for l in layers: 209 | l.fixed = False 210 | layers[-1].fixed = True 211 | for l in layers: 212 | cn = l.__class__.__name__ 213 | if hasattr(l, 'stride') and l.stride != (1, 1) and cn == 'Conv2d': 214 | l.fixed = True 215 | if cn == 'Linear' or cn == 'AvgPool2d': 216 | l.fixed = True 217 | 218 | def fixBlockLayers(self, m): 219 | # Only allow num_filters of conv layers to change 220 | for mm in m.layers._modules.values(): 221 | mm.fixed = [True]*5 222 | m.layers._modules.values()[0].fixed = [True, True, True, False, True] 223 | #m._modules.values()[-2].fixed = [True, True, True, False, True] 224 | 225 | def fixParams(self, m): 226 | layers = flattenModule(m) 227 | # Initialize 228 | for l in layers: 229 | l.fixed = [False]*5 230 | 231 | # Fix any layers you want here 232 | # ---- 233 | # Fix all shortcut layers and corresponding stride layers, but not pre layers 234 | for l in layers: 235 | # Fix all shortcut/downsampling layers 236 | # Since we couple the action for the conv layer and this layer we can modify this when building model 237 | cn = 
l.__class__.__name__ 238 | if hasattr(l, 'stride') and l.stride != (1, 1) and cn == 'Conv2d': 239 | l.fixed = [True]*5 240 | # Fix final linear and average pooling layer 241 | if cn == 'Linear' or cn == 'AvgPool2d': 242 | l.fixed = [True]*5 243 | # ---- 244 | 245 | 246 | def generateChildModel(self, actions): 247 | m = copy.deepcopy(self.model) 248 | self.a = 0 249 | self.inp = Variable(self.datasetInputTensor.clone()).cuda() 250 | if self.mode == 'shrinkage': 251 | # Reshape actions to [Layer, Param] 252 | actions = np.reshape(actions, (-1, 5)) 253 | self.fixParams(m) 254 | newModel = self.traverse_shrinkage(None, m, None, actions) 255 | else: 256 | actions[0] = 1 257 | self.fixLayers(m) 258 | newModel = self.traverse_removal(None, m, None, actions) 259 | if newModel == None or numParams(newModel) >= self.parentSize: 260 | return None 261 | resetModel(newModel) 262 | return newModel 263 | -------------------------------------------------------------------------------- /controllers/ActorCriticLSTM.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | class LSTM(nn.Module): 4 | def __init__(self, input_size, output_size, hidden_size, num_layers, bidirectional=True): 5 | super(LSTM, self).__init__() 6 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) 7 | self.Wt_softmax = nn.Linear(num_layers*hidden_size, output_size) 8 | self.critic = nn.Linear(num_layers*hidden_size, 1) 9 | self.softmax = nn.Softmax() 10 | 11 | def forward(self, input, hx): 12 | output, hx = self.lstm(input, hx) 13 | output = output.squeeze(1) 14 | value = self.critic(output) 15 | output = self.Wt_softmax(output) 16 | probs = self.softmax(output) 17 | actions = probs.multinomial() 18 | return actions, value 19 | 20 | def reset_parameters(self): 21 | self.lstm.reset_parameters() 22 | ''' 23 | num_layers = 2 24 | inp = Variable(torch.rand(21, 1, 5)) 25 | hx = (Variable(torch.rand(num_layers*2, 1, 
10)), Variable(torch.rand(num_layers*2, 1, 10))) 26 | 27 | lstm = LSTM(5, 2, 10, 2, bidirectional=True) 28 | actions = lstm.forward(inp, hx) 29 | ''' 30 | -------------------------------------------------------------------------------- /controllers/Autoregressive.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import LSTMCell 4 | from torch.autograd import Variable 5 | 6 | class LSTMAuto(nn.Module): 7 | def __init__(self, input_size, output_size, hidden_size): 8 | super(LSTMAuto, self).__init__() 9 | self.input_size = input_size 10 | self.output_size = output_size 11 | self.hidden_size = hidden_size 12 | # Output of previous iteration appended to input 13 | self.lstmCell = LSTMCell(output_size + input_size, hidden_size) 14 | # Softmax variables 15 | self.linear = nn.Linear(hidden_size, output_size) 16 | self.softmax = nn.Softmax() 17 | 18 | def forward(self, input, hx): 19 | outputs = [] 20 | output = torch.Tensor(1, self.output_size) 21 | # Keep layer 22 | output[0][1] = 1 23 | output[0][0] = 0 24 | output = Variable(output) 25 | hn, cn = hx 26 | for i in range(len(input)): 27 | input_augmented = Variable(torch.cat([output.data, input[i].data], 1)) 28 | hn, cn = self.lstmCell(input_augmented, (hn, cn)) 29 | output = self.softmax(self.linear(hn)) 30 | outputs.append(output) 31 | return torch.stack(outputs) 32 | 33 | ''' 34 | from Autoregressive import * 35 | from torch.autograd import * 36 | inp = Variable(torch.rand(21, 1, 5)) 37 | hx = (Variable(torch.Tensor(1, 10)), Variable(torch.Tensor(1, 10))) 38 | 39 | lstm = LSTMAuto(5, 2, 10, 21) 40 | lstm(inp, hx) 41 | ''' 42 | -------------------------------------------------------------------------------- /controllers/AutoregressiveLayer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import LSTMCell 4 | from torch.autograd 
import Variable 5 | 6 | class LSTMAuto(nn.Module): 7 | def __init__(self, input_size, output_size, hidden_size, num_layers): 8 | super(LSTMAuto, self).__init__() 9 | self.input_size = input_size 10 | self.output_size = output_size 11 | self.hidden_size = hidden_size 12 | self.num_layers = num_layers 13 | # Output of previous iteration appended to input 14 | self.layers = [] 15 | input_size += 1 16 | for i in range(num_layers): 17 | self.layers.append(LSTMCell(input_size, hidden_size)) 18 | input_size = hidden_size 19 | # Softmax variables 20 | self.linear = nn.Linear(hidden_size, output_size) 21 | self.softmax = nn.Softmax() 22 | 23 | def forwardLayer(self, input, hn, cn, layer): 24 | return (hn, cn) 25 | 26 | def forwardLayers(self, input, hns, cns, layers): 27 | new_hns = [] 28 | new_cns = [] 29 | (hn, cn) = layers[0](input, (hns[0], cns[0])) 30 | new_hns.append(hn) 31 | new_cns.append(cn) 32 | #hns[0], cns[0] = hn, cn 33 | for i in range(1, len(layers)): 34 | (hn, cn) = layers[i](hn, (hns[i], cns[i])) 35 | new_hns.append(hn) 36 | new_cns.append(cn) 37 | #hns[i] = hn 38 | #cns[i] = cn 39 | return hn, (new_hns, new_cns) 40 | 41 | def forward(self, input, hx): 42 | outputs = [] 43 | output = torch.Tensor(1, 1) 44 | # Keep layer 45 | output[0][0] = 1 46 | output = Variable(output) 47 | hns, cns = hx 48 | for i in range(len(input)): 49 | input_augmented = Variable(torch.cat([output.data.float(), input[i].data], 1)) 50 | output, (hns, cns) = self.forwardLayers(input_augmented, hns, cns, self.layers) 51 | probs = self.softmax(self.linear(output)) 52 | output = probs.multinomial() 53 | outputs.append(output) 54 | return outputs 55 | 56 | ''' 57 | from Autoregressive import * 58 | from torch.autograd import * 59 | inp = Variable(torch.rand(21, 1, 5)) 60 | hx = ([Variable(torch.Tensor(1, 10))]*num_layers, [Variable(torch.Tensor(1, 10))]*num_layers) 61 | 62 | lstm = LSTMAuto(5, 2, 10, 2) 63 | lstm(inp, hx) 64 | ''' 65 | 
-------------------------------------------------------------------------------- /controllers/AutoregressiveParam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import LSTMCell 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | class LSTMAutoParams(nn.Module): 8 | def __init__(self, input_size, output_size, hidden_size, num_layers, lookup): 9 | super(LSTMAutoParams, self).__init__() 10 | self.input_size = input_size 11 | self.output_size = output_size 12 | self.hidden_size = hidden_size 13 | self.num_layers = num_layers 14 | self.lookup = lookup 15 | # Output of previous iteration appended to input 16 | self.layers = [] 17 | for i in range(num_layers): 18 | self.layers.append(LSTMCell(input_size, hidden_size)) 19 | input_size = hidden_size 20 | # Softmax variables 21 | self.linear = nn.Linear(hidden_size, output_size) 22 | self.softmax = nn.Softmax() 23 | 24 | def forwardLayers(self, input, hns, cns, layers): 25 | new_hns = [] 26 | new_cns = [] 27 | (hn, cn) = layers[0](input, (hns[0], cns[0])) 28 | new_hns.append(hn) 29 | new_cns.append(cn) 30 | for i in range(1, len(layers)): 31 | (hn, cn) = layers[i](hn, (hns[i], cns[i])) 32 | new_hns.append(hn) 33 | new_cns.append(cn) 34 | return hn, (new_hns, new_cns) 35 | 36 | def forward(self, input, hx): 37 | actions = [] 38 | output = torch.Tensor(1, self.output_size) 39 | # Keep layer 40 | hns, cns = hx 41 | action = Variable(torch.Tensor(1)) 42 | action[0] = 0 43 | for i in range(len(input)): 44 | input_augmented = input[i] 45 | for j in range(self.input_size): 46 | # incorporate previous decision into input[i][j] 47 | output, (hns, cns) = self.forwardLayers(input_augmented, hns, cns, self.layers) 48 | output = self.softmax(self.linear(output)) 49 | action = output.squeeze(1).multinomial() 50 | actions.append(action) 51 | # Update input_augmented for next iteration 52 | intAction = 
int(action.data.numpy()) 53 | mult = np.ones((self.input_size)).astype(np.float32) 54 | # Don't change type here 55 | mult[j] = 1 if j == 0 else self.lookup[intAction] 56 | input_augmented = Variable(input_augmented.data * torch.from_numpy(mult)) 57 | return actions 58 | 59 | 60 | ''' 61 | import random 62 | from AutoregressiveParam import * 63 | import numpy as np 64 | 65 | inp = np.zeros((21, 1, 5)) 66 | 67 | for i in range(inp.shape[0]): 68 | r = random.random() 69 | num = 1/(1 + pow(np.e, r)) 70 | for j in range(inp.shape[2]): 71 | inp[i][0][j] = num 72 | 73 | num_layers = 2 74 | 75 | inp = Variable(torch.from_numpy(inp.astype(np.float32))) 76 | hx = ([Variable(torch.Tensor(1, 10))]*num_layers, [Variable(torch.Tensor(1, 10))]*num_layers) 77 | 78 | lookup = [1, 0] 79 | lstm = LSTMAutoParams(5, 2, 10, num_layers, lookup) 80 | lstm(inp, hx) 81 | ''' 82 | -------------------------------------------------------------------------------- /controllers/EncoderDecoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import LSTM, LSTMCell 4 | from torch.autograd import Variable 5 | 6 | class EncoderLSTM(nn.Module): 7 | ''' 8 | Based on encoder in Sequence2Sequence 9 | In the Encoder, we take in a sequence of inputs 10 | E.g. [Layer1, Layer2, Layer3...] or [Layer1.param1, Layer1.param2, Layer2.param1, ...] 
11 | We return JUST the final hidden state after the LSTM has processed this sequence 12 | The idea is that the LSTM encodes the whole sequence in the hidden state 13 | We then pass the output of this to the Decoder to generate our sequence of actions 14 | 15 | NOTE: input to this is the reverse of sequence of inputs since its shown to perform better 16 | ''' 17 | def __init__(self, input_size, hidden_size, seq_len, num_layers=1, bias=True, dropout=0, bidirectional=False): 18 | super(EncoderLSTM, self).__init__() 19 | self.input_size = input_size 20 | self.hidden_size = hidden_size 21 | self.num_layers = num_layers 22 | self.bias = bias 23 | self.dropout = dropout 24 | self.dropout_state = {} 25 | self.bidirectional = bidirectional 26 | self.seq_len = seq_len 27 | num_directions = 2 if bidirectional else 1 28 | 29 | self.lstm = LSTM(input_size, hidden_size, num_layers) 30 | 31 | def forward(self, input): 32 | # here input is some input sequence 33 | # define some initial hidden state 34 | hx = (Variable(torch.zeros(self.num_layers, 1, self.hidden_size)), Variable(torch.zeros(self.num_layers, 1, self.hidden_size))) 35 | output, hx = self.lstm(input, hx) 36 | return hx 37 | 38 | 39 | class DecoderLSTM(nn.Module): 40 | ''' 41 | Based on decoder in Sequence2Sequence 42 | In the Decoder, we take in a hidden state that is the output of the encoder 43 | Since this is autoregressive, we first generate an input to the LSTM which is the same size as the output 44 | This is done using the softmax layer 45 | Since the hidden states of the encoder and decoder are of the same dimension, this will work 46 | We then generate T outputs, where T is the length of our sequence (i.e. 
action) 47 | Since this is autoregressive, the input to each iteration is the softmax output of the previous one 48 | ''' 49 | def __init__(self, output_size, hidden_size, seq_len, num_layers=1, bias=True, dropout=0, bidirectional=False): 50 | super(DecoderLSTM, self).__init__() 51 | self.output_size = output_size 52 | self.hidden_size = hidden_size 53 | self.num_layers = num_layers 54 | self.bias = bias 55 | self.dropout = dropout 56 | self.dropout_state = {} 57 | self.bidirectional = bidirectional 58 | self.seq_len = seq_len 59 | num_directions = 2 if bidirectional else 1 60 | 61 | self.lstm = LSTMCell(output_size, hidden_size) 62 | self.linear = nn.Linear(hidden_size, output_size) 63 | self.softmax = nn.Softmax() 64 | 65 | def forward(self, hx): 66 | outputs = [] 67 | # Convert hidden state of encoder into a input 68 | hx = (hx[0].squeeze(0), hx[1].squeeze(0)) 69 | h = hx[0] 70 | input = self.softmax(self.linear(h)) 71 | for i in range(self.seq_len): 72 | h, c = self.lstm(input, hx) 73 | hx = (h, c) 74 | # do softmax on output 75 | output = self.softmax(self.linear(h)) 76 | outputs.append(output) 77 | input = output 78 | return torch.stack(outputs) 79 | 80 | 81 | class EncoderDecoderLSTM(nn.Module): 82 | def __init__(self, input_size, output_size, hidden_size, seq_len): 83 | super(EncoderDecoderLSTM, self).__init__() 84 | self.encoder = EncoderLSTM(input_size, hidden_size, seq_len) 85 | self.decoder = DecoderLSTM(output_size, hidden_size, seq_len) 86 | 87 | def forward(self, input): 88 | h = self.encoder.forward(input) 89 | outputs = self.decoder.forward(h) 90 | return outputs 91 | 92 | # from EncoderDecoder import * 93 | # from torch.autograd import Variable 94 | # import torch 95 | # ed = EncoderDecoderLSTM(5, 2, 100, 21) 96 | # outputs = ed.forward(Variable(torch.rand(21, 1, 5))) 97 | # actions = torch.stack([o.multinomial() for o in outputs]) 98 | -------------------------------------------------------------------------------- /controllers/LSTM.py: 
-------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | class LSTM(nn.Module): 4 | def __init__(self, input_size, output_size, hidden_size, num_layers, bidirectional=True): 5 | super(LSTM, self).__init__() 6 | self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bidirectional=bidirectional) 7 | self.Wt_softmax = nn.Linear(num_layers*hidden_size, output_size) 8 | self.softmax = nn.Softmax() 9 | 10 | def forward(self, input, hx): 11 | output, hx = self.lstm(input, hx) 12 | output = output.squeeze(1) 13 | output = self.Wt_softmax(output) 14 | probs = self.softmax(output) 15 | actions = probs.multinomial() 16 | return actions 17 | 18 | def reset_parameters(self): 19 | self.lstm.reset_parameters() 20 | ''' 21 | num_layers = 2 22 | inp = Variable(torch.rand(21, 1, 5)) 23 | hx = (Variable(torch.rand(num_layers*2, 1, 10)), Variable(torch.rand(num_layers*2, 1, 10))) 24 | 25 | lstm = LSTM(5, 2, 10, 2, bidirectional=True) 26 | actions = lstm.forward(inp, hx) 27 | ''' 28 | -------------------------------------------------------------------------------- /controllers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anubhavashok/N2N/544f5dd6c9c023c81b9c7b8ff5c8ccc1c895c66d/datasets/__init__.py -------------------------------------------------------------------------------- /datasets/caltech256.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils import data 3 | from torch import nn 4 | from torchvision import transforms 5 | from torch.autograd import Variable 6 | from os.path import join 7 | from PIL import Image 8 | from glob import glob 9 | import numpy as np 10 | from torch import 
class Caltech256(data.Dataset):
    """Caltech-256 image dataset with a seeded per-class train/test split.

    Every sub-directory of ``path`` is treated as one class. A fixed number
    of files per class is held out for the test split; the rest form the
    training split.

    Args:
        split: ``"train"`` selects the training files; any other value
            selects the held-out test files.
        path: directory containing one sub-directory per class.
    """

    def __init__(self, split='train', path='/home/anubhava/ArchSearch/data/caltech256/256_ObjectCategories'):
        super(Caltech256, self).__init__()
        self.path = path
        self.split = split
        train_paths, test_paths = self.get_splits(self.path, 1001)
        self.filepaths = train_paths if split == "train" else test_paths
        # Class names in glob order; a sample's label is its index in this list.
        self.targets = [f.split('/')[-1] for f in glob(join(self.path, '*'))]

    def get_splits(self, base_path, seed=1000, test_per_class=15):
        """Split the files of every class into train and test lists.

        Args:
            base_path: directory containing one sub-directory per class.
            seed: seed for numpy's global random state.
            test_per_class: number of files held out per class for testing.

        Returns:
            ``(train_files, test_files)`` lists of file paths.
        """
        np.random.seed(seed)
        train_files = []
        test_files = []
        classes = [f.split('/')[-1] for f in glob(join(base_path, '*'))]
        for c in classes:
            files = glob(join(base_path, c, '*'))
            num_train = len(files) - test_per_class
            if num_train < 0 or not files:
                # Too few files to hold out a full test set (or a stray
                # non-directory entry): np.random.choice would reject the
                # negative size, so keep everything for testing instead.
                test_files.extend(files)
                continue
            train = np.random.choice(files, num_train, replace=False)
            chosen = set(train)  # O(1) membership instead of an array scan
            train_files.extend(train)
            test_files.extend(f for f in files if f not in chosen)
        return train_files, test_files

    def __getitem__(self, index):
        """Return ``(image, target)`` for the file at ``index``.

        The image goes through the module-level ``load_img`` and
        ``img_transform``; the target is a one-element tensor holding the
        index of the file's parent directory name in ``self.targets``.
        """
        filepath = self.filepaths[index]
        img = img_transform(load_img(filepath))
        target = torch.Tensor([self.targets.index(filepath.split('/')[-2])])
        return img, target

    def __len__(self):
        """Number of files in the selected split."""
        return len(self.filepaths)
def test():
    """Evaluate the module-level ``net`` on ``test_loader``.

    Prints the number of correct predictions and the accuracy in percent.

    Returns:
        Accuracy as a fraction (the printed percentage divided by 100).
    """
    net.eval()
    correct = 0
    for data, target in test_loader:
        data, target = Variable(data, volatile=True), Variable(target.long().squeeze())
        # Honour the module's ``cuda`` switch (the loader kwargs already do)
        # instead of moving to the GPU unconditionally.
        if cuda:
            data, target = data.cuda(), target.cuda()

        # Forward pass; the prediction is the index of the highest score.
        score = net.forward(data)
        pred = score.data.max(1)[1]
        correct += pred.eq(target.data).cpu().sum()

    total = len(test_loader.dataset)
    print("predicted {} out of {}".format(correct, total))
    val_accuracy = correct / float(total) * 100.0
    print("accuracy = {:.2f}".format(val_accuracy))
    return val_accuracy/100.0
def paramsforepoch(epoch):
    """Return the learning rate and weight decay scheduled for ``epoch``.

    Args:
        epoch: 1-based epoch number.

    Returns:
        Dict with keys ``'learning_rate'`` and ``'weight_decay'``; empty when
        ``epoch`` falls outside every regime.
    """
    # Each row: [first_epoch, last_epoch, learning_rate, weight_decay],
    # with both epoch bounds inclusive.
    regimes = [[1, 18, 5e-3, 5e-4],
               [19, 29, 1e-3, 5e-4],
               [30, 43, 5e-4, 5e-4],
               [44, 52, 1e-4, 0],
               [53, 1e8, 1e-5, 0]]
    p = dict()
    for first, last, learning_rate, weight_decay in regimes:
        if first <= epoch <= last:
            p['learning_rate'] = learning_rate
            p['weight_decay'] = weight_decay
    return p


avg_loss = list()
best_accuracy = 0.0


def adjustlrwd(params):
    """Apply ``params`` to every parameter group of the module-level optimizer.

    Args:
        params: dict with ``'learning_rate'`` and ``'weight_decay'`` entries,
            as produced by ``paramsforepoch``.
    """
    # Update the live groups. ``optimizer.state_dict()['param_groups']`` holds
    # copies, so writing to those never reached the optimizer.
    for param_group in optimizer.param_groups:
        param_group['lr'] = params['learning_rate']
        param_group['weight_decay'] = params['weight_decay']
def test():
    """Score the module-level ``net`` on ``test_loader``.

    Prints the correct-prediction count and the accuracy in percent, then
    returns the accuracy as a fraction.
    """
    global best_accuracy
    net.eval()
    num_correct = 0
    for batch, labels in test_loader:
        if cuda:
            batch = batch.cuda()
            labels = labels.cuda()
        batch = Variable(batch, volatile=True)
        labels = Variable(labels)

        # The predicted class is the index of the largest score per row.
        scores = net.forward(batch)
        predicted = scores.data.max(1)[1]
        num_correct += predicted.eq(labels.data).cpu().sum()

    num_examples = len(test_loader.dataset)
    print("predicted {} out of {}".format(num_correct, num_examples))
    val_accuracy = num_correct / float(num_examples) * 100.0
    print("accuracy = {:.2f}".format(val_accuracy))
    return val_accuracy/100.0
def paramsforepoch(epoch):
    """Look up the scheduled learning rate and weight decay for ``epoch``.

    Args:
        epoch: 1-based epoch number.

    Returns:
        Dict with ``'learning_rate'`` and ``'weight_decay'``; empty if no
        regime covers ``epoch``.
    """
    # Rows are [first_epoch, last_epoch, learning_rate, weight_decay];
    # the epoch bounds are inclusive.
    regimes = [[1, 18, 5e-3, 5e-4],
               [19, 29, 1e-3, 5e-4],
               [30, 43, 5e-4, 5e-4],
               [44, 52, 1e-4, 0],
               [53, 1e8, 1e-5, 0]]
    p = dict()
    for first, last, learning_rate, weight_decay in regimes:
        if first <= epoch <= last:
            p['learning_rate'] = learning_rate
            p['weight_decay'] = weight_decay
    return p


avg_loss = list()
best_accuracy = 0.0


def adjustlrwd(params):
    """Write the scheduled hyper-parameters into the module-level optimizer.

    Args:
        params: dict with ``'learning_rate'`` and ``'weight_decay'`` entries.
    """
    # ``state_dict()`` returns copies of the parameter groups, so the schedule
    # has to be written to ``optimizer.param_groups`` to take effect.
    for param_group in optimizer.param_groups:
        param_group['lr'] = params['learning_rate']
        param_group['weight_decay'] = params['weight_decay']
def _test(skip_batches=73):
    """Evaluate ``net`` on the tail of ``test_loader``.

    Args:
        skip_batches: number of leading batches to ignore. The default keeps
            the value that used to be hard-coded.

    Returns:
        Accuracy in percent over the evaluated batches, or ``0.0`` when every
        batch was skipped.
    """
    test_correct = 0
    total_examples = 0
    for idx, (data, target) in enumerate(test_loader):
        if idx < skip_batches:
            continue
        total_examples += len(target)
        data, target = Variable(data), Variable(target)
        if cuda:
            data, target = data.cuda(), target.cuda()

        scores = net(data)
        pred = scores.data.max(1)[1]
        test_correct += pred.eq(target.data).cpu().sum()
    print("Predicted {} out of {} correctly".format(test_correct, total_examples))
    if total_examples == 0:
        # The loader had no more than ``skip_batches`` batches; the final
        # division used to raise ZeroDivisionError in that case.
        return 0.0
    return 100.0 * test_correct / (float(total_examples))
def paramsforepoch(epoch):
    """Return the hyper-parameters of the training regime containing ``epoch``.

    Args:
        epoch: 1-based epoch number.

    Returns:
        Dict with ``'learning_rate'`` and ``'weight_decay'``; empty when no
        regime contains ``epoch``.
    """
    # [first_epoch, last_epoch, learning_rate, weight_decay]; bounds inclusive.
    regimes = [[1, 18, 5e-3, 5e-4],
               [19, 29, 1e-3, 5e-4],
               [30, 43, 5e-4, 5e-4],
               [44, 52, 1e-4, 0],
               [53, 1e8, 1e-5, 0]]
    p = dict()
    for first, last, learning_rate, weight_decay in regimes:
        if first <= epoch <= last:
            p['learning_rate'] = learning_rate
            p['weight_decay'] = weight_decay
    return p


avg_loss = list()
best_accuracy = 0.0


def adjustlrwd(params):
    """Set learning rate and weight decay on the module-level optimizer.

    Args:
        params: dict with ``'learning_rate'`` and ``'weight_decay'`` entries.
    """
    # Mutate the optimizer's own groups; the groups inside ``state_dict()``
    # are copies and changing them has no effect on training.
    for param_group in optimizer.param_groups:
        param_group['lr'] = params['learning_rate']
        param_group['weight_decay'] = params['weight_decay']
def test():
    """Run ``net`` over ``test_loader`` and report classification accuracy.

    Returns:
        Accuracy as a fraction; the percentage is printed along the way.
    """
    global best_accuracy
    net.eval()
    hits = 0
    for inputs, labels in test_loader:
        if cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        inputs = Variable(inputs, volatile=True)
        labels = Variable(labels)

        # Index of the maximum score along dimension 1 is the prediction.
        outputs = net.forward(inputs)
        guesses = outputs.data.max(1)[1]
        hits += guesses.eq(labels.data).cpu().sum()

    dataset_size = len(test_loader.dataset)
    print("predicted {} out of {}".format(hits, dataset_size))
    val_accuracy = hits / float(dataset_size) * 100.0
    print("accuracy = {:.2f}".format(val_accuracy))
    return val_accuracy/100.0
parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='percentage of past parameters to store') 17 | parser.add_argument('--no-cuda', action='store_true', default=False, help='use cuda for training') 18 | parser.add_argument('--log-schedule', type=int, default=10, metavar='N', help='number of epochs to save snapshot after') 19 | parser.add_argument('--seed', type=int, default=1, help='set seed to some constant value to reproduce experiments') 20 | parser.add_argument('--model_name', type=str, default=None, help='Use a pretrained model') 21 | parser.add_argument('--want_to_test', type=bool, default=False, help='make true if you just want to test') 22 | 23 | args = parser.parse_args() 24 | args.cuda = not args.no_cuda and torch.cuda.is_available() 25 | 26 | torch.manual_seed(args.seed) 27 | if args.cuda: 28 | print('Using cuda') 29 | torch.cuda.manual_seed(args.seed) 30 | #torch.cuda.set_device(1) 31 | 32 | kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} 33 | normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], 34 | std=[0.229, 0.224, 0.225]) 35 | traindir = '/mnt/sdb1/anubhava/ILSVRC2012_img_train' 36 | valdir = '/mnt/sdb1/anubhava/ILSVRC2012_img_val' 37 | 38 | train_loader = torch.utils.data.DataLoader( 39 | datasets.ImageFolder(traindir, transforms.Compose([ 40 | transforms.RandomSizedCrop(224), 41 | transforms.RandomHorizontalFlip(), 42 | transforms.ToTensor(), 43 | normalize, 44 | ])), 45 | batch_size=args.batch_size, shuffle=True, 46 | **kwargs) 47 | 48 | test_loader = torch.utils.data.DataLoader( 49 | datasets.ImageFolder(valdir, transforms.Compose([ 50 | transforms.Scale(256), 51 | transforms.CenterCrop(224), 52 | transforms.ToTensor(), 53 | normalize, 54 | ])), 55 | batch_size=args.batch_size, shuffle=False, 56 | **kwargs) 57 | 58 | 59 | avg_loss = list() 60 | best_accuracy = 0.0 61 | 62 | # train the network 63 | optimizer = None 64 | def train(epoch): 65 | global optimizer 66 | if epoch == 1: 67 | 
#optimizer = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=5e-4) 68 | optimizer = optim.Adam(net.parameters(), lr=args.learning_rate, weight_decay=1e-4) 69 | 70 | global avg_loss 71 | correct = 0 72 | net.train() 73 | for b_idx, (data, targets) in enumerate(train_loader): 74 | if b_idx >= 250000: 75 | break 76 | if args.cuda: 77 | data, targets = data.cuda(), targets.cuda() 78 | # convert the data and targets into Variable and cuda form 79 | data, targets = Variable(data), Variable(targets) 80 | 81 | # train the network 82 | optimizer.zero_grad() 83 | scores = net.forward(data) 84 | loss = F.nll_loss(scores, targets) 85 | 86 | # compute the accuracy 87 | pred = scores.data.max(1)[1] # get the index of the max log-probability 88 | correct += pred.eq(targets.data).cpu().sum() 89 | 90 | avg_loss.append(loss.data[0]) 91 | loss.backward() 92 | optimizer.step() 93 | 94 | if b_idx % args.log_schedule == 0: 95 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 96 | epoch, (b_idx+1) * len(data), len(train_loader.dataset), 97 | 100. 
def test():
    """Evaluate ``net`` on ``test_loader``, stopping after 100000 batches.

    Returns:
        Correct predictions divided by the full dataset size, as a fraction.
    """
    global best_accuracy
    net.eval()
    num_correct = 0
    for batch_no, (images, labels) in enumerate(test_loader):
        if batch_no >= 100000:
            break
        if args.cuda:
            images = images.cuda()
            labels = labels.cuda()
        images, labels = Variable(images, volatile=True), Variable(labels)

        # Forward pass, then pick the highest-scoring class per sample.
        scores = net.forward(images)
        predictions = scores.data.max(1)[1]
        num_correct += predictions.eq(labels.data).cpu().sum()

    num_samples = len(test_loader.dataset)
    print("predicted {} out of {}".format(num_correct, num_samples))
    val_accuracy = num_correct / float(num_samples) * 100.0
    print("accuracy = {:.2f}".format(val_accuracy))
    return val_accuracy/100.0
datasets.MNIST('data', train=True, download=True, 29 | transform=transforms.Compose([ 30 | #transforms.Scale(227), 31 | transforms.ToTensor(), 32 | transforms.Normalize((0.1307,), (0.3081,)) 33 | ])), 34 | batch_size=batch_size, shuffle=True, **kwargs) 35 | test_loader = torch.utils.data.DataLoader( 36 | datasets.MNIST('data', train=False, transform=transforms.Compose([ 37 | #transforms.Scale(227), 38 | transforms.ToTensor(), 39 | transforms.Normalize((0.1307,), (0.3081,)) 40 | ])), 41 | batch_size=batch_size, shuffle=True, **kwargs) 42 | 43 | 44 | optimizer = None 45 | def train(epoch): 46 | global optimizer 47 | if epoch == 1: 48 | optimizer = optim.Adam(net.parameters(), lr=lr) 49 | net.train() 50 | for batch_idx, (data, target) in enumerate(train_loader): 51 | if cuda: 52 | data, target = data.cuda(), target.cuda() 53 | data, target = Variable(data), Variable(target) 54 | optimizer.zero_grad() 55 | output = F.log_softmax(net(data)) 56 | #output = net(data) 57 | loss = F.nll_loss(output, target) 58 | loss.backward() 59 | optimizer.step() 60 | if batch_idx % log_interval == 0: 61 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( 62 | epoch, batch_idx * len(data), len(train_loader.dataset), 63 | 100. 
def test():
    """Evaluate ``net`` on ``test_loader`` with a LogSoftmax output layer.

    Any existing 'LogSoftmax'/'Softmax' layers are removed and a fresh
    ``nn.LogSoftmax`` is appended to ``net.classifier`` before evaluating.

    Returns:
        Accuracy as a fraction of the test dataset.
    """
    for layer_name in ('LogSoftmax', 'Softmax'):
        removeLayers(net, layer_name)
    net.classifier.add_module('softmax', nn.LogSoftmax())
    net.eval()

    loss_sum = 0
    num_correct = 0
    for inputs, labels in test_loader:
        if cuda:
            inputs, labels = inputs.cuda(), labels.cuda()
        inputs = Variable(inputs, volatile=True)
        labels = Variable(labels)
        log_probs = net(inputs)
        loss_sum += F.nll_loss(log_probs, labels).data[0]
        # Predicted class is the index of the largest output.
        predicted = log_probs.data.max(1)[1]
        num_correct += predicted.eq(labels.data).cpu().sum()

    # nll_loss already averages within a batch, so average over batches here.
    mean_loss = loss_sum / len(test_loader)
    dataset_size = len(test_loader.dataset)
    acc = float(num_correct) / dataset_size
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        mean_loss, num_correct, dataset_size,
        100. * num_correct / dataset_size))
    return acc
def test():
    """Measure the accuracy of the module-level ``net`` on ``test_loader``.

    Returns:
        Accuracy as a fraction; count and percentage are printed.
    """
    global best_accuracy
    net.eval()
    matched = 0
    for samples, labels in test_loader:
        if cuda:
            samples = samples.cuda()
            labels = labels.cuda()
        samples, labels = Variable(samples, volatile=True), Variable(labels)

        # Compare the arg-max of the scores against the labels.
        scores = net.forward(samples)
        chosen = scores.data.max(1)[1]
        matched += chosen.eq(labels.data).cpu().sum()

    total = len(test_loader.dataset)
    print("predicted {} out of {}".format(matched, total))
    val_accuracy = matched / float(total) * 100.0
    print("accuracy = {:.2f}".format(val_accuracy))
    return val_accuracy/100.0
def Reward(acc, params, baseline_acc, baseline_params):
    """Combine relative accuracy and compression into one scalar reward.

    Args:
        acc: accuracy of the candidate model.
        params: parameter count of the candidate model.
        baseline_acc: accuracy of the reference model.
        baseline_params: parameter count of the reference model.

    Returns:
        ``(acc / baseline_acc) * C * (2 - C)`` where ``C`` is the fraction of
        the baseline parameters that was removed.
    """
    compression = float(baseline_params - params) / baseline_params
    accuracy_ratio = acc / baseline_acc
    return accuracy_ratio * (compression * (2 - compression))
def rolloutActions(featureLayers, classifierLayers):
    """Sample a keep/remove action for every layer from the controller.

    Args:
        featureLayers: ``Layer`` wrappers of the feature extractor, in order.
        classifierLayers: ``Layer`` wrappers of the classifier, in order.

    Returns:
        The sampled actions, one per layer, feature layers first.
    """
    hn = Variable(torch.zeros(1, num_hidden))
    cn = Variable(torch.zeros(1, num_hidden))
    layers = list(featureLayers) + list(classifierLayers)
    # One row per layer, holding that layer's numeric description.
    inputs = Variable(torch.Tensor(len(layers), 1, num_input))
    for i, layer in enumerate(layers):
        inputs[i] = layer.toTorchTensor()
    # The controller result used to be bound to ``output`` while ``probs`` was
    # read unassigned, which raised on every call.
    # NOTE(review): assumes the controller returns per-layer action
    # probabilities; confirm against controllers.AutoregressiveLayer.
    probs = controller(inputs, (hn, cn))
    probs = probs.squeeze(1)
    actions = probs.multinomial()
    return actions
update_controller(actionSeqs, avgR): 191 | print('Reinforcing for epoch %d' % e) 192 | for actions in actionSeqs: 193 | print(actions.size()) 194 | actions.reinforce(avgR - b) 195 | opti.zero_grad() 196 | autograd.backward(actions, [None for _ in actions]) 197 | opti.step() 198 | 199 | epochs = 30 200 | N = 5 201 | for e in range(epochs): 202 | # Compute N rollouts 203 | (Rs, actionSeqs, models) = rollouts(N, model, e) 204 | # Compute average reward 205 | avgR = np.mean(Rs) 206 | print('Average reward: %f' % avgR) 207 | b = R_sum/float(e+1) 208 | R_sum = R_sum + avgR 209 | # Update controller 210 | update_controller(actionSeqs, avgR) 211 | resultsFile = open(modelSavePath + 'results.txt', "w") 212 | output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel) 213 | -------------------------------------------------------------------------------- /experiments/ar_run_param_clean.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.autograd import Variable 5 | import torch.autograd as autograd 6 | import numpy as np 7 | import torchvision 8 | import random 9 | #from visualize import make_dot 10 | from torch.nn.parameter import Parameter 11 | from Model import Model 12 | from utils import * 13 | from Layer import Layer 14 | import argparse 15 | import copy 16 | import signal 17 | import sys 18 | from controllers.AutoregressiveParam import * 19 | 20 | 21 | parser = argparse.ArgumentParser(description='Run layer only version') 22 | parser.add_argument('--dataset', type=str, default='cifar10', metavar='N', 23 | help='which dataset to test on') 24 | parser.add_argument('--cuda', action='store_true', default=True, 25 | help='enables CUDA training') 26 | args = parser.parse_args() 27 | 28 | 29 | datasetName = args.dataset 30 | useCuda = args.cuda 31 | 32 | datasetInputTensor = None 33 | baseline_acc = 0 34 | modelSavePath = None 35 | if 
datasetName is 'mnist': 36 | print('Using mnist') 37 | import datasets.mnist as dataset 38 | torch.cuda.set_device(3) 39 | datasetInputTensor = torch.Tensor(1, 1, 28, 28) 40 | #model = torch.load('./parent_models/mnist.pth') 41 | #teacherModel = torch.load('./parent_models/lenet_mnist.pth') 42 | teacherModel = torch.load('./parent_models/mnistvgg13.net') 43 | model = torch.load('stage1_mnist/995.net') 44 | baseline_acc = 0.995 45 | modelSavePath = './protos_mnist_param/' 46 | else: 47 | print('Using cifar') 48 | import datasets.cifar10 as dataset 49 | datasetInputTensor = torch.Tensor(1, 3, 32, 32) 50 | #teacherModel = torch.load('./parent_models/cifar10.pth') 51 | #teacherModel = torch.load('./parent_models/resnet18cifar.net') 52 | teacherModel = torch.load('./parent_models/cifar10_new.net') 53 | model = torch.load('stage1_vgg_cifar/9215.net') 54 | #model = torch.load('./results/c50lstm0_fc1.net') 55 | baseline_acc = 0.92 56 | modelSavePath = './protos_cifar_vgg_param/' 57 | 58 | dataset.args.cuda = useCuda 59 | parentSize = numParams(model) 60 | 61 | constrained = False 62 | 63 | 64 | def getEpsilon(iter, max_iter=15.0): 65 | return min(1, max(0, (1-iter/float(max_iter))**4)) #return 0 66 | 67 | def getConstrainedReward(R_a, R_c, cons, vars, iter): 68 | eps = getEpsilon(iter) 69 | modelSize = vars[0] 70 | modelSizeConstraint = cons[0] 71 | if modelSize > modelSizeConstraint: 72 | return (eps - 1) + eps * (R_a * R_c) 73 | else: 74 | return R_a * R_c 75 | 76 | def Reward(acc, params, baseline_acc, baseline_params, constrained=False, iter=50, cons=[], vars=[]): 77 | R_a = (acc/baseline_acc) 78 | C = (float(baseline_params - params))/baseline_params 79 | R_c = C*(2-C) 80 | if constrained: 81 | return getConstrainedReward(R_a, R_c, cons, vars, iter) 82 | return R_a * R_c 83 | 84 | # Parameters for LSTM controller 85 | num_layers = 2 86 | num_hidden = 30 87 | num_input = 5 88 | num_output = 11 89 | seq_len = 24 90 | 91 | #lookup = [0.1, .1, .2, .3, .4, .5, .6, .7, .8, 
.9, 1.0] 92 | lookup = [0.25, .5, .5, .5, .5, .5, .6, .7, .8, .9, 1.0] 93 | 94 | controller = LSTMAutoParams(num_input, num_output, num_hidden, num_layers, lookup) 95 | opti = optim.Adam(controller.parameters(), lr=0.01) 96 | 97 | previousModels = {} 98 | 99 | # Store statistics for each model 100 | accsPerModel = {} 101 | paramsPerModel = {} 102 | rewardsPerModel = {} 103 | numSavedModels = 0 104 | 105 | R_sum = 0 106 | b = 0 107 | 108 | LINEAR_THRESHOLD = 50000 109 | 110 | 111 | def applyActions(m, action, inp, lookup): 112 | if m.fixed: 113 | return resizeToFit(Layer(m), inp) 114 | # Get representation 115 | # Perform updates 116 | _, k, s, o, p = Layer(m).getRepresentation() 117 | k = max(int(k * lookup[action[1]]), 1) 118 | #s = max(int(s * lookup[action[2]]), 1) 119 | o = max(int(o * lookup[action[3]]), 10) 120 | #p = max(int(p * lookup[action[4]]), 1) 121 | p = int(p*lookup[action[4]]) 122 | in_channels = inp.size(1) 123 | cn = m.__class__.__name__ 124 | if cn == 'Linear': 125 | in_channels = inp.view(inp.size(0), -1).size(1) 126 | if in_channels > LINEAR_THRESHOLD or in_channels < 10: 127 | print('Linear layer too large') 128 | return None 129 | return resizeLayer(m, in_channels, o, kernel_size=k, stride=s, padding=p) 130 | 131 | a = 0 132 | inp = Variable(datasetInputTensor.clone()).cuda() 133 | 134 | def traverse(parent, m, m_name, actions): 135 | global a 136 | global inp 137 | classname = m.__class__.__name__ 138 | if classname in ['Sequential', 'BasicBlock', 'Bottleneck', 'ResNet', 'VGG', 'LeNet', 'Model']: 139 | child = createParentContainer(m) 140 | for i in m._modules.keys(): 141 | if i == 'shortcut': 142 | continue 143 | #print(i) 144 | res = traverse(child, m._modules[i], i, actions) 145 | if res == None: 146 | return None 147 | if classname not in ['ResNet', 'VGG', 'LeNet', 'Model']: 148 | parent.add_module(m_name, child) 149 | return True 150 | else: 151 | return child 152 | else: 153 | if classname == 'Linear': 154 | inp = inp.view(inp.size(0), 
-1) 155 | #print(inp.size(1)) 156 | if inp.size(1) > LINEAR_THRESHOLD: 157 | print('Linear layer too large') 158 | return None 159 | action = actions[a][:] 160 | m = applyActions(m, action, inp, lookup) 161 | if m == None: 162 | return None 163 | inp = m.cuda()(inp) 164 | parent.add_module(m_name, m) 165 | a += 1 166 | return True 167 | 168 | def fixLayers(m): 169 | layers = flattenModule(m) 170 | # Initialize 171 | for l in layers: 172 | l.fixed = False 173 | 174 | # Fix any layers you want here 175 | # ---- 176 | # Fix final linear layer 177 | layers[1].fixed = True 178 | layers[2].fixed = True 179 | layers[-1].fixed = True 180 | layers[-2].fixed = True 181 | # ---- 182 | 183 | 184 | ''' 185 | Build child model 186 | ''' 187 | def build_child_model(m, actions): 188 | # We eliminate a layer if any one of the coefficients are = 0 189 | global inp 190 | global a 191 | a = 0 192 | actions = np.reshape(actions, (-1, num_input)) 193 | 194 | inp = Variable(datasetInputTensor.clone()).cuda() 195 | fixLayers(m) 196 | 197 | # Build whole model 198 | newModel = traverse(None, m, None, actions) 199 | if newModel == None: 200 | print('newModel is none for some reason') 201 | return None 202 | resetModel(newModel) 203 | # Check if any compression has been achieved 204 | if numParams(newModel) > parentSize: 205 | print('newModel is larger than parent') 206 | return None 207 | 208 | return newModel 209 | 210 | 211 | def rolloutActions(layers): 212 | global controller 213 | hn = [Variable(torch.zeros(1, num_hidden))] * num_layers 214 | cn = [Variable(torch.zeros(1, num_hidden))] * num_layers 215 | input = Variable(torch.Tensor(len(layers), 1, num_input)) 216 | for i in range(len(layers)): 217 | input[i] = Layer(layers[i]).toTorchTensor(skipSupport=False) 218 | output = controller(input, (hn, cn)) 219 | return output 220 | 221 | 222 | def rollout(model_, e): 223 | global b 224 | global R_sum 225 | layers = layersFromModule(model_) 226 | actions = rolloutActions(layers) 227 | 
newModel = build_child_model(model_, [a.data.numpy()[0] for a in actions]) 228 | hashcode = hash(str(newModel)) if newModel != None else 0 229 | if hashcode in previousModels: 230 | R = previousModels[hashcode] 231 | elif newModel is None: 232 | R = -1 233 | else: 234 | print(newModel) 235 | acc = trainTeacherStudent(teacherModel, newModel, dataset) 236 | R = Reward(acc, numParams(newModel), baseline_acc, parentSize) 237 | previousModels[hashcode] = R 238 | rewardsPerModel[i] = R 239 | accsPerModel[i] = acc 240 | paramsPerModel[i] = numParams(newModel) 241 | torch.save(newModel, modelSavePath + '%f.net' % e) 242 | print('Val accuracy: %f' % acc) 243 | print('Compression: %f' % (1.0 - (float(numParams(newModel))/parentSize))) 244 | print('Reward achieved %f' % R) 245 | #print('Reward after baseline %f' % (R-b)) 246 | # Update reward and baseline after each rollout 247 | return (R, actions, newModel) 248 | 249 | def rollout_batch(model, N, e): 250 | global b 251 | global R_sum 252 | newModels = [] 253 | idxs = [] 254 | Rs = [0]*N 255 | actionSeqs = [] 256 | studentModels = [] 257 | for i in range(N): 258 | model_ = copy.deepcopy(model) 259 | layers = layersFromModule(model_) 260 | actions = rolloutActions(layers) 261 | actionSeqs.append(actions) 262 | newModel = build_child_model(model_, [a.data.numpy()[0] for a in actions]) 263 | newModels.append(newModel) 264 | hashcode = hash(str(newModel)) if newModel else 0 265 | if hashcode in previousModels and constrained == False: 266 | Rs[i] = previousModels[hashcode] 267 | elif newModel is None: 268 | Rs[i] = -1 269 | else: 270 | studentModels.append(newModel) 271 | idxs.append(i) 272 | accs = trainTeacherStudentParallel(model, studentModels, dataset, epochs=5) 273 | print(accs) 274 | R = [Reward(accs[i], numParams(studentModels[i]), baseline_acc, parentSize) for i in range(len(accs))] 275 | for i in range(len(accs)): 276 | print(studentModels[i]) 277 | torch.save(studentModels[i], modelSavePath + '%f.net' 
%(e+float(i)/10.0)) 278 | comp = 1 - (numParams(studentModels[i])/float(parentSize)) 279 | print('Compression ' + str(comp)) 280 | print('Val accuracy ' + str(accs[i])) 281 | Rs[idxs[i]] = R[i] 282 | for i in range(len(Rs)): 283 | print('Reward achieved ' + str(Rs[i])) 284 | return (Rs, actionSeqs, newModels) 285 | ''' 286 | def rollouts(N, model, e): 287 | Rs = [] 288 | actionSeqs = [] 289 | models = [] 290 | for i in range(N): 291 | R, actions, newModel = rollout(copy.deepcopy(model), e + float(i)/10) 292 | Rs.append(R); actionSeqs.append(actions); models.append(newModel) 293 | return (Rs, actionSeqs, models)''' 294 | 295 | def rollouts(N, model, e): 296 | Rs = [] 297 | actionSeqs = [] 298 | models = [] 299 | (Rs, actionSeqs, models) = rollout_batch(copy.deepcopy(model), N, e) 300 | return (Rs, actionSeqs, models) 301 | 302 | def update_controller(actionSeqs, avgR): 303 | print('Reinforcing for epoch %d' % e) 304 | for actions in actionSeqs: 305 | for action in actions: 306 | action.reinforce(avgR - b) 307 | opti.zero_grad() 308 | autograd.backward(actions, [None for _ in actions]) 309 | opti.step() 310 | 311 | epochs = 50 312 | N = 3 313 | for e in range(epochs): 314 | # Compute N rollouts 315 | (Rs, actionSeqs, models) = rollouts(N, model, e) 316 | # Compute average reward 317 | avgR = np.mean(Rs) 318 | print('Average reward: %f' % avgR) 319 | b = R_sum/float(e+1) 320 | R_sum = R_sum + avgR 321 | # Update controller 322 | update_controller(actionSeqs, avgR) 323 | resultsFile = open(modelSavePath + 'results.txt', "w") 324 | output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel) 325 | 326 | -------------------------------------------------------------------------------- /experiments/bd_run_layer_clean.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.autograd import Variable 5 | import torch.autograd as autograd 6 | import 
numpy as np 7 | import torchvision 8 | import random 9 | #from visualize import make_dot 10 | from torch.nn.parameter import Parameter 11 | from Model import Model 12 | from utils import * 13 | from Layer import Layer 14 | import argparse 15 | import copy 16 | import signal 17 | import sys 18 | 19 | 20 | parser = argparse.ArgumentParser(description='Run layer only version') 21 | parser.add_argument('--dataset', type=str, default='mnist', metavar='N', 22 | help='which dataset to test on') 23 | parser.add_argument('--cuda', action='store_true', default=True, 24 | help='enables CUDA training') 25 | args = parser.parse_args() 26 | 27 | 28 | datasetName = args.dataset 29 | useCuda = args.cuda 30 | 31 | datasetInputTensor = None 32 | baseline_acc = 0 33 | modelSavePath = None 34 | if datasetName is 'mnist': 35 | print('Using mnist') 36 | import datasets.mnist as dataset 37 | datasetInputTensor = torch.Tensor(1, 1, 28, 28) 38 | #model = torch.load('./parent_models/mnist.pth') 39 | model = torch.load('./parent_models/mnist_new2.net') 40 | baseline_acc = 0.994 41 | modelSavePath = './protos_mnist/' 42 | else: 43 | print('Using cifar') 44 | import datasets.cifar10 as dataset 45 | datasetInputTensor = torch.Tensor(1, 3, 32, 32) 46 | model = torch.load('./parent_models/cifar10_new.net') 47 | baseline_acc = 0.92 48 | modelSavePath = './protos_cifar/' 49 | 50 | dataset.args.cuda = useCuda 51 | parentSize = numParams(model) 52 | 53 | 54 | def Reward(acc, params, baseline_acc, baseline_params): 55 | #R_acc = (baseline_loss - loss)^3 # determine what is chance as well 56 | R_acc = acc/baseline_acc 57 | C = (float(baseline_params - params))/baseline_params 58 | R_par = C*(2-C) 59 | #print('R_acc %f, R_par %f' % (R_acc, R_par)) 60 | return R_acc * R_par 61 | # return R_acc*(R_par**2 + 0.3) 62 | 63 | # Parameters for LSTM controller 64 | num_layers = 2 65 | num_hidden = 30 66 | num_input = 5 67 | num_output = 2 68 | seq_len = 1 69 | 70 | 71 | controller = nn.LSTM(num_input, 
num_hidden, num_layers, bidirectional=True) 72 | opti = optim.Adam(controller.parameters(), lr=0.003) 73 | Wt_softmax = nn.Linear(num_hidden*2, num_output) 74 | softmax = nn.Softmax() 75 | 76 | # Store statistics for each model 77 | accsPerModel = {} 78 | paramsPerModel = {} 79 | rewardsPerModel = {} 80 | numSavedModels = 0 81 | 82 | R_sum = 0 83 | b = 0 84 | 85 | LINEAR_THRESHOLD = 30000 86 | ''' 87 | Build child model 88 | ''' 89 | def build_child_model(featureLayers, classifierLayers, actions): 90 | actions[0] = 1 91 | actions[len(featureLayers)] = 1 92 | featureActions = actions[:len(featureLayers)] 93 | classifierActions = actions[len(featureLayers):] 94 | classifierActions[-1] = 1 95 | 96 | featureLayers = [featureLayers[l] for l in range(len(featureLayers)) if featureActions[l]] 97 | classifierLayers = [classifierLayers[l] for l in range(len(classifierLayers)) if classifierActions[l]] 98 | 99 | features = nn.Sequential() 100 | classifier = nn.Sequential() 101 | 102 | inp = Variable(datasetInputTensor.clone().cuda()) 103 | # Add first layer always to preserve input channels 104 | features.add_module('0', featureLayers[0]._layer) 105 | inp = featureLayers[0]._layer.cuda()(inp) 106 | # Build feature sequence 107 | for i in range(1, len(featureLayers)): 108 | layer = resizeToFit(featureLayers[i], inp) 109 | features.add_module(str(i), layer) 110 | if featureLayers[i].type is 1: # Conv2d layer, add ReLU and BN 111 | features.add_module(str(i)+'b', nn.BatchNorm2d(layer.out_channels)) 112 | features.add_module(str(i)+'r', nn.ReLU(inplace=False)) 113 | inp = layer.cuda()(inp) 114 | 115 | numInputsToFC = inp.view(inp.size(0), -1).size(1) 116 | inp = inp.view(inp.size(0), -1) 117 | # Check if size is out of range 118 | if numInputsToFC < 10 or numInputsToFC > LINEAR_THRESHOLD: 119 | return None 120 | 121 | # Build classifier sequence 122 | for i in range(len(classifierLayers)): 123 | layer = resizeToFit(classifierLayers[i], inp) 124 | #classifier.add_module('cd%d' % 
i, nn.Dropout()) 125 | classifier.add_module('c%d' % i, layer) 126 | if i != (len(classifierLayers)-1): 127 | classifier.add_module('cr%d' % i, nn.ReLU(inplace=False)) 128 | inp = layer.cuda()(inp) 129 | 130 | # Build whole model 131 | newModel = Model(features, classifier) 132 | resetModel(newModel) 133 | # Check if any compression has been achieved 134 | if numParams(newModel) > parentSize: 135 | return None 136 | 137 | return newModel 138 | 139 | 140 | def rolloutActions(featureLayers, classifierLayers): 141 | global controller 142 | hn = Variable(torch.zeros(num_layers * 2, 1, num_hidden)) 143 | cn = Variable(torch.zeros(num_layers * 2 , 1, num_hidden)) 144 | input = Variable(torch.Tensor(len(featureLayers) + len(classifierLayers), 1, num_input)) 145 | for i in range(len(featureLayers)): 146 | input[i] = featureLayers[i].toTorchTensor() 147 | for i in range(len(classifierLayers)): 148 | input[i + len(featureLayers)] = classifierLayers[i].toTorchTensor() 149 | output, (hn, cn) = controller(input, (hn, cn)) 150 | probs = softmax(Wt_softmax(output.squeeze(1))) 151 | actions = probs.multinomial() 152 | return actions 153 | 154 | 155 | def rollout(model_, i): 156 | global b 157 | global R_sum 158 | featureLayers = [Layer(l) for l in model_.features._modules.values()] 159 | featureLayers = list(filter(lambda x: x.type in [1, 2], featureLayers)) 160 | classifierLayers = [Layer(l) for l in model_.classifier._modules.values()] 161 | classifierLayers = list(filter(lambda x: x.type in [5], classifierLayers)) 162 | actions = rolloutActions(featureLayers, classifierLayers) 163 | newModel = build_child_model(featureLayers, classifierLayers, [a.data.numpy()[0] for a in actions]) 164 | if newModel is None: 165 | R = -1 166 | else: 167 | print(newModel) 168 | acc = trainTeacherStudent(model, newModel, dataset, epochs=5) 169 | R = Reward(acc, numParams(newModel), baseline_acc, parentSize) 170 | #C = 1 - float(numParams(newModel))/parentSize 171 | #R = -1 if acc < 0.88 or C < 0.5 
else R 172 | rewardsPerModel[i] = R 173 | accsPerModel[i] = acc 174 | paramsPerModel[i] = numParams(newModel) 175 | torch.save(newModel, modelSavePath + '%f.net' % i) 176 | print('Val accuracy: %f' % acc) 177 | print('Compression: %f' % (1.0 - (float(numParams(newModel))/parentSize))) 178 | print('Reward achieved %f' % R) 179 | #print('Reward after baseline %f' % (R-b)) 180 | # Update reward and baseline after each rollout 181 | return (R, actions, newModel) 182 | 183 | def rollouts(N, model, e): 184 | Rs = [] 185 | actionSeqs = [] 186 | models = [] 187 | for i in range(N): 188 | R, actions, newModel = rollout(copy.deepcopy(model), e + float(i)/10) 189 | Rs.append(R); actionSeqs.append(actions); models.append(newModel) 190 | return (Rs, actionSeqs, models) 191 | 192 | def update_controller(actionSeqs, avgR): 193 | print('Reinforcing for epoch %d' % e) 194 | for actions in actionSeqs: 195 | print(actions.size()) 196 | actions.reinforce(avgR - b) 197 | opti.zero_grad() 198 | autograd.backward(actions, [None for _ in actions]) 199 | opti.step() 200 | 201 | epochs = 50 202 | N = 5 203 | for e in range(epochs): 204 | # Compute N rollouts 205 | (Rs, actionSeqs, models) = rollouts(N, model, e) 206 | # Compute average reward 207 | avgR = np.mean(Rs) 208 | print('Average reward: %f' % avgR) 209 | b = R_sum/float(e+1) 210 | R_sum = R_sum + avgR 211 | # Update controller 212 | update_controller(actionSeqs, avgR) 213 | resultsFile = open(modelSavePath + 'results.txt', "w") 214 | output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel) 215 | -------------------------------------------------------------------------------- /experiments/ed_run_layer_general.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.autograd import Variable 5 | import torch.autograd as autograd 6 | import numpy as np 7 | import torchvision 8 | import random 9 | from 
visualize import make_dot 10 | from torch.nn.parameter import Parameter 11 | from Model import Model 12 | from utils import * 13 | from Layer import Layer 14 | import argparse 15 | import copy 16 | from controllers.EncoderDecoder import * 17 | 18 | 19 | 20 | import signal 21 | import sys 22 | 23 | parser = argparse.ArgumentParser(description='Run layer only version') 24 | parser.add_argument('--dataset', type=str, default='mnist', metavar='N', 25 | help='which dataset to test on') 26 | parser.add_argument('--cuda', action='store_true', default=True, 27 | help='enables CUDA training') 28 | args = parser.parse_args() 29 | 30 | 31 | datasetName = args.dataset 32 | useCuda = args.cuda 33 | 34 | # Define dataset variables 35 | # We need: 36 | # 1. size of input data 37 | # 2. pre-trained Parent model 38 | # 3. baseline accuracy of parent model, hardcode since we don't want to run everytime we load 39 | datasetInputTensor = None 40 | baseline_acc = 0 41 | modelSavePath = None 42 | if datasetName is 'mnist': 43 | print('Using mnist') 44 | import datasets.mnist as dataset 45 | datasetInputTensor = torch.Tensor(1, 1, 28, 28) 46 | model = torch.load('./parent_models/mnist.pth') 47 | baseline_acc = 0.994 48 | modelSavePath = './protos_mnist/' 49 | else: 50 | print('Using cifar') 51 | import datasets.cifar10 as dataset 52 | datasetInputTensor = torch.Tensor(1, 3, 32, 32) 53 | model = torch.load('./parent_models/cifar10.pth') 54 | baseline_acc = 0.88 55 | modelSavePath = './protos_cifar/' 56 | 57 | dataset.args.cuda = useCuda 58 | parentSize = numParams(model) 59 | 60 | # Define Observation space 61 | 62 | 63 | # Define Action space 64 | 65 | 66 | # Define Loss 67 | def Reward(acc, params, baseline_acc, baseline_params): 68 | #R_acc = (baseline_loss - loss)^3 # determine what is chance as well 69 | R_acc = (acc/baseline_acc) 70 | C = (float(baseline_params - params))/baseline_params 71 | R_par = C*(2-C) 72 | print('R_acc %f, R_par %f' % (R_acc, R_par)) 73 | return R_acc * R_par 
# return R_acc*(R_par**2 + 0.3)


layers = [Layer(l) for l in model.features._modules.values()]
layers = list(filter(lambda x: x.type in [1, 2], layers))


# Parameters for LSTM
num_layers = 2
num_hidden = 30
num_input = 5
num_output = 2
seq_len = len(layers)

# LSTM definition
#lstm = nn.LSTM(num_input, num_hidden, num_layers)
#Wt_softmax = nn.Linear(num_hidden, num_output)
#softmax = nn.Softmax()
lstm = EncoderDecoderLSTM(num_input, num_output, num_hidden, seq_len)

# Optimizer definition
opti = optim.Adam(lstm.parameters(), lr=0.0006)

epochs = 100
R_sum = 0
lookup = [0, 1]

print('About to setup models')

# ReLU fix
# layers = [l if l.type is not 'ReLU' else Layer(nn.ReLU(inplace=False)) for l in layers]

# Store statistics for each model
accsPerModel = {}
paramsPerModel = {}
prevMaxReward = 0
numSavedModels = 0

def signal_handler(signum, frame):
    """Write the statistics gathered so far and exit on SIGINT.

    The first parameter is named ``signum`` so that it does not shadow the
    ``signal`` module inside the handler; handlers are invoked with
    positional arguments.
    """
    print('Ending experiment')
    with open(modelSavePath + 'results.txt', "w") as resultsFile:
        output_results(resultsFile, accsPerModel, paramsPerModel)
    sys.exit(0)
signal.signal(signal.SIGINT, signal_handler)



for e in range(epochs):
    # Create layers
    model_ = copy.deepcopy(model)
    layers = [Layer(l) for l in model_.features._modules.values()]
    layers = list(filter(lambda x: x.type in [1, 2], layers))
    classifierLayers = [Layer(l) for l in model_.classifier._modules.values()]
    # print() call instead of the Python 2 print statement; the rest of
    # the script already uses the function form.
    print(len(layers))

    # One encoded row per layer is fed to the controller.
    layerReps = torch.Tensor(len(layers), 1, num_input)
    for i in range(len(layers)):
        layerReps[i] = layers[i].toTorchTensor().squeeze(0)
    layerReps = Variable(layerReps)
    outputs = lstm.forward(layerReps)
    actions = [o.multinomial() for o in outputs]
    intActions = [a.data.numpy()[0] for a in actions]
    # Generate new model
    features = nn.Sequential()
    # Dummy activation pushed through the kept layers for shape inference.
    inp = Variable(datasetInputTensor)
    intActions[0] = 1 # Save first layer so that input size is consistent
    for i in [i for i, x in enumerate(intActions) if x == 1]:
        if 'weight' in layers[i]._layer._parameters:
            in_channels = inp.size()[1]
            _, kernel_size, stride, out_channels, padding = layers[i].getRepresentation()
            layers[i] = Layer(resizeLayer(layers[i]._layer, in_channels, out_channels, kernel_size, stride, padding))
        features.add_module(str(i), layers[i]._layer)
        if layers[i].type == 1:
            features.add_module(str(i)+'b', nn.BatchNorm2d(layers[i]._layer.out_channels))
            features.add_module(str(i)+'r', nn.ReLU(inplace=False))
        inp = layers[i]._layer.forward(inp)
        # Stop adding layers once the spatial map has fewer than 5 cells.
        if inp.size(2) * inp.size(3) < 5:
            break
    # Check a few things to disqualify the model
    # 1. Representation size
    fc_in_channels = inp.size(1)*inp.size(2)*inp.size(3)
    # Formatted so that Python 2 and Python 3 print the same text.
    print('Current fc nodes %d' % fc_in_channels)
    classifier = nn.Sequential()

    if fc_in_channels < 10 or fc_in_channels > 100000:
        R = -1
    else:
        # Set fc in channels
        modules = classifierLayers
        print('Setting classifier layers')
        modules[1] = Layer(resizeLayer(modules[1]._layer, fc_in_channels, modules[1]._layer.out_features))
        print('Resized fc layer')
        for j in range(len(modules)):
            classifier.add_module('c%d' % j, modules[j]._layer)
        # Attach classifier
        # Run model and determine accuracy
        newModel = Model(features, classifier)
        newModel = resetModel(newModel)
        print(newModel)
        if numParams(newModel) > parentSize:
            R = -1
        else:
            newModel = newModel.cuda() if dataset.args.cuda else newModel
            acc = train(dataset, newModel)
            #viz = make_dot(output)
            #viz.render('./vis/%d' % e)
            accsPerModel[e] = acc
            paramsPerModel[e] = numParams(newModel)
            R = Reward(acc, numParams(newModel), baseline_acc, numParams(model))
            print('Val accuracy: %f' % acc)
            print('Compression: %f' % (1.0 - (float(numParams(newModel))/numParams(model))))
            print('Reward achieved %f' % R)
            if R >= prevMaxReward and numSavedModels < 25:
                torch.save(newModel, modelSavePath + '%d.net' % e)
                numSavedModels += 1
                prevMaxReward = max(prevMaxReward, R)
    # Baseline is the running reward sum before this epoch divided by e+1.
    b = R_sum/float(e+1)
    R_sum = R_sum + R
    actions.reverse()
    for k in range(len(actions)):
        actions[k].reinforce(R-b)
    print('Reinforcing for epoch %d' % e)
    opti.zero_grad()
    autograd.backward(actions, [None for _ in actions])
    opti.step()

# Print statistics
with open(modelSavePath + 'results.txt', "w") as resultsFile:
    output_results(resultsFile, accsPerModel, paramsPerModel)

# --------------------------------------------------------------------------------
# /experiments/resnet_actor_critic_layer.py:
# --------------------------------------------------------------------------------
import torch
from torch import nn
from torch import optim
from torch.autograd import Variable
import torch.autograd as autograd
import numpy as np
import torchvision
import random
#from visualize import make_dot
from torch.nn.parameter import Parameter
from Model import Model
from utils import *
from Layer import Layer
import argparse
import copy
import signal
import sys
from model import resnet
from controllers.ActorCriticLSTM import LSTM

sys.path.insert(0, '/home/anubhava/ssd.pytorch/')

constrained = False#True

parser = argparse.ArgumentParser(description='Run layer only version')
parser.add_argument('--dataset', type=str, default='cifar10', metavar='N',
                    help='which dataset to test on')
parser.add_argument('--cuda', action='store_true', default=True,
                    help='enables CUDA training')
args = parser.parse_args()

33 | datasetName = args.dataset 34 | useCuda = args.cuda 35 | loadController = False 36 | skipSupport = False 37 | 38 | datasetInputTensor = None 39 | baseline_acc = 0 40 | modelSavePath = None 41 | controllerSavePath = None 42 | if datasetName is 'mnist': 43 | print('Using mnist') 44 | import datasets.mnist as dataset 45 | datasetInputTensor = torch.Tensor(1, 1, 28, 28) 46 | model = torch.load('./parent_models/mnistvgg13.net') 47 | #model = torch.load('./parent_models/lenet_mnist.pth') 48 | baseline_acc = 0.9955 49 | #baseline_acc = 0.983 50 | modelSavePath = './protos_mnist/' 51 | controllerSavePath = './controllers_mnist/lstm_lenet.net' 52 | controllerLoadPath = './controllers_mnist/lstm_lenet.net' 53 | elif datasetName is 'cifar100': 54 | print('Using cifar100') 55 | import datasets.cifar100 as dataset 56 | baseline_acc = 0.67 57 | datasetInputTensor = torch.Tensor(1, 3, 32, 32) 58 | model = torch.load('./parent_models/vgg19cifar100.net') 59 | modelSavePath = './protos_cifar100/' 60 | controllerSavePath = './controllers_cifar100/lstm_resnet18.net' 61 | controllerLoadPath = './controllers_cifar100/lstm_resnet18.net' 62 | elif datasetName is 'caltech256': 63 | print('Using caltech256') 64 | import datasets.caltech256 as dataset 65 | baseline_acc = 0.79 66 | datasetInputTensor = torch.Tensor(1, 3, 224, 224) 67 | model = torch.load('./parent_models/caltech256_resnet18.net') 68 | modelSavePath = './protos_caltech256/' 69 | controllerSavePath = './controllers_caltech256/lstm_resnet18.net' 70 | controllerLoadPath = './controllers_caltech256/lstm_resnet18.net' 71 | elif datasetName is 'imagenet': 72 | print('Using imagenet') 73 | import datasets.imagenet as dataset 74 | torch.cuda.set_device(2) 75 | datasetInputTensor = torch.Tensor(1, 3, 224, 224) 76 | model = torch.load('./parent_models/resnet34_imagenet.net') 77 | modelSavePath = './protos_imagenet' 78 | controllerSavePath = './controllers_imagenet/lstm_resnet34.net' 79 | controllerLoadPath = 
'./controllers_imagenet/lstm_resnet34.net' 80 | else: 81 | print('Using cifar') 82 | torch.cuda.set_device(0) 83 | import datasets.cifar10_old as dataset 84 | datasetInputTensor = torch.Tensor(1, 3, 32, 32) 85 | #model = torch.load('./parent_models/cifar10.pth') 86 | #baseline_acc = 0.88 87 | model = torch.load('./parent_models/resnet18cifar.net') 88 | #model = torch.load('./parent_models/resnet34cifar.net') 89 | #model = torch.load('./parent_models/cifar_vgg19.net') 90 | baseline_acc = 0.9205 91 | modelSavePath = './protos_cifar/' 92 | #controllerSavePath = './controllers_cifar/lstm_vgg19.net' 93 | #controllerSavePath = './controllers_cifar/lstm_resnet34.net' 94 | controllerSavePath = './controllers_cifar/lstm_resnet18.net' 95 | controllerLoadPath = './controllers_cifar/lstm_resnet34.net' 96 | 97 | dataset.args.cuda = useCuda 98 | parentSize = numParams(model) 99 | 100 | def getEpsilon(iter, max_iter=15.0): 101 | return min(1, max(0, (1-iter/float(max_iter))**4)) #return 0 102 | 103 | def getConstrainedReward(R_a, R_c, cons, vars, iter): 104 | eps = getEpsilon(iter) 105 | modelSize = vars[0] 106 | modelSizeConstraint = cons[0] 107 | if modelSize > modelSizeConstraint: 108 | return (eps - 1) + eps * (R_a * R_c) 109 | else: 110 | return R_a * R_c 111 | 112 | def Reward(acc, params, baseline_acc, baseline_params, constrained=False, iter=50, cons=[], vars=[]): 113 | R_a = (acc/baseline_acc) #if acc > 0.92 else -1 114 | C = (float(baseline_params - params))/baseline_params 115 | R_c = C*(2-C) 116 | if constrained: 117 | return getConstrainedReward(R_a, R_c, cons, vars, iter) 118 | return R_a * R_c 119 | 120 | # Parameters for LSTM controller 121 | num_layers = 2 122 | num_hidden = 30 123 | num_input = 7 if skipSupport else 5 124 | num_output = 2 125 | seq_len = 1 126 | 127 | controller = LSTM(num_input, num_output, num_hidden, num_layers, bidirectional=True) 128 | if loadController: 129 | controller = torch.load(controllerLoadPath) 130 | opti = 
def fixLayers(m):
    """Set a boolean ``fixed`` attribute on every layer of ``m``.

    The layers come from ``flattenModule(m)``.  A layer ends up with
    ``fixed = True`` when it is the last layer, a ``Conv2d`` whose stride
    differs from ``(1, 1)``, a ``Linear`` layer or an ``AvgPool2d`` layer.
    Every other layer gets ``fixed = False``.
    """
    flat = flattenModule(m)
    for layer in flat:
        kind = layer.__class__.__name__
        strided_conv = (
            kind == 'Conv2d'
            and hasattr(layer, 'stride')
            and layer.stride != (1, 1)
        )
        # Strided convolutions, linear layers and average pooling are pinned.
        layer.fixed = bool(strided_conv or kind in ('Linear', 'AvgPool2d'))
    # The final layer is always pinned, whatever its type.
    flat[-1].fixed = True
def rolloutActions(layers):
    """Run the controller over ``layers`` and return its outputs.

    Each layer is encoded with ``Layer(...).toTorchTensor`` into one row of
    a ``(len(layers), 1, num_input)`` input.  The hidden and cell states
    start as zeros of shape ``(num_layers * 2, 1, num_hidden)``.

    Returns the ``(actions, values)`` pair produced by the controller.
    """
    state_shape = (num_layers * 2, 1, num_hidden)
    hidden = Variable(torch.zeros(*state_shape))
    cell = Variable(torch.zeros(*state_shape))

    encoded = Variable(torch.Tensor(len(layers), 1, num_input))
    for pos, layer in enumerate(layers):
        encoded[pos] = Layer(layer).toTorchTensor(skipSupport=skipSupport)

    actions, values = controller(encoded, (hidden, cell))
    return actions, values
def rollout_batch(model, N, e):
    """Sample ``N`` child models, train the valid ones together, score them.

    model -- the teacher network; a deep copy is made for every sample
    N     -- number of rollouts in the batch
    e     -- index of the current epoch; used in the checkpoint file name
             and as the annealing step of the constrained reward

    Returns ``(Rs, actionSeqs, valueSeqs, newModels)``: one reward per
    rollout, the controller actions and values of every rollout, and the
    child models that were actually trained.

    A rollout whose child could not be built gets reward -1.  A rollout
    whose child is found in ``previousModels`` reuses the stored reward
    (only while ``constrained`` is off).
    NOTE(review): nothing in this function adds entries to
    ``previousModels`` -- confirm whether the cache should be filled here.
    """
    newModels = []
    idxs = []
    Rs = [0] * N
    actionSeqs = []
    valueSeqs = []
    for i in range(N):
        model_ = copy.deepcopy(model)
        layers = layersFromModule(model_)
        actions, values = rolloutActions(layers)
        valueSeqs.append(values)
        actionSeqs.append(actions)
        newModel = build_child_model(model_, [act.data.numpy()[0] for act in actions])
        # Test against None explicitly: the truth value of a container
        # module depends on its length, not on whether it exists.
        hashcode = hash(str(newModel)) if newModel is not None else 0
        if hashcode in previousModels and not constrained:
            Rs[i] = previousModels[hashcode]
        elif newModel is None:
            Rs[i] = -1
        else:
            print(newModel)
            torch.save(newModel, modelSavePath + '%f_%f.net' % (e, i))
            newModels.append(newModel)
            idxs.append(i)

    # Skip the trainer entirely when no child survived construction.
    accs = trainTeacherStudentParallel(model, newModels, dataset, epochs=5) if newModels else []
    for acc in accs:
        print('Val accuracy: %f' % acc)
    for child in newModels:
        print('Compression: %f' % (1.0 - (float(numParams(child)) / parentSize)))

    # The annealing step is the epoch ``e``, not the position of the child
    # inside the batch.
    R = [
        Reward(
            accs[j], numParams(newModels[j]), baseline_acc, parentSize,
            iter=int(e), constrained=constrained,
            vars=[numParams(newModels[j])], cons=[1700000],
        )
        for j in range(len(accs))
    ]
    # Scatter the rewards of trained children back to their rollout slots.
    for pos, slot in enumerate(idxs):
        Rs[slot] = R[pos]
    for reward in Rs:
        print('Reward achieved %f' % reward)
    return (Rs, actionSeqs, valueSeqs, newModels)
datasetInputTensor = None
baseline_acc = 0
modelSavePath = None
controllerSavePath = None
# Select the dataset module, teacher checkpoint and output paths.
# The name is compared by value (==): it comes from the command line, and
# an identity test (`is`) against a literal is not guaranteed to succeed
# even when the text matches, which silently selected the CIFAR-10 branch.
if datasetName == 'mnist':
    print('Using mnist')
    import datasets.mnist as dataset
    torch.cuda.set_device(2)
    datasetInputTensor = torch.Tensor(1, 1, 28, 28)
    model = torch.load('./parent_models/mnistvgg13.net')
    #model = torch.load('./parent_models/lenet_mnist.pth')
    baseline_acc = 0.994
    #baseline_acc = 0.983
    modelSavePath = './protos_mnist/'
    controllerSavePath = './controllers_mnist/lstm_lenet.net'
    controllerLoadPath = './controllers_mnist/lstm_lenet.net'
elif datasetName == 'caltech256':
    print('Using caltech256')
    import datasets.caltech256 as dataset
    baseline_acc = 0.79
    datasetInputTensor = torch.Tensor(1, 3, 224, 224)
    model = torch.load('./parent_models/caltech256_resnet18.net')
    modelSavePath = './protos_caltech256/'
    controllerSavePath = './controllers_caltech256/lstm_resnet18.net'
    controllerLoadPath = './controllers_caltech256/lstm_resnet18.net'
elif datasetName == 'cifar100':
    print('Using cifar100')
    torch.cuda.set_device(1)
    import datasets.cifar100 as dataset
    baseline_acc = 0.67
    datasetInputTensor = torch.Tensor(1, 3, 32, 32)
    model = torch.load('./parent_models/resnet34_cifar100.net')
    modelSavePath = './protos_cifar100/'
    controllerSavePath = './controllers_cifar100/lstm_resnet34.net'
    controllerLoadPath = './controllers_cifar100/lstm_resnet34.net'
else:
    # Any other name (including the default 'cifar10') uses CIFAR-10.
    torch.cuda.set_device(2)
    print('Using cifar')
    import datasets.cifar10 as dataset
    datasetInputTensor = torch.Tensor(1, 3, 32, 32)
    #model = torch.load('./parent_models/cifar10.pth')
    #baseline_acc = 0.88
    #model = torch.load('./parent_models/resnet18cifar.net')
    #model = torch.load('./parent_models/resnet34cifar.net')
    #model = torch.load('./parent_models/cifar_vgg19.net')
    model = torch.load('./parent_models/vgg11cifar.net')
    baseline_acc = 0.90
    modelSavePath = './protos_cifar/'
    controllerSavePath = './controllers_cifar/lstm_vgg11.net'
    #controllerSavePath = './controllers_cifar/lstm_resnet18.net'
    controllerLoadPath = './controllers_cifar/lstm_vggcifar.net'
def traverse(parent, m, m_name, actions):
    """Recursively copy the kept layers of ``m`` into a new container tree.

    parent  -- container that receives the copy of ``m`` (None at the root)
    m       -- module of the teacher that is being visited
    m_name  -- name under which ``m`` is registered in its own parent
    actions -- per-layer keep decisions, read through the global cursor ``a``

    Returns the new root container when ``m`` is one of the root model
    classes, True for any other module that was processed, and None when
    the build has to be abandoned (flattened input of a Linear layer larger
    than LINEAR_THRESHOLD).

    The globals ``a`` (action cursor) and ``inp`` (running activation used
    to size each kept layer) are updated as a side effect.
    NOTE(review): nesting was reconstructed from the data flow of a
    whitespace-mangled source -- confirm against the original file.
    """
    global a
    global inp
    root_kinds = ['ResNet', 'VGG', 'LeNet', 'mnist_model', 'Model']
    container_kinds = ['Sequential', 'BasicBlock', 'Bottleneck'] + root_kinds
    kind = m.__class__.__name__

    if kind not in container_kinds:
        # Leaf layer.
        if kind == 'Linear':
            inp = inp.view(inp.size(0), -1)
            if inp.size(1) > LINEAR_THRESHOLD:
                return None
        keep = m.fixed or actions[a]
        if keep:
            m = resizeToFit(Layer(m), inp).cuda()
            inp = m(inp)
            parent.add_module(m_name, m)
        # One action is consumed per leaf, whether or not it was kept.
        a += 1
        return True

    # Container: rebuild it from its children, leaving out 'shortcut'.
    child = createParentContainer(m)
    for key, sub in m._modules.items():
        if key == 'shortcut':
            continue
        if traverse(child, sub, key, actions) is None:
            return None
    if kind in root_kinds:
        return child
    parent.add_module(m_name, child)
    return True
def build_child_model(m, actions):
    """Build a child network from teacher ``m`` and per-layer ``actions``.

    The first action is forced to 1 before the teacher is traversed.  The
    global action cursor ``a`` and the probe input ``inp`` are reset,
    ``fixLayers`` marks the layers that must be kept, and ``traverse``
    assembles the child.

    Returns the child model, or None when traversal was abandoned or when
    the child has at least as many parameters as the parent (no compression
    was achieved).
    """
    global a
    global inp

    # Work on a copy so that the caller's sequence is not modified.
    actions = list(actions)
    actions[0] = 1
    a = 0

    inp = Variable(datasetInputTensor.clone()).cuda()
    fixLayers(m)
    # Traverse the teacher model, which has a hierarchical structure, to
    # generate the child model.
    newModel = traverse(None, m, None, actions)
    if newModel is None:
        return None
    resetModel(newModel)
    # Check if any compression has been achieved.
    if numParams(newModel) >= parentSize:
        return None

    return newModel
Layer(layers[i]).toTorchTensor(skipSupport=skipSupport) 230 | actions = controller(input, (hn, cn)) 231 | return actions 232 | 233 | 234 | def rollout(model_, i): 235 | global b 236 | global R_sum 237 | layers = layersFromModule(model_) 238 | actions = rolloutActions(layers) 239 | fixLayers(model_.classifier) 240 | newModel = build_child_model(model_, [a.data.numpy()[0] for a in actions]) 241 | hashcode = hash(str(newModel)) if newModel else 0 242 | if hashcode in previousModels and constrained == False: 243 | R = previousModels[hashcode] 244 | elif newModel is None: 245 | R = -1 246 | else: 247 | print(newModel) 248 | #if numParams(newModel) >= 1700000: 249 | # return (-1, actions, newModel) 250 | acc = trainTeacherStudent(model, newModel, dataset, epochs=5) 251 | R = Reward(acc, numParams(newModel), baseline_acc, parentSize, iter=int(i), constrained=constrained, vars=[numParams(newModel)], cons=[1700000]) 252 | previousModels[hashcode] = R 253 | # TODO: Turn constrained off after 20 or so iterations 254 | #C = 1 - float(numParams(newModel))/parentSize 255 | #R = -1 if acc < 0.88 or C < 0.5 else R 256 | rewardsPerModel[i] = R 257 | accsPerModel[i] = acc 258 | paramsPerModel[i] = numParams(newModel) 259 | #torch.save(newModel, modelSavePath + '%f.net' % i) 260 | print('Val accuracy: %f' % acc) 261 | print('Compression: %f' % (1.0 - (float(numParams(newModel))/parentSize))) 262 | print('Reward achieved %f' % R) 263 | #print('Reward after baseline %f' % (R-b)) 264 | # Update reward and baseline after each rollout 265 | return (R, actions, newModel) 266 | 267 | def rollout_batch(model, N, e): 268 | global b 269 | global R_sum 270 | newModels = [] 271 | idxs = [] 272 | Rs = [0]*N 273 | actionSeqs = [] 274 | studentModels = [] 275 | for i in range(N): 276 | model_ = copy.deepcopy(model) 277 | layers = layersFromModule(model_) 278 | actions = rolloutActions(layers) 279 | actionSeqs.append(actions) 280 | newModel = build_child_model(model_, [a.data.numpy()[0] for a in 
def rollouts(N, model, e):
    """Run one batch of ``N`` rollouts on a private copy of ``model``.

    Returns the ``(Rs, actionSeqs, models)`` triple from ``rollout_batch``.
    """
    teacher_copy = copy.deepcopy(model)
    Rs, actionSeqs, models = rollout_batch(teacher_copy, N, e)
    return (Rs, actionSeqs, models)
337 | 338 | torch.save(controller, controllerSavePath) 339 | resultsFile = open(modelSavePath + 'results.txt', "w") 340 | output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel) 341 | 342 | 343 | -------------------------------------------------------------------------------- /experiments/resnet_ar_run_param_clean.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch import optim 4 | from torch.autograd import Variable 5 | import torch.autograd as autograd 6 | import numpy as np 7 | import torchvision 8 | import random 9 | #from visualize import make_dot 10 | from torch.nn.parameter import Parameter 11 | from Model import Model 12 | from utils import * 13 | from Layer import Layer 14 | import argparse 15 | import copy 16 | import signal 17 | import sys 18 | from controllers.AutoregressiveParam import * 19 | 20 | 21 | parser = argparse.ArgumentParser(description='Run layer only version') 22 | parser.add_argument('--dataset', type=str, default='svhn', metavar='N', 23 | help='which dataset to test on') 24 | parser.add_argument('--cuda', action='store_true', default=True, 25 | help='enables CUDA training') 26 | args = parser.parse_args() 27 | 28 | 29 | datasetName = args.dataset 30 | useCuda = args.cuda 31 | 32 | datasetInputTensor = None 33 | baseline_acc = 0 34 | modelSavePath = None 35 | if datasetName is 'mnist': 36 | print('Using mnist') 37 | import datasets.mnist as dataset 38 | datasetInputTensor = torch.Tensor(1, 1, 28, 28) 39 | #model = torch.load('./parent_models/mnist.pth') 40 | teacherModel = torch.load('./parent_models/lenet_mnist.pth') 41 | model = torch.load('stage1_mnist/lenet_layer_reduced.net') 42 | #baseline_acc = 0.989 43 | baseline_acc = 0.987 44 | modelSavePath = './protos_mnist/' 45 | elif datasetName is 'caltech256': 46 | print('Using caltech256') 47 | import datasets.caltech256 as dataset 48 | datasetInputTensor = torch.Tensor(1, 3, 224, 
def Reward(acc, params, baseline_acc, baseline_params):
    """Return the product of the accuracy term and the compression term.

    The accuracy term is ``acc / baseline_acc``.  The compression term is
    ``C * (2 - C)``, where ``C`` is the fraction of the baseline parameters
    that were removed.
    """
    accuracy_term = acc / baseline_acc
    removed = float(baseline_params - params)
    compression = removed / baseline_params
    size_term = compression * (2 - compression)
    return accuracy_term * size_term
def processBlock(actions, m, lookup, input_size):
    """Resize the convolutions of one residual block according to ``actions``.

    actions    -- per-layer action rows; column 3 is an index into ``lookup``
    m          -- block with a ``layers`` container (conv at key '0' and
                  optionally at key '3') and an optional ``shortcut``
    lookup     -- table of scaling factors indexed by the action value
    input_size -- number of channels entering the block

    Returns ``m`` with its layers (and shortcut, if any) replaced by resized
    versions.  The global action cursor ``a`` is advanced past the block.

    The cursor used to be the loop variable of the "void actions" loop.
    Because ``a`` is declared global, that loop overwrote the cursor with
    0, 1, 2, ... and every block after the first read the wrong rows.  The
    cursor is now advanced arithmetically from its starting value.
    NOTE(review): nesting was reconstructed from the data flow of a
    whitespace-mangled source -- confirm against the original file.
    """
    global a
    start = a
    num_layers = len(m.layers._modules)

    # The last two action rows of the block drive the channel counts.
    finalAction = actions[start + num_layers - 1][3]
    secondFinalAction = actions[start + num_layers - 2][3]

    firstConv = '0' in m.layers._modules
    secondConv = '3' in m.layers._modules
    hasShortcut = hasattr(m, 'shortcut') and m.shortcut is not None

    o = input_size
    if firstConv:
        conv = m.layers._modules['0']
        i = input_size
        k = conv.kernel_size
        s = conv.stride
        o = conv.out_channels
        if secondConv:
            o = max(int(o * lookup[finalAction]), 10)
        elif hasShortcut:
            o = max(int(o * lookup[finalAction]), 10)
            # The shortcut must produce the same number of channels.
            sc = m.shortcut._modules['0']
            m.shortcut._modules['0'] = resizeLayer(
                sc, i, o, sc.kernel_size, sc.stride, sc.padding).cuda()
            m.shortcut._modules['1'] = resizeLayer(m.shortcut._modules['1'], o, o).cuda()
        else:
            # We want output to be same as input in the event of no
            # shortcut and no secondConv.
            o = i
        p = conv.padding
        m.layers._modules['0'] = resizeLayer(conv, i, o, k, s, p).cuda()
        if '1' in m.layers._modules:
            m.layers._modules['1'] = resizeLayer(m.layers._modules['1'], o, o).cuda()

    if secondConv:
        conv = m.layers._modules['3']
        i = o
        k = conv.kernel_size
        s = conv.stride
        o = conv.out_channels
        # Channels entering the block, as seen by the (resized) first conv.
        block_in = m.layers._modules['0'].in_channels if firstConv else i
        if hasShortcut:
            o = max(int(o * lookup[secondFinalAction]), 10)
            sc = m.shortcut._modules['0']
            m.shortcut._modules['0'] = resizeLayer(
                sc, block_in, o, sc.kernel_size, sc.stride, sc.padding).cuda()
            m.shortcut._modules['1'] = resizeLayer(m.shortcut._modules['1'], o, o).cuda()
        else:
            # Without a shortcut the block output must match its input.
            o = block_in
        p = conv.padding
        m.layers._modules['3'] = resizeLayer(conv, i, o, k, s, p).cuda()
        if '4' in m.layers._modules:
            m.layers._modules['4'] = resizeLayer(m.layers._modules['4'], o, o).cuda()

    # Void the actions of this block: the original code stepped once per
    # layer before the last two, then once for each of the last two.
    a = start + max(num_layers - 2, 0) + 2
    return m
def fixBlockLayers(m):
    """Give each layer in ``m.layers`` a five-entry ``fixed`` mask.

    Every layer gets ``[True] * 5``, then the first layer's mask is replaced
    by ``[True, True, True, False, True]`` so that only its fourth entry is
    left free.  A block without layers is left untouched.

    The values are copied into a list before indexing because the object
    returned by ``values()`` cannot be subscripted on Python 3.
    """
    block_layers = list(m.layers._modules.values())
    for layer in block_layers:
        layer.fixed = [True] * 5
    if block_layers:
        block_layers[0].fixed = [True, True, True, False, True]
def rollout(model_, e):
    """Sample one child of ``model_``, train it if it is new, and score it.

    model_ -- private copy of the network to compress
    e      -- identifier of this rollout; used as the key of the per-model
              statistics and in the checkpoint file name

    Returns ``(R, actions, newModel)``.  ``R`` is -1 when no child could be
    built, the stored reward when an identical child was seen before, and
    otherwise the reward computed from the freshly trained child.
    ``actions`` holds only the controller outputs whose entry in the
    flattened ``fixed`` mask of the layers is truthy.
    """
    layers = layersFromModule(model_)
    actions = rolloutActions(layers)
    newModel = build_child_model(model_, [act.data.numpy()[0] for act in actions])

    actionsMask = np.ravel([l.fixed for l in layers])
    actions = [actions[pos] for pos in range(len(actionsMask)) if actionsMask[pos]]

    print(newModel)
    hashcode = hash(str(newModel)) if newModel is not None else 0
    print(hashcode)
    if hashcode in previousModels:
        R = previousModels[hashcode]
    elif newModel is None:
        R = -1
    else:
        print(newModel)
        # Compare the dataset name by value; an identity test on a string
        # taken from the command line is unreliable.
        if datasetName != 'caltech256':
            acc = trainTeacherStudent(teacherModel, newModel, dataset, epochs=5)
        else:
            acc = trainNormal(newModel, dataset, epochs=3)
        R = Reward(acc, numParams(newModel), baseline_acc, parentSize)
        previousModels[hashcode] = R
        # Key the statistics by the rollout id.  The previous key was the
        # leftover index of the mask loop, so every rollout wrote to the
        # same slot.
        rewardsPerModel[e] = R
        accsPerModel[e] = acc
        paramsPerModel[e] = numParams(newModel)
        torch.save(newModel, modelSavePath + '%f.net' % e)
        print('Val accuracy: %f' % acc)
        print('Compression: %f' % (1.0 - (float(numParams(newModel)) / parentSize)))
        print('Reward achieved %f' % R)
    return (R, actions, newModel)
def rollouts(N, model, e):
    """Run ``N`` rollouts on deep copies of ``model``.

    Rollout ``i`` is identified as ``e + i/10``.  Returns three parallel lists:
    rewards, action sequences and child models.
    """
    results = [rollout(copy.deepcopy(model), e + float(i)/10) for i in range(N)]
    Rs = [reward for reward, _, _ in results]
    actionSeqs = [actions for _, actions, _ in results]
    models = [child for _, _, child in results]
    return (Rs, actionSeqs, models)


def update_controller(actionSeqs, avgR):
    """Reinforce every sampled action and take one optimiser step per sequence.

    Reads the globals ``e`` (for the log line), ``b`` (baseline) and ``opti``.
    """
    print('Reinforcing for epoch %d' % e)
    for actions in actionSeqs:
        for action in actions:
            action.reinforce(avgR - b)
        opti.zero_grad()
        autograd.backward(actions, [None] * len(actions))
        opti.step()
parser.add_argument('--dataset', type=str, default='cifar10', metavar='N',
                    help='which dataset to test on')
# NOTE(review): with action='store_true' and default=True this flag cannot be
# switched off from the command line.
parser.add_argument('--cuda', action='store_true', default=True,
                    help='enables CUDA training')
args = parser.parse_args()


datasetName = args.dataset
useCuda = args.cuda
loadController = False
skipSupport = False

datasetInputTensor = None
baseline_acc = 0
modelSavePath = None
controllerSavePath = None
# Dataset names are compared with ``==``.  The previous ``is`` comparisons
# tested object identity against string literals, which does not match a
# string parsed from the command line, so the cifar10 fallback always ran.
if datasetName == 'mnist':
    print('Using mnist')
    import datasets.mnist as dataset
    torch.cuda.set_device(3)
    datasetInputTensor = torch.Tensor(1, 1, 28, 28)
    model = torch.load('./parent_models/mnistvgg13.net')
    #model = torch.load('./parent_models/lenet_mnist.pth')
    baseline_acc = 0.9955
    #baseline_acc = 0.983
    modelSavePath = './protos_mnist/'
    controllerSavePath = './controllers_mnist/lstm_lenet.net'
    controllerLoadPath = './controllers_mnist/lstm_lenet.net'
elif datasetName == 'cifar100':
    print('Using cifar100')
    import datasets.cifar100 as dataset
    baseline_acc = 0.67
    datasetInputTensor = torch.Tensor(1, 3, 32, 32)
    model = torch.load('./parent_models/resnet18_cifar100.net')
    modelSavePath = './protos_cifar100/'
    controllerSavePath = './controllers_cifar100/lstm_resnet18.net'
    controllerLoadPath = './controllers_cifar100/lstm_resnet18.net'
elif datasetName == 'caltech256':
    print('Using caltech256')
    import datasets.caltech256 as dataset
    baseline_acc = 0.79
    datasetInputTensor = torch.Tensor(1, 3, 224, 224)
    model = torch.load('./parent_models/caltech256_resnet18.net')
    modelSavePath = './protos_caltech256/'
    controllerSavePath = './controllers_caltech256/lstm_resnet18.net'
    controllerLoadPath = './controllers_caltech256/lstm_resnet18.net'
elif datasetName == 'svhn':
    print('Using svhn')
    import datasets.svhn as dataset
    baseline_acc = 0.9525
    datasetInputTensor = torch.Tensor(1, 3, 32, 32)
    model = torch.load('./parent_models/svhn_resnet18.net')
    modelSavePath = './protos_svhn/'
    controllerSavePath = './controllers_svhn/lstm_resnet18.net'
    controllerLoadPath = './controllers_svhn/lstm_resnet18.net'
elif datasetName == 'imagenet':
    print('Using imagenet')
    import datasets.imagenet as dataset
    torch.cuda.set_device(2)
    datasetInputTensor = torch.Tensor(1, 3, 224, 224)
    model = torch.load('./parent_models/resnet34_imagenet.net')
    # Trailing slash added: the path is concatenated directly with file names
    # such as 'results.txt'.
    modelSavePath = './protos_imagenet/'
    controllerSavePath = './controllers_imagenet/lstm_resnet34.net'
    controllerLoadPath = './controllers_imagenet/lstm_resnet34.net'
else:
    torch.cuda.set_device(0)
    print('Using cifar')
    import datasets.cifar10 as dataset
    datasetInputTensor = torch.Tensor(1, 3, 32, 32)
    model = torch.load('./parent_models/resnet18cifar.net')
    baseline_acc = 0.9205
    modelSavePath = './protos_cifar/'
    #controllerSavePath = './controllers_cifar/lstm_vgg19.net'
    #controllerSavePath = './controllers_cifar/lstm_resnet34.net'
    controllerSavePath = './controllers_cifar/lstm_resnet18.net'
    # NOTE(review): load path names resnet34 while the save path names
    # resnet18 -- confirm this is intentional.
    controllerLoadPath = './controllers_cifar/lstm_resnet34.net'

dataset.args.cuda = useCuda
parentSize = numParams(model)
def getEpsilon(iter, max_iter=15.0):
    """Return the annealing factor for the constrained reward.

    The factor is ``(1 - iter/max_iter) ** 4`` clamped to ``[0, 1]``: it is 1
    at ``iter == 0`` and reaches 0 at ``iter == max_iter``.  The base is
    clamped *before* raising it to the even power; otherwise the factor would
    grow back towards 1 for ``iter > max_iter``.
    """
    remaining = max(0.0, 1 - iter / float(max_iter))
    return min(1, remaining ** 4)


def getConstrainedReward(R_a, R_c, cons, vars, iter):
    """Combine accuracy and compression rewards under a model-size constraint.

    ``vars[0]`` is the model size and ``cons[0]`` the size limit.  When the
    limit is exceeded the product ``R_a * R_c`` is scaled by epsilon and shifted
    by ``eps - 1``; otherwise the plain product is returned.
    """
    eps = getEpsilon(iter)
    modelSize = vars[0]
    modelSizeConstraint = cons[0]
    if modelSize > modelSizeConstraint:
        return (eps - 1) + eps * (R_a * R_c)
    return R_a * R_c


def Reward(acc, params, baseline_acc, baseline_params, constrained=False, iter=50, cons=None, vars=None):
    """Compute the reward of a child model relative to its parent.

    Args:
        acc: accuracy of the child model.
        params: parameter count of the child model.
        baseline_acc: accuracy of the parent model.
        baseline_params: parameter count of the parent model.
        constrained: when true, apply the size-constrained reward.
        iter: current iteration, forwarded to the constrained reward.
        cons: sequence whose first element is the size limit.
        vars: sequence whose first element is the child's size.

    Returns:
        ``R_a * R_c`` with ``R_a = acc / baseline_acc`` and
        ``R_c = C * (2 - C)``, ``C`` being the fraction of parameters removed;
        or the constrained variant when ``constrained`` is true.

    Raises:
        IndexError: if ``constrained`` is true and ``cons`` or ``vars`` is empty.
    """
    # None replaces the former mutable list defaults.
    cons = [] if cons is None else cons
    vars = [] if vars is None else vars
    R_a = acc / baseline_acc
    C = float(baseline_params - params) / baseline_params
    R_c = C * (2 - C)
    if constrained:
        return getConstrainedReward(R_a, R_c, cons, vars, iter)
    return R_a * R_c
# Parameters for LSTM controller
num_layers = 2
num_hidden = 30
num_input = 7 if skipSupport else 5
num_output = 2
seq_len = 1

controller = LSTM(num_input, num_output, num_hidden, num_layers, bidirectional=True)
if loadController:
    controller = torch.load(controllerLoadPath)
opti = optim.Adam(controller.parameters(), lr=0.003)

# Reward cache keyed by hash(str(model)).
previousModels = {}
# Store statistics for each model
accsPerModel = {}
paramsPerModel = {}
rewardsPerModel = {}
numSavedModels = 0

# Running sum of average rewards and the baseline derived from it.
R_sum = 0
b = 0

# A Linear layer whose flattened input exceeds this width aborts the build.
LINEAR_THRESHOLD = 50000

# Traversal state shared by traverse() and build_child_model():
# ``a`` indexes the next action, ``inp`` is the tensor pushed through the
# layers kept so far.
a = 0
inp = Variable(datasetInputTensor.clone()).cuda()
def traverse(parent, m, m_name, actions):
    """Recursively rebuild module ``m`` according to ``actions``.

    Container classes (listed below) are recreated with
    ``createParentContainer`` and their children visited in order, skipping any
    child named 'shortcut'.  A leaf layer is kept when ``m.fixed`` or
    ``actions[a]`` is truthy; kept layers go through ``resizeToFit`` and are
    applied to the global ``inp``.  The global counter ``a`` advances once per
    leaf.

    Returns ``None`` when a Linear layer's flattened input is wider than
    ``LINEAR_THRESHOLD`` (or a child returned ``None``), the rebuilt container
    for the top-level model classes, and ``True`` otherwise.
    """
    global a
    global inp
    classname = m.__class__.__name__
    if classname in ['Sequential', 'BasicBlock', 'Bottleneck', 'ResNet', 'VGG', 'LeNet', 'mnist_model', 'Model']:
        child = createParentContainer(m)
        for i in m._modules.keys():
            if i == 'shortcut':
                continue
            #print(i)
            res = traverse(child, m._modules[i], i, actions)
            if res == None:
                return None
        # Nested containers are attached to their parent; top-level model
        # classes are returned to the caller instead.
        if classname not in ['ResNet', 'VGG', 'LeNet', 'mnist_model', 'Model']:
            parent.add_module(m_name, child)
        else:
            return child
    else:
        if classname == 'Linear':
            # Flatten before the first fully connected layer.
            inp = inp.view(inp.size(0), -1)
            #print(inp.size(1))
            if inp.size(1) > LINEAR_THRESHOLD:
                return None
        if m.fixed or actions[a]:
            m = resizeToFit(Layer(m), inp).cuda()
            inp = m(inp)
            parent.add_module(m_name, m)
        a += 1
    return True

def fixLayers(m):
    """Set the ``fixed`` flag on every layer returned by ``flattenModule(m)``.

    All flags are first cleared; the last layer, every Conv2d whose stride is
    not (1, 1), and every Linear or AvgPool2d layer are then marked fixed.
    """
    #m.classifier._modules['0'].fixed = True
    layers = flattenModule(m)
    # Initialize
    for l in layers:
        l.fixed = False

    layers[-1].fixed=True
    # Fix any layers you want here
    # ----
    # Fix all shortcut layers and corresponding stride layers, but not pre layers
    for l in layers:
        # Fix final linear and average pooling layer
        cn = l.__class__.__name__
        if hasattr(l, 'stride') and l.stride != (1, 1) and cn == 'Conv2d':
            l.fixed = True
        if cn == 'Linear' or cn == 'AvgPool2d':
            l.fixed = True
    # ----
def build_child_model(m, actions):
    """Build a child model from ``m`` by keeping the layers ``actions`` select.

    The first action is forced to 1, the traversal globals ``a`` and ``inp``
    are reset, layers are flagged with ``fixLayers`` and the module tree is
    rebuilt with ``traverse``.

    Returns the rebuilt model after ``resetModel``, or ``None`` when the
    traversal fails or the child has at least as many parameters as the parent.
    """
    global a
    global inp

    actions[0] = 1
    a = 0
    inp = Variable(datasetInputTensor.clone()).cuda()

    fixLayers(m)
    # Walk the hierarchical teacher model to generate the child.
    child = traverse(None, m, None, actions)
    if child is None:
        return None

    resetModel(child)
    # Reject children that achieve no compression.
    compressed = numParams(child) < parentSize
    return child if compressed else None


def rolloutActions(layers):
    """Run the controller over one descriptor per layer and return its actions.

    Hidden and cell states start at zero with ``num_layers * 2`` leading
    entries; the descriptors honour the global ``skipSupport`` flag.
    """
    global controller
    state_shape = (num_layers * 2, 1, num_hidden)
    hn = Variable(torch.zeros(*state_shape))
    cn = Variable(torch.zeros(*state_shape))
    seq = Variable(torch.Tensor(len(layers), 1, num_input))
    for idx, layer in enumerate(layers):
        seq[idx] = Layer(layer).toTorchTensor(skipSupport=skipSupport)
    return controller(seq, (hn, cn))
def rollout(model_, i):
    """Sample one child of ``model_``, train it and return its reward.

    Args:
        model_: module to derive the child from.
        i: rollout identifier; used as statistics key and (as ``int(i)``) as
            the iteration passed to ``Reward``.

    Returns:
        ``(R, actions, newModel)``; ``R`` is ``-1`` when no child was built and
        comes from the cache when the same model string was seen before and
        the run is unconstrained.
    """
    global b
    global R_sum
    layers = layersFromModule(model_)
    actions = rolloutActions(layers)
    fixLayers(model_)
    newModel = build_child_model(model_, [a.data.numpy()[0] for a in actions])
    hashcode = hash(str(newModel)) if newModel else 0
    if hashcode in previousModels and not constrained:
        R = previousModels[hashcode]
    elif newModel is None:
        R = -1
    else:
        print(newModel)
        acc = trainTeacherStudent(model, newModel, dataset, epochs=5)
        size = numParams(newModel)
        R = Reward(acc, size, baseline_acc, parentSize, iter=int(i), constrained=constrained, vars=[size], cons=[1700000])
        previousModels[hashcode] = R
        # TODO: Turn constrained off after 20 or so iterations
        rewardsPerModel[i] = R
        accsPerModel[i] = acc
        paramsPerModel[i] = size
        print('Val accuracy: %f' % acc)
        print('Compression: %f' % (1.0 - (float(size)/parentSize)))
        print('Reward achieved %f' % R)
    return (R, actions, newModel)


def rollout_batch(model, N, e):
    """Sample ``N`` children of ``model`` and train the new ones together.

    Args:
        model: parent model; each rollout works on its own deep copy.
        N: number of rollouts.
        e: epoch index, used in saved file names and as ``Reward`` iteration.

    Returns:
        ``(Rs, actionSeqs, newModels)``: one reward and one action sequence per
        rollout, plus the list of children that were actually trained.
    """
    global b
    global R_sum
    Rs = [0] * N
    actionSeqs = []
    newModels = []
    pending = []  # (rollout index, hashcode) of every child awaiting training
    for i in range(N):
        model_ = copy.deepcopy(model)
        layers = layersFromModule(model_)
        actions = rolloutActions(layers)
        actionSeqs.append(actions)
        newModel = build_child_model(model_, [a.data.numpy()[0] for a in actions])
        hashcode = hash(str(newModel)) if newModel else 0
        if hashcode in previousModels and not constrained:
            Rs[i] = previousModels[hashcode]
        elif newModel is None:
            Rs[i] = -1
        else:
            print(newModel)
            torch.save(newModel, modelSavePath + '%f_%f.net' % (e, i))
            newModels.append(newModel)
            pending.append((i, hashcode))

    # Nothing to train when every rollout was cached or invalid.
    if not newModels:
        accs = []
    # Compare by value; ``is`` against a literal tests identity only.
    elif datasetName == 'caltech256':
        accs = trainNormalParallel(list(newModels), dataset, epochs=5)
    else:
        accs = trainTeacherStudentParallel(model, list(newModels), dataset, epochs=5)

    for acc in accs:
        print('Val accuracy: %f' % acc)
    for child in newModels:
        print('Compression: %f' % (1.0 - (float(numParams(child))/parentSize)))
    for (i, hashcode), child, acc in zip(pending, newModels, accs):
        size = numParams(child)
        R = Reward(acc, size, baseline_acc, parentSize, iter=int(e), constrained=constrained, vars=[size], cons=[1700000])
        Rs[i] = R
        # Remember the reward so the cache lookup above can actually hit;
        # the original never stored anything here.
        previousModels[hashcode] = R
    for R in Rs:
        print('Reward achieved %f' % R)
    return (Rs, actionSeqs, newModels)
def rollouts(N, model, e):
    """Return ``rollout_batch`` results for ``N`` rollouts on a copy of ``model``."""
    Rs, actionSeqs, models = rollout_batch(copy.deepcopy(model), N, e)
    return (Rs, actionSeqs, models)


def update_controller(actionSeqs, avgR):
    """Reinforce each action sequence and step the optimiser once per sequence.

    Reads the globals ``e`` (for the log line), ``b`` (baseline) and ``opti``.
    """
    print('Reinforcing for epoch %d' % e)
    for seq in actionSeqs:
        seq.reinforce(avgR - b)
        opti.zero_grad()
        autograd.backward(seq, [None for _ in seq])
        opti.step()
class CIFAR(nn.Module):
    """Feature extractor followed by a single linear classifier."""

    def __init__(self, features, n_channel, num_classes):
        """Store ``features`` and build ``Linear(n_channel, num_classes)``.

        ``features`` must be an ``nn.Sequential``; both sub-modules are printed.
        """
        super(CIFAR, self).__init__()
        assert isinstance(features, nn.Sequential), type(features)
        self.features = features
        head = nn.Linear(n_channel, num_classes)
        self.classifier = nn.Sequential(head)
        print(self.features)
        print(self.classifier)

    def forward(self, x):
        """Apply the features, flatten per sample, then classify."""
        feats = self.features(x)
        flat = feats.view(feats.size(0), -1)
        return self.classifier(flat)


def make_layers(cfg, batch_norm=False):
    """Translate a layer configuration into an ``nn.Sequential``.

    Each entry of ``cfg`` is either ``'M'`` (2x2 max pooling with stride 2), an
    int (output channels of a 3x3 convolution with padding 1) or a tuple
    ``(out_channels, padding)``.  Convolutions are followed by ReLU, with a
    non-affine BatchNorm2d in between when ``batch_norm`` is true.  The first
    convolution expects 3 input channels.
    """
    modules = []
    channels = 3
    for spec in cfg:
        if spec == 'M':
            modules.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        if isinstance(spec, tuple):
            width, pad = spec[0], spec[1]
        else:
            width, pad = spec, 1
        modules.append(nn.Conv2d(channels, width, kernel_size=3, padding=pad))
        if batch_norm:
            modules.append(nn.BatchNorm2d(width, affine=False))
        modules.append(nn.ReLU())
        channels = width
    return nn.Sequential(*modules)
def _build_cifar(n_channel, num_classes, url_key, pretrained):
    """Build the shared CIFAR architecture, optionally loading weights.

    Weights are fetched from ``model_urls[url_key]`` whenever ``pretrained`` is
    not ``None``; the value of ``pretrained`` itself is not used as a path.
    """
    widths = [n_channel, 2*n_channel, 4*n_channel]
    cfg = []
    for width in widths:
        cfg += [width, width, 'M']
    cfg += [(8*n_channel, 0), 'M']
    layers = make_layers(cfg, batch_norm=True)
    model = CIFAR(layers, n_channel=8*n_channel, num_classes=num_classes)
    if pretrained is None:
        return model
    m = model_zoo.load_url(model_urls[url_key])
    state_dict = m.state_dict() if isinstance(m, nn.Module) else m
    assert isinstance(state_dict, (dict, OrderedDict)), type(state_dict)
    model.load_state_dict(state_dict)
    return model


def cifar10(n_channel, pretrained=None):
    """Return the 10-class CIFAR model with base width ``n_channel``."""
    return _build_cifar(n_channel, 10, 'cifar10', pretrained)


def cifar100(n_channel, pretrained=None):
    """Return the 100-class CIFAR model with base width ``n_channel``."""
    return _build_cifar(n_channel, 100, 'cifar100', pretrained)


if __name__ == '__main__':
    model = cifar10(128, pretrained='log/cifar10/best-135.pth')
    print(model)
class Bottleneck(nn.Module):
    """BN-ReLU-1x1 conv then BN-ReLU-3x3 conv; output is concatenated with the input."""

    def __init__(self, in_planes, growth_rate):
        super(Bottleneck, self).__init__()
        inner = 4*growth_rate
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv1 = nn.Conv2d(in_planes, inner, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(inner)
        self.conv2 = nn.Conv2d(inner, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        """Return the new feature maps stacked in front of ``x`` along dim 1."""
        squeezed = self.conv1(F.relu(self.bn1(x)))
        fresh = self.conv2(F.relu(self.bn2(squeezed)))
        return torch.cat([fresh, x], 1)


class Transition(nn.Module):
    """BN-ReLU-1x1 conv followed by 2x2 average pooling."""

    def __init__(self, in_planes, out_planes):
        super(Transition, self).__init__()
        self.bn = nn.BatchNorm2d(in_planes)
        self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False)

    def forward(self, x):
        projected = self.conv(F.relu(self.bn(x)))
        return F.avg_pool2d(projected, 2)


class DenseNet(nn.Module):
    """Four dense stages separated by three transitions, then BN, pooling and a linear head."""

    def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10):
        """Build the network.

        Args:
            block: class instantiated as ``block(in_planes, growth_rate)``.
            nblocks: number of blocks in each of the four dense stages.
            growth_rate: channels added by every block.
            reduction: channel multiplier applied (with floor) at each transition.
            num_classes: size of the linear output.
        """
        super(DenseNet, self).__init__()
        self.growth_rate = growth_rate

        width = 2*growth_rate
        self.conv1 = nn.Conv2d(3, width, kernel_size=3, padding=1, bias=False)

        # Stages 1-3: dense stage followed by a channel-reducing transition.
        # Attribute names and creation order match dense1/trans1 ... dense3/trans3.
        for stage in range(3):
            depth = nblocks[stage]
            setattr(self, 'dense%d' % (stage + 1), self._make_dense_layers(block, width, depth))
            width += depth*growth_rate
            reduced = int(math.floor(width*reduction))
            setattr(self, 'trans%d' % (stage + 1), Transition(width, reduced))
            width = reduced

        # Stage 4 has no transition after it.
        self.dense4 = self._make_dense_layers(block, width, nblocks[3])
        width += nblocks[3]*growth_rate

        self.bn = nn.BatchNorm2d(width)
        self.linear = nn.Linear(width, num_classes)

    def _make_dense_layers(self, block, in_planes, nblock):
        """Stack ``nblock`` blocks, widening the input by ``growth_rate`` each time."""
        stack = [block(in_planes + k*self.growth_rate, self.growth_rate)
                 for k in range(nblock)]
        return nn.Sequential(*stack)

    def forward(self, x):
        out = self.conv1(x)
        stages = (
            (self.dense1, self.trans1),
            (self.dense2, self.trans2),
            (self.dense3, self.trans3),
        )
        for dense, trans in stages:
            out = trans(dense(out))
        out = self.dense4(out)
        out = F.avg_pool2d(F.relu(self.bn(out)), 4)
        out = out.view(out.size(0), -1)
        return self.linear(out)
def densenet121():
    """DenseNet with stages (6, 12, 24, 16) and growth rate 32."""
    return DenseNet(block=Bottleneck, nblocks=[6,12,24,16], growth_rate=32)


def densenet169():
    """DenseNet with stages (6, 12, 32, 32) and growth rate 32."""
    return DenseNet(block=Bottleneck, nblocks=[6,12,32,32], growth_rate=32)


def densenet201():
    """DenseNet with stages (6, 12, 48, 32) and growth rate 32."""
    return DenseNet(block=Bottleneck, nblocks=[6,12,48,32], growth_rate=32)


def densenet161():
    """DenseNet with stages (6, 12, 36, 24) and growth rate 48."""
    return DenseNet(block=Bottleneck, nblocks=[6,12,36,24], growth_rate=48)


def densenet_cifar():
    """DenseNet with stages (6, 12, 24, 16) and growth rate 12."""
    return DenseNet(block=Bottleneck, nblocks=[6,12,24,16], growth_rate=12)


def test_densenet():
    """Push one random 3x32x32 sample through ``densenet_cifar`` and print the output."""
    sample = torch.randn(1,3,32,32)
    net = densenet_cifar()
    print(net(Variable(sample)))

# test_densenet()
def _conv_bn_relu(in_planes, out_planes, kernel_size, padding=0):
    """Return [Conv2d, BatchNorm2d, in-place ReLU] as a list of modules."""
    return [
        nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, padding=padding),
        nn.BatchNorm2d(out_planes),
        nn.ReLU(True),
    ]


class Inception(nn.Module):
    """Four parallel branches whose outputs are concatenated along the channel dim."""

    def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes):
        super(Inception, self).__init__()
        # 1x1 conv branch
        self.b1 = nn.Sequential(*_conv_bn_relu(in_planes, n1x1, 1))

        # 1x1 conv -> 3x3 conv branch
        self.b2 = nn.Sequential(*(
            _conv_bn_relu(in_planes, n3x3red, 1)
            + _conv_bn_relu(n3x3red, n3x3, 3, padding=1)
        ))

        # 1x1 conv -> two stacked 3x3 convs (the "5x5" branch)
        self.b3 = nn.Sequential(*(
            _conv_bn_relu(in_planes, n5x5red, 1)
            + _conv_bn_relu(n5x5red, n5x5, 3, padding=1)
            + _conv_bn_relu(n5x5, n5x5, 3, padding=1)
        ))

        # 3x3 pool -> 1x1 conv branch
        self.b4 = nn.Sequential(*(
            [nn.MaxPool2d(3, stride=1, padding=1)]
            + _conv_bn_relu(in_planes, pool_planes, 1)
        ))

    def forward(self, x):
        branches = (self.b1, self.b2, self.b3, self.b4)
        return torch.cat([branch(x) for branch in branches], 1)


class GoogLeNet(nn.Module):
    """Inception network with a 3-channel stem and a 10-way linear output."""

    def __init__(self):
        super(GoogLeNet, self).__init__()
        self.pre_layers = nn.Sequential(*_conv_bn_relu(3, 192, 3, padding=1))

        self.a3 = Inception(192, 64, 96, 128, 16, 32, 32)
        self.b3 = Inception(256, 128, 128, 192, 32, 96, 64)

        self.maxpool = nn.MaxPool2d(3, stride=2, padding=1)

        self.a4 = Inception(480, 192, 96, 208, 16, 48, 64)
        self.b4 = Inception(512, 160, 112, 224, 24, 64, 64)
        self.c4 = Inception(512, 128, 128, 256, 24, 64, 64)
        self.d4 = Inception(512, 112, 144, 288, 32, 64, 64)
        self.e4 = Inception(528, 256, 160, 320, 32, 128, 128)

        self.a5 = Inception(832, 256, 160, 320, 32, 128, 128)
        self.b5 = Inception(832, 384, 192, 384, 48, 128, 128)

        self.avgpool = nn.AvgPool2d(8, stride=1)
        self.linear = nn.Linear(1024, 10)

    def forward(self, x):
        # The same max-pool module is reused after the 3-series and 4-series.
        pipeline = (
            self.pre_layers,
            self.a3, self.b3, self.maxpool,
            self.a4, self.b4, self.c4, self.d4, self.e4, self.maxpool,
            self.a5, self.b5,
            self.avgpool,
        )
        for stage in pipeline:
            x = stage(x)
        x = x.view(x.size(0), -1)
        return self.linear(x)

# net = GoogLeNet()
# x = torch.randn(1,3,32,32)
# y = net(Variable(x))
# print(y.size())
class LeNet(nn.Module):
    """LeNet with named layers: two conv/pool stages and three linear layers."""

    def __init__(self):
        super(LeNet, self).__init__()
        self.features = nn.Sequential()
        self.classifier = nn.Sequential()
        # Layers are registered by name, in order.
        feature_layers = (
            ('conv1', nn.Conv2d(1, 6, 5)),
            ('relu1', nn.ReLU()),
            ('pool1', nn.MaxPool2d(2, 2)),
            ('conv2', nn.Conv2d(6, 16, 5)),
            ('relu2', nn.ReLU()),
            ('pool2', nn.MaxPool2d(2, 2)),
        )
        for name, layer in feature_layers:
            self.features.add_module(name, layer)
        classifier_layers = (
            ('linear1', nn.Linear(16*4*4, 120)),
            ('relu1', nn.ReLU()),
            ('linear2', nn.Linear(120, 84)),
            ('relu2', nn.ReLU()),
            ('linear3', nn.Linear(84, 10)),
            ('lsm', nn.LogSoftmax()),
        )
        for name, layer in classifier_layers:
            self.classifier.add_module(name, layer)

    def forward(self, x):
        """Apply the features, flatten per sample, then classify."""
        maps = self.features(x)
        flat = maps.view(maps.size(0), -1)
        return self.classifier(flat)


class mnist_model(nn.Module):
    """Four-convolution MNIST network; the feature stack is stored as ``feats``."""

    def __init__(self):
        super(mnist_model, self).__init__()
        stack = [
            nn.Conv2d(1, 32, 5, 1, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(True),
            nn.BatchNorm2d(32),
        ]
        stack += [
            nn.Conv2d(32, 64, 3, 1, 1),
            nn.ReLU(True),
            nn.BatchNorm2d(64),
        ]
        stack += [
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(True),
            nn.BatchNorm2d(64),
        ]
        stack += [
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.ReLU(True),
            nn.BatchNorm2d(128),
        ]
        stack.append(nn.AvgPool2d(6, 6))
        self.feats = nn.Sequential(*stack)

        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(128, 10),
            nn.ReLU(True)
        )

    def forward(self, inputs):
        """Apply the features, reshape to rows of 128 values, then classify."""
        pooled = self.feats(inputs)
        return self.classifier(pooled.view(-1, 128))
class mnist_model(nn.Module):
    """Four-convolution MNIST network; the feature stack is stored as ``features``."""

    def __init__(self):
        super(mnist_model, self).__init__()
        conv_stages = (
            # (in, out, kernel, pool after conv?)
            (1, 32, 5, True),
            (32, 64, 3, False),
            (64, 64, 3, True),
            (64, 128, 3, False),
        )
        stack = []
        for in_ch, out_ch, kernel, pooled in conv_stages:
            stack.append(nn.Conv2d(in_ch, out_ch, kernel, 1, 1))
            if pooled:
                stack.append(nn.MaxPool2d(2, 2))
            stack.append(nn.ReLU(True))
            stack.append(nn.BatchNorm2d(out_ch))
        stack.append(nn.AvgPool2d(6, 6))
        self.features = nn.Sequential(*stack)

        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(128, 10),
            nn.ReLU(True)
        )

    def forward(self, inputs):
        """Apply the features, reshape to rows of 128 values, then classify."""
        pooled = self.features(inputs)
        flat = pooled.view(-1, 128)
        return self.classifier(flat)
def _residual_forward(block, x):
    """Shared forward pass: ``relu(block.layers(x) + skip)``.

    ``skip`` is ``block.shortcut(x)`` when the shortcut is truthy, else ``x``.
    """
    out = block.layers(x)
    skip = block.shortcut(x) if block.shortcut else x
    out += skip
    return F.relu(out)


# Following 2 classes are new - Bhav Ashok
class BasicBlockModifiable(nn.Module):
    """Residual block whose layer stack is filled in after construction."""
    expansion = 1

    def __init__(self, shortcut=None):
        super(BasicBlockModifiable, self).__init__()
        self.layers = nn.Sequential()
        self.shortcut = shortcut

    def forward(self, x):
        return _residual_forward(self, x)

    def add_module(self, name, module):
        """Adopt a Sequential as the layer stack, or append any other module to it."""
        if module.__class__.__name__ != 'Sequential':
            self.layers.add_module(name, module)
            return
        self.layers = module


class ResNetModifiable(nn.Module):
    """ResNet-shaped container; its sub-modules are attached by the caller."""

    def __init__(self):
        super(ResNetModifiable, self).__init__()

    def forward(self, x):
        for stage in (self.pre_layers, self.layer1, self.layer2,
                      self.layer3, self.layer4, self.avgpool):
            x = stage(x)
        x = x.view(x.size(0), -1)
        return self.linear(x)


def conv3x3(in_planes, out_planes, stride=1):
    """Return a bias-free 3x3 convolution with padding 1."""
    return nn.Conv2d(
        in_planes, out_planes,
        kernel_size=3, stride=stride, padding=1, bias=False,
    )


class BasicBlock(nn.Module):
    """Two 3x3 convolutions with batch norm and an optional shortcut module."""
    expansion = 1

    def __init__(self, in_planes, planes, stride=1, shortcut=None):
        super(BasicBlock, self).__init__()
        self.layers = nn.Sequential(
            conv3x3(in_planes, planes, stride),
            nn.BatchNorm2d(planes),
            nn.ReLU(True),
            conv3x3(planes, planes),
            nn.BatchNorm2d(planes),
        )
        self.shortcut = shortcut

    def forward(self, x):
        return _residual_forward(self, x)


class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 convolutions; the output has ``planes * 4`` channels."""
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, shortcut=None):
        super(Bottleneck, self).__init__()
        wide = planes * 4
        self.layers = nn.Sequential(
            nn.Conv2d(in_planes, planes, kernel_size=1, bias=False),
            nn.BatchNorm2d(planes),
            nn.ReLU(True),
            nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False),
            nn.BatchNorm2d(planes),
            nn.ReLU(True),
            nn.Conv2d(planes, wide, kernel_size=1, bias=False),
            nn.BatchNorm2d(wide),
        )
        self.shortcut = shortcut

    def forward(self, x):
        return _residual_forward(self, x)


class ResNet(nn.Module):
    """Stem, four residual stages, average pooling and a linear classifier."""

    def __init__(self, block, nblocks, num_classes=10):
        """Build the network.

        Args:
            block: block class with an ``expansion`` attribute, instantiated as
                ``block(in_planes, planes, stride, shortcut)``.
            nblocks: number of blocks in each of the four stages.
            num_classes: size of the linear output.
        """
        super(ResNet, self).__init__()
        self.in_planes = 64
        self.pre_layers = nn.Sequential(
            conv3x3(3,64),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
        )
        self.layer1 = self._make_layer(block, 64, nblocks[0])
        self.layer2 = self._make_layer(block, 128, nblocks[1], stride=2)
        self.layer3 = self._make_layer(block, 256, nblocks[2], stride=2)
        self.layer4 = self._make_layer(block, 512, nblocks[3], stride=2)
        self.avgpool = nn.AvgPool2d(4)
        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, nblocks, stride=1):
        """Build one stage; only its first block gets the stride and shortcut."""
        out_planes = planes * block.expansion
        shortcut = None
        # A projection shortcut is needed whenever the shape changes.
        if stride != 1 or self.in_planes != out_planes:
            shortcut = nn.Sequential(
                nn.Conv2d(self.in_planes, out_planes,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        stage = [block(self.in_planes, planes, stride, shortcut)]
        self.in_planes = out_planes
        for _ in range(1, nblocks):
            stage.append(block(self.in_planes, planes))
        return nn.Sequential(*stage)

    def forward(self, x):
        for stage in (self.pre_layers, self.layer1, self.layer2,
                      self.layer3, self.layer4, self.avgpool):
            x = stage(x)
        x = x.view(x.size(0), -1)
        return self.linear(x)
def resnet18():
    """ResNet built from BasicBlock with stage depths (2, 2, 2, 2)."""
    return ResNet(block=BasicBlock, nblocks=[2,2,2,2])


def resnet34():
    """ResNet built from BasicBlock with stage depths (3, 4, 6, 3)."""
    return ResNet(block=BasicBlock, nblocks=[3,4,6,3])


def resnet50():
    """ResNet built from Bottleneck with stage depths (3, 4, 6, 3)."""
    return ResNet(block=Bottleneck, nblocks=[3,4,6,3])


def resnet101():
    """ResNet built from Bottleneck with stage depths (3, 4, 23, 3)."""
    return ResNet(block=Bottleneck, nblocks=[3,4,23,3])


def resnet152():
    """ResNet built from Bottleneck with stage depths (3, 8, 36, 3)."""
    return ResNet(block=Bottleneck, nblocks=[3,8,36,3])

# net = ResNet(BasicBlock, [2,2,2,2])
# x = torch.randn(1,3,32,32)
# y = net(Variable(x))
# print(y.size())
vgg_base#nn.ModuleList(base) 47 | # Layer learns to scale the l2 normalized features from conv4_3 48 | num_outputs = 512 49 | for k in self.vgg._modules: 50 | if int(k) < 23 and self.vgg._modules[k].__class__.__name__ == 'Conv2d': 51 | num_outputs = self.vgg._modules[k].out_channels 52 | self.L2Norm = L2Norm(num_outputs, 20) 53 | self.extras = nn.ModuleList(extras) 54 | 55 | self.loc = nn.ModuleList(head[0]) 56 | self.conf = nn.ModuleList(head[1]) 57 | 58 | if phase == 'test': 59 | self.softmax = nn.Softmax() 60 | self.detect = Detect(21, 0, 200, 0.01, 0.25, 400) 61 | 62 | def forward(self, x): 63 | """Applies network layers and ops on input image(s) x. 64 | 65 | Args: 66 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 67 | 68 | Return: 69 | Depending on phase: 70 | test: 71 | Variable(tensor) of output class label predictions, 72 | confidence score, and corresponding location predictions for 73 | each object detected. Shape: [batch,topk,7] 74 | 75 | train: 76 | list of concat outputs from: 77 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 78 | 2: localization layers, Shape: [batch,num_priors*4] 79 | 3: priorbox layers, Shape: [2,num_priors*4] 80 | """ 81 | sources = list() 82 | loc = list() 83 | conf = list() 84 | 85 | # apply vgg up to conv4_3 relu 86 | #print('Reached start of vgg') 87 | for k in self.vgg._modules.keys(): 88 | if int(k) < 23: 89 | #print('Reached ' + k + ' ', x.size()) 90 | x = self.vgg._modules[k].cuda()(x) 91 | #print('Reached L2Norm') 92 | s = self.L2Norm(x) 93 | sources.append(s) 94 | 95 | #print('Reached after L2Norm') 96 | # apply vgg up to fc7 97 | for k in self.vgg._modules.keys(): 98 | if int(k) >= 23: 99 | #print('Reached ' + k + ' ', x.size()) 100 | x = self.vgg._modules[k].cuda()(x) 101 | sources.append(x) 102 | #print('Reached end of VGG') 103 | 104 | # apply extra layers and cache source layer outputs 105 | for k, v in enumerate(self.extras): 106 | x = F.relu(v(x), inplace=True) 107 | if k % 2 == 
1: 108 | sources.append(x) 109 | 110 | # apply multibox head to source layers 111 | for (x, l, c) in zip(sources, self.loc, self.conf): 112 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 113 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 114 | 115 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 116 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 117 | 118 | if self.phase == "test": 119 | output = self.detect( 120 | loc.view(loc.size(0), -1, 4), # loc preds 121 | self.softmax(conf.view(-1, self.num_classes)), # conf preds 122 | self.priors # default boxes 123 | ) 124 | else: 125 | output = ( 126 | loc.view(loc.size(0), -1, 4), 127 | conf.view(conf.size(0), -1, self.num_classes), 128 | self.priors 129 | ) 130 | return output 131 | 132 | def load_weights(self, base_file): 133 | other, ext = os.path.splitext(base_file) 134 | if ext == '.pkl' or '.pth': 135 | print('Loading weights into state dict...') 136 | self.load_state_dict(torch.load(base_file)) 137 | print('Finished!') 138 | else: 139 | print('Sorry only .pth and .pkl files supported.') 140 | 141 | def add_module(self, name, module): 142 | if int(name) < 23 and module.__class__.__name__ == 'Conv2d': 143 | self.L2Norm = L2Norm(module.out_channels, 20) 144 | self.vgg.add_module(name, module) 145 | 146 | 147 | # This function is derived from torchvision VGG make_layers() 148 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 149 | def vgg(cfg, i, batch_norm=False): 150 | layers = [] 151 | in_channels = i 152 | for v in cfg: 153 | if v == 'M': 154 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 155 | elif v == 'C': 156 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 157 | else: 158 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 159 | if batch_norm: 160 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 161 | else: 162 | layers += [conv2d, nn.ReLU(inplace=True)] 163 | in_channels = v 164 | pool5 = 
nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 165 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 166 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 167 | layers += [pool5, conv6, 168 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 169 | return layers 170 | 171 | 172 | def add_extras(cfg, i, batch_norm=False): 173 | # Extra layers added to VGG for feature scaling 174 | layers = [] 175 | in_channels = i 176 | flag = False 177 | for k, v in enumerate(cfg): 178 | if in_channels != 'S': 179 | if v == 'S': 180 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 181 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 182 | else: 183 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 184 | flag = not flag 185 | in_channels = v 186 | return layers 187 | 188 | 189 | def multibox(vgg, extra_layers, cfg, num_classes): 190 | loc_layers = [] 191 | conf_layers = [] 192 | vs = 0 193 | for vv in vgg._modules: 194 | if int(vv) <= 24 and vgg._modules[vv].__class__.__name__ == 'Conv2d': 195 | vs = int(vv) 196 | ve = int(list(vgg._modules.keys())[-2]) 197 | vgg_source = [vs, ve] 198 | for k, v in enumerate(vgg_source): 199 | loc_layers += [nn.Conv2d(vgg[v].out_channels, 200 | cfg[k] * 4, kernel_size=3, padding=1)] 201 | conf_layers += [nn.Conv2d(vgg[v].out_channels, 202 | cfg[k] * num_classes, kernel_size=3, padding=1)] 203 | for k, v in enumerate(extra_layers[1::2], 2): 204 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 205 | * 4, kernel_size=3, padding=1)] 206 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 207 | * num_classes, kernel_size=3, padding=1)] 208 | return vgg, extra_layers, (loc_layers, conf_layers) 209 | 210 | 211 | 212 | def build_ssd(phase, size=300, num_classes=21): 213 | if phase != "test" and phase != "train": 214 | print("Error: Phase not recognized") 215 | return 216 | if size != 300: 217 | print("Error: Sorry only SSD300 is supported currently!") 218 | return 219 | 220 | return SSDModifiable(phase, 
*multibox(vgg(base[str(size)], 3), 221 | add_extras(extras[str(size)], 1024), 222 | mbox[str(size)], num_classes), num_classes) 223 | -------------------------------------------------------------------------------- /model/vgg.py: -------------------------------------------------------------------------------- 1 | '''VGG11/13/16/19 in Pytorch.''' 2 | import torch 3 | import torch.nn as nn 4 | from torch.autograd import Variable 5 | 6 | 7 | cfg = { 8 | 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 9 | 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], 10 | 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], 11 | 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], 12 | } 13 | 14 | 15 | class VGG(nn.Module): 16 | def __init__(self, vgg_name): 17 | super(VGG, self).__init__() 18 | self.features = self._make_layers(cfg[vgg_name]) 19 | self.classifier = nn.Linear(512, 10) 20 | 21 | def forward(self, x): 22 | x = self.features(x) 23 | x = x.view(x.size(0), -1) 24 | x = self.classifier(x) 25 | return x 26 | 27 | def _make_layers(self, cfg): 28 | layers = [] 29 | in_channels = 3 30 | for x in cfg: 31 | if x == 'M': 32 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 33 | else: 34 | layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1), 35 | nn.BatchNorm2d(x), 36 | nn.ReLU(inplace=True)] 37 | in_channels = x 38 | layers += [nn.AvgPool2d(kernel_size=1, stride=1)] 39 | return nn.Sequential(*layers) 40 | 41 | # net = VGG('VGG11') 42 | # x = torch.randn(2,3,32,32) 43 | # print(net(Variable(x)).size()) 44 | -------------------------------------------------------------------------------- /rl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | from Layer import * 4 | from utils import * 5 | from architecture import * 6 | from torch 
import nn 7 | from torch import optim 8 | from torch.autograd import Variable 9 | from torch import autograd 10 | 11 | 12 | 13 | class Controller: 14 | def __init__(self, controllerClass, input_size, output_size, hidden_size, num_layers, lr=0.003, skipSupport=False, kwargs={}): 15 | self.input_size = input_size 16 | self.output_size = output_size 17 | self.hidden_size = hidden_size 18 | self.num_layers = num_layers 19 | self.kwargs = kwargs 20 | if isinstance(controllerClass, basestring): 21 | self.controller = torch.load(controllerClass) 22 | else: 23 | self.controller = controllerClass(input_size, output_size, hidden_size, num_layers, **kwargs) 24 | self.optimizer = optim.Adam(self.controller.parameters(), lr=lr) 25 | self.skipSupport = skipSupport 26 | self.actionSeqs = [] 27 | 28 | def update_controller(self, avgR, b): 29 | for actions in self.actionSeqs: 30 | if isinstance(actions, list): 31 | for action in actions: 32 | action.reinforce(avgR - b) 33 | else: 34 | actions.reinforce(avgR - b) 35 | self.optimizer.zero_grad() 36 | autograd.backward(actions, [None for _ in actions]) 37 | self.optimizer.step() 38 | self.actionSeqs = [] 39 | 40 | def rolloutActions(self, layers): 41 | num_input = self.input_size 42 | num_hidden = self.hidden_size 43 | num_layers = self.num_layers 44 | num_directions = 2 if (('bidirectional' in self.kwargs) and (self.kwargs['bidirectional'])) else 1 45 | hn = Variable(torch.zeros(num_layers * num_directions, 1, num_hidden)) 46 | cn = Variable(torch.zeros(num_layers * num_directions, 1, num_hidden)) 47 | input = Variable(torch.Tensor(len(layers), 1, num_input)) 48 | for i in range(len(layers)): 49 | input[i] = Layer(layers[i]).toTorchTensor(skipSupport=self.skipSupport) 50 | actions = self.controller(input, (hn, cn)) 51 | self.actionSeqs.append(actions) 52 | return actions 53 | 54 | 55 | def getEpsilon(iter, max_iter=15.0): 56 | return min(1, max(0, (1-iter/float(max_iter))**4)) #return 0 57 | ''' 58 | def getConstrainedReward(R_a, 
R_c, cons, vars, iter): 59 | eps = getEpsilon(iter) 60 | modelSize = vars[0] 61 | modelSizeConstraint = cons[0] 62 | if modelSize > modelSizeConstraint: 63 | return (eps - 1) + eps * (R_a * R_c) 64 | else: 65 | return R_a * R_c 66 | ''' 67 | def getConstrainedReward(R_a, R_c, acc, params, acc_constraint, size_constraint, epoch, soft=True): 68 | eps = getEpsilon(epoch) if soft else 0 69 | if (size_constraint and params > size_constraint) or (acc_constraint and acc < acc_constraint): 70 | return (eps - 1) + eps * (R_a * R_c) 71 | return R_a * R_c 72 | 73 | 74 | def Reward(acc, params, baseline_acc, baseline_params, size_constraint=None, acc_constraint=None, epoch=-1): 75 | R_a = (acc/baseline_acc) #if acc > 0.92 else -1 76 | C = (float(baseline_params - params))/baseline_params 77 | R_c = C*(2-C) 78 | if size_constraint or acc_constraint: 79 | return getConstrainedReward(R_a, R_c, acc, params, acc_constraint, size_constraint, epoch) 80 | return (R_a) * (R_c) 81 | 82 | previousModels = {} 83 | def rollout_batch(model, controller, architecture, dataset, N, e, acc_constraint=None, size_constraint=None): 84 | newModels = [] 85 | idxs = [] 86 | Rs = [0]*N 87 | actionSeqs = [] 88 | studentModels = [] 89 | for i in range(N): 90 | model_ = copy.deepcopy(model) 91 | layers = layersFromModule(model_) 92 | actions = controller.rolloutActions(layers) 93 | actionSeqs.append(actions) 94 | newModel = architecture.generateChildModel([a.data.numpy()[0] for a in actions]) 95 | hashcode = hash(str(newModel)) if newModel else 0 96 | if hashcode in previousModels and constrained == False: 97 | Rs[i] = previousModels[hashcode] 98 | elif newModel is None: 99 | Rs[i] = -1 100 | else: 101 | print(newModel) 102 | #torch.save(newModel, modelSavePath + '%f_%f.net' % (e, i)) 103 | newModels.append(newModel) 104 | studentModels.append(newModel) 105 | idxs.append(i) 106 | accs = trainNormalParallel(studentModels, dataset, epochs=5) if architecture.datasetName is 'caltech256' else 
trainTeacherStudentParallel(model, studentModels, dataset, epochs=5) 107 | for acc in accs: 108 | print('Val accuracy: %f' % acc) 109 | for i in range(len(newModels)): 110 | print('Compression: %f' % (1.0 - (float(numParams(newModels[i]))/architecture.parentSize))) 111 | #R = [Reward(accs[i], numParams(newModels[i]), architecture.baseline_acc, architecture.parentSize, iter=int(e), constrained=constrained, vars=[numParams(newModels[i])], cons=[1700000]) for i in range(len(accs))] 112 | R = [Reward(accs[i], numParams(newModels[i]), architecture.baseline_acc, architecture.parentSize, size_constraint=size_constraint, acc_constraint=acc_constraint, epoch=e) for i in range(len(accs))] 113 | for i in range(len(idxs)): 114 | Rs[idxs[i]] = R[i] 115 | for i in range(len(Rs)): 116 | print('Reward achieved %f' % Rs[i]) 117 | return (Rs, actionSeqs, newModels) 118 | 119 | 120 | def rollouts(N, model, controller, architecture, dataset, e, size_constraint=None, acc_constraint=None): 121 | Rs = [] 122 | actionSeqs = [] 123 | models = [] 124 | (Rs, actionSeqs, models) = rollout_batch(copy.deepcopy(model), controller, architecture, dataset, N, e, acc_constraint=acc_constraint, size_constraint=size_constraint) 125 | return (Rs, actionSeqs, models) 126 | 127 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch import nn 4 | from torch import optim 5 | from torch.autograd import Variable 6 | import numpy as np 7 | from copy import deepcopy 8 | import argparse 9 | from rl import * 10 | from architecture import * 11 | import os 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | with warnings.catch_warnings(): 15 | warnings.simplefilter('ignore') 16 | 17 | parser = argparse.ArgumentParser(description='N2N: Network to Network Compression using Policy Gradient Reinforcement Learning') 18 | 
def _str2bool(value):
    """Parse a command-line boolean.

    ``type=bool`` cannot be used with argparse: ``bool('False')`` is True
    because any non-empty string is truthy.
    """
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Expected a boolean value, got %r' % value)


parser.add_argument('mode', type=str, choices=['removal', 'shrinkage'],
                    help='Which mode to run the program')
# 'imagenet' is handled by the import chain below, so it is offered here too.
parser.add_argument('dataset', type=str, choices=['mnist', 'cifar10', 'cifar10_old', 'cifar100', 'svhn', 'caltech256', 'imagenet'],
                    help='Name of dataset')
parser.add_argument('teacherModel', type=str,
                    help='Path to teacher model')
parser.add_argument('--model', type=str, required=False,
                    help='Path to base model architecture if different from teacherModel')
parser.add_argument('--cuda', type=_str2bool, required=False, default=True,
                    help='Use GPU or not')
# ``type=list`` split the argument into single characters (strings), which
# ``torch.cuda.set_device`` cannot use; parse one or more integer ids instead.
parser.add_argument('--gpuids', type=int, nargs='+', required=False, default=[0],
                    help='Which GPUs to use')
parser.add_argument('--debug', type=_str2bool, required=False, default=False,
                    help='Debug mode')
parser.add_argument('--size_constraint', type=int, required=False,
                    help='Add a constraint on size in # parameters')
parser.add_argument('--acc_constraint', type=float, required=False,
                    help='Add a constraint on accuracy in [0, 1]')
parser.add_argument('--controller', type=str, required=False,
                    help='Path to a previously trained controller')
args = parser.parse_args()

if len(args.gpuids) > 1:
    print('Parallel version not implemented yet')
elif args.cuda:
    # Only select a CUDA device when the GPU is actually requested.
    torch.cuda.set_device(args.gpuids[0])

# ----DATASETS----
if args.dataset == 'mnist':
    import datasets.mnist as dataset
elif args.dataset == 'cifar10':
    import datasets.cifar10 as dataset
elif args.dataset == 'cifar10_old':
    import datasets.cifar10_old as dataset
elif args.dataset == 'cifar100':
    import datasets.cifar100 as dataset
elif args.dataset == 'svhn':
    import datasets.svhn as dataset
elif args.dataset == 'caltech256':
    import datasets.caltech256 as dataset
elif args.dataset == 'imagenet':
    import datasets.imagenet as dataset
else:
    print('Dataset not found: ' + args.dataset)
    quit()

print('Using %s as dataset' % args.dataset)
dataset.cuda = args.cuda
# One test sample with a leading batch dimension, used to probe layer shapes.
datasetInputTensor = dataset.test_loader.dataset[0][0].unsqueeze(0)
print(datasetInputTensor.size())
baseline_acc = None

# ----MODELS----
# Load teacherModel
teacherModel = torch.load(args.teacherModel)
# Load baseModel (if available)
model = torch.load(args.model) if args.model else deepcopy(teacherModel)

# ----PATHS----
# Define save paths
controllerSavePath = './controllers_%s/' % args.dataset
if not os.path.exists(controllerSavePath):
    os.mkdir(controllerSavePath)
modelSavePath = './models_%s' % args.dataset
# saveModels() and results.txt write into this directory, so it must exist.
if not os.path.exists(modelSavePath):
    os.mkdir(modelSavePath)

# ----HYPERPARAMETERS----
# Initialize controller based on mode
skipSupport = False
num_layers = 2
num_hidden = 30
num_input = 7 if skipSupport else 5
lookup = [0.25 , .5, .5, .5, .5, .5, .6, .7, .8, .9, 1.]  # Used for shrinkage only
controller = None
optim_controller = None
lr = 0.003

# ----MODE----
if args.mode == 'removal':
    num_output = 2
    from controllers.LSTM import *
    controllerClass = LSTM
    extraControllerParams = {'bidirectional': True}
    lr = 0.003
elif args.mode == 'shrinkage':
    num_output = len(lookup)
    from controllers.AutoregressiveParam import *
    controllerClass = LSTMAutoParams
    extraControllerParams = {'lookup': lookup}
    lr = 0.1
else:
    print('Mode not known: ' + args.mode)
    quit()


# ----CONSTRAINTS----
size_constraint = args.size_constraint
acc_constraint = args.acc_constraint

# Identify baseline accuracy of base model
dataset.net = model.cuda() if args.cuda else model
print('Testing parent model to determine baseline accuracy')
baseline_acc = baseline_acc if baseline_acc is not None else dataset.test()


# Store statistics for each model
previousModels = {}
accsPerModel = {}
paramsPerModel = {}
rewardsPerModel = {}
numSavedModels = 0

# Reward terms for reinforce baseline
R_sum = 0
b = 0

epochs = 100
N = 5
prevRs = [0] * N
if args.controller:
    # A path string makes Controller load a previously trained controller.
    controllerClass = args.controller
controller = Controller(controllerClass, num_input, num_output, num_hidden, num_layers, lr=lr, skipSupport=skipSupport, kwargs=extraControllerParams)
architecture = Architecture(args.mode, model, datasetInputTensor, args.dataset, baseline_acc=baseline_acc, lookup=lookup)
# ----MAIN LOOP----
for e in range(epochs):
    # Compute N rollouts
    (Rs, actionSeqs, models) = rollouts(N, model, controller, architecture, dataset, e, size_constraint=size_constraint, acc_constraint=acc_constraint)
    saveModels(e, models, modelSavePath)
    # Compute average reward
    avgR = np.mean(Rs)
    print('Average reward: %f' % avgR)
    prevRs.append(avgR)
    # Baseline: running mean of the average rewards of the previous epochs.
    b = R_sum/float(e+1)
    R_sum = R_sum + avgR
    # Update controller
    print('Reinforcing for epoch %d' % e)
    controller.update_controller(avgR, b)

# torch.save needs a file path; ``controllerSavePath`` is a directory. The
# inner network is saved (not the Controller wrapper) because that is what
# Controller.__init__ expects to get back from torch.load via --controller.
torch.save(controller.controller, os.path.join(controllerSavePath, 'controller.net'))
with open(os.path.join(modelSavePath, 'results.txt'), "w") as resultsFile:
    output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel)

# --------------------------------------------------------------------------------
# /test_model.py:
# --------------------------------------------------------------------------------
import torch
import argparse
import warnings
warnings.filterwarnings('ignore')
with warnings.catch_warnings():
    warnings.simplefilter('ignore')


def _str2bool(value):
    """Parse a command-line boolean ('False' must not be treated as true)."""
    if isinstance(value, bool):
        return value
    lowered = value.lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Expected a boolean value, got %r' % value)


parser = argparse.ArgumentParser(description='Test model')
parser.add_argument('model', type=str,
                    help='Path to model')
# ``type=str`` made every supplied value (including 'False') truthy.
parser.add_argument('--cuda', type=_str2bool, default=True,
                    help='Use GPU or not')
# 'imagenet' is handled by the import chain below, so it is offered here too.
parser.add_argument('dataset', type=str, choices=['mnist', 'cifar10', 'cifar10_old', 'cifar100', 'svhn', 'caltech256', 'imagenet'],
                    help='Name of dataset')
args = parser.parse_args()

# ----DATASETS----
if args.dataset == 'mnist':
    import datasets.mnist as dataset
elif args.dataset == 'cifar10':
    import datasets.cifar10 as dataset
elif args.dataset == 'cifar10_old':
    import datasets.cifar10_old as dataset
elif args.dataset == 'cifar100':
    import datasets.cifar100 as dataset
elif args.dataset == 'svhn':
    import datasets.svhn as dataset
elif args.dataset == 'caltech256':
    import datasets.caltech256 as dataset
elif args.dataset == 'imagenet':
    import datasets.imagenet as dataset
else:
    print('Dataset not found: ' + args.dataset)
    quit()


model = torch.load(args.model)
dataset.net = model.cuda() if args.cuda else model

acc = dataset.test()

# --------------------------------------------------------------------------------
# /utils.py:
# --------------------------------------------------------------------------------
import torch
from torch import nn, optim
from torch.autograd import Variable
from torchvision import transforms
import os

from Model import Model
from model.resnet import *
from model.lenet import *


def resizeLayer(layer, in_channels, out_channels, kernel_size=1, stride=1, padding=1, dilation=1):
    """Return a new layer of the same type as ``layer`` with the given sizes.

    Conv2d, Linear and BatchNorm2d weights are resized in place through the
    state dict and loaded into the new layer; MaxPool2d and ReLU are simply
    re-created. Any other layer type is returned unchanged.
    """
    # Inherit the source layer's dilation unless one was given explicitly.
    if dilation == 1 and hasattr(layer, 'dilation'):
        dilation = layer.dilation
    # Class names are compared with ``==``; ``is`` on string literals only
    # worked through interning.
    if layer.__class__.__name__ == 'Conv2d':
        kernel_size = (kernel_size, kernel_size) if type(kernel_size) is not tuple else kernel_size
        stride = (stride, stride) if type(stride) is not tuple else stride
        padding = (padding, padding) if type(padding) is not tuple else padding
        sd = layer.state_dict()
        sd['weight'].resize_(out_channels, in_channels, kernel_size[0], kernel_size[1])
        has_bias = 'bias' in sd
        if has_bias:
            sd['bias'].resize_(out_channels)
        # Define new layer. The dilation is now kept for bias-free
        # convolutions too; it used to be silently reset to 1.
        layer = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding,
                          dilation=dilation, bias=has_bias)
        layer.load_state_dict(sd)
    if layer.__class__.__name__ == 'MaxPool2d':
        layer = nn.MaxPool2d(kernel_size, stride=stride, dilation=dilation)
    if layer.__class__.__name__ == 'Linear':
        sd = layer.state_dict()
        sd['weight'].resize_(out_channels, in_channels)
        sd['bias'].resize_(out_channels)
        layer = nn.Linear(in_channels, out_channels)
        layer.load_state_dict(sd)
    if layer.__class__.__name__ == 'ReLU':
        layer = nn.ReLU(inplace=False)
    if layer.__class__.__name__ == 'BatchNorm2d':
        sd = layer.state_dict()
        for k in sd:
            sd[k].resize_(in_channels)
        layer = nn.BatchNorm2d(in_channels, eps=layer.eps, momentum=layer.momentum, affine=layer.affine)
        layer.load_state_dict(sd)
    return layer


def determine_fc_size(inp, model):
    """Return the number of elements ``model.features`` produces for ``inp``."""
    output = model.features(inp)
    return output.view(-1).size()[0]


def output_results(resultsFile, accsPerModel, paramsPerModel, rewardsPerModel):
    """Print the models ranked by accuracy, size and reward.

    The same text is written to ``resultsFile`` when one is given.
    """
    lines = []

    def emit(s):
        print(s)
        lines.append(s + "\n")

    emit('-- Models ranked by accuracy --')
    for i, k in enumerate(sorted(accsPerModel, key=accsPerModel.get)[::-1], 1):
        emit('#%d: model%f acc %f' % (i, k, accsPerModel[k]))
    emit('-- Models ranked by size --')
    for i, k in enumerate(sorted(paramsPerModel, key=paramsPerModel.get), 1):
        emit('#%d: model%f size %d' % (i, k, paramsPerModel[k]))
    for i, k in enumerate(sorted(rewardsPerModel, key=rewardsPerModel.get)[::-1], 1):
        emit('#%d: model%f reward %f ' % (i, k, rewardsPerModel[k]))
    if resultsFile:
        resultsFile.write(''.join(lines))


def numParams(model):
    """Return the total number of scalar parameters in ``model``."""
    return sum(len(w.view(-1)) for w in model.parameters())


def train(dataset, net):
    """Train ``net`` through the dataset module and return the best val accuracy.

    Training stops early when, from the second epoch on, the validation
    accuracy is below 0.2.
    """
    net.add_module('LogSoftmax', nn.LogSoftmax())
    print (dataset.args.cuda)
    dataset.net = net.cuda() if dataset.args.cuda else net.cpu()
    train_acc = []
    val_acc = [-1]
    # ``xrange`` does not exist on Python 3; ``range`` works on both.
    for i in range(1, dataset.args.epochs+1):
        train_acc.append(dataset.train(i))
        acc = dataset.test()
        if i >= 2 and acc < 0.2:
            break
        print('Val acc: ' + str(acc))
        val_acc.append(acc)
    return max(val_acc)


def removeLayers(m, type):
    """Recursively delete every sub-module of ``m`` whose class name is ``type``.

    Returns True when ``m`` itself is of that type (so the caller deletes
    it), False otherwise.
    """
    if m.__class__.__name__ == type:
        return True
    # Iterate over a snapshot of the keys: deleting from ``_modules`` while
    # iterating it raises RuntimeError on Python 3.
    for k in list(m._modules.keys()):
        res = removeLayers(m._modules[k], type)
        if res:
            del m._modules[k]
    return False

import time
import itertools


def trainTeacherStudent(teacher, student, dataset, epochs=5, lr=0.0005):
    """Train ``student`` to match ``teacher`` outputs with an MSE loss (SGD).

    Returns the student's test accuracy after the last epoch. Requires CUDA.
    """
    startTime = time.time()
    student = student.cuda()
    teacher = teacher.cuda()
    # If there is a log softmax somewhere, delete it in both teacher and student
    removeLayers(teacher, type='LogSoftmax')
    removeLayers(teacher, type='Softmax')
    removeLayers(student, type='LogSoftmax')
    removeLayers(student, type='Softmax')
    MSEloss = nn.MSELoss().cuda()
    optimizer = optim.SGD(student.parameters(), lr=lr, momentum=0.9, nesterov=True, weight_decay=5e-4)
    student.train()
    for i in range(1, epochs+1):
        for b_idx, (data, targets) in enumerate(dataset.train_loader):
            data = data.cuda()
            data = Variable(data)
            optimizer.zero_grad()
            studentOutput = student(data)
            teacherOutput = teacher(data).detach()
            loss = MSEloss(studentOutput, teacherOutput)
            loss.backward()
            optimizer.step()
        # NOTE(review): the LogSoftmax is removed again before the per-epoch
        # test below; statement order kept as found.
        student.add_module('LogSoftmax', nn.LogSoftmax())
        dataset.net = student
        removeLayers(student, type='LogSoftmax')
        print(dataset.test())
        print('Train Epoch: {} \tLoss: {:.6f}'.format(i, loss.data[0]))
    student.add_module('LogSoftmax', nn.LogSoftmax())
    dataset.net = student
    acc = dataset.test()
    print('Time elapsed: {}'.format(time.time()-startTime))
    return acc

import torch.nn.functional as F


def trainTeacherStudentRand(teacher, student, dataset, epochs=50, lr=0.0001):
    """Like ``trainTeacherStudent`` but trains on normalised random inputs (Adam).

    The train loader only sets the number of steps per epoch; each step uses
    a fresh ``torch.rand(64, 3, 32, 32)`` batch.
    """
    startTime = time.time()
    student = student.cuda()
    teacher = teacher.cuda()
    # If there is a log softmax somewhere, delete it in both teacher and student
    removeLayers(teacher, type='LogSoftmax')
    removeLayers(teacher, type='Softmax')
    removeLayers(student, type='LogSoftmax')
    removeLayers(student, type='Softmax')
    MSEloss = nn.MSELoss().cuda()
    optimizer = optim.Adam(student.parameters(), lr=lr, weight_decay=5e-4)
    student.train()
    for i in range(1, epochs+1):
        for b_idx, (data, targets) in enumerate(dataset.train_loader):
            data = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))(torch.rand(64, 3, 32, 32)).cuda()
            data = Variable(data)
            optimizer.zero_grad()
            studentOutput = student(data)
            teacherOutput = teacher(data).detach()
            loss = MSEloss(studentOutput, teacherOutput)
            loss.backward()
            optimizer.step()
        student.add_module('LogSoftmax', nn.LogSoftmax())
        dataset.net = student
        removeLayers(student, type='LogSoftmax')
        print(dataset.test())
        print('Train Epoch: {} \tLoss: {:.6f}'.format(i, loss.data[0]))
    student.add_module('LogSoftmax', nn.LogSoftmax())
    dataset.net = student
    acc = dataset.test()
    print('Time elapsed: {}'.format(time.time()-startTime))
    return acc


def trainTeacherStudentNew(teacher, student, dataset, epochs=5, lr=0.0005, T=3.0, lambd=0.3):
    """Distil with temperature ``T``, mixing MSE and label NLL losses.

    The loss is ``(1 - lambd) * MSE(student, teacher) + lambd * NLL(student,
    targets)``, both computed on log-softmaxed outputs divided by ``T``.
    """
    startTime = time.time()
    student = student.cuda()
    teacher = teacher.cuda()
    # If there is a log softmax somewhere, delete it in both teacher and student
    removeLayers(teacher, type='LogSoftmax')
    removeLayers(teacher, type='Softmax')
    removeLayers(student, type='LogSoftmax')
    removeLayers(student, type='Softmax')
    MSEloss = nn.MSELoss().cuda()
    optimizer = optim.Adam(student.parameters(), lr=lr, weight_decay=5e-4)
    student.train()
    for i in range(1, epochs+1):
        for b_idx, (data, targets) in enumerate(dataset.train_loader):
            data = data.cuda()
            data = Variable(data)
            targets = targets.cuda()
            targets = Variable(targets)
            optimizer.zero_grad()
            studentOutput = F.log_softmax(student(data)/T)
            teacherOutput = F.log_softmax(teacher(data).detach()/T)
            loss = (1-lambd)*MSEloss(studentOutput, teacherOutput) + lambd*F.nll_loss(studentOutput, targets)
            loss.backward()
            optimizer.step()
        student.add_module('LogSoftmax', nn.LogSoftmax())
        dataset.net = student
        removeLayers(student, type='LogSoftmax')
        print(dataset.test())
        print('Train Epoch: {} \tLoss: {:.6f}'.format(i, loss.data[0]))
    student.add_module('LogSoftmax', nn.LogSoftmax())
    dataset.net = student
    acc = dataset.test()
    print('Time elapsed: {}'.format(time.time()-startTime))
    return acc


def trainTeacherStudentParallel(teacher, students, dataset, epochs=5, lr=0.0005):
    """Train several students against one teacher on the same batches.

    The teacher output is computed once per batch and shared by all
    students. Returns the list of final test accuracies, one per student
    (empty when ``students`` is empty). Requires CUDA.
    """
    if len(students) == 0:
        return []
    startTime = time.time()
    students = [student.cuda() for student in students]
    teacher = teacher.cuda()
    # If there is a log softmax somewhere, delete it in both teacher and student
    removeLayers(teacher, type='LogSoftmax')
    for student in students:
        removeLayers(student, type='LogSoftmax')
        student.train()
    MSEloss = nn.MSELoss().cuda()
    optimizers = [optim.Adam(student.parameters(), lr=lr, weight_decay=5e-4) for student in students]
    for i in range(1, epochs+1):
        for b_idx, (data, targets) in enumerate(dataset.train_loader):
            data = data.cuda()
            teacherOutput = teacher(Variable(data)).detach()
            for j in range(len(students)):
                studentData = Variable(data)
                optimizers[j].zero_grad()
                studentOutput = students[j](studentData)
                loss = MSEloss(studentOutput, teacherOutput)
                loss.backward()
                optimizers[j].step()
        print('Train Epoch: {}'.format(i))
        for j in range(len(students)):
            removeLayers(students[j], type='LogSoftmax')
            students[j].add_module('LogSoftmax', nn.LogSoftmax())
            dataset.net = students[j]
            print('Student {} acc {}'.format(j, dataset.test()))
            # Strip the LogSoftmax from the student just evaluated. This used
            # the stale ``student`` loop variable, so only the last student
            # was cleaned before the next training epoch.
            removeLayers(students[j], type='LogSoftmax')

    accs = []
    for student in students:
        removeLayers(student, type='LogSoftmax')
        student.add_module('LogSoftmax', nn.LogSoftmax())
        dataset.net = student
        accs.append(dataset.test())
    print('Time elapsed {}'.format(time.time() - startTime))
    return accs


def trainNormal(studentModel, dataset, epochs=5):
    """Train a single model through the dataset module; return its accuracy."""
    return trainNormalParallel([studentModel], dataset, epochs)[0]


def trainNormalParallel(studentModels, dataset, epochs=5):
    """Train each model in turn for ``epochs`` epochs; return their accuracies."""
    accs = []
    for model in studentModels:
        dataset.net = model
        for i in range(1, epochs+1):
            dataset.train(i)
        acc = dataset.test()
        accs.append(acc)
    return accs


layerTypes = ['Unknown', 'Conv2d', 'MaxPool2d', 'ReLU', 'BatchNorm2d', 'Linear', 'Dropout', 'LogSoftmax', 'AvgPool2d', 'L2Norm', 'Softmax']


def getLayerType(layer):
    """Return the index of the layer's class name in ``layerTypes``.

    Unrecognised layer types map to 0 ('Unknown'). ``list.index`` raises
    ValueError for a missing name rather than returning -1, so the lookup
    is guarded explicitly.
    """
    name = layer.__class__.__name__
    return layerTypes.index(name) if name in layerTypes else 0

import torch.nn.init as init


def weights_init(m):
    """Xavier-initialise the weight of Conv2d modules; leave others untouched."""
    if isinstance(m, nn.Conv2d):
        init.xavier_uniform(m.weight)


def resetModel(m):
    """Recursively call ``reset_parameters`` on every leaf module that has it."""
    if len(m._modules) == 0 and hasattr(m, 'reset_parameters'):
        m.reset_parameters()
        return
    for i in m._modules.values():
        resetModel(i)

import Layer


def resizeToFit(layer, inp):
    """Return a version of the wrapped layer sized to accept ``inp``.

    ``layer`` is a wrapper exposing ``_layer`` and ``getRepresentation()``.
    """
    if layer._layer.__class__.__name__ == 'Linear':
        # Linear layers consume the flattened per-sample features.
        in_channels = inp.view(inp.size(0), -1).size(1)
        return resizeLayer(layer._layer, in_channels, layer._layer.out_features)
    in_channels = inp.size(1)
    if 'weight' in layer._layer._parameters:
        _, kernel_size, stride, out_channels, padding = layer.getRepresentation()
        return resizeLayer(layer._layer, in_channels, out_channels, kernel_size, stride, padding)
    if layer._layer.__class__.__name__ == 'ReLU':
        return nn.ReLU(inplace=False)
    return layer._layer


def createParentContainer(m):
    """Return an empty container matching the class of ``m``.

    Returns None for class names that are not listed here.
    """
    classname = m.__class__.__name__
    if classname == 'Sequential':
        return nn.Sequential()
    elif classname in ['BasicBlock', 'Bottleneck', 'BasicBlockModifiable']:
        return BasicBlockModifiable(shortcut=m.shortcut if hasattr(m, 'shortcut') else None)
    elif classname == 'ResNet' or classname == 'ResNetModifiable':
        return ResNetModifiable()
    elif classname == 'VGG':
        return Model(None, None)
    elif classname == 'LeNet':
        return Model(None, None)
    elif classname == 'mnist_model':
        return Model(None, None)
    elif classname == 'Model':
        return Model(None, None)
    elif classname == 'SSD':
        from model.ssd import SSDModifiable
        # NOTE(review): SSDModifiable.__init__ requires a ``vgg_base``
        # argument, so this call raises TypeError as written -- confirm what
        # base should be passed here.
        return SSDModifiable()
    elif classname == 'ModuleList':
        return nn.ModuleList()


def flattenModule(m):
    """Return the leaf modules of ``m`` in definition order."""
    if len(m._modules) == 0:
        return [m]
    top = []
    for i in m._modules.values():
        bottom = flattenModule(i)
        top.extend(bottom)
    return top


def layersFromModule(m):
    """Return the leaf modules of ``m``, annotated with skip-connection offsets.

    Every leaf gets ``skipstart = skipend = 0``. Leaves found under a child
    named 'layers' are then given their position in that group
    (``skipstart``) and their distance to its end (``skipend``).
    """
    if len(m._modules) == 0:
        m.skipstart = 0
        m.skipend = 0
        return [m]
    top = []
    for i in m._modules:
        bottom = layersFromModule(m._modules[i])
        if i in ['layers']:
            # Introduce skip connections to layers in bottom
            n = len(bottom)
            for j in range(n):
                bottom[j].skipstart = j
                bottom[j].skipend = n - j - 1
        top.extend(bottom)
    return top


def saveModels(epoch, models, modelSavePath):
    """Save each model to ``<modelSavePath>/<epoch>_<index>.net``."""
    for i, model in enumerate(models):
        torch.save(model, os.path.join(modelSavePath, '%f_%f.net' % (epoch, i)))

# --------------------------------------------------------------------------------