├── .gitignore ├── LICENSE ├── load.py ├── README.md ├── measure_stat.py └── magic_init.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/*.pyc 3 | caffe 4 | *.kdev4 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Philipp Krähenbühl 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 
-------------------------------------------------------------------------------- /load.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | 3 | def parseProtoString(s): 4 | from google.protobuf import text_format 5 | from caffe.proto import caffe_pb2 as pb 6 | proto_net = pb.NetParameter() 7 | text_format.Merge(s, proto_net) 8 | return proto_net 9 | 10 | 11 | def get_param(l, exclude=set(['top', 'bottom', 'name', 'type'])): 12 | if not hasattr(l,'ListFields'): 13 | if hasattr(l,'__delitem__'): 14 | return list(l) 15 | return l 16 | r = dict() 17 | for f, v in l.ListFields(): 18 | if f.name not in exclude: 19 | r[f.name] = get_param(v, []) 20 | return r 21 | 22 | class ProtoDesc: 23 | def __init__(self, prototxt): 24 | from os import path 25 | self.prototxt = prototxt 26 | self.parsed_proto = parseProtoString(open(self.prototxt, 'r').read()) 27 | # Guess the input dimension 28 | self.input_dim = (3, 227, 227) 29 | net = self.parsed_proto 30 | if len(net.input_dim) > 0: 31 | self.input_dim = net.input_dim[1:] 32 | else: 33 | lrs = net.layer 34 | cs = [l.transform_param.crop_size for l in lrs 35 | if l.HasField('transform_param')] 36 | if len(cs): 37 | self.input_dim = (3, cs[0], cs[0]) 38 | 39 | def __call__(self, clip=None, **inputs): 40 | from caffe import layers as L 41 | from collections import OrderedDict 42 | net = self.parsed_proto 43 | blobs = OrderedDict(inputs) 44 | for l in net.layer: 45 | if l.name not in inputs: 46 | in_place = l.top == l.bottom 47 | param = get_param(l) 48 | assert all([b in blobs for b in l.bottom]), "Some bottoms not found: " + ', '.join([b for b in l.bottom if b not in blobs]) 49 | tops = getattr(L, l.type)(*[blobs[b] for b in l.bottom], 50 | ntop=len(l.top), in_place=in_place, 51 | name=l.name, 52 | **param) 53 | if len(l.top) <= 1: 54 | tops = [tops] 55 | for i, t in enumerate(l.top): 56 | blobs[t] = tops[i] 57 | if l.name == clip: 58 | break 59 | return list(blobs.values())[-1] 60 | 61 | 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-dependent initialization of convolutional neural networks 2 | 3 | Created by Philipp Krähenbühl. 4 | 5 | ### Introduction 6 | 7 | This code implements the initialization presented in our [arXiv tech report](http://arxiv.org/abs/1511.06856), which is under submission at ICLR 2016. 8 | 9 | *This is a reimplementation and currently a work in progress. Use at your own risk.* 10 | 11 | ### License 12 | 13 | This code is released under the BSD License (refer to the LICENSE file for details). 
14 | 15 | ### Citing 16 | 17 | If you find our initialization useful in your research, please consider citing: 18 | 19 | @article{krahenbuhl2015data, 20 | title={Data-dependent Initializations of Convolutional Neural Networks}, 21 | author={Kr{\"a}henb{\"u}hl, Philipp and Doersch, Carl and Donahue, Jeff and Darrell, Trevor}, 22 | journal={arXiv preprint arXiv:1511.06856}, 23 | year={2015} 24 | } 25 | 26 | ### Setup 27 | 28 | Check out the project and create a symlink to caffe in the `magic_init` directory: 29 | ```Shell 30 | ln -s path/to/caffe/python/caffe caffe 31 | ``` 32 | 33 | ### Examples 34 | 35 | Here is a quick example of how to initialize AlexNet: 36 | ```bash 37 | python magic_init.py path/to/alexnet/deploy.prototxt path/to/output.caffemodel -d "path/to/some/images/*.png" -q -nit 10 -cs 38 | ``` 39 | Here the ```-d``` flag allows you to initialize the network using your own images. Feel free to use ImageNet, PASCAL, COCO or whatever you have at hand; it shouldn't make a big difference. The ```-q``` (quiet) flag suppresses all the caffe logging; ```-nit``` controls the number of batches used (while ```-bs``` controls the batch size). Finally, ```-cs``` rescales the gradients across layers. This rescaling currently works best for feed-forward networks, and might not work too well for DAG-structured networks (we are working on that). 40 | 41 | To run the k-means initialization use: 42 | ```bash 43 | python magic_init.py path/to/alexnet/deploy.prototxt path/to/output.caffemodel -d "path/to/some/images/*.png" -q -nit 10 -cs -t kmeans 44 | ``` 45 | 46 | Finally, ```python magic_init.py -h``` should provide you with more help. 47 | 48 | 49 | ### Pro tips 50 | If your numpy installation is based on OpenBLAS, try disabling threading with ```export OPENBLAS_NUM_THREADS=1```; it can improve the runtime performance a bit. 51 | -------------------------------------------------------------------------------- /measure_stat.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from magic_init import * 3 | 4 | class BCOLORS: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | class NOCOLORS: 15 | HEADER = '' 16 | OKBLUE = '' 17 | OKGREEN = '' 18 | WARNING = '' 19 | FAIL = '' 20 | ENDC = '' 21 | BOLD = '' 22 | UNDERLINE = '' 23 | 24 | def coloredNumbers(v, color=None, fmt='%6.2f', max_display=300, bcolors=BCOLORS): 25 | import numpy as np 26 | # Display a numpy array and highlight the min and max values [requires a nice Linux 27 | # terminal supporting colors] 28 | r = "" 29 | mn, mx = np.min(v), np.max(v) 30 | for k,i in enumerate(v): 31 | if len(v) > max_display and k > max_display/2 and k < len(v) - max_display/2: 32 | if r[-1] != '.': 33 | r += '...' 
34 | continue 35 | if i <= mn + 1e-3: 36 | r += bcolors.BOLD+bcolors.FAIL 37 | elif i + 1e-3 >= mx: 38 | r += bcolors.BOLD+bcolors.FAIL 39 | elif color is not None: 40 | r += color 41 | r += (fmt+' ')%i 42 | r += bcolors.ENDC 43 | r += bcolors.ENDC 44 | return r 45 | 46 | def computeGradientRatio(net, NIT=1): 47 | import numpy as np 48 | last_layer = 0 49 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 50 | if l.type not in STRIP_LAYER: 51 | last_layer = i 52 | last_tops = net.top_names[net._layer_names[last_layer]] 53 | 54 | var = {} 55 | for it in range(NIT): 56 | net._forward(0, last_layer) 57 | # Reset the diffs 58 | for l in net.layers: 59 | for b in l.blobs: 60 | b.diff[...] = 0 61 | # Set the top diffs 62 | for t in last_tops: 63 | net.blobs[t].diff[...] = np.random.normal(0, 1, net.blobs[t].shape) 64 | net._backward(last_layer, 0) 65 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 66 | if len(l.blobs) > 0: 67 | assert l.type in PARAMETER_LAYERS, "Parameter layer '%s' currently not supported"%l.type 68 | b = l.blobs[0] 69 | r = np.mean(b.diff.swapaxes(0,1).reshape((b.diff.shape[1],-1))**2, axis=1) / np.mean(b.data**2) 70 | if n in var: var[n] += r / NIT 71 | else: var[n] = r / NIT 72 | std = {n: np.sqrt(var[n]) for n in var} 73 | return {n: np.std(s) / np.mean(s) for n,s in std.items()}, {n: np.mean(s) for n,s in std.items()} 74 | 75 | def printMeanStddev(net, NIT=10, show_all=False, show_color=True, quiet=False): 76 | import numpy as np 77 | bcolors = NOCOLORS 78 | if show_color: bcolors = BCOLORS 79 | 80 | layer_names = list(net._layer_names) 81 | if not show_all: 82 | layer_names = [n for n, l in zip(net._layer_names, net.layers) if len(l.blobs)>0] 83 | if 'data' in net._layer_names: 84 | layer_names.append('data') 85 | 86 | # When was a blob last used 87 | last_used = {} 88 | # Make sure all layers are supported, and compute the range each blob is used in 89 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 90 | for b in net.bottom_names[n]: 91 | last_used[b] = i 92 | 93 | active_data, cvar = {}, {} 94 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 95 | # Run the network forward 96 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 97 | active_data.update(new_data) 98 | 99 | if len(net.top_names[n]) > 0 and n in layer_names: 100 | m = net.top_names[n][0] 101 | D = flattenData(new_data[m]) 102 | mean = np.mean(D, axis=0) 103 | stddev = np.std(D, axis=0) 104 | if not quiet: 105 | print( bcolors.BOLD, ' '*5, n, ':', m, ' '*5, bcolors.ENDC ) 106 | print( 'mean ', coloredNumbers(mean, bcolors.OKGREEN, bcolors=bcolors) ) 107 | print( 'stddev', coloredNumbers(stddev, bcolors.OKBLUE, bcolors=bcolors) ) 108 | print( 'coef of variation ', bcolors.OKGREEN, stddev.std() / stddev.mean(), bcolors.ENDC ) 109 | print() 110 | cvar[n] = stddev.std() / stddev.mean() 111 | # Delete all unused data 112 | for k in list(active_data): 113 | if k not in last_used or last_used[k] == i: 114 | del active_data[k] 115 | return cvar 116 | 117 | def main(): 118 | from argparse import ArgumentParser 119 | from os import path 120 | 121 | parser = ArgumentParser() 122 | parser.add_argument('prototxt') 123 | parser.add_argument('-l', '--load', help='Load a caffemodel') 124 | parser.add_argument('-d', '--data', default=None, help='Image list to use [default prototxt data]') 125 | #parser.add_argument('-q', action='store_true', help='Quiet execution') 126 | parser.add_argument('-sm', 
action='store_true', help='Summary only') 127 | parser.add_argument('-q', action='store_true', help='Quiet execution') 128 | parser.add_argument('-a', '--all', action='store_true', help='Show the statistic for all layers') 129 | parser.add_argument('-nc', action='store_true', help='Do not use color') 130 | parser.add_argument('-s', type=float, default=1.0, help='Scale the input [only custom data "-d"]') 131 | parser.add_argument('-bs', type=int, default=16, help='Batch size [only custom data "-d"]') 132 | parser.add_argument('-nit', type=int, default=10, help='Number of iterations') 133 | parser.add_argument('--gpu', type=int, default=0, help='What gpu to run it on?') 134 | args = parser.parse_args() 135 | 136 | if args.q: 137 | from os import environ 138 | environ['GLOG_minloglevel'] = '2' 139 | import caffe, load 140 | from caffe import NetSpec, layers as L 141 | 142 | caffe.set_mode_gpu() 143 | if args.gpu is not None: 144 | caffe.set_device(args.gpu) 145 | 146 | if args.data is not None: 147 | model = load.ProtoDesc(args.prototxt) 148 | net = NetSpec() 149 | fl = getFileList(args.data) 150 | if len(fl) == 0: 151 | print("Unknown data type for '%s'"%args.data) 152 | exit(1) 153 | from tempfile import NamedTemporaryFile 154 | f = NamedTemporaryFile('w') 155 | f.write('\n'.join([path.abspath(i)+' 0' for i in fl])) 156 | f.flush() 157 | net.data, net.label = L.ImageData(source=f.name, batch_size=args.bs, new_width=model.input_dim[-1], new_height=model.input_dim[-1], transform_param=dict(mean_value=[104,117,123], scale=args.s),ntop=2) 158 | net.out = model(data=net.data, label=net.label) 159 | n = netFromString('force_backward:true\n'+str(net.to_proto()), caffe.TRAIN ) 160 | else: 161 | n = caffe.Net(args.prototxt, caffe.TRAIN) 162 | 163 | if args.load is not None: 164 | n.copy_from(args.load) 165 | 166 | cvar = printMeanStddev(n, NIT=args.nit, show_all=args.all, show_color=not args.nc, quiet=args.sm) 167 | cv, gr = computeGradientRatio(n, NIT=args.nit) 168 | print() 169 | print(' Summary ') 170 | print('-----------') 171 | print() 172 | print('layer name out cvar rate cvar rate mean') 173 | for l in n._layer_names: 174 | if l in cvar and l in cv and l in gr: 175 | print('%-30s %10.2f %10.2f %10.2f'%(l, cvar[l], cv[l], gr[l]) ) 176 | 177 | if __name__ == "__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /magic_init.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | INPUT_LAYERS = ['Data', 'ImageData', 'Input'] 4 | # Layers that only support elwise 5 | ELWISE_LAYERS = ['Deconvolution'] 6 | # Layers that support parameters 7 | PARAMETER_LAYERS = ['Convolution', 'InnerProduct']+ELWISE_LAYERS 8 | # All supported layers 9 | SUPPORTED_LAYERS = ['ReLU', 'Sigmoid', 'LRN', 'Pooling', 'Eltwise'] + PARAMETER_LAYERS + INPUT_LAYERS 10 | STRIP_LAYER = ['Softmax', 'SoftmaxWithLoss', 'SigmoidCrossEntropyLoss'] 11 | # Use 'Dropout' at your own risk 12 | # Unless Jon merges #2865 , 'Split' cannot be supported 13 | UNSUPPORTED_LAYERS = ['Split', 'BatchNorm', 'Reshape'] 14 | 15 | def forward(net, i, NIT, data, output_names): 16 | n = net._layer_names[i] 17 | # Create the top data if needed 18 | output = {t: [None]*NIT for t in output_names} 19 | for it in range(NIT): 20 | for b in data: 21 | net.blobs[b].data[...] 
= data[b][it] 22 | net._forward(i, i) 23 | for t in output_names: 24 | output[t][it] = 1*net.blobs[t].data 25 | return output 26 | 27 | def flattenData(data): 28 | import numpy as np 29 | return np.concatenate([d.swapaxes(0, 1).reshape((d.shape[1],-1)) for d in data], axis=1).T 30 | 31 | def gatherInputData(net, layer_id, bottom_data, top_name, fast=False, max_data=None): 32 | # This function gathers all the input data. 33 | # In order to not replicate all the internal functionality of convolutions (e.g. padding ...) 34 | # we gather the data in the output space and use random gaussian weights. The output of this 35 | # function is W and D, where the input data I = D * W^-1 [with some abuse of tensor notation] 36 | # If we now compute an initialization A for D, we can simply multiply A by W to obtain the 37 | # proper initialization in the input space 38 | import numpy as np 39 | l = net.layers[layer_id] 40 | NIT = len(list(bottom_data.values())[0]) 41 | # How many times do we need to over-sample to get a full basis (out of random projections) 42 | OS = int(np.ceil( np.prod(l.blobs[0].data.shape[1:]) / l.blobs[0].data.shape[0] )) 43 | if fast: OS = 1 44 | 45 | # If we are oversampling we might run out of memory at some point, especially for filters higher up. 46 | # To avoid any issues we never return more than max_data elements 47 | subsample = None 48 | 49 | # Note this could cause some memory issues in the FC layers 50 | W, D = [], [] 51 | for i in range(OS): 52 | d = l.blobs[0].data 53 | d[...] = np.random.normal(0, 1, d.shape) 54 | W.append(1*d) 55 | # Collect the data and flatten out the convs 56 | data = np.concatenate([i.swapaxes(0, 1).reshape((i.shape[1],-1)).T for i in forward(net, layer_id, NIT, bottom_data, [top_name])[top_name]], axis=0) 57 | # Do we need to subsample the data to save memory? 58 | if subsample is None and max_data is not None: 59 | # Randomly select N representative data samples 60 | N = int(max_data / (data.shape[1]*OS)) 61 | subsample = np.arange(data.shape[0]) 62 | if N < data.shape[0]: 63 | np.random.shuffle(subsample) 64 | subsample = subsample[:N] 65 | if subsample is not None: 66 | data = data[subsample] 67 | D.append(data) 68 | # In order to handle any sort of groups we want to have the samples packed in the following order: 69 | # a1 a2 a3 a4 b1 b2 b3 b4 c1 ... (where the original data was a b c and OS=4) 70 | W, D = np.concatenate([w[:,None] for w in W], axis=1), np.concatenate([d[:,:,None] for d in D], axis=2) 71 | return W.reshape((-1,)+W.shape[2:]), D.reshape((D.shape[0], -1)+D.shape[3:]) 72 | 73 | def initializeWeight(D, type, N_OUT): 74 | # Here we first whiten the data (PCA or ZCA) and then optionally run k-means 75 | # on this whitened data. 76 | import numpy as np 77 | if D.shape[0] < N_OUT: 78 | print( " Not enough data for '%s' estimation, using elwise"%type ) 79 | return np.random.normal(0, 1, (N_OUT,D.shape[1])) 80 | D = D - np.mean(D, axis=0, keepdims=True) 81 | # PCA, ZCA, K-Means 82 | assert type in ['pca', 'zca', 'kmeans', 'rand'], "Unknown initialization type '%s'"%type 83 | C = D.T.dot(D) 84 | s, V = np.linalg.eigh(C) 85 | # order the eigenvalues 86 | ids = np.argsort(s)[-N_OUT:] 87 | s = s[ids] 88 | V = V[:,ids] 89 | s[s<1e-6] = 0 90 | s[s>=1e-6] = 1. 
/ np.sqrt(s[s>=1e-6]+1e-3) 91 | S = np.diag(s) 92 | if type == 'pca': 93 | return S.dot(V.T) 94 | elif type == 'zca': 95 | return V.dot(S.dot(V.T)) 96 | # Whiten the data 97 | wD = D.dot(V.dot(S)) 98 | wD /= np.linalg.norm(wD, axis=1)[:,None] 99 | if type == 'kmeans': 100 | # Run k-means 101 | from sklearn.cluster import MiniBatchKMeans 102 | km = MiniBatchKMeans(n_clusters = wD.shape[1], batch_size=10*wD.shape[1]).fit(wD).cluster_centers_ 103 | elif type == 'rand': 104 | km = wD[np.random.choice(wD.shape[0], wD.shape[1], False)] 105 | C = km.dot(S.dot(V.T)) 106 | C /= np.std(D.dot(C.T), axis=0, keepdims=True).T 107 | return C 108 | 109 | 110 | def initializeLayer(net, layer_id, bottom_data, top_name, bias=0, type='elwise', max_data=None): 111 | import numpy as np 112 | l = net.layers[layer_id] 113 | NIT = len(list(bottom_data.values())[0]) 114 | 115 | if type!='elwise' and l.type in ELWISE_LAYERS: 116 | print( "Only 'elwise' supported for layer '%s'. Falling back."%net._layer_names[layer_id] ) 117 | type = 'elwise' 118 | 119 | for p in l.blobs: p.data[...] = 0 120 | fast = 'fast_' in type 121 | if fast: 122 | type = type.replace('fast_', '') 123 | # Initialize the weights [k-means, ...] 124 | if type == 'elwise': 125 | d = l.blobs[0].data 126 | d[...] = np.random.normal(0, 1, d.shape) 127 | else: # Use the input data 128 | # Are there any groups? 129 | G = 1 130 | bottom_names = net.bottom_names[net._layer_names[layer_id]] 131 | if len(bottom_names) == 1: 132 | N1 = net.blobs[bottom_names[0]].shape[1] 133 | N2 = l.blobs[0].shape[1] 134 | G = N1 // N2 135 | 136 | # Gather the input data 137 | T, D = gatherInputData(net, layer_id, bottom_data, top_name, fast, max_data=max_data) 138 | 139 | # Figure out the output dimensionality of d 140 | d = l.blobs[0].data 141 | 142 | # Loop over groups 143 | for g in range(G): 144 | dg, Dg = d[g*(d.shape[0]//G):(g+1)*(d.shape[0]//G)], D[:,g*(D.shape[1]//G):(g+1)*(D.shape[1]//G):] 145 | Tg = T[g*(T.shape[0]//G):(g+1)*(T.shape[0]//G)] 146 | # Compute the weights 147 | W = initializeWeight(Dg, type, N_OUT=dg.shape[0]) 148 | 149 | # Multiply the weights by the random basis 150 | # NOTE: This matrix multiplication is a bit large, if it's too slow, 151 | # reduce the oversampling in gatherInputData 152 | dg[...] = np.dot(W, Tg.reshape((Tg.shape[0],-1))).reshape(dg.shape) 153 | # Scale the mean and initialize the bias 154 | top_data = forward(net, layer_id, NIT, bottom_data, [top_name])[top_name] 155 | flat_data = flattenData(top_data) 156 | mu = flat_data.mean(axis=0) 157 | std = flat_data.std(axis=0) 158 | if l.type == 'Deconvolution': 159 | l.blobs[0].data[...] /= std.reshape((1,-1,)+(1,)*(len(l.blobs[0].data.shape)-2)) 160 | else: 161 | l.blobs[0].data[...] /= std.reshape((-1,)+(1,)*(len(l.blobs[0].data.shape)-1)) 162 | for b in l.blobs[1:]: 163 | b.data[...] = -mu / std + bias 164 | 165 | 166 | 167 | def magicInitialize(net, bias=0, NIT=10, type='elwise', max_data=None): 168 | import numpy as np 169 | # When was a blob last used 170 | last_used = {} 171 | # Make sure all layers are supported, and compute the last time each blob is used 172 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 173 | if l.type in UNSUPPORTED_LAYERS: 174 | print( "WARNING: Layer type '%s' not supported! Things might go very wrong..."%l.type ) 175 | elif l.type not in SUPPORTED_LAYERS+STRIP_LAYER: 176 | print( "Unknown layer type '%s'. 
double check if it is supported"%l.type ) 177 | for b in net.bottom_names[n]: 178 | last_used[b] = i 179 | 180 | active_data = {} 181 | # Read all the input data 182 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 183 | # Initialize the layer 184 | if len(l.blobs) > 0: 185 | if np.sum(np.abs(l.blobs[0].data)) <= 1e-10: 186 | print( "Initializing layer '%s'"%n ) 187 | assert l.type in PARAMETER_LAYERS, "Unsupported parameter layer" 188 | assert len(net.top_names[n]) == 1, "Exactly one output supported" 189 | 190 | # Fill the parameters 191 | initializeLayer(net, i, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n][0], bias, type, max_data=max_data) 192 | else: 193 | print( "Skipping layer '%s'"%n ) 194 | 195 | # TODO: Estimate and rescale the values [TODO: Record and undo this scaling above] 196 | 197 | # Run the network forward 198 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 199 | active_data.update(new_data) 200 | 201 | # Delete all unused data 202 | for k in list(active_data): 203 | if k not in last_used or last_used[k] == i: 204 | del active_data[k] 205 | 206 | def load(net, blobs): 207 | for l,n in zip(net.layers, net._layer_names): 208 | if n in blobs: 209 | for b, sb in zip(l.blobs, blobs[n]): 210 | b.data[...] = sb 211 | 212 | def save(net): 213 | import numpy as np 214 | r = {} 215 | for l,n in zip(net.layers, net._layer_names): 216 | if len(l.blobs) > 0: 217 | r[n] = [np.copy(b.data) for b in l.blobs] 218 | return r 219 | 220 | def estimateHomogenety(net): 221 | # Estimate if a certain layer is homogeneous and if yes return the degree k 222 | # by which the output is scaled (if input is scaled by alpha then the output 223 | # is scaled by alpha^k). Return None if the layer is not homogeneous. 
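	# For example (illustrative, not part of the original code): a ReLU or a bias-free Convolution is homogeneous with k = 1 (scaling the input by alpha scales the output by alpha), while a Sigmoid, or a layer with a non-zero bias, is not homogeneous and is reported as None.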
224 | import numpy as np 225 | # When was a blob last used 226 | last_used = {} 227 | # Make sure all layers are supported, and compute the range each blob is used in 228 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 229 | for b in net.bottom_names[n]: 230 | last_used[b] = i 231 | 232 | active_data = {} 233 | homogenety = {} 234 | # Read all the input data 235 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 236 | # Run the network forward 237 | new_data1 = forward(net, i, 1, {b: [1*d for d in active_data[b]] for b in net.bottom_names[n]}, net.top_names[n]) 238 | new_data2 = forward(net, i, 1, {b: [2*d for d in active_data[b]] for b in net.bottom_names[n]}, net.top_names[n]) 239 | active_data.update(new_data1) 240 | 241 | if len(new_data1) == 1: 242 | m = list(new_data1.keys())[0] 243 | d1, d2 = flattenData(new_data1[m]), flattenData(new_data2[m]) 244 | f = np.mean(np.abs(d1), axis=0) / np.mean(np.abs(d2), axis=0) 245 | if 1e-3*np.mean(f) < np.std(f): 246 | # Not homogeneous 247 | homogenety[n] = None 248 | else: 249 | # Compute the degree of the homogeneous transformation 250 | homogenety[n] = (np.log(np.mean(np.abs(d2))) - np.log(np.mean(np.abs(d1)))) / np.log(2) 251 | else: 252 | homogenety[n] = None 253 | # Delete all unused data 254 | for k in list(active_data): 255 | if k not in last_used or last_used[k] == i: 256 | del active_data[k] 257 | return homogenety 258 | 259 | def calibrateGradientRatio(net, NIT=1): 260 | import numpy as np 261 | # When was a blob last used 262 | last_used = {} 263 | # Find the last layer to use 264 | last_layer = 0 265 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 266 | if l.type not in STRIP_LAYER: 267 | last_layer = i 268 | for b in net.bottom_names[n]: 269 | last_used[b] = i 270 | # Figure out which tops are involved 271 | last_tops = net.top_names[net._layer_names[last_layer]] 272 | for t in last_tops: 273 | last_used[t] = len(net.layers) 274 | 275 | # Call forward and store the data of all data layers 276 | active_data, input_data, bottom_scale = {}, {}, {} 277 | # Read all the input data 278 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 279 | if i > last_layer: break 280 | # Compute the input scale for parameter layers 281 | if len(l.blobs) > 0: 282 | bottom_scale[n] = np.mean([np.mean(np.abs(active_data[b])) for b in net.bottom_names[n]]) 283 | # Run the network forward 284 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 285 | if l.type in INPUT_LAYERS: 286 | input_data.update(new_data) 287 | active_data.update(new_data) 288 | 289 | # Delete all unused data 290 | for k in list(active_data): 291 | if k not in last_used or last_used[k] == i: 292 | del active_data[k] 293 | output_std = np.mean(np.std(flattenData(active_data[last_tops[0]]), axis=0)) 294 | 295 | for it in range(10): 296 | # Reset the diffs 297 | for l in net.layers: 298 | for b in l.blobs: 299 | b.diff[...] = 0 300 | # Set the top diffs 301 | for t in last_tops: 302 | net.blobs[t].diff[...] 
= np.random.normal(0, 1, net.blobs[t].shape) 303 | # Compute all gradients 304 | net._backward(last_layer, 0) 305 | 306 | # Compute the gradient ratio 307 | ratio={} 308 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 309 | if len(l.blobs) > 0: 310 | assert l.type in PARAMETER_LAYERS, "Parameter layer '%s' currently not supported"%l.type 311 | b = l.blobs[0] 312 | ratio[n] = np.sqrt(np.mean(b.diff**2) / np.mean(b.data**2)) 313 | 314 | # If all layers are homogeneous, then the target ratio is the geometric mean of all ratios 315 | # (assuming we want the same output) 316 | # To deal with non-homogeneous layers we scale by output_std in the hope of correcting the 317 | # estimation over time. 318 | # NOTE: for non-feed-forward networks the geometric mean might not be the right scaling factor 319 | target_ratio = np.exp(np.mean(np.log(np.array(list(ratio.values()))))) * (output_std)**(1. / len(ratio)) 320 | 321 | # Terminate if the relative change is less than 1% for all values 322 | log_ratio = np.log( np.array(list(ratio.values())) ) 323 | if np.all( np.abs(log_ratio/np.log(target_ratio) - 1) < 0.01 ): 324 | break 325 | 326 | # Update all the weights and biases 327 | active_data = {} 328 | # Read all the input data 329 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 330 | if i > last_layer: break 331 | # Use the stored input 332 | if l.type in INPUT_LAYERS: 333 | active_data.update({b: input_data[b] for b in net.top_names[n]}) 334 | else: 335 | if len(l.blobs) > 0: 336 | # Add the scaling from the bottom to the biases 337 | current_scale = np.mean([np.mean(np.abs(active_data[b])) for b in net.bottom_names[n]]) 338 | adj = current_scale / bottom_scale[n] 339 | for b in list(l.blobs)[1:]: 340 | b.data[...] *= adj 341 | bottom_scale[n] = current_scale 342 | 343 | # Scale to obtain the target ratio 344 | scale = np.sqrt(ratio[n] / target_ratio) 345 | for b in l.blobs: 346 | b.data[...] *= scale 347 | 348 | active_data.update(forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n])) 349 | # Delete all unused data 350 | for k in list(active_data): 351 | if k not in last_used or last_used[k] == i: 352 | del active_data[k] 353 | 354 | new_output_std = np.mean(np.std(flattenData(active_data[last_tops[0]]), axis=0)) 355 | if np.abs(np.log(output_std) - np.log(new_output_std)) > 0.25: 356 | # If we diverge by a factor of exp(0.25) = ~1.3, then we should check if the network is really 357 | # homogeneous 358 | print( "WARNING: It looks like one or more layers are not homogeneous! Trying to correct for this..." 
) 359 | print( " Output std = %f" % new_output_std ) 360 | output_std = new_output_std 361 | 362 | def netFromString(s, t=None): 363 | import caffe 364 | from tempfile import NamedTemporaryFile 365 | if t is None: t = caffe.TEST 366 | f = NamedTemporaryFile('w') 367 | f.write(s) 368 | f.flush() 369 | r = caffe.Net(f.name, t) 370 | f.close() 371 | return r 372 | 373 | def getFileList(f): 374 | from glob import glob 375 | from os import path 376 | return [f for f in glob(f) if path.isfile(f)] 377 | 378 | def main(): 379 | from argparse import ArgumentParser 380 | from os import path 381 | import numpy as np 382 | parser = ArgumentParser() 383 | parser.add_argument('prototxt') 384 | parser.add_argument('output_caffemodel') 385 | parser.add_argument('-l', '--load', help='Load a pretrained model and rescale it [bias and type are not supported]') 386 | parser.add_argument('-d', '--data', default=None, help='Image list to use [default prototxt data]') 387 | parser.add_argument('-b', '--bias', type=float, default=0.1, help='Bias') 388 | parser.add_argument('-t', '--type', default='elwise', help='Type: elwise, pca, zca, kmeans, rand (random input patches). Add fast_ to speed up the initialization, but you might lose in precision.') 389 | parser.add_argument('-z', action='store_true', help='Zero all weights and reinitialize') 390 | parser.add_argument('-cs', action='store_true', help='Correct for scaling') 391 | parser.add_argument('-q', action='store_true', help='Quiet execution') 392 | parser.add_argument('-s', type=float, default=1.0, help='Scale the input [only custom data "-d"]') 393 | parser.add_argument('-bs', type=int, default=16, help='Batch size [only custom data "-d"]') 394 | parser.add_argument('-nit', type=int, default=10, help='Number of iterations') 395 | parser.add_argument('--mem-limit', type=int, default=500, help='How much memory should we use for the data buffer (MB)?') 396 | parser.add_argument('--gpu', type=int, default=0, help='What gpu to run it on?') 397 | args = parser.parse_args() 398 | 399 | if args.q: 400 | from os import environ 401 | environ['GLOG_minloglevel'] = '2' 402 | import caffe, load 403 | from caffe import NetSpec, layers as L 404 | 405 | caffe.set_mode_gpu() 406 | if args.gpu is not None: 407 | caffe.set_device(args.gpu) 408 | 409 | if args.data is not None: 410 | model = load.ProtoDesc(args.prototxt) 411 | net = NetSpec() 412 | fl = getFileList(args.data) 413 | if len(fl) == 0: 414 | print("Unknown data type for '%s'"%args.data) 415 | exit(1) 416 | from tempfile import NamedTemporaryFile 417 | f = NamedTemporaryFile('w') 418 | f.write('\n'.join([path.abspath(i)+' 0' for i in fl])) 419 | f.flush() 420 | net.data, net.label = L.ImageData(source=f.name, batch_size=args.bs, new_width=model.input_dim[-1], new_height=model.input_dim[-1], transform_param=dict(mean_value=[104,117,123], scale=args.s),ntop=2) 421 | net.out = model(data=net.data, label=net.label) 422 | n = netFromString('force_backward:true\n'+str(net.to_proto()), caffe.TRAIN ) 423 | else: 424 | n = caffe.Net(args.prototxt, caffe.TRAIN) 425 | 426 | if args.load is not None: 427 | n.copy_from(args.load) 428 | # Rescale existing layers? 429 | #if args.fix: 430 | #magicFix(n, args.nit) 431 | 432 | if args.z: 433 | # Zero out all layers 434 | for l in n.layers: 435 | for b in l.blobs: 436 | b.data[...] 
= 0 437 | if any([np.abs(l.blobs[0].data).sum() < 1e-10 for l in n.layers if len(l.blobs) > 0]): 438 | print( [m for l,m in zip(n.layers, n._layer_names) if len(l.blobs) > 0 and np.abs(l.blobs[0].data).sum() < 1e-10] ) 439 | magicInitialize(n, args.bias, NIT=args.nit, type=args.type, max_data=args.mem_limit*1024*1024/4) 440 | else: 441 | print( "Network already initialized, skipping magic init" ) 442 | if args.cs: 443 | # A simple helper function that lets you figure out which layers are not 444 | # homogeneous 445 | #print( estimateHomogenety(n) ) 446 | calibrateGradientRatio(n) 447 | n.save(args.output_caffemodel) 448 | 449 | if __name__ == "__main__": 450 | main() 451 | --------------------------------------------------------------------------------