├── .gitignore ├── LICENSE ├── load.py ├── README.md ├── measure_stat.py └── magic_init.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | **/*.pyc 3 | caffe 4 | *.kdev4 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, Philipp Krähenbühl 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 
-------------------------------------------------------------------------------- /load.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | 3 | def parseProtoString(s): 4 | from google.protobuf import text_format 5 | from caffe.proto import caffe_pb2 as pb 6 | proto_net = pb.NetParameter() 7 | text_format.Merge(s, proto_net) 8 | return proto_net 9 | 10 | 11 | def get_param(l, exclude=set(['top', 'bottom', 'name', 'type'])): 12 | if not hasattr(l,'ListFields'): 13 | if hasattr(l,'__delitem__'): 14 | return list(l) 15 | return l 16 | r = dict() 17 | for f, v in l.ListFields(): 18 | if f.name not in exclude: 19 | r[f.name] = get_param(v, []) 20 | return r 21 | 22 | class ProtoDesc: 23 | def __init__(self, prototxt): 24 | from os import path 25 | self.prototxt = prototxt 26 | self.parsed_proto = parseProtoString(open(self.prototxt, 'r').read()) 27 | # Guess the input dimension 28 | self.input_dim = (3, 227, 227) 29 | net = self.parsed_proto 30 | if len(net.input_dim) > 0: 31 | self.input_dim = net.input_dim[1:] 32 | else: 33 | lrs = net.layer 34 | cs = [l.transform_param.crop_size for l in lrs 35 | if l.HasField('transform_param')] 36 | if len(cs): 37 | self.input_dim = (3, cs[0], cs[0]) 38 | 39 | def __call__(self, clip=None, **inputs): 40 | from caffe import layers as L 41 | from collections import OrderedDict 42 | net = self.parsed_proto 43 | blobs = OrderedDict(inputs) 44 | for l in net.layer: 45 | if l.name not in inputs: 46 | in_place = l.top == l.bottom 47 | param = get_param(l) 48 | assert all([b in blobs for b in l.bottom]), "Some bottoms not found: " + ', '.join([b for b in l.bottom if b not in blobs]) 49 | tops = getattr(L, l.type)(*[blobs[b] for b in l.bottom], 50 | ntop=len(l.top), in_place=in_place, 51 | name=l.name, 52 | **param) 53 | if len(l.top) <= 1: 54 | tops = [tops] 55 | for i, t in enumerate(l.top): 56 | blobs[t] = tops[i] 57 | if l.name == clip: 58 | break 59 | return list(blobs.values())[-1] 60 | 61 | 62 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-dependent initialization of convolutional neural networks 2 | 3 | Created by Philipp Krähenbühl. 4 | 5 | ### Introduction 6 | 7 | This code implements the initialization presented in our [arXiv tech report](http://arxiv.org/abs/1511.06856), which is under submission at ICLR 2016. 8 | 9 | *This is a reimplementation and currently a work in progress. Use at your own risk.* 10 | 11 | ### License 12 | 13 | This code is released under the BSD License (refer to the LICENSE file for details). 
14 | 15 | ### Citing 16 | 17 | If you find our initialization useful in your research, please consider citing: 18 | 19 | @article{krahenbuhl2015data, 20 | title={Data-dependent Initializations of Convolutional Neural Networks}, 21 | author={Kr{\"a}henb{\"u}hl, Philipp and Doersch, Carl and Donahue, Jeff and Darrell, Trevor}, 22 | journal={arXiv preprint arXiv:1511.06856}, 23 | year={2015} 24 | } 25 | 26 | ### Setup 27 | 28 | Check out the project and create a symlink to caffe in the `magic_init` directory: 29 | ```Shell 30 | ln -s path/to/caffe/python/caffe caffe 31 | ``` 32 | 33 | ### Examples 34 | 35 | Here is a quick example of how to initialize AlexNet: 36 | ```bash 37 | python magic_init.py path/to/alexnet/deploy.prototxt path/to/output.caffemodel -d "path/to/some/images/*.png" -q -nit 10 -cs 38 | ``` 39 | Here the ```-d``` flag allows you to initialize the network using your own images. Feel free to use ImageNet, PASCAL, COCO or whatever you have at hand; it shouldn't make a big difference. The ```-q``` (quiet) flag suppresses all the caffe logging; ```-nit``` controls the number of batches used (while ```-bs``` controls the batch size). Finally, ```-cs``` rescales the gradients across layers. This rescaling currently works best for feed-forward networks, and might not work too well for DAG-structured networks (we are working on that). 40 | 41 | To run the k-means initialization use: 42 | ```bash 43 | python magic_init.py path/to/alexnet/deploy.prototxt path/to/output.caffemodel -d "path/to/some/images/*.png" -q -nit 10 -cs -t kmeans 44 | ``` 45 | 46 | Finally, ```python magic_init.py -h``` should provide you with more help. 47 | 48 | 49 | ### Pro tips 50 | If your numpy installation is based on OpenBLAS, try disabling threading with ```export OPENBLAS_NUM_THREADS=1```; it can improve the runtime performance a bit. 51 | -------------------------------------------------------------------------------- /measure_stat.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from magic_init import * 3 | 4 | class BCOLORS: 5 | HEADER = '\033[95m' 6 | OKBLUE = '\033[94m' 7 | OKGREEN = '\033[92m' 8 | WARNING = '\033[93m' 9 | FAIL = '\033[91m' 10 | ENDC = '\033[0m' 11 | BOLD = '\033[1m' 12 | UNDERLINE = '\033[4m' 13 | 14 | class NOCOLORS: 15 | HEADER = '' 16 | OKBLUE = '' 17 | OKGREEN = '' 18 | WARNING = '' 19 | FAIL = '' 20 | ENDC = '' 21 | BOLD = '' 22 | UNDERLINE = '' 23 | 24 | def coloredNumbers(v, color=None, fmt='%6.2f', max_display=300, bcolors=BCOLORS): 25 | import numpy as np 26 | # Display a numpy array and highlight the min and max values [requires a nice Linux 27 | # terminal supporting colors] 28 | r = "" 29 | mn, mx = np.min(v), np.max(v) 30 | for k,i in enumerate(v): 31 | if len(v) > max_display and k > max_display/2 and k < len(v) - max_display/2: 32 | if r[-1] != '.': 33 | r += '...' 
34 | continue 35 | if i <= mn + 1e-3: 36 | r += bcolors.BOLD+bcolors.FAIL 37 | elif i + 1e-3 >= mx: 38 | r += bcolors.BOLD+bcolors.FAIL 39 | elif color is not None: 40 | r += color 41 | r += (fmt+' ')%i 42 | r += bcolors.ENDC 43 | r += bcolors.ENDC 44 | return r 45 | 46 | def computeGradientRatio(net, NIT=1): 47 | import numpy as np 48 | last_layer = 0 49 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 50 | if l.type not in STRIP_LAYER: 51 | last_layer = i 52 | last_tops = net.top_names[net._layer_names[last_layer]] 53 | 54 | var = {} 55 | for it in range(NIT): 56 | net._forward(0, last_layer) 57 | # Reset the diffs 58 | for l in net.layers: 59 | for b in l.blobs: 60 | b.diff[...] = 0 61 | # Set the top diffs 62 | for t in last_tops: 63 | net.blobs[t].diff[...] = np.random.normal(0, 1, net.blobs[t].shape) 64 | net._backward(last_layer, 0) 65 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 66 | if len(l.blobs) > 0: 67 | assert l.type in PARAMETER_LAYERS, "Parameter layer '%s' currently not supported"%l.type 68 | b = l.blobs[0] 69 | r = np.mean(b.diff.swapaxes(0,1).reshape((b.diff.shape[1],-1))**2, axis=1) / np.mean(b.data**2) 70 | if n in var: var[n] += r / NIT 71 | else: var[n] = r / NIT 72 | std = {n: np.sqrt(var[n]) for n in var} 73 | return {n: np.std(s) / np.mean(s) for n,s in std.items()}, {n: np.mean(s) for n,s in std.items()} 74 | 75 | def printMeanStddev(net, NIT=10, show_all=False, show_color=True, quiet=False): 76 | import numpy as np 77 | bcolors = NOCOLORS 78 | if show_color: bcolors = BCOLORS 79 | 80 | layer_names = list(net._layer_names) 81 | if not show_all: 82 | layer_names = [n for n, l in zip(net._layer_names, net.layers) if len(l.blobs)>0] 83 | if 'data' in net._layer_names: 84 | layer_names.append('data') 85 | 86 | # When was a blob last used 87 | last_used = {} 88 | # Make sure all layers are supported, and compute the range each blob is used in 89 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 90 | for b in net.bottom_names[n]: 91 | last_used[b] = i 92 | 93 | active_data, cvar = {}, {} 94 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 95 | # Run the network forward 96 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 97 | active_data.update(new_data) 98 | 99 | if len(net.top_names[n]) > 0 and n in layer_names: 100 | m = net.top_names[n][0] 101 | D = flattenData(new_data[m]) 102 | mean = np.mean(D, axis=0) 103 | stddev = np.std(D, axis=0) 104 | if not quiet: 105 | print( bcolors.BOLD, ' '*5, n, ':', m, ' '*5, bcolors.ENDC ) 106 | print( 'mean ', coloredNumbers(mean, bcolors.OKGREEN, bcolors=bcolors) ) 107 | print( 'stddev', coloredNumbers(stddev, bcolors.OKBLUE, bcolors=bcolors) ) 108 | print( 'coef of variation ', bcolors.OKGREEN, stddev.std() / stddev.mean(), bcolors.ENDC ) 109 | print() 110 | cvar[n] = stddev.std() / stddev.mean() 111 | # Delete all unused data 112 | for k in list(active_data): 113 | if k not in last_used or last_used[k] == i: 114 | del active_data[k] 115 | return cvar 116 | 117 | def main(): 118 | from argparse import ArgumentParser 119 | from os import path 120 | 121 | parser = ArgumentParser() 122 | parser.add_argument('prototxt') 123 | parser.add_argument('-l', '--load', help='Load a caffemodel') 124 | parser.add_argument('-d', '--data', default=None, help='Image list to use [default prototxt data]') 125 | #parser.add_argument('-q', action='store_true', help='Quiet execution') 126 | parser.add_argument('-sm', 
action='store_true', help='Summary only') 127 | parser.add_argument('-q', action='store_true', help='Quiet execution') 128 | parser.add_argument('-a', '--all', action='store_true', help='Show the statistic for all layers') 129 | parser.add_argument('-nc', action='store_true', help='Do not use color') 130 | parser.add_argument('-s', type=float, default=1.0, help='Scale the input [only custom data "-d"]') 131 | parser.add_argument('-bs', type=int, default=16, help='Batch size [only custom data "-d"]') 132 | parser.add_argument('-nit', type=int, default=10, help='Number of iterations') 133 | parser.add_argument('--gpu', type=int, default=0, help='What gpu to run it on?') 134 | args = parser.parse_args() 135 | 136 | if args.q: 137 | from os import environ 138 | environ['GLOG_minloglevel'] = '2' 139 | import caffe, load 140 | from caffe import NetSpec, layers as L 141 | 142 | caffe.set_mode_gpu() 143 | if args.gpu is not None: 144 | caffe.set_device(args.gpu) 145 | 146 | if args.data is not None: 147 | model = load.ProtoDesc(args.prototxt) 148 | net = NetSpec() 149 | fl = getFileList(args.data) 150 | if len(fl) == 0: 151 | print("Unknown data type for '%s'"%args.data) 152 | exit(1) 153 | from tempfile import NamedTemporaryFile 154 | f = NamedTemporaryFile('w') 155 | f.write('\n'.join([path.abspath(i)+' 0' for i in fl])) 156 | f.flush() 157 | net.data, net.label = L.ImageData(source=f.name, batch_size=args.bs, new_width=model.input_dim[-1], new_height=model.input_dim[-1], transform_param=dict(mean_value=[104,117,123], scale=args.s),ntop=2) 158 | net.out = model(data=net.data, label=net.label) 159 | n = netFromString('force_backward:true\n'+str(net.to_proto()), caffe.TRAIN ) 160 | else: 161 | n = caffe.Net(args.prototxt, caffe.TRAIN) 162 | 163 | if args.load is not None: 164 | n.copy_from(args.load) 165 | 166 | cvar = printMeanStddev(n, NIT=args.nit, show_all=args.all, show_color=not args.nc, quiet=args.sm) 167 | cv, gr = computeGradientRatio(n, NIT=args.nit) 168 | print() 169 | print(' Summary ') 170 | print('-----------') 171 | print() 172 | print('layer name out cvar rate cvar rate mean') 173 | for l in n._layer_names: 174 | if l in cvar and l in cv and l in gr: 175 | print('%-30s %10.2f %10.2f %10.2f'%(l, cvar[l], cv[l], gr[l]) ) 176 | 177 | if __name__ == "__main__": 178 | main() 179 | -------------------------------------------------------------------------------- /magic_init.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | INPUT_LAYERS = ['Data', 'ImageData', 'Input'] 4 | # Layers that only support elwise 5 | ELWISE_LAYERS = ['Deconvolution'] 6 | # Layers that support parameters 7 | PARAMETER_LAYERS = ['Convolution', 'InnerProduct']+ELWISE_LAYERS 8 | # All supported layers 9 | SUPPORTED_LAYERS = ['ReLU', 'Sigmoid', 'LRN', 'Pooling', 'Eltwise'] + PARAMETER_LAYERS + INPUT_LAYERS 10 | STRIP_LAYER = ['Softmax', 'SoftmaxWithLoss', 'SigmoidCrossEntropyLoss'] 11 | # Use 'Dropout' at your own risk 12 | # Unless Jon merges #2865 , 'Split' cannot be supported 13 | UNSUPPORTED_LAYERS = ['Split', 'BatchNorm', 'Reshape'] 14 | 15 | def forward(net, i, NIT, data, output_names): 16 | n = net._layer_names[i] 17 | # Create the top data if needed 18 | output = {t: [None]*NIT for t in output_names} 19 | for it in range(NIT): 20 | for b in data: 21 | net.blobs[b].data[...] 
= data[b][it] 22 | net._forward(i, i) 23 | for t in output_names: 24 | output[t][it] = 1*net.blobs[t].data 25 | return output 26 | 27 | def flattenData(data): 28 | import numpy as np 29 | return np.concatenate([d.swapaxes(0, 1).reshape((d.shape[1],-1)) for d in data], axis=1).T 30 | 31 | def gatherInputData(net, layer_id, bottom_data, top_name, fast=False, max_data=None): 32 | # This function gathers all the input data. 33 | # In order to not replicate all the internal functionality of convolutions (e.g. padding ...) 34 | # we gather the data in the output space and use random gaussian weights. The output of this 35 | # function is W and D, where the input data I = D * W^-1 [with some abuse of tensor notation] 36 | # If we now compute an initialization A for D, we can simply multiply A by W to obtain the 37 | # proper initialization in the input space 38 | import numpy as np 39 | l = net.layers[layer_id] 40 | NIT = len(list(bottom_data.values())[0]) 41 | # How many times do we need to over-sample to get a full basis (out of random projections) 42 | OS = int(np.ceil( np.prod(l.blobs[0].data.shape[1:]) / l.blobs[0].data.shape[0] )) 43 | if fast: OS = 1 44 | 45 | # If we are oversampling we might run out of memory at some point, especially for filters higher up. 46 | # To avoid any issues we never return more than max_data elements 47 | subsample = None 48 | 49 | # Note this could cause some memory issues in the FC layers 50 | W, D = [], [] 51 | for i in range(OS): 52 | d = l.blobs[0].data 53 | d[...] = np.random.normal(0, 1, d.shape) 54 | W.append(1*d) 55 | # Collect the data and flatten out the convs 56 | data = np.concatenate([i.swapaxes(0, 1).reshape((i.shape[1],-1)).T for i in forward(net, layer_id, NIT, bottom_data, [top_name])[top_name]], axis=0) 57 | # Do we need to subsample the data to save memory? 58 | if subsample is None and max_data is not None: 59 | # Randomly select N representative data samples 60 | N = int(max_data / (data.shape[1]*OS)) 61 | subsample = np.arange(data.shape[0]) 62 | if N < data.shape[0]: 63 | np.random.shuffle(subsample) 64 | subsample = subsample[:N] 65 | if subsample is not None: 66 | data = data[subsample] 67 | D.append(data) 68 | # In order to handle any sort of groups we want to have the samples packed in the following order: 69 | # a1 a2 a3 a4 b1 b2 b3 b4 c1 ... (where the original data was a b c and OS=4) 70 | W, D = np.concatenate([w[:,None] for w in W], axis=1), np.concatenate([d[:,:,None] for d in D], axis=2) 71 | return W.reshape((-1,)+W.shape[2:]), D.reshape((D.shape[0], -1)+D.shape[3:]) 72 | 73 | def initializeWeight(D, type, N_OUT): 74 | # Here we first whiten the data (PCA or ZCA) and then optionally run k-means 75 | # on this whitened data. 76 | import numpy as np 77 | if D.shape[0] < N_OUT: 78 | print( " Not enough data for '%s' estimation, using elwise"%type ) 79 | return np.random.normal(0, 1, (N_OUT,D.shape[1])) 80 | D = D - np.mean(D, axis=0, keepdims=True) 81 | # PCA, ZCA, K-Means 82 | assert type in ['pca', 'zca', 'kmeans', 'rand'], "Unknown initialization type '%s'"%type 83 | C = D.T.dot(D) 84 | s, V = np.linalg.eigh(C) 85 | # order the eigenvalues 86 | ids = np.argsort(s)[-N_OUT:] 87 | s = s[ids] 88 | V = V[:,ids] 89 | s[s<1e-6] = 0 90 | s[s>=1e-6] = 1. 
/ np.sqrt(s[s>=1e-6]+1e-3) 91 | S = np.diag(s) 92 | if type == 'pca': 93 | return S.dot(V.T) 94 | elif type == 'zca': 95 | return V.dot(S.dot(V.T)) 96 | # Whiten the data 97 | wD = D.dot(V.dot(S)) 98 | wD /= np.linalg.norm(wD, axis=1)[:,None] 99 | if type == 'kmeans': 100 | # Run k-means 101 | from sklearn.cluster import MiniBatchKMeans 102 | km = MiniBatchKMeans(n_clusters = wD.shape[1], batch_size=10*wD.shape[1]).fit(wD).cluster_centers_ 103 | elif type == 'rand': 104 | km = wD[np.random.choice(wD.shape[0], wD.shape[1], False)] 105 | C = km.dot(S.dot(V.T)) 106 | C /= np.std(D.dot(C.T), axis=0, keepdims=True).T 107 | return C 108 | 109 | 110 | def initializeLayer(net, layer_id, bottom_data, top_name, bias=0, type='elwise', max_data=None): 111 | import numpy as np 112 | l = net.layers[layer_id] 113 | NIT = len(list(bottom_data.values())[0]) 114 | 115 | if type!='elwise' and l.type in ELWISE_LAYERS: 116 | print( "Only 'elwise' supported for layer '%s'. Falling back."%net._layer_names[layer_id] ) 117 | type = 'elwise' 118 | 119 | for p in l.blobs: p.data[...] = 0 120 | fast = 'fast_' in type 121 | if fast: 122 | type = type.replace('fast_', '') 123 | # Initialize the weights [k-means, ...] 124 | if type == 'elwise': 125 | d = l.blobs[0].data 126 | d[...] = np.random.normal(0, 1, d.shape) 127 | else: # Use the input data 128 | # Are there any groups? 129 | G = 1 130 | bottom_names = net.bottom_names[net._layer_names[layer_id]] 131 | if len(bottom_names) == 1: 132 | N1 = net.blobs[bottom_names[0]].shape[1] 133 | N2 = l.blobs[0].shape[1] 134 | G = N1 // N2 135 | 136 | # Gather the input data 137 | T, D = gatherInputData(net, layer_id, bottom_data, top_name, fast, max_data=max_data) 138 | 139 | # Figure out the output dimensionality of d 140 | d = l.blobs[0].data 141 | 142 | # Loop over groups 143 | for g in range(G): 144 | dg, Dg = d[g*(d.shape[0]//G):(g+1)*(d.shape[0]//G)], D[:,g*(D.shape[1]//G):(g+1)*(D.shape[1]//G):] 145 | Tg = T[g*(T.shape[0]//G):(g+1)*(T.shape[0]//G)] 146 | # Compute the weights 147 | W = initializeWeight(Dg, type, N_OUT=dg.shape[0]) 148 | 149 | # Multiply the weights by the random basis 150 | # NOTE: This matrix multiplication is a bit large, if it's too slow, 151 | # reduce the oversampling in gatherInputData 152 | dg[...] = np.dot(W, Tg.reshape((Tg.shape[0],-1))).reshape(dg.shape) 153 | # Scale the mean and initialize the bias 154 | top_data = forward(net, layer_id, NIT, bottom_data, [top_name])[top_name] 155 | flat_data = flattenData(top_data) 156 | mu = flat_data.mean(axis=0) 157 | std = flat_data.std(axis=0) 158 | if l.type == 'Deconvolution': 159 | l.blobs[0].data[...] /= std.reshape((1,-1,)+(1,)*(len(l.blobs[0].data.shape)-2)) 160 | else: 161 | l.blobs[0].data[...] /= std.reshape((-1,)+(1,)*(len(l.blobs[0].data.shape)-1)) 162 | for b in l.blobs[1:]: 163 | b.data[...] = -mu / std + bias 164 | 165 | 166 | 167 | def magicInitialize(net, bias=0, NIT=10, type='elwise', max_data=None): 168 | import numpy as np 169 | # When was a blob last used 170 | last_used = {} 171 | # Make sure all layers are supported, and compute the last time each blob is used 172 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 173 | if l.type in UNSUPPORTED_LAYERS: 174 | print( "WARNING: Layer type '%s' not supported! Things might go very wrong..."%l.type ) 175 | elif l.type not in SUPPORTED_LAYERS+STRIP_LAYER: 176 | print( "Unknown layer type '%s'. 
double check if it is supported"%l.type ) 177 | for b in net.bottom_names[n]: 178 | last_used[b] = i 179 | 180 | active_data = {} 181 | # Read all the input data 182 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 183 | # Initialize the layer 184 | if len(l.blobs) > 0: 185 | if np.sum(np.abs(l.blobs[0].data)) <= 1e-10: 186 | print( "Initializing layer '%s'"%n ) 187 | assert l.type in PARAMETER_LAYERS, "Unsupported parameter layer" 188 | assert len(net.top_names[n]) == 1, "Exactly one output supported" 189 | 190 | # Fill the parameters 191 | initializeLayer(net, i, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n][0], bias, type, max_data=max_data) 192 | else: 193 | print( "Skipping layer '%s'"%n ) 194 | 195 | # TODO: Estimate and rescale the values [TODO: Record and undo this scaling above] 196 | 197 | # Run the network forward 198 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 199 | active_data.update(new_data) 200 | 201 | # Delete all unused data 202 | for k in list(active_data): 203 | if k not in last_used or last_used[k] == i: 204 | del active_data[k] 205 | 206 | def load(net, blobs): 207 | for l,n in zip(net.layers, net._layer_names): 208 | if n in blobs: 209 | for b, sb in zip(l.blobs, blobs[n]): 210 | b.data[...] = sb 211 | 212 | def save(net): 213 | import numpy as np 214 | r = {} 215 | for l,n in zip(net.layers, net._layer_names): 216 | if len(l.blobs) > 0: 217 | r[n] = [np.copy(b.data) for b in l.blobs] 218 | return r 219 | 220 | def estimateHomogenety(net): 221 | # Estimate if a certain layer is homogeneous and if yes return the degree k 222 | # by which the output is scaled (if input is scaled by alpha then the output 223 | # is scaled by alpha^k). Return None if the layer is not homogeneous. 
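	# For example (illustrative, not part of the original code): a ReLU or a bias-free Convolution is homogeneous with k = 1 (scaling the input by alpha scales the output by alpha), while a Sigmoid, or a layer with a non-zero bias, is not homogeneous and is reported as None.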
224 | import numpy as np 225 | # When was a blob last used 226 | last_used = {} 227 | # Make sure all layers are supported, and compute the range each blob is used in 228 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 229 | for b in net.bottom_names[n]: 230 | last_used[b] = i 231 | 232 | active_data = {} 233 | homogenety = {} 234 | # Read all the input data 235 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 236 | # Run the network forward 237 | new_data1 = forward(net, i, 1, {b: [1*d for d in active_data[b]] for b in net.bottom_names[n]}, net.top_names[n]) 238 | new_data2 = forward(net, i, 1, {b: [2*d for d in active_data[b]] for b in net.bottom_names[n]}, net.top_names[n]) 239 | active_data.update(new_data1) 240 | 241 | if len(new_data1) == 1: 242 | m = list(new_data1.keys())[0] 243 | d1, d2 = flattenData(new_data1[m]), flattenData(new_data2[m]) 244 | f = np.mean(np.abs(d1), axis=0) / np.mean(np.abs(d2), axis=0) 245 | if 1e-3*np.mean(f) < np.std(f): 246 | # Not homogeneous 247 | homogenety[n] = None 248 | else: 249 | # Compute the degree of the homogeneous transformation 250 | homogenety[n] = (np.log(np.mean(np.abs(d2))) - np.log(np.mean(np.abs(d1)))) / np.log(2) 251 | else: 252 | homogenety[n] = None 253 | # Delete all unused data 254 | for k in list(active_data): 255 | if k not in last_used or last_used[k] == i: 256 | del active_data[k] 257 | return homogenety 258 | 259 | def calibrateGradientRatio(net, NIT=1): 260 | import numpy as np 261 | # When was a blob last used 262 | last_used = {} 263 | # Find the last layer to use 264 | last_layer = 0 265 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 266 | if l.type not in STRIP_LAYER: 267 | last_layer = i 268 | for b in net.bottom_names[n]: 269 | last_used[b] = i 270 | # Figure out which tops are involved 271 | last_tops = net.top_names[net._layer_names[last_layer]] 272 | for t in last_tops: 273 | last_used[t] = len(net.layers) 274 | 275 | # Call forward and store the data of all data layers 276 | active_data, input_data, bottom_scale = {}, {}, {} 277 | # Read all the input data 278 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 279 | if i > last_layer: break 280 | # Compute the input scale for parameter layers 281 | if len(l.blobs) > 0: 282 | bottom_scale[n] = np.mean([np.mean(np.abs(active_data[b])) for b in net.bottom_names[n]]) 283 | # Run the network forward 284 | new_data = forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n]) 285 | if l.type in INPUT_LAYERS: 286 | input_data.update(new_data) 287 | active_data.update(new_data) 288 | 289 | # Delete all unused data 290 | for k in list(active_data): 291 | if k not in last_used or last_used[k] == i: 292 | del active_data[k] 293 | output_std = np.mean(np.std(flattenData(active_data[last_tops[0]]), axis=0)) 294 | 295 | for it in range(10): 296 | # Reset the diffs 297 | for l in net.layers: 298 | for b in l.blobs: 299 | b.diff[...] = 0 300 | # Set the top diffs 301 | for t in last_tops: 302 | net.blobs[t].diff[...] 
= np.random.normal(0, 1, net.blobs[t].shape) 303 | # Compute all gradients 304 | net._backward(last_layer, 0) 305 | 306 | # Compute the gradient ratio 307 | ratio={} 308 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 309 | if len(l.blobs) > 0: 310 | assert l.type in PARAMETER_LAYERS, "Parameter layer '%s' currently not supported"%l.type 311 | b = l.blobs[0] 312 | ratio[n] = np.sqrt(np.mean(b.diff**2) / np.mean(b.data**2)) 313 | 314 | # If all layers are homogeneous, then the target ratio is the geometric mean of all ratios 315 | # (assuming we want the same output) 316 | # To deal with non-homogeneous layers we scale by output_std in the hope of correcting the 317 | # estimation over time. 318 | # NOTE: for non-feed-forward networks the geometric mean might not be the right scaling factor 319 | target_ratio = np.exp(np.mean(np.log(np.array(list(ratio.values()))))) * (output_std)**(1. / len(ratio)) 320 | 321 | # Terminate if the relative change is less than 1% for all values 322 | log_ratio = np.log( np.array(list(ratio.values())) ) 323 | if np.all( np.abs(log_ratio/np.log(target_ratio) - 1) < 0.01 ): 324 | break 325 | 326 | # Update all the weights and biases 327 | active_data = {} 328 | # Read all the input data 329 | for i, (n, l) in enumerate(zip(net._layer_names, net.layers)): 330 | if i > last_layer: break 331 | # Use the stored input 332 | if l.type in INPUT_LAYERS: 333 | active_data.update({b: input_data[b] for b in net.top_names[n]}) 334 | else: 335 | if len(l.blobs) > 0: 336 | # Add the scaling from the bottom to the biases 337 | current_scale = np.mean([np.mean(np.abs(active_data[b])) for b in net.bottom_names[n]]) 338 | adj = current_scale / bottom_scale[n] 339 | for b in list(l.blobs)[1:]: 340 | b.data[...] *= adj 341 | bottom_scale[n] = current_scale 342 | 343 | # Scale to obtain the target ratio 344 | scale = np.sqrt(ratio[n] / target_ratio) 345 | for b in l.blobs: 346 | b.data[...] *= scale 347 | 348 | active_data.update(forward(net, i, NIT, {b: active_data[b] for b in net.bottom_names[n]}, net.top_names[n])) 349 | # Delete all unused data 350 | for k in list(active_data): 351 | if k not in last_used or last_used[k] == i: 352 | del active_data[k] 353 | 354 | new_output_std = np.mean(np.std(flattenData(active_data[last_tops[0]]), axis=0)) 355 | if np.abs(np.log(output_std) - np.log(new_output_std)) > 0.25: 356 | # If we diverge by a factor of exp(0.25) = ~1.3, then we should check if the network is really 357 | # homogeneous 358 | print( "WARNING: It looks like one or more layers are not homogeneous! Trying to correct for this..." 
) 359 | print( " Output std = %f" % new_output_std ) 360 | output_std = new_output_std 361 | 362 | def netFromString(s, t=None): 363 | import caffe 364 | from tempfile import NamedTemporaryFile 365 | if t is None: t = caffe.TEST 366 | f = NamedTemporaryFile('w') 367 | f.write(s) 368 | f.flush() 369 | r = caffe.Net(f.name, t) 370 | f.close() 371 | return r 372 | 373 | def getFileList(f): 374 | from glob import glob 375 | from os import path 376 | return [f for f in glob(f) if path.isfile(f)] 377 | 378 | def main(): 379 | from argparse import ArgumentParser 380 | from os import path 381 | import numpy as np 382 | parser = ArgumentParser() 383 | parser.add_argument('prototxt') 384 | parser.add_argument('output_caffemodel') 385 | parser.add_argument('-l', '--load', help='Load a pretrained model and rescale it [bias and type are not supported]') 386 | parser.add_argument('-d', '--data', default=None, help='Image list to use [default prototxt data]') 387 | parser.add_argument('-b', '--bias', type=float, default=0.1, help='Bias') 388 | parser.add_argument('-t', '--type', default='elwise', help='Type: elwise, pca, zca, kmeans, rand (random input patches). Add fast_ to speed up the initialization, but you might lose in precision.') 389 | parser.add_argument('-z', action='store_true', help='Zero all weights and reinitialize') 390 | parser.add_argument('-cs', action='store_true', help='Correct for scaling') 391 | parser.add_argument('-q', action='store_true', help='Quiet execution') 392 | parser.add_argument('-s', type=float, default=1.0, help='Scale the input [only custom data "-d"]') 393 | parser.add_argument('-bs', type=int, default=16, help='Batch size [only custom data "-d"]') 394 | parser.add_argument('-nit', type=int, default=10, help='Number of iterations') 395 | parser.add_argument('--mem-limit', type=int, default=500, help='How much memory should we use for the data buffer (MB)?') 396 | parser.add_argument('--gpu', type=int, default=0, help='What gpu to run it on?') 397 | args = parser.parse_args() 398 | 399 | if args.q: 400 | from os import environ 401 | environ['GLOG_minloglevel'] = '2' 402 | import caffe, load 403 | from caffe import NetSpec, layers as L 404 | 405 | caffe.set_mode_gpu() 406 | if args.gpu is not None: 407 | caffe.set_device(args.gpu) 408 | 409 | if args.data is not None: 410 | model = load.ProtoDesc(args.prototxt) 411 | net = NetSpec() 412 | fl = getFileList(args.data) 413 | if len(fl) == 0: 414 | print("Unknown data type for '%s'"%args.data) 415 | exit(1) 416 | from tempfile import NamedTemporaryFile 417 | f = NamedTemporaryFile('w') 418 | f.write('\n'.join([path.abspath(i)+' 0' for i in fl])) 419 | f.flush() 420 | net.data, net.label = L.ImageData(source=f.name, batch_size=args.bs, new_width=model.input_dim[-1], new_height=model.input_dim[-1], transform_param=dict(mean_value=[104,117,123], scale=args.s),ntop=2) 421 | net.out = model(data=net.data, label=net.label) 422 | n = netFromString('force_backward:true\n'+str(net.to_proto()), caffe.TRAIN ) 423 | else: 424 | n = caffe.Net(args.prototxt, caffe.TRAIN) 425 | 426 | if args.load is not None: 427 | n.copy_from(args.load) 428 | # Rescale existing layers? 429 | #if args.fix: 430 | #magicFix(n, args.nit) 431 | 432 | if args.z: 433 | # Zero out all layers 434 | for l in n.layers: 435 | for b in l.blobs: 436 | b.data[...] 
= 0 437 | if any([np.abs(l.blobs[0].data).sum() < 1e-10 for l in n.layers if len(l.blobs) > 0]): 438 | print( [m for l,m in zip(n.layers, n._layer_names) if len(l.blobs) > 0 and np.abs(l.blobs[0].data).sum() < 1e-10] ) 439 | magicInitialize(n, args.bias, NIT=args.nit, type=args.type, max_data=args.mem_limit*1024*1024/4) 440 | else: 441 | print( "Network already initialized, skipping magic init" ) 442 | if args.cs: 443 | # A simple helper function that lets you figure out which layers are not 444 | # homogeneous 445 | #print( estimateHomogenety(n) ) 446 | calibrateGradientRatio(n) 447 | n.save(args.output_caffemodel) 448 | 449 | if __name__ == "__main__": 450 | main() 451 | --------------------------------------------------------------------------------