├── .gitignore ├── .idea └── vcs.xml ├── LICENSE ├── README.md ├── examples ├── input_data.py ├── tensorflow_grid_search.py └── tensorflow_mnist.py ├── net2net ├── __init__.py ├── net_2_deeper_net.py └── net_2_wider_net.py └── tests ├── __init__.py ├── test_net_2_deeper_net.py └── test_net_2_wider_net.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | #Ipython Notebook 62 | .ipynb_checkpoints 63 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Daniel Slater 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Net2Net 2 | A NumPy implementation of Net2Net from the paper "Net2Net: Accelerating Learning via Knowledge Transfer" (http://arxiv.org/abs/1511.05641). 3 | 4 | # Requirements 5 | - numpy 6 | - tensorflow (only needed for the examples) 7 | # Usage 8 | Here is how you would use it to create a wider version of an existing layer: 9 | 10 | import numpy as np 11 | from net2net.net_2_wider_net import net_2_wider_net 12 | weights = np.matrix([[1.0, 0.1, 0.5], [1.0, 0.1, 0.5]]) 13 | bias = np.array([0.0, 0.0, 0.0]) 14 | weights_next_layer = np.matrix([[1.0], [0.2], [0.5]]) 15 | 16 | weights, bias, weights_next_layer = net_2_wider_net(weights, bias, 17 | weights_next_layer, 18 | new_layer_size=5) 19 | Then simply use the new variables from then on. 20 | 21 | Here is how to create the weights and biases for a new layer using net_2_deeper_net: 22 | 23 | import numpy as np 24 | from net2net.net_2_deeper_net import net_2_deeper_net 25 | bias = np.array([0.0, 0.0, 0.0]) 26 | 27 | next_layer_weights, next_layer_bias = net_2_deeper_net(bias) 28 | 29 | There is a complete example of using this to grid search the number of hidden nodes in examples/tensorflow_grid_search.py, and a simpler end-to-end example in examples/tensorflow_mnist.py. -------------------------------------------------------------------------------- /examples/input_data.py: -------------------------------------------------------------------------------- 1 | """Functions for downloading and reading MNIST data.""" 2 | from __future__ import print_function 3 | import gzip 4 | import os 5 | import urllib 6 | import numpy 7 | 8 | SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/' 9 | 10 | 11 | def maybe_download(filename, work_directory): 12 | """Download the data from Yann's website, unless it's already here.""" 13 | if not os.path.exists(work_directory): 14 | os.mkdir(work_directory) 15 | filepath = os.path.join(work_directory, filename) 16 | if not os.path.exists(filepath): 17 | filepath, _ = urllib.urlretrieve(SOURCE_URL + filename, filepath) 18 | statinfo = os.stat(filepath) 19 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 20 | return filepath 21 | 22 | 23 | def _read32(bytestream): 24 | dt = numpy.dtype(numpy.uint32).newbyteorder('>') 25 | return numpy.frombuffer(bytestream.read(4), dtype=dt) 26 | 27 | 28 | def extract_images(filename): 29 | """Extract the images into a 4D uint8 numpy array [index, y, x, depth].""" 30 | print('Extracting', filename) 31 | with gzip.open(filename) as bytestream: 32 | magic = _read32(bytestream) 33 | if magic != 2051: 34 | raise ValueError( 35 | 'Invalid magic number %d in MNIST image file: %s' % 36 | (magic, filename)) 37 | num_images = _read32(bytestream) 38 | rows = _read32(bytestream) 39 | cols = _read32(bytestream) 40 | buf = bytestream.read(rows * cols * num_images) 41 | data = numpy.frombuffer(buf, dtype=numpy.uint8) 42 | data = data.reshape(num_images, rows, cols, 1) 43 | return data 44 | 45 | 46 | def dense_to_one_hot(labels_dense, num_classes=10): 47 | """Convert class labels from scalars to one-hot vectors.""" 48 | num_labels = labels_dense.shape[0] 49 | index_offset = numpy.arange(num_labels) * num_classes 50 | labels_one_hot = numpy.zeros((num_labels, num_classes)) 51 | labels_one_hot.flat[index_offset + labels_dense.ravel()] = 1 52 | return labels_one_hot 53 | 54 | 55 | def extract_labels(filename, one_hot=False): 56 | """Extract the labels into a 1D uint8 numpy array [index].""" 57 | print('Extracting', filename) 58 | with gzip.open(filename) as bytestream: 59 | magic = _read32(bytestream) 60 | if magic != 
2049: 61 | raise ValueError( 62 | 'Invalid magic number %d in MNIST label file: %s' % 63 | (magic, filename)) 64 | num_items = _read32(bytestream) 65 | buf = bytestream.read(num_items) 66 | labels = numpy.frombuffer(buf, dtype=numpy.uint8) 67 | if one_hot: 68 | return dense_to_one_hot(labels) 69 | return labels 70 | 71 | 72 | class DataSet(object): 73 | def __init__(self, images, labels, fake_data=False): 74 | if fake_data: 75 | self._num_examples = 10000 76 | else: 77 | assert images.shape[0] == labels.shape[0], ( 78 | "images.shape: %s labels.shape: %s" % (images.shape, 79 | labels.shape)) 80 | self._num_examples = images.shape[0] 81 | # Convert shape from [num examples, rows, columns, depth] 82 | # to [num examples, rows*columns] (assuming depth == 1) 83 | assert images.shape[3] == 1 84 | images = images.reshape(images.shape[0], 85 | images.shape[1] * images.shape[2]) 86 | # Convert from [0, 255] -> [0.0, 1.0]. 87 | images = images.astype(numpy.float32) 88 | images = numpy.multiply(images, 1.0 / 255.0) 89 | self._images = images 90 | self._labels = labels 91 | self._epochs_completed = 0 92 | self._index_in_epoch = 0 93 | 94 | @property 95 | def images(self): 96 | return self._images 97 | 98 | @property 99 | def labels(self): 100 | return self._labels 101 | 102 | @property 103 | def num_examples(self): 104 | return self._num_examples 105 | 106 | @property 107 | def epochs_completed(self): 108 | return self._epochs_completed 109 | 110 | def next_batch(self, batch_size, fake_data=False): 111 | """Return the next `batch_size` examples from this data set.""" 112 | if fake_data: 113 | fake_image = [1.0 for _ in xrange(784)] 114 | fake_label = 0 115 | return [fake_image for _ in xrange(batch_size)], [ 116 | fake_label for _ in xrange(batch_size)] 117 | start = self._index_in_epoch 118 | self._index_in_epoch += batch_size 119 | if self._index_in_epoch > self._num_examples: 120 | # Finished epoch 121 | self._epochs_completed += 1 122 | # Shuffle the data 123 | perm = numpy.arange(self._num_examples) 124 | numpy.random.shuffle(perm) 125 | self._images = self._images[perm] 126 | self._labels = self._labels[perm] 127 | # Start next epoch 128 | start = 0 129 | self._index_in_epoch = batch_size 130 | assert batch_size <= self._num_examples 131 | end = self._index_in_epoch 132 | return self._images[start:end], self._labels[start:end] 133 | 134 | 135 | def read_data_sets(train_dir, fake_data=False, one_hot=False): 136 | class DataSets(object): 137 | pass 138 | 139 | data_sets = DataSets() 140 | if fake_data: 141 | data_sets.train = DataSet([], [], fake_data=True) 142 | data_sets.validation = DataSet([], [], fake_data=True) 143 | data_sets.test = DataSet([], [], fake_data=True) 144 | return data_sets 145 | TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' 146 | TRAIN_LABELS = 'train-labels-idx1-ubyte.gz' 147 | TEST_IMAGES = 't10k-images-idx3-ubyte.gz' 148 | TEST_LABELS = 't10k-labels-idx1-ubyte.gz' 149 | VALIDATION_SIZE = 5000 150 | local_file = maybe_download(TRAIN_IMAGES, train_dir) 151 | train_images = extract_images(local_file) 152 | local_file = maybe_download(TRAIN_LABELS, train_dir) 153 | train_labels = extract_labels(local_file, one_hot=one_hot) 154 | local_file = maybe_download(TEST_IMAGES, train_dir) 155 | test_images = extract_images(local_file) 156 | local_file = maybe_download(TEST_LABELS, train_dir) 157 | test_labels = extract_labels(local_file, one_hot=one_hot) 158 | validation_images = train_images[:VALIDATION_SIZE] 159 | validation_labels = train_labels[:VALIDATION_SIZE] 160 | train_images = 
train_images[VALIDATION_SIZE:] 161 | train_labels = train_labels[VALIDATION_SIZE:] 162 | data_sets.train = DataSet(train_images, train_labels) 163 | data_sets.validation = DataSet(validation_images, validation_labels) 164 | data_sets.test = DataSet(test_images, test_labels) 165 | return data_sets 166 | -------------------------------------------------------------------------------- /examples/tensorflow_grid_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Multilayer Perceptron implementation example using TensorFlow library. 3 | A first network with 100, 100 hidden nodes is trained 4 | Then a series of new networks of different sizes are cloned from it to find the best size 5 | 6 | This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/) 7 | 8 | This is an extension of: https://github.com/aymericdamien/TensorFlow-Examples/ 9 | By: Aymeric Damien 10 | """ 11 | from copy import copy 12 | 13 | import input_data 14 | from net2net.net_2_wider_net import net_2_wider_net 15 | import tensorflow as tf 16 | 17 | mnist = input_data.read_data_sets("/tmp/data/", one_hot=True) 18 | 19 | # Parameters 20 | minimal_model_learning_rate = 0.001 21 | minimal_model_training_epochs = 15 22 | after_resize_training_epochs = 15 23 | after_resize_learning_rate = 0.0005 24 | batch_size = 100 25 | 26 | # Network Parameters 27 | n_input = 784 # MNIST data input (img shape: 28*28) 28 | n_classes = 10 # MNIST total classes (0-9 digits) 29 | minimal_n_hidden_1 = 100 # 1st layer num features 30 | minimal_n_hidden_2 = 100 # 2nd layer num features 31 | 32 | max_nodes_per_layer = 301 33 | node_per_layer_step = 50 34 | 35 | hidden_node_grid_search = [(x, y) for x in range(minimal_n_hidden_1, max_nodes_per_layer, node_per_layer_step) for y in 36 | range(minimal_n_hidden_2, max_nodes_per_layer, node_per_layer_step) if x >= y] 37 | 38 | print("We will be testing the following numbers of hidden nodes in layer 1 and 2:") 39 | print(hidden_node_grid_search) 40 | 41 | # tf Graph input 42 | x = tf.placeholder("float", [None, n_input]) 43 | y = tf.placeholder("float", [None, n_classes]) 44 | learning_rate_tensor = tf.Variable(minimal_model_learning_rate, trainable=False) 45 | 46 | # Create model 47 | def multilayer_perceptron(input_placeholder, _weights, _biases): 48 | layer_1 = tf.nn.relu( 49 | tf.add(tf.matmul(input_placeholder, _weights[0]), _biases[0])) # Hidden layer with RELU activation 50 | layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights[1]), _biases[1])) # Hidden layer with RELU activation 51 | prediction_op = tf.matmul(layer_2, _weights[2]) + _biases[2] 52 | cost_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(prediction_op, y)) # Softmax loss 53 | train_op = tf.train.AdamOptimizer(learning_rate=learning_rate_tensor).minimize(cost_op) 54 | return cost_op, train_op, prediction_op 55 | 56 | 57 | # method for training a network 58 | def training_cycle(session, cost_fn, train_op, prediction, epochs): 59 | for epoch in range(epochs): 60 | avg_cost = 0. 
61 | total_batch = int(mnist.train.num_examples / batch_size) 62 | # Loop over all batches 63 | for i in range(total_batch): 64 | batch_xs, batch_ys = mnist.train.next_batch(batch_size) 65 | # Fit training using batch data 66 | session.run(train_op, feed_dict={x: batch_xs, y: batch_ys}) 67 | # Compute average loss 68 | avg_cost += session.run(cost_fn, feed_dict={x: batch_xs, y: batch_ys}) / total_batch 69 | 70 | print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost)) 71 | 72 | print("Optimization Finished!") 73 | # Test model 74 | correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1)) 75 | # Calculate accuracy 76 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 77 | train_accuracy = accuracy.eval({x: mnist.train.images, y: mnist.train.labels}) 78 | test_accuracy = accuracy.eval({x: mnist.test.images, y: mnist.test.labels}) 79 | print("Train Accuracy: %s Test Accuracy: %s" % (train_accuracy, test_accuracy)) 80 | 81 | return avg_cost, train_accuracy, test_accuracy 82 | 83 | 84 | def clone_wider_network(minimal_network_weights, 85 | minimal_network_biases, 86 | new_n_hidden_nodes_1, 87 | new_n_hidden_nodes_2): 88 | print("Creating network with hidden nodes %s, %s" % (new_n_hidden_nodes_1, new_n_hidden_nodes_2)) 89 | new_weights = copy(minimal_network_weights) 90 | new_biases = copy(minimal_network_biases) 91 | # expand the layers that need expanding 92 | if new_biases[0].shape[0] < new_n_hidden_nodes_1: 93 | new_weights[0], new_biases[0], new_weights[1] = net_2_wider_net(new_weights[0], new_biases[0], 94 | new_weights[1], 95 | new_layer_size=new_n_hidden_nodes_1, 96 | noise_std=0.01) 97 | 98 | if new_biases[1].shape[0] < new_n_hidden_nodes_2: 99 | new_weights[1], new_biases[1], new_weights[2] = net_2_wider_net(new_weights[1], new_biases[1], 100 | new_weights[2], 101 | new_layer_size=new_n_hidden_nodes_2, 102 | noise_std=0.01) 103 | 104 | weights_variables = [ 105 | tf.Variable(new_weights[0]), 106 | tf.Variable(new_weights[1]), 107 | tf.Variable(new_weights[2])] 108 | 109 | biases_variables = [ 110 | tf.Variable(new_biases[0]), 111 | tf.Variable(new_biases[1]), 112 | tf.Variable(new_biases[2])] 113 | 114 | return weights_variables, biases_variables 115 | 116 | # Store layers weight & bias 117 | weight_variables = [tf.Variable(tf.random_normal([n_input, minimal_n_hidden_1])), 118 | tf.Variable(tf.random_normal([minimal_n_hidden_1, minimal_n_hidden_2])), 119 | tf.Variable(tf.random_normal([minimal_n_hidden_2, n_classes]))] 120 | 121 | bias_variables = [tf.Variable(tf.zeros([minimal_n_hidden_1])), 122 | tf.Variable(tf.zeros([minimal_n_hidden_2])), 123 | tf.Variable(tf.zeros([n_classes]))] 124 | 125 | # Construct model 126 | cost, optimizer, pred = multilayer_perceptron(x, weight_variables, bias_variables) 127 | 128 | results = [] 129 | 130 | # Launch the graph 131 | with tf.Session() as sess: 132 | new_all_variables = set(tf.all_variables()) 133 | sess.run(tf.initialize_variables(new_all_variables)) 134 | old_all_variables = new_all_variables 135 | 136 | training_cycle(sess, cost, optimizer, pred, minimal_model_training_epochs) 137 | 138 | # get the values of the trained parameters 139 | minimal_network_weights = list(sess.run(weight_variables)) 140 | minimal_network_biases = list(sess.run(bias_variables)) 141 | 142 | sess.run(learning_rate_tensor.assign(after_resize_learning_rate))  # run the assign op so the lower learning rate actually takes effect 143 | 144 | for n_layer_h1, n_layer_h2 in hidden_node_grid_search: 145 | weight_variables, bias_variables = clone_wider_network(minimal_network_weights, 
minimal_network_biases, 146 | n_layer_h1, n_layer_h2) 147 | 148 | new_cost, new_optimizer, new_pred = multilayer_perceptron(x, weight_variables, bias_variables) 149 | 150 | # must initialize variables for the new net 151 | new_all_variables = set(tf.all_variables()) 152 | sess.run(tf.initialize_variables(new_all_variables - old_all_variables)) 153 | old_all_variables = new_all_variables 154 | 155 | cost, train, test = training_cycle(sess, new_cost, new_optimizer, new_pred, after_resize_training_epochs) 156 | results.append((n_layer_h1, n_layer_h2, cost, train, test)) 157 | 158 | print(results) 159 | print("best was " + str(max(results, key=lambda a: a[-1]))) 160 | -------------------------------------------------------------------------------- /examples/tensorflow_mnist.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Multilayer Perceptron implementation example using TensorFlow library. 3 | After training we then create a 2nd network which is a bigger version of the 1st network and train on that 4 | 5 | This example is using the MNIST database of handwritten digits (http://yann.lecun.com/exdb/mnist/) 6 | 7 | This is an extension of: https://github.com/aymericdamien/TensorFlow-Examples/ 8 | By: Aymeric Damien 9 | """ 10 | import input_data 11 | from net2net.net_2_wider_net import net_2_wider_net 12 | import tensorflow as tf 13 | 14 | mnist = input_data.read_data_sets("/tmp/data/", one_hot=True) 15 | 16 | # Parameters 17 | learning_rate = 0.01 18 | training_epochs = 10 19 | batch_size = 100 20 | display_step = 1 21 | 22 | # Network Parameters 23 | n_hidden_1 = 256 # 1st layer num features 24 | n_hidden_2 = 40 # 2nd layer num features, for this example it is initially over-constrained 25 | n_input = 784 # MNIST data input (img shape: 28*28) 26 | n_classes = 10 # MNIST total classes (0-9 digits) 27 | n_hidden_2_nodes_after_resize = 200 28 | 29 | # tf Graph input 30 | x = tf.placeholder("float", [None, n_input]) 31 | y = tf.placeholder("float", [None, n_classes]) 32 | 33 | 34 | # Create model 35 | def multilayer_perceptron(_X, _weights, _biases): 36 | layer_1 = tf.nn.relu(tf.add(tf.matmul(_X, _weights[0]), _biases[0])) # Hidden layer with RELU activation 37 | layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, _weights[1]), _biases[1])) # Hidden layer with RELU activation 38 | return tf.matmul(layer_2, _weights[2]) + _biases[2] 39 | 40 | 41 | # method for training a network 42 | def training_cycle(session, cost_function, train_op, prediction): 43 | for epoch in range(training_epochs): 44 | avg_cost = 0. 45 | total_batch = int(mnist.train.num_examples / batch_size) 46 | # Loop over all batches 47 | for i in range(total_batch): 48 | batch_xs, batch_ys = mnist.train.next_batch(batch_size) 49 | # Fit training using batch data 50 | session.run(train_op, feed_dict={x: batch_xs, y: batch_ys}) 51 | # Compute average loss 52 | avg_cost += session.run(cost_function, feed_dict={x: batch_xs, y: batch_ys}) / total_batch 53 | # Display logs per epoch step 54 | if epoch % display_step == 0: 55 | print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost)) 56 | print("Optimization Finished!") 
57 | # Test model 58 | correct_prediction = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1)) 59 | # Calculate accuracy 60 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 61 | print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels})) 62 | 63 | 64 | # Store layers weight & bias 65 | weights = [ 66 | tf.Variable(tf.random_normal([n_input, n_hidden_1])), 67 | tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])), 68 | tf.Variable(tf.random_normal([n_hidden_2, n_classes]))] 69 | 70 | biases = [ 71 | tf.Variable(tf.random_normal([n_hidden_1])), 72 | tf.Variable(tf.random_normal([n_hidden_2])), 73 | tf.Variable(tf.random_normal([n_classes]))] 74 | 75 | # Construct model 76 | pred = multilayer_perceptron(x, weights, biases) 77 | 78 | # Define loss and optimizer 79 | cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(pred, y)) 80 | optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost) 81 | 82 | # Initializing the variables 83 | init = tf.initialize_all_variables() 84 | 85 | # Launch the graph 86 | with tf.Session() as sess: 87 | sess.run(init) 88 | 89 | training_cycle(sess, cost, optimizer, pred) 90 | 91 | print("creating new network, increasing size of 2nd hidden layer from %s to %s" % ( 92 | n_hidden_2, n_hidden_2_nodes_after_resize)) 93 | 94 | # now we have trained the model let's copy the parameters and make one of the layers wider 95 | trained_weights_h1, trained_weights_h2, trained_weights_output = sess.run( 96 | [weights[0], weights[1], weights[2]]) 97 | trained_bias_h1, trained_bias_h2, trained_bias_output = sess.run([biases[0], biases[1], biases[2]]) 98 | 99 | # make the 2nd layer bigger 100 | new_weights_h2, new_bias_h2, new_weights_output = net_2_wider_net(trained_weights_h2, trained_bias_h2, 101 | trained_weights_output, 102 | new_layer_size=n_hidden_2_nodes_after_resize) 103 | 104 | # create new network with the changed weights, you can also simply reassign the existing variables to have these 105 | # new values, if validate_shape is set to False it will all still work 106 | new_weights = [ 107 | tf.Variable(trained_weights_h1), 108 | tf.Variable(new_weights_h2), 109 | tf.Variable(new_weights_output)] 110 | 111 | new_biases = [ 112 | tf.Variable(trained_bias_h1), 113 | tf.Variable(new_bias_h2), 114 | tf.Variable(trained_bias_output)] 115 | 116 | # must initialize all these variables 117 | sess.run(tf.initialize_variables(new_weights + new_biases)) 118 | 119 | new_pred = multilayer_perceptron(x, new_weights, new_biases) 120 | 121 | new_cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(new_pred, y)) 122 | # if we were to use an Adam optimizer we would have to do something clever about initializing its variables 123 | new_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(new_cost) 124 | 125 | # loss will hopefully be better with more nodes in the 2nd layer 126 | training_cycle(sess, new_cost, new_optimizer, new_pred) 127 | -------------------------------------------------------------------------------- /net2net/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/Net2Net/0f7f39956923f0b6591c5205d51dad537e889abd/net2net/__init__.py -------------------------------------------------------------------------------- /net2net/net_2_deeper_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def 
net_2_deeper_net(bias, noise_std=0.01): 5 | """ 6 | This is a similar idea to net 2 deeper net from http://arxiv.org/pdf/1511.05641.pdf 7 | Assumes that this is a linear layer that is being extended and also adds some noise 8 | 9 | Args: 10 | bias (numpy.array): The bias for the layer we are adding after 11 | noise_std (Optional float): The amount of normal noise to add to the layer. 12 | If None then no noise is added 13 | Default is 0.01 14 | Returns: 15 | (numpy.matrix, numpy.array) 16 | The first item is the weights for the new layer 17 | Second item is the bias for the new layer 18 | """ 19 | new_weights = np.matrix(np.eye(bias.shape[0], dtype=bias.dtype)) 20 | new_bias = np.zeros(bias.shape, dtype=bias.dtype) 21 | 22 | if noise_std: 23 | new_weights = new_weights + np.random.normal(scale=noise_std, size=new_weights.shape) 24 | new_bias = new_bias + np.random.normal(scale=noise_std, size=new_bias.shape) 25 | 26 | return new_weights, new_bias 27 | -------------------------------------------------------------------------------- /net2net/net_2_wider_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def net_2_wider_net(weights, bias, weights_next_layer, 5 | noise_std=0.01, 6 | new_layer_size=None, 7 | split_max_weight_else_random=True): 8 | """ 9 | Numpy implementation of net 2 wider net from http://arxiv.org/pdf/1511.05641.pdf 10 | 11 | Args: 12 | weights (numpy.matrix|numpy.ndarray): The weights for the layer 13 | bias (numpy.array): The bias for the layer 14 | weights_next_layer (numpy.matrix|numpy.ndarray): The weights for the next layer 15 | noise_std (Optional float): The amount of noise to add to the weights when we expand 16 | If None no noise is added 17 | new_layer_size (Optional int): The size the new layer should be. If none the size is set to 1 larger than the 18 | current layer 19 | split_max_weight_else_random (bool): If True we split by selecting the node with the largest activation. 
20 | If False we split a random node 21 | 22 | Returns: 23 | (numpy.matrix, numpy.array, numpy.matrix): This tuple contains 24 | the new_weights, new_bias and new_weights_next_layer 25 | 26 | These will all be expanded to new_layer_size (1 larger than the current layer if new_layer_size is None) as specified by the Net2Net paper 27 | 28 | Raises: 29 | ValueError: If the weights shape second dimension doesn't equal the bias dimension 30 | If the bias dimension doesn't equal the weights_next_layer first dimension 31 | If the new_layer_size is not greater than the bias dimension 32 | """ 33 | if weights.shape[1] != bias.shape[0]: 34 | raise ValueError('weights with shape %s must have same last dimension as bias which had shape %s' % 35 | (weights.shape, bias.shape)) 36 | 37 | if bias.shape[0] != weights_next_layer.shape[0]: 38 | raise ValueError( 39 | 'bias with shape %s must have same size as weights_next_layer first dimension which has shape %s' % 40 | (bias.shape, weights_next_layer.shape)) 41 | 42 | if new_layer_size is None: 43 | new_layer_size = bias.shape[0] + 1 44 | elif new_layer_size <= bias.shape[0]: 45 | raise ValueError('New layer size must be greater than current layer size') 46 | 47 | while bias.shape[0] < new_layer_size: 48 | weights, bias, weights_next_layer = _net_2_wider_net_increase_size_by_one(weights, bias, 49 | weights_next_layer, 50 | noise_std, 51 | split_max_weight_else_random) 52 | 53 | return weights, bias, weights_next_layer 54 | 55 | 56 | def _net_2_wider_net_increase_size_by_one(weights, bias, weights_next_layer, 57 | noise_std=0.01, 58 | split_max_weight_else_random=True): 59 | if split_max_weight_else_random: 60 | # find the node with the highest activation (for an all-ones input) 61 | split_index = np.argmax((np.dot(np.ones(weights.shape[0]), weights)) + bias) 62 | else: 63 | # randomly select a node to split, a new node will be created with the same weights as this one. 
64 | split_index = np.random.randint(0, weights.shape[1]) 65 | 66 | # add split node weights to layer weights 67 | node_to_split_weights = weights[:, split_index] 68 | 69 | # add new node bias to bias 70 | new_bias = np.r_[bias, [bias[split_index]]] 71 | 72 | # reduce the output connections to the next layer by half for the split node and the new node 73 | # this means the activation of the next layer will remain unchanged 74 | output_weights_for_split_node = weights_next_layer[split_index, :] * .5 75 | 76 | # if we got an ndarry as input we need to pad it out 77 | if output_weights_for_split_node.ndim == 1: 78 | output_weights_for_split_node = np.reshape(output_weights_for_split_node, 79 | (1, output_weights_for_split_node.shape[0])) 80 | 81 | if noise_std: 82 | weight_noise = np.random.normal(scale=noise_std, size=node_to_split_weights.shape) 83 | node_to_split_weights += weight_noise 84 | 85 | bias_noise = np.random.normal(scale=noise_std) 86 | new_bias[-1] += bias_noise 87 | new_bias[split_index] -= bias_noise 88 | 89 | output_weights_for_split_node += np.random.normal(scale=noise_std, 90 | size=output_weights_for_split_node.shape) 91 | 92 | new_weights = np.c_[weights, node_to_split_weights] 93 | 94 | new_weights_next_layer = np.r_[weights_next_layer, 95 | output_weights_for_split_node] 96 | 97 | new_weights_next_layer[split_index, :] *= .5 98 | 99 | return new_weights, new_bias, new_weights_next_layer 100 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DanielSlater/Net2Net/0f7f39956923f0b6591c5205d51dad537e889abd/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_net_2_deeper_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | 4 | from net2net.net_2_deeper_net import net_2_deeper_net 5 | 6 | 7 | class TestNet2DeeperNet(TestCase): 8 | SMALL_NOISE_EPSILON = 1e-6 9 | WEIGHTS = np.matrix([[1.0, 0.1, 0.5], [1.0, 0.1, 0.5]]) 10 | BIAS = np.array([0.0, 0.0, 0.0]) 11 | WEIGHTS_NEXT_LAYER = np.matrix([[1.0], [0.2], [0.5]]) 12 | 13 | def test_activation_should_be_unchanged_after_adding_layer(self): 14 | inputs = np.array([0.1, 0.9]) 15 | 16 | activation_pre_deepening = ((inputs * self.WEIGHTS) + self.BIAS) 17 | 18 | weight_new_layer, bias_new_layer = net_2_deeper_net(self.BIAS, noise_std=0.0001) 19 | 20 | activation_post_deepening = (activation_pre_deepening * weight_new_layer) + bias_new_layer 21 | 22 | np.testing.assert_array_almost_equal(activation_pre_deepening, activation_post_deepening, decimal=2, 23 | err_msg='Activation should be unchanged after adding a new layer') 24 | -------------------------------------------------------------------------------- /tests/test_net_2_wider_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | 4 | from net2net.net_2_wider_net import net_2_wider_net 5 | 6 | 7 | class TestNet2WiderNet(TestCase): 8 | SMALL_NOISE_EPSILON = 1e-6 9 | WEIGHTS = np.matrix([[1.0, 0.1, 0.5], [1.0, 0.1, 0.5]]) 10 | BIAS = np.array([0.0, 0.0, 0.0]) 11 | WEIGHTS_NEXT_LAYER = np.matrix([[1.0], [0.2], [0.5]]) 12 | 13 | def test_one_wider_random_split(self): 14 | inputs = np.array([0.1, 0.9]) 15 | 16 | activation_pre_widening = ((inputs * self.WEIGHTS) + self.BIAS) * 
self.WEIGHTS_NEXT_LAYER 17 | 18 | weights_post, bias_post, weights_next_layer_post = net_2_wider_net(self.WEIGHTS, self.BIAS, 19 | self.WEIGHTS_NEXT_LAYER, 20 | noise_std=self.SMALL_NOISE_EPSILON, 21 | split_max_weight_else_random=False) 22 | 23 | activation_post_net_widening = ((inputs * weights_post) + bias_post) * weights_next_layer_post 24 | 25 | np.testing.assert_array_almost_equal(activation_pre_widening, activation_post_net_widening, decimal=2, 26 | err_msg='activation should not be significantly changed after widening') 27 | 28 | self.assertEqual(self.BIAS.shape[0] + 1, bias_post.shape[0], msg='bias should be one larger') 29 | 30 | def test_x_wider(self): 31 | inputs = np.array([0.1, 0.9]) 32 | new_layer_size = 8 33 | 34 | activation_pre_widening = ((inputs * self.WEIGHTS) + self.BIAS) * self.WEIGHTS_NEXT_LAYER 35 | 36 | weights_post, bias_post, weights_next_layer_post = net_2_wider_net(self.WEIGHTS, self.BIAS, 37 | self.WEIGHTS_NEXT_LAYER, 38 | noise_std=self.SMALL_NOISE_EPSILON, 39 | split_max_weight_else_random=False, 40 | new_layer_size=new_layer_size) 41 | 42 | activation_post_net_widening = ((inputs * weights_post) + bias_post) * weights_next_layer_post 43 | 44 | np.testing.assert_array_almost_equal(activation_pre_widening, activation_post_net_widening, decimal=2, 45 | err_msg='activation should not be significantly changed after widening') 46 | 47 | self.assertEqual(new_layer_size, bias_post.shape[0], msg='bias should same size as new_layer_size') 48 | 49 | def test_one_wider_max_split(self): 50 | inputs = np.array([0.1, 0.9]) 51 | 52 | activation_pre_widening = ((inputs * self.WEIGHTS) + self.BIAS) * self.WEIGHTS_NEXT_LAYER 53 | 54 | weights_post, bias_post, weights_next_layer_post = net_2_wider_net(self.WEIGHTS, self.BIAS, 55 | self.WEIGHTS_NEXT_LAYER, 56 | noise_std=self.SMALL_NOISE_EPSILON, 57 | split_max_weight_else_random=True) 58 | 59 | activation_post_net_widening = ((inputs * weights_post) + bias_post) * weights_next_layer_post 60 | 61 | np.testing.assert_array_almost_equal(activation_pre_widening, activation_post_net_widening, decimal=2, 62 | err_msg='activation should not be significantly changed after widening') 63 | 64 | self.assertEqual(self.BIAS.shape[0] + 1, bias_post.shape[0], msg='bias should be one larger') 65 | self.assertAlmostEqual(self.WEIGHTS_NEXT_LAYER[0, 0] / 2., weights_next_layer_post[0, 0], 66 | msg='this weight was the max so should have been split in 2') 67 | 68 | def test_no_noise(self): 69 | weights_post, bias_post, weights_next_layer_post = net_2_wider_net(self.WEIGHTS, self.BIAS, 70 | self.WEIGHTS_NEXT_LAYER, 71 | noise_std=None, 72 | split_max_weight_else_random=True) 73 | 74 | self.assertEqual(self.WEIGHTS_NEXT_LAYER[1, 0], weights_next_layer_post[1, 0], 75 | msg='this weight was not the max so should not have been split so should be exactly equal') 76 | 77 | def test_one_wider_max_split_with_ndarray(self): 78 | inputs = np.array([0.1, 0.9]) 79 | 80 | # make sure it can handle these being ndarrays rather than matrix 81 | weights_as_array = np.asarray(self.WEIGHTS) 82 | weights_next_layer_as_array = np.asarray(self.WEIGHTS_NEXT_LAYER) 83 | 84 | activation_pre_widening = np.dot((np.dot(inputs, weights_as_array) + self.BIAS), weights_next_layer_as_array) 85 | 86 | weights_post, bias_post, weights_next_layer_post = net_2_wider_net(weights_as_array, self.BIAS, 87 | weights_next_layer_as_array, 88 | noise_std=self.SMALL_NOISE_EPSILON, 89 | split_max_weight_else_random=True) 90 | 91 | self.assertEqual(type(weights_post), np.ndarray, 92 | msg='if 
we give the inputs as ndarrays we expect them to come back as ndarrays') 93 | self.assertEqual(type(weights_next_layer_post), np.ndarray, 94 | msg='if we give the inputs as ndarrays we expect them to come back as ndarrays') 95 | 96 | activation_post_net_widening = np.dot((np.dot(inputs, weights_post) + bias_post), weights_next_layer_post) 97 | 98 | np.testing.assert_array_almost_equal(activation_pre_widening, activation_post_net_widening, decimal=2, 99 | err_msg='activation should not be significantly changed after widening') 100 | 101 | self.assertEqual(self.BIAS.shape[0] + 1, bias_post.shape[0], msg='bias should be one larger') 102 | self.assertAlmostEqual(self.WEIGHTS_NEXT_LAYER[0, 0] / 2., weights_next_layer_post[0, 0], 103 | msg='this weight was the max so should have been split in 2') --------------------------------------------------------------------------------
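For reference, here is a minimal end-to-end sketch that mirrors the README snippets and the unit tests above: it widens a toy linear layer with net_2_wider_net, then inserts an identity-initialized layer after it with net_2_deeper_net, and checks that the activation is (almost) unchanged. The layer sizes and input values below are illustrative only, not part of the library.

    import numpy as np

    from net2net.net_2_deeper_net import net_2_deeper_net
    from net2net.net_2_wider_net import net_2_wider_net

    inputs = np.array([0.1, 0.9])
    weights = np.matrix([[1.0, 0.1, 0.5], [1.0, 0.1, 0.5]])  # 2 inputs -> 3 hidden units
    bias = np.array([0.0, 0.0, 0.0])
    weights_next_layer = np.matrix([[1.0], [0.2], [0.5]])    # 3 hidden units -> 1 output

    # activation of the original (purely linear) toy network
    activation_before = ((inputs * weights) + bias) * weights_next_layer

    # widen the hidden layer from 3 to 5 units
    weights, bias, weights_next_layer = net_2_wider_net(weights, bias,
                                                        weights_next_layer,
                                                        new_layer_size=5,
                                                        noise_std=1e-6)

    # insert an identity-initialized layer straight after the (now wider) hidden layer
    new_layer_weights, new_layer_bias = net_2_deeper_net(bias, noise_std=1e-6)

    activation_after = ((((inputs * weights) + bias) * new_layer_weights) + new_layer_bias) * weights_next_layer

    # up to the tiny symmetry-breaking noise, the grown network computes the same function
    np.testing.assert_array_almost_equal(activation_before, activation_after, decimal=2)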