├── utilities
│   ├── goliat2_script.py
│   ├── goliat3_script.py
│   ├── goliat4_script.py
│   ├── results_extractor.py
│   ├── svhn_preprocessing.py
│   └── filter_plot.py
├── README.md
├── format.py
├── main.py
├── trainer.py
├── LICENSE.txt
├── model.py
└── layer.py
/utilities/goliat2_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7.5*60*60 # CIFAR10 takes ~9h in float32 with 1.3x more neurons, partial sum = 1, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [9,11]
12 | up_NOB = [11,13]
13 | NOIB = [8]
14 | dynamic_range = [1]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | gpu_id = gpu_id+1
34 | time.sleep(delay_between_launches)
35 |
36 |
--------------------------------------------------------------------------------
/utilities/goliat3_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7.5*60*60 # CIFAR10 takes ~9h in float32 with 1.3x more neurons, partial sum = 1, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [19,21]
12 | up_NOB = [17,19]
13 | NOIB = [5]
14 | dynamic_range = [0]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | gpu_id = gpu_id+1
34 | time.sleep(delay_between_launches)
35 |
36 |
--------------------------------------------------------------------------------
/utilities/goliat4_script.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | # gpu
5 | nb_gpu = 4
6 | delay_between_launches = 5*60 # 5 minutes
7 | time_per_job = 7*60*60 # ~9h in float32 with 1.3x more neurons, partial sum, no range updates, 200 epochs
8 | gpu_id = 0
9 |
10 | # hyper parameters lists
11 | comp_NOB = [31]
12 | up_NOB = [31]
13 | NOIB = [3,5,6]
14 | dynamic_range = [0]
15 |
16 | for i in comp_NOB:
17 | for j in up_NOB:
18 | for k in NOIB:
19 | for l in dynamic_range:
20 |
21 | command = "THEANO_FLAGS='device=gpu"+str(gpu_id)+"' python main.py "+str(i)+" "+str(j)+" "+str(k)+" "+str(l)+" &> "+str(i)+"_"+str(j)+"_"+str(k)+"_"+str(l)+".txt &"
22 | os.system(command)
23 | print command
24 |
25 | if gpu_id == nb_gpu - 1:
26 |
27 | gpu_id = 0
28 | time.sleep(time_per_job)
29 | print " "
30 |
31 | else:
32 |
33 | if gpu_id == 0:
34 | gpu_id = gpu_id+2 # gpu1 does not work :(
35 | else:
36 | gpu_id = gpu_id+1
37 |
38 | time.sleep(delay_between_launches)
--------------------------------------------------------------------------------
/utilities/results_extractor.py:
--------------------------------------------------------------------------------
1 |
2 | import csv
3 | import re
4 |
5 | comp_precision = [19,21,23]
6 | update_precision = [17,19]
7 | initial_range = [5]
8 | dynamic_range = [0]
9 |
10 | csv_file = open('X_X_5_0.csv', 'w')
11 | csv_writer = csv.writer(csv_file, lineterminator = '\n')
12 | csv_writer.writerow(["comp_precision","update_precision", "initial_range","dynamic_range","validation_error","test_error"])
13 |
14 | for j in comp_precision:
15 | for k in update_precision:
16 | for l in initial_range:
17 | for m in dynamic_range:
18 |
19 | name = str(j) + "_" + str(k) + "_" + str(l) + "_" + str(m) + ".txt"
20 | f = open(name, 'r').readlines()
21 |
22 | length = len(f)
23 |
24 | print f[length-3]
25 | validation_error = float(re.findall(r"\d+\.\d+", f[length-3])[0])/100.
26 |
27 | print f[length-2]
28 | test_error = float(re.findall(r"\d+\.\d+", f[length-2])[0])/100.
29 |
30 | # print f[length-3-44-1]
31 | # validation_error = float(re.findall("\d+.\d+", f[length-3-44-1])[0])/100.
32 |
33 | # print f[length-2-44-1]
34 | # test_error = float(re.findall("\d+.\d+", f[length-2-44-1])[0])/100.
35 |
36 | csv_writer.writerow([j,k,l,m,validation_error,test_error])
--------------------------------------------------------------------------------
/utilities/svhn_preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import shutil
4 | from theano import config
5 | from pylearn2.datasets import preprocessing
6 | from pylearn2.datasets.svhn import SVHN
7 | from pylearn2.utils.string_utils import preprocess
8 |
9 | orig_path = preprocess('${PYLEARN2_DATA_PATH}/SVHN/format2')
10 | try:
11 | local_path = preprocess('${SVHN_LOCAL_PATH}')
12 | except ValueError:
13 | raise ValueError("You need to define SVHN_LOCAL_PATH environment "
14 | "variable.")
15 |
16 | train_name ='h5/splitted_train_32x32.h5'
17 | valid_name = 'h5/valid_32x32.h5'
18 | test_name = 'h5/test_32x32.h5'
19 |
20 | # copy the data if it does not exist yet
21 | if not os.path.isdir(os.path.join(local_path, 'h5')):
22 | os.makedirs(os.path.join(local_path, 'h5'))
23 |
24 | for d_set in [train_name, valid_name, test_name]:
25 | if not os.path.isfile(os.path.join(local_path, d_set)):
26 | logging.info("Copying data from {0} to {1}".format(os.path.join(orig_path, d_set), os.path.join(local_path, d_set)))
27 | shutil.copyfile(os.path.join(orig_path, d_set),
28 | os.path.join(local_path, d_set))
29 |
30 | def check_dtype(data):
31 | if str(data.X.dtype) != config.floatX:
32 | logging.warning("The dataset is saved as {}, changing theano's floatX "\
33 | "to the same dtype".format(data.X.dtype))
34 | config.floatX = str(data.X.dtype)
35 |
36 | # Load train data
37 | train = SVHN('splitted_train', path=local_path)
38 | check_dtype(train)
39 |
40 | # prepare preprocessing
41 | pipeline = preprocessing.Pipeline()
42 | # without batch_size there is a high chance that you might encounter a memory error
43 | # or a PyTables crash
44 | pipeline.items.append(preprocessing.GlobalContrastNormalization(batch_size=5000))
45 | pipeline.items.append(preprocessing.LeCunLCN((32,32)))
46 |
47 | # apply the preprocessings to train
48 | train.apply_preprocessor(pipeline, can_fit=True)
49 | del train
50 |
51 | # load and preprocess valid
52 | valid = SVHN('valid', path=local_path)
53 | check_dtype(valid)
54 | valid.apply_preprocessor(pipeline, can_fit=False)
55 |
56 | # load and preprocess test
57 | test = SVHN('test', path=local_path)
58 | check_dtype(test)
59 | test.apply_preprocessor(pipeline, can_fit=False)
60 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # deep-learning-multipliers
2 |
3 | ## Requirements
4 |
5 | * Theano 0.6 (Bleeding edge version)
6 | * Pylearn2 0.1
7 | * PyTables (for the SVHN dataset)
8 | * a CUDA capable GPU
9 |
10 | ## Goal
11 |
12 | This code was written to allow anyone to easily reproduce the results
13 | of the article "Deep learning with low precision multipliers", available at http://arxiv.org/abs/1412.7024.
14 | The article assesses whether it is possible to train deep neural networks with low-precision multipliers.
15 |
16 | Note that this code only simulates the impact of low-precision multipliers.
17 | It does not exploit them in any way.
18 | If you are looking for fast low-precision GPU kernels, NervanaSystems has made some available at https://github.com/NervanaSystems/nervanagpu.
19 |
20 | ## How to run it
21 |
22 | ### Command line
23 |
24 | python main.py [task] [format] [initial range] [propagations bit-width]
25 | [parameter updates bit-width] [range update frequency]
26 | [maximum overflow rate] [number of epochs of range initialization]
27 |
28 | ### Task
29 |
30 | There are 4 different tasks: the permutation invariant MNIST (PI_MNIST),
31 | MNIST, CIFAR10 and SVHN.
32 | A set of hyperparameters is associated with each of those tasks
33 | (they are stored in model.py).
34 | For the SVHN dataset,
35 | you need to set an environment variable:
36 |
37 | SVHN_LOCAL_PATH=/tmp/SVHN/
38 |
39 | You then need to pre-process it with the script
40 | utilities/svhn_preprocessing.py (script taken from pylearn2).
41 |
42 | ### Format
43 |
44 | There are 4 different formats: floating point (FLP),
45 | half floating point (HFLP),
46 | fixed point (FXP) and dynamic fixed point (DFXP).
47 |
48 | ### Initial range
49 |
50 | Initial range is only useful for FXP and DFXP.
51 | It is the initial position of the radix point
52 | for the fixed point formats.
53 | 5 works most of the time.
54 |
55 | ### Propagations and parameter updates bit-widths
56 |
57 | Only useful for FXP and DFXP.
58 | These are the bit-widths of the propagations
59 | and of the parameter updates, respectively.
60 | Note that the sign bit is not counted in the bit-width.
61 |
62 | ### Range update frequency
63 |
64 | The range update frequency is only useful for DFXP.
65 | It is the number of batches between two range updates.
66 |
67 | ### Maximum overflow rate
68 |
69 | Only useful for DFXP.
70 | It is the overflow rate tolerated before the range is modified.
71 |
72 | ### Number of epochs of range initialization
73 |
74 | Only useful for DFXP.
75 | This is the number of epochs we train with high precision
76 | to find the initial scaling factors.
77 | Once they are found,
78 | the parameters are reinitialized, and the DFXP training can begin.
79 |
80 | ### Examples
81 |
82 | python main.py PI_MNIST FLP
83 | python main.py SVHN FXP 5 19 19
84 | python main.py CIFAR10 DFXP 5 9 11 100 0.0001 2
85 |
86 |
--------------------------------------------------------------------------------
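A minimal end-to-end sketch of the SVHN setup described in the README above, using the README's suggested path and one of its example runs:

    export SVHN_LOCAL_PATH=/tmp/SVHN/
    python utilities/svhn_preprocessing.py
    python main.py SVHN FXP 5 19 19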
/format.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | from theano.scalar.basic import UnaryScalarOp, same_out_nocomplex
29 | from theano.tensor.elemwise import Elemwise
30 |
31 | def apply_format(format, X, NOB, NOIB):
32 |
33 | if format == "FXP" or format == "DFXP":
34 | return fixed_point(X,NOB, NOIB)
35 |
36 | elif format == "FLP":
37 | return X
38 |
39 | elif format == "HFLP":
40 | return float16(X)
41 |
42 | # float16 function
43 | # we are using the nvidia cuda function (only works on GPU)
44 | class Float16(UnaryScalarOp):
45 |
46 | def impl(self, x):
47 | return np.float32(np.float16(x))
48 |
49 | def c_code(self, node, name, (x,), (z,), sub):
50 | return "%(z)s = __half2float(__float2half_rn(%(x)s));" % locals()
51 | float16_scalar = Float16(same_out_nocomplex, name='float16')
52 | float16 = Elemwise(float16_scalar)
53 |
54 | # this function simulates the precision and the range of a fixed-point
55 | # representation while working with floats
56 | # NOB = Number Of Bits = bit-width
57 | # NOIB = Number Of Integer Bits = position of the radix point = range
58 | def fixed_point(X,NOB, NOIB):
59 |
60 | power = T.cast(2.**(NOB - NOIB), theano.config.floatX) # float !
61 | max = T.cast((2.**NOB)-1, theano.config.floatX)
62 | value = X*power
63 | value = T.round(value) # rounding
64 | value = T.clip(value, -max, max) # saturation arithmetic
65 | value = value/power
66 | return value
67 |
68 | # compute the new range of the dynamic fixed point representation
69 | def new_range(overflow, overflow_1, max_overflow):
70 |
71 | # the goal is to update the range of the vector
72 | # we know the overflow rates associated with range (overflow)
73 | # and range-1 (overflow_1)
74 | # if (overflow > max_overflow): increment the range
75 | # else if (overflow_1 <= max_overflow): decrement the range, otherwise keep it
76 | return T.switch(T.gt(overflow, max_overflow), 1,
77 | T.switch(T.gt(overflow_1, max_overflow), 0, - 1))
78 |
79 | # Overflow rate of a vector knowing its NOIB and NOB
80 | def overflow(vector, NOB, NOIB):
81 |
82 | # compute the max value of the fixed point representation (i.e. the overflow value)
83 | max = ((2.**NOB)-1)/(2.**(NOB - NOIB))
84 |
85 | # compute the overflow rate of the vector
86 | overflow = T.mean(T.switch(T.ge(T.abs_(vector), max), 1., 0.))
87 |
88 | return overflow
89 |
--------------------------------------------------------------------------------
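To make the fixed-point simulation above concrete, here is a small standalone numpy sketch (illustration only; it assumes the same conventions as fixed_point() and overflow() in format.py: NOB excludes the sign bit and NOIB is the number of integer bits):

    import numpy as np

    def fixed_point_np(x, NOB, NOIB):
        # same rounding/saturation scheme as fixed_point() in format.py
        power = 2. ** (NOB - NOIB)                  # 2^(number of fractional bits)
        max_val = (2. ** NOB) - 1.                  # largest magnitude in scaled units
        return np.clip(np.round(x * power), -max_val, max_val) / power

    def overflow_np(x, NOB, NOIB):
        # fraction of entries that saturate, as in overflow() in format.py
        max_val = ((2. ** NOB) - 1.) / (2. ** (NOB - NOIB))
        return np.mean(np.abs(x) >= max_val)

    x = np.array([0.1234, 1.5, -3.9, 20.0])
    print(fixed_point_np(x, NOB=7, NOIB=3))   # [ 0.125  1.5  -3.875  7.9375]
    print(overflow_np(x, NOB=7, NOIB=3))      # 0.25, compared against max_overflow by new_range()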
/utilities/filter_plot.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy
3 |
4 | def scale_to_unit_interval(ndar, eps=1e-8):
5 | """ Scales all values in the ndarray ndar to be between 0 and 1 """
6 | ndar = ndar.copy()
7 | ndar -= ndar.min()
8 | ndar *= 1.0 / (ndar.max() + eps)
9 | return ndar
10 |
11 |
12 | def tile_raster_images(X, img_shape, tile_shape, tile_spacing=(0, 0),
13 | scale_rows_to_unit_interval=True,
14 | output_pixel_vals=True):
15 | """
16 | Transform an array with one flattened image per row, into an array in
17 | which images are reshaped and laid out like tiles on a floor.
18 |
19 | This function is useful for visualizing datasets whose rows are images,
20 | and also columns of matrices for transforming those rows
21 | (such as the first layer of a neural net).
22 |
23 | :type X: a 2-D ndarray or a tuple of 4 channels, elements of which can
24 | be 2-D ndarrays or None;
25 | :param X: a 2-D array in which every row is a flattened image.
26 |
27 | :type img_shape: tuple; (height, width)
28 | :param img_shape: the original shape of each image
29 |
30 | :type tile_shape: tuple; (rows, cols)
31 | :param tile_shape: the number of images to tile (rows, cols)
32 |
33 | :param output_pixel_vals: if output should be pixel values (i.e. int8
34 | values) or floats
35 |
36 | :param scale_rows_to_unit_interval: if the values need to be scaled before
37 | being plotted to [0,1] or not
38 |
39 |
40 | :returns: array suitable for viewing as an image.
41 | (See:`Image.fromarray`.)
42 | :rtype: a 2-d array with same dtype as X.
43 |
44 | """
45 |
46 | assert len(img_shape) == 2
47 | assert len(tile_shape) == 2
48 | assert len(tile_spacing) == 2
49 |
50 | # The expression below can be re-written in a more C style as
51 | # follows :
52 | #
53 | # out_shape = [0,0]
54 | # out_shape[0] = (img_shape[0] + tile_spacing[0]) * tile_shape[0] -
55 | # tile_spacing[0]
56 | # out_shape[1] = (img_shape[1] + tile_spacing[1]) * tile_shape[1] -
57 | # tile_spacing[1]
58 | out_shape = [(ishp + tsp) * tshp - tsp for ishp, tshp, tsp
59 | in zip(img_shape, tile_shape, tile_spacing)]
60 |
61 | if isinstance(X, tuple):
62 | assert len(X) == 4
63 | # Create an output numpy ndarray to store the image
64 | if output_pixel_vals:
65 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype='uint8')
66 | else:
67 | out_array = numpy.zeros((out_shape[0], out_shape[1], 4), dtype=X.dtype)
68 |
69 | #colors default to 0, alpha defaults to 1 (opaque)
70 | if output_pixel_vals:
71 | channel_defaults = [0, 0, 0, 255]
72 | else:
73 | channel_defaults = [0., 0., 0., 1.]
74 |
75 | for i in xrange(4):
76 | if X[i] is None:
77 | # if channel is None, fill it with zeros of the correct
78 | # dtype
79 | out_array[:, :, i] = numpy.zeros(out_shape,
80 | dtype='uint8' if output_pixel_vals else out_array.dtype
81 | ) + channel_defaults[i]
82 | else:
83 | # use a recurrent call to compute the channel and store it
84 | # in the output
85 | out_array[:, :, i] = tile_raster_images(X[i], img_shape, tile_shape, tile_spacing, scale_rows_to_unit_interval, output_pixel_vals)
86 | return out_array
87 |
88 | else:
89 | # if we are dealing with only one channel
90 | H, W = img_shape
91 | Hs, Ws = tile_spacing
92 |
93 | # generate a matrix to store the output
94 | out_array = numpy.zeros(out_shape, dtype='uint8' if output_pixel_vals else X.dtype)
95 |
96 |
97 | for tile_row in xrange(tile_shape[0]):
98 | for tile_col in xrange(tile_shape[1]):
99 | if tile_row * tile_shape[1] + tile_col < X.shape[0]:
100 | if scale_rows_to_unit_interval:
101 | # if we should scale values to be between 0 and 1
102 | # do this by calling the `scale_to_unit_interval`
103 | # function
104 | this_img = scale_to_unit_interval(X[tile_row * tile_shape[1] + tile_col].reshape(img_shape))
105 | else:
106 | this_img = X[tile_row * tile_shape[1] + tile_col].reshape(img_shape)
107 | # add the slice to the corresponding position in the
108 | # output array
109 | out_array[
110 | tile_row * (H+Hs): tile_row * (H + Hs) + H,
111 | tile_col * (W+Ws): tile_col * (W + Ws) + W
112 | ] \
113 | = this_img * (255 if output_pixel_vals else 1)
114 | return out_array
--------------------------------------------------------------------------------
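A hypothetical usage sketch for tile_raster_images() above (the import assumes utilities/ is on the Python path; PIL is only needed if you want to save the result):

    import numpy
    from filter_plot import tile_raster_images

    # 100 random 28x28 "filters", one flattened image per row
    X = numpy.random.rand(100, 28 * 28)

    # tile them into a 10 x 10 mosaic with 1-pixel spacing
    tiled = tile_raster_images(X, img_shape=(28, 28), tile_shape=(10, 10), tile_spacing=(1, 1))

    print(tiled.shape)   # (289, 289), dtype uint8
    # with PIL: Image.fromarray(tiled).save('filters.png')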
/main.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import time
25 |
26 | from trainer import Trainer
27 | from model import PI_MNIST_model, MNIST_model, CIFAR10_SVHN_model
28 |
29 | from pylearn2.datasets.mnist import MNIST
30 | from pylearn2.datasets.zca_dataset import ZCA_Dataset
31 | from pylearn2.datasets.svhn import SVHN
32 | from pylearn2.utils import serial
33 |
34 | def onehot(x,numclasses=None):
35 |
36 | if x.shape==():
37 | x = x[None]
38 | if numclasses is None:
39 | numclasses = np.max(x) + 1
40 | result = np.zeros(list(x.shape) + [numclasses], dtype="int")
41 | z = np.zeros(x.shape)
42 | for c in range(numclasses):
43 | z *= 0
44 | z[np.where(x==c)] = 1
45 | result[...,c] += z
46 |
47 | result = np.reshape(result,(np.shape(result)[0], np.shape(result)[result.ndim-1]))
48 | return result
49 |
50 | # MAIN
51 |
52 | if __name__ == "__main__":
53 |
54 | print 'Beginning of the program'
55 | start_time = time.clock()
56 |
57 | print 'Loading the dataset'
58 |
59 | dataset = sys.argv[1]
60 |
61 | if dataset == "PI_MNIST" or dataset == "MNIST":
62 |
63 | train_set = MNIST(which_set= 'train',start=0, stop = 50000)#, center = True)
64 | valid_set = MNIST(which_set= 'train',start=50000, stop = 60000)#, center = True)
65 | test_set = MNIST(which_set= 'test')#, center = True)
66 |
67 | # for both datasets, onehot the target
68 | train_set.y = np.float32(onehot(train_set.y))
69 | valid_set.y = np.float32(onehot(valid_set.y))
70 | test_set.y = np.float32(onehot(test_set.y))
71 |
72 | elif dataset == "CIFAR10":
73 |
74 | preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
75 | train_set = ZCA_Dataset(
76 | preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
77 | preprocessor = preprocessor,
78 | start=0, stop = 45000)
79 | valid_set = ZCA_Dataset(
80 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"),
81 | preprocessor = preprocessor,
82 | start=45000, stop = 50000)
83 | test_set = ZCA_Dataset(
84 | preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"),
85 | preprocessor = preprocessor)
86 |
87 | # onehot the target
88 | train_set.y = np.float32(onehot(train_set.y))
89 | valid_set.y = np.float32(onehot(valid_set.y))
90 | test_set.y = np.float32(onehot(test_set.y))
91 |
92 | elif dataset == "SVHN":
93 |
94 | train_set = SVHN(
95 | which_set= 'splitted_train',
96 | path= "${SVHN_LOCAL_PATH}",
97 | axes= ['b', 'c', 0, 1])
98 |
99 | valid_set = SVHN(
100 | which_set= 'valid',
101 | path= "${SVHN_LOCAL_PATH}",
102 | axes= ['b', 'c', 0, 1])
103 |
104 | test_set = SVHN(
105 | which_set= 'test',
106 | path= "${SVHN_LOCAL_PATH}",
107 | axes= ['b', 'c', 0, 1])
108 |
109 | print 'Creating the model'
110 |
111 | # storing format hyperparameters
112 | format = sys.argv[2]
113 |
114 | initial_range = 0
115 | comp_precision = 0
116 | update_precision = 0
117 | range_update_frequency = 0
118 | max_overflow = 0
119 | range_init_epoch = 0
120 |
121 | if format == "FXP" or format == "DFXP":
122 | initial_range = int(sys.argv[3])
123 | comp_precision = int(sys.argv[4])
124 | update_precision = int(sys.argv[5])
125 |
126 | if format == "DFXP":
127 | range_update_frequency = int(sys.argv[6])
128 | max_overflow = float(sys.argv[7])
129 | range_init_epoch = int(sys.argv[8])
130 |
131 | if dataset == "PI_MNIST":
132 |
133 | rng = np.random.RandomState(1234)
134 | LR_start = 0.05
135 | batch_size = 100
136 | gpu_batches = 500
137 | n_epoch = 800
138 |
139 | model = PI_MNIST_model(rng = rng, batch_size = batch_size,
140 | n_input = 784, n_output = 10, n_hidden = 240, n_pieces = 5, n_hidden_layers = 2,
141 | p_input = 0.8, scale_input = 1., p_hidden = 0.5, scale_hidden = 0.5,
142 | max_col_norm = 1.9365, format = format,
143 | comp_precision = comp_precision, update_precision = update_precision,
144 | initial_range = initial_range, max_overflow = max_overflow)
145 |
146 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
147 | train_set = train_set, valid_set = valid_set, test_set = test_set,
148 | model = model,
149 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7,
150 | batch_size = batch_size, gpu_batches = gpu_batches,
151 | n_epoch = n_epoch,
152 | shuffle_batches = False, shuffle_examples = True,
153 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
154 |
155 | elif dataset == "MNIST":
156 |
157 | rng = np.random.RandomState(1234)
158 | LR_start = 0.02
159 | batch_size = 128
160 | gpu_batches = 391 # 391 -> 50000, 196 -> 25000, 79 -> 10000
161 | n_epoch = 800
162 |
163 | model = MNIST_model(rng = rng, batch_size = batch_size, format = format,
164 | comp_precision = comp_precision, update_precision = update_precision,
165 | initial_range = initial_range, max_overflow = max_overflow)
166 |
167 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
168 | train_set = train_set, valid_set = valid_set, test_set = test_set,
169 | model = model,
170 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/4, M_fin = 0.7,
171 | batch_size = batch_size, gpu_batches = gpu_batches,
172 | n_epoch = n_epoch,
173 | shuffle_batches = False, shuffle_examples = True,
174 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
175 |
176 | elif dataset == "CIFAR10":
177 |
178 | rng = np.random.RandomState(1234)
179 | LR_start = 0.02
180 | batch_size = 128
181 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000
182 | n_epoch = 400
183 |
184 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format,
185 | comp_precision = comp_precision, update_precision = update_precision,
186 | initial_range = initial_range, max_overflow = max_overflow)
187 |
188 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
189 | train_set = train_set, valid_set = valid_set, test_set = test_set,
190 | model = model,
191 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7,
192 | batch_size = batch_size, gpu_batches = gpu_batches,
193 | n_epoch = n_epoch,
194 | shuffle_batches = False, shuffle_examples = True,
195 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
196 |
197 | elif dataset == "SVHN":
198 |
199 | rng = np.random.RandomState(1234)
200 | LR_start = 0.05
201 | batch_size = 128
202 | gpu_batches = 79 # 391 -> 50000, 196 -> 25000, 79 -> 10000
203 | n_epoch = 200
204 |
205 | model = CIFAR10_SVHN_model(rng = rng, batch_size = batch_size, format = format,
206 | comp_precision = comp_precision, update_precision = update_precision,
207 | initial_range = initial_range, max_overflow = max_overflow)
208 |
209 | trainer = Trainer(rng = rng, load_path = None, save_path = None,
210 | train_set = train_set, valid_set = valid_set, test_set = test_set,
211 | model = model,
212 | LR_start = LR_start, LR_sat = n_epoch/2, LR_fin = LR_start/10, M_start = 0.5, M_sat = n_epoch/2, M_fin = 0.7,
213 | batch_size = batch_size, gpu_batches = gpu_batches,
214 | n_epoch = n_epoch,
215 | shuffle_batches = True, shuffle_examples = False,
216 | format = format, range_update_frequency = range_update_frequency,range_init_epoch=range_init_epoch)
217 |
218 | print 'Building'
219 |
220 | trainer.build()
221 |
222 | print 'Training'
223 |
224 | trainer.train()
225 |
226 | end_time = time.clock()
227 | print 'The code ran for %i seconds'%(end_time - start_time)
228 |
--------------------------------------------------------------------------------
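For reference, what onehot() in main.py computes amounts to indexing the identity matrix with the labels; a small numpy sketch (assuming integer labels in 0..numclasses-1, not the repository's code):

    import numpy as np

    y = np.array([3, 0, 2])               # integer class labels
    onehot_y = np.eye(4, dtype="int")[y]  # each label selects a row of the identity matrix
    print(onehot_y)                       # [[0 0 0 1] [1 0 0 0] [0 0 1 0]]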
/trainer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | # TRAINING
29 |
30 | class Trainer(object):
31 |
32 | def __init__(self,
33 | rng, save_path, load_path,
34 | train_set, valid_set, test_set,
35 | model,
36 | LR_start, LR_sat, LR_fin, M_start, M_sat, M_fin,
37 | batch_size, gpu_batches,
38 | n_epoch,
39 | format, range_update_frequency, range_init_epoch,
40 | shuffle_batches, shuffle_examples):
41 |
42 | print ' Training algorithm:'
43 | print ' Learning rate = %f' %(LR_start)
44 | print ' Learning rate saturation = %i' %(LR_sat)
45 | print ' Final learning rate = %f' %(LR_fin)
46 | print ' Momentum = %f' %(M_start)
47 | print ' Momentum saturation = %i' %(M_sat)
48 | print ' Final momentum = %f' %(M_fin)
49 | print ' Batch size = %i' %(batch_size)
50 | print ' gpu_batches = %i' %(gpu_batches)
51 | print ' Number of epochs = %i' %(n_epoch)
52 | print ' shuffle_batches = %i' %(shuffle_batches)
53 | print ' shuffle_examples = %i' %(shuffle_examples)
54 | print ' Format = '+ format
55 | print ' Range update frequency = %i' %(range_update_frequency)
56 | print ' Range init epochs = %i' %(range_init_epoch)
57 |
58 | # save the dataset
59 | self.rng = rng
60 | self.shuffle_batches = shuffle_batches
61 | self.shuffle_examples = shuffle_examples
62 | self.load_path = load_path
63 | self.save_path = save_path
64 | self.train_set = train_set
65 | self.valid_set = valid_set
66 | self.test_set = test_set
67 |
68 | # save the model
69 | self.model = model
70 |
71 | # save the parameters
72 | self.LR_start = LR_start
73 | self.LR_sat = LR_sat
74 | self.LR_fin = LR_fin
75 | self.M_start = M_start
76 | self.M_sat = M_sat
77 | self.M_fin = M_fin
78 | self.batch_size = batch_size
79 | self.gpu_batches = gpu_batches
80 | self.n_epoch = n_epoch
81 | self.format = format
82 | self.range_update_frequency = range_update_frequency
83 | self.range_init_epoch = range_init_epoch
84 |
85 | # put a part of the dataset on gpu
86 | self.shared_x = theano.shared(
87 | np.asarray(self.train_set.X[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX))
88 | self.shared_y = theano.shared(
89 | np.asarray(self.train_set.y[0:self.batch_size*self.gpu_batches], dtype=theano.config.floatX))
90 |
91 | def shuffle(self, set):
92 |
93 | # on the CPU for the moment.
94 | X = np.copy(set.X)
95 | y = np.copy(set.y)
96 |
97 | shuffled_index = range(set.X.shape[0])
98 | self.rng.shuffle(shuffled_index)
99 |
100 | for i in range(set.X.shape[0]):
101 | set.X[i] = X[shuffled_index[i]]
102 | set.y[i] = y[shuffled_index[i]]
103 |
104 | def init_range(self):
105 |
106 | # save the precisions and the random parameters of the model
107 | comp_precision = self.model.get_comp_precision()
108 | update_precision = self.model.get_update_precision()
109 | self.model.save_params()
110 |
111 | # set a good precision
112 | self.model.set_comp_precision(31)
113 | self.model.set_update_precision(31)
114 |
115 | # train n epochs to adjust the initial range
116 | for k in range(self.range_init_epoch):
117 | self.train_epoch(self.train_set)
118 |
119 | # set back the precision and the random parameters
120 | self.model.set_comp_precision(comp_precision)
121 | self.model.set_update_precision(update_precision)
122 | self.model.load_params()
123 |
124 | def init(self):
125 |
126 | if self.load_path != None:
127 | self.model.load_params_file(self.load_path)
128 |
129 | self.LR = self.LR_start
130 | self.LR_step = (self.LR_fin-self.LR_start)/self.LR_sat
131 | self.M = self.M_start
132 | self.M_step = (self.M_fin-self.M_start)/self.M_sat
133 |
134 | self.epoch = 0
135 | self.best_epoch = self.epoch
136 |
137 | # test it on the validation set
138 | self.validation_ER = self.test_epoch(self.valid_set)
139 | # test it on the test set
140 | self.test_ER = self.test_epoch(self.test_set)
141 |
142 | self.best_validation_ER = self.validation_ER
143 | self.best_test_ER = self.test_ER
144 |
145 | if self.format == "DFXP" :
146 | self.init_range()
147 |
148 | def update(self):
149 |
150 | # start by shuffling train set
151 | if self.shuffle_examples == True:
152 | self.shuffle(self.train_set)
153 |
154 | self.epoch += 1
155 |
156 | # train the model on all training examples
157 | self.train_epoch(self.train_set)
158 |
159 | # test it on the validation set
160 | self.validation_ER = self.test_epoch(self.valid_set)
161 |
162 | # test it on the test set
163 | self.test_ER = self.test_epoch(self.test_set)
164 |
165 | # update LR and M as well during the first phase
166 | self.update_LR()
167 | self.update_M()
168 |
169 | # save the best parameters
170 | if self.validation_ER < self.best_validation_ER:
171 | self.best_validation_ER = self.validation_ER
172 | self.best_test_ER = self.test_ER
173 | self.best_epoch = self.epoch
174 | if self.save_path != None:
175 | self.model.save_params_file(self.save_path)
176 |
177 | def load_shared_dataset(self, set, start,size):
178 |
179 | self.shared_x.set_value(
180 | set.X[self.batch_size*start:self.batch_size*(size+start)])
181 | self.shared_y.set_value(
182 | set.y[self.batch_size*start:self.batch_size*(size+start)])
183 |
184 | def train_epoch(self, set):
185 |
186 | # number of batches in the dataset
187 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size))
188 | # number of groups of batches (that fit in the memory of the GPU)
189 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches))
190 |
191 | # number of batches in the last group
192 | if self.gpu_batches<=n_batches:
193 | n_remaining_batches = n_batches%self.gpu_batches
194 | else:
195 | n_remaining_batches = n_batches
196 |
197 | # batch counter for the range update frequency
198 | k = 0
199 |
200 | shuffled_range_i = range(n_gpu_batches)
201 |
202 | if self.shuffle_batches==True:
203 | self.rng.shuffle(shuffled_range_i)
204 |
205 | for i in shuffled_range_i:
206 |
207 | self.load_shared_dataset(set,
208 | start=i*self.gpu_batches,
209 | size=self.gpu_batches)
210 |
211 | shuffled_range_j = range(self.gpu_batches)
212 |
213 | if self.shuffle_batches==True:
214 | self.rng.shuffle(shuffled_range_j)
215 |
216 | for j in shuffled_range_j:
217 |
218 | self.train_batch(j, self.LR, self.M)
219 |
220 | # update the dynamic ranges every range_update_frequency batches
221 | if self.format == "DFXP" :
222 | k+=1
223 | if k==self.range_update_frequency:
224 | self.update_range(k)
225 | k=0
226 |
227 | # load the last incomplete gpu batch of batches
228 | if n_remaining_batches > 0:
229 |
230 | self.load_shared_dataset(set,
231 | start=n_gpu_batches*self.gpu_batches,
232 | size=n_remaining_batches)
233 |
234 | shuffled_range_j = range(n_remaining_batches)
235 | if self.shuffle_batches==True:
236 | self.rng.shuffle(shuffled_range_j)
237 |
238 | for j in shuffled_range_j:
239 |
240 | self.train_batch(j, self.LR, self.M)
241 |
242 | # update the dynamic ranges every range_update_frequency batches
243 | if self.format == "DFXP" :
244 | k+=1
245 | if k==self.range_update_frequency:
246 | self.update_range(k)
247 | k=0
248 |
249 | def test_epoch(self, set):
250 |
251 | n_batches = np.int(np.floor(set.X.shape[0]/self.batch_size))
252 | n_gpu_batches = np.int(np.floor(n_batches/self.gpu_batches))
253 |
254 | if self.gpu_batches<=n_batches:
255 | n_remaining_batches = n_batches%self.gpu_batches
256 | else:
257 | n_remaining_batches = n_batches
258 |
259 | error_rate = 0.
260 |
261 | for i in range(n_gpu_batches):
262 |
263 | self.load_shared_dataset(set,
264 | start=i*self.gpu_batches,
265 | size=self.gpu_batches)
266 |
267 | for j in range(self.gpu_batches):
268 |
269 | error_rate += self.test_batch(j)
270 |
271 | # load the last incomplete gpu batch of batches
272 | if n_remaining_batches > 0:
273 |
274 | self.load_shared_dataset(set,
275 | start=n_gpu_batches*self.gpu_batches,
276 | size=n_remaining_batches)
277 |
278 | for j in range(n_remaining_batches):
279 |
280 | error_rate += self.test_batch(j)
281 |
282 | error_rate /= (n_batches*self.batch_size)
283 | error_rate *= 100.
284 |
285 | return error_rate
286 |
287 | def update_LR(self):
288 |
289 | if self.LR > self.LR_fin:
290 | self.LR += self.LR_step
291 | else:
292 | self.LR = self.LR_fin
293 |
294 | def update_M(self):
295 |
296 | if self.M < self.M_fin:
297 | self.M += self.M_step
298 | else:
299 | self.M = self.M_fin
300 |
301 | def monitor(self):
302 |
303 | print ' epoch %i:' %(self.epoch)
304 | print ' learning rate %f' %(self.LR)
305 | print ' momentum %f' %(self.M)
306 | print ' validation error rate %f%%' %(self.validation_ER)
307 | print ' test error rate %f%%' %(self.test_ER)
308 | print ' epoch associated to best validation error %i' %(self.best_epoch)
309 | print ' best validation error rate %f%%' %(self.best_validation_ER)
310 | print ' test error rate associated to best validation error %f%%' %(self.best_test_ER)
311 |
312 | if self.format == "DFXP":
313 | self.model.print_range()
314 |
315 | def train(self):
316 |
317 | self.init()
318 | self.monitor()
319 |
320 | for epoch in range(self.n_epoch):
321 |
322 | self.update()
323 | self.monitor()
324 |
325 | def build(self):
326 |
327 | # input and output variables
328 | x = T.matrix('x')
329 | y = T.matrix('y')
330 | index = T.lscalar()
331 | batch_count = T.lscalar()
332 | LR = T.scalar('LR', dtype=theano.config.floatX)
333 | M = T.scalar('M', dtype=theano.config.floatX)
334 |
335 | # before the build, you work with symbolic variables
336 | # after the build, you work with numeric variables
337 |
338 | self.train_batch = theano.function(inputs=[index,LR,M], updates=self.model.updates(x,y,LR,M),givens={
339 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size],
340 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
341 | name = "train_batch", on_unused_input='warn')
342 |
343 | self.test_batch = theano.function(inputs=[index],outputs=self.model.errors(x,y),givens={
344 | x: self.shared_x[index * self.batch_size:(index + 1) * self.batch_size],
345 | y: self.shared_y[index * self.batch_size:(index + 1) * self.batch_size]},
346 | name = "test_batch")
347 |
348 | if self.format == "DFXP" :
349 | self.update_range = theano.function(inputs=[batch_count],updates=self.model.range_updates(batch_count), name = "update_range")
350 |
--------------------------------------------------------------------------------
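The learning rate and momentum follow linear ramps in Trainer (see init(), update_LR() and update_M()); a small sketch of the same rule, using the PI_MNIST settings from main.py (the schedule() helper is illustrative, not part of the repository):

    def schedule(start, fin, sat, n_epoch):
        # same per-epoch rule as Trainer.update_LR() / update_M():
        # step linearly until the final value is reached, then stay there
        step = (fin - start) / float(sat)
        value, values = start, []
        for epoch in range(n_epoch):
            values.append(value)
            if (step < 0 and value > fin) or (step > 0 and value < fin):
                value += step
            else:
                value = fin
        return values

    LR = schedule(0.05, 0.005, sat=400, n_epoch=800)   # LR_sat = n_epoch/2
    M = schedule(0.5, 0.7, sat=200, n_epoch=800)       # M_sat = n_epoch/4
    print("%.4f %.4f" % (LR[0], LR[-1]))               # 0.0500 0.0050
    print("%.4f %.4f" % (M[0], M[-1]))                 # 0.5000 0.7000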
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | import time
27 |
28 | from layer import Maxout_conv_layer, SoftmaxLayer, MaxoutLayer
29 |
30 |
31 | class deep_dropout_network(object):
32 |
33 | layer = []
34 |
35 | def __init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision,
36 | initial_range, max_overflow, format):
37 |
38 | print ' Overall description:'
39 | print ' Batch size = %i' %(batch_size)
40 | print ' Number of layers = %i' %(n_hidden_layers)
41 | print ' Computation precision = %i bits' %(comp_precision)
42 | print ' Update precision = %i bits' %(update_precision)
43 | print ' Initial range = %i bits' %(initial_range)
44 | print ' Maximum overflow rate = %f %%' %(max_overflow*100)
45 | print " Format = " + format
46 |
47 | self.rng = rng
48 | self.batch_size = batch_size
49 | self.n_hidden_layers = n_hidden_layers
50 | self.comp_precision = comp_precision
51 | self.update_precision = update_precision
52 | self.initial_range = initial_range
53 | self.max_overflow = max_overflow
54 | self.format = format
55 |
56 | def fprop(self, x):
57 |
58 | y = self.layer[0].fprop(x)
59 |
60 | for k in range(1,self.n_hidden_layers+1):
61 |
62 | y = self.layer[k].fprop(y)
63 |
64 | return y
65 |
66 | def dropout_fprop(self, x):
67 |
68 | y = self.layer[0].dropout_fprop(x)
69 |
70 | for k in range(1,self.n_hidden_layers+1):
71 |
72 | y = self.layer[k].dropout_fprop(y)
73 |
74 | return y
75 |
76 | # when using fixed point, T.grad cannot be applied directly to the cost -> the backward pass is done manually (bprop), reusing T.grad per layer with known_grads.
77 | def bprop(self, y, t):
78 |
79 | # there is a simplification between softmax derivative and nll derivative
80 | dEdy = (y-t)/T.cast(T.shape(y)[1],dtype=theano.config.floatX) # /2. # actually, it is dEdz and not dEdy
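# (Added note) standard result: with softmax outputs y_i = exp(z_i)/sum_j exp(z_j) and an
# NLL cost E = -sum_i t_i*log(y_i) for a one-hot target t, dE/dz_i = y_i - t_i, so the
# softmax and cost derivatives collapse into the single (y - t) term above; the division
# by T.shape(y)[1] simply rescales the error signal by the number of output units.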
81 |
82 | # bprop
83 | for k in range(self.n_hidden_layers,-1,-1):
84 | dEdy = self.layer[k].bprop(dEdy)
85 |
86 | # gathers the parameter updates of every layer, given the learning rate LR and the momentum M
87 | def parameter_updates(self, LR, M):
88 |
89 | # updates
90 | parameter_updates = self.layer[0].parameter_updates(LR, M)
91 | for k in range(1,self.n_hidden_layers+1):
92 | parameter_updates = parameter_updates + self.layer[k].parameter_updates(LR, M)
93 |
94 | return parameter_updates
95 |
96 | # function that updates the ranges of all fixed point vectors
97 | def range_updates(self,batch_count):
98 |
99 | range_updates = self.layer[0].range_updates(batch_count)
100 | for k in range(1,self.n_hidden_layers+1):
101 | range_updates = range_updates + self.layer[k].range_updates(batch_count)
102 |
103 | return range_updates
104 |
105 | # function that accumulates the overflow counters of all fixed point vectors
106 | def overflow_updates(self):
107 |
108 | overflow_updates = self.layer[0].overflow_updates()
109 | for k in range(1,self.n_hidden_layers+1):
110 | overflow_updates = overflow_updates + self.layer[k].overflow_updates()
111 |
112 | return overflow_updates
113 |
114 | # train function
115 | def updates(self, x, t, LR, M):
116 |
117 | y = self.dropout_fprop(x)
118 | self.bprop(y,t)
119 | updates = self.parameter_updates(LR,M)
120 |
121 | if self.format == "DFXP":
122 | updates += self.overflow_updates()
123 |
124 | return updates
125 |
126 | def errors(self, x, t):
127 |
128 | y = self.fprop(x)
129 |
130 | # error function
131 | errors = T.sum(T.neq(T.argmax(y, axis=1), T.argmax(t, axis=1)))
132 |
133 | return errors
134 |
135 | def save_params(self):
136 |
137 | self.W_save = []
138 | self.b_save = []
139 |
140 | for k in xrange(self.n_hidden_layers+1):
141 | self.W_save.append(self.layer[k].W.get_value(borrow=False))
142 | self.b_save.append(self.layer[k].b.get_value(borrow=False))
143 |
144 | def load_params(self):
145 |
146 | # read and load all the parameters
147 | for k in xrange(self.n_hidden_layers+1):
148 | self.layer[k].W.set_value(self.W_save[k])
149 | self.layer[k].b.set_value(self.b_save[k])
150 |
151 | def save_params_file(self, path):
152 |
153 | # Open the file and overwrite current contents
154 | save_file = open(path, 'wb')
155 |
156 | # write all the parameters in the file
157 | for k in xrange(self.n_hidden_layers+1):
158 | cPickle.dump(self.layer[k].W.get_value(), save_file, -1)
159 | cPickle.dump(self.layer[k].b.get_value(), save_file, -1)
160 |
161 | # close the file
162 | save_file.close()
163 |
164 | def load_params_file(self, path):
165 |
166 | # Open the file
167 | save_file = open(path)
168 |
170 | # read and load all the parameters
170 | for k in xrange(self.n_hidden_layers+1):
171 | self.layer[k].W.set_value(cPickle.load(save_file))
172 | self.layer[k].b.set_value(cPickle.load(save_file))
173 |
174 | # close the file
175 | save_file.close()
176 |
177 |
178 | def print_range(self):
179 |
180 | for k in xrange(self.n_hidden_layers+1):
181 | print ' Layer %i range:'%(k)
182 | self.layer[k].print_range()
183 |
184 | def set_comp_precision(self, comp_precision):
185 |
186 | for k in xrange(self.n_hidden_layers+1):
187 | self.layer[k].comp_precision.set_value(comp_precision)
188 |
189 | def get_comp_precision(self):
190 |
191 | return self.layer[0].comp_precision.get_value()
192 |
193 | def set_update_precision(self, update_precision):
194 |
195 | for k in xrange(self.n_hidden_layers+1):
196 | self.layer[k].update_precision.set_value(update_precision)
197 |
198 | def get_update_precision(self):
199 |
200 | return self.layer[0].update_precision.get_value()
201 |
202 | def set_max_overflow(self, max_overflow):
203 |
204 | for k in xrange(self.n_hidden_layers+1):
205 | self.layer[k].max_overflow.set_value(max_overflow)
206 |
207 | def get_max_overflow(self):
208 |
209 | return self.layer[0].max_overflow.get_value()
210 |
211 | class PI_MNIST_model(deep_dropout_network):
212 |
213 | def __init__(self, rng, batch_size, n_input, n_output, n_hidden, n_pieces, n_hidden_layers,
214 | p_input, scale_input, p_hidden, scale_hidden, max_col_norm, format,
215 | comp_precision, update_precision, initial_range, max_overflow):
216 |
217 | deep_dropout_network.__init__(self, rng, batch_size, n_hidden_layers, comp_precision, update_precision,
218 | initial_range, max_overflow, format)
219 |
220 | print ' n_input = %i' %(n_input)
221 | print ' n_output = %i' %(n_output)
222 | print ' n_hidden = %i' %(n_hidden)
223 | print ' n_pieces = %i' %(n_pieces)
224 | print ' p_input = %f' %(p_input)
225 | print ' scale_input = %f' %(scale_input)
226 | print ' p_hidden = %f' %(p_hidden)
227 | print ' scale_hidden = %f' %(scale_hidden)
228 | print ' max_col_norm = %f' %(max_col_norm)
229 |
230 | # save the parameters
231 | self.n_input = n_input
232 | self.n_output = n_output
233 | self.n_hidden = n_hidden
234 | self.n_pieces = n_pieces
235 | self.p_input = p_input
236 | self.scale_input = scale_input
237 | self.p_hidden = p_hidden
238 | self.scale_hidden = scale_hidden
239 | self.max_col_norm = max_col_norm
240 |
241 | # Create MLP layers
242 | if self.n_hidden_layers == 0 :
243 |
244 | print " Softmax layer:"
245 |
246 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs=self.n_input, n_units=self.n_output,
247 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format,
248 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
249 |
250 | else :
251 |
252 | print " Maxout layer 1:"
253 |
254 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_input, n_units = self.n_hidden, n_pieces = self.n_pieces,
255 | p = self.p_input, scale = self.scale_input, max_col_norm = self.max_col_norm, format = self.format,
256 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
257 |
258 | for k in range(1,self.n_hidden_layers):
259 |
260 | print " Maxout layer "+str(k+1)+":"
261 | self.layer.append(MaxoutLayer(rng = self.rng, n_inputs = self.n_hidden, n_units = self.n_hidden, n_pieces = self.n_pieces,
262 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format,
263 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
264 |
265 | print " Softmax layer:"
266 |
267 | self.layer.append(SoftmaxLayer(rng = self.rng, n_inputs= self.n_hidden, n_units= self.n_output,
268 | p = self.p_hidden, scale = self.scale_hidden, max_col_norm = self.max_col_norm, format = self.format,
269 | comp_precision = self.comp_precision, update_precision = self.update_precision, initial_range = self.initial_range, max_overflow = self.max_overflow))
270 |
271 | class MNIST_model(deep_dropout_network):
272 |
273 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format):
274 |
275 | deep_dropout_network.__init__(self, rng, batch_size, 3, comp_precision, update_precision,
276 | initial_range, max_overflow, format)
277 |
278 | print " Convolution layer 1:"
279 |
280 | self.layer.append(Maxout_conv_layer(
281 | rng,
282 | image_shape=(batch_size, 1, 28, 28),
283 | zero_pad = 0,
284 | output_shape=(batch_size, 48, 10, 10),
285 | filter_shape=(48, 1, 8, 8),
286 | filter_stride = 1,
287 | n_pieces = 2,
288 | pool_shape=(4, 4),
289 | pool_stride = 2,
290 | p = 0.8,
291 | scale = 1.,
292 | max_col_norm = 0.9,
293 | format = format,
294 | comp_precision = comp_precision,
295 | update_precision = update_precision,
296 | initial_range = initial_range,
297 | max_overflow = max_overflow
298 | ))
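# (Added shape check, assuming cuda-convnet's output-size conventions) 28x28 inputs,
# no padding and 8x8 filters at stride 1 give 21x21 feature maps; 4x4 max pooling with
# stride 2 then yields ceil((21-4)/2)+1 = 10, matching the declared (batch_size, 48, 10, 10).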
299 |
300 |
301 | print " Convolution layer 2:"
302 |
303 | self.layer.append(Maxout_conv_layer(
304 | rng,
305 | image_shape=(batch_size, 48, 10, 10),
306 | zero_pad = 3, # add n zeros on both sides of the input
307 | output_shape=(batch_size, 48, 4, 4),
308 | filter_shape=(48, 48, 8, 8),
309 | filter_stride = 1,
310 | n_pieces = 2,
311 | pool_shape=(4, 4),
312 | pool_stride =2,
313 | p = 0.5,
314 | scale = 0.5,
315 | max_col_norm = 1.9365,
316 | format = format,
317 | comp_precision = comp_precision,
318 | update_precision = update_precision,
319 | initial_range = initial_range,
320 | max_overflow = max_overflow
321 | ))
322 |
323 |
324 | print " Convolution layer 3:"
325 |
326 | self.layer.append(Maxout_conv_layer(
327 | rng,
328 | image_shape=(batch_size, 48, 4, 4),
329 | zero_pad = 3, # add n zeros on both sides of the input
330 | output_shape=(batch_size, 24, 3, 3),
331 | filter_shape=(24, 48, 5, 5),
332 | filter_stride = 1,
333 | n_pieces = 4,
334 | pool_shape=(2, 2),
335 | pool_stride =2,
336 | p = 0.5,
337 | scale = 0.5,
338 | max_col_norm = 1.9365,
339 | format = format,
340 | comp_precision = comp_precision,
341 | update_precision = update_precision,
342 | initial_range = initial_range,
343 | max_overflow = max_overflow
344 | ))
345 |
346 | print " Softmax layer:"
347 |
348 | self.layer.append(SoftmaxLayer(
349 | rng = rng,
350 | n_inputs= 24*3*3,
351 | n_units = 10,
352 | p = 0.5,
353 | scale = 0.5,
354 | max_col_norm =1.9365,
355 | format = format,
356 | comp_precision = comp_precision,
357 | update_precision = update_precision,
358 | initial_range = initial_range,
359 | max_overflow = max_overflow
360 | ))
361 |
362 | class CIFAR10_SVHN_model(deep_dropout_network):
363 |
364 | def __init__(self, rng, batch_size, comp_precision, update_precision, initial_range, max_overflow, format):
365 |
366 | deep_dropout_network.__init__(self, rng, batch_size, 4, comp_precision, update_precision,
367 | initial_range, max_overflow, format)
368 |
369 | print " Convolution layer 1:"
370 |
371 | self.layer.append(Maxout_conv_layer(
372 | rng,
373 | image_shape=(batch_size, 3, 32, 32),
374 | zero_pad = 2,
375 | output_shape=(batch_size, 64, 16, 16), # 64 feature maps fit in memory
376 | filter_shape=(64, 3, 5, 5),
377 | filter_stride = 1,
378 | n_pieces = 2,
379 | pool_shape=(3, 3),
380 | pool_stride = 2,
381 | p = 0.8,
382 | scale = 1.,
383 | max_col_norm = 0.9,
384 | format = format,
385 | comp_precision = comp_precision,
386 | update_precision = update_precision,
387 | initial_range = initial_range,
388 | max_overflow = max_overflow,
389 | w_LR_scale = 0.2,
390 | b_LR_scale = 0.2,
391 | # partial_sum = 32 # total number = 33*33
392 | ))
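# (Added note) w_LR_scale and b_LR_scale multiply the global learning rate for this layer's
# parameters (see parameter_updates in layer.py), so the convolutional layers of this model
# are trained with 0.2x the learning rate of the fully connected layers.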
393 |
394 |
395 | print " Convolution layer 2:"
396 |
397 | self.layer.append(Maxout_conv_layer(
398 | rng,
399 | image_shape=(batch_size, 64, 16, 16),
400 | zero_pad = 2, # add n zeros on both sides of the input
401 | output_shape=(batch_size, 128, 8, 8),
402 | filter_shape=(128, 64, 5, 5),
403 | filter_stride = 1,
404 | n_pieces = 2,
405 | pool_shape=(3, 3),
406 | pool_stride =2,
407 | p = 0.5,
408 | scale = 0.5,
409 | max_col_norm = 1.9365,
410 | format = format,
411 | comp_precision = comp_precision,
412 | update_precision = update_precision,
413 | initial_range = initial_range,
414 | max_overflow = max_overflow,
415 | w_LR_scale = 0.2,
416 | b_LR_scale = 0.2,
417 | # partial_sum = 16 # total number = 15*15
418 | ))
419 |
420 |
421 | print " Convolution layer 3:"
422 |
423 | self.layer.append(Maxout_conv_layer(
424 | rng,
425 | image_shape=(batch_size, 128, 8, 8),
426 | zero_pad = 2, # add n zeros on both sides of the input
427 | output_shape=(batch_size, 128, 4, 4),
428 | filter_shape=(128, 128, 5, 5),
429 | filter_stride = 1,
430 | n_pieces = 2,
431 | pool_shape=(3, 3),
432 | pool_stride =2,
433 | p = 0.5,
434 | scale = 0.5,
435 | max_col_norm = 1.9365,
436 | format = format,
437 | comp_precision = comp_precision,
438 | update_precision = update_precision,
439 | initial_range = initial_range,
440 | max_overflow = max_overflow,
441 | w_LR_scale = 0.2,
442 | b_LR_scale = 0.2,
443 | # partial_sum = 8 # total number = 9*9
444 | ))
445 |
446 | print " Maxout layer:"
447 |
448 | self.layer.append(MaxoutLayer(
449 | rng = rng,
450 | n_inputs= 128*4*4,
451 | n_units = 400,
452 | n_pieces = 5,
453 | p = 0.5,
454 | scale = 0.5,
455 | max_col_norm = 1.9365,
456 | format = format,
457 | comp_precision = comp_precision,
458 | update_precision = update_precision,
459 | initial_range = initial_range,
460 | max_overflow = max_overflow
461 | ))
462 |
463 | print " Softmax layer:"
464 |
465 | self.layer.append(SoftmaxLayer(
466 | rng = rng,
467 | n_inputs= 400,
468 | n_units = 10,
469 | p = 0.5,
470 | scale = 0.5,
471 | max_col_norm = 1.9365,
472 | format = format,
473 | comp_precision = comp_precision,
474 | update_precision = update_precision,
475 | initial_range = initial_range,
476 | max_overflow = max_overflow
477 | ))
478 |
--------------------------------------------------------------------------------
/layer.py:
--------------------------------------------------------------------------------
1 | # Copyright 2014 Matthieu Courbariaux
2 |
3 | # This file is part of deep-learning-multipliers.
4 |
5 | # deep-learning-multipliers is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 |
10 | # deep-learning-multipliers is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 |
15 | # You should have received a copy of the GNU General Public License
16 | # along with deep-learning-multipliers. If not, see <http://www.gnu.org/licenses/>.
17 |
18 | import gzip
19 | import cPickle
20 | import numpy as np
21 | import os
22 | import os.path
23 | import sys
24 | import theano
25 | import theano.tensor as T
26 | from theano import pp
27 | import time
28 | import scipy.stats
29 | from pylearn2.sandbox.cuda_convnet.filter_acts import FilterActs
30 | from theano.sandbox.cuda.basic_ops import gpu_contiguous
31 | from pylearn2.sandbox.cuda_convnet.pool import MaxPool
32 |
33 | from format import apply_format, overflow, new_range
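# (Added illustrative sketch, not part of the original file.) apply_format, overflow and
# new_range are defined in format.py; for the DFXP ("dynamic fixed point") format, a
# plausible rounding step for a tensor with comp_precision total bits and a "range" of
# NOIB integer bits would look roughly like:
#
# def dfxp_round_sketch(x, n_bits, n_int_bits):
#     step = 2. ** (n_int_bits - n_bits)   # smallest representable increment
#     bound = 2. ** n_int_bits             # saturation threshold
#     return np.clip(np.round(x / step) * step, -bound, bound - step)
#
# The authoritative implementation is the one in format.py.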
34 |
35 | class dropout_layer(object):
36 |
37 | def __init__(self, rng, p, scale, max_col_norm, format,
38 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale = 1., b_LR_scale = 1.):
39 |
40 | print " p = " + str(p)
41 | print " scale = " + str(scale)
42 | print " w_LR_scale = " + str(w_LR_scale)
43 | print " b_LR_scale = " + str(b_LR_scale)
44 | print " max_col_norm = " + str(max_col_norm)
45 | print " format = " + str(format)
46 |
47 | # save the parameters
48 | self.p = p
49 | self.scale = scale
50 | self.w_LR_scale = w_LR_scale
51 | self.b_LR_scale = b_LR_scale
52 | self.rng = rng
53 | self.max_col_norm = max_col_norm
54 | self.format = format
55 |
56 | # create shared variables
57 | self.comp_precision = theano.shared(value=comp_precision, name='comp_precision')
58 | self.update_precision = theano.shared(value=update_precision, name='update_precision')
59 | self.max_overflow = theano.shared(value=max_overflow, name='max_overflow')
60 |
61 | # create shared variables for the fixed point range
62 | self.z_range = theano.shared(value=initial_range, name='z_range')
63 | self.dEdz_range = theano.shared(value=initial_range, name='dEdz_range')
64 | self.y_range = theano.shared(value=initial_range, name='y_range')
65 | self.dEdy_range = theano.shared(value=initial_range, name='dEdy_range')
66 | self.w_range = theano.shared(value=initial_range, name='w_range')
67 | self.b_range = theano.shared(value=initial_range, name='b_range')
68 | self.dEdw_range = theano.shared(value=initial_range, name='dEdw_range')
69 | self.dEdb_range = theano.shared(value=initial_range, name='dEdb_range')
70 | self.update_w_range = theano.shared(value=initial_range, name='update_w_range')
71 | self.update_b_range = theano.shared(value=initial_range, name='update_b_range')
72 |
73 | # overflow counters for current range (needed to know when to augment the range)
74 | self.z_overflow = theano.shared(value=0., name='z_overflow')
75 | self.dEdz_overflow = theano.shared(value=0., name='dEdz_overflow')
76 | self.y_overflow = theano.shared(value=0., name='y_overflow')
77 | self.dEdy_overflow = theano.shared(value=0., name='dEdy_overflow')
78 | self.w_overflow = theano.shared(value=0., name='w_overflow')
79 | self.b_overflow = theano.shared(value=0., name='b_overflow')
80 | self.dEdw_overflow = theano.shared(value=0., name='dEdw_overflow')
81 | self.dEdb_overflow = theano.shared(value=0., name='dEdb_overflow')
82 | self.update_w_overflow = theano.shared(value=0., name='update_w_overflow')
83 | self.update_b_overflow = theano.shared(value=0., name='update_b_overflow')
84 |
85 | # overflow counters for current range - 1 (needed to know when to reduce the range)
86 | self.z_overflow_1 = theano.shared(value=0., name='z_overflow_1')
87 | self.dEdz_overflow_1 = theano.shared(value=0., name='dEdz_overflow_1')
88 | self.y_overflow_1 = theano.shared(value=0., name='y_overflow_1')
89 | self.dEdy_overflow_1 = theano.shared(value=0., name='dEdy_overflow_1')
90 | self.w_overflow_1 = theano.shared(value=0., name='w_overflow_1')
91 | self.b_overflow_1 = theano.shared(value=0., name='b_overflow_1')
92 | self.dEdw_overflow_1 = theano.shared(value=0., name='dEdw_overflow_1')
93 | self.dEdb_overflow_1 = theano.shared(value=0., name='dEdb_overflow_1')
94 | self.update_w_overflow_1 = theano.shared(value=0., name='update_w_overflow_1')
95 | self.update_b_overflow_1 = theano.shared(value=0., name='update_b_overflow_1')
96 |
97 | def fprop(self, input):
98 |
99 | # we reduce the precision of parameters for the computations
100 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
101 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)
102 |
103 | # scaled weighted sum
104 | self.z = apply_format(self.format, T.dot(input, self.w_comp * self.scale) + self.b_comp*self.scale, self.comp_precision, self.z_range)
105 |
106 | # activation
107 | self.y = apply_format(self.format, self.activation(self.z), self.comp_precision, self.y_range)
108 |
109 | # return the output
110 | return self.y
111 |
112 | def dropout_fprop(self, input):
113 |
114 | # we reduce the precision of parameters for the computations
115 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
116 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)
117 |
118 | # create the dropout mask
119 | # The cast is important because
120 | # int * float32 = float64 which pulls things off the gpu
121 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
122 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)
123 |
124 | # apply the mask
125 | self.fixed_x = input * self.mask
126 |
127 | # weighted sum
128 | self.z = T.dot(self.fixed_x, self.fixed_W) + self.fixed_b
129 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)
130 |
131 | # activation
132 | self.y = self.activation(self.fixed_z)
133 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)
134 |
135 | # return the output
136 | return self.fixed_y
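# (Added note) fprop is the inference path: no dropout mask, and W and b are multiplied by
# `scale`. dropout_fprop is the training path: a Bernoulli(p) mask is applied to the input
# and no `scale` factor is used.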
137 |
138 | def activation(self):
139 |
140 | raise NotImplementedError("Subclass must implement abstract method")
141 |
142 | def activation_bprop(self):
143 |
144 | raise NotImplementedError("Subclass must implement abstract method")
145 |
146 | def bprop(self, dEdy):
147 |
148 | self.fixed_dEdy = apply_format(self.format, dEdy, self.comp_precision, self.dEdy_range)
149 |
150 | # activation
151 | self.activation_bprop()
152 |
153 | # compute gradients of parameters
154 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdw_range)
155 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision, self.dEdb_range)
156 |
157 | # weighted sum
158 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0]
159 |
160 | # apply mask
161 | dEdx = self.mask * dEdx
162 |
163 | return dEdx
164 |
165 | def parameter_updates(self, LR, M):
166 |
167 | # compute updates
168 | new_update_W = apply_format(self.format, M * self.update_W - LR * self.w_LR_scale * self.fixed_dEdW, self.comp_precision, self.update_w_range)
169 | new_update_b = apply_format(self.format, M * self.update_b - LR * self.b_LR_scale * self.fixed_dEdb, self.comp_precision, self.update_b_range)
170 |
171 | # compute the new parameters. Note that we use a higher precision here than for the other operations
172 | new_W = apply_format(self.format, self.W + new_update_W, self.update_precision, self.w_range)
173 | new_b = apply_format(self.format, self.b + new_update_b, self.update_precision, self.b_range)
174 |
175 | # L2 column constraint on W
176 | col_norms = T.sqrt(T.sum(T.sqr(new_W), axis=0))
177 | # col_norms = T.max(new_W, axis=0)
178 | desired_norms = T.clip(col_norms, 0, self.max_col_norm) # clip = saturate below min and beyond max
179 | new_W = apply_format(self.format, new_W * (desired_norms / (1e-7 + col_norms)), self.update_precision, self.w_range)
180 | # for some reason, works better than
181 | # new_W = new_W * (desired_norms / col_norms)
182 | # It may be a kind of regularization
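# (Added note) this is the max-norm constraint commonly used with dropout: after each update,
# every column of W is rescaled so that its L2 norm stays at or below max_col_norm; the 1e-7
# term only guards against a division by zero for all-zero columns.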
183 |
184 | # return the updates of shared variables
185 | updates = []
186 | updates.append((self.W, new_W))
187 | updates.append((self.b, new_b))
188 | updates.append((self.update_W, new_update_W))
189 | updates.append((self.update_b, new_update_b))
190 |
191 | return updates
192 |
193 | def overflow_updates(self):
194 |
195 | updates = []
196 |
197 | # update overflow counters for the dynamic fixed point
198 | updates.append((self.z_overflow, self.z_overflow + overflow(self.fixed_z, self.comp_precision, self.z_range)))
199 | updates.append((self.dEdz_overflow, self.dEdz_overflow + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range)))
200 | updates.append((self.y_overflow, self.y_overflow + overflow(self.fixed_y, self.comp_precision, self.y_range)))
201 | updates.append((self.dEdy_overflow, self.dEdy_overflow + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range)))
202 | updates.append((self.w_overflow, self.w_overflow + overflow(self.W, self.update_precision, self.w_range)))
203 | updates.append((self.b_overflow, self.b_overflow + overflow(self.b, self.update_precision, self.b_range)))
204 | updates.append((self.dEdw_overflow, self.dEdw_overflow + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range)))
205 | updates.append((self.dEdb_overflow, self.dEdb_overflow + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range)))
206 | updates.append((self.update_w_overflow, self.update_w_overflow + overflow(self.update_W, self.comp_precision, self.update_w_range)))
207 | updates.append((self.update_b_overflow, self.update_b_overflow + overflow(self.update_b, self.comp_precision, self.update_b_range)))
208 |
209 | updates.append((self.z_overflow_1, self.z_overflow_1 + overflow(self.fixed_z, self.comp_precision, self.z_range-1)))
210 | updates.append((self.dEdz_overflow_1, self.dEdz_overflow_1 + overflow(self.fixed_dEdz, self.comp_precision, self.dEdz_range-1)))
211 | updates.append((self.y_overflow_1, self.y_overflow_1 + overflow(self.fixed_y, self.comp_precision, self.y_range-1)))
212 | updates.append((self.dEdy_overflow_1, self.dEdy_overflow_1 + overflow(self.fixed_dEdy, self.comp_precision, self.dEdy_range-1)))
213 | updates.append((self.w_overflow_1, self.w_overflow_1 + overflow(self.W, self.update_precision, self.w_range-1)))
214 | updates.append((self.b_overflow_1, self.b_overflow_1 + overflow(self.b, self.update_precision, self.b_range-1)))
215 | updates.append((self.dEdw_overflow_1, self.dEdw_overflow_1 + overflow(self.fixed_dEdW, self.comp_precision, self.dEdw_range-1)))
216 | updates.append((self.dEdb_overflow_1, self.dEdb_overflow_1 + overflow(self.fixed_dEdb, self.comp_precision, self.dEdb_range-1)))
217 | updates.append((self.update_w_overflow_1, self.update_w_overflow_1 + overflow(self.update_W, self.comp_precision, self.update_w_range-1)))
218 | updates.append((self.update_b_overflow_1, self.update_b_overflow_1 + overflow(self.update_b, self.comp_precision, self.update_b_range-1)))
219 |
220 | return updates
221 |
222 | def range_updates(self,batch_count):
223 |
224 | updates = []
225 |
226 | # update the ranges according to the overflow counters
227 | updates.append((self.z_range, self.z_range+new_range(self.z_overflow/batch_count,self.z_overflow_1/batch_count, self.max_overflow)))
228 | updates.append((self.dEdz_range, self.dEdz_range+new_range(self.dEdz_overflow/batch_count, self.dEdz_overflow_1/batch_count, self.max_overflow)))
229 | updates.append((self.y_range, self.y_range+new_range(self.y_overflow/batch_count, self.y_overflow_1/batch_count, self.max_overflow)))
230 | updates.append((self.dEdy_range, self.dEdy_range+new_range(self.dEdy_overflow/batch_count, self.dEdy_overflow_1/batch_count, self.max_overflow)))
231 | updates.append((self.w_range, self.w_range+new_range(self.w_overflow/batch_count, self.w_overflow_1/batch_count, self.max_overflow)))
232 | updates.append((self.b_range, self.b_range+new_range(self.b_overflow/batch_count, self.b_overflow_1/batch_count, self.max_overflow)))
233 | updates.append((self.dEdw_range, self.dEdw_range+new_range(self.dEdw_overflow/batch_count, self.dEdw_overflow_1/batch_count, self.max_overflow)))
234 | updates.append((self.dEdb_range, self.dEdb_range+new_range(self.dEdb_overflow/batch_count, self.dEdb_overflow_1/batch_count, self.max_overflow)))
235 | updates.append((self.update_w_range, self.update_w_range+new_range(self.update_w_overflow/batch_count, self.update_w_overflow_1/batch_count, self.max_overflow)))
236 | updates.append((self.update_b_range, self.update_b_range+new_range(self.update_b_overflow/batch_count, self.update_b_overflow_1/batch_count, self.max_overflow)))
237 |
238 | # reset the overflow counters
239 | updates.append((self.z_overflow, 0.))
240 | updates.append((self.dEdz_overflow, 0.))
241 | updates.append((self.y_overflow, 0.))
242 | updates.append((self.dEdy_overflow, 0.))
243 | updates.append((self.w_overflow, 0.))
244 | updates.append((self.b_overflow, 0.))
245 | updates.append((self.dEdw_overflow, 0.))
246 | updates.append((self.dEdb_overflow, 0.))
247 | updates.append((self.update_w_overflow, 0.))
248 | updates.append((self.update_b_overflow, 0.))
249 |
250 | updates.append((self.z_overflow_1, 0.))
251 | updates.append((self.dEdz_overflow_1, 0.))
252 | updates.append((self.y_overflow_1, 0.))
253 | updates.append((self.dEdy_overflow_1, 0.))
254 | updates.append((self.w_overflow_1, 0.))
255 | updates.append((self.b_overflow_1, 0.))
256 | updates.append((self.dEdw_overflow_1, 0.))
257 | updates.append((self.dEdb_overflow_1, 0.))
258 | updates.append((self.update_w_overflow_1, 0.))
259 | updates.append((self.update_b_overflow_1, 0.))
260 |
261 | return updates
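# (Added note) new_range() is defined in format.py (not shown here); judging from how it is
# called, it presumably returns +1 when the overflow rate at the current range exceeds
# max_overflow (grow the integer part), -1 when even the range-1 counters stay below
# max_overflow (shrink the range and gain a fractional bit), and 0 otherwise.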
262 |
263 | def print_range(self):
264 |
265 | print ' z NOIB = %i' %(self.z_range.get_value())
266 | print ' y NOIB = %i' %(self.y_range.get_value())
267 | print ' w NOIB = %i' %(self.w_range.get_value())
268 | print ' b NOIB = %i' %(self.b_range.get_value())
269 | print ' dEdz NOIB = %i' %(self.dEdz_range.get_value())
270 | print ' dEdy NOIB = %i' %(self.dEdy_range.get_value())
271 | print ' dEdw NOIB = %i' %(self.dEdw_range.get_value())
272 | print ' dEdb NOIB = %i' %(self.dEdb_range.get_value())
273 | print ' update w NOIB = %i' %(self.update_w_range.get_value())
274 | print ' update b NOIB = %i' %(self.update_b_range.get_value())
275 |
276 | class MaxoutLayer(dropout_layer):
277 |
278 | def __init__(self, rng, n_inputs, n_units, n_pieces, p, scale, max_col_norm, format,
279 | comp_precision, update_precision, initial_range, max_overflow):
280 |
281 | self.n_pieces=n_pieces
282 | self.n_inputs = n_inputs
283 | self.n_units = n_units
284 |
285 | print " n_pieces = " + str(n_pieces)
286 | print " n_inputs = " + str(n_inputs)
287 | print " n_units = " + str(n_units)
288 |
289 | # call the parent class constructor
290 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format,
291 | comp_precision, update_precision, initial_range, max_overflow)
292 |
293 | # initial values of parameters
294 | low=-np.sqrt(6. / (n_inputs + n_units*n_pieces))
295 | high=np.sqrt(6. / (n_inputs + n_units*n_pieces))
296 | W_values = np.asarray(self.rng.uniform(low=low,high=high,size=(n_inputs, n_units*n_pieces)),dtype=theano.config.floatX)
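# (Added note) these bounds are the Glorot & Bengio (2010) "Xavier" uniform initialization,
# sqrt(6 / (fan_in + fan_out)), with fan_out counting all n_units*n_pieces columns.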
297 | b_values = np.zeros((n_units*n_pieces), dtype=theano.config.floatX)
298 |
299 | # creation of shared symbolic variables
300 | # shared variables are the state of the built function
301 | # in practice, we put them in the GPU memory
302 | self.W = theano.shared(value=W_values, name='W')
303 | self.b = theano.shared(value=b_values, name='b')
304 |
305 | # momentum
306 | self.update_W = theano.shared(value=np.zeros((n_inputs, n_units*n_pieces), dtype=theano.config.floatX), name='update_W')
307 | self.update_b = theano.shared(value=b_values, name='update_b')
308 |
309 | # activation function
310 | def activation(self,z):
311 |
312 | y = T.reshape(z,(T.shape(z)[0], self.n_units, self.n_pieces))
313 |
314 | # maxout
315 | y = T.max(y,axis=2)
316 |
317 | y = T.reshape(y,(T.shape(z)[0],self.n_units))
318 |
319 | return y
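# (Added example) with n_units = 2 and n_pieces = 3, a row of z such as [a0, a1, a2, b0, b1, b2]
# is reshaped to [[a0, a1, a2], [b0, b1, b2]] and the layer outputs [max(a), max(b)]:
# each maxout unit is the maximum of its n_pieces linear pieces.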
320 |
321 | def activation_bprop(self):
322 |
323 | self.fixed_dEdz = apply_format(self.format,
324 | T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.y:self.fixed_dEdy})[0],
325 | self.comp_precision, self.dEdz_range)
326 |
327 | class SoftmaxLayer(dropout_layer):
328 |
329 | def __init__(self, rng, n_inputs, n_units, p, scale, max_col_norm, format,
330 | comp_precision, update_precision, initial_range, max_overflow):
331 |
332 | self.n_inputs = n_inputs
333 | self.n_units = n_units
334 |
335 | print " n_inputs = " + str(n_inputs)
336 | print " n_units = " + str(n_units)
337 |
338 | # call the parent class constructor
339 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format,
340 | comp_precision, update_precision, initial_range, max_overflow)
341 |
342 | # initial values of parameters
343 | W_values = np.zeros((n_inputs, n_units), dtype=theano.config.floatX)
344 | b_values = np.zeros(n_units, dtype=theano.config.floatX)
345 |
346 | # creation of shared symbolic variables
347 | self.W = theano.shared(value=W_values, name='W')
348 | self.b = theano.shared(value=b_values, name='b')
349 |
350 | # momentum
351 | self.update_W = theano.shared(value=W_values, name='update_W')
352 | self.update_b = theano.shared(value=b_values, name='update_b')
353 |
354 | # activation function
355 | def activation(self,z):
356 |
357 | return T.nnet.softmax(z)
358 |
359 | def activation_bprop(self):
360 |
361 | self.fixed_dEdz = apply_format(self.format, self.fixed_dEdy,
362 | self.comp_precision, self.dEdz_range)
363 |
364 | class Maxout_conv_layer(dropout_layer):
365 |
366 | def __init__(self, rng, image_shape, zero_pad, output_shape, filter_shape, filter_stride, n_pieces, pool_shape, pool_stride, p, scale, max_col_norm, format,
367 | comp_precision, update_precision, initial_range, max_overflow, w_LR_scale=1., b_LR_scale=1., partial_sum = 1):
368 |
369 | # call the parent class constructor
370 | dropout_layer.__init__(self, rng, p, scale, max_col_norm, format, comp_precision, update_precision, initial_range, max_overflow, w_LR_scale, b_LR_scale)
371 |
372 | print ' output_shape = ' +str(output_shape)
373 | print ' image_shape = ' +str(image_shape)
374 |
375 | # add n zeros on both sides of the input
376 | # zero_pad = 0 <-> valid convolution, the output is smaller
377 | # zero_pad = filter_size - 1 <-> full convolution, the output is bigger
378 | # valid convolution makes more sense to me; I use it to reduce the size of the feature maps without max pooling.
379 | print ' zero_pad = ' +str(zero_pad)
380 |
381 | # number of output feature maps, number of inputs feature maps, x, y
382 | # number of inputs feature maps is important for the weights
383 | print ' filter_shape = ' +str(filter_shape)
384 | print ' filter_stride = ' +str(filter_stride)
385 | print ' n_pieces = ' +str(n_pieces)
386 | print ' pool_shape = ' +str(pool_shape)
387 | print ' pool_stride = ' +str(pool_stride)
388 | print ' partial_sum = ' +str(partial_sum)
389 |
390 | # save the parameters
391 | self.output_shape = output_shape
392 | self.image_shape = image_shape
393 | self.zero_pad = zero_pad
394 | self.filter_shape = (filter_shape[0]*n_pieces,filter_shape[1],filter_shape[2],filter_shape[3])
395 | self.filter_stride = filter_stride
396 | self.n_pieces = n_pieces
397 | self.pool_shape = pool_shape
398 | self.pool_stride = pool_stride
399 | self.partial_sum = partial_sum
400 |
401 | # range of init
402 | fan_in = np.prod(self.filter_shape[1:])
403 | fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]) / self.n_pieces / np.prod(self.pool_shape))
404 |
405 | # initialize the weights with random values
406 | W_bound = np.sqrt(6. / (fan_in + fan_out))
407 | self.W = theano.shared(
408 | np.asarray(rng.uniform(low=-W_bound, high=W_bound, size=self.filter_shape),
409 | dtype=theano.config.floatX))
410 |
411 | # the bias is a 1D tensor -- one bias per output feature map
412 | b_values = np.zeros((self.filter_shape[0],), dtype=theano.config.floatX)
413 | self.b = theano.shared(value=b_values)
414 |
415 | self.update_W = theano.shared(value=np.zeros(self.filter_shape, dtype=theano.config.floatX), name='update_W')
416 | self.update_b = theano.shared(value=np.zeros((self.filter_shape[0],), dtype=theano.config.floatX), name='update_b')
417 |
418 | # activation function
419 | def activation(self,conv_out):
420 |
421 | conv_out = T.reshape(conv_out,(T.shape(conv_out)[0], T.shape(conv_out)[1]//self.n_pieces, self.n_pieces,T.shape(conv_out)[2],T.shape(conv_out)[3] ))
422 | return T.max( conv_out,axis=2)
423 |
424 | def fprop(self, input):
425 |
426 | # we reduce the precision of parameters for the computations
427 | self.w_comp = apply_format(self.format, self.W, self.comp_precision, self.w_range)
428 | self.b_comp = apply_format(self.format, self.b, self.comp_precision, self.b_range)
429 |
430 | input = input.reshape(self.image_shape)
431 |
432 | # convolution
433 | input_shuffled = input.dimshuffle(1, 2, 3, 0) # bc01 to c01b
434 | filters_shuffled = self.w_comp.dimshuffle(1, 2, 3, 0) *self.scale # bc01 to c01b
435 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad)
436 | contiguous_input = gpu_contiguous(input_shuffled)
437 | contiguous_filters = gpu_contiguous(filters_shuffled)
438 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)
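# (Added note) FilterActs from pylearn2's cuda-convnet wrappers expects the "c01b" layout
# (channels, rows, columns, batch) and contiguous GPU memory, hence the dimshuffles from
# Theano's default "bc01" layout and the gpu_contiguous calls around the convolution.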
439 |
440 | # downsample each feature map individually, using maxpooling
441 | # pooled_out = downsample.max_pool_2d(input=conv_out,
442 | # ds=poolsize, ignore_border=True)
443 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
444 | pooled_out_shuffled = pool_op(conv_out_shuffled)
445 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
446 |
447 | # bias
448 | pooled_out = apply_format(self.format, pooled_out + self.b_comp.dimshuffle('x', 0, 'x', 'x')*self.scale, self.comp_precision, self.z_range)
449 |
450 | # activation
451 | pooled_out = self.activation(pooled_out)
452 | pooled_out = apply_format(self.format, pooled_out.flatten(2), self.comp_precision, self.y_range)
453 |
454 | return pooled_out
455 |
456 | def dropout_fprop(self, input):
457 |
458 | # we reduce the precision of parameters for the computations
459 | self.fixed_W = apply_format(self.format, self.W, self.comp_precision, self.w_range)
460 | self.fixed_b = apply_format(self.format, self.b, self.comp_precision, self.b_range)
461 |
462 | # create the dropout mask
463 | # The cast is important because
464 | # int * float32 = float64 which pulls things off the gpu
465 |
466 | srng = T.shared_randomstreams.RandomStreams(self.rng.randint(999999))
467 | self.mask = T.cast(srng.binomial(n=1, p=self.p, size=T.shape(input)), theano.config.floatX)
468 | input = input * self.mask
469 |
470 | self.fixed_x = input.reshape(self.image_shape)
471 |
472 | # convolution
473 | input_shuffled = self.fixed_x.dimshuffle(1, 2, 3, 0) # bc01 to c01b
474 | filters_shuffled = self.fixed_W.dimshuffle(1, 2, 3, 0) # bc01 to c01b
475 | conv_op = FilterActs(stride=self.filter_stride, partial_sum=self.partial_sum,pad = self.zero_pad) # a larger partial_sum uses less memory but is slower
476 | contiguous_input = gpu_contiguous(input_shuffled)
477 | contiguous_filters = gpu_contiguous(filters_shuffled)
478 | conv_out_shuffled = conv_op(contiguous_input, contiguous_filters)
479 |
480 | self.z = conv_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
481 | self.fixed_z = apply_format(self.format, self.z, self.comp_precision, self.z_range)
482 |
483 | conv_out_shuffled = self.fixed_z.dimshuffle(1, 2, 3, 0) # bc01 to c01b
484 | conv_out_shuffled = gpu_contiguous(conv_out_shuffled)
485 |
486 | # downsample each feature map individually, using maxpooling
487 | # pooled_out = downsample.max_pool_2d(input=conv_out,
488 | # ds=poolsize, ignore_border=True)
489 | pool_op = MaxPool(ds=self.pool_shape, stride=self.pool_stride)
490 | pooled_out_shuffled = pool_op(conv_out_shuffled)
491 | pooled_out = pooled_out_shuffled.dimshuffle(3, 0, 1, 2) # c01b to bc01
492 |
493 | # bias
494 | self.u = pooled_out + self.fixed_b.dimshuffle('x', 0, 'x', 'x')
495 | self.fixed_u = apply_format(self.format, self.u, self.comp_precision, self.z_range)
496 |
497 | # activation
498 | self.y = self.activation(self.fixed_u).flatten(2)
499 | self.fixed_y = apply_format(self.format, self.y, self.comp_precision, self.y_range)
500 |
501 | return self.fixed_y
502 |
503 | def bprop(self, dEdy):
504 |
505 | self.fixed_dEdy = apply_format(self.format, dEdy.reshape(self.output_shape), self.comp_precision, self.dEdy_range)
506 |
507 | fixed_dEdu = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_u], known_grads={self.y:self.fixed_dEdy})[0], self.comp_precision,self.dEdz_range)
508 |
509 | self.fixed_dEdb = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_b], known_grads={self.u:fixed_dEdu})[0], self.comp_precision,self.dEdb_range)
510 |
511 | self.fixed_dEdz = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_z], known_grads={self.u:fixed_dEdu})[0], self.comp_precision, self.dEdz_range)
512 |
513 | self.fixed_dEdW = apply_format(self.format, T.grad(cost = None, wrt=[self.fixed_W], known_grads={self.z:self.fixed_dEdz})[0], self.comp_precision,self.dEdw_range)
514 |
515 | dEdx = T.grad(cost = None, wrt=[self.fixed_x], known_grads={self.z:self.fixed_dEdz})[0]
516 |
517 | dEdx = T.reshape(self.mask,T.shape(dEdx)) * dEdx
518 |
519 | return dEdx
--------------------------------------------------------------------------------