├── .gitignore
├── IMAGES.mat
├── MNIST_images.py
├── README.md
├── compute_numerical_gradient.py
├── image_patches.py
├── requirements.txt
├── softmax.py
├── softmax_test.py
├── sparse_autoencoder.py
├── sparse_autoencoder_test.py
├── stacked_autoencoder.py
├── stacked_autoencoder_test.py
└── visualize_network.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.png
3 | *.dat
4 | *.npy
5 | mnist/
6 |
--------------------------------------------------------------------------------
/IMAGES.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dusano/DeepLearning/b0c495aba27bffdf5ff0ee646ff70ca3827ae49d/IMAGES.mat
--------------------------------------------------------------------------------
/MNIST_images.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 |
4 |
5 | def loadMNISTImages(filename):
6 |     """loadMNISTImages returns a [number of MNIST images] x 784 matrix containing
7 |     the raw MNIST images, with pixel values rescaled to [0,1].
8 |     """
9 |
10 |     f = open(filename, 'rb')
11 |
12 |     assert f != -1, 'Could not open %s' % filename
13 |
14 |     magic = fromfile(f, dtype='>i4', count=1)
15 |     assert magic == 2051, 'Bad magic number in %s' % filename
16 |
17 |     numImages = fromfile(f, dtype='>i4', count=1)
18 |     numRows = fromfile(f, dtype='>i4', count=1)
19 |     numCols = fromfile(f, dtype='>i4', count=1)
20 |
21 |     images = fromfile(f, dtype='B')
22 |     images = images.reshape(numImages, numRows, numCols)
23 |
24 |     f.close()
25 |
26 |     # Reshape to #examples x #pixels
27 |     images = images.reshape(images.shape[0], images.shape[1]*images.shape[2])
28 |     # Convert to double and rescale to [0,1]
29 |     images = double(images) / 255
30 |
31 |     return images
32 |
33 |
34 | def loadMNISTLabels(filename):
35 |     """loadMNISTLabels returns a vector of length [number of MNIST images] containing
36 |     the labels for the MNIST images
37 |     """
38 |
39 |     f = open(filename, 'rb')
40 |     assert f != -1, 'Could not open %s' % filename
41 |
42 |     magic = fromfile(f, dtype='>i4', count=1)
43 |     assert magic == 2049, 'Bad magic number in %s' % filename
44 |
45 |     numLabels = fromfile(f, dtype='>i4', count=1)
46 |
47 |     labels = fromfile(f, dtype='B')
48 |
49 |     assert labels.shape[0] == numLabels, 'Mismatch in label count'
50 |
51 |     f.close()
52 |
53 |     return labels
54 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DeepLearning
2 | ============
3 |
4 | Python implementation of the UFLDL tutorial code (http://ufldl.stanford.edu/wiki/index.php/UFLDL_Tutorial)
5 |
6 | Installation
7 | ------------
8 |
9 | 1. Clone the repository
10 | 2. Set up a virtualenv
11 | 3. pip install -r requirements.txt
12 |
13 | To speed things up, install the Intel Math Kernel Library and fill in ~/.numpy-site.cfg before installing numpy (see http://stackoverflow.com/questions/13769936/supplying-numpy-site-cfg-arguments-to-pip for more information).
14 |
15 | Test
16 | ----
17 | After downloading and unpacking the MNIST data set into the mnist/ directory, running
18 |
19 | > python stacked_autoencoder_test.py
20 |
21 | should produce
22 |
23 | > Before Fine-tuning Test Accuracy: 92.180%
24 | >
25 | > After Fine-tuning Test Accuracy: 97.830%
26 |
27 | on the MNIST data set (http://yann.lecun.com/exdb/mnist/)
28 |
29 | Enjoy!
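
Gradient checks
---------------

Each cost function can also be gradient-checked on its own by running the corresponding module directly, e.g.

> python softmax.py

> python sparse_autoencoder.py

Each check prints the norm of the difference between the numerically and analytically computed gradients, which should be on the order of 1e-9 (1e-7 for the softmax check).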
30 | -------------------------------------------------------------------------------- /compute_numerical_gradient.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | 4 | def computeNumericalGradient(J, theta): 5 | 6 | numgrad = zeros(theta.shape) 7 | 8 | EPSILON = 1e-04 9 | 10 | bases = eye(numgrad.shape[0]) 11 | 12 | for i in range(numgrad.shape[0]): 13 | (value1, grad1) = J(theta + EPSILON*bases[:,i]) 14 | (value2, grad2) = J(theta - EPSILON*bases[:,i]) 15 | numgrad[i] = (value1 - value2) / (2*EPSILON) 16 | 17 | return numgrad 18 | 19 | 20 | if __name__ == "__main__": 21 | """ Check correctness of implemenation of computeNumericalGradient 22 | on an example of simple quadratic function 23 | """ 24 | 25 | def simpleQuadraticFunction(x): 26 | value = x[0]**2 + 3*x[0]*x[1] 27 | 28 | grad = zeros(2) 29 | grad[0] = 2*x[0] + 3*x[1] 30 | grad[1] = 3*x[0] 31 | 32 | return (value, grad) 33 | 34 | 35 | x = array([4, 10]).T 36 | 37 | (value, grad) = simpleQuadraticFunction(x); 38 | 39 | numgrad = computeNumericalGradient(simpleQuadraticFunction, x) 40 | 41 | diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad) 42 | 43 | print('%s' % diff) 44 | print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n') -------------------------------------------------------------------------------- /image_patches.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | import scipy.io 4 | 5 | 6 | def getPatches(numPatches, patchSize): 7 | 8 | images = scipy.io.loadmat('IMAGES.mat')['IMAGES'] 9 | 10 | patches = zeros((numPatches, patchSize*patchSize)) 11 | 12 | numImages = images.shape[2] 13 | imageIdxs = random.randint(numImages, size=numPatches) 14 | sortedImageIdxs = argsort(imageIdxs) 15 | 16 | lastImageIdx = -1 17 | for i in range(numPatches): 18 | imageIdx = imageIdxs[sortedImageIdxs[i]] 19 | if lastImageIdx != imageIdx: 20 | img = images[:,:,imageIdx] 21 | lastImageIdx = imageIdx 22 | 23 | x = random.randint(img.shape[0] - patchSize) 24 | y = random.randint(img.shape[1] - patchSize) 25 | 26 | patch = img[x:x+patchSize, y:y+patchSize] 27 | 28 | patches[sortedImageIdxs[i], :] = patch.reshape(1, patchSize*patchSize) 29 | 30 | # Remove DC (mean of images) 31 | patches = patches - mean(patches) 32 | 33 | # Truncate to +/-3 standard deviations and scale to -1 to 1 34 | pstd = 3 * std(patches) 35 | patches = maximum(minimum(patches, pstd), -pstd) / pstd 36 | 37 | # Rescale from [-1,1] to [0.1,0.9] 38 | patches = (patches + 1) * 0.4 + 0.1 39 | 40 | return patches -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PIL==1.1.7 2 | numpy==1.8.0 3 | scipy==0.13.2 4 | -------------------------------------------------------------------------------- /softmax.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | from scipy.sparse import * 4 | 5 | from compute_numerical_gradient import computeNumericalGradient 6 | 7 | 8 | def cost(thetaParam, numClasses, inputSize, lambdaParam, data, labels): 9 | """Compute the cost and gradient for softmax regression. 
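    The objective computed below is the regularized softmax negative log-likelihood

        J(theta) = -(1/M) * sum_i log( exp(theta_{labels[i]} . x_i) / sum_j exp(theta_j . x_i) )
                   + (lambdaParam/2) * sum(theta**2)

    where x_i = data[i,:] and M is the number of examples. The returned gradient
    is dJ/dtheta, flattened into a vector.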
10 |
11 |     Keyword arguments:
12 |     thetaParam -- a vector of parameters
13 |     numClasses -- the number of classes
14 |     inputSize -- the size N of the input vector
15 |     data -- the M x N input matrix, where each row data[i,:] corresponds to a single training example
16 |     lambdaParam -- weight decay parameter
17 |     labels -- a vector of length M containing the labels for the input data
18 |
19 |     """
20 |
21 |     # Unroll the parameters from theta
22 |     thetaParam = thetaParam.reshape(numClasses, inputSize)
23 |
24 |     m = data.shape[0]
25 |
26 |     groundTruth = csc_matrix( (ones(m),(labels,range(m))), shape=(numClasses,m) ).todense()
27 |     cost = 0
28 |
29 |     M = thetaParam.dot(data.T)
30 |     M = M - amax(M, 0)
31 |     h_data = exp(M)
32 |     h_data = h_data / sum(h_data, 0)
33 |
34 |     cost = -sum(multiply(groundTruth, log(h_data)))/m + lambdaParam/2 * sum(thetaParam**2)
35 |
36 |     thetaGrad = -((groundTruth - h_data).dot(data))/m + lambdaParam*thetaParam
37 |
38 |     return (cost, squeeze(array(thetaGrad.ravel())))
39 |
40 |
41 | def predict(thetaParam, data):
42 |     """Compute predictions using thetaParam
43 |
44 |     Keyword arguments:
45 |     thetaParam -- a numClasses x inputSize matrix of trained parameters
46 |     data -- the M x N input matrix, where each row data[i,:] corresponds to a single example
47 |     """
48 |     h_data = exp(thetaParam.dot(data.T))
49 |     h_data = h_data / sum(h_data, 0)
50 |     return argmax(h_data, axis=0)
51 |
52 |
53 | if __name__ == "__main__":
54 |     """ Check correctness of implementation of softmax cost function
55 |     using gradient check
56 |     """
57 |     numClasses = 10 # Number of classes (MNIST images fall into 10 classes)
58 |     lambdaParam = 1e-4 # Weight decay parameter
59 |     inputSize = 8
60 |     inputData = random.normal(size=(100,inputSize))
61 |     labels = random.randint(10, size=100)
62 |
63 |     def softmaxCostCallback(x):
64 |         return cost(x, numClasses, inputSize, lambdaParam, inputData, labels)
65 |
66 |     # Randomly initialise theta
67 |     thetaParam = 0.005 * random.normal(size=numClasses * inputSize)
68 |
69 |     (cost_value, grad) = softmaxCostCallback(thetaParam)
70 |
71 |     numGrad = computeNumericalGradient(softmaxCostCallback, thetaParam)
72 |
73 |     # Compare numerically computed gradients with those computed analytically
74 |     diff = linalg.norm(numGrad-grad)/linalg.norm(numGrad+grad)
75 |
76 |     print('%s' % diff)
77 |     print('Norm of the difference between numerical and analytical gradient (should be < 1e-7)\n\n')
78 |
--------------------------------------------------------------------------------
/softmax_test.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy import optimize
4 |
5 | from MNIST_images import loadMNISTImages, loadMNISTLabels
6 | import softmax
7 |
8 |
9 | inputSize = 28 * 28 # Size of input vector (MNIST images are 28x28)
10 | numClasses = 10 # Number of classes (MNIST images fall into 10 classes)
11 | lambdaParam = 1e-4 # Weight decay parameter
12 |
13 | trainData = loadMNISTImages('mnist/train-images-idx3-ubyte')
14 | trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte')
15 |
16 | def softmaxCostCallback(x):
17 |     return softmax.cost(x, numClasses, inputSize, lambdaParam, trainData, trainLabels)
18 |
19 | # Randomly initialise theta
20 | thetaParam = 0.005 * random.normal(size=numClasses * inputSize)
21 |
22 | options = {
23 |     'maxiter': 100,
24 |     'disp': True,
25 | }
26 |
27 | result = optimize.minimize(softmaxCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=options)
28 |
29 | optTheta = result.x[0:numClasses*inputSize].reshape(numClasses, inputSize)
30 |
31 | # Evaluating performance of the softmax classifier
32 | testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte')
33 | testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte')
34 |
35 | pred = softmax.predict(optTheta, testData)
36 |
37 | acc = mean(testLabels==pred)
38 | print('Accuracy: %0.3f%%\n' % (acc * 100))
39 |
--------------------------------------------------------------------------------
/sparse_autoencoder.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 |
4 | from compute_numerical_gradient import computeNumericalGradient
5 | from image_patches import getPatches
6 |
7 |
8 | def sigmoid(x):
9 |     return 1 / (1 + exp(-x))
10 |
11 |
12 | def feedForward(thetaParam, hiddenSize, visibleSize, data):
13 |     """Compute the activation of the hidden layer for the Sparse Autoencoder.
14 |
15 |     Keyword arguments:
16 |     thetaParam -- trained weights from the autoencoder
17 |     hiddenSize -- the number of hidden units (probably 25)
18 |     visibleSize -- the number of input units (probably 64)
19 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
20 |
21 |     """
22 |
23 |     W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
24 |     b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
25 |
26 |     return sigmoid(data.dot(W1.T) + b1)
27 |
28 |
29 | def cost(thetaParam, visibleSize, hiddenSize, lambdaParam, sparsityParam, betaParam, data, corruptionLevel=0.0):
30 |     """ Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
31 |     and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
32 |
33 |     Keyword arguments:
34 |     thetaParam -- a vector of parameters (W1, W2, b1, b2)
35 |     visibleSize -- the number of input units (probably 64)
36 |     hiddenSize -- the number of hidden units (probably 25)
37 |     lambdaParam -- weight decay parameter
38 |     sparsityParam -- the desired average activation for the hidden units
39 |     betaParam -- weight of sparsity penalty term
40 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
41 | corruptionLevel -- how much of the input will get corrupted (denoising autoencoder) 42 | 43 | """ 44 | 45 | W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize) 46 | W2 = thetaParam[hiddenSize*visibleSize:2*hiddenSize*visibleSize].reshape(visibleSize, hiddenSize) 47 | b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize] 48 | b2 = thetaParam[2*hiddenSize*visibleSize+hiddenSize:] 49 | 50 | m = data.shape[0] 51 | 52 | inputData = data 53 | # Corrupt input data (so that denoising autoencoder can fix it) 54 | if corruptionLevel > 0.0: 55 | corruptionMatrix = random.binomial(1,1-corruptionLevel, size=inputData.shape) 56 | inputData = inputData * corruptionMatrix 57 | 58 | # Forward propagation 59 | a2 = sigmoid(inputData.dot(W1.T) + b1) 60 | a3 = sigmoid(a2.dot(W2.T) + b2) 61 | 62 | # Back propagation 63 | mean_a2 = mean(a2,0) 64 | 65 | sparsity_delta = (-sparsityParam / mean_a2) + (1-sparsityParam)/(1-mean_a2) 66 | 67 | delta3 = -(data - a3) * (a3 * (1-a3)) 68 | delta2 = (delta3.dot(W2) + betaParam*sparsity_delta) * (a2 * (1-a2)) 69 | 70 | W1grad = (delta2.T.dot(inputData))/m + lambdaParam * W1 71 | b1grad = sum(delta2, 0)/m 72 | W2grad = (delta3.T.dot(a2))/m + lambdaParam * W2 73 | b2grad = sum(delta3, 0)/m 74 | 75 | cost = sum((a3 - data)**2)/2 76 | 77 | weight_decay = sum(W1**2) + sum(W2**2) 78 | 79 | sparsity_penalty = sparsityParam*log(sparsityParam/mean_a2) + \ 80 | (1-sparsityParam)*log((1-sparsityParam) / (1-mean_a2)) 81 | 82 | cost = cost/m + (lambdaParam/2) * weight_decay + betaParam * sum(sparsity_penalty) 83 | 84 | grad = concatenate([W1grad.ravel(), W2grad.ravel(), b1grad.ravel(), b2grad.ravel()]) 85 | 86 | return (cost, grad) 87 | 88 | 89 | def initializeParameters(hiddenSize, visibleSize): 90 | # Initialize parameters randomly based on layer sizes. 91 | 92 | # we'll choose weights uniformly from the interval [-r, r] 93 | r = sqrt(6) / sqrt(hiddenSize+visibleSize+1) 94 | 95 | W1 = random.rand(hiddenSize, visibleSize) * 2 * r - r; 96 | W2 = random.rand(visibleSize, hiddenSize) * 2 * r - r; 97 | 98 | b1 = zeros((hiddenSize, 1)); 99 | b2 = zeros((visibleSize, 1)); 100 | 101 | # Convert weights and bias gradients to the vector form. 102 | # This step will "unroll" (flatten and concatenate together) all 103 | # your parameters into a vector, which can then be used with minFunc. 104 | theta = concatenate([W1.ravel(), W2.ravel(), b1.ravel(), b2.ravel()]) 105 | 106 | return theta 107 | 108 | 109 | if __name__ == "__main__": 110 | """ Check correctness of implemenation of sparse_autoencoder cost function 111 | using gradient check 112 | """ 113 | patchSize=8 114 | visibleSize = patchSize*patchSize # number of input units 115 | hiddenSize = 25 # number of hidden units 116 | sparsityParam = 0.01 # desired average activation of the hidden units. 
117 |     lambdaParam = 0.0001 # weight decay parameter
118 |     betaParam = 3 # weight of sparsity penalty term
119 |
120 |     patches = getPatches(numPatches=10, patchSize=patchSize)
121 |
122 |     # Obtain random parameters theta
123 |     thetaParam = initializeParameters(hiddenSize, visibleSize)
124 |
125 |     def sparseAutoencoderCostCallback(x):
126 |         return cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
127 |                     betaParam, patches)
128 |
129 |     (cost_value, grad) = sparseAutoencoderCostCallback(thetaParam)
130 |
131 |     numgrad = computeNumericalGradient(sparseAutoencoderCostCallback, thetaParam)
132 |     diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad)
133 |
134 |     print('%s' % diff)
135 |     print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n')
--------------------------------------------------------------------------------
/sparse_autoencoder_test.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy import optimize
4 |
5 | from image_patches import getPatches
6 | from visualize_network import visualizeNetwork
7 | import sparse_autoencoder
8 |
9 | patchSize=8
10 | visibleSize = patchSize*patchSize # number of input units
11 | hiddenSize = 25 # number of hidden units
12 | sparsityParam = 0.01 # desired average activation of the hidden units.
13 | lambdaParam = 0.0001 # weight decay parameter
14 | betaParam = 3 # weight of sparsity penalty term
15 |
16 | def sparseAutoencoderCostCallback(x):
17 |     return sparse_autoencoder.cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
18 |                                    betaParam, patches)
19 |
20 | patches = getPatches(numPatches=10000, patchSize=patchSize)
21 |
22 | thetaParam = sparse_autoencoder.initializeParameters(hiddenSize, visibleSize)
23 |
24 | options = {
25 |     'maxiter': 400,
26 |     'disp': True,
27 | }
28 |
29 | result = optimize.minimize(sparseAutoencoderCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=options)
30 |
31 | W1 = result.x[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
32 |
33 | image_filename = 'images.png'
34 | print('Saving learned features to %s' % image_filename)
35 | visualizeNetwork(W1.T, image_filename)
36 |
--------------------------------------------------------------------------------
/stacked_autoencoder.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy.sparse import *
4 |
5 | from compute_numerical_gradient import computeNumericalGradient
6 | import sparse_autoencoder
7 |
8 |
9 | class Layer:
10 |
11 |     def __init__(self, num):
12 |         self.num = num
13 |         self.W = None
14 |         self.b = None
15 |
16 |
17 | class NetConfig:
18 |
19 |     def __init__(self):
20 |         self.inputSize = 0
21 |         self.layerSizes = []
22 |
23 |
24 | def stack2params(stack):
25 |     """Converts a "stack" structure into a flattened parameter vector and also
26 |     stores the network configuration. This is useful when working with
27 |     optimization routines such as scipy.optimize.minimize.
28 |
29 |     Keyword arguments:
30 |     stack -- the stack structure, where stack[0].W = weights of first layer
31 |              stack[0].b = biases of first layer
32 |              stack[1].W = weights of second layer
33 |              stack[1].b = biases of second layer
34 |              ... etc.
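    For illustration, a two-layer stack mirroring the self-test at the bottom
    of this file (4 inputs -> 3 hidden units -> 5 hidden units):

        stack = [Layer(1), Layer(2)]
        stack[0].W = 0.1 * random.normal(size=(3, 4))
        stack[0].b = zeros(3)
        stack[1].W = 0.1 * random.normal(size=(5, 3))
        stack[1].b = zeros(5)
        (params, netConfig) = stack2params(stack)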
35 |
36 |     """
37 |     params = array([])
38 |     netConfig = NetConfig()
39 |     if len(stack) > 0:
40 |         prev_layer = None
41 |         netConfig.inputSize = stack[0].W.shape[1]
42 |         for layer in stack:
43 |             assert layer.W.shape[0] == layer.b.shape[0], 'The bias should be a *column* vector of %i x1' % layer.W.shape[0]
44 |
45 |             if prev_layer is not None:
46 |                 assert prev_layer.W.shape[0] == layer.W.shape[1], \
47 |                     'The adjacent layers L%i and L%i should have matching sizes.' % (prev_layer.num, layer.num)
48 |
49 |             params = concatenate([params, layer.W.ravel(), layer.b.ravel()])
50 |             netConfig.layerSizes.append(layer.W.shape[0])
51 |
52 |             prev_layer = layer
53 |
54 |     return (params, netConfig)
55 |
56 |
57 | def params2stack(params, netConfig):
58 |     """Converts a flattened parameter vector into a nice "stack" structure
59 |     for us to work with. This is useful when you're building multilayer
60 |     networks.
61 |
62 |     Keyword arguments:
63 |     params -- flattened parameter vector
64 |     netConfig -- auxiliary variable containing the configuration of the network
65 |
66 |     """
67 |     stack = []
68 |     layerNum = 0
69 |     prevLayerSize = netConfig.inputSize
70 |     curPos = 0
71 |     for layerSize in netConfig.layerSizes:
72 |         layerNum += 1
73 |         layer = Layer(layerNum)
74 |
75 |         layer.W = params[curPos:curPos+layerSize * prevLayerSize].reshape(layerSize, prevLayerSize)
76 |         curPos += layerSize * prevLayerSize
77 |
78 |         layer.b = params[curPos:curPos+layerSize].ravel()
79 |         curPos += layerSize
80 |
81 |         prevLayerSize = layerSize
82 |
83 |         stack.append(layer)
84 |
85 |     return stack
86 |
87 |
88 | def cost(thetaParam, inputSize, hiddenSize, numClasses, netConfig, lambdaParam, data, labels, corruptionLevel=0.0):
89 |     """Takes the fine-tuning parameter vector (softmax weights followed by the stack parameters)
90 |     and a training data set with labels, and returns the cost and gradient. Used for fine-tuning.
91 |
92 |     Keyword arguments:
93 |     thetaParam -- parameter vector holding the softmax weights followed by the stack parameters
94 |     inputSize -- the number of input units
95 |     hiddenSize -- the number of hidden units *at the 2nd layer*
96 |     numClasses -- the number of categories
97 |     netConfig -- the network configuration of the stack
98 |     lambdaParam -- the weight regularization penalty
99 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
100 |     labels -- a vector containing labels, where labels[i] is the label for the i-th training example
101 |     corruptionLevel -- how much of the input will get corrupted (denoising autoencoder)
102 |
103 |     """
104 |
105 |     # First extract the softmax parameters and the layer stack from the flattened parameter vector
106 |     softmaxTheta = thetaParam[0:hiddenSize*numClasses].reshape(numClasses, hiddenSize)
107 |     stack = params2stack(thetaParam[hiddenSize*numClasses:], netConfig)
108 |
109 |     m = data.shape[0]
110 |     groundTruth = array(csc_matrix( (ones(m),(labels,range(m))), shape=(numClasses,m) ).todense())
111 |
112 |     activation = data
113 |
114 |     # Corrupt input data (so that denoising autoencoder can fix it)
115 |     if corruptionLevel > 0.0:
116 |         corruptionMatrix = random.binomial(1,1-corruptionLevel, size=activation.shape)
117 |         activation = activation * corruptionMatrix
118 |
119 |     # Forward propagation
120 |     activations = []
121 |     for layer in stack:
122 |         activations.append(activation)
123 |         activation = sparse_autoencoder.sigmoid(activation.dot(layer.W.T) + layer.b)
124 |
125 |     # Back propagation
126 |     M = softmaxTheta.dot(activation.T)
127 |     M = M - amax(M, 0)
128 |     h_data = exp(M)
129 |     h_data = h_data / sum(h_data, 0)
130 |
131 |     cost = -1.0/numClasses * sum(multiply(groundTruth, log(h_data))) + lambdaParam/2 * sum(softmaxTheta**2)
132 |     softmaxThetaGrad = -1.0/numClasses * ((groundTruth - h_data).dot(activation)) + lambdaParam*softmaxTheta
133 |
134 |     stackGrad = []
135 |     delta = multiply(-(softmaxTheta.T.dot(groundTruth - h_data)), (activation * (1-activation)).T)
136 |     idx = len(activations)
137 |     while activations != []:
138 |         activation = activations.pop()
139 |         layer = Layer(idx)
140 |         layer.W = (1.0/numClasses) * delta.dot(activation)
141 |         layer.b = (1.0/numClasses) * sum(delta, 1)
142 |         stackGrad.insert(0, layer)
143 |
144 |         delta = multiply(stack[idx-1].W.T.dot(delta), (activation * (1-activation)).T)
145 |
146 |         idx -= 1
147 |
148 |     (params, netConfig) = stack2params(stackGrad)
149 |     grad = concatenate([softmaxThetaGrad.ravel(), params])
150 |
151 |     return (cost, grad)
152 |
153 |
154 | def predict(thetaParam, inputSize, hiddenSize, numClasses, netConfig, data):
155 |     """Takes a trained theta and a test data set, and returns the predicted labels for each example.
156 |
157 |     Keyword arguments:
158 |     thetaParam -- trained parameter vector (softmax weights followed by the stack parameters)
159 |     inputSize -- the number of input units
160 |     hiddenSize -- the number of hidden units *at the 2nd layer*
161 |     numClasses -- the number of categories
162 |     netConfig -- configuration of the neural network
163 |     data -- a matrix containing the data as rows. So, data[i,:] is the i-th example.
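    For illustration, the call in stacked_autoencoder_test.py has the form:

        pred = predict(thetaParam, inputSize, hiddenSize, numClasses, netConfig, testData)
        accuracy = mean(testLabels == pred)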
164 | 165 | """ 166 | 167 | softmaxTheta = thetaParam[0:hiddenSize*numClasses].reshape(numClasses, hiddenSize) 168 | stack = params2stack(thetaParam[hiddenSize*numClasses:], netConfig) 169 | 170 | activation = data 171 | for layer in stack: 172 | activation = sparse_autoencoder.sigmoid(activation.dot(layer.W.T) + layer.b) 173 | 174 | h_data = exp(softmaxTheta.dot(activation.T)) 175 | h_data = h_data / sum(h_data, 0) 176 | return argmax(h_data, axis=0) 177 | 178 | 179 | if __name__ == "__main__": 180 | inputSize = 4 181 | hiddenSize = 5 182 | lambdaParam = 0.01 183 | data = random.normal(size=(5, inputSize)) 184 | labels = array([0, 1, 0, 1, 0]) 185 | numClasses = 2 186 | 187 | stack = [Layer(1), Layer(2)] 188 | stack[0].W = 0.1 * random.normal(size=(3, inputSize)) 189 | stack[0].b = zeros(3) 190 | stack[1].W = 0.1 * random.normal(size=(hiddenSize, 3)) 191 | stack[1].b = zeros(hiddenSize) 192 | 193 | softmaxTheta = 0.005 * random.normal(size=hiddenSize * numClasses) 194 | 195 | (stackParams, netConfig) = stack2params(stack) 196 | stackedAETheta = concatenate([softmaxTheta, stackParams]) 197 | 198 | def stackedAutoencoderCostCallback(x): 199 | return cost(x, inputSize, hiddenSize, numClasses, netConfig, 200 | lambdaParam, data, labels) 201 | 202 | (cost_value, grad) = stackedAutoencoderCostCallback(stackedAETheta) 203 | 204 | numgrad = computeNumericalGradient(stackedAutoencoderCostCallback, stackedAETheta) 205 | 206 | diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad) 207 | 208 | print('%s' % diff) 209 | print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n') 210 | -------------------------------------------------------------------------------- /stacked_autoencoder_test.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | import os 3 | 4 | from numpy import * 5 | from scipy import optimize 6 | 7 | from MNIST_images import loadMNISTImages, loadMNISTLabels 8 | from visualize_network import visualizeNetwork 9 | import sparse_autoencoder 10 | import softmax 11 | import stacked_autoencoder 12 | 13 | 14 | results_dir = 'results/dae1/' 15 | 16 | inputSize = 28 * 28 # MNIST inputs are 28x28 17 | numClasses = 10 # MNIST dataset consists of 10 digits 18 | hiddenSizeL1 = 200 # Layer 1 Hidden Size 19 | hiddenSizeL2 = 200 # Layer 2 Hidden Size 20 | sparsityParam = 0.1 # desired average activation of the hidden units. 
21 | lambdaParam = 3e-3 # weight decay parameter 22 | betaParam = 3 # weight of sparsity penalty term 23 | corruptionLevel = 0.1 # how much of the input will get corrupted (denoising autoencoder) 24 | 25 | if not os.path.exists(results_dir): 26 | os.makedirs(results_dir) 27 | 28 | trainData = loadMNISTImages('mnist/train-images-idx3-ubyte') 29 | trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte') 30 | 31 | # Train the first sparse autoencoder 32 | options = { 33 | 'maxiter': 400, 34 | 'disp': True, 35 | } 36 | 37 | sae1OptThetaFilename = results_dir + 'sae1OptTheta.npy' 38 | 39 | if os.path.exists(sae1OptThetaFilename): 40 | sae1OptTheta = load(sae1OptThetaFilename) 41 | else: 42 | def sparseAutoencoderCostCallbackL1(x): 43 | return sparse_autoencoder.cost(x, inputSize, hiddenSizeL1, lambdaParam, sparsityParam, 44 | betaParam, trainData, corruptionLevel) 45 | 46 | sae1Theta = sparse_autoencoder.initializeParameters(hiddenSizeL1, inputSize) 47 | result = optimize.minimize(sparseAutoencoderCostCallbackL1, sae1Theta, method='L-BFGS-B', jac=True, options=options) 48 | 49 | sae1OptTheta = result.x 50 | save(sae1OptThetaFilename, sae1OptTheta) 51 | 52 | W1 = sae1OptTheta[0:hiddenSizeL1*inputSize].reshape(hiddenSizeL1, inputSize) 53 | visualizeNetwork(W1.T, results_dir + 'sae1.png') 54 | 55 | # Train the second sparse autoencoder 56 | sae1Features = sparse_autoencoder.feedForward(sae1OptTheta, hiddenSizeL1, inputSize, trainData) 57 | 58 | sae2OptThetaFilename = results_dir + 'sae2OptTheta.npy' 59 | 60 | if os.path.exists(sae2OptThetaFilename): 61 | sae2OptTheta = load(sae2OptThetaFilename) 62 | else: 63 | def sparseAutoencoderCostCallbackL2(x): 64 | return sparse_autoencoder.cost(x, hiddenSizeL1, hiddenSizeL2, lambdaParam, sparsityParam, 65 | betaParam, sae1Features, corruptionLevel) 66 | 67 | sae2Theta = sparse_autoencoder.initializeParameters(hiddenSizeL2, hiddenSizeL1) 68 | result = optimize.minimize(sparseAutoencoderCostCallbackL2, sae2Theta, method='L-BFGS-B', jac=True, options=options) 69 | 70 | sae2OptTheta = result.x 71 | save(sae2OptThetaFilename, sae2OptTheta) 72 | 73 | # Train the softmax classifier 74 | saeSoftmaxOptThetaFilename = results_dir + 'saeSoftmaxOptTheta.npy' 75 | 76 | if os.path.exists(saeSoftmaxOptThetaFilename): 77 | saeSoftmaxOptTheta = load(saeSoftmaxOptThetaFilename) 78 | else: 79 | sae2Features = sparse_autoencoder.feedForward(sae2OptTheta, hiddenSizeL2, hiddenSizeL1, sae1Features) 80 | 81 | softmax_lambda = 1e-4 82 | 83 | def softmaxCostCallback(x): 84 | return softmax.cost(x, numClasses, hiddenSizeL2, softmax_lambda, sae2Features, trainLabels) 85 | 86 | # Randomly initialise theta 87 | thetaParam = 0.005 * random.normal(size=numClasses * hiddenSizeL2) 88 | 89 | softmax_options = { 90 | 'maxiter': 100, 91 | 'disp': False, 92 | } 93 | 94 | result = optimize.minimize(softmaxCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=softmax_options) 95 | 96 | saeSoftmaxOptTheta = result.x[0:numClasses*hiddenSizeL2] 97 | 98 | save(saeSoftmaxOptThetaFilename, saeSoftmaxOptTheta) 99 | 100 | # Finetune softmax model 101 | 102 | stack = [stacked_autoencoder.Layer(1), stacked_autoencoder.Layer(2)] 103 | stack[0].W = sae1OptTheta[0:hiddenSizeL1*inputSize].reshape(hiddenSizeL1, inputSize) 104 | stack[0].b = sae1OptTheta[2*hiddenSizeL1*inputSize:2*hiddenSizeL1*inputSize+hiddenSizeL1] 105 | stack[1].W = sae2OptTheta[0:hiddenSizeL2*hiddenSizeL1].reshape(hiddenSizeL2, hiddenSizeL1) 106 | stack[1].b = 
sae2OptTheta[2*hiddenSizeL2*hiddenSizeL1:2*hiddenSizeL2*hiddenSizeL1+hiddenSizeL2] 107 | 108 | (stackParams, netConfig) = stacked_autoencoder.stack2params(stack) 109 | stackedAETheta = concatenate([saeSoftmaxOptTheta, stackParams]) 110 | 111 | saeOptThetaFilename = results_dir + 'saeOptTheta.npy' 112 | 113 | if os.path.exists(saeOptThetaFilename): 114 | stackedAEOptTheta = load(saeOptThetaFilename) 115 | else: 116 | def stackedAutoencoderCostCallback(x): 117 | return stacked_autoencoder.cost(x, inputSize, hiddenSizeL2, numClasses, netConfig, 118 | lambdaParam, trainData, trainLabels, corruptionLevel) 119 | 120 | result = optimize.minimize(stackedAutoencoderCostCallback, stackedAETheta, method='L-BFGS-B', jac=True, options=options) 121 | 122 | stackedAEOptTheta = result.x 123 | save(saeOptThetaFilename, stackedAEOptTheta) 124 | 125 | # Test 126 | 127 | testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte') 128 | testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte') 129 | 130 | pred_no_fine_tuning = stacked_autoencoder.predict(stackedAETheta, inputSize, hiddenSizeL2, numClasses, netConfig, testData) 131 | 132 | acc_no_fine_tuning = mean(testLabels==pred_no_fine_tuning) 133 | print('Before Fine-tuning Test Accuracy: %0.3f%%\n' % (acc_no_fine_tuning * 100)) 134 | 135 | pred_fine_tuned = stacked_autoencoder.predict(stackedAEOptTheta, inputSize, hiddenSizeL2, numClasses, netConfig, testData) 136 | 137 | acc_fine_tuned = mean(testLabels==pred_fine_tuned) 138 | print('After Fine-tuning Test Accuracy: %0.3f%%\n' % (acc_fine_tuned * 100)) -------------------------------------------------------------------------------- /visualize_network.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | import scipy.misc 4 | 5 | def visualizeNetwork(A, filename): 6 | """This function visualizes filters in matrix A""" 7 | 8 | # rescale 9 | A = A - mean(A) 10 | 11 | # compute rows, cols 12 | L = A.shape[0] 13 | M = A.shape[1] 14 | sz = int(sqrt(L)) 15 | buf = 1 16 | if floor(sqrt(M))**2 != M: 17 | n = ceil(sqrt(M)) 18 | while ((M % n) !=0) and (n < 1.2*sqrt(M)): 19 | n += 1 20 | m = ceil(M/n) 21 | else: 22 | n=sqrt(M); 23 | m=n; 24 | 25 | array = -ones((buf+m*(sz+buf), buf+n*(sz+buf))) 26 | k = 0 27 | for i in range(int(m)): 28 | for j in range(int(n)): 29 | if k >= M: 30 | continue 31 | 32 | clim = max(abs(A[:,k])) 33 | array[buf+i*(sz+buf):buf+i*(sz+buf)+sz, buf+j*(sz+buf):buf+j*(sz+buf)+sz] = A[:,k].reshape(sz,sz)/clim 34 | k += 1 35 | 36 | scipy.misc.toimage(array).save(filename) 37 | 38 | return 39 | --------------------------------------------------------------------------------