├── .gitignore
├── IMAGES.mat
├── MNIST_images.py
├── README.md
├── compute_numerical_gradient.py
├── image_patches.py
├── requirements.txt
├── softmax.py
├── softmax_test.py
├── sparse_autoencoder.py
├── sparse_autoencoder_test.py
├── stacked_autoencoder.py
├── stacked_autoencoder_test.py
└── visualize_network.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.png
3 | *.dat
4 | *.npy
5 | mnist/
6 |
--------------------------------------------------------------------------------
/IMAGES.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dusano/DeepLearning/b0c495aba27bffdf5ff0ee646ff70ca3827ae49d/IMAGES.mat
--------------------------------------------------------------------------------
/MNIST_images.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 |
4 |
5 | def loadMNISTImages(filename):
6 |     """loadMNISTImages returns a [number of MNIST images] x 784 matrix containing
7 |     the raw MNIST images, with pixel values rescaled to [0,1].
8 |     """
9 |
10 |     f = open(filename, 'rb')
11 |
12 |     assert f != -1, 'Could not open %s' % filename
13 |
14 |     magic = fromfile(f, dtype='>i4', count=1)
15 |     assert magic == 2051, 'Bad magic number in %s' % filename
16 |
17 |     numImages = fromfile(f, dtype='>i4', count=1)
18 |     numRows = fromfile(f, dtype='>i4', count=1)
19 |     numCols = fromfile(f, dtype='>i4', count=1)
20 |
21 |     images = fromfile(f, dtype='B')
22 |     images = images.reshape(numImages, numRows, numCols)
23 |
24 |     f.close()
25 |
26 |     # Reshape to #examples x #pixels
27 |     images = images.reshape(images.shape[0], images.shape[1]*images.shape[2])
28 |     # Convert to double and rescale to [0,1]
29 |     images = double(images) / 255
30 |
31 |     return images
32 |
33 |
34 | def loadMNISTLabels(filename):
35 |     """loadMNISTLabels returns a vector of length [number of MNIST images] containing
36 |     the labels for the MNIST images
37 |     """
38 |
39 |     f = open(filename, 'rb')
40 |     assert f != -1, 'Could not open %s' % filename
41 |
42 |     magic = fromfile(f, dtype='>i4', count=1)
43 |     assert magic == 2049, 'Bad magic number in %s' % filename
44 |
45 |     numLabels = fromfile(f, dtype='>i4', count=1)
46 |
47 |     labels = fromfile(f, dtype='B')
48 |
49 |     assert labels.shape[0] == numLabels, 'Mismatch in label count'
50 |
51 |     f.close()
52 |
53 |     return labels
54 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DeepLearning
2 | ============
3 |
4 | Python implementation of the UFLDL tutorial code (http://ufldl.stanford.edu/wiki/index.php/UFLDL_Tutorial)
5 |
6 | Installation
7 | ------------
8 |
9 | 1. Clone the repository
10 | 2. Set up a virtualenv
11 | 3. pip install -r requirements.txt
12 |
13 | To speed things up, install the Intel Math Kernel Library and fill in ~/.numpy-site.cfg before installing numpy (see http://stackoverflow.com/questions/13769936/supplying-numpy-site-cfg-arguments-to-pip for more information).
14 |
15 | Test
16 | ----
17 | After downloading and unpacking the MNIST data set into the mnist/ directory, running
18 |
19 | > python stacked_autoencoder_test.py
20 |
21 | should produce
22 |
23 | > Before Fine-tuning Test Accuracy: 92.180%
24 | >
25 | > After Fine-tuning Test Accuracy: 97.830%
26 |
27 | on the MNIST data set (http://yann.lecun.com/exdb/mnist/)
28 |
29 | Enjoy!
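
Gradient checks
---------------

Each cost function can also be gradient-checked on its own by running the corresponding module directly, e.g.

> python softmax.py

> python sparse_autoencoder.py

Each check prints the norm of the difference between the numerically and analytically computed gradients, which should be on the order of 1e-9 (1e-7 for the softmax check).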
30 | -------------------------------------------------------------------------------- /compute_numerical_gradient.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | 4 | def computeNumericalGradient(J, theta): 5 | 6 | numgrad = zeros(theta.shape) 7 | 8 | EPSILON = 1e-04 9 | 10 | bases = eye(numgrad.shape[0]) 11 | 12 | for i in range(numgrad.shape[0]): 13 | (value1, grad1) = J(theta + EPSILON*bases[:,i]) 14 | (value2, grad2) = J(theta - EPSILON*bases[:,i]) 15 | numgrad[i] = (value1 - value2) / (2*EPSILON) 16 | 17 | return numgrad 18 | 19 | 20 | if __name__ == "__main__": 21 | """ Check correctness of implemenation of computeNumericalGradient 22 | on an example of simple quadratic function 23 | """ 24 | 25 | def simpleQuadraticFunction(x): 26 | value = x[0]**2 + 3*x[0]*x[1] 27 | 28 | grad = zeros(2) 29 | grad[0] = 2*x[0] + 3*x[1] 30 | grad[1] = 3*x[0] 31 | 32 | return (value, grad) 33 | 34 | 35 | x = array([4, 10]).T 36 | 37 | (value, grad) = simpleQuadraticFunction(x); 38 | 39 | numgrad = computeNumericalGradient(simpleQuadraticFunction, x) 40 | 41 | diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad) 42 | 43 | print('%s' % diff) 44 | print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n') -------------------------------------------------------------------------------- /image_patches.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | import scipy.io 4 | 5 | 6 | def getPatches(numPatches, patchSize): 7 | 8 | images = scipy.io.loadmat('IMAGES.mat')['IMAGES'] 9 | 10 | patches = zeros((numPatches, patchSize*patchSize)) 11 | 12 | numImages = images.shape[2] 13 | imageIdxs = random.randint(numImages, size=numPatches) 14 | sortedImageIdxs = argsort(imageIdxs) 15 | 16 | lastImageIdx = -1 17 | for i in range(numPatches): 18 | imageIdx = imageIdxs[sortedImageIdxs[i]] 19 | if lastImageIdx != imageIdx: 20 | img = images[:,:,imageIdx] 21 | lastImageIdx = imageIdx 22 | 23 | x = random.randint(img.shape[0] - patchSize) 24 | y = random.randint(img.shape[1] - patchSize) 25 | 26 | patch = img[x:x+patchSize, y:y+patchSize] 27 | 28 | patches[sortedImageIdxs[i], :] = patch.reshape(1, patchSize*patchSize) 29 | 30 | # Remove DC (mean of images) 31 | patches = patches - mean(patches) 32 | 33 | # Truncate to +/-3 standard deviations and scale to -1 to 1 34 | pstd = 3 * std(patches) 35 | patches = maximum(minimum(patches, pstd), -pstd) / pstd 36 | 37 | # Rescale from [-1,1] to [0.1,0.9] 38 | patches = (patches + 1) * 0.4 + 0.1 39 | 40 | return patches -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PIL==1.1.7 2 | numpy==1.8.0 3 | scipy==0.13.2 4 | -------------------------------------------------------------------------------- /softmax.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | from scipy.sparse import * 4 | 5 | from compute_numerical_gradient import computeNumericalGradient 6 | 7 | 8 | def cost(thetaParam, numClasses, inputSize, lambdaParam, data, labels): 9 | """Compute the cost and gradient for softmax regression. 
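    The objective computed below is the regularized softmax negative log-likelihood

        J(theta) = -(1/M) * sum_i log( exp(theta_{labels[i]} . x_i) / sum_j exp(theta_j . x_i) )
                   + (lambdaParam/2) * sum(theta**2)

    where x_i = data[i,:] and M is the number of examples. The returned gradient
    is dJ/dtheta, flattened into a vector.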
10 |
11 |     Keyword arguments:
12 |     thetaParam -- a vector of parameters
13 |     numClasses -- the number of classes
14 |     inputSize -- the size N of the input vector
15 |     data -- the M x N input matrix, where each row data[i,:] corresponds to a single training example
16 |     lambdaParam -- weight decay parameter
17 |     labels -- a vector of length M containing the labels for the input data
18 |
19 |     """
20 |
21 |     # Unroll the parameters from theta
22 |     thetaParam = thetaParam.reshape(numClasses, inputSize)
23 |
24 |     m = data.shape[0]
25 |
26 |     groundTruth = csc_matrix( (ones(m),(labels,range(m))), shape=(numClasses,m) ).todense()
27 |     cost = 0
28 |
29 |     M = thetaParam.dot(data.T)
30 |     M = M - amax(M, 0)
31 |     h_data = exp(M)
32 |     h_data = h_data / sum(h_data, 0)
33 |
34 |     cost = -sum(multiply(groundTruth, log(h_data)))/m + lambdaParam/2 * sum(thetaParam**2)
35 |
36 |     thetaGrad = -((groundTruth - h_data).dot(data))/m + lambdaParam*thetaParam
37 |
38 |     return (cost, squeeze(array(thetaGrad.ravel())))
39 |
40 |
41 | def predict(thetaParam, data):
42 |     """Compute predictions using thetaParam
43 |
44 |     Keyword arguments:
45 |     thetaParam -- a numClasses x inputSize matrix of trained parameters
46 |     data -- the M x N input matrix, where each row data[i,:] corresponds to a single example
47 |     """
48 |     h_data = exp(thetaParam.dot(data.T))
49 |     h_data = h_data / sum(h_data, 0)
50 |     return argmax(h_data, axis=0)
51 |
52 |
53 | if __name__ == "__main__":
54 |     """ Check correctness of implementation of softmax cost function
55 |     using gradient check
56 |     """
57 |     numClasses = 10 # Number of classes (MNIST images fall into 10 classes)
58 |     lambdaParam = 1e-4 # Weight decay parameter
59 |     inputSize = 8
60 |     inputData = random.normal(size=(100,inputSize))
61 |     labels = random.randint(10, size=100)
62 |
63 |     def softmaxCostCallback(x):
64 |         return cost(x, numClasses, inputSize, lambdaParam, inputData, labels)
65 |
66 |     # Randomly initialise theta
67 |     thetaParam = 0.005 * random.normal(size=numClasses * inputSize)
68 |
69 |     (cost_value, grad) = softmaxCostCallback(thetaParam)
70 |
71 |     numGrad = computeNumericalGradient(softmaxCostCallback, thetaParam)
72 |
73 |     # Compare numerically computed gradients with those computed analytically
74 |     diff = linalg.norm(numGrad-grad)/linalg.norm(numGrad+grad)
75 |
76 |     print('%s' % diff)
77 |     print('Norm of the difference between numerical and analytical gradient (should be < 1e-7)\n\n')
78 |
--------------------------------------------------------------------------------
/softmax_test.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy import optimize
4 |
5 | from MNIST_images import loadMNISTImages, loadMNISTLabels
6 | import softmax
7 |
8 |
9 | inputSize = 28 * 28 # Size of input vector (MNIST images are 28x28)
10 | numClasses = 10 # Number of classes (MNIST images fall into 10 classes)
11 | lambdaParam = 1e-4 # Weight decay parameter
12 |
13 | trainData = loadMNISTImages('mnist/train-images-idx3-ubyte')
14 | trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte')
15 |
16 | def softmaxCostCallback(x):
17 |     return softmax.cost(x, numClasses, inputSize, lambdaParam, trainData, trainLabels)
18 |
19 | # Randomly initialise theta
20 | thetaParam = 0.005 * random.normal(size=numClasses * inputSize)
21 |
22 | options = {
23 |     'maxiter': 100,
24 |     'disp': True,
25 | }
26 |
27 | result = optimize.minimize(softmaxCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=options)
28 |
29 | optTheta = result.x[0:numClasses*inputSize].reshape(numClasses, inputSize)
30 |
31 | # Evaluating performance of the softmax classifier
32 | testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte')
33 | testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte')
34 |
35 | pred = softmax.predict(optTheta, testData)
36 |
37 | acc = mean(testLabels==pred)
38 | print('Accuracy: %0.3f%%\n' % (acc * 100))
39 |
--------------------------------------------------------------------------------
/sparse_autoencoder.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 |
4 | from compute_numerical_gradient import computeNumericalGradient
5 | from image_patches import getPatches
6 |
7 |
8 | def sigmoid(x):
9 |     return 1 / (1 + exp(-x))
10 |
11 |
12 | def feedForward(thetaParam, hiddenSize, visibleSize, data):
13 |     """Compute the activation of the hidden layer for the Sparse Autoencoder.
14 |
15 |     Keyword arguments:
16 |     thetaParam -- trained weights from the autoencoder
17 |     hiddenSize -- the number of hidden units (probably 25)
18 |     visibleSize -- the number of input units (probably 64)
19 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
20 |
21 |     """
22 |
23 |     W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
24 |     b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize]
25 |
26 |     return sigmoid(data.dot(W1.T) + b1)
27 |
28 |
29 | def cost(thetaParam, visibleSize, hiddenSize, lambdaParam, sparsityParam, betaParam, data, corruptionLevel=0.0):
30 |     """ Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
31 |     and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
32 |
33 |     Keyword arguments:
34 |     thetaParam -- a vector of parameters (W1, W2, b1, b2)
35 |     visibleSize -- the number of input units (probably 64)
36 |     hiddenSize -- the number of hidden units (probably 25)
37 |     lambdaParam -- weight decay parameter
38 |     sparsityParam -- the desired average activation for the hidden units
39 |     betaParam -- weight of sparsity penalty term
40 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
41 | corruptionLevel -- how much of the input will get corrupted (denoising autoencoder) 42 | 43 | """ 44 | 45 | W1 = thetaParam[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize) 46 | W2 = thetaParam[hiddenSize*visibleSize:2*hiddenSize*visibleSize].reshape(visibleSize, hiddenSize) 47 | b1 = thetaParam[2*hiddenSize*visibleSize:2*hiddenSize*visibleSize+hiddenSize] 48 | b2 = thetaParam[2*hiddenSize*visibleSize+hiddenSize:] 49 | 50 | m = data.shape[0] 51 | 52 | inputData = data 53 | # Corrupt input data (so that denoising autoencoder can fix it) 54 | if corruptionLevel > 0.0: 55 | corruptionMatrix = random.binomial(1,1-corruptionLevel, size=inputData.shape) 56 | inputData = inputData * corruptionMatrix 57 | 58 | # Forward propagation 59 | a2 = sigmoid(inputData.dot(W1.T) + b1) 60 | a3 = sigmoid(a2.dot(W2.T) + b2) 61 | 62 | # Back propagation 63 | mean_a2 = mean(a2,0) 64 | 65 | sparsity_delta = (-sparsityParam / mean_a2) + (1-sparsityParam)/(1-mean_a2) 66 | 67 | delta3 = -(data - a3) * (a3 * (1-a3)) 68 | delta2 = (delta3.dot(W2) + betaParam*sparsity_delta) * (a2 * (1-a2)) 69 | 70 | W1grad = (delta2.T.dot(inputData))/m + lambdaParam * W1 71 | b1grad = sum(delta2, 0)/m 72 | W2grad = (delta3.T.dot(a2))/m + lambdaParam * W2 73 | b2grad = sum(delta3, 0)/m 74 | 75 | cost = sum((a3 - data)**2)/2 76 | 77 | weight_decay = sum(W1**2) + sum(W2**2) 78 | 79 | sparsity_penalty = sparsityParam*log(sparsityParam/mean_a2) + \ 80 | (1-sparsityParam)*log((1-sparsityParam) / (1-mean_a2)) 81 | 82 | cost = cost/m + (lambdaParam/2) * weight_decay + betaParam * sum(sparsity_penalty) 83 | 84 | grad = concatenate([W1grad.ravel(), W2grad.ravel(), b1grad.ravel(), b2grad.ravel()]) 85 | 86 | return (cost, grad) 87 | 88 | 89 | def initializeParameters(hiddenSize, visibleSize): 90 | # Initialize parameters randomly based on layer sizes. 91 | 92 | # we'll choose weights uniformly from the interval [-r, r] 93 | r = sqrt(6) / sqrt(hiddenSize+visibleSize+1) 94 | 95 | W1 = random.rand(hiddenSize, visibleSize) * 2 * r - r; 96 | W2 = random.rand(visibleSize, hiddenSize) * 2 * r - r; 97 | 98 | b1 = zeros((hiddenSize, 1)); 99 | b2 = zeros((visibleSize, 1)); 100 | 101 | # Convert weights and bias gradients to the vector form. 102 | # This step will "unroll" (flatten and concatenate together) all 103 | # your parameters into a vector, which can then be used with minFunc. 104 | theta = concatenate([W1.ravel(), W2.ravel(), b1.ravel(), b2.ravel()]) 105 | 106 | return theta 107 | 108 | 109 | if __name__ == "__main__": 110 | """ Check correctness of implemenation of sparse_autoencoder cost function 111 | using gradient check 112 | """ 113 | patchSize=8 114 | visibleSize = patchSize*patchSize # number of input units 115 | hiddenSize = 25 # number of hidden units 116 | sparsityParam = 0.01 # desired average activation of the hidden units. 
117 |     lambdaParam = 0.0001 # weight decay parameter
118 |     betaParam = 3 # weight of sparsity penalty term
119 |
120 |     patches = getPatches(numPatches=10, patchSize=patchSize)
121 |
122 |     # Obtain random parameters theta
123 |     thetaParam = initializeParameters(hiddenSize, visibleSize)
124 |
125 |     def sparseAutoencoderCostCallback(x):
126 |         return cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
127 |                     betaParam, patches)
128 |
129 |     (cost_value, grad) = sparseAutoencoderCostCallback(thetaParam)
130 |
131 |     numgrad = computeNumericalGradient(sparseAutoencoderCostCallback, thetaParam)
132 |     diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad)
133 |
134 |     print('%s' % diff)
135 |     print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n')
--------------------------------------------------------------------------------
/sparse_autoencoder_test.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy import optimize
4 |
5 | from image_patches import getPatches
6 | from visualize_network import visualizeNetwork
7 | import sparse_autoencoder
8 |
9 | patchSize=8
10 | visibleSize = patchSize*patchSize # number of input units
11 | hiddenSize = 25 # number of hidden units
12 | sparsityParam = 0.01 # desired average activation of the hidden units.
13 | lambdaParam = 0.0001 # weight decay parameter
14 | betaParam = 3 # weight of sparsity penalty term
15 |
16 | def sparseAutoencoderCostCallback(x):
17 |     return sparse_autoencoder.cost(x, visibleSize, hiddenSize, lambdaParam, sparsityParam,
18 |                                    betaParam, patches)
19 |
20 | patches = getPatches(numPatches=10000, patchSize=patchSize)
21 |
22 | thetaParam = sparse_autoencoder.initializeParameters(hiddenSize, visibleSize)
23 |
24 | options = {
25 |     'maxiter': 400,
26 |     'disp': True,
27 | }
28 |
29 | result = optimize.minimize(sparseAutoencoderCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=options)
30 |
31 | W1 = result.x[0:hiddenSize*visibleSize].reshape(hiddenSize, visibleSize)
32 |
33 | image_filename = 'images.png'
34 | print('Saving learned features to %s' % image_filename)
35 | visualizeNetwork(W1.T, image_filename)
36 |
--------------------------------------------------------------------------------
/stacked_autoencoder.py:
--------------------------------------------------------------------------------
1 | # Based on CS294A/CS294W Programming Assignment Starter Code
2 | from numpy import *
3 | from scipy.sparse import *
4 |
5 | from compute_numerical_gradient import computeNumericalGradient
6 | import sparse_autoencoder
7 |
8 |
9 | class Layer:
10 |
11 |     def __init__(self, num):
12 |         self.num = num
13 |         self.W = None
14 |         self.b = None
15 |
16 |
17 | class NetConfig:
18 |
19 |     def __init__(self):
20 |         self.inputSize = 0
21 |         self.layerSizes = []
22 |
23 |
24 | def stack2params(stack):
25 |     """Converts a "stack" structure into a flattened parameter vector and also
26 |     stores the network configuration. This is useful when working with
27 |     optimization routines such as scipy.optimize.minimize.
28 |
29 |     Keyword arguments:
30 |     stack -- the stack structure, where stack[0].W = weights of first layer
31 |              stack[0].b = biases of first layer
32 |              stack[1].W = weights of second layer
33 |              stack[1].b = biases of second layer
34 |              ... etc.
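    For illustration, a two-layer stack mirroring the self-test at the bottom
    of this file (4 inputs -> 3 hidden units -> 5 hidden units):

        stack = [Layer(1), Layer(2)]
        stack[0].W = 0.1 * random.normal(size=(3, 4))
        stack[0].b = zeros(3)
        stack[1].W = 0.1 * random.normal(size=(5, 3))
        stack[1].b = zeros(5)
        (params, netConfig) = stack2params(stack)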
35 |
36 |     """
37 |     params = array([])
38 |     netConfig = NetConfig()
39 |     if len(stack) > 0:
40 |         prev_layer = None
41 |         netConfig.inputSize = stack[0].W.shape[1]
42 |         for layer in stack:
43 |             assert layer.W.shape[0] == layer.b.shape[0], 'The bias should be a *column* vector of %i x1' % layer.W.shape[0]
44 |
45 |             if prev_layer is not None:
46 |                 assert prev_layer.W.shape[0] == layer.W.shape[1], \
47 |                     'The adjacent layers L%i and L%i should have matching sizes.' % (prev_layer.num, layer.num)
48 |
49 |             params = concatenate([params, layer.W.ravel(), layer.b.ravel()])
50 |             netConfig.layerSizes.append(layer.W.shape[0])
51 |
52 |             prev_layer = layer
53 |
54 |     return (params, netConfig)
55 |
56 |
57 | def params2stack(params, netConfig):
58 |     """Converts a flattened parameter vector into a nice "stack" structure
59 |     for us to work with. This is useful when you're building multilayer
60 |     networks.
61 |
62 |     Keyword arguments:
63 |     params -- flattened parameter vector
64 |     netConfig -- auxiliary variable containing the configuration of the network
65 |
66 |     """
67 |     stack = []
68 |     layerNum = 0
69 |     prevLayerSize = netConfig.inputSize
70 |     curPos = 0
71 |     for layerSize in netConfig.layerSizes:
72 |         layerNum += 1
73 |         layer = Layer(layerNum)
74 |
75 |         layer.W = params[curPos:curPos+layerSize * prevLayerSize].reshape(layerSize, prevLayerSize)
76 |         curPos += layerSize * prevLayerSize
77 |
78 |         layer.b = params[curPos:curPos+layerSize].ravel()
79 |         curPos += layerSize
80 |
81 |         prevLayerSize = layerSize
82 |
83 |         stack.append(layer)
84 |
85 |     return stack
86 |
87 |
88 | def cost(thetaParam, inputSize, hiddenSize, numClasses, netConfig, lambdaParam, data, labels, corruptionLevel=0.0):
89 |     """Takes the fine-tuning parameter vector (softmax weights followed by the stack parameters)
90 |     and a training data set with labels, and returns the cost and gradient. Used for fine-tuning.
91 |
92 |     Keyword arguments:
93 |     thetaParam -- parameter vector holding the softmax weights followed by the stack parameters
94 |     inputSize -- the number of input units
95 |     hiddenSize -- the number of hidden units *at the 2nd layer*
96 |     numClasses -- the number of categories
97 |     netConfig -- the network configuration of the stack
98 |     lambdaParam -- the weight regularization penalty
99 |     data -- a matrix containing the training data as rows. So, data[i,:] is the i-th training example.
100 |     labels -- a vector containing labels, where labels[i] is the label for the i-th training example
101 |     corruptionLevel -- how much of the input will get corrupted (denoising autoencoder)
102 |
103 |     """
104 |
105 |     # First extract the softmax parameters and the layer stack from the flattened parameter vector
106 |     softmaxTheta = thetaParam[0:hiddenSize*numClasses].reshape(numClasses, hiddenSize)
107 |     stack = params2stack(thetaParam[hiddenSize*numClasses:], netConfig)
108 |
109 |     m = data.shape[0]
110 |     groundTruth = array(csc_matrix( (ones(m),(labels,range(m))), shape=(numClasses,m) ).todense())
111 |
112 |     activation = data
113 |
114 |     # Corrupt input data (so that denoising autoencoder can fix it)
115 |     if corruptionLevel > 0.0:
116 |         corruptionMatrix = random.binomial(1,1-corruptionLevel, size=activation.shape)
117 |         activation = activation * corruptionMatrix
118 |
119 |     # Forward propagation
120 |     activations = []
121 |     for layer in stack:
122 |         activations.append(activation)
123 |         activation = sparse_autoencoder.sigmoid(activation.dot(layer.W.T) + layer.b)
124 |
125 |     # Back propagation
126 |     M = softmaxTheta.dot(activation.T)
127 |     M = M - amax(M, 0)
128 |     h_data = exp(M)
129 |     h_data = h_data / sum(h_data, 0)
130 |
131 |     cost = -1.0/numClasses * sum(multiply(groundTruth, log(h_data))) + lambdaParam/2 * sum(softmaxTheta**2)
132 |     softmaxThetaGrad = -1.0/numClasses * ((groundTruth - h_data).dot(activation)) + lambdaParam*softmaxTheta
133 |
134 |     stackGrad = []
135 |     delta = multiply(-(softmaxTheta.T.dot(groundTruth - h_data)), (activation * (1-activation)).T)
136 |     idx = len(activations)
137 |     while activations != []:
138 |         activation = activations.pop()
139 |         layer = Layer(idx)
140 |         layer.W = (1.0/numClasses) * delta.dot(activation)
141 |         layer.b = (1.0/numClasses) * sum(delta, 1)
142 |         stackGrad.insert(0, layer)
143 |
144 |         delta = multiply(stack[idx-1].W.T.dot(delta), (activation * (1-activation)).T)
145 |
146 |         idx -= 1
147 |
148 |     (params, netConfig) = stack2params(stackGrad)
149 |     grad = concatenate([softmaxThetaGrad.ravel(), params])
150 |
151 |     return (cost, grad)
152 |
153 |
154 | def predict(thetaParam, inputSize, hiddenSize, numClasses, netConfig, data):
155 |     """Takes a trained theta and a test data set, and returns the predicted labels for each example.
156 |
157 |     Keyword arguments:
158 |     thetaParam -- trained parameter vector (softmax weights followed by the stack parameters)
159 |     inputSize -- the number of input units
160 |     hiddenSize -- the number of hidden units *at the 2nd layer*
161 |     numClasses -- the number of categories
162 |     netConfig -- configuration of the neural network
163 |     data -- a matrix containing the data as rows. So, data[i,:] is the i-th example.
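    For illustration, the call in stacked_autoencoder_test.py has the form:

        pred = predict(thetaParam, inputSize, hiddenSize, numClasses, netConfig, testData)
        accuracy = mean(testLabels == pred)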
164 | 165 | """ 166 | 167 | softmaxTheta = thetaParam[0:hiddenSize*numClasses].reshape(numClasses, hiddenSize) 168 | stack = params2stack(thetaParam[hiddenSize*numClasses:], netConfig) 169 | 170 | activation = data 171 | for layer in stack: 172 | activation = sparse_autoencoder.sigmoid(activation.dot(layer.W.T) + layer.b) 173 | 174 | h_data = exp(softmaxTheta.dot(activation.T)) 175 | h_data = h_data / sum(h_data, 0) 176 | return argmax(h_data, axis=0) 177 | 178 | 179 | if __name__ == "__main__": 180 | inputSize = 4 181 | hiddenSize = 5 182 | lambdaParam = 0.01 183 | data = random.normal(size=(5, inputSize)) 184 | labels = array([0, 1, 0, 1, 0]) 185 | numClasses = 2 186 | 187 | stack = [Layer(1), Layer(2)] 188 | stack[0].W = 0.1 * random.normal(size=(3, inputSize)) 189 | stack[0].b = zeros(3) 190 | stack[1].W = 0.1 * random.normal(size=(hiddenSize, 3)) 191 | stack[1].b = zeros(hiddenSize) 192 | 193 | softmaxTheta = 0.005 * random.normal(size=hiddenSize * numClasses) 194 | 195 | (stackParams, netConfig) = stack2params(stack) 196 | stackedAETheta = concatenate([softmaxTheta, stackParams]) 197 | 198 | def stackedAutoencoderCostCallback(x): 199 | return cost(x, inputSize, hiddenSize, numClasses, netConfig, 200 | lambdaParam, data, labels) 201 | 202 | (cost_value, grad) = stackedAutoencoderCostCallback(stackedAETheta) 203 | 204 | numgrad = computeNumericalGradient(stackedAutoencoderCostCallback, stackedAETheta) 205 | 206 | diff = linalg.norm(numgrad-grad)/linalg.norm(numgrad+grad) 207 | 208 | print('%s' % diff) 209 | print('Norm of the difference between numerical and analytical gradient (should be < 1e-9)\n\n') 210 | -------------------------------------------------------------------------------- /stacked_autoencoder_test.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | import os 3 | 4 | from numpy import * 5 | from scipy import optimize 6 | 7 | from MNIST_images import loadMNISTImages, loadMNISTLabels 8 | from visualize_network import visualizeNetwork 9 | import sparse_autoencoder 10 | import softmax 11 | import stacked_autoencoder 12 | 13 | 14 | results_dir = 'results/dae1/' 15 | 16 | inputSize = 28 * 28 # MNIST inputs are 28x28 17 | numClasses = 10 # MNIST dataset consists of 10 digits 18 | hiddenSizeL1 = 200 # Layer 1 Hidden Size 19 | hiddenSizeL2 = 200 # Layer 2 Hidden Size 20 | sparsityParam = 0.1 # desired average activation of the hidden units. 
21 | lambdaParam = 3e-3 # weight decay parameter 22 | betaParam = 3 # weight of sparsity penalty term 23 | corruptionLevel = 0.1 # how much of the input will get corrupted (denoising autoencoder) 24 | 25 | if not os.path.exists(results_dir): 26 | os.makedirs(results_dir) 27 | 28 | trainData = loadMNISTImages('mnist/train-images-idx3-ubyte') 29 | trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte') 30 | 31 | # Train the first sparse autoencoder 32 | options = { 33 | 'maxiter': 400, 34 | 'disp': True, 35 | } 36 | 37 | sae1OptThetaFilename = results_dir + 'sae1OptTheta.npy' 38 | 39 | if os.path.exists(sae1OptThetaFilename): 40 | sae1OptTheta = load(sae1OptThetaFilename) 41 | else: 42 | def sparseAutoencoderCostCallbackL1(x): 43 | return sparse_autoencoder.cost(x, inputSize, hiddenSizeL1, lambdaParam, sparsityParam, 44 | betaParam, trainData, corruptionLevel) 45 | 46 | sae1Theta = sparse_autoencoder.initializeParameters(hiddenSizeL1, inputSize) 47 | result = optimize.minimize(sparseAutoencoderCostCallbackL1, sae1Theta, method='L-BFGS-B', jac=True, options=options) 48 | 49 | sae1OptTheta = result.x 50 | save(sae1OptThetaFilename, sae1OptTheta) 51 | 52 | W1 = sae1OptTheta[0:hiddenSizeL1*inputSize].reshape(hiddenSizeL1, inputSize) 53 | visualizeNetwork(W1.T, results_dir + 'sae1.png') 54 | 55 | # Train the second sparse autoencoder 56 | sae1Features = sparse_autoencoder.feedForward(sae1OptTheta, hiddenSizeL1, inputSize, trainData) 57 | 58 | sae2OptThetaFilename = results_dir + 'sae2OptTheta.npy' 59 | 60 | if os.path.exists(sae2OptThetaFilename): 61 | sae2OptTheta = load(sae2OptThetaFilename) 62 | else: 63 | def sparseAutoencoderCostCallbackL2(x): 64 | return sparse_autoencoder.cost(x, hiddenSizeL1, hiddenSizeL2, lambdaParam, sparsityParam, 65 | betaParam, sae1Features, corruptionLevel) 66 | 67 | sae2Theta = sparse_autoencoder.initializeParameters(hiddenSizeL2, hiddenSizeL1) 68 | result = optimize.minimize(sparseAutoencoderCostCallbackL2, sae2Theta, method='L-BFGS-B', jac=True, options=options) 69 | 70 | sae2OptTheta = result.x 71 | save(sae2OptThetaFilename, sae2OptTheta) 72 | 73 | # Train the softmax classifier 74 | saeSoftmaxOptThetaFilename = results_dir + 'saeSoftmaxOptTheta.npy' 75 | 76 | if os.path.exists(saeSoftmaxOptThetaFilename): 77 | saeSoftmaxOptTheta = load(saeSoftmaxOptThetaFilename) 78 | else: 79 | sae2Features = sparse_autoencoder.feedForward(sae2OptTheta, hiddenSizeL2, hiddenSizeL1, sae1Features) 80 | 81 | softmax_lambda = 1e-4 82 | 83 | def softmaxCostCallback(x): 84 | return softmax.cost(x, numClasses, hiddenSizeL2, softmax_lambda, sae2Features, trainLabels) 85 | 86 | # Randomly initialise theta 87 | thetaParam = 0.005 * random.normal(size=numClasses * hiddenSizeL2) 88 | 89 | softmax_options = { 90 | 'maxiter': 100, 91 | 'disp': False, 92 | } 93 | 94 | result = optimize.minimize(softmaxCostCallback, thetaParam, method='L-BFGS-B', jac=True, options=softmax_options) 95 | 96 | saeSoftmaxOptTheta = result.x[0:numClasses*hiddenSizeL2] 97 | 98 | save(saeSoftmaxOptThetaFilename, saeSoftmaxOptTheta) 99 | 100 | # Finetune softmax model 101 | 102 | stack = [stacked_autoencoder.Layer(1), stacked_autoencoder.Layer(2)] 103 | stack[0].W = sae1OptTheta[0:hiddenSizeL1*inputSize].reshape(hiddenSizeL1, inputSize) 104 | stack[0].b = sae1OptTheta[2*hiddenSizeL1*inputSize:2*hiddenSizeL1*inputSize+hiddenSizeL1] 105 | stack[1].W = sae2OptTheta[0:hiddenSizeL2*hiddenSizeL1].reshape(hiddenSizeL2, hiddenSizeL1) 106 | stack[1].b = 
sae2OptTheta[2*hiddenSizeL2*hiddenSizeL1:2*hiddenSizeL2*hiddenSizeL1+hiddenSizeL2] 107 | 108 | (stackParams, netConfig) = stacked_autoencoder.stack2params(stack) 109 | stackedAETheta = concatenate([saeSoftmaxOptTheta, stackParams]) 110 | 111 | saeOptThetaFilename = results_dir + 'saeOptTheta.npy' 112 | 113 | if os.path.exists(saeOptThetaFilename): 114 | stackedAEOptTheta = load(saeOptThetaFilename) 115 | else: 116 | def stackedAutoencoderCostCallback(x): 117 | return stacked_autoencoder.cost(x, inputSize, hiddenSizeL2, numClasses, netConfig, 118 | lambdaParam, trainData, trainLabels, corruptionLevel) 119 | 120 | result = optimize.minimize(stackedAutoencoderCostCallback, stackedAETheta, method='L-BFGS-B', jac=True, options=options) 121 | 122 | stackedAEOptTheta = result.x 123 | save(saeOptThetaFilename, stackedAEOptTheta) 124 | 125 | # Test 126 | 127 | testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte') 128 | testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte') 129 | 130 | pred_no_fine_tuning = stacked_autoencoder.predict(stackedAETheta, inputSize, hiddenSizeL2, numClasses, netConfig, testData) 131 | 132 | acc_no_fine_tuning = mean(testLabels==pred_no_fine_tuning) 133 | print('Before Fine-tuning Test Accuracy: %0.3f%%\n' % (acc_no_fine_tuning * 100)) 134 | 135 | pred_fine_tuned = stacked_autoencoder.predict(stackedAEOptTheta, inputSize, hiddenSizeL2, numClasses, netConfig, testData) 136 | 137 | acc_fine_tuned = mean(testLabels==pred_fine_tuned) 138 | print('After Fine-tuning Test Accuracy: %0.3f%%\n' % (acc_fine_tuned * 100)) -------------------------------------------------------------------------------- /visualize_network.py: -------------------------------------------------------------------------------- 1 | # Based on CS294A/CS294W Programming Assignment Starter Code 2 | from numpy import * 3 | import scipy.misc 4 | 5 | def visualizeNetwork(A, filename): 6 | """This function visualizes filters in matrix A""" 7 | 8 | # rescale 9 | A = A - mean(A) 10 | 11 | # compute rows, cols 12 | L = A.shape[0] 13 | M = A.shape[1] 14 | sz = int(sqrt(L)) 15 | buf = 1 16 | if floor(sqrt(M))**2 != M: 17 | n = ceil(sqrt(M)) 18 | while ((M % n) !=0) and (n < 1.2*sqrt(M)): 19 | n += 1 20 | m = ceil(M/n) 21 | else: 22 | n=sqrt(M); 23 | m=n; 24 | 25 | array = -ones((buf+m*(sz+buf), buf+n*(sz+buf))) 26 | k = 0 27 | for i in range(int(m)): 28 | for j in range(int(n)): 29 | if k >= M: 30 | continue 31 | 32 | clim = max(abs(A[:,k])) 33 | array[buf+i*(sz+buf):buf+i*(sz+buf)+sz, buf+j*(sz+buf):buf+j*(sz+buf)+sz] = A[:,k].reshape(sz,sz)/clim 34 | k += 1 35 | 36 | scipy.misc.toimage(array).save(filename) 37 | 38 | return 39 | --------------------------------------------------------------------------------