├── .gitignore
├── README.md
├── conv_keras.py
├── conv_tensorflow.py
├── conv_theano.py
├── mlp_keras.py
├── mlp_numpy.py
├── mlp_numpy_relu.py
├── mlp_tensorflow.py
├── mlp_theano.py
├── mlp_torch.lua
├── mnist.py
└── mnist_torch.lua

/.gitignore:
--------------------------------------------------------------------------------
*.pyc

# MNIST Dataset from Yann LeCun's website
train-images-idx3-ubyte.gz
train-labels-idx1-ubyte.gz
t10k-images-idx3-ubyte.gz
t10k-labels-idx1-ubyte.gz

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Simple Neural Networks

Introductory examples in deep learning are often too verbose - sometimes up to
300 lines - which makes it hard to see what is actually going on. This repo is
an attempt to fix that: the longest example is 39 lines (31 LOC). It contains
the same neural network implemented with 5 different libraries -
[Numpy](http://www.numpy.org/), [Theano](http://www.deeplearning.net/software/theano/),
[TensorFlow](https://www.tensorflow.org/), [Keras](http://keras.io/) and
[Torch](http://torch.ch/). The network is a simple multilayer perceptron with a
hidden layer of 100 neurons and an output layer of 10 neurons, trained on the
[MNIST](http://yann.lecun.com/exdb/mnist/) database of handwritten digits. It
reaches an accuracy of 97.8%.

```shell
$ python mlp_numpy.py
$ python mlp_theano.py
$ python mlp_tensorflow.py
$ python mlp_keras.py
$ th mlp_torch.lua
```

A more detailed explanation of the Numpy implementation can be found in my blog
post "[Hacking MNIST in 30 Lines of Python](http://jrusev.github.io/post/hacking-mnist/)".

## MNIST dataset

The MNIST dataset consists of images of handwritten digits and is divided into
a training set of 60,000 examples and a test set of 10,000 examples. We use a
small [script](./mnist.py) to download the MNIST data and load it into memory.
By default it reserves 10,000 examples from the official training set for
validation, so all neural nets train on 50,000 examples.
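
For example, calling the loader with its default arguments returns arrays with
the following shapes (the snippet below is an illustration for this README, not
a file in the repo):

```python
import mnist

trX, trY, teX, teY = mnist.load_data(one_hot=True)
print(trX.shape)  # (50000, 784) - training images, flattened to 784 pixels
print(trY.shape)  # (50000, 10)  - one-hot training labels
print(teX.shape)  # (10000, 784) - test images
print(teY.shape)  # (10000, 10)  - one-hot test labels
```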

## Convolutional Neural Network

For completeness, I also included a conv net trained on MNIST (implemented with
Theano, TensorFlow and Keras). The last two layers are the same as in the MLP,
but now there is a convolutional layer in front (8 kernels of size 5x5, with 2x2
max pooling). This improves the accuracy to 98.7%. Run with:

```shell
$ THEANO_FLAGS='floatX=float32' python conv_theano.py
$ python conv_tensorflow.py
$ python conv_keras.py
```

You can reach 99.0% accuracy (99.1% using Keras) with the following architecture:

```
conv8(5x5) -> conv16(5x5) -> pool2 -> fc100 -> softmax10
```
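
That deeper model is not included as a script in this repo. Below is a minimal
sketch in the same Keras 1.x style as [conv_keras.py](./conv_keras.py); the
activations and training settings are simply carried over from that script, so
treat it as an untested starting point rather than a verified recipe for the
99.1% result:

```python
import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD

trX, trY, teX, teY = mnist.load_data(one_hot=True, reshape=(-1, 1, 28, 28))

# conv8(5x5) -> conv16(5x5) -> pool2 -> fc100 -> softmax10
model = Sequential()
model.add(Convolution2D(8, 5, 5, input_shape=trX.shape[1:], activation='sigmoid'))
model.add(Convolution2D(16, 5, 5, activation='sigmoid'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(10, activation='softmax'))

num_epochs, batch_size, learn_rate = 30, 10, 0.2

model.compile(SGD(learn_rate), 'categorical_crossentropy', metrics=['accuracy'])
model.fit(trX, trY, batch_size, num_epochs, verbose=1, validation_data=(teX, teY))
```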

--------------------------------------------------------------------------------
/conv_keras.py:
--------------------------------------------------------------------------------
import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.optimizers import SGD

trX, trY, teX, teY = mnist.load_data(one_hot=True, reshape=(-1, 1, 28, 28))

model = Sequential()
model.add(Convolution2D(8, 5, 5, input_shape=trX.shape[1:], activation='sigmoid'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(100, activation='sigmoid'))
model.add(Dense(10, activation='softmax'))

num_epochs, batch_size, learn_rate = 30, 10, 0.2

model.compile(SGD(learn_rate), 'categorical_crossentropy', metrics=['accuracy'])
model.fit(trX, trY, batch_size, num_epochs, verbose=1, validation_data=(teX, teY))

--------------------------------------------------------------------------------
/conv_tensorflow.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import mnist
from math import sqrt

def init_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=1 / sqrt(shape[0])))

def feed_forward(X, w1, w2, w3):
    l1_conv = tf.nn.sigmoid(tf.nn.conv2d(X, w1, strides=[1, 1, 1, 1], padding='VALID'))
    l1_pool = tf.nn.max_pool(l1_conv, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    l1_pool_flat = tf.reshape(l1_pool, [-1, w2.get_shape().as_list()[0]])
    l2 = tf.nn.sigmoid(tf.matmul(l1_pool_flat, w2))
    return tf.matmul(l2, w3)

trX, trY, teX, teY = mnist.load_data(one_hot=True, reshape=(-1, 28, 28, 1))

# conv8(5x5) -> pool(2x2) -> dense100 -> softmax10
w1 = init_weights([5, 5, 1, 8])
w2 = init_weights([8*12*12, 100])  # 'VALID' 5x5 conv: 28 -> 24, 2x2 pooling: 24 -> 12
w3 = init_weights([100, 10])

X = tf.placeholder(tf.float32, [None, 28, 28, 1])
Y = tf.placeholder(tf.float32, [None, 10])

y_ = feed_forward(X, w1, w2, w3)

num_epochs, batch_size, learn_rate = 30, 10, 0.2
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_, Y))
train = tf.train.GradientDescentOptimizer(learn_rate).minimize(cost)
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_, 1), tf.argmax(Y, 1)), tf.float32))

sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in range(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        batch_x, batch_y = trX[j:j+batch_size], trY[j:j+batch_size]
        sess.run(train, feed_dict={X: batch_x, Y: batch_y})
    print i, sess.run(accuracy, feed_dict={X: teX, Y: teY})

--------------------------------------------------------------------------------
/conv_theano.py:
--------------------------------------------------------------------------------
import theano
import theano.tensor as T
import numpy as np
import mnist
from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.pool import pool_2d

def init_weights(shape):
    weights = np.random.randn(*shape) / np.sqrt(shape[0])
    return theano.shared(np.asarray(weights, dtype=theano.config.floatX))

def feed_forward(X, weights, pool_size=(2, 2)):
    l1_conv = T.nnet.sigmoid(conv2d(X, weights[0]))
    l1_pool = pool_2d(l1_conv, pool_size, ignore_border=True)
    l2 = T.nnet.sigmoid(T.dot(l1_pool.flatten(ndim=2), weights[1]))
    return T.nnet.softmax(T.dot(l2, weights[2]))

trX, trY, teX, teY = mnist.load_data(one_hot=True, reshape=(-1, 1, 28, 28))

# conv8(5x5) -> pool(2x2) -> dense100 -> softmax10
shapes = (8, 1, 5, 5), (8*12*12, 100), (100, 10)
weights = [init_weights(shape) for shape in shapes]
X, Y = T.ftensor4(), T.fmatrix()
y_ = feed_forward(X, weights)

num_epochs, batch_size, learn_rate = 30, 10, 0.2

grads = T.grad(cost=T.nnet.categorical_crossentropy(y_, Y).mean(), wrt=weights)
train = theano.function(
    inputs=[X, Y],
    updates=[[w, w - g * learn_rate] for w, g in zip(weights, grads)],
    allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=T.argmax(y_, axis=1))

for i in range(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        train(trX[j:j+batch_size], trY[j:j+batch_size])
    print i, np.mean(predict(teX) == np.argmax(teY, axis=1))

--------------------------------------------------------------------------------
/mlp_keras.py:
--------------------------------------------------------------------------------
import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD

trX, trY, teX, teY = mnist.load_data(one_hot=True)

net = Sequential([
    Dense(100, input_dim=28*28, activation='sigmoid'),
    Dense(10, activation='softmax')
])

num_epochs, batch_size, learn_rate = 30, 10, 0.2

net.compile(SGD(learn_rate), 'categorical_crossentropy', metrics=['accuracy'])
net.fit(trX, trY, batch_size, num_epochs, verbose=1, validation_data=(teX, teY))

--------------------------------------------------------------------------------
/mlp_numpy.py:
--------------------------------------------------------------------------------
import numpy as np
import mnist

def feed_forward(X, weights):
    a = [X]
    for w in weights:
        a.append(sigmoid(a[-1].dot(w)))
    return a

def grads(X, Y, weights):
    grads = np.empty_like(weights)
    a = feed_forward(X, weights)
    delta = a[-1] - Y  # cross-entropy
    grads[-1] = np.dot(a[-2].T, delta)
    for i in xrange(len(a)-2, 0, -1):
        delta = np.dot(delta, weights[i].T) * d_sigmoid(a[i])
        grads[i-1] = np.dot(a[i-1].T, delta)
    return grads / len(X)

sigmoid = lambda x: 1 / (1 + np.exp(-x))
d_sigmoid = lambda y: y * (1 - y)

trX, trY, teX, teY = mnist.load_data(one_hot=True)

weights = [
    np.random.randn(28*28, 100) / np.sqrt(28*28),
    np.random.randn(100, 10) / np.sqrt(100)]
num_epochs, batch_size, learn_rate = 30, 10, 0.2

for i in xrange(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        X, Y = trX[j:j+batch_size], trY[j:j+batch_size]
        weights -= learn_rate * grads(X, Y, weights)
    prediction = np.argmax(feed_forward(teX, weights)[-1], axis=1)
    print i, np.mean(prediction == np.argmax(teY, axis=1))
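
The `delta = a[-1] - Y` line above is the derivative of the cross-entropy loss
with respect to the output layer's pre-activations (the sigmoid derivative and
the cross-entropy derivative cancel), so `grads()` returns the exact gradient of
the batch-averaged cross-entropy summed over the 10 outputs. The snippet below
is an added sanity check, not a file in the repo; it assumes `feed_forward()`
and `grads()` from mlp_numpy.py are already defined in the session:

```python
import numpy as np

# Loss implicitly minimized by mlp_numpy.py: cross-entropy between the sigmoid
# outputs and the one-hot targets, summed over outputs, averaged over the batch.
def cross_entropy(X, Y, weights):
    out = feed_forward(X, weights)[-1]
    return -np.mean(np.sum(Y * np.log(out) + (1 - Y) * np.log(1 - out), axis=1))

# Compare grads() against a central finite-difference estimate for one weight.
def check_one_weight(X, Y, weights, layer=1, i=0, j=0, eps=1e-5):
    analytic = grads(X, Y, weights)[layer][i, j]
    weights[layer][i, j] += eps
    loss_plus = cross_entropy(X, Y, weights)
    weights[layer][i, j] -= 2 * eps
    loss_minus = cross_entropy(X, Y, weights)
    weights[layer][i, j] += eps  # restore the original value
    numeric = (loss_plus - loss_minus) / (2 * eps)
    return abs(analytic - numeric)  # should be very close to zero
```

For example, `check_one_weight(trX[:50], trY[:50], weights)` run right after the
weights are initialized should return a value well below 1e-6.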

--------------------------------------------------------------------------------
/mlp_numpy_relu.py:
--------------------------------------------------------------------------------
import numpy as np
import mnist

def feed_forward(X, weights):
    a = [X]
    for w in weights:
        a.append(np.maximum(a[-1].dot(w), 0))
    return a

def grads(X, Y, weights):
    grads = np.empty_like(weights)
    a = feed_forward(X, weights)
    delta = a[-1] - Y
    grads[-1] = a[-2].T.dot(delta)
    for i in xrange(len(a)-2, 0, -1):
        delta = (a[i] > 0) * delta.dot(weights[i].T)
        grads[i-1] = a[i-1].T.dot(delta)
    return grads / len(X)

trX, trY, teX, teY = mnist.load_data()
weights = [np.random.randn(*w) * 0.1 for w in [(784, 100), (100, 10)]]
num_epochs, batch_size, learn_rate = 30, 20, 0.1

for i in xrange(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        X, Y = trX[j:j+batch_size], trY[j:j+batch_size]
        weights -= learn_rate * grads(X, Y, weights)
    prediction = np.argmax(feed_forward(teX, weights)[-1], axis=1)
    print i, np.mean(prediction == np.argmax(teY, axis=1))

--------------------------------------------------------------------------------
/mlp_tensorflow.py:
--------------------------------------------------------------------------------
import tensorflow as tf
import numpy as np
import mnist

def init_weights(shape):
    return tf.Variable(tf.random_normal(shape, stddev=0.01))

def feed_forward(X, w_h, w_o):
    h = tf.nn.sigmoid(tf.matmul(X, w_h))
    return tf.matmul(h, w_o)

trX, trY, teX, teY = mnist.load_data(one_hot=True)

w_h, w_o = init_weights([28*28, 100]), init_weights([100, 10])
num_epochs, batch_size, learn_rate = 30, 10, 0.2

X = tf.placeholder("float", [None, 28*28])
Y = tf.placeholder("float", [None, 10])
y_ = feed_forward(X, w_h, w_o)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(y_, Y))
train = tf.train.GradientDescentOptimizer(learn_rate).minimize(cost)

sess = tf.Session()
sess.run(tf.initialize_all_variables())
for i in range(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        batch_x, batch_y = trX[j:j+batch_size], trY[j:j+batch_size]
        sess.run(train, feed_dict={X: batch_x, Y: batch_y})
    prediction = sess.run(tf.argmax(y_, 1), feed_dict={X: teX, Y: teY})
    print i, np.mean(prediction == np.argmax(teY, axis=1))

--------------------------------------------------------------------------------
/mlp_theano.py:
--------------------------------------------------------------------------------
import theano
import theano.tensor as T
import numpy as np
import mnist

def init_weights(n_in, n_out):
    weights = np.random.randn(n_in, n_out) / np.sqrt(n_in)
    return theano.shared(np.asarray(weights, dtype=theano.config.floatX))

def feed_forward(X, w_h, w_o):
    h = T.nnet.sigmoid(T.dot(X, w_h))
    return T.nnet.softmax(T.dot(h, w_o))

trX, trY, teX, teY = mnist.load_data(one_hot=True)

w_h, w_o = init_weights(28*28, 100), init_weights(100, 10)
num_epochs, batch_size, learn_rate = 30, 10, 0.2

X, Y = T.fmatrices('X', 'Y')
y_ = feed_forward(X, w_h, w_o)

weights = [w_h, w_o]
grads = T.grad(cost=T.nnet.categorical_crossentropy(y_, Y).mean(), wrt=weights)
train = theano.function(
    inputs=[X, Y],
    updates=[[w, w - g * learn_rate] for w, g in zip(weights, grads)],
    allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=T.argmax(y_, axis=1))

for i in range(num_epochs):
    for j in xrange(0, len(trX), batch_size):
        train(trX[j:j+batch_size], trY[j:j+batch_size])
    print i, np.mean(predict(teX) == np.argmax(teY, axis=1))

--------------------------------------------------------------------------------
/mlp_torch.lua:
--------------------------------------------------------------------------------
require 'torch'
require 'nn'
require 'mnist_torch'

net = nn.Sequential()
  :add(nn.Linear(28*28, 100))
  :add(nn.Sigmoid())
  :add(nn.Linear(100, 10))

function train(X, Y)
  local criterion = nn.CrossEntropyCriterion()
  criterion:forward(net:forward(X), Y)
  net:zeroGradParameters()
  net:backward(X, criterion:backward(net.output, Y))
  net:updateParameters(learn_rate) -- weights:add(-learn_rate, grads)
end

function accuracy(teX, teY)
  local _, predicted = torch.max(net:forward(teX), 2)
  return predicted:eq(teY:long()):sum() / teX:size(1)
end

num_epochs = 30
batch_size = 10
learn_rate = 0.2

trX, trY, teX, teY = mnist.load_data()
for i = 1,num_epochs do
  for t = 1,trX:size(1),batch_size do
    train(trX:sub(t, t+batch_size-1), trY:sub(t, t+batch_size-1))
  end
  print(i .. ' ' .. accuracy(teX, teY))
end

--------------------------------------------------------------------------------
/mnist.py:
--------------------------------------------------------------------------------
import os
import gzip
import numpy as np

DATA_URL = 'http://yann.lecun.com/exdb/mnist/'

# Download and import the MNIST dataset from Yann LeCun's website.
# Reserve 10,000 examples from the training set for validation.
# Each image is an array of 784 (28x28) float values from 0 (white) to 1 (black).
def load_data(one_hot=True, reshape=None, validation_size=10000):
    x_tr = load_images('train-images-idx3-ubyte.gz')
    y_tr = load_labels('train-labels-idx1-ubyte.gz')
    x_te = load_images('t10k-images-idx3-ubyte.gz')
    y_te = load_labels('t10k-labels-idx1-ubyte.gz')

    x_tr = x_tr[:-validation_size]
    y_tr = y_tr[:-validation_size]

    if one_hot:
        y_tr, y_te = [to_one_hot(y) for y in (y_tr, y_te)]

    if reshape:
        x_tr, x_te = [x.reshape(*reshape) for x in (x_tr, x_te)]

    return x_tr, y_tr, x_te, y_te

def load_images(filename):
    maybe_download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=16)
    return data.reshape(-1, 28 * 28) / np.float32(256)

def load_labels(filename):
    maybe_download(filename)
    with gzip.open(filename, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=8)
    return data

# Download the file, unless it's already here.
def maybe_download(filename):
    if not os.path.exists(filename):
        from urllib import urlretrieve
        print("Downloading %s" % filename)
        urlretrieve(DATA_URL + filename, filename)

# Convert class labels from scalars to one-hot vectors.
def to_one_hot(labels, num_classes=10):
    return np.eye(num_classes)[labels]

--------------------------------------------------------------------------------
/mnist_torch.lua:
--------------------------------------------------------------------------------
local mnist_dataset = require 'mnist' -- https://github.com/andresy/mnist

mnist = {}

function mnist.load_data(flatten)
  if flatten == nil then flatten = true end -- flatten by default
  local train_set = mnist_dataset.traindataset()
  local test_set = mnist_dataset.testdataset()

  local trX = train_set.data[{{1,50000}}]:double()
  local trY = train_set.label[{{1,50000}}]:add(1)
  local teX = test_set.data:double()
  local teY = test_set.label:add(1)

  -- Convert each image from a 28x28 matrix to a vector of size 784
  if flatten then
    trX = trX:reshape(trX:size(1), trX:nElement() / trX:size(1))
    teX = teX:reshape(teX:size(1), teX:nElement() / teX:size(1))
  end

  -- Normalize the inputs to the range [0,1] (actually [0, 255/256]) for
  -- compatibility with http://deeplearning.net/data/mnist/mnist.pkl.gz
  trX = trX / 256
  teX = teX / 256

  return trX, trY, teX, teY
end

--------------------------------------------------------------------------------