├── helpers.py
├── .gitignore
├── PCA.ipynb
└── Autoencoder.ipynb


/helpers.py:
--------------------------------------------------------------------------------
import numpy as np


def batch_iter(data, batch_size, num_epochs, seed=None, fill=False):
    """
    Generates a batch iterator for a dataset.
    """
    # Use a dedicated RandomState so shuffling is reproducible per seed.
    # (The original imported the stdlib `random` module and then shadowed it
    # with this RandomState; the unused import is dropped and the local is
    # renamed to `rng` to avoid the collision.)
    rng = np.random.RandomState(seed)
    data = np.array(data)
    data_length = len(data)
    num_batches_per_epoch = data_length // batch_size
    if data_length % batch_size != 0:
        num_batches_per_epoch += 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = rng.permutation(np.arange(data_length))
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_length)
            selected_indices = shuffle_indices[start_index:end_index]
            # If we don't have enough data left for a whole batch, fill it
            # with randomly sampled indices
            if fill and end_index >= data_length:
                num_missing = batch_size - len(selected_indices)
                selected_indices = np.concatenate(
                    [selected_indices, rng.randint(0, data_length, num_missing)])
            yield data[selected_indices]
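
if __name__ == "__main__":
    # A minimal usage sketch (not part of the original file): the toy array
    # and parameter values below are illustrative assumptions only. Guarded
    # by __main__ so `from helpers import batch_iter` stays side-effect free.
    toy_data = np.arange(10)
    # 10 items with batch_size=4 give 3 batches per epoch; the last batch
    # holds just 2 items, so fill=True pads it with randomly drawn indices.
    for batch in batch_iter(toy_data, batch_size=4, num_epochs=1, seed=0, fill=True):
        print(batch)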
--------------------------------------------------------------------------------


/.gitignore:
--------------------------------------------------------------------------------
data/


# Created by https://www.gitignore.io/api/python,ipythonnotebook

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log

# Sphinx documentation
docs/_build/

# PyBuilder
target/

#Ipython Notebook
.ipynb_checkpoints


### IPythonNotebook ###
# Temporary data
.ipynb_checkpoints/
--------------------------------------------------------------------------------


/PCA.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "import sklearn.preprocessing\n",
    "import sklearn.decomposition\n",
    "from sklearn.linear_model import LogisticRegressionCV\n",
    "from sklearn.datasets import fetch_mldata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "DATA_HOME = \"./data\"\n",
    "np.random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/dennybritz/projects/venvs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n",
      "  warnings.warn(msg, DataConversionWarning)\n"
     ]
    }
   ],
   "source": [
    "# Load MNIST Data\n",
    "mnist = fetch_mldata('MNIST original', data_home=DATA_HOME)\n",
    "data_x = sklearn.preprocessing.scale(mnist.data)\n",
    "x_train, x_test, y_train, y_test = train_test_split(data_x, mnist.target, test_size=0.1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "PCA_COMPONENTS = 32\n",
    "pca = sklearn.decomposition.PCA(n_components=PCA_COMPONENTS)\n",
    "pca.fit(x_train)\n",
    "x_train_transformed = pca.transform(x_train)\n",
    "x_test_transformed = pca.transform(x_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
       "           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
       "           weights='uniform')"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf = KNeighborsClassifier()\n",
    "clf.fit(x_train_transformed, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "        0.0       0.97      0.98      0.97       671\n",
      "        1.0       0.98      0.99      0.99       800\n",
      "        2.0       0.96      0.97      0.96       697\n",
      "        3.0       0.93      0.95      0.94       719\n",
      "        4.0       0.97      0.96      0.97       653\n",
      "        5.0       0.96      0.93      0.94       662\n",
      "        6.0       0.98      0.99      0.98       712\n",
      "        7.0       0.96      0.95      0.95       739\n",
      "        8.0       0.94      0.93      0.93       686\n",
      "        9.0       0.92      0.92      0.92       661\n",
      "\n",
      "avg / total       0.96      0.96      0.96      7000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "y_pred = clf.predict(x_test_transformed)\n",
    "print(classification_report(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
--------------------------------------------------------------------------------
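Note: both notebooks import `train_test_split` from `sklearn.cross_validation` and load MNIST with `fetch_mldata`, which matched scikit-learn at the time this repo was written but have since been removed from the library. Under a modern scikit-learn, a rough equivalent of the data-loading cell is sketched below; `fetch_openml` with the OpenML `mnist_784` dataset stands in for `fetch_mldata('MNIST original')`. This is an approximation for current library versions, not part of the original notebooks.

    import numpy as np
    import sklearn.preprocessing
    from sklearn.datasets import fetch_openml             # replaces fetch_mldata
    from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation

    # mnist_784 holds the same 70,000 x 784 digit images as 'MNIST original'
    mnist = fetch_openml("mnist_784", version=1, as_frame=False, data_home="./data")
    data_x = sklearn.preprocessing.scale(mnist.data)
    # fetch_openml returns string labels; cast them to get numeric targets
    y = mnist.target.astype(np.float64)
    x_train, x_test, y_train, y_test = train_test_split(
        data_x, y, test_size=0.1, random_state=42)

As a side note, `pca.explained_variance_ratio_.sum()` reports how much of the variance the 32 retained components capture, which is handy when tuning PCA_COMPONENTS.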
106 | "text": [ 107 | " precision recall f1-score support\n", 108 | "\n", 109 | " 0.0 0.97 0.98 0.97 671\n", 110 | " 1.0 0.98 0.99 0.99 800\n", 111 | " 2.0 0.96 0.97 0.96 697\n", 112 | " 3.0 0.93 0.95 0.94 719\n", 113 | " 4.0 0.97 0.96 0.97 653\n", 114 | " 5.0 0.96 0.93 0.94 662\n", 115 | " 6.0 0.98 0.99 0.98 712\n", 116 | " 7.0 0.96 0.95 0.95 739\n", 117 | " 8.0 0.94 0.93 0.93 686\n", 118 | " 9.0 0.92 0.92 0.92 661\n", 119 | "\n", 120 | "avg / total 0.96 0.96 0.96 7000\n", 121 | "\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "y_pred = clf.predict(x_test_transformed)\n", 127 | "print(classification_report(y_test, y_pred))" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [] 138 | } 139 | ], 140 | "metadata": { 141 | "kernelspec": { 142 | "display_name": "Python 3", 143 | "language": "python", 144 | "name": "python3" 145 | }, 146 | "language_info": { 147 | "codemirror_mode": { 148 | "name": "ipython", 149 | "version": 3 150 | }, 151 | "file_extension": ".py", 152 | "mimetype": "text/x-python", 153 | "name": "python", 154 | "nbconvert_exporter": "python", 155 | "pygments_lexer": "ipython3", 156 | "version": "3.5.0" 157 | } 158 | }, 159 | "nbformat": 4, 160 | "nbformat_minor": 0 161 | } 162 | -------------------------------------------------------------------------------- /Autoencoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 77, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import tensorflow as tf\n", 13 | "from sklearn.cross_validation import train_test_split\n", 14 | "from sklearn.metrics import accuracy_score, classification_report\n", 15 | "from sklearn.neighbors import KNeighborsClassifier\n", 16 | "import sklearn.preprocessing\n", 17 | "import sklearn.decomposition\n", 18 | "from sklearn.linear_model import LogisticRegressionCV\n", 19 | "from sklearn.datasets import fetch_mldata\n", 20 | "from helpers import batch_iter" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 78, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "DATA_HOME = \"./data\"\n", 32 | "np.random.seed(42)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 79, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stderr", 44 | "output_type": "stream", 45 | "text": [ 46 | "/Users/dennybritz/projects/venvs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n", 47 | " warnings.warn(msg, DataConversionWarning)\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "# Load MNIST Data\n", 53 | "mnist = fetch_mldata('MNIST original', data_home=DATA_HOME)\n", 54 | "data_x = sklearn.preprocessing.scale(mnist.data)\n", 55 | "x_train, x_test, y_train, y_test = train_test_split(data_x, mnist.target, test_size=0.1, random_state=42)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "class Autoencoder:\n", 67 | " \n", 68 | " def build_layer(self, output_dim, prev_layer, activation_func=tf.tanh):\n", 69 | " \"\"\"\n", 70 | " Builds a single hidden layer.\n", 71 | " 
\"\"\"\n", 72 | " input_dim = prev_layer.get_shape().as_list()[1]\n", 73 | " W_init = tf.random_uniform([input_dim, output_dim], -1.0/np.sqrt(input_dim), 1.0/np.sqrt(input_dim))\n", 74 | " W = tf.Variable(W_init, name=\"W\")\n", 75 | " b = tf.Variable(tf.zeros([output_dim]), name=\"b\")\n", 76 | " return activation_func(tf.nn.xw_plus_b(prev_layer, W, b))\n", 77 | " \n", 78 | " def __init__(self, x, hidden_dims=[32]): \n", 79 | " # Keeps track of the hidden layers so we can refer to them later\n", 80 | " self.hidden_layers = []\n", 81 | " prev_layer = x\n", 82 | " \n", 83 | " # For each dimension, build a hidden layer\n", 84 | " for i, layer in enumerate(hidden_dims):\n", 85 | " with tf.variable_scope(\"hidden-{}\".format(i)):\n", 86 | " prev_layer = self.build_layer(hidden_dims[i], prev_layer)\n", 87 | " self.hidden_layers.append(prev_layer)\n", 88 | " \n", 89 | " # Build output (reconstruction) layer\n", 90 | " with tf.variable_scope(\"output\"):\n", 91 | " output_dim = x.get_shape().as_list()[1]\n", 92 | " self.output = self.build_layer(output_dim, prev_layer)\n", 93 | " \n", 94 | " # Squared loss function\n", 95 | " self.total_loss = tf.reduce_sum(tf.square(x - self.output))\n", 96 | " self.avg_loss = tf.reduce_mean(tf.square(x - self.output))" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "BATCH_SIZE = 32\n", 108 | "NUM_EPOCHS = 30\n", 109 | "PRINT_LOSS_EVERY=2000\n", 110 | "LAYERS = [32, 32]\n", 111 | "\n", 112 | "graph = tf.Graph()\n", 113 | "sess = tf.Session(graph=graph)\n", 114 | "\n", 115 | "with graph.as_default(), sess.as_default():\n", 116 | " x = tf.placeholder(tf.float32, [None, x_train.shape[1]])\n", 117 | " ae = Autoencoder(x, LAYERS)\n", 118 | " \n", 119 | " # Optimization\n", 120 | " global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n", 121 | " optimizer = tf.train.AdamOptimizer(1e-4)\n", 122 | " train_op = optimizer.minimize(ae.total_loss, global_step=global_step)\n", 123 | " \n", 124 | " # Initialize variables\n", 125 | " sess.run(tf.initialize_all_variables())\n", 126 | " \n", 127 | " batches = batch_iter(x_train, BATCH_SIZE, NUM_EPOCHS)\n", 128 | " # For each batch...\n", 129 | " for x_batch in batches:\n", 130 | " feed_dict = { x: x_batch }\n", 131 | " _, loss, step = sess.run([train_op, ae.avg_loss, global_step], feed_dict)\n", 132 | " if step % PRINT_LOSS_EVERY == 0:\n", 133 | " total_loss = sess.run(ae.avg_loss, { x: x_train })\n", 134 | " print(\"{}: Mean Loss: {:g}\".format(step ,total_loss))\n", 135 | " print(\"{}: Final Mean Loss: {:g}\".format(step ,total_loss))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "# Get the compressed representation of the input X\n", 147 | "with graph.as_default(), sess.as_default():\n", 148 | " x_train_transformed = sess.run(ae.hidden_layers[1], { x: x_train })\n", 149 | " x_test_transformed = sess.run(ae.hidden_layers[1], { x: x_test })" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "clf = KNeighborsClassifier()\n", 161 | "clf.fit(x_train_transformed, y_train)\n", 162 | "y_pred = clf.predict(x_test_transformed)\n", 163 | "print(classification_report(y_test, y_pred, digits=3))" 164 | ] 165 | }, 166 | { 167 | 
"cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.5.0" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 0 197 | } 198 | --------------------------------------------------------------------------------