├── helpers.py
├── .gitignore
├── PCA.ipynb
└── Autoencoder.ipynb


/helpers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | 
 4 | def batch_iter(data, batch_size, num_epochs, seed=None, fill=False):
 5 |     """
 6 |     Generates a batch iterator for a dataset.
 7 |     """
 8 |     random = np.random.RandomState(seed)
 9 |     data = np.array(data)
10 |     data_length = len(data)
11 |     num_batches_per_epoch = int(len(data)/batch_size)
12 |     if len(data) % batch_size != 0:
13 |         num_batches_per_epoch += 1
14 |     for epoch in range(num_epochs):
15 |         # Shuffle the data at each epoch
16 |         shuffle_indices = random.permutation(np.arange(data_length))
17 |         for batch_num in range(num_batches_per_epoch):
18 |             start_index = batch_num * batch_size
19 |             end_index = min((batch_num + 1) * batch_size, data_length)
20 |             selected_indices = shuffle_indices[start_index:end_index]
21 |             # If we don't have enough data left for a whole batch, fill it randomly
22 |             if fill is True and end_index >= data_length:
23 |                 num_missing = batch_size - len(selected_indices)
24 |                 selected_indices = np.concatenate([selected_indices, random.randint(0, data_length, num_missing)])
25 |             yield data[selected_indices]


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | data/
 2 | 
 3 | 
 4 | # Created by https://www.gitignore.io/api/python,ipythonnotebook
 5 | 
 6 | ### Python ###
 7 | # Byte-compiled / optimized / DLL files
 8 | __pycache__/
 9 | *.py[cod]
10 | *$py.class
11 | 
12 | # C extensions
13 | *.so
14 | 
15 | # Distribution / packaging
16 | .Python
17 | env/
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | 
33 | # PyInstaller
34 | #  Usually these files are written by a python script from a template
35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 | 
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 | 
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *,cover
52 | .hypothesis/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | 
61 | # Sphinx documentation
62 | docs/_build/
63 | 
64 | # PyBuilder
65 | target/
66 | 
67 | #Ipython Notebook
68 | .ipynb_checkpoints
69 | 
70 | 
71 | ### IPythonNotebook ###
72 | # Temporary data
73 | .ipynb_checkpoints/
74 | 


--------------------------------------------------------------------------------
/PCA.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "from sklearn.cross_validation import train_test_split\n",
 13 |     "from sklearn.metrics import accuracy_score, classification_report\n",
 14 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 15 |     "import sklearn.preprocessing\n",
 16 |     "import sklearn.decomposition\n",
 17 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 18 |     "from sklearn.datasets import fetch_mldata"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {
 25 |     "collapsed": false
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "DATA_HOME = \"./data\"\n",
 30 |     "np.random.seed(42)"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 3,
 36 |    "metadata": {
 37 |     "collapsed": false
 38 |    },
 39 |    "outputs": [
 40 |     {
 41 |      "name": "stderr",
 42 |      "output_type": "stream",
 43 |      "text": [
 44 |       "/Users/dennybritz/projects/venvs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n",
 45 |       "  warnings.warn(msg, DataConversionWarning)\n"
 46 |      ]
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "# Load MNIST Data\n",
 51 |     "mnist = fetch_mldata('MNIST original', data_home=DATA_HOME)\n",
 52 |     "data_x = sklearn.preprocessing.scale(mnist.data)\n",
 53 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, mnist.target, test_size=0.1, random_state=42)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 24,
 59 |    "metadata": {
 60 |     "collapsed": false
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "PCA_COMPONENTS=32\n",
 65 |     "pca = sklearn.decomposition.PCA(n_components=PCA_COMPONENTS)\n",
 66 |     "pca.fit(x_train)\n",
 67 |     "x_train_transformed = pca.transform(x_train)\n",
 68 |     "x_test_transformed = pca.transform(x_test)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 25,
 74 |    "metadata": {
 75 |     "collapsed": false
 76 |    },
 77 |    "outputs": [
 78 |     {
 79 |      "data": {
 80 |       "text/plain": [
 81 |        "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
 82 |        "           metric_params=None, n_jobs=1, n_neighbors=5, p=2,\n",
 83 |        "           weights='uniform')"
 84 |       ]
 85 |      },
 86 |      "execution_count": 25,
 87 |      "metadata": {},
 88 |      "output_type": "execute_result"
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "clf = KNeighborsClassifier()\n",
 93 |     "clf.fit(x_train_transformed, y_train)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": 26,
 99 |    "metadata": {
100 |     "collapsed": false
101 |    },
102 |    "outputs": [
103 |     {
104 |      "name": "stdout",
105 |      "output_type": "stream",
106 |      "text": [
107 |       "             precision    recall  f1-score   support\n",
108 |       "\n",
109 |       "        0.0       0.97      0.98      0.97       671\n",
110 |       "        1.0       0.98      0.99      0.99       800\n",
111 |       "        2.0       0.96      0.97      0.96       697\n",
112 |       "        3.0       0.93      0.95      0.94       719\n",
113 |       "        4.0       0.97      0.96      0.97       653\n",
114 |       "        5.0       0.96      0.93      0.94       662\n",
115 |       "        6.0       0.98      0.99      0.98       712\n",
116 |       "        7.0       0.96      0.95      0.95       739\n",
117 |       "        8.0       0.94      0.93      0.93       686\n",
118 |       "        9.0       0.92      0.92      0.92       661\n",
119 |       "\n",
120 |       "avg / total       0.96      0.96      0.96      7000\n",
121 |       "\n"
122 |      ]
123 |     }
124 |    ],
125 |    "source": [
126 |     "y_pred = clf.predict(x_test_transformed)\n",
127 |     "print(classification_report(y_test, y_pred))"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {
134 |     "collapsed": true
135 |    },
136 |    "outputs": [],
137 |    "source": []
138 |   }
139 |  ],
140 |  "metadata": {
141 |   "kernelspec": {
142 |    "display_name": "Python 3",
143 |    "language": "python",
144 |    "name": "python3"
145 |   },
146 |   "language_info": {
147 |    "codemirror_mode": {
148 |     "name": "ipython",
149 |     "version": 3
150 |    },
151 |    "file_extension": ".py",
152 |    "mimetype": "text/x-python",
153 |    "name": "python",
154 |    "nbconvert_exporter": "python",
155 |    "pygments_lexer": "ipython3",
156 |    "version": "3.5.0"
157 |   }
158 |  },
159 |  "nbformat": 4,
160 |  "nbformat_minor": 0
161 | }
162 | 


--------------------------------------------------------------------------------
/Autoencoder.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 77,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "import tensorflow as tf\n",
 13 |     "from sklearn.cross_validation import train_test_split\n",
 14 |     "from sklearn.metrics import accuracy_score, classification_report\n",
 15 |     "from sklearn.neighbors import KNeighborsClassifier\n",
 16 |     "import sklearn.preprocessing\n",
 17 |     "import sklearn.decomposition\n",
 18 |     "from sklearn.linear_model import LogisticRegressionCV\n",
 19 |     "from sklearn.datasets import fetch_mldata\n",
 20 |     "from helpers import batch_iter"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 78,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "DATA_HOME = \"./data\"\n",
 32 |     "np.random.seed(42)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 79,
 38 |    "metadata": {
 39 |     "collapsed": false
 40 |    },
 41 |    "outputs": [
 42 |     {
 43 |      "name": "stderr",
 44 |      "output_type": "stream",
 45 |      "text": [
 46 |       "/Users/dennybritz/projects/venvs/tensorflow/lib/python3.5/site-packages/sklearn/utils/validation.py:420: DataConversionWarning: Data with input dtype uint8 was converted to float64 by the scale function.\n",
 47 |       "  warnings.warn(msg, DataConversionWarning)\n"
 48 |      ]
 49 |     }
 50 |    ],
 51 |    "source": [
 52 |     "# Load MNIST Data\n",
 53 |     "mnist = fetch_mldata('MNIST original', data_home=DATA_HOME)\n",
 54 |     "data_x = sklearn.preprocessing.scale(mnist.data)\n",
 55 |     "x_train, x_test, y_train, y_test = train_test_split(data_x, mnist.target, test_size=0.1, random_state=42)"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": null,
 61 |    "metadata": {
 62 |     "collapsed": false
 63 |    },
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "class Autoencoder:\n",
 67 |     "    \n",
 68 |     "    def build_layer(self, output_dim, prev_layer, activation_func=tf.tanh):\n",
 69 |     "        \"\"\"\n",
 70 |     "        Builds a single hidden layer.\n",
 71 |     "        \"\"\"\n",
 72 |     "        input_dim = prev_layer.get_shape().as_list()[1]\n",
 73 |     "        W_init = tf.random_uniform([input_dim, output_dim], -1.0/np.sqrt(input_dim), 1.0/np.sqrt(input_dim))\n",
 74 |     "        W = tf.Variable(W_init, name=\"W\")\n",
 75 |     "        b = tf.Variable(tf.zeros([output_dim]), name=\"b\")\n",
 76 |     "        return activation_func(tf.nn.xw_plus_b(prev_layer, W, b))\n",
 77 |     "    \n",
 78 |     "    def __init__(self, x, hidden_dims=[32]):        \n",
 79 |     "        # Keeps track of the hidden layers so we can refer to them later\n",
 80 |     "        self.hidden_layers = []\n",
 81 |     "        prev_layer = x\n",
 82 |     "        \n",
 83 |     "        # For each dimension, build a hidden layer\n",
 84 |     "        for i, layer in enumerate(hidden_dims):\n",
 85 |     "            with tf.variable_scope(\"hidden-{}\".format(i)):\n",
 86 |     "                prev_layer = self.build_layer(hidden_dims[i], prev_layer)\n",
 87 |     "                self.hidden_layers.append(prev_layer)\n",
 88 |     "               \n",
 89 |     "        # Build output (reconstruction) layer\n",
 90 |     "        with tf.variable_scope(\"output\"):\n",
 91 |     "            output_dim = x.get_shape().as_list()[1]\n",
 92 |     "            self.output = self.build_layer(output_dim, prev_layer)\n",
 93 |     "        \n",
 94 |     "        # Squared loss function\n",
 95 |     "        self.total_loss = tf.reduce_sum(tf.square(x - self.output))\n",
 96 |     "        self.avg_loss = tf.reduce_mean(tf.square(x - self.output))"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {
103 |     "collapsed": false
104 |    },
105 |    "outputs": [],
106 |    "source": [
107 |     "BATCH_SIZE = 32\n",
108 |     "NUM_EPOCHS = 30\n",
109 |     "PRINT_LOSS_EVERY=2000\n",
110 |     "LAYERS = [32, 32]\n",
111 |     "\n",
112 |     "graph = tf.Graph()\n",
113 |     "sess = tf.Session(graph=graph)\n",
114 |     "\n",
115 |     "with graph.as_default(), sess.as_default():\n",
116 |     "    x = tf.placeholder(tf.float32, [None, x_train.shape[1]])\n",
117 |     "    ae = Autoencoder(x, LAYERS)\n",
118 |     "    \n",
119 |     "    # Optimization\n",
120 |     "    global_step = tf.Variable(0, name=\"global_step\", trainable=False)\n",
121 |     "    optimizer = tf.train.AdamOptimizer(1e-4)\n",
122 |     "    train_op = optimizer.minimize(ae.total_loss, global_step=global_step)\n",
123 |     "    \n",
124 |     "    # Initialize variables\n",
125 |     "    sess.run(tf.initialize_all_variables())\n",
126 |     "    \n",
127 |     "    batches = batch_iter(x_train, BATCH_SIZE, NUM_EPOCHS)\n",
128 |     "    # For each batch...\n",
129 |     "    for x_batch in batches:\n",
130 |     "        feed_dict = { x: x_batch }\n",
131 |     "        _, loss, step = sess.run([train_op, ae.avg_loss, global_step], feed_dict)\n",
132 |     "        if step % PRINT_LOSS_EVERY == 0:\n",
133 |     "            total_loss =  sess.run(ae.avg_loss, { x: x_train })\n",
134 |     "            print(\"{}: Mean Loss: {:g}\".format(step ,total_loss))\n",
135 |     "    print(\"{}: Final Mean Loss: {:g}\".format(step ,total_loss))"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "# Get the compressed representation of the input X\n",
147 |     "with graph.as_default(), sess.as_default():\n",
148 |     "    x_train_transformed = sess.run(ae.hidden_layers[1], { x: x_train })\n",
149 |     "    x_test_transformed = sess.run(ae.hidden_layers[1], { x: x_test })"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {
156 |     "collapsed": false
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "clf = KNeighborsClassifier()\n",
161 |     "clf.fit(x_train_transformed, y_train)\n",
162 |     "y_pred = clf.predict(x_test_transformed)\n",
163 |     "print(classification_report(y_test, y_pred, digits=3))"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {
170 |     "collapsed": false
171 |    },
172 |    "outputs": [],
173 |    "source": []
174 |   }
175 |  ],
176 |  "metadata": {
177 |   "kernelspec": {
178 |    "display_name": "Python 3",
179 |    "language": "python",
180 |    "name": "python3"
181 |   },
182 |   "language_info": {
183 |    "codemirror_mode": {
184 |     "name": "ipython",
185 |     "version": 3
186 |    },
187 |    "file_extension": ".py",
188 |    "mimetype": "text/x-python",
189 |    "name": "python",
190 |    "nbconvert_exporter": "python",
191 |    "pygments_lexer": "ipython3",
192 |    "version": "3.5.0"
193 |   }
194 |  },
195 |  "nbformat": 4,
196 |  "nbformat_minor": 0
197 | }
198 | 


--------------------------------------------------------------------------------