├── .gitignore
├── Adjusted Variable Importances with Randomized Trees.ipynb
├── Attention
│   └── Keras Attention.ipynb
├── Bootstrap.ipynb
├── Clustering Model Selection.ipynb
├── Data Preprocessing for the Learning to Rank example.ipynb
├── Distributed Aggregate and Join.ipynb
├── Distributed Learning of Extra Trees with IPython.parallel.ipynb
├── Explained variances.ipynb
├── Function Approximation.ipynb
├── GP overfitting.ipynb
├── Gradient.ipynb
├── Labeled Faces in the Wild recognition.ipynb
├── Learning to Rank.ipynb
├── MNIST8M Chunking and Upload to Cloud Blob Storage.ipynb
├── Non IID cross-validation.ipynb
├── Numa-aware computation experiments.ipynb
├── Numba Parakeet Cython.ipynb
├── Numpy intro.ipynb
├── Overfitting
│   └── linear-model-overfitting.ipynb
├── Parameter search for Extra Trees on the MNIST classificationt task.ipynb
├── Patch-Based Feature Extraction for Image Classification.ipynb
├── README.md
├── Reinforcement Learning
│   └── Random walk policy evaluation.ipynb
├── SGD stuff.ipynb
├── Saddle Point LBFGS.ipynb
├── Semi-supervised Extra Trees.ipynb
├── Text Classification.ipynb
├── Time Series.ipynb
├── Untitled Diagram.drawio
├── Variable Importance with Completely Randomized Trees.ipynb
├── cloudstorage.ini.example
├── dask
│   └── fold_learn.ipynb
├── environment.yml
├── fmri_vae
│   ├── fmri_autoencoder.ipynb
│   └── fmri_model.py
├── generalization
│   └── run_mnist.py
├── gmm
│   ├── GMM with PyTorch.ipynb
│   ├── GMM with SGD.ipynb
│   ├── Gaussian likelihood landscape.ipynb
│   ├── Model Selection for GMM.ipynb
│   └── gmmsgd.py
├── gradients
│   └── custom optim.ipynb
├── letor_cluster
│   ├── MSLR Grid Search.ipynb
│   ├── letor_gridpoint.py
│   ├── letor_gridresults.json
│   ├── letor_gridresults.py
│   └── letor_gridsearch.py
├── nmf_topics.ipynb
├── quantile_regression_as_classification.ipynb
├── representations
│   ├── Autoencoder ELMs.ipynb
│   ├── Entangled Manifolds.ipynb
│   ├── MNIST experiments.ipynb
│   ├── Sparse non-linear random projections.ipynb
│   └── Unsupervised feature extraction with the Breiman trick.ipynb
├── screenshots
│   ├── digits.png
│   └── topics.png
├── sklearn_demos
│   ├── Classifier calibration.ipynb
│   ├── Face recognition.ipynb
│   ├── Feature Importances.ipynb
│   ├── Gradient Boosting.ipynb
│   ├── Income classification - Column Transformer Edition.ipynb
│   ├── Income classification.ipynb
│   ├── Language Classification.ipynb
│   ├── Large Scale 2D Clustering-1M.ipynb
│   ├── Large Scale 2D Clustering.ipynb
│   ├── Permutation Importances.ipynb
│   ├── ames_gbrt_search_results.json
│   ├── ames_housing.ipynb
│   ├── fastText.ipynb
│   ├── gbdt_vs_neural_nets_on_tabular_data.ipynb
│   ├── iris.ipynb
│   ├── language
│   │   └── fetch_data.py
│   └── splines_overfitting.ipynb
├── structure_digits.ipynb
├── test.drawio
└── ubuntu-quickstart.sh

/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | *.f
 3 | *.so
 4 | nohup.out
 5 | *.ini
 6 | *.pkl
 7 | *.npy
 8 | joblib
 9 | *.nii
10 | *_pkl
11 | nilearn_data
12 | Untitled*.ipynb
13 | *.dat
14 | *.lprof
15 | *.html
16 | *.pyc
17 | adult.data
18 | *.txt
19 | sparse_chunks
20 | *.hdf5
--------------------------------------------------------------------------------
/Attention/Keras Attention.ipynb:
--------------------------------------------------------------------------------
1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 72, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import theano.tensor as tt\n", 12 | "from keras.layers.recurrent import GRU\n", 13 | "from keras.layers.core import Dense, 
MaskedLayer, Layer, Merge\n", 14 | "from keras.models import Graph\n", 15 | "from keras.utils.theano_utils import shared_zeros" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 46, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "class SoftSequentialAttentionLayer(MaskedLayer):\n", 27 | " \n", 28 | " def __init__(self, memmory_dim, driver_dim, inner_dim=128, init='glorot_uniform', inner_activation='relu'):\n", 29 | " super(SoftSequentialAttentionLayer, self).__init__()\n", 30 | " self.init = initializations.get(init)\n", 31 | " self.W_m = self.init((memory_dim, inner_dim))\n", 32 | " self.W_d = self.init((driver_dim, inner_dim))\n", 33 | " self.W_a = self.init((inner_dim, 1))\n", 34 | " self.inner_activation = activations.get(inner_activation)\n", 35 | " self.b_inner = shared_zeros(inner_dim)\n", 36 | " self.b_out = shared_zeros(1)\n", 37 | " \n", 38 | " def set_previous(self, *previous_layers):\n", 39 | " type_name = self.__class__.__name__\n", 40 | " if len(previous_layers) != 2:\n", 41 | " raise ValueError(\"{}.set_previous expects 2 input layers, got {}\".format(\n", 42 | " type_name, previous_layers))\n", 43 | " sequential_memory, attention_driver = previous_layers\n", 44 | " if not sequential_memory.return_sequences:\n", 45 | " raise ValueError(\"The first input of {} should be a recurrent layer with\"\n", 46 | " \" return_sequences=True\".format(type_name))\n", 47 | " self.sequential_memory = sequential_memory\n", 48 | " self.attention_driver = attention_driver\n", 49 | " \n", 50 | " def get_input(self, train=False):\n", 51 | " return [self.sequential_memory.get_output(train=train),\n", 52 | " self.attention_driver.get_output(train=train)]\n", 53 | " \n", 54 | " def get_output(self, train=False):\n", 55 | " sequential_memory, attention_driver = self.get_input(train=train)\n", 56 | " # sequential_memory shape: (nb_samples, time (padded with zeros), input_dim)\n", 57 | " # attentin_driver shape: (nb_samples, input_dim)\n", 58 | " # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension\n", 59 | " padded_mask = self.get_padded_shuffled_mask(train, sequential_memory, pad=1)\n", 60 | " sequential_memory = sequential_memory.dimshuffle((1, 0, 2))\n", 61 | " h = self.inner_activation(tt.dot(sequential_memory, self.W_m)\n", 62 | " + tt.dot(driver, self.W_d)\n", 63 | " + self.b_inner)\n", 64 | " a = tt.exp(tt.dot(h, self.W_a) + self.b_out)\n", 65 | " \n", 66 | " \n", 67 | " output = None #XXX: TODO\n", 68 | " return output\n", 69 | " \n", 70 | " def _variable_length_softmax_step(self, a_t, sum_t):\n", 71 | " return )" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 47, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [], 81 | "source": [ 82 | "class CustomGraph(Graph):\n", 83 | "\n", 84 | " def add_node(self, layer, name, input=None, inputs=[], merge_mode='concat', create_output=False):\n", 85 | " if hasattr(layer, 'set_name'):\n", 86 | " layer.set_name(name)\n", 87 | " if name in self.namespace:\n", 88 | " raise Exception('Duplicate node identifier: ' + name)\n", 89 | " if input:\n", 90 | " if input not in self.namespace:\n", 91 | " raise Exception('Unknown node/input identifier: ' + input)\n", 92 | " if input in self.nodes:\n", 93 | " layer.set_previous(self.nodes[input])\n", 94 | " elif input in self.inputs:\n", 95 | " layer.set_previous(self.inputs[input])\n", 96 | " if inputs:\n", 97 | " to_merge = []\n", 98 | " for n in inputs:\n", 99 | " if 
n in self.nodes:\n", 100 | " to_merge.append(self.nodes[n])\n", 101 | " elif n in self.inputs:\n", 102 | " to_merge.append(self.inputs[n])\n", 103 | " else:\n", 104 | " raise Exception('Unknown identifier: ' + n)\n", 105 | " # XXX: here is the change\n", 106 | " if merge_mode == 'distinct':\n", 107 | " layer.set_previous(*to_merge)\n", 108 | " else:\n", 109 | " merge = Merge(to_merge, mode=merge_mode)\n", 110 | " layer.set_previous(merge)\n", 111 | "\n", 112 | " self.namespace.add(name)\n", 113 | " self.nodes[name] = layer\n", 114 | " self.node_config.append({'name': name,\n", 115 | " 'input': input,\n", 116 | " 'inputs': inputs,\n", 117 | " 'merge_mode': merge_mode})\n", 118 | " layer.init_updates()\n", 119 | " params, regularizers, constraints, updates = layer.get_params()\n", 120 | " self.params += params\n", 121 | " self.regularizers += regularizers\n", 122 | " self.constraints += constraints\n", 123 | " self.updates += updates\n", 124 | "\n", 125 | " if create_output:\n", 126 | " self.add_output(name, input=name)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 54, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "graph = CustomGraph()\n", 138 | "graph.add_input(name='context_sequences', ndim=3)\n", 139 | "graph.add_node(GRU(32, return_sequences=True), name='dense1', input='context_sequences')\n", 140 | "graph.add_node(Dense(32, 4), name='dense2', input='context_sequences')\n", 141 | "graph.add_node(SoftSequentialAttentionLayer(),\n", 142 | " name='attention', inputs=['dense1', 'dense2'],\n", 143 | " merge_mode='distinct')\n", 144 | "graph.add_output(name='output1', input='dense2')\n", 145 | "graph.add_output(name='output2', input='attention')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 55, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [ 155 | { 156 | "data": { 157 | "text/plain": [ 158 | "{'attention': <__main__.SoftSequentialAttentionLayer at 0x10873d630>,\n", 159 | " 'dense1': ,\n", 160 | " 'dense2': }" 161 | ] 162 | }, 163 | "execution_count": 55, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "graph.nodes" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 56, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "{'attention', 'context_sequences', 'dense1', 'dense2'}" 183 | ] 184 | }, 185 | "execution_count": 56, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "graph.namespace" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 62, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "import numpy as np" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 69, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "x = np.arange(3 * 4 * 5).reshape(5, 3, 4)\n", 223 | "a = np.arange(4 * 2).reshape(4, 2)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 71, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "(5, 3, 2)" 237 | ] 238 | }, 239 | 
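The `SoftSequentialAttentionLayer` above stops at the exponentiated scores `a` and leaves the normalisation over the zero-padded time axis and the weighted pooling of the memory as an explicit TODO. A minimal NumPy sketch of what that missing step would compute, assuming a binary padding mask; the names `masked_softmax_pool` and `mask` are illustrative and not taken from the notebook:

```python
import numpy as np

def masked_softmax_pool(scores, memory, mask):
    # scores: (time, n_samples) unnormalised attention scores
    # memory: (time, n_samples, dim) recurrent states to pool over
    # mask:   (time, n_samples) 1.0 for real time steps, 0.0 for padding
    scores = scores - scores.max(axis=0, keepdims=True)  # numerical stability
    e = np.exp(scores) * mask                            # padded steps get zero weight
    a = e / (e.sum(axis=0, keepdims=True) + 1e-8)        # softmax over the time axis
    return (a[:, :, None] * memory).sum(axis=0)          # (n_samples, dim) context vector

time_steps, n_samples, dim = 4, 2, 3
memory = np.random.rand(time_steps, n_samples, dim)
scores = np.random.rand(time_steps, n_samples)
mask = np.array([[1., 1.], [1., 1.], [1., 0.], [0., 0.]])
print(masked_softmax_pool(scores, memory, mask).shape)   # (2, 3)
```

In the Theano version of the notebook the same normalisation would presumably be driven by `theano.scan` over `padded_mask` rather than by vectorised NumPy calls.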
"execution_count": 71, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "np.dot(x, a).shape" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": { 252 | "collapsed": true 253 | }, 254 | "outputs": [], 255 | "source": [] 256 | } 257 | ], 258 | "metadata": { 259 | "kernelspec": { 260 | "display_name": "Python 3", 261 | "language": "python", 262 | "name": "python3" 263 | }, 264 | "language_info": { 265 | "codemirror_mode": { 266 | "name": "ipython", 267 | "version": 3 268 | }, 269 | "file_extension": ".py", 270 | "mimetype": "text/x-python", 271 | "name": "python", 272 | "nbconvert_exporter": "python", 273 | "pygments_lexer": "ipython3", 274 | "version": "3.4.3" 275 | } 276 | }, 277 | "nbformat": 4, 278 | "nbformat_minor": 0 279 | } 280 | -------------------------------------------------------------------------------- /Clustering Model Selection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "Clustering Model Selection" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [], 14 | "language": "python", 15 | "metadata": {}, 16 | "outputs": [] 17 | } 18 | ], 19 | "metadata": {} 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /Data Preprocessing for the Learning to Rank example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 2, 13 | "metadata": {}, 14 | "source": [ 15 | "Svmlight formatted file parsing with sklearn for Learning to Rank data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import numpy as np\n", 23 | "\n", 24 | "from os.path import expanduser\n", 25 | "from sklearn.datasets import load_svmlight_file\n", 26 | "from sklearn.externals import joblib" 27 | ], 28 | "language": "python", 29 | "metadata": {}, 30 | "outputs": [] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "collapsed": false, 35 | "input": [ 36 | "memory = joblib.Memory(cachedir='.', mmap_mode='r')\n", 37 | "\n", 38 | "@memory.cache\n", 39 | "def load_fold(dataset, subset, fold_idx=1, dtype=np.float32):\n", 40 | " DATA_FOLDER = expanduser('~/data')\n", 41 | " filepath = join(DATA_FOLDER, dataset, 'Fold%d' % fold_idx, subset + '.txt')\n", 42 | " X, y, qid = load_svmlight_file(filepath, dtype=dtype, query_id=True)\n", 43 | " return X.toarray(), y, qid" 44 | ], 45 | "language": "python", 46 | "metadata": {}, 47 | "outputs": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "collapsed": false, 52 | "input": [ 53 | "X_train, y_train, qid_train = load_fold('MSLR-WEB10K','train', fold_idx=1)\n", 54 | "X_vali, y_vali, qid_vali = load_fold('MSLR-WEB10K', 'vali', fold_idx=1)\n", 55 | "X_test, y_test, qid_test = load_fold('MSLR-WEB10K', 'test', fold_idx=1)" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "%%time\n", 66 | "\n", 67 | "np.savez_compressed(expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'),\n", 68 | " X_train=X_train, y_train=y_train, qid_train=qid_train,\n", 69 | " X_vali=X_vali, y_vali=y_vali, 
qid_vali=qid_vali,\n", 70 | " X_test=X_test, y_test=y_test, qid_test=qid_test)" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [] 75 | } 76 | ], 77 | "metadata": {} 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /Function Approximation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 307, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "\n", 13 | "from keras.layers import Dense\n", 14 | "from keras.models import Sequential\n", 15 | "from keras import optimizers" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 916, 21 | "metadata": { 22 | "collapsed": false 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "X = np.random.uniform(low=0, high=1, size=(300, 30))\n", 27 | "y = np.array([0, 1] * (X.shape[0] // 2))" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 917, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "# n_hidden = X.shape[0]\n", 39 | "# W0 = np.ones_like(X.T)\n", 40 | "# b0 = -X.ravel() + 0.001\n", 41 | "# weights_0 = [W0, b0]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 918, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "# hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n", 53 | "# np.linalg.matrix_rank(hidden_activations)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 919, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "n_hidden = X.shape[0]\n", 65 | "W0 = X / (np.linalg.norm(X, axis=1, keepdims=True) ** 2 + 1e-8)\n", 66 | "W0 = W0.T\n", 67 | "b0 = -0.98\n", 68 | "weights_0 = [W0, b0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 920, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "300" 82 | ] 83 | }, 84 | "execution_count": 920, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n", 91 | "np.linalg.matrix_rank(hidden_activations)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 921, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "0.0" 105 | ] 106 | }, 107 | "execution_count": 921, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "np.linalg.det(hidden_activations)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 931, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "n_hidden = X.shape[0]\n", 125 | "W0 = X\n", 126 | "W0 = W0.T\n", 127 | "b0 = -0.98 * np.linalg.norm(X, axis=1) ** 2\n", 128 | "weights_0 = [W0, b0]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 932, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "300" 142 | ] 143 | }, 144 | "execution_count": 932, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n", 151 | 
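Back to the Learning to Rank preprocessing notebook above: the `%%time` cell writes all three MSLR-WEB10K folds into a single compressed archive, and reading them back only takes an `np.load` call, since the archive exposes each array lazily under the keyword names used in the save call (a sketch, reusing the path from the notebook):

```python
import numpy as np
from os.path import expanduser

# Sketch: reload the arrays written by np.savez_compressed in the preprocessing notebook.
data = np.load(expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'))
X_train, y_train, qid_train = data['X_train'], data['y_train'], data['qid_train']
print(X_train.shape, y_train.shape, qid_train.shape)
```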
"np.linalg.matrix_rank(hidden_activations)" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 933, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "5.278958284816851e-214" 165 | ] 166 | }, 167 | "execution_count": 933, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "np.linalg.det(hidden_activations)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 934, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "array([[ 0.20354264, 0. , 0. , 0. , 0. ,\n", 187 | " 0. , 0. , 0. , 0. , 0. ,\n", 188 | " 0. , 0. , 0. , 0. , 0. ,\n", 189 | " 0. , 0. , 0. , 0. , 0. ,\n", 190 | " 0. , 0. , 0. , 0. , 0. ,\n", 191 | " 0. , 0. , 0. , 0. , 0. ,\n", 192 | " 0. , 0. , 0. , 0. , 0. ,\n", 193 | " 0. , 0. , 0. , 0.05563168, 0. ,\n", 194 | " 0. , 0. , 0. , 0. , 0. ,\n", 195 | " 0. , 0. , 0. , 0. , 0. ,\n", 196 | " 0. , 0. , 0. , 0. , 0. ,\n", 197 | " 0. , 0. , 0. , 0. , 0. ,\n", 198 | " 0. , 0. , 0. , 0. , 0. ,\n", 199 | " 0.01919508, 0.06842773, 0. , 0. , 0. ,\n", 200 | " 0. , 0. , 0. , 0. , 0. ,\n", 201 | " 0. , 0. , 0. , 0.2154047 , 0. ,\n", 202 | " 0. , 0. , 0. , 0. , 0. ,\n", 203 | " 0. , 0. , 0. , 0. , 0. ,\n", 204 | " 0. , 0. , 0. , 0. , 0. ,\n", 205 | " 0. , 0. , 0. , 0. , 0. ,\n", 206 | " 0. , 0. , 0. , 0. , 0. ,\n", 207 | " 0. , 0. , 0. , 0. , 0. ,\n", 208 | " 0. , 0. , 0. , 0. , 0. ,\n", 209 | " 0. , 0. , 0. , 0. , 0. ,\n", 210 | " 0. , 0. , 0. , 0. , 0. ,\n", 211 | " 0. , 0. , 0. , 0. , 0. ,\n", 212 | " 0.09565226, 0. , 0. , 0. , 0. ,\n", 213 | " 0. , 0. , 0. , 0. , 0. ,\n", 214 | " 0. , 0. , 0. , 0. , 0. ,\n", 215 | " 0. , 0. , 0. , 0. , 0. ,\n", 216 | " 0. , 0. , 0. , 0. , 0. ,\n", 217 | " 0. , 0. , 0. , 0. , 0. ,\n", 218 | " 0. , 0. , 0. , 0. , 0. ,\n", 219 | " 0. , 0. , 0. , 0. , 0. ,\n", 220 | " 0. , 0. , 0. , 0. , 0. ,\n", 221 | " 0. , 0. , 0. , 0. , 0. ,\n", 222 | " 0. , 0. , 0. , 0. , 0. ,\n", 223 | " 0. , 0. , 0. , 0. , 0. ,\n", 224 | " 0. , 0. , 0. , 0. , 0. ,\n", 225 | " 0. , 0. , 0. , 0. , 0. ,\n", 226 | " 0. , 0. , 0. , 0. , 0. ,\n", 227 | " 0. , 0. , 0. , 0. , 0. ,\n", 228 | " 0. , 0. , 0. , 0. , 0. ,\n", 229 | " 0. , 0. , 0. , 0. , 0. ,\n", 230 | " 0. , 0. , 0. , 0. , 0. ,\n", 231 | " 0. , 0. , 0. , 0. , 0. ,\n", 232 | " 0. , 0. , 0. , 0. , 0. ,\n", 233 | " 0. , 0. , 0. , 0. , 0. ,\n", 234 | " 0. , 0. , 0. , 0. , 0. ,\n", 235 | " 0. , 0. , 0. , 0. , 0. ,\n", 236 | " 0. , 0. , 0. , 0. , 0. ,\n", 237 | " 0. , 0. , 0. , 0. , 0. ,\n", 238 | " 0. , 0. , 0. , 0. , 0. ,\n", 239 | " 0. , 0. , 0. , 0. , 0. ,\n", 240 | " 0. , 0. , 0. , 0. , 0. ,\n", 241 | " 0. , 0. , 0. , 0. , 0. ,\n", 242 | " 0. , 0. , 0. , 0. , 0. ,\n", 243 | " 0. , 0. , 0. , 0. , 0. ,\n", 244 | " 0. , 0. , 0. , 0. , 0. ,\n", 245 | " 0. , 0. , 0. , 0. , 0. ],\n", 246 | " [ 0. , 0.18351664, 0. , 0. , 0. ,\n", 247 | " 0. , 0. , 0. , 0. , 0. ,\n", 248 | " 0. , 0. , 0. , 0. , 0. ,\n", 249 | " 0. , 0. , 0. , 0. , 0. ,\n", 250 | " 0. , 0. , 0. , 0. , 0. ,\n", 251 | " 0. , 0. , 0. , 0. , 0. ,\n", 252 | " 0. , 0. , 0. , 0. , 0. ,\n", 253 | " 0. , 0. , 0. , 0. , 0. ,\n", 254 | " 0. , 0. , 0. , 0. , 0. ,\n", 255 | " 0. , 0. , 0. , 0. , 0. ,\n", 256 | " 0. , 0. , 0. , 0. , 0. ,\n", 257 | " 0. , 0. , 0. , 0. , 0. ,\n", 258 | " 0. , 0. , 0. , 0. , 0. ,\n", 259 | " 0. , 0. , 0. , 0. , 0. ,\n", 260 | " 0. , 0. , 0. , 0. , 0. ,\n", 261 | " 0. , 0. , 0. , 0.28104634, 0. ,\n", 262 | " 0. 
, 0. , 0. , 0. , 0. ,\n", 263 | " 0. , 0. , 0. , 0. , 0. ,\n", 264 | " 0. , 0. , 0. , 0. , 0. ,\n", 265 | " 0. , 0. , 0. , 0. , 0. ,\n", 266 | " 0. , 0. , 0. , 0. , 0. ,\n", 267 | " 0. , 0. , 0. , 0. , 0. ,\n", 268 | " 0. , 0. , 0. , 0. , 0. ,\n", 269 | " 0. , 0. , 0. , 0. , 0. ,\n", 270 | " 0. , 0. , 0. , 0. , 0. ,\n", 271 | " 0. , 0. , 0. , 0. , 0. ,\n", 272 | " 0. , 0. , 0. , 0. , 0. ,\n", 273 | " 0. , 0. , 0. , 0. , 0. ,\n", 274 | " 0. , 0. , 0. , 0. , 0. ,\n", 275 | " 0. , 0. , 0. , 0. , 0. ,\n", 276 | " 0. , 0. , 0. , 0. , 0. ,\n", 277 | " 0. , 0. , 0. , 0. , 0. ,\n", 278 | " 0. , 0. , 0. , 0. , 0. ,\n", 279 | " 0. , 0. , 0. , 0. , 0. ,\n", 280 | " 0. , 0. , 0. , 0. , 0. ,\n", 281 | " 0. , 0. , 0. , 0. , 0. ,\n", 282 | " 0. , 0. , 0. , 0. , 0. ,\n", 283 | " 0. , 0. , 0. , 0. , 0. ,\n", 284 | " 0. , 0. , 0. , 0. , 0. ,\n", 285 | " 0. , 0. , 0. , 0. , 0. ,\n", 286 | " 0. , 0. , 0. , 0. , 0. ,\n", 287 | " 0. , 0. , 0. , 0. , 0. ,\n", 288 | " 0. , 0. , 0. , 0. , 0. ,\n", 289 | " 0. , 0. , 0. , 0. , 0. ,\n", 290 | " 0. , 0. , 0. , 0. , 0. ,\n", 291 | " 0. , 0. , 0. , 0. , 0. ,\n", 292 | " 0. , 0. , 0. , 0. , 0. ,\n", 293 | " 0. , 0. , 0. , 0. , 0. ,\n", 294 | " 0. , 0. , 0. , 0. , 0. ,\n", 295 | " 0. , 0. , 0. , 0. , 0. ,\n", 296 | " 0. , 0. , 0. , 0. , 0. ,\n", 297 | " 0. , 0. , 0. , 0. , 0. ,\n", 298 | " 0. , 0. , 0. , 0. , 0. ,\n", 299 | " 0. , 0. , 0. , 0. , 0. ,\n", 300 | " 0. , 0. , 0. , 0. , 0. ,\n", 301 | " 0. , 0. , 0. , 0. , 0. ,\n", 302 | " 0. , 0. , 0. , 0. , 0. ,\n", 303 | " 0. , 0. , 0. , 0. , 0. ,\n", 304 | " 0. , 0. , 0. , 0. , 0. ,\n", 305 | " 0. , 0. , 0. , 0. , 0. ],\n", 306 | " [ 0. , 0. , 0.16006979, 0. , 0. ,\n", 307 | " 0. , 0. , 0. , 0. , 0. ,\n", 308 | " 0. , 0. , 0. , 0. , 0. ,\n", 309 | " 0. , 0. , 0. , 0. , 0. ,\n", 310 | " 0. , 0. , 0. , 0. , 0. ,\n", 311 | " 0. , 0. , 0. , 0. , 0. ,\n", 312 | " 0. , 0. , 0. , 0. , 0. ,\n", 313 | " 0. , 0. , 0. , 0. , 0. ,\n", 314 | " 0. , 0. , 0. , 0. , 0. ,\n", 315 | " 0. , 0. , 0. , 0. , 0. ,\n", 316 | " 0. , 0. , 0. , 0. , 0. ,\n", 317 | " 0. , 0. , 0. , 0. , 0. ,\n", 318 | " 0. , 0. , 0. , 0. , 0. ,\n", 319 | " 0. , 0. , 0. , 0. , 0. ,\n", 320 | " 0. , 0. , 0. , 0. , 0. ,\n", 321 | " 0. , 0. , 0. , 0. , 0. ,\n", 322 | " 0. , 0. , 0. , 0. , 0. ,\n", 323 | " 0. , 0. , 0. , 0. , 0. ,\n", 324 | " 0. , 0. , 0. , 0. , 0. ,\n", 325 | " 0. , 0. , 0. , 0. , 0. ,\n", 326 | " 0. , 0. , 0. , 0. , 0. ,\n", 327 | " 0. , 0. , 0. , 0. , 0. ,\n", 328 | " 0. , 0. , 0. , 0. , 0. ,\n", 329 | " 0. , 0. , 0. , 0. , 0. ,\n", 330 | " 0. , 0. , 0. , 0. , 0. ,\n", 331 | " 0. , 0. , 0. , 0. , 0. ,\n", 332 | " 0. , 0. , 0. , 0. , 0. ,\n", 333 | " 0. , 0. , 0. , 0. , 0. ,\n", 334 | " 0. , 0. , 0. , 0. , 0. ,\n", 335 | " 0. , 0. , 0. , 0. , 0. ,\n", 336 | " 0. , 0. , 0. , 0. , 0. ,\n", 337 | " 0. , 0. , 0. , 0. , 0. ,\n", 338 | " 0. , 0. , 0. , 0. , 0. ,\n", 339 | " 0. , 0. , 0. , 0. , 0. ,\n", 340 | " 0. , 0. , 0. , 0. , 0. ,\n", 341 | " 0. , 0. , 0. , 0. , 0. ,\n", 342 | " 0. , 0. , 0. , 0. , 0. ,\n", 343 | " 0. , 0. , 0. , 0. , 0. ,\n", 344 | " 0. , 0. , 0. , 0. , 0. ,\n", 345 | " 0. , 0. , 0. , 0. , 0. ,\n", 346 | " 0. , 0. , 0. , 0. , 0. ,\n", 347 | " 0. , 0. , 0. , 0. , 0. ,\n", 348 | " 0. , 0. , 0. , 0. , 0. ,\n", 349 | " 0. , 0. , 0. , 0. , 0. ,\n", 350 | " 0. , 0. , 0. , 0. , 0. ,\n", 351 | " 0. , 0. , 0. , 0. , 0. ,\n", 352 | " 0. , 0. , 0. , 0. , 0. ,\n", 353 | " 0. , 0. , 0. , 0. , 0. ,\n", 354 | " 0. , 0. , 0. , 0. , 0. ,\n", 355 | " 0. , 0. , 0. , 0. , 0. ,\n", 356 | " 0. , 0. , 0. , 0. , 0. ,\n", 357 | " 0. , 0. , 0. , 0. , 0. 
,\n", 358 | " 0. , 0. , 0. , 0. , 0. ,\n", 359 | " 0. , 0. , 0. , 0. , 0. ,\n", 360 | " 0. , 0. , 0. , 0. , 0. ,\n", 361 | " 0. , 0. , 0. , 0. , 0. ,\n", 362 | " 0. , 0. , 0. , 0. , 0. ,\n", 363 | " 0. , 0. , 0. , 0. , 0. ,\n", 364 | " 0. , 0. , 0. , 0. , 0. ,\n", 365 | " 0. , 0. , 0. , 0. , 0. ]])" 366 | ] 367 | }, 368 | "execution_count": 934, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "hidden_activations[:3]" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 935, 380 | "metadata": { 381 | "collapsed": false 382 | }, 383 | "outputs": [], 384 | "source": [ 385 | "# W1 = np.linalg.solve(hidden_activations, y)[:, np.newaxis]\n", 386 | "W1 = np.dot(np.linalg.pinv(hidden_activations), y)[:, np.newaxis]\n", 387 | "b1 = np.zeros(1)\n", 388 | "weights_1 = [W1, b1]" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 936, 394 | "metadata": { 395 | "collapsed": false 396 | }, 397 | "outputs": [ 398 | { 399 | "data": { 400 | "text/plain": [ 401 | "array([[ 8.12960810e-14],\n", 402 | " [ 1.00000000e+00],\n", 403 | " [ 1.53071833e-13],\n", 404 | " [ 1.00000000e+00],\n", 405 | " [ 5.55215829e-15]])" 406 | ] 407 | }, 408 | "execution_count": 936, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "np.dot(hidden_activations, W1)[:5]" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": 937, 420 | "metadata": { 421 | "collapsed": false 422 | }, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "Epoch 1/5\n", 429 | "1s - loss: 2.0071e-08 - acc: 1.0000\n", 430 | "Epoch 2/5\n", 431 | "0s - loss: 2.0071e-08 - acc: 1.0000\n", 432 | "Epoch 3/5\n", 433 | "0s - loss: 2.0071e-08 - acc: 1.0000\n", 434 | "Epoch 4/5\n", 435 | "0s - loss: 2.0071e-08 - acc: 1.0000\n", 436 | "Epoch 5/5\n", 437 | "0s - loss: 2.0071e-08 - acc: 1.0000\n" 438 | ] 439 | }, 440 | { 441 | "data": { 442 | "text/plain": [ 443 | "" 444 | ] 445 | }, 446 | "execution_count": 937, 447 | "metadata": {}, 448 | "output_type": "execute_result" 449 | } 450 | ], 451 | "source": [ 452 | "model = Sequential()\n", 453 | "first_layer = Dense(output_dim=n_hidden, input_dim=X.shape[1], activation='relu',\n", 454 | " weights=weights_0)\n", 455 | "first_layer.trainable = False\n", 456 | "model.add(first_layer)\n", 457 | "second_layer = Dense(output_dim=1, activation='linear', weights=weights_1)\n", 458 | "second_layer.trainable = False\n", 459 | "model.add(second_layer)\n", 460 | "\n", 461 | "optimizer = optimizers.Adam(lr=0.001)\n", 462 | "model.compile(optimizer=optimizer, loss='mse',\n", 463 | " metrics=['accuracy'])\n", 464 | "\n", 465 | "model.fit(X, y, nb_epoch=5, verbose=2)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": { 481 | "collapsed": true 482 | }, 483 | "outputs": [], 484 | "source": [] 485 | } 486 | ], 487 | "metadata": { 488 | "kernelspec": { 489 | "display_name": "Python 3", 490 | "language": "python", 491 | "name": "python3" 492 | }, 493 | "language_info": { 494 | "codemirror_mode": { 495 | "name": "ipython", 496 | "version": 3 497 | }, 498 | "file_extension": ".py", 499 | "mimetype": "text/x-python", 500 | "name": "python", 501 | "nbconvert_exporter": "python", 502 | "pygments_lexer": 
"ipython3", 503 | "version": "3.5.2" 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 1 508 | } 509 | -------------------------------------------------------------------------------- /MNIST8M Chunking and Upload to Cloud Blob Storage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Chunking the MNIST8M dataset and store the chunks in the cloud" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "This notebook is an example to demonstrate how to preprocess a large dataset in the svmlight format to convert into chunked, dense numpy arrays that are them compressed individually and stored in a cloud object store on Amazon S3 or Azure Blob Store for later consumption by machine learning models." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "collapsed": false, 28 | "input": [ 29 | "import re\n", 30 | "import bz2\n", 31 | "import os\n", 32 | "from os.path import expanduser, join, exists\n", 33 | "from configparser import ConfigParser\n", 34 | "from time import time\n", 35 | "\n", 36 | "import numpy as np\n", 37 | "from concurrent.futures import ThreadPoolExecutor\n", 38 | "\n", 39 | "from libcloud.storage.types import Provider\n", 40 | "from libcloud.storage.types import ContainerDoesNotExistError\n", 41 | "from libcloud.storage.types import ObjectDoesNotExistError\n", 42 | "from libcloud.storage.providers import get_driver\n", 43 | "\n", 44 | "\n", 45 | "DATA_FOLDER = expanduser('~/data/mnist8m')\n", 46 | "SVMLIGHT_DATA_FOLDER = join(DATA_FOLDER, 'svmlight')\n", 47 | "NUMPY_DATA_FOLDER = join(DATA_FOLDER, 'numpy')\n", 48 | "\n", 49 | "MNIST8M_SRC_URL = ('http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/'\n", 50 | " 'datasets/multiclass/mnist8m.bz2')\n", 51 | "MNIST8M_SRC_FILENAME = MNIST8M_SRC_URL.rsplit('/', 1)[1]\n", 52 | "MNIST8M_SRC_FILEPATH = join(DATA_FOLDER, MNIST8M_SRC_FILENAME)\n", 53 | "\n", 54 | "\n", 55 | "CHUNK_FILENAME_PREFIX = \"mnist8m-chunk-\"\n", 56 | "\n", 57 | "CHUNK_SIZE = 100000" 58 | ], 59 | "language": "python", 60 | "metadata": {}, 61 | "outputs": [], 62 | "prompt_number": 15 63 | }, 64 | { 65 | "cell_type": "heading", 66 | "level": 2, 67 | "metadata": {}, 68 | "source": [ 69 | "Decompressing and chunking the source dataset" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "Download the `mnist8m.bz2` source file into the data folder if not previously downloaded:" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "collapsed": false, 82 | "input": [ 83 | "if not exists(DATA_FOLDER):\n", 84 | " os.makedirs(DATA_FOLDER)\n", 85 | "\n", 86 | "if not exists(MNIST8M_SRC_FILEPATH):\n", 87 | " cmd = \"(cd '%s' && wget -c '%s')\" % (DATA_FOLDER, MNIST8M_SRC_URL)\n", 88 | " print(cmd)\n", 89 | " os.system(cmd)" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [], 94 | "prompt_number": 16 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Decompress the big bz2 source file and chunk the source svmlight formatted data file to make it easier to process it in parallel:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "collapsed": false, 106 | "input": [ 107 | "if not exists(SVMLIGHT_DATA_FOLDER):\n", 108 | " os.makedirs(SVMLIGHT_DATA_FOLDER)\n", 
109 | "\n", 110 | "chunk_filenames = [fn for fn in os.listdir(SVMLIGHT_DATA_FOLDER)\n", 111 | " if (fn.startswith(CHUNK_FILENAME_PREFIX)\n", 112 | " and fn.endswith('.svmlight'))]\n", 113 | "chunk_filenames.sort()\n", 114 | "\n", 115 | "\n", 116 | "def get_svmlight_filename(chunk_idx):\n", 117 | " chunk_filename = \"%s%03d.svmlight\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n", 118 | " return join(SVMLIGHT_DATA_FOLDER, chunk_filename)\n", 119 | "\n", 120 | "\n", 121 | "if not chunk_filenames:\n", 122 | " chunk_filenames = []\n", 123 | " with bz2.BZ2File(MNIST8M_SRC_FILEPATH) as source:\n", 124 | " target, line_no, chunk_idx = None, 0, 0\n", 125 | " for line in source:\n", 126 | " line_no += 1\n", 127 | " if target is None:\n", 128 | " chunk_filename = get_svmlight_filename(chunk_idx)\n", 129 | " target = open(chunk_filename, 'wb')\n", 130 | " chunk_idx += 1\n", 131 | " chunk_filenames.append(chunk_filename)\n", 132 | " \n", 133 | " target.write(line)\n", 134 | " \n", 135 | " if line_no >= CHUNK_SIZE:\n", 136 | " target.close()\n", 137 | " target, line_no = None, 0\n", 138 | " if target is not None:\n", 139 | " target.close()" 140 | ], 141 | "language": "python", 142 | "metadata": {}, 143 | "outputs": [], 144 | "prompt_number": 22 145 | }, 146 | { 147 | "cell_type": "heading", 148 | "level": 2, 149 | "metadata": {}, 150 | "source": [ 151 | "Parsing the svmlight format in parallel and compressing the resulting chunks locally" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Parse the svmlight formatted chunks into dense numpy arrays and store the resulting chunks as compressed binary files using NumPy own format." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "collapsed": false, 164 | "input": [ 165 | "from IPython.parallel import Client\n", 166 | "client = Client()\n", 167 | "lb_view = client.load_balanced_view()\n", 168 | "len(lb_view)" 169 | ], 170 | "language": "python", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "metadata": {}, 175 | "output_type": "pyout", 176 | "prompt_number": 23, 177 | "text": [ 178 | "4" 179 | ] 180 | } 181 | ], 182 | "prompt_number": 23 183 | }, 184 | { 185 | "cell_type": "code", 186 | "collapsed": false, 187 | "input": [ 188 | "def parse_svmlight_chunk(input_chunk_filename, output_chunk_filename,\n", 189 | " output_chunk_labels_filename,\n", 190 | " n_features, chunk_size=CHUNK_SIZE):\n", 191 | " # Import dependencies lazily to be able to run this function\n", 192 | " # on remote nodes of the cluster in parallel with IPython\n", 193 | " from sklearn.datasets import load_svmlight_file\n", 194 | "\n", 195 | " if (not exists(output_chunk_filename)\n", 196 | " or not exists(output_chunk_labels_filename)):\n", 197 | " X, y = load_svmlight_file(input_chunk_filename, n_features=n_features)\n", 198 | " np.savez_compressed(output_chunk_filename, X.toarray() / 255.)\n", 199 | " np.savez_compressed(output_chunk_labels_filename, y)\n", 200 | "\n", 201 | "\n", 202 | "def get_numpy_filenames(i):\n", 203 | " data = \"%s%03d_data.npz\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n", 204 | " labels = \"%s%03d_labels.npz\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n", 205 | " return (\n", 206 | " join(NUMPY_DATA_FOLDER, data),\n", 207 | " join(NUMPY_DATA_FOLDER, labels),\n", 208 | " )\n", 209 | "\n", 210 | " \n", 211 | "tasks = []\n", 212 | "n_features = 28 ** 2 # hardcoded for now\n", 213 | "\n", 214 | "for i in range(81): # 8100000 lines // 100000 lines per chunk:\n", 215 | " svmlight_chunk_name = 
get_svmlight_filename(i)\n", 216 | " data_chunk_name, label_chunk_name = get_numpy_filenames(i)\n", 217 | " tasks.append(lb_view.apply(parse_svmlight_chunk,\n", 218 | " svmlight_chunk_name,\n", 219 | " data_chunk_name,\n", 220 | " label_chunk_name,\n", 221 | " n_features))" 222 | ], 223 | "language": "python", 224 | "metadata": {}, 225 | "outputs": [], 226 | "prompt_number": 24 227 | }, 228 | { 229 | "cell_type": "code", 230 | "collapsed": false, 231 | "input": [ 232 | "sum(t.ready() for t in tasks), len(tasks)" 233 | ], 234 | "language": "python", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "metadata": {}, 239 | "output_type": "pyout", 240 | "prompt_number": 30, 241 | "text": [ 242 | "(0, 81)" 243 | ] 244 | } 245 | ], 246 | "prompt_number": 30 247 | }, 248 | { 249 | "cell_type": "heading", 250 | "level": 2, 251 | "metadata": {}, 252 | "source": [ 253 | "Uploading the results to a cloud store" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "CONFIGFILE_PATH = 'cloudstorage.ini'" 261 | ], 262 | "language": "python", 263 | "metadata": {}, 264 | "outputs": [], 265 | "prompt_number": 112 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Let's use [Apache Libcloud](http://libcloud.apache.org) to upload the chunk objects to a permanent store for later usage in ephemeral VMs. We will store the credential in a configuration file named `cloudstorage.ini`. Here is the expected content for the Windows Azure Cloud:\n", 272 | "\n", 273 | "```\n", 274 | "[account]\n", 275 | "libcloud_provider = azure_blobs\n", 276 | "account_name = myacount\n", 277 | "account_secret = primarykey\n", 278 | "```\n", 279 | "\n", 280 | "On Amazon S3, the config file would look like:\n", 281 | "\n", 282 | "```\n", 283 | "[account]\n", 284 | "libcloud_provider = s3\n", 285 | "account_name = aws_key_id\n", 286 | "account_secret = aws_secret_key\n", 287 | "```\n", 288 | "\n", 289 | "Apache Libcloud supports many more [Cloud Object Store providers](https://ci.apache.org/projects/libcloud/docs/storage/supported_providers.html).\n", 290 | "\n", 291 | "The objects will be stored in a specific container. On some providers, the container name must be globally unique (such as is the case for bucket names on S3). On others like Azure, the container names are local to the cloud storage account. In case of conflict, just change the container name: " 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "collapsed": false, 297 | "input": [ 298 | "CONTAINER_NAME = \"mnist8m\"" 299 | ], 300 | "language": "python", 301 | "metadata": {}, 302 | "outputs": [], 303 | "prompt_number": 110 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "The following function parse the `cloudstorage.ini` file and build a Libcloud driver instance. This instance is not thread safe, hence we wrap the driver instanciation in a function to be reused in individual threads." 
310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "collapsed": false, 315 | "input": [ 316 | "def build_driver(configfile_path=CONFIGFILE_PATH, section='account'):\n", 317 | " config = ConfigParser()\n", 318 | " config.read(configfile_path)\n", 319 | " provider_name = config.get(section, 'libcloud_provider')\n", 320 | " driver_type = get_driver(provider_name)\n", 321 | " account_name = config.get(section, 'account_name')\n", 322 | " account_secret = config.get(section, 'account_secret')\n", 323 | " return driver_type(account_name, account_secret)\n", 324 | "\n", 325 | "driver = build_driver()" 326 | ], 327 | "language": "python", 328 | "metadata": {}, 329 | "outputs": [], 330 | "prompt_number": 103 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "The following utility function checks that a container with a specific name exits on the Cloud Storage provider, otherwise it creates it:" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "collapsed": false, 342 | "input": [ 343 | "def get_or_create_container(driver, container_name=CONTAINER_NAME):\n", 344 | " try:\n", 345 | " return driver.get_container(container_name)\n", 346 | " except ContainerDoesNotExistError:\n", 347 | " return driver.create_container(container_name)\n", 348 | " \n", 349 | "container = get_or_create_container(driver)" 350 | ], 351 | "language": "python", 352 | "metadata": {}, 353 | "outputs": [], 354 | "prompt_number": 104 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "We can now write a function that uploads invidual local files to a target object container. As this function will be called in parallel in various threads we instanciate a dedicated driver inside." 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "collapsed": false, 366 | "input": [ 367 | "def upload_object(local_folder, object_name, container_name=CONTAINER_NAME, skip_if_exists=True):\n", 368 | " driver = build_driver() # libcloud drivers are not thread-safe\n", 369 | " container = get_or_create_container(driver, container_name)\n", 370 | " filepath = os.path.join(local_folder, object_name)\n", 371 | " if skip_if_exists:\n", 372 | " try:\n", 373 | " # Check the size to deal with partially uploaded files\n", 374 | " ob = container.get_object(object_name)\n", 375 | " if ob.size == os.stat(filepath).st_size:\n", 376 | " return ob\n", 377 | " except ObjectDoesNotExistError:\n", 378 | " pass\n", 379 | " return container.upload_object(filepath, object_name,\n", 380 | " extra={'content_type': 'application/octet-stream'})" 381 | ], 382 | "language": "python", 383 | "metadata": {}, 384 | "outputs": [], 385 | "prompt_number": 105 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "Finally let us upload all the chunks and labels from the MNIST8M dataset in parallel to speedup the upload. 
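Once the upload loop below has completed, the payoff is that an ephemeral VM can pull any chunk straight from the container without touching the original bz2 file. Here is a sketch of that consumption side, reusing the `build_driver` and `get_or_create_container` helpers defined above together with libcloud's standard `download_object` call (the local path and chunk name are just examples):

```python
import numpy as np

def download_chunk(object_name, local_path, container_name=CONTAINER_NAME):
    driver = build_driver()   # one driver per thread, as for the uploads
    container = get_or_create_container(driver, container_name)
    obj = container.get_object(object_name)
    driver.download_object(obj, local_path, overwrite_existing=True)
    archive = np.load(local_path)
    # np.savez_compressed was called with a single positional array, hence the 'arr_0' key
    return archive['arr_0']

# X_chunk = download_chunk('mnist8m-chunk-000_data.npz', '/tmp/mnist8m-chunk-000_data.npz')
# X_chunk would have shape (100000, 784) with pixel values scaled to [0, 1].
```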
As IPython does not seem to be fully compatible with gevent monkeypatching we will use Python threads to upload data in parallel: " 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "collapsed": false, 397 | "input": [ 398 | "n_workers = 10\n", 399 | "filenames = os.listdir(NUMPY_DATA_FOLDER)\n", 400 | "\n", 401 | "tic = time()\n", 402 | "with ThreadPoolExecutor(max_workers=n_workers) as e:\n", 403 | " for f in filenames:\n", 404 | " e.submit(upload_object, local_folder, f)\n", 405 | "print(\"Uploaded {} files with {} workers in {:0.3f}s\".format(\n", 406 | " len(filenames), n_workers, time() - tic))" 407 | ], 408 | "language": "python", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "output_type": "stream", 413 | "stream": "stdout", 414 | "text": [ 415 | "Uploaded 83 files with 10 workers in 281.750s\n" 416 | ] 417 | } 418 | ], 419 | "prompt_number": 106 420 | } 421 | ], 422 | "metadata": {} 423 | } 424 | ] 425 | } -------------------------------------------------------------------------------- /Non IID cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 2, 13 | "metadata": {}, 14 | "source": [ 15 | "Impact of the dependency between samples on cross-validation test score estimates" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "collapsed": false, 21 | "input": [ 22 | "import numpy as np\n", 23 | "from sklearn.datasets import load_digits" 24 | ], 25 | "language": "python", 26 | "metadata": {}, 27 | "outputs": [], 28 | "prompt_number": 1 29 | }, 30 | { 31 | "cell_type": "code", 32 | "collapsed": false, 33 | "input": [ 34 | "digits = load_digits()\n", 35 | "X, y = digits.data, digits.target" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [], 40 | "prompt_number": 2 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "The digits dataset of scikit-learn is the test set of the [UCI optdigits dataset](http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/). Apparently consecutive samples are more likely to stem from the same writer on this dataset. Hence the samples are not independent and identically distributed (iid) as different writing styles grouped togethers effectively introduce a dependency. Unfortunately the exact per-sample authorship metadata has not be kept in the optdigits dataset.\n", 47 | "\n", 48 | "This is highlighted by the fact that shuffling the data significantly affects the test score estimated by K-Fold cross-validation. 
Let us build a model with non-optimal parameters to highlight the impact of dependent samples:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "collapsed": false, 54 | "input": [ 55 | "from sklearn.svm import SVC\n", 56 | "\n", 57 | "model = SVC(C=10, gamma=0.005)" 58 | ], 59 | "language": "python", 60 | "metadata": {}, 61 | "outputs": [], 62 | "prompt_number": 3 63 | }, 64 | { 65 | "cell_type": "code", 66 | "collapsed": false, 67 | "input": [ 68 | "from sklearn.cross_validation import cross_val_score\n", 69 | "\n", 70 | "def print_cv_score_summary(model, X, y, cv):\n", 71 | " scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)\n", 72 | " print(\"mean: {:3f}, stdev: {:3f}\".format(\n", 73 | " np.mean(scores), np.std(scores)))" 74 | ], 75 | "language": "python", 76 | "metadata": {}, 77 | "outputs": [], 78 | "prompt_number": 4 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "KFold does not shuffle the data by default hence takes the dependency structure of the dataset into account for small number of folds such as k=5:" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "collapsed": false, 90 | "input": [ 91 | "from sklearn.cross_validation import KFold\n", 92 | "\n", 93 | "cv = KFold(len(y), 5)\n", 94 | "print_cv_score_summary(model, X, y, cv)" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "output_type": "stream", 101 | "stream": "stdout", 102 | "text": [ 103 | "mean: 0.901543, stdev: 0.037016\n" 104 | ] 105 | } 106 | ], 107 | "prompt_number": 5 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": {}, 112 | "source": [ 113 | "If we shuffle the data, the estimated test score is much higher as we hide the dependency structure to the model hence we cannot detect the overfitting caused by the author writing styles:" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "collapsed": false, 119 | "input": [ 120 | "cv = KFold(len(y), 5, shuffle=True, random_state=0)\n", 121 | "print_cv_score_summary(model, X, y, cv)" 122 | ], 123 | "language": "python", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "stream": "stdout", 129 | "text": [ 130 | "mean: 0.968836, stdev: 0.007350\n" 131 | ] 132 | } 133 | ], 134 | "prompt_number": 6 135 | }, 136 | { 137 | "cell_type": "code", 138 | "collapsed": false, 139 | "input": [ 140 | "cv = KFold(len(y), 5, shuffle=True, random_state=1)\n", 141 | "print_cv_score_summary(model, X, y, cv)" 142 | ], 143 | "language": "python", 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "output_type": "stream", 148 | "stream": "stdout", 149 | "text": [ 150 | "mean: 0.967725, stdev: 0.004847\n" 151 | ] 152 | } 153 | ], 154 | "prompt_number": 7 155 | }, 156 | { 157 | "cell_type": "code", 158 | "collapsed": false, 159 | "input": [ 160 | "cv = KFold(len(y), 5, shuffle=True, random_state=2)\n", 161 | "print_cv_score_summary(model, X, y, cv)" 162 | ], 163 | "language": "python", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "output_type": "stream", 168 | "stream": "stdout", 169 | "text": [ 170 | "mean: 0.966622, stdev: 0.010217\n" 171 | ] 172 | } 173 | ], 174 | "prompt_number": 8 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "There is almost **7% discrepancy between the estimated score** probably caused by the dependency between samples.\n", 181 | "\n", 182 | "Those shuffled KFold cv scores are in-line with equivalent `ShuffleSplit`:" 183 | ] 184 | }, 185 | { 186 | "cell_type": 
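If the per-writer metadata had been preserved, the clean fix would be to split by writer rather than by row, so that no writer contributes samples to both sides of a split. The notebook predates it, but current scikit-learn exposes exactly this as `GroupKFold` in `sklearn.model_selection`; a sketch with synthetic group labels standing in for the missing authorship information:

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)
# Hypothetical groups: pretend every run of 30 consecutive samples shares a writer.
groups = np.arange(len(y)) // 30

model = SVC(C=10, gamma=0.005)
scores = cross_val_score(model, X, y, groups=groups, cv=GroupKFold(n_splits=5), n_jobs=-1)
print("mean: {:.3f}, stdev: {:.3f}".format(scores.mean(), scores.std()))
```

With real writer identifiers, the group-wise estimate would be expected to land close to the unshuffled KFold score above, since both keep a writer's samples on a single side of each split.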
"code", 187 | "collapsed": false, 188 | "input": [ 189 | "from sklearn.cross_validation import ShuffleSplit\n", 190 | "\n", 191 | "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=0)\n", 192 | "print_cv_score_summary(model, X, y, cv)" 193 | ], 194 | "language": "python", 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "output_type": "stream", 199 | "stream": "stdout", 200 | "text": [ 201 | "mean: 0.971667, stdev: 0.007115\n" 202 | ] 203 | } 204 | ], 205 | "prompt_number": 9 206 | }, 207 | { 208 | "cell_type": "code", 209 | "collapsed": false, 210 | "input": [ 211 | "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=1)\n", 212 | "print_cv_score_summary(model, X, y, cv)" 213 | ], 214 | "language": "python", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "output_type": "stream", 219 | "stream": "stdout", 220 | "text": [ 221 | "mean: 0.973333, stdev: 0.003333\n" 222 | ] 223 | } 224 | ], 225 | "prompt_number": 10 226 | }, 227 | { 228 | "cell_type": "code", 229 | "collapsed": false, 230 | "input": [ 231 | "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=2)\n", 232 | "print_cv_score_summary(model, X, y, cv)" 233 | ], 234 | "language": "python", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "output_type": "stream", 239 | "stream": "stdout", 240 | "text": [ 241 | "mean: 0.958333, stdev: 0.008784\n" 242 | ] 243 | } 244 | ], 245 | "prompt_number": 11 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Note that `StratifiedKFold` sorts the samples by classes prior to computing the folds hence breaks the dependency too (at least in scikit-learn 0.14):" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "collapsed": false, 257 | "input": [ 258 | "from sklearn.cross_validation import StratifiedKFold\n", 259 | "\n", 260 | "cv = StratifiedKFold(y, 5)\n", 261 | "print_cv_score_summary(model, X, y, cv)" 262 | ], 263 | "language": "python", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "output_type": "stream", 268 | "stream": "stdout", 269 | "text": [ 270 | "mean: 0.969404, stdev: 0.010674\n" 271 | ] 272 | } 273 | ], 274 | "prompt_number": 12 275 | } 276 | ], 277 | "metadata": {} 278 | } 279 | ] 280 | } -------------------------------------------------------------------------------- /Numba Parakeet Cython.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "Numba Parakeet Cython" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Numba vs. Parakeet vs. Cython" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "*This notebook is derived from a blog *\n", 23 | "[*post*](http://jakevdp.github.io/blog/2012/08/24/numba-vs-cython/)\n", 24 | "*by Jake Vanderplas on the blog*\n", 25 | "[*Pythonic Perambulations*](http://jakevdp.github.io) and updated by Olivier Grisel to add Parakeet." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "collapsed": false, 31 | "input": [ 32 | "import numpy as np\n", 33 | "\n", 34 | "X = np.random.random((1000, 3))\n", 35 | "X_wide = np.random.random((1000, 100))" 36 | ], 37 | "language": "python", 38 | "metadata": {}, 39 | "outputs": [], 40 | "prompt_number": 1 41 | }, 42 | { 43 | "cell_type": "heading", 44 | "level": 2, 45 | "metadata": {}, 46 | "source": [ 47 | "Numpy Function With Broadcasting" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "collapsed": false, 53 | "input": [ 54 | "def pairwise_numpy(X):\n", 55 | " return np.sqrt(((X[:, None, :] - X) ** 2).sum(-1))\n", 56 | "%timeit pairwise_numpy(X)" 57 | ], 58 | "language": "python", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "output_type": "stream", 63 | "stream": "stdout", 64 | "text": [ 65 | "10 loops, best of 3: 64.7 ms per loop\n" 66 | ] 67 | } 68 | ], 69 | "prompt_number": 2 70 | }, 71 | { 72 | "cell_type": "heading", 73 | "level": 2, 74 | "metadata": {}, 75 | "source": [ 76 | "Pure Python Function" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "collapsed": false, 82 | "input": [ 83 | "def pairwise_python(X):\n", 84 | " M = X.shape[0]\n", 85 | " N = X.shape[1]\n", 86 | " D = np.empty((M, M), dtype=np.float)\n", 87 | " for i in range(M):\n", 88 | " for j in range(M):\n", 89 | " d = 0.0\n", 90 | " for k in range(N):\n", 91 | " tmp = X[i, k] - X[j, k]\n", 92 | " d += tmp * tmp\n", 93 | " D[i, j] = np.sqrt(d)\n", 94 | " return D" 95 | ], 96 | "language": "python", 97 | "metadata": {}, 98 | "outputs": [], 99 | "prompt_number": 3 100 | }, 101 | { 102 | "cell_type": "code", 103 | "collapsed": false, 104 | "input": [ 105 | "%timeit pairwise_python(X)" 106 | ], 107 | "language": "python", 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "output_type": "stream", 112 | "stream": "stdout", 113 | "text": [ 114 | "1 loops, best of 3: 9.51 s per loop\n" 115 | ] 116 | } 117 | ], 118 | "prompt_number": 4 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Alternative python / numpy implementation closer to the parakeet example from the `examples` folder of its git repo to be fair." 
125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "collapsed": false, 130 | "input": [ 131 | "def pairwise_python2(X):\n", 132 | " n_samples = X.shape[0]\n", 133 | " result = np.zeros((n_samples, n_samples), dtype=X.dtype)\n", 134 | " for i in xrange(X.shape[0]):\n", 135 | " for j in xrange(X.shape[0]):\n", 136 | " result[i, j] = np.sqrt(np.sum((X[i, :] - X[j, :]) ** 2))\n", 137 | " return result" 138 | ], 139 | "language": "python", 140 | "metadata": {}, 141 | "outputs": [], 142 | "prompt_number": 5 143 | }, 144 | { 145 | "cell_type": "code", 146 | "collapsed": false, 147 | "input": [ 148 | "%timeit pairwise_python2(X)" 149 | ], 150 | "language": "python", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "output_type": "stream", 155 | "stream": "stdout", 156 | "text": [ 157 | "1 loops, best of 3: 18.2 s per loop\n" 158 | ] 159 | } 160 | ], 161 | "prompt_number": 6 162 | }, 163 | { 164 | "cell_type": "code", 165 | "collapsed": false, 166 | "input": [ 167 | "#np.allclose(pairwise_python(X), pairwise_python2(X))" 168 | ], 169 | "language": "python", 170 | "metadata": {}, 171 | "outputs": [], 172 | "prompt_number": 7 173 | }, 174 | { 175 | "cell_type": "heading", 176 | "level": 2, 177 | "metadata": {}, 178 | "source": [ 179 | "Numba Wrapper" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "Note: I did not use master as I get a `TypeError: 'numba.numbawrapper.NumbaCompiledWrapper' object is not callable` when calling it." 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "collapsed": false, 192 | "input": [ 193 | "import numba\n", 194 | "\n", 195 | "numba.__version__" 196 | ], 197 | "language": "python", 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "metadata": {}, 202 | "output_type": "pyout", 203 | "prompt_number": 8, 204 | "text": [ 205 | "'0.9.0'" 206 | ] 207 | } 208 | ], 209 | "prompt_number": 8 210 | }, 211 | { 212 | "cell_type": "code", 213 | "collapsed": false, 214 | "input": [ 215 | "from numba import double\n", 216 | "from numba.decorators import jit, autojit\n", 217 | "\n", 218 | "pairwise_numba = autojit(pairwise_python)\n", 219 | "\n", 220 | "%timeit pairwise_numba(X)" 221 | ], 222 | "language": "python", 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "output_type": "stream", 227 | "stream": "stdout", 228 | "text": [ 229 | "1 loops, best of 3: 6.72 ms per loop\n" 230 | ] 231 | } 232 | ], 233 | "prompt_number": 9 234 | }, 235 | { 236 | "cell_type": "code", 237 | "collapsed": false, 238 | "input": [ 239 | "%timeit pairwise_numba(X_wide)" 240 | ], 241 | "language": "python", 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "output_type": "stream", 246 | "stream": "stdout", 247 | "text": [ 248 | "10 loops, best of 3: 97.3 ms per loop\n" 249 | ] 250 | } 251 | ], 252 | "prompt_number": 10 253 | }, 254 | { 255 | "cell_type": "code", 256 | "collapsed": false, 257 | "input": [ 258 | "pairwise_numba2 = autojit(pairwise_python2)\n", 259 | "\n", 260 | "%timeit pairwise_numba2(X)" 261 | ], 262 | "language": "python", 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "output_type": "stream", 267 | "stream": "stdout", 268 | "text": [ 269 | "1 loops, best of 3: 13.9 s per loop" 270 | ] 271 | }, 272 | { 273 | "output_type": "stream", 274 | "stream": "stdout", 275 | "text": [ 276 | "\n" 277 | ] 278 | } 279 | ], 280 | "prompt_number": 11 281 | }, 282 | { 283 | "cell_type": "heading", 284 | "level": 2, 285 | "metadata": {}, 286 | "source": [ 287 | "Parakeet Wrapper" 288 | ] 289 | }, 290 | { 291 | 
"cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "Parakeet is installed from the master branch of the git repo on Jul. 3 2013" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "collapsed": false, 300 | "input": [ 301 | "from parakeet import jit\n", 302 | "\n", 303 | "pairwise_parakeet = jit(pairwise_python)\n", 304 | "\n", 305 | "%timeit pairwise_parakeet(X)" 306 | ], 307 | "language": "python", 308 | "metadata": {}, 309 | "outputs": [ 310 | { 311 | "output_type": "stream", 312 | "stream": "stdout", 313 | "text": [ 314 | "100 loops, best of 3: 12.3 ms per loop\n" 315 | ] 316 | } 317 | ], 318 | "prompt_number": 12 319 | }, 320 | { 321 | "cell_type": "code", 322 | "collapsed": false, 323 | "input": [ 324 | "%timeit pairwise_parakeet(X_wide)" 325 | ], 326 | "language": "python", 327 | "metadata": {}, 328 | "outputs": [ 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | "10 loops, best of 3: 101 ms per loop\n" 334 | ] 335 | } 336 | ], 337 | "prompt_number": 13 338 | }, 339 | { 340 | "cell_type": "code", 341 | "collapsed": false, 342 | "input": [ 343 | "pairwise_parakeet2 = jit(pairwise_python2)\n", 344 | "%timeit pairwise_parakeet2(X)" 345 | ], 346 | "language": "python", 347 | "metadata": {}, 348 | "outputs": [ 349 | { 350 | "output_type": "stream", 351 | "stream": "stdout", 352 | "text": [ 353 | "1 loops, best of 3: 13 ms per loop\n" 354 | ] 355 | } 356 | ], 357 | "prompt_number": 14 358 | }, 359 | { 360 | "cell_type": "code", 361 | "collapsed": false, 362 | "input": [ 363 | "%timeit pairwise_parakeet2(X_wide)" 364 | ], 365 | "language": "python", 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "output_type": "stream", 370 | "stream": "stdout", 371 | "text": [ 372 | "10 loops, best of 3: 103 ms per loop\n" 373 | ] 374 | } 375 | ], 376 | "prompt_number": 15 377 | }, 378 | { 379 | "cell_type": "code", 380 | "collapsed": false, 381 | "input": [ 382 | "np.allclose(pairwise_parakeet(X), pairwise_parakeet2(X))" 383 | ], 384 | "language": "python", 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "metadata": {}, 389 | "output_type": "pyout", 390 | "prompt_number": 16, 391 | "text": [ 392 | "True" 393 | ] 394 | } 395 | ], 396 | "prompt_number": 16 397 | }, 398 | { 399 | "cell_type": "code", 400 | "collapsed": false, 401 | "input": [ 402 | "np.allclose(pairwise_parakeet(X_wide), pairwise_parakeet2(X_wide))" 403 | ], 404 | "language": "python", 405 | "metadata": {}, 406 | "outputs": [ 407 | { 408 | "metadata": {}, 409 | "output_type": "pyout", 410 | "prompt_number": 17, 411 | "text": [ 412 | "True" 413 | ] 414 | } 415 | ], 416 | "prompt_number": 17 417 | }, 418 | { 419 | "cell_type": "heading", 420 | "level": 2, 421 | "metadata": {}, 422 | "source": [ 423 | "Optimized Cython Function" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "collapsed": false, 429 | "input": [ 430 | "!cython --version" 431 | ], 432 | "language": "python", 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "output_type": "stream", 437 | "stream": "stdout", 438 | "text": [ 439 | "Cython version 0.19.1\r\n" 440 | ] 441 | } 442 | ], 443 | "prompt_number": 18 444 | }, 445 | { 446 | "cell_type": "code", 447 | "collapsed": false, 448 | "input": [ 449 | "%load_ext cythonmagic" 450 | ], 451 | "language": "python", 452 | "metadata": {}, 453 | "outputs": [], 454 | "prompt_number": 19 455 | }, 456 | { 457 | "cell_type": "code", 458 | "collapsed": false, 459 | "input": [ 460 | "%%cython\n", 461 | "\n", 462 | "import numpy as np\n", 463 | "cimport 
cython\n", 464 | "from libc.math cimport sqrt\n", 465 | "\n", 466 | "@cython.boundscheck(False)\n", 467 | "@cython.wraparound(False)\n", 468 | "def pairwise_cython(double[:, ::1] X):\n", 469 | " cdef int M = X.shape[0]\n", 470 | " cdef int N = X.shape[1]\n", 471 | " cdef double tmp, d\n", 472 | " cdef double[:, ::1] D = np.empty((M, M), dtype=np.float64)\n", 473 | " for i in range(M):\n", 474 | " for j in range(M):\n", 475 | " d = 0.0\n", 476 | " for k in range(N):\n", 477 | " tmp = X[i, k] - X[j, k]\n", 478 | " d += tmp * tmp\n", 479 | " D[i, j] = sqrt(d)\n", 480 | " return np.asarray(D)" 481 | ], 482 | "language": "python", 483 | "metadata": {}, 484 | "outputs": [], 485 | "prompt_number": 20 486 | }, 487 | { 488 | "cell_type": "code", 489 | "collapsed": false, 490 | "input": [ 491 | "%timeit pairwise_cython(X)" 492 | ], 493 | "language": "python", 494 | "metadata": {}, 495 | "outputs": [ 496 | { 497 | "output_type": "stream", 498 | "stream": "stdout", 499 | "text": [ 500 | "100 loops, best of 3: 6.57 ms per loop\n" 501 | ] 502 | } 503 | ], 504 | "prompt_number": 21 505 | }, 506 | { 507 | "cell_type": "code", 508 | "collapsed": false, 509 | "input": [ 510 | "%timeit pairwise_cython(X_wide)" 511 | ], 512 | "language": "python", 513 | "metadata": {}, 514 | "outputs": [ 515 | { 516 | "output_type": "stream", 517 | "stream": "stdout", 518 | "text": [ 519 | "10 loops, best of 3: 95.5 ms per loop\n" 520 | ] 521 | } 522 | ], 523 | "prompt_number": 22 524 | }, 525 | { 526 | "cell_type": "heading", 527 | "level": 2, 528 | "metadata": {}, 529 | "source": [ 530 | "Fortran/F2Py" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "collapsed": false, 536 | "input": [ 537 | "%%file pairwise_fortran.f\n", 538 | "\n", 539 | " subroutine pairwise_fortran(X,D,m,n)\n", 540 | " integer :: n,m\n", 541 | " double precision, intent(in) :: X(m,n)\n", 542 | " double precision, intent(out) :: D(m,m) \n", 543 | " integer :: i,j,k\n", 544 | " double precision :: r \n", 545 | " do i = 1,m \n", 546 | " do j = 1,m \n", 547 | " r = 0\n", 548 | " do k = 1,n \n", 549 | " r = r + (X(i,k) - X(j,k)) * (X(i,k) - X(j,k)) \n", 550 | " end do \n", 551 | " D(i,j) = sqrt(r) \n", 552 | " end do \n", 553 | " end do \n", 554 | " end subroutine pairwise_fortran" 555 | ], 556 | "language": "python", 557 | "metadata": {}, 558 | "outputs": [ 559 | { 560 | "output_type": "stream", 561 | "stream": "stdout", 562 | "text": [ 563 | "Overwriting pairwise_fortran.f\n" 564 | ] 565 | } 566 | ], 567 | "prompt_number": 23 568 | }, 569 | { 570 | "cell_type": "code", 571 | "collapsed": false, 572 | "input": [ 573 | "# Compile the Fortran with f2py.\n", 574 | "# We'll direct the output into /dev/null so it doesn't fill the screen\n", 575 | "!f2py -c pairwise_fortran.f -m pairwise_fortran > /dev/null" 576 | ], 577 | "language": "python", 578 | "metadata": {}, 579 | "outputs": [], 580 | "prompt_number": 24 581 | }, 582 | { 583 | "cell_type": "code", 584 | "collapsed": false, 585 | "input": [ 586 | "from pairwise_fortran import pairwise_fortran\n", 587 | "XF = np.asarray(X, order='F')\n", 588 | "%timeit pairwise_fortran(XF)" 589 | ], 590 | "language": "python", 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "output_type": "stream", 595 | "stream": "stdout", 596 | "text": [ 597 | "100 loops, best of 3: 10.8 ms per loop\n" 598 | ] 599 | } 600 | ], 601 | "prompt_number": 25 602 | }, 603 | { 604 | "cell_type": "code", 605 | "collapsed": false, 606 | "input": [ 607 | "XF_wide = np.asarray(X_wide, order='F')\n", 608 | "%timeit 
pairwise_fortran(XF_wide)" 609 | ], 610 | "language": "python", 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "output_type": "stream", 615 | "stream": "stdout", 616 | "text": [ 617 | "10 loops, best of 3: 111 ms per loop\n" 618 | ] 619 | } 620 | ], 621 | "prompt_number": 26 622 | }, 623 | { 624 | "cell_type": "heading", 625 | "level": 2, 626 | "metadata": {}, 627 | "source": [ 628 | "Scipy Pairwise Distances" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "collapsed": false, 634 | "input": [ 635 | "from scipy.spatial.distance import cdist\n", 636 | "%timeit cdist(X, X)" 637 | ], 638 | "language": "python", 639 | "metadata": {}, 640 | "outputs": [ 641 | { 642 | "output_type": "stream", 643 | "stream": "stdout", 644 | "text": [ 645 | "100 loops, best of 3: 7.37 ms per loop\n" 646 | ] 647 | } 648 | ], 649 | "prompt_number": 27 650 | }, 651 | { 652 | "cell_type": "code", 653 | "collapsed": false, 654 | "input": [ 655 | "%timeit cdist(X_wide, X_wide)" 656 | ], 657 | "language": "python", 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "output_type": "stream", 662 | "stream": "stdout", 663 | "text": [ 664 | "10 loops, best of 3: 97.6 ms per loop\n" 665 | ] 666 | } 667 | ], 668 | "prompt_number": 28 669 | }, 670 | { 671 | "cell_type": "heading", 672 | "level": 2, 673 | "metadata": {}, 674 | "source": [ 675 | "Scikit-learn Pairwise Distances" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "collapsed": false, 681 | "input": [ 682 | "from sklearn.metrics import euclidean_distances\n", 683 | "%timeit euclidean_distances(X, X)" 684 | ], 685 | "language": "python", 686 | "metadata": {}, 687 | "outputs": [ 688 | { 689 | "output_type": "stream", 690 | "stream": "stdout", 691 | "text": [ 692 | "100 loops, best of 3: 16.2 ms per loop\n" 693 | ] 694 | } 695 | ], 696 | "prompt_number": 29 697 | }, 698 | { 699 | "cell_type": "code", 700 | "collapsed": false, 701 | "input": [ 702 | "%timeit euclidean_distances(X_wide, X_wide)" 703 | ], 704 | "language": "python", 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "output_type": "stream", 709 | "stream": "stdout", 710 | "text": [ 711 | "10 loops, best of 3: 22.4 ms per loop\n" 712 | ] 713 | } 714 | ], 715 | "prompt_number": 30 716 | }, 717 | { 718 | "cell_type": "heading", 719 | "level": 2, 720 | "metadata": {}, 721 | "source": [ 722 | "Remarks and analysis" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "- This was run on a macbook air 2012 2Ghz Core i7 with the default system blas implementation (no MKL) for numpy\n", 730 | "- Some of the timings vary quite a lot from Jake's original post.\n", 731 | "- Numba seems to be able to go twice faster than Parakeet when `n_features` is small (e.g. 3 in Jake's original setting)\n", 732 | "- Numba fails to optimize the python version that uses the numpy notation to compute distances on pairs of rows\n", 733 | "- Maybe calling numba `nopython=True` would catch this but I did not understand how to add this option and make the first example work so I am not sure how to use that option correctly \n", 734 | "- Parakeet is almost as fast as numba when `n_features` grows to more realistic sizes (e.g. 
100)\n", 735 | "- Parakeet can work as efficiently with the numpy row slice expression without any issue which allow for a more natural and concise syntax.\n", 736 | "- Blas (as used in the scikit-learn implementation) is still a killer as soon as all the dimensions are not small (note: the scikit-learn implementation can be less numerically stable though)" 737 | ] 738 | } 739 | ], 740 | "metadata": {} 741 | } 742 | ] 743 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ogrisel's notebook 2 | 3 | This is a bunch of IPython notebooks documents with mostly unfinished ML related experiments. 4 | 5 | Some of them can be executed in a basic numpy / scipy / pandas / matplotlib / scikit-learn 6 | environment for instance using: 7 | 8 | [![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/ogrisel/notebooks) 9 | -------------------------------------------------------------------------------- /Semi-supervised Extra Trees.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "code", 12 | "collapsed": false, 13 | "input": [ 14 | "%matplotlib inline\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "\n", 18 | "from sklearn.ensemble import ExtraTreesClassifier\n", 19 | "from sklearn.datasets import fetch_covtype\n", 20 | "from sklearn.cross_validation import train_test_split\n", 21 | "from sklearn.cross_validation import cross_val_score\n", 22 | "from sklearn.utils import shuffle\n", 23 | "from sklearn.base import BaseEstimator\n", 24 | "from sklearn.base import clone" 25 | ], 26 | "language": "python", 27 | "metadata": {}, 28 | "outputs": [], 29 | "prompt_number": 99 30 | }, 31 | { 32 | "cell_type": "code", 33 | "collapsed": false, 34 | "input": [ 35 | "covtype = fetch_covtype()\n", 36 | "X, y = covtype.data, covtype.target\n", 37 | "\n", 38 | "print(X.shape)\n", 39 | "print(np.unique(y))" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "stream": "stdout", 47 | "text": [ 48 | "(581012, 54)\n", 49 | "[1 2 3 4 5 6 7]\n" 50 | ] 51 | } 52 | ], 53 | "prompt_number": 37 54 | }, 55 | { 56 | "cell_type": "code", 57 | "collapsed": false, 58 | "input": [ 59 | "X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.1)" 60 | ], 61 | "language": "python", 62 | "metadata": {}, 63 | "outputs": [], 64 | "prompt_number": 13 65 | }, 66 | { 67 | "cell_type": "code", 68 | "collapsed": false, 69 | "input": [ 70 | "X_small, X_unlabeled, y_small, _ = train_test_split(X_dev, y_dev, train_size=10000)" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [], 75 | "prompt_number": 30 76 | }, 77 | { 78 | "cell_type": "code", 79 | "collapsed": false, 80 | "input": [ 81 | "%%time\n", 82 | "\n", 83 | "etrees = ExtraTreesClassifier(n_estimators=80, n_jobs=4)\n", 84 | "scores = cross_val_score(etrees, X_small, y_small, cv=5)\n", 85 | "\n", 86 | "print(\"5-folds cv score: %0.3f+/-%0.3f\" % (np.mean(scores), np.std(scores)))" 87 | ], 88 | "language": "python", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "output_type": "stream", 93 | "stream": "stdout", 94 | "text": [ 95 | "5-folds cv score: 0.836+/-0.005\n", 96 | "CPU times: user 14.5 s, sys: 1.99 s, total: 16.5 s\n", 97 
| "Wall time: 6.12 s\n" 98 | ] 99 | } 100 | ], 101 | "prompt_number": 36 102 | }, 103 | { 104 | "cell_type": "code", 105 | "collapsed": false, 106 | "input": [ 107 | "np.random.uniform(size=(3, 4))" 108 | ], 109 | "language": "python", 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "metadata": {}, 114 | "output_type": "pyout", 115 | "prompt_number": 68, 116 | "text": [ 117 | "array([[ 0.29906499, 0.61963946, 0.07382687, 0.51198165],\n", 118 | " [ 0.75008411, 0.32665691, 0.38846908, 0.26959562],\n", 119 | " [ 0.56896242, 0.1422773 , 0.06123208, 0.77610519]])" 120 | ] 121 | } 122 | ], 123 | "prompt_number": 68 124 | }, 125 | { 126 | "cell_type": "code", 127 | "collapsed": false, 128 | "input": [ 129 | "def shuffle_columns(X, copy=True, seed=0):\n", 130 | " rng = np.random.RandomState(seed)\n", 131 | " if copy:\n", 132 | " X = X.copy()\n", 133 | " for i in range(X.shape[1]):\n", 134 | " rng.shuffle(X[:, i])\n", 135 | " return X\n", 136 | "\n", 137 | "\n", 138 | "def corrupt(X, copy=True, rate=0.1, seed=0):\n", 139 | " rng = np.random.RandomState(seed)\n", 140 | " if copy:\n", 141 | " X = X.copy()\n", 142 | " X_shuffled = shuffle_columns(X, seed=0)\n", 143 | " mask = rng.uniform(size=X.shape) < rate\n", 144 | " X[mask] = X_shuffled[mask]\n", 145 | " return X\n", 146 | "\n", 147 | "\n", 148 | "def make_normality_problem(X, seed=0):\n", 149 | " data = np.vstack([X, shuffle_columns(X, seed=seed)])\n", 150 | " target = np.zeros(X.shape[0] * 2, dtype=np.int)\n", 151 | " target[:X.shape[0]] = 1\n", 152 | " return shuffle(data, target, random_state=seed)\n", 153 | "\n", 154 | "\n", 155 | "X_normal, y_normal = make_normality_problem(X_small)" 156 | ], 157 | "language": "python", 158 | "metadata": {}, 159 | "outputs": [], 160 | "prompt_number": 74 161 | }, 162 | { 163 | "cell_type": "code", 164 | "collapsed": false, 165 | "input": [ 166 | "%%time\n", 167 | "\n", 168 | "etrees_normality = ExtraTreesClassifier(n_estimators=80, n_jobs=4)\n", 169 | "scores = cross_val_score(etrees_normality, X_normal, y_normal, cv=5)\n", 170 | "\n", 171 | "print(\"5-folds cv score: %0.3f+/-%0.3f\" % (np.mean(scores), np.std(scores)))" 172 | ], 173 | "language": "python", 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "output_type": "stream", 178 | "stream": "stdout", 179 | "text": [ 180 | "5-folds cv score: 0.978+/-0.001\n", 181 | "CPU times: user 32 s, sys: 464 ms, total: 32.5 s\n", 182 | "Wall time: 10 s\n" 183 | ] 184 | } 185 | ], 186 | "prompt_number": 75 187 | }, 188 | { 189 | "cell_type": "code", 190 | "collapsed": false, 191 | "input": [ 192 | "%%time\n", 193 | "\n", 194 | "_ = etrees_normality.fit(X_normal, y_normal)" 195 | ], 196 | "language": "python", 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "output_type": "stream", 201 | "stream": "stdout", 202 | "text": [ 203 | "CPU times: user 8.47 s, sys: 55.8 ms, total: 8.53 s\n", 204 | "Wall time: 2.38 s\n" 205 | ] 206 | } 207 | ], 208 | "prompt_number": 76 209 | }, 210 | { 211 | "cell_type": "code", 212 | "collapsed": false, 213 | "input": [ 214 | "X_corrupted = corrupt(X_small, rate=0.2)\n", 215 | "\n", 216 | "predicted_normality = etrees_normality.predict_proba(X_corrupted)[:, 1]\n", 217 | "_ = plt.hist(predicted_normality, bins=30)\n", 218 | "\n", 219 | "X_new_unlabeled = X_corrupted[predicted_normality > 0.5]\n", 220 | "print(X_new_unlabeled.shape)" 221 | ], 222 | "language": "python", 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "metadata": {}, 227 | "output_type": "display_data", 228 | "png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEACAYAAABbMHZzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEwpJREFUeJzt3X+MHOVhh/Fni+0WWh/GdWVsnyu7xm4xIlFCsEkimkuT\nWhfU2lYrYWhDncSqKrlN0qpNgymC4w8oSdU0RJGpGmQwUbnKJcgyKVgYEquoCTghhBgO1z8Ut75L\nfSSBYrdqwJa3f7zv5cbH3e3M7N3s3r7PRxp29t13Z17Gt999552ZHZAkSZIkSZIkSZIkSZIkSdIM\ntQMYBg6OKf848DLwIvCZTPk24AhwCFiXKb8qLuMIcM90NVaSNDWuBd7B+eH/fmAfMDs+/6X4uBr4\nbixfBhwFavG1A8CaOP8Y0DttLZYkTYllnB/+u4DfGKfeNuDTmed7gWuARYS9hBE3AH8/tU2UJBXx\nMyXesxL4deAZYD/wrli+GBjM1BsEloxTPhTLJUktMqvkey4h9OqvJuwJ/MpUNkqSNL3KhP8g8Eic\n/xZwDlhA6NEvzdTrjnWH4ny2fGi8Ba9YsaJ+7NixEk2SpKQdAy4r8oYywz67GR3zXwXMAX4E7CGM\n588BlhOGhw4AJ4FTwFrCAeCb4jLe4tixY9Trdad6ndtvv73lbWiXyW3htnBbTD4BK4oGeaOefz/w\nPuAXgRPAbYTTP3cQDgK/CfxBrDtAGAIaAM4CW4F6fG0r8ABwIeFsn71FGypJmjqNwv/GCcpvmqD8\nrjiN9RxwZd5GSZKmV5lhH1Wgp6en1U1oG26LUW6LUW6L5tQaV6lUPY5fSZJyqtVqUDDP7flLUoIM\nf0lKkOEvSW2mq2s+tVot91SGY/6S1GZCoBfJQsf8JUk5GP6SlCDDX5ISZPhLUoIMf0lKkOEvSQky\n/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSlCj8N8BDBNu1j7WnwPngPmZsm3AEeAQsC5T\nflVcxhHgnrKNlSRNjUbhfz/QO075UuA3gf/IlK0GNsXHXmA7oz8xei+wBVgZp/GWKUmqSKPwfxp4\nbZzyzwF/OaZsA9APnAGOA0eBtcAiYC5wINZ7ENhYrrmSpKlQZsx/AzAIfG9M+eJYPmIQWDJO+VAs\nlyS1yKyC9S8CbiEM+Yxot7uBSZIaKBr+K4BlwAvxeTfwHGF4Z4hwLIDMa4OxvHtM+dBEK+jr6/vp\nfE9PDz09PQWbKEmdbn+cysvTa18GPApcOc5r3yecyfMq4UDvQ8AawrDOk8BlhBtRPgt8gjDu/y/A\nF4C94yzPe/hKSl473MO3H/gGsAo4AXx0zOvZ1g0Au+Lj48DWzOtbgfsIp3oeZfzglyRVpN3G6+35\nS0peO/T8JUkdyPCXpAQZ/pKUIMNfkhJk+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlCDD\nX5ISVPT3/Kfd448/nqve1VdfzYIFC6a5NZLUmdruVz0vvrjxvd1/8pPD3HbbFm655ZYKmiRJ1ari\nVz3bruf/+uuNe/612q2cO3eugtZIUmdyzF+SEmT4S1KCDH9JSpDhL0kJahT+O4Bh4GCm7G+Al4EX\ngEeAizOvbSPcpP0QsC5TflVcxhHgnuaaLElqVqPwvx8Ye+7lE8AVwNuBw4TAB1gNbIqPvcB2Rk89\nuhfYAqyMU+PzOSVJ06ZR+D8NvDambB8wcp7ls0B3nN8A9ANngOPAUWAtsAiYCxyI9R4ENjbTaElS\nc5od8/8Y8FicXwwMZl4bBJaMUz4UyyVJLdLMRV5/BbwJPDRFbYn6MvM9cZIkjdofp/LKhv9HgOuA\nD2TKhoClmefdhB7/EKNDQyPlQxMvuq9kkyQpFT2c3zG+o/ASygz79AKfIozx/yRTvge4AZgDLCcc\n2D0AnAROEcb/a8BNwO4S65UkTZFGPf9+4H3AAuAEcDvh7J45hAO/AN8EtgIDwK74eDaWjfwy0Vbg\nAeBCwjGCvVP1PyBJKq5R+N84TtmOSerfFaexngOuzNsoSdL08gpfSUqQ4S9JCTL8JSlBhr8kJcjw\nl6QEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9J\nSpDhL0kJMvwlKUGNwn8HMAwczJTNJ9y8/TDwBDAv89o24AhwCFiXKb8qLuMIcE9zTZYkNatR+N8P\n9I4pu5kQ/quAp+JzgNXApvjYC2wHavG1e4EtwMo4jV2mJKlCjcL/aeC1MWXrgZ1xfiewMc5vAPqB\nM8Bx4CiwFlgEzAUOxHoPZt4jSWqBMmP+CwlDQcTHhXF+MTCYqTcILBmnfCiWS5JaZFaT76/HaQr1\nZeZ74iRJGrU/TuWVCf9h4FLgJGFI55VYPgQszdTrJvT4h+J8tnxo4sX3lWiSJKWkh/M7xncUXkKZ\nYZ89wOY4vxnYnSm/AZgDLCcc2D1A+JI4RRj/rwE3Zd4jSWqBRj3/fuB9wALgBHAbcDewi3D2znHg\n+lh3IJYPAGeBrYwOCW0FHgAuBB4D9k5R+yVJJTQK/xsnKP/gBOV3xWms54Ar8zZKkjS9vMJXkhJk\n+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky/CUpQYa/\nJCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSlAz4b8NeAk4CDwE/CwwH9gHHAaeAOaNqX8EOASsa2K9\nkqQmlQ3/ZcAfAu8k3Jj9AuAG4GZC+K8CnorPAVYDm+JjL7C9iXVLkppUNoBPAWeAi4BZ8fEHwHpg\nZ6yzE9gY5zcA/fE9x4GjwJqS65YkNals+L8K/C3wn4TQ/29Cj38hMBzrDMfnAIuBwcz7B4ElJdct\nSWrSrJLvWwH8KWH453Xgn4EPj6lTj9NEJnitLzPfEydJ0qj9cSqvbPi/C/gG8OP4/BHg3cBJ4NL4\nuAh4Jb4+BCzNvL87lo2jr2STJCkVPZzfMb6j8BLKDvscAq4BLgRqwAeBAeBRYHOssxnYHef3EA4I\nzwGWAyuBAyXXLUlqUtme/wvAg8C3gXPAd4B/AOYCu4AthAO718f6A7F8ADgLbGXyISFJ0jQqG/4A\nn41T1quEvYDx3BUnSVKLea69JCWo1uoGjFHPMxpUq93K7Nmf5803/zfXQufOvYRTp15ttm2SVIla\nrUaxkfHaT/+TVzPDPi0Vgj/fxjl9ut2+4ySptRz2kaQEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky\n/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSpDhL0kJMvwlKUGGvyQlqJnwnwc8DLxMuDH7\nWmA+sA84DDwR64zYBhwBDgHrmlivJKlJzYT/PcBjwOXA2wihfjMh/FcBT8XnAKuBTfGxF9je5Lol\nSU0oG8AXA9cCO+Lzs8DrwHpgZyzbCWyM8xuAfuAMcBw4
CqwpuW5JUpPKhv9y4IfA/cB3gC8BPw8s\nBIZjneH4HGAxMJh5/yCwpOS6JUlNKnsD91nAO4E/Ab4FfJ7RIZ4RdSa/w/oEr/Vl5nviJEkatT9O\n5ZUN/8E4fSs+f5hwQPckcGl8XAS8El8fApZm3t8dy8bRV7JJkpSKHs7vGN9ReAllh31OAicIB3YB\nPgi8BDwKbI5lm4HdcX4PcAMwhzBktBI4UHLdkqQmle35A3wc+EdCoB8DPgpcAOwCthAO7F4f6w7E\n8gHCweGtTD4kJEmaRrVWN2CMep7vhFrtVur1O8n//VGjXve7RtLMUKvVKNY/rv30P3l5rr0kJcjw\nl6QEGf6SlCDDX5Iq0NU1n1qtlmuqQjNn+0iScjp9+jWKnKQy3ez5S1KCDH9JSpDhL0kJMvwlKUGG\nvyQlyPCXpAQZ/pKUIMN/jCIXYnR1zW91cyWpFC/yGqPIhRinT7fbj6JKUj72/CUpQYa/JCXI8Jek\nBBn+kpQgw1+SEtRs+F8APA88Gp/PB/YBh4EngHmZutuAI8AhYF2T65UkNaHZ8P8kMMDouZE3E8J/\nFfBUfA6wGtgUH3uB7VOwbklSSc0EcDdwHXAfo3ceWA/sjPM7gY1xfgPQD5wBjgNHgTVNrFuS1IRm\nwv/vgE8B5zJlC4HhOD8cnwMsBgYz9QaBJU2sW5Jart1uzVhE2St8fwt4hTDe3zNBnTqTXyo7wWt9\nmfmeSRYvSa3Vulsz7o9TeWXD/z2EIZ7rgJ8DuoAvE3r7lwIngUWELwiAIWBp5v3dsWwcfSWbJEmp\n6OH8jvEdhZdQdtjnFkKYLwduAL4G3ATsATbHOpuB3XF+T6w3J75nJXCg5LolSU2aqh92G9nvuRvY\nBWwhHNi9PpYPxPIB4Cywlfz7SpKkKdZuRyHqeb4TarVbqdfvpMhYW72er244MDP1y5XUeYrmxfTU\nHalfLM89174ps3If6ff3/yW1E3/PvylnKfLt7O//S2oX9vwlKUGJ9PxnteVFFpLUKomEf5HhGb8k\nJHU+h30kKUGGvyQlyPCXpAQZ/pKUIMNfkhJk+EuakYr8lr5X179VIqd6Suo0RX5L//Tp2bmv9Zk7\n9xJOnXq1iZbNDIa/pATkv9YnlZ9hcdinTblLK2k62fNvU8V2adPoqUiaOvb8JU2bInuw7sVWy56/\npGlT7Abn7sVWyZ6/JCWobPgvBb4OvAS8CHwils8H9gGHgSeAeZn3bAOOAIeAdSXXq3Hlv6OYu9WS\noHz4nwH+DLgCuAb4Y+By4GZC+K8CnorPAVYDm+JjL7C9iXXrLUZOY2s8hd1wSRPL35maycoG8Eng\nu3H+f4CXgSXAemBnLN8JbIzzG4B+wpfGceAosKbkuiVpGuXvTM1kU9H7Xga8A3gWWAgMx/Lh+Bxg\nMTCYec8g4csiMWn0KNQevFZEk2n2bJ9fAL4CfBI4Pea1Rl+NM/trsxTvKKbqeK2IJtNM+M8mBP+X\ngd2xbBi4lDAstAh4JZYPEQ4Sj+iOZePoy8z3xEmt0NU1P/cxglR+D0XF/i40XfbHqbyyX/c1wpj+\njwkHfkd8NpZ9hnCwd158XA08RBjnXwI8CVzGW7sl9Tw9lVrtVur1O8d5+2TNbXXddmlHjXo9X90w\n/JR3ubMJezb5+GVRTvHgbfXfRdHPSLG/o3b4PLW+7kj9Ynletuf/XuDDwPeA52PZNuBuYBewhXBg\n9/r42kAsHyD8y24lyWGfTlZkSKvYMIN7IKOKXTQ1E4dyHBqtSrttPXv+FdRtjx7e9LUj73KLKNbj\nnk04sS2fIl9Y0/dv0j5/F+3wGZlZdUfqV9Pzl5JSvMftTxqovXmhlSQlyPCXpAQ57KMOM8vb9ZWS\nf7upMxj+yen0D7m36yvHs2xSY/gnxw+5JMNfagOdvjemdmT4K2HtErrujal6hr8SZugqXYa/WqRd\net1Smgx/tYi9bqmVvMhLkhJk+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlKCqw78XOAQc\nAT5d8bolSVGV4X8B8EXCF8Bq4Ebg8grXL0mKqgz/NcBR4DhwBvgnYEOF65ckRVWG/xLgROb5YCyT\nJFWsyh92y/UrXl1dv92wzhtvHOKNN5pujyQlq8rwHwKWZp4vJfT+s46dOvXVFfkXWeTXHtuhbru0\nox3qtks72qFuu7SjHeq2SztmWl2OFalctVmEBi4D5gDfxQO+kpSEDwH/Tjjwu63FbZEkSZI03fJc\n7PWF+PoLwDsqalcrNNoWv0/YBt8D/g14W3VNq1TeCwCvJtwG7HeqaFSL5NkWPcDzwIvA/kpa1RqN\ntsUCYC9hGPlF4COVtax6O4Bh4OAkddo6Ny8gDPssA2Yz/tj/dcBjcX4t8ExVjatYnm3xbuDiON9L\nZ26LPNthpN7XgK8Cv1tV4yqWZ1vMA14CuuPzBVU1rmJ5tkUf8NdxfgHwYzr39rTXEgJ9ovAvlJut\n+G2fPBd7rQd2xvlnCX/sCytqX5XybItvAq/H+WcZ/cB3krwXAH4ceBj4YWUtq16ebfF7wFcYPVvu\nR1U1rmJ5tsV/AV1xvosQ/mcral/VngZem+T1QrnZivDPc7HXeHU6MfSKXvi2hdFv9k6S929iA3Bv\nfJ737u8zTZ5tsRKYD3wd+DZwUzVNq1yebfEl4ArgB4Shjk9W07S2VCg3W7F7lPdDO/Yk1078sBf5\nf3o/8DHgvdPUllbKsx0+D9wc69YofvL4TJFnW8wG3gl8ALiIsHf4DGGst5Pk2Ra3EIaDeoAVwD7g\n7cDp6WtWW8udm60I/zwXe42t0x3LOk2ebQHhIO+XCGP+k+32zVR5tsNVhN1+CGO7HyIMBeyZ9tZV\nK8+2OEEY6vm/OP0rIfA6LfzzbIv3AHfG+WPA94FfJewRpabtczPPxV7ZAxfX0JkHOSHftvhlwrjn\nNZW2rFpFLwC8n8492yfPtvg14EnCAdGLCAcAV1fXxMrk2RafA26P8wsJXw7zK2pfKywj3wHfts3N\n8S72+qM4jfhifP0Fwi5up2q0Le4jHMR6Pk4Hqm5gRfL8TYzo5PCHfNviLwhn/BwEPlFp66rVaFss\nAB4l5MRBwsHwTtVPOLbxJmHv72Okm5uSJEmSJEmSJEmSJEmSJEmSJEmSNLP9P5m/8m5+bQgcAAAA\nAElFTkSuQmCC\n", 229 | "text": [ 230 | "" 231 | ] 232 | } 233 | ], 234 | "prompt_number": 90 235 | }, 236 | { 237 | "cell_type": "code", 238 | "collapsed": false, 239 | "input": [ 240 | "predicted_normality = etrees_normality.predict_proba(X_unlabeled)[:, 1]\n", 241 | "_ = plt.hist(predicted_normality, bins=30)" 242 | ], 
243 | "language": "python", 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "metadata": {}, 248 | "output_type": "display_data", 249 | "png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEACAYAAABPiSrXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFPVJREFUeJzt3VuMVdd9x/HvsTEEN1w8psJcBowc3Ia0qR0acJu0OREy\nHvcBcIsMaYtpgypLtI7VqlJNHswQt039kDhEkXmIibm0JaCgGtxQLgGjRFUxToRtbEK5yEjMYHCK\nzSVKL6CcPqz/MJvJMbPmds6Z8fcjbfY6/32ZdXac/WPvtTcDkiRJkiRJkiRJkiRJkiRJkvrgQ8DL\nwKvAEeDLUW8F2oBDMT1Y2GYFcBw4Cswt1GcCh2PZ6kJ9BLA56geAqYVlS4FjMT3SD99HkjTAbo35\nMNJJ/dPASuCvqqw7gxQwtwB3AieAUiw7CMyK9g6gJdrLgWejvQj4drSbgJPA2Jg62pKkOrkpY52f\nxXw4cDPwXnwuVVl3PrAJuAKcIoXGbGACMIoUHAAbgAXRngesj/ZWYE60HwB2Axdi2kNn0EiS6iAn\nNG4iXT2cA14C3oz6Y8BrwFo6rwAmkm5bdWgDJlWpt0edmJ+O9lXgInD7DfYlSaqTnND4OXAPMBn4\nXaAMrAGmRf1t4CsD1D9JUgMZ1oN1LwLfBX4T2F+oPwe8GO12oLmwbDLpCqE92l3rHdtMAc5Ef8YA\n56NeLmzTDOzr2qm77rqrcvLkyR58DUkSaZz4Iz3dqLsrjXF03noaCdxPelrqjsI6D5GeigLYDiwm\njX9MA6aTxjHOApdI4xslYAmwrbDN0mgvBPZGezfp6auxwG3xs3d17eDJkyepVCpOlQorV66sex8a\nZfJYeCw8FjeegLu6Of9X1d2VxgTSIPVNMW2Mk/oG0q2pCvAW8GisfwTYEvOrpCejKrFsObCOFD47\ngJ1RXxv7PU66wlgc9XeBp4BX4vMq0oC4JKlOuguNw8AnqtRv9M7E38fU1Y+AX69S/1/g4ffZ1/Mx\nSZIaQM5AuAaJcrlc7y40DI9FJ49FJ49F31V712KwqcT9OUlSplKpBL3IAK80JEnZDA1JUjZDQ5KU\nzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KU\nzdCQpCFi9OgmSqVS1tRb/hImSRoiUhjkng/9JUySpAFmaEiSshkakqRshoYkKVt3ofEh4GXgVeAI\n8OWoNwF7gGPAbmBsYZsVwHHgKDC3UJ8JHI5lqwv1EcDmqB8AphaWLY2fcQx4JPM7SZIGSHeh8T/A\nZ4F7gI9H+9PAE6TQuBvYG58BZgCLYt4CPEvn6PwaYBkwPaaWqC8DzkftGeDpqDcBTwKzYlrJ9eEk\nSaqxnNtTP4v5cOBm4D1gHrA+6uuBBdGeD2wCrgCngBPAbGACMAo4GOttKGxT3NdWYE60HyBdxVyI\naQ+dQSNJqoOc0LiJdHvqHPAS8CYwPj4T8/HRngi0FbZtAyZVqbdHnZifjvZV4CJw+w32JUmqk2EZ\n6/ycdHtqDLCLdIuqqEL+2yQDorW19Vq7XC5TLpfr1hdJakz7Y+qbnNDocBH4LmlA+xxwB3CWdOvp\nnVinHWgubDOZdIXQHu2u9Y5tpgBnoj9jSGMc7UC5sE0zsK9ax4qhIUmqpsz1p9RVvdpLd7enxtE5\n+DwSuB84BGwnPdlEzF+I9nZgMWn8YxppcPsgKVwukcY3SsASYFthm459LSQNrEMaz5gbP/+2+Nm7\nevj9JEn9qLsrjQmkQeqbYtpIOqkfAraQnnw6BTwc6x+J+hHS+MRyOm9dLQfWkcJnB7Az6mtjv8dJ\nVxiLo/4u8BTwSnxeRRoQlyTVif9goSQNEf6DhZKkhmJoSJKyGRqSpGyGhiQpm6EhScpmaEiSshka\nkqRshoYkKZuhIUnKZmhIkrIZGpKkbIaGJCmboSFJymZoSJKyGRqSpGyGhiQpm6EhScpmaEiSshka\nkqRshoYkKZuhIUnKZmhIkrIZGpKkbN2FRjPwEvAm8Abwhai3Am3AoZgeLGyzAjgOHAXmFuozgcOx\nbHWhPgLYHPUDwNTCsqXAsZgeyftKkqSBUupm+R0xvQp8GPgRsAB4GLgMfLXL+jOAfwY+CUwCvgdM\nByrAQeAvYr4D+DqwE1gO/FrMFwEPAYuBJuAVUtgQP3smcKHLz6xUKpXMrytJQ1epVCKdbrPWvvZH\nT3R3pXGWFBgAPwV+TAqD9/th84FNwBXgFHACmA1MAEaRAgNgAyl8AOYB66O9FZgT7QeA3aSQuADs\nAVq6/0qSpIHSkzGNO4F7SbeQAB4DXgPWAmOjNpF026pDGylkutbb6QyfScDpaF8FLgK332BfkqQ6\nGZa53oeB7wCPk6441gBfimVPAV8BlvV77zK1trZea5fLZcrlcr26IkkNan9MfZMTGreQbhv9I/BC\n1N4pLH8OeDHa7aTB8w6TSVcI7dHuWu/YZgpwJvozBjgf9XJhm2ZgX7UOFkNDklRNmetPqat6tZfu\nbk+VSLefjgBfK9QnFNoPkZ6KAthOGsQeDkwjDYIfJI2NXCKNb5SAJcC2wjZLo70Q2Bvt3aSnr8YC\ntwH3A7uyv5kkqd91d6XxKeCPgddJj9YCfBH4HHAPaZj+LeDRWHYE2BLzq6QnojqG8pcD64CRpKen\ndkZ9LbCR9MjteVLoALxLuvX1SnxexS8+OSVJqqEeP27VgHzkVpJojEduJUm6xtCQJGUzNCRJ2QwN\nSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2QwN\nSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZuguNZuAl4E3gDeALUW8C9gDHgN3A\n2MI2K4DjwFFgbqE+Ezgcy1YX6iOAzVE/AEwtLFsaP+MY8Ejmd5IkDZDuQuMK8JfAx4D7gD8HPgo8\nQQqNu4G98RlgBrAo5i3As0Aplq0BlgHTY2qJ+jLgfNSeAZ6OehPwJDArppVcH06SpBrrLjTOAq9G\n+6fAj4FJwDxgfdTXAwuiPR/YRAqbU8AJYDYwARgFHIz1NhS2Ke5rKzAn2g+QrmIuxLSHzqCRJNVB\nT8Y07gTuBV4GxgPnon4uPgNMBNoK27SRQqZrvT3qxPx0tK8CF4Hbb7AvSVKdDMtc78Okq4DHgctd\nllViqpvW1tZr7XK5TLlcrltfJKkx7Y+pb3JC4xZSYGwEXojaOeAO0u2rCcA7UW8nDZ53mEy6QmiP\ndtd6xzZTgDPRnzGkMY52oFzYphnYV62DxdCQJFVT5vpT6qpe7aW
721MlYC1wBPhaob6d9GQTMX+h\nUF8MDAemkQa3D5LC5RJpfKMELAG2VdnXQtLAOqTxjLmkwe/bgPuBXT35cpKk/lXqZvmnge8Dr9N5\nC2oFKQi2kK4QTgEPkwarAb4IfJ40PvE4nSf6mcA6YCSwg87Hd0eQrmLuJV1hLI59Avxp7A/gb+kc\nMC+qVCp1vTsmSQ2hVCqRP1pQuvZHj35GTzdoQIaGJFGb0PCNcElSNkNDkpTN0JAkZTM0JEnZDA1J\nUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1J\nUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlywmNbwHngMOFWivQBhyK6cHCshXAceAoMLdQ\nnxn7OA6sLtRHAJujfgCYWli2FDgW0yMZfZUkDaCc0HgeaOlSqwBfBe6N6d+iPgNYFPMW4FmgFMvW\nAMuA6TF17HMZcD5qzwBPR70JeBKYFdNKYGz2N5Mk9buc0PgB8F6VeqlKbT6wCbgCnAJOALOBCcAo\n4GCstwFYEO15wPpobwXmRPsBYDdwIaY9/GJ4SZJqqC9jGo8BrwFr6bwCmEi6bdWhDZhUpd4edWJ+\nOtpXgYvA7TfYlySpTob1crs1wJei/RTwFdJtprpobW291i6Xy5TL5Xp1RZIa1P6Y+qa3ofFOof0c\n8GK024HmwrLJpCuE9mh3rXdsMwU4E/0ZQxrjaAfKhW2agX3VOlMMDUlSNWWuP6Wu6tVeent7akKh\n/RCdT1ZtBxYDw4FppMHtg8BZ4BJpfKMELAG2FbZZGu2FwN5o7yY9fTUWuA24H9jVy/5KkvpBzpXG\nJuAzwDjS2MNKUlzdQ3qK6i3g0Vj3CLAl5leB5bEO0V4HjAR2ADujvhbYSHrk9jwpdADeJd36eiU+\nryINiEuS6qTaE1CDTaVSqXS/liQNcaVSic6/p3e79rU/esI3wiVJ2QwNSVI2Q0OSlM3QkCRlMzQk\nSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQk\nSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KULSc0vgWcAw4Xak3AHuAYsBsYW1i2AjgOHAXm\nFuozYx/HgdWF+ghgc9QPAFMLy5bGzzgGPJLRV0nSAMoJjeeBli61J0ihcTewNz4DzAAWxbwFeBYo\nxbI1wDJgekwd+1wGnI/aM8DTUW8CngRmxbSS68NJklRjOaHxA+C9LrV5wPporwcWRHs+sAm4ApwC\nTgCzgQnAKOBgrLehsE1xX1uBOdF+gHQVcyGmPfxieEmSaqi3YxrjSbesiPn4aE8E2grrtQGTqtTb\no07MT0f7KnARuP0G+5Ik1cmwfthHJaa6aW1tvdYul8uUy+W69UWSGtP+mPqmt6FxDrgDOEu69fRO\n1NuB5sJ6k0lXCO3R7lrv2GYKcCb6M4Y0xtEOlAvbNAP7qnWmGBqSpGrKXH9KXdWrvfT29tR20pNN\nxPyFQn0xMByYRhrcPkgKl0uk8Y0SsATYVmVfC0kD65DGM+aSBr9vA+4HdvWyv5KkfpBzpbEJ+Aww\njjT28CTwD8AW0pNPp4CHY90jUT9CGp9YTuetq+XAOmAksAPYGfW1wEbSI7fnSaED8C7wFPBKfF5F\nGhCXJNVJqftVGl6lUqnrkIokNYRSqUT+EHPp2h894RvhkqRshoYkKZuhIUnKZmhIkrIZGpKkbIaG\nJCmboSFJymZoSJKyGRqSpGyGhiQpm6EhScpmaEiSshkakqRshoYkNbjRo5solUrdTrXgP40uSQ0u\n/588959GlyQ1EENDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2foaGqeA14FDwMGoNQF7\ngGPAbmBsYf0VwHHgKDC3UJ8JHI5lqwv1EcDmqB8Apvaxv5KkPuhraFSAMnAvMCtqT5BC425gb3wG\nmAEsinkL8CydbyOuAZYB02Nqifoy4HzUngGe7mN/JUl90B+3p7q+hj4PWB/t9cCCaM8HNgFXSFco\nJ4DZwARgFJ1XKhsK2xT3tRWY0w/9laSG0Ej/plSu/rjS+B7wQ+DPojYeOBftc/EZYCLQVti2DZhU\npd4edWJ+OtpXgYuk21+SNOhdvvwe6TTa3dQ4hvVx+08BbwO/TLoldbTL8pp849bW1mvtcrlMuVwe\n6B8pSYPM/pj6pq+h8XbMfwL8C2lc4xxwB3CWdOvpnVinHWgubDuZdIXRHu2u9Y5tpgBnoq9jgHe7\ndqIYGpKkasoxdVjVq7305fbUraSxCIBfIj0NdRjYDiyN+lLghWhvBxYDw4FppMHtg6RwuUQa3ygB\nS4BthW069rWQNLAuSaqTvlxpjCddXXTs559Ij9j+ENhCevLpFPBwrHMk6kdI4xPL6bx1tRxYB4wE\ndgA7o74W2Eh65PY8KXQkSXXSWMPyveMvYZI0KPX/L1fylzBJkhqIoSFJymZoSJKyGRqSpGyGhiQp\nm6EhScpmaEiSshkakqRshoYkKZuhIUnKZmhIkrIZGpKkbIaGJCmboSFJymZoSJKy9fXXvUrSB8Lo\n0U1cvvxexpq3AFcGujt14y9hkqQM/f8Lk3qyrr+ESZI0CBkakqRshoYkKZuhIUnKZmhIkrINhtBo\nAY4Cx4G/qXNfJA0So0c3USqVMqbhWespafTQuBn4Bik4ZgCfAz5a1x41sP3799e7Cw3DY9Hpg3os\n0jsVlS7TS1VqV6rUqk2Cxg+NWcAJ4BTpf9lvA/Pr2aFG9kE9OVTjseg0GI5Ff18VvP+Vwf5afq0h\nqdHfCJ8EnC58bgNm16kvkujJm9HQs7ejB+TlNfWzRr/SqPs14YEDB7L/ZrNkyZJ6d1eD3ED8jXvV\nqqf6dZ/Vb/u83+Stn6Gm0aP4PqCVNKYBsAL4OfB0YZ0TwF217ZYkDXongY/UuxP9bRjpi90JDAde\nxYFwSdINPAj8J+mKYkWd+yJJkiRpqMl5ye/rsfw14N4a9aseujsWf0Q6Bq8D/w58vHZdq7nclz8/\nCVwFfr8WnaqDnONQBg4BbzC0nz3t7liMA3aSbne/AfxJzXpWe98CzgGHb7DOkDxv3ky6PXUn6Rm+\namMbvwfsiPZs4ECtOldjOcfit4Ax0W7hg30sOtbbB/wr8Ae16lwN5RyHscCbwOT4PK5WnauxnGPR\nCnw52uOA8zT+6we99TukIHi/0OjxebPRH7ntkPOS3zxgfbRfJv2fZHyN+ldLOcfiP4CL0X6ZzhPF\nUJP78udjwHeAn9SsZ7WVcxz+ENhKetcJ4L9q1bkayzkWbwOjoz2aFBpXa9S/WvsBcKOXanp83hws\noVHtJb9JGesMxZNlzrEoWkbn3ySGmtz/LuYDa+LzUHwpIOc4TAeaSP+Oxg+BofpSUc6x+CbwMeAM\n6ZbM47XpWkPq8XlzsF
yS9fYV0KF4gujJd/os8HngUwPUl3rLORZfA56IdUs0/rtJvZFzHG4BPgHM\nAW4lXY0eIN3LHkpyjsUXSbetyqR3vPYAvwFcHrhuNbQenTcHS2i0A82Fz810Xma/3zqTozbU5BwL\nSIPf3ySNaeT+mw+DTc6xmEm6RQHp/vWDpNsW2we8d7WTcxxOk25J/XdM3yedKIdaaOQci98G/i7a\nJ4G3gF8hXYF90AzZ82bOS37FAZ37GLqDvznHYgrpvu59Ne1Z7fX05c/nGZpPT+Uch18FvkcaKL6V\nNDA6o3ZdrJmcY/FVYGW0x5NCpalG/auHO8kbCB9y581qL/k9GlOHb8Ty10iX4kNVd8fiOdLg3qGY\nDta6gzWU899Fh6EaGpB3HP6a9ATVYeALNe1dbXV3LMYBL5LOE4dJDwkMVZtIYzf/R7ra/Dwf3POm\nJEmSJEmSJEmSJEmSJEmSJEmSJEmD0/8DKGxa/p8Kz/YAAAAASUVORK5CYII=\n", 250 | "text": [ 251 | "" 252 | ] 253 | } 254 | ], 255 | "prompt_number": 138 256 | }, 257 | { 258 | "cell_type": "code", 259 | "collapsed": false, 260 | "input": [ 261 | "class SelfTrainingClassifier(BaseEstimator):\n", 262 | " \n", 263 | " def __init__(self, base_estimator=None, n_iter=10, clamp_true_target=False):\n", 264 | " self.base_estimator = base_estimator\n", 265 | " self.n_iter = n_iter\n", 266 | " self.clamp_true_target = clamp_true_target\n", 267 | " \n", 268 | " def fit(self, X, y, X_unlabeled=None, X_val=None, y_val=None):\n", 269 | " if self.base_estimator is None:\n", 270 | " model = ExtraTreesClassifier(n_estimators=100)\n", 271 | " else:\n", 272 | " model = clone(self.base_estimator)\n", 273 | " \n", 274 | " X_train, y_train = X, y\n", 275 | " \n", 276 | " for i in range(self.n_iter):\n", 277 | " model.fit(X_train, y_train)\n", 278 | " \n", 279 | " if X_val is not None and y_val is not None:\n", 280 | " print(model.score(X_val, y_val))\n", 281 | "\n", 282 | " if self.clamp_true_target:\n", 283 | " y_predicted = y\n", 284 | " else:\n", 285 | " y_predicted = model.predict(X)\n", 286 | " \n", 287 | " X_train = np.vstack([X, X_unlabeled])\n", 288 | " y_train = np.concatenate([y, model.predict(X_unlabeled)])\n", 289 | "\n", 290 | " self.estimator_ = model\n", 291 | " \n", 292 | " def predict(self, X):\n", 293 | " return self.estimator_.predict(X)\n", 294 | " \n", 295 | " def score(self, X, y):\n", 296 | " return self.estimator_.score(X, y)\n", 297 | " " 298 | ], 299 | "language": "python", 300 | "metadata": {}, 301 | "outputs": [], 302 | "prompt_number": 133 303 | }, 304 | { 305 | "cell_type": "code", 306 | "collapsed": false, 307 | "input": [ 308 | "ssc = SelfTrainingClassifier(etrees).fit(X_small, y_small, X_new_unlabeled, X_val=X_test, y_val=y_test)" 309 | ], 310 | "language": "python", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "output_type": "stream", 315 | "stream": "stdout", 316 | "text": [ 317 | "0.840573474235\n", 318 | "0.834170940759" 319 | ] 320 | }, 321 | { 322 | "output_type": "stream", 323 | "stream": "stdout", 324 | "text": [ 325 | "\n", 326 | "0.835203607449" 327 | ] 328 | }, 329 | { 330 | "output_type": "stream", 331 | "stream": "stdout", 332 | "text": [ 333 | "\n", 334 | "0.83325875185" 335 | ] 336 | }, 337 | { 338 | "output_type": "stream", 339 | "stream": "stdout", 340 | "text": [ 341 | "\n", 342 | "0.832897318509" 343 | ] 344 | }, 345 | { 346 | "output_type": "stream", 347 | "stream": "stdout", 348 | "text": [ 349 | "\n", 350 | "0.835117551892" 351 | ] 352 | }, 353 | { 354 | "output_type": "stream", 355 | "stream": "stdout", 356 | "text": [ 357 | "\n", 358 | "0.833620185192" 359 | ] 360 | }, 361 | { 362 | "output_type": "stream", 363 | "stream": "stdout", 364 | "text": [ 365 | "\n", 366 | "0.833275962962" 367 | ] 368 | }, 369 | { 370 | "output_type": "stream", 371 | "stream": "stdout", 372 | "text": [ 373 | "\n", 374 | "0.833585762969" 375 | ] 376 | }, 377 | { 378 | "output_type": "stream", 379 | "stream": "stdout", 380 | "text": [ 381 | 
"\n", 382 | "0.833293174073" 383 | ] 384 | }, 385 | { 386 | "output_type": "stream", 387 | "stream": "stdout", 388 | "text": [ 389 | "\n" 390 | ] 391 | } 392 | ], 393 | "prompt_number": 137 394 | }, 395 | { 396 | "cell_type": "code", 397 | "collapsed": false, 398 | "input": [], 399 | "language": "python", 400 | "metadata": {}, 401 | "outputs": [] 402 | } 403 | ], 404 | "metadata": {} 405 | } 406 | ] 407 | } -------------------------------------------------------------------------------- /Untitled Diagram.drawio: -------------------------------------------------------------------------------- 1 | UzV2zq1wL0osyPDNT0nNUTV2VTV2LsrPL4GwciucU3NyVI0MMlNUjV1UjYwMgFjVyA2HrCFY1qAgsSg1rwSLBiADYTaQg2Y1AA== -------------------------------------------------------------------------------- /cloudstorage.ini.example: -------------------------------------------------------------------------------- 1 | [account] 2 | libcloud_provider = azure_blobs 3 | account_name = TODO 4 | account_secret = deadcafe== 5 | -------------------------------------------------------------------------------- /dask/fold_learn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 48, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "\n", 12 | "\n", 13 | "\n", 20 | "\n", 28 | "\n", 29 | "
\n", 14 | "

Client

\n", 15 | "\n", 19 | "
\n", 21 | "

Cluster

\n", 22 | "
    \n", 23 | "
  • Workers: 4
  • \n", 24 | "
  • Cores: 4
  • \n", 25 | "
  • Memory: 10.00 GB
  • \n", 26 | "
\n", 27 | "
" 30 | ], 31 | "text/plain": [ 32 | "" 33 | ] 34 | }, 35 | "execution_count": 48, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "import os\n", 42 | "import numpy as np\n", 43 | "import scipy.sparse as sp\n", 44 | "import pandas as pd\n", 45 | "from glob import glob\n", 46 | "\n", 47 | "import dask\n", 48 | "import dask.bag as db\n", 49 | "import joblib\n", 50 | "\n", 51 | "from distributed import Client\n", 52 | "client = Client()\n", 53 | "client" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 37, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "rm -rf sparse_chunks/" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 38, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "folder = 'sparse_chunks'\n", 72 | "n_features = int(1e5)\n", 73 | "n_informative = int(1e4)\n", 74 | "\n", 75 | "n_chunks = int(1e1)\n", 76 | "chunk_size = int(1e2)\n", 77 | "\n", 78 | "rng = np.random.RandomState(42)\n", 79 | "true_coef = rng.randn(n_features)\n", 80 | "true_coef[n_informative:] = 0\n", 81 | "\n", 82 | "\n", 83 | "def make_chunk(n_samples, true_coef, chunk_idx, format='csr',\n", 84 | " density=1e-3, noise=1e-1):\n", 85 | " rng = np.random.RandomState(chunk_idx)\n", 86 | " n_features = true_coef.shape[0]\n", 87 | " input_data = sp.rand(n_samples, n_features, format=format,\n", 88 | " density=density, random_state=rng)\n", 89 | " noise = rng.normal(loc=0, scale=noise, size=n_samples)\n", 90 | " target = input_data.dot(true_coef).ravel() + noise\n", 91 | " return chunk_idx, input_data, (target > 0).astype(np.int32)\n", 92 | "\n", 93 | "\n", 94 | "def save_to_disk(chunk_idx, X, y, folder='sparse_chunks'):\n", 95 | " os.makedirs(folder, exist_ok=True)\n", 96 | " filename = \"sparse_chunk_{:04d}.pkl\".format(chunk_idx)\n", 97 | " joblib.dump((X, y), os.path.join(folder, filename))\n", 98 | " return filename\n", 99 | "\n", 100 | "\n", 101 | "def load_from_disk(chunk_idx, filename):\n", 102 | " X, y = joblib.load(filename)\n", 103 | " return chunk_idx, X, y" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 49, 109 | "metadata": {}, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "Lazy loading chunks from sparse_chunks\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "if not os.path.exists(folder):\n", 121 | " print(\"Generating chunks of sparse data into\", folder)\n", 122 | " b = db.from_sequence([(chunk_size, true_coef, i)\n", 123 | " for i in range(n_chunks)])\n", 124 | " b = b.starmap(make_chunk).starmap(save_to_disk).compute()\n", 125 | "\n", 126 | "\n", 127 | " \n", 128 | "print(\"Lazy loading chunks from\", folder)\n", 129 | "b = db.from_sequence(enumerate(sorted(glob('sparse_chunks/*.pkl'))))\n", 130 | "b = b.starmap(load_from_disk)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 50, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", 143 | "Wall time: 9.53 ms\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "%time b = b.persist()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 51, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "10" 160 | ] 161 | }, 162 | "execution_count": 51, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": 
[ 168 | "len(b.compute())" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 52, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "name": "stdout", 178 | "output_type": "stream", 179 | "text": [ 180 | "CPU times: user 16 ms, sys: 4 ms, total: 20 ms\n", 181 | "Wall time: 25.8 ms\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "%%time\n", 187 | "chunk_idx, X_0, y_0 = b.take(1)[0]" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 53, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "data": { 197 | "text/plain": [ 198 | "0" 199 | ] 200 | }, 201 | "execution_count": 53, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "chunk_idx" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 54, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "<100x100000 sparse matrix of type ''\n", 219 | "\twith 10000 stored elements in Compressed Sparse Row format>" 220 | ] 221 | }, 222 | "execution_count": 54, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "X_0" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 55, 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "0.97999999999999998" 240 | ] 241 | }, 242 | "execution_count": 55, 243 | "metadata": {}, 244 | "output_type": "execute_result" 245 | } 246 | ], 247 | "source": [ 248 | "np.mean((X_0.dot(true_coef).ravel() > 0) == y_0)" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## L1-penalized Logistic Regression with SGD" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 56, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "CPU times: user 760 ms, sys: 64 ms, total: 824 ms\n", 268 | "Wall time: 2.75 s\n" 269 | ] 270 | }, 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "(0.53333333333333333, 0.032998316455372205, 0.46405999999999997)" 275 | ] 276 | }, 277 | "execution_count": 56, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "from sklearn.linear_model import SGDClassifier\n", 284 | "from sklearn.model_selection import train_test_split\n", 285 | "from dask import delayed\n", 286 | "\n", 287 | "CLASSES = np.array([0, 1])\n", 288 | "\n", 289 | "\n", 290 | "def scan_fit(model, chunk):\n", 291 | " return model.partial_fit(*chunk, classes=CLASSES)\n", 292 | "\n", 293 | "\n", 294 | "def score(model, chunk):\n", 295 | " return model.score(*chunk)\n", 296 | "\n", 297 | "\n", 298 | "all_filenames = sorted(glob('sparse_chunks/*.pkl'))\n", 299 | "train_filenames, test_filenames = train_test_split(\n", 300 | " all_filenames, random_state=0)\n", 301 | "\n", 302 | "model = SGDClassifier(loss='log', alpha=1e-3, penalty='elasticnet', tol=0)\n", 303 | "\n", 304 | "for i in range(20):\n", 305 | " for filename in train_filenames:\n", 306 | " chunk = delayed(joblib.load)(filename)\n", 307 | " model = delayed(scan_fit)(model, chunk)\n", 308 | "\n", 309 | "\n", 310 | "scores = [delayed(score)(model, delayed(joblib.load)(filename))\n", 311 | " for filename in test_filenames]\n", 312 | " \n", 313 | "%time scores, model = dask.compute(scores, model)\n", 314 | "np.mean(scores), np.std(scores), np.mean(model.coef_ != 0)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 
319 | "execution_count": 57, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "ename": "TypeError", 324 | "evalue": "'Future' object is not iterable", 325 | "output_type": "error", 326 | "traceback": [ 327 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 328 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 329 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscan_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_delayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 330 | "\u001b[0;32m~/code/dask/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 96\u001b[0m \u001b[0mExtra\u001b[0m \u001b[0mkeywords\u001b[0m \u001b[0mto\u001b[0m \u001b[0mforward\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mscheduler\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mget\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 97\u001b[0m \"\"\"\n\u001b[0;32m---> 98\u001b[0;31m \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 99\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 331 | "\u001b[0;32m~/code/dask/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 203\u001b[0m \u001b[0mdsk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcollections_to_dsk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvariables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimize_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 204\u001b[0m \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mvar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_keys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m 
\u001b[0;32mfor\u001b[0m \u001b[0mvar\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvariables\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 205\u001b[0;31m \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 206\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0mresults_iter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 332 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mget_sync\u001b[0;34m(dsk, keys, **kwargs)\u001b[0m\n\u001b[1;32m 560\u001b[0m \"\"\"\n\u001b[1;32m 561\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'num_workers'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# if num_workers present, remove it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 562\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mget_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapply_sync\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 563\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 333 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mget_async\u001b[0;34m(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0;31m# Seed initial tasks into the thread pool\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ready'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'running'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mnum_workers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mfire_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0;31m# Main loop, wait on tasks to finish, insert new ones\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 334 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mfire_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m 502\u001b[0m args=(key, dumps((dsk[key], data)),\n\u001b[1;32m 503\u001b[0m dumps, loads, get_id, pack_exception),\n\u001b[0;32m--> 504\u001b[0;31m callback=queue.put)\n\u001b[0m\u001b[1;32m 505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0;31m# Seed initial tasks into the thread pool\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 335 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mapply_sync\u001b[0;34m(func, args, kwds, callback)\u001b[0m\n\u001b[1;32m 
549\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_sync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0;34m\"\"\" A naive synchronous version of apply_async \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 551\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 552\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 553\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 336 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m(key, task_info, dumps, loads, get_id, pack_exception)\u001b[0m\n\u001b[1;32m 293\u001b[0m \u001b[0mfailed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 294\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 295\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpack_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 296\u001b[0m \u001b[0mfailed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 297\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfailed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 337 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m(key, task_info, dumps, loads, get_id, pack_exception)\u001b[0m\n\u001b[1;32m 288\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 291\u001b[0m \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 338 | "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m(arg, cache, dsk)\u001b[0m\n\u001b[1;32m 269\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 270\u001b[0m \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 271\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 272\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 339 | "\u001b[0;32m~/code/dask/dask/bag/core.py\u001b[0m in \u001b[0;36maccumulate_part\u001b[0;34m(binop, seq, initial, is_first)\u001b[0m\n\u001b[1;32m 1273\u001b[0m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbinop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1274\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1275\u001b[0;31m \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbinop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1276\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_first\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1277\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 340 | "\u001b[0;32m~/.virtualenvs/py36/lib/python3.6/site-packages/toolz/itertoolz.py\u001b[0m in \u001b[0;36maccumulate\u001b[0;34m(binop, seq, initial)\u001b[0m\n\u001b[1;32m 56\u001b[0m \u001b[0mitertools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccumulate\u001b[0m \u001b[0;34m:\u001b[0m \u001b[0mIn\u001b[0m \u001b[0mstandard\u001b[0m \u001b[0mitertools\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mPython\u001b[0m \u001b[0;36m3.2\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 57\u001b[0m 
\"\"\"\n\u001b[0;32m---> 58\u001b[0;31m \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 59\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minitial\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mno_default\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 341 | "\u001b[0;31mTypeError\u001b[0m: 'Future' object is not iterable" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "model = SGDClassifier(loss='log', penalty='l1', max_iter=1)\n", 347 | "\n", 348 | "def scan_fit(model, next_chunk):\n", 349 | " chunk_idx, X, y = next_chunk\n", 350 | " return model.partial_fit(X, y, classes=[0, 1])\n", 351 | "\n", 352 | "b.accumulate(scan_fit, initial=model).to_delayed()[-1].compute(get=dask.get)[0]" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [] 361 | } 362 | ], 363 | "metadata": { 364 | "kernelspec": { 365 | "display_name": "Python 3", 366 | "language": "python", 367 | "name": "python3" 368 | }, 369 | "language_info": { 370 | "codemirror_mode": { 371 | "name": "ipython", 372 | "version": 3 373 | }, 374 | "file_extension": ".py", 375 | "mimetype": "text/x-python", 376 | "name": "python", 377 | "nbconvert_exporter": "python", 378 | "pygments_lexer": "ipython3", 379 | "version": "3.6.1" 380 | } 381 | }, 382 | "nbformat": 4, 383 | "nbformat_minor": 2 384 | } 385 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | dependencies: 2 | - scikit-learn>=0.21 3 | - matplotlib 4 | - pandas 5 | -------------------------------------------------------------------------------- /fmri_vae/fmri_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from nilearn import datasets, image 3 | from keras.layers import Conv3D, BatchNormalization, Flatten, Dense 4 | from keras.layers import Dropout, Reshape, Conv3DTranspose, Lambda 5 | from keras.models import Sequential 6 | from keras.optimizers import Adam 7 | from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau 8 | 9 | 10 | def crop_5_8_8(data): 11 | return data[:, :5, :8, :8] 12 | 13 | 14 | def make_models(input_shape=(40, 64, 64, 1), latent_dim=256, 15 | low_res_shape=(2, 2, 2, 128), dropout=0.2): 16 | encoder = Sequential([ 17 | Conv3D(16, kernel_size=3, activation='relu', 18 | padding="same", input_shape=input_shape), 19 | BatchNormalization(), 20 | Conv3D(32, kernel_size=3, activation='relu', 21 | padding="same", strides=2), 22 | BatchNormalization(), 23 | Conv3D(32, kernel_size=3, activation='relu', 24 | padding="same"), 25 | BatchNormalization(), 26 | Conv3D(64, kernel_size=3, activation='relu', 27 | padding="same", strides=2), 28 | BatchNormalization(), 29 | Conv3D(64, kernel_size=3, activation='relu', 30 | padding="same"), 31 | BatchNormalization(), 32 | Conv3D(128, kernel_size=3, activation='relu', 33 | padding="same", strides=2), 34 | BatchNormalization(), 35 | Conv3D(128, kernel_size=3, activation='relu', 36 | padding="same", strides=2), 37 
| BatchNormalization(), 38 | Conv3D(latent_dim, kernel_size=3, padding="same", 39 | strides=2, activation='relu'), 40 | Flatten(), 41 | Dropout(dropout), 42 | Dense(latent_dim), 43 | ], name="encoder") 44 | 45 | decoder = Sequential([ 46 | Dense(np.prod(low_res_shape), input_shape=(latent_dim,)), 47 | Dropout(dropout), 48 | Reshape(low_res_shape), 49 | Conv3DTranspose(128, kernel_size=3, strides=2, activation='relu', 50 | padding="same"), 51 | BatchNormalization(), 52 | Conv3D(128, kernel_size=3, activation='relu', padding="same"), 53 | BatchNormalization(), 54 | Conv3DTranspose(128, kernel_size=3, strides=2, activation='relu', 55 | padding="same"), 56 | Lambda(function=crop_5_8_8), 57 | BatchNormalization(), 58 | Conv3D(64, kernel_size=3, activation='relu', padding="same"), 59 | BatchNormalization(), 60 | Conv3DTranspose(64, kernel_size=3, strides=2, activation='relu', 61 | padding="same"), 62 | BatchNormalization(), 63 | Conv3D(32, kernel_size=3, activation='relu', padding="same"), 64 | BatchNormalization(), 65 | Conv3DTranspose(32, kernel_size=3, strides=2, activation='relu', 66 | padding="same"), 67 | BatchNormalization(), 68 | Conv3D(16, kernel_size=3, activation='relu', padding="same"), 69 | BatchNormalization(), 70 | Conv3DTranspose(16, kernel_size=3, strides=2, activation='relu', 71 | padding="same"), 72 | BatchNormalization(), 73 | Conv3D(1, kernel_size=3, activation=None, padding="same"), 74 | ], name="decoder") 75 | autoencoder = Sequential([encoder, decoder], name="autoencoder") 76 | return encoder, decoder, autoencoder 77 | 78 | 79 | 80 | if __name__ == "__main__": 81 | data = datasets.fetch_haxby(subjects=(2,)) 82 | fmri_filename = data.func[0] 83 | smoothed_img = image.smooth_img(fmri_filename, 2) 84 | 85 | smoothed_data = smoothed_img.get_data().transpose(3, 0, 1, 2) 86 | #mean = smoothed_data.mean(axis=0) 87 | #smoothed_data -= mean 88 | #scale = smoothed_data.std(axis=0) + 1e-6 89 | scale = smoothed_data.std() # global scale 90 | smoothed_data /= scale 91 | smoothed_data = smoothed_data[:, :, :, :, None] 92 | input_shape = smoothed_data.shape[1:] 93 | smoothed_data_train = smoothed_data[:1200] 94 | smoothed_data_test = smoothed_data[1200:] 95 | 96 | encoder, decoder, autoencoder = make_models(input_shape=input_shape) 97 | autoencoder.compile(optimizer=Adam(lr=0.001), loss="mse") 98 | 99 | 100 | filename = "haxby_autoencoder.{epoch:02d}-{val_loss:.4f}.hdf5" 101 | ckpt_cb = ModelCheckpoint(filename, monitor='val_loss', 102 | verbose=1, save_best_only=False) 103 | filename = "haxby_autoencoder_best.hdf5" 104 | ckpt_best_cb = ModelCheckpoint(filename, monitor='val_loss', 105 | verbose=1, save_best_only=True) 106 | es_cb = EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001, 107 | verbose=1) 108 | lr_schedule_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, 109 | cooldown=5, epsilon=0.0001, verbose=1) 110 | autoencoder.fit(smoothed_data_train, smoothed_data_train, 111 | validation_data=(smoothed_data_test, smoothed_data_test), 112 | epochs=500, batch_size=32, 113 | callbacks=[ckpt_cb, ckpt_best_cb, lr_schedule_cb, es_cb]) -------------------------------------------------------------------------------- /generalization/run_mnist.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.optim as optim 7 | from torch.optim.lr_scheduler import ReduceLROnPlateau 8 | from 
torchvision import datasets, transforms 9 | from torch.autograd import Variable 10 | 11 | # Training settings 12 | parser = argparse.ArgumentParser(description='Study of generalization in MLPs') 13 | parser.add_argument('--batch-size', type=int, default=64, metavar='N', 14 | help='input batch size for training (default: 64)') 15 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', 16 | help='input batch size for testing (default: 1000)') 17 | parser.add_argument('--epochs', type=int, default=10, metavar='N', 18 | help='number of epochs to train (default: 10)') 19 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR', 20 | help='learning rate (default: 0.01)') 21 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M', 22 | help='SGD momentum (default: 0.5)') 23 | parser.add_argument('--no-cuda', action='store_true', default=False, 24 | help='disables CUDA training') 25 | parser.add_argument('--seed', type=int, default=1, metavar='S', 26 | help='random seed (default: 1)') 27 | parser.add_argument('--log-interval', type=int, default=10, metavar='N', 28 | help='how many batches to wait before logging training status') 29 | parser.add_argument('--train-size', type=int, default=None, 30 | help='size of the subsample used for training') 31 | parser.add_argument('--test-size', type=int, default=None, 32 | help='size of the subsample used for test evaluation') 33 | parser.add_argument('--dropout', type=float, default=None, 34 | help='dropout probability (no dropout by default)') 35 | parser.add_argument('--mlp', action='store_true', default=False, 36 | help='use an MLP instead of a ConvNet') 37 | parser.add_argument('--hidden-dim', type=int, default=32, 38 | help='dimension of the MLP hidden layers') 39 | parser.add_argument('--depth', type=int, default=1, 40 | help='number of hidden layers for the MLP') 41 | args = parser.parse_args() 42 | args.cuda = not args.no_cuda and torch.cuda.is_available() 43 | 44 | torch.manual_seed(args.seed) 45 | if args.cuda: 46 | torch.cuda.manual_seed(args.seed) 47 | 48 | 49 | loader_kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} 50 | 51 | mnist_transformers = transforms.Compose([ 52 | transforms.ToTensor(), 53 | transforms.Normalize((0.1307,), (0.3081,)) 54 | ]) 55 | 56 | 57 | def make_mnist_loader(train=True, subsample=None): 58 | dataset = datasets.MNIST('../data', train=train, download=True, 59 | transform=mnist_transformers) 60 | if subsample is None: 61 | # Use the full training set 62 | loader = torch.utils.data.DataLoader( 63 | dataset, batch_size=args.batch_size, shuffle=True, **loader_kwargs) 64 | else: 65 | # Subsample a smaller training set at random 66 | mnist_loader = loader = torch.utils.data.DataLoader( 67 | dataset, batch_size=args.train_size, shuffle=True, **loader_kwargs) 68 | small_mnist_data, small_mnist_labels = next(iter(mnist_loader)) 69 | small_mnist_dataset = torch.utils.data.TensorDataset( 70 | small_mnist_data, small_mnist_labels) 71 | loader = torch.utils.data.DataLoader( 72 | small_mnist_dataset, batch_size=args.batch_size, shuffle=True, 73 | **loader_kwargs 74 | ) 75 | return loader 76 | 77 | 78 | train_loader = make_mnist_loader(train=True, subsample=args.train_size) 79 | test_loader = make_mnist_loader(train=False, subsample=args.test_size) 80 | 81 | 82 | class ConvNet(nn.Module): 83 | def __init__(self): 84 | super(ConvNet, self).__init__() 85 | self.conv1 = nn.Conv2d(1, 10, kernel_size=5) 86 | self.conv2 = nn.Conv2d(10, 20, kernel_size=5) 87 | if 
args.dropout: 88 | self.conv2_drop = nn.Dropout2d(p=args.dropout) 89 | self.fc1 = nn.Linear(320, 50) 90 | self.fc2 = nn.Linear(50, 10) 91 | 92 | def forward(self, x): 93 | x = F.relu(F.max_pool2d(self.conv1(x), 2)) 94 | x = self.conv2(x) 95 | if args.dropout: 96 | x = self.conv2_drop(x) 97 | x = F.relu(F.max_pool2d(x, 2)) 98 | x = x.view(-1, 320) 99 | x = F.relu(self.fc1(x)) 100 | if args.dropout: 101 | x = F.dropout(x, p=args.dropout, training=self.training) 102 | x = self.fc2(x) 103 | return F.log_softmax(x) 104 | 105 | 106 | class MLP(nn.Module): 107 | def __init__(self, input_dim=784, output_dim=10, hidden=(32,)): 108 | super(MLP, self).__init__() 109 | self.hidden_layers = layers = nn.ModuleList()  # ModuleList (not a plain list) so the hidden layers are registered as submodules and seen by model.parameters() / model.cuda() 110 | for hidden_dim in hidden: 111 | layers.append(nn.Linear(input_dim, hidden_dim)) 112 | input_dim = hidden_dim 113 | self.output_linear = nn.Linear(input_dim, output_dim) 114 | 115 | def forward(self, x): 116 | for h in self.hidden_layers: 117 | x = F.relu(h(x)) 118 | return F.log_softmax(self.output_linear(x)) 119 | 120 | 121 | if args.mlp: 122 | model = MLP(hidden=[args.hidden_dim] * args.depth) 123 | else: 124 | model = ConvNet() 125 | if args.cuda: 126 | model.cuda() 127 | 128 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) 129 | scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, cooldown=5, 130 | verbose=True) 131 | 132 | 133 | def train(epoch): 134 | model.train() 135 | for batch_idx, (data, target) in enumerate(train_loader): 136 | if args.cuda: 137 | data, target = data.cuda(), target.cuda() 138 | if isinstance(model, MLP): 139 | data = data.view(-1, 784) 140 | data, target = Variable(data), Variable(target) 141 | optimizer.zero_grad() 142 | output = model(data) 143 | loss = F.nll_loss(output, target) 144 | loss.backward() 145 | optimizer.step() 146 | if batch_idx % args.log_interval == 0: 147 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:f}' 148 | .format(epoch, batch_idx * len(data), 149 | len(train_loader.dataset), 150 | 100. * batch_idx / len(train_loader), loss.data[0], 151 | optimizer.param_groups[0]['lr'])) 152 | 153 | 154 | def evaluate(): 155 | should_stop = False 156 | model.eval() 157 | 158 | for name, loader in [('train', train_loader), ('test', test_loader)]: 159 | loss = 0 160 | correct = 0 161 | for data, target in loader: 162 | if args.cuda: 163 | data, target = data.cuda(), target.cuda() 164 | if isinstance(model, MLP): 165 | data = data.view(-1, 784) 166 | data, target = Variable(data, volatile=True), Variable(target) 167 | output = model(data) 168 | loss += F.nll_loss(output, target, size_average=False).data[0] 169 | # get the index of the max log-probability 170 | pred = output.data.max(1, keepdim=True)[1] 171 | correct += pred.eq(target.data.view_as(pred)).cpu().sum() 172 | 173 | loss /= len(loader.dataset) 174 | print('{} -- Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)' 175 | .format(name.ljust(5), loss, correct, len(loader.dataset), 176 | 100.
* correct / len(loader.dataset))) 177 | if name == 'test': 178 | scheduler.step(loss) 179 | should_stop = should_stop or correct == len(loader.dataset) 180 | return should_stop or optimizer.param_groups[0]['lr'] < args.lr / 1e2 181 | 182 | 183 | for epoch in range(1, args.epochs + 1): 184 | train(epoch) 185 | if evaluate(): 186 | break 187 | -------------------------------------------------------------------------------- /gmm/gmmsgd.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from sklearn.model_selection import train_test_split 3 | import tensorflow as tf 4 | import numpy as np 5 | from math import sqrt 6 | 7 | 8 | class EpochSampler(object): 9 | """Helper function to cycle through a shuffled dataset by minibatches. 10 | 11 | The dataset is shuffled at the beginning of each epoch. 12 | """ 13 | 14 | def __init__(self, *data, n_epochs=1, batch_size=100, random_seed=None): 15 | self.data = data 16 | self.n_epochs = n_epochs 17 | self.batch_size = batch_size 18 | self.random_seed = random_seed 19 | 20 | def __iter__(self): 21 | rng = np.random.RandomState(0) 22 | n_samples = self.data[0].shape[0] 23 | n_seen = 0 24 | batch_size = self.batch_size 25 | for epoch in range(self.n_epochs): 26 | permutation = rng.permutation(n_samples) 27 | data = tuple(d[permutation] for d in self.data) 28 | for i in range(0, n_samples, batch_size): 29 | n_seen += len(data[0][i:i + batch_size]) 30 | yield n_seen, epoch, tuple(d[i:i + batch_size] for d in data) 31 | 32 | 33 | class GaussianMixtureSGD(object): 34 | def __init__(self, n_components=5, learning_rate=0.1, patience=3, 35 | batch_size=10, max_iter=1000, session=None, 36 | means_init=None, random_seed=0): 37 | self.n_components = n_components 38 | self.random_seed = random_seed 39 | self.learning_rate = learning_rate 40 | self.patience = patience 41 | self.max_iter = max_iter 42 | self.batch_size = batch_size 43 | self.session = session 44 | self.means_init = means_init 45 | 46 | def _make_model(self, n_features, dtype=np.float32): 47 | self._component_variables = defaultdict(list) 48 | X = tf.placeholder(shape=(None, n_features), dtype=dtype, name='X') 49 | 50 | # Mixture weights 51 | w = tf.Variable( 52 | tf.zeros(shape=(1, self.n_components), dtype=dtype), 53 | name='w') 54 | self._normalized_weights = tf.reshape( 55 | tf.nn.softmax(w), (self.n_components,)) 56 | logliks = [] 57 | 58 | # TODO: instead of masking using a numpy initialized densed tensor, use 59 | # a sparse tensorflow tensor with the triangular structure built-in bu 60 | # this would equire tensorflow >= 0.9 which is not released at this 61 | # point. 
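        # The strictly lower triangular mask M below (np.tril with k=-1) makes
        # L = diag(exp(d)) + M * H a lower triangular matrix with a strictly
        # positive diagonal, so P = L L^T is a valid (symmetric positive
        # definite) precision matrix for each component, and
        # logdet(P) = 2 * sum(d) stays cheap to compute
        # (see _log_likelihood_one_gaussian below).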
62 | M = tf.constant( 63 | np.tril( 64 | np.ones(shape=(n_features, n_features), dtype=dtype), 65 | k=-1), 66 | name='triangular_mask') 67 | for k in range(self.n_components): 68 | with tf.variable_scope('component_%03d' % k): 69 | if self.means_init is not None: 70 | m = np.asarray(self.means_init[k], dtype=dtype) 71 | else: 72 | m = tf.zeros(shape=(n_features,), dtype=dtype) 73 | mu = tf.Variable(m, name='mu_%03d' % k) 74 | self._component_variables['mu'].append(mu) 75 | d = tf.Variable( 76 | -2 * tf.ones(shape=[n_features], dtype=dtype), 77 | #tf.truncated_normal(shape=[n_features], 78 | # stddev=1 / sqrt(n_features), 79 | # dtype=dtype, 80 | # seed=self.random_seed + k), 81 | name='d_%03d' % k) 82 | 83 | self._component_variables['d'].append(d) 84 | H = tf.Variable( 85 | tf.zeros(shape=(n_features, n_features), dtype=dtype), 86 | #tf.truncated_normal(shape=(n_features, n_features), 87 | # stddev=1 / sqrt(n_features), 88 | # dtype=dtype, 89 | # seed=self.random_seed + k), 90 | name='H_%03d' % k) 91 | # M is an element-wise mask to set all diagonal and triangular 92 | # uppper entries of of H to zero: 93 | L = tf.add(tf.diag(tf.exp(d)), tf.mul(M, H), name='L_%03d' % k) 94 | P = tf.matmul(L, tf.transpose(L), name='P_%03d' % k) 95 | self._component_variables['P'].append(P) 96 | 97 | loglik = self._log_likelihood_one_gaussian( 98 | n_features, X, mu, P, d) 99 | logliks.append(loglik) 100 | 101 | # compute the log likelihood of the mixture 102 | # TODO: would it be better to find a way to vectorize the computation 103 | # of the log-likelihoods to avoid using tf.pack to make tensorflow 104 | # run somehow faster? 105 | 106 | # XXX: the following is wrong! We cannot get the loglikelood of a mixture 107 | # this way... I don't have time to fix it now though. 108 | # It should use tf.reduce_logsumexp instead. 
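        # A minimal sketch of the corrected computation (kept as a comment so
        # the original experiment is preserved), assuming a TensorFlow version
        # that provides tf.reduce_logsumexp:
        #
        #   log_weights = tf.nn.log_softmax(w)              # shape (1, n_components)
        #   per_component = tf.transpose(tf.pack(logliks))  # shape (n_samples, n_components)
        #   self._loglik = tf.reduce_logsumexp(
        #       per_component + log_weights, reduction_indices=1)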
109 | self._loglik = tf.reduce_sum( 110 | tf.mul(tf.transpose(tf.pack(logliks)), self._normalized_weights), 111 | reduction_indices=1) 112 | self._loss = -tf.reduce_mean(self._loglik) 113 | self._optimizer = tf.train.AdamOptimizer( 114 | learning_rate=self.learning_rate) 115 | train_op = self._optimizer.minimize(self._loss) 116 | 117 | if self.session is None: 118 | session = tf.InteractiveSession() 119 | else: 120 | session = self.session 121 | session.run(tf.initialize_all_variables()) 122 | for name, variables in self._component_variables.items(): 123 | print(name) 124 | for var in variables: 125 | print(var.eval()) 126 | if name == 'P': 127 | print('C') 128 | for var in variables: 129 | print(np.linalg.inv(var.eval())) 130 | self._train = lambda data: session.run( 131 | train_op, feed_dict={X: data} 132 | ) 133 | self.score_samples = lambda data: session.run( 134 | self._loglik, feed_dict={X: data} 135 | ) 136 | self._compute_loss = lambda data: session.run( 137 | self._loss, feed_dict={X: data} 138 | ) 139 | self.score = lambda data: -self._compute_loss(data) 140 | 141 | def _log_likelihood_one_gaussian(self, n_features, X, mu, P, d): 142 | X_mu = X - mu 143 | X_muTPX_mu = tf.reduce_sum( 144 | tf.mul(X_mu, tf.matmul(X_mu, P)), 145 | reduction_indices=1) 146 | # logdet(C) = -logdet(P) as C is the inverse of P 147 | # logdet(P) = 2 * logdet(L) = 2 * sum_i d_i 148 | return (-0.5 * n_features * tf.log(2 * np.pi) + tf.reduce_sum(d) - 0.5 149 | * X_muTPX_mu) 150 | 151 | 152 | def fit(self, X_train, X_val=None): 153 | if X_val is None: 154 | X_train, X_val = train_test_split(X_train, test_size=0.1, 155 | random_state=self.random_seed) 156 | n_samples, n_features = X_train.shape 157 | self._make_model(n_features=n_features) 158 | batch_sampler = EpochSampler(X_train, n_epochs=self.max_iter, 159 | batch_size=self.batch_size, 160 | random_seed=self.random_seed) 161 | best_val_loss = self._compute_loss(X_val) 162 | patience = self.patience 163 | for n_seen, epoch, (X_batch,) in batch_sampler: 164 | self._train(X_batch) 165 | if n_seen % 100 == 0: 166 | # XXX: ensure that this is a multiple of batch_size 167 | val_loss = self._compute_loss(X_val) 168 | if val_loss < best_val_loss: 169 | best_val_loss = val_loss 170 | patience = self.patience 171 | else: 172 | patience -= 1 173 | if patience == 0: 174 | break 175 | self.n_iter_ = epoch + 1 176 | -------------------------------------------------------------------------------- /letor_cluster/letor_gridpoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | import os 4 | import json 5 | from time import time 6 | import numpy as np 7 | 8 | from sklearn.externals import joblib 9 | from sklearn.ensemble import GradientBoostingRegressor 10 | 11 | 12 | def dcg(relevances, rank=10): 13 | """Discounted cumulative gain at rank (DCG)""" 14 | relevances = np.asarray(relevances)[:rank] 15 | n_relevances = len(relevances) 16 | if n_relevances == 0: 17 | return 0. 18 | 19 | discounts = np.log2(np.arange(n_relevances) + 2) 20 | return np.sum(relevances / discounts) 21 | 22 | 23 | def ndcg(relevances, rank=10): 24 | """Normalized discounted cumulative gain (NDGC)""" 25 | best_dcg = dcg(sorted(relevances, reverse=True), rank) 26 | if best_dcg == 0: 27 | return 0. 
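    # Worked example: for relevances [3, 2, 3] at rank 3 the discounts are
    # log2([2, 3, 4]) = [1, 1.585, 2], so DCG = 3/1 + 2/1.585 + 3/2 ~= 5.76,
    # the ideal ordering [3, 3, 2] gives DCG ~= 5.89, and NDCG ~= 0.98.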
28 | 29 | return dcg(relevances, rank) / best_dcg 30 | 31 | 32 | def mean_ndcg(y_true, y_pred, query_ids, rank=10): 33 | y_true = np.asarray(y_true) 34 | y_pred = np.asarray(y_pred) 35 | query_ids = np.asarray(query_ids) 36 | # assume query_ids are sorted 37 | ndcg_scores = [] 38 | previous_qid = query_ids[0] 39 | previous_loc = 0 40 | for loc, qid in enumerate(query_ids): 41 | if previous_qid != qid: 42 | chunk = slice(previous_loc, loc) 43 | ranked_relevances = y_true[chunk][np.argsort(y_pred[chunk])[::-1]] 44 | ndcg_scores.append(ndcg(ranked_relevances, rank=rank)) 45 | previous_loc = loc 46 | previous_qid = qid 47 | 48 | chunk = slice(previous_loc, loc + 1) 49 | ranked_relevances = y_true[chunk][np.argsort(y_pred[chunk])[::-1]] 50 | ndcg_scores.append(ndcg(ranked_relevances, rank=rank)) 51 | return np.mean(ndcg_scores) 52 | 53 | 54 | job_folder = sys.argv[1] 55 | with open(job_folder + '/parameters.json', 'r') as f: 56 | parameters = json.load(f) 57 | 58 | with open(job_folder + '/data.json', 'r') as f: 59 | data_filenames = json.load(f) 60 | 61 | 62 | print("Loading the data...") 63 | tic = time() 64 | X_train, y_train, qid_train = joblib.load(data_filenames['train'], 65 | mmap_mode='r') 66 | X_vali, y_vali, qid_vali = joblib.load(data_filenames['validation'], 67 | mmap_mode='r') 68 | # warm up (load the data from the drive) 69 | X_train.max(), X_vali.max() 70 | data_load_time = time() - tic 71 | print("done in{:.3f}s".format(data_load_time)) 72 | 73 | print("Training the model with parameters:") 74 | print(parameters) 75 | tic = time() 76 | model = GradientBoostingRegressor(random_state=0) 77 | model.set_params(**parameters) 78 | model.fit(X_train, y_train) 79 | training_time = time() - tic 80 | print("done in{:.3f}s".format(training_time)) 81 | 82 | print("Computing training NDGC@10...") 83 | tic = time() 84 | y_pred = model.predict(X_train) 85 | prediction_time = time() - tic 86 | train_score = mean_ndcg(y_train, y_pred, qid_train) 87 | print("{:.3f}".format(train_score)) 88 | print("done in{:.3f}s".format(prediction_time)) 89 | 90 | print("Computing validation NDGC@10...") 91 | y_pred = model.predict(X_vali) 92 | validation_score = mean_ndcg(y_vali, y_pred, qid_vali) 93 | print("{:.3f}".format(validation_score)) 94 | 95 | model_filename = job_folder + '/model.pkl' 96 | print("Saving model to {}".format(model_filename)) 97 | tic = time() 98 | model_filenames = joblib.dump(model, model_filename) 99 | model_save_time = time() - tic 100 | print("done in{:.3f}s".format(model_save_time)) 101 | model_size_bytes = 0 102 | for filename in model_filenames: 103 | model_size_bytes += os.stat(filename).st_size 104 | 105 | results = { 106 | 'data_load_time': data_load_time, 107 | 'training_time': training_time, 108 | 'prediction_time': prediction_time, 109 | 'model_save_time': model_save_time, 110 | 'model_size_bytes': model_size_bytes, 111 | 'train_score': train_score, 112 | 'validation_score': validation_score, 113 | 'model_filename': model_filename, 114 | } 115 | 116 | with open(job_folder + '/results.json', 'wb') as f: 117 | f.write(json.dumps(results).encode('utf-8')) 118 | -------------------------------------------------------------------------------- /letor_cluster/letor_gridresults.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import os.path as op 4 | import json 5 | 6 | 7 | def collect_results(jobs_folder): 8 | entries = [] 9 | 10 | for job_folder in os.listdir(jobs_folder): 11 | results_filename = 
op.join( 12 | jobs_folder, job_folder, 'results.json') 13 | parameters_filename = op.join( 14 | jobs_folder, job_folder, 'parameters.json') 15 | 16 | if (not op.exists(parameters_filename) 17 | or not op.exists(results_filename)): 18 | continue 19 | 20 | new_entry = dict() 21 | 22 | with open(parameters_filename, 'r') as f: 23 | new_entry.update(json.load(f)) 24 | 25 | with open(results_filename, 'r') as f: 26 | new_entry.update(json.load(f)) 27 | 28 | entries.append(new_entry) 29 | 30 | return pd.DataFrame(entries) 31 | 32 | 33 | if __name__ == '__main__': 34 | results = collect_results('/scratch/ogrisel/grid_jobs') 35 | results.to_json('letor_gridresults.json') -------------------------------------------------------------------------------- /letor_cluster/letor_gridsearch.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from random import Random 4 | import json 5 | 6 | from sklearn.externals import joblib 7 | from sklearn.grid_search import ParameterGrid 8 | 9 | 10 | MSLR_DATA = '/scratch/ogrisel/mslr-web10k_fold1.npz' 11 | DATA_FOLDER = '/home/parietal/ogrisel/data' 12 | TRAIN_SAMPLE_DATA = DATA_FOLDER + '/mslr-web10k_fold1_train_500.pkl' 13 | VALI_DATA = DATA_FOLDER + '/mslr-web10k_fold1_vali.pkl' 14 | GRID_JOBS_FOLDER = '/scratch/ogrisel/grid_jobs' 15 | 16 | rng = Random(42) 17 | 18 | 19 | def subsample(X, y, qid, size, seed=None): 20 | rng = np.random.RandomState(seed) 21 | unique_qid = np.unique(qid) 22 | qid_mask = rng.permutation(len(unique_qid))[:size] 23 | subset_mask = np.in1d(qid_train, unique_qid[qid_mask]) 24 | return X[subset_mask], y[subset_mask], qid[subset_mask] 25 | 26 | 27 | if not os.path.exists(TRAIN_SAMPLE_DATA) or not os.path.exists(VALI_DATA): 28 | if not os.path.exists(DATA_FOLDER): 29 | os.makedirs(DATA_FOLDER) 30 | 31 | data = np.load(os.path.expanduser(MSLR_DATA)) 32 | X_train, y_train, qid_train = data['X_train'], data['y_train'], data['qid_train'] 33 | X_vali, y_vali, qid_vali = data['X_vali'], data['y_vali'], data['qid_vali'] 34 | 35 | X_train_small, y_train_small, qid_train_small = subsample( 36 | X_train, y_train, qid_train, 500, seed=0) 37 | 38 | joblib.dump((X_train_small, y_train_small, qid_train_small), 39 | TRAIN_SAMPLE_DATA) 40 | joblib.dump((X_vali, y_vali, qid_vali), VALI_DATA) 41 | 42 | 43 | if not os.path.exists(GRID_JOBS_FOLDER): 44 | os.makedirs(GRID_JOBS_FOLDER) 45 | 46 | 47 | params = { 48 | 'max_features': [10, 20, 50, 100], 49 | 'max_depth': [2, 3, 4, 5], 50 | 'subsample': [0.5, 0.8, 1.0], 51 | 'loss': ['ls', 'huber', 'quantile'], 52 | 'learning_rate': [0.05, 0.1, 0.5], 53 | } 54 | 55 | for i, param in enumerate(ParameterGrid(params)): 56 | params_description = json.dumps(param) 57 | job_id = joblib.hash(params_description) 58 | job_folder = GRID_JOBS_FOLDER + '/' + job_id 59 | if not os.path.exists(job_folder): 60 | os.makedirs(job_folder) 61 | with open(job_folder + '/parameters.json', 'wb') as f: 62 | f.write(params_description.encode('utf-8')) 63 | 64 | data_filenames = {'train': TRAIN_SAMPLE_DATA, 'validation': VALI_DATA} 65 | with open(job_folder + '/data.json', 'wb') as f: 66 | f.write(json.dumps(data_filenames).encode('utf-8')) 67 | 68 | cmd = 'qsub -V -cwd letor_gridpoint.py {}'.format(job_folder) 69 | os.system(cmd) 70 | 71 | # if i > 100: 72 | # break -------------------------------------------------------------------------------- /nmf_topics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "worksheets": 
[ 3 | { 4 | "cells": [ 5 | { 6 | "source": "# Topics extraction with Non-Negative Matrix Factorization\n\nThis is a proof of concept application of Non Negative Matrix\nFactorization of the term frequency matrix of a corpus of documents so\nas to extract an additive model of the topic structure of the corpus.", 7 | "cell_type": "markdown" 8 | }, 9 | { 10 | "source": "## Load the 20 newsgroups dataset", 11 | "cell_type": "markdown" 12 | }, 13 | { 14 | "cell_type": "code", 15 | "language": "python", 16 | "outputs": [], 17 | "collapsed": false, 18 | "prompt_number": 14, 19 | "input": "from sklearn import datasets\ndataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1)\nprint dataset.target_names[dataset.target[0]]" 20 | }, 21 | { 22 | "cell_type": "code", 23 | "language": "python", 24 | "outputs": [], 25 | "collapsed": false, 26 | "prompt_number": 15, 27 | "input": "print dataset.data[0]" 28 | }, 29 | { 30 | "source": "## Restrict the dimensions of the problem\n\nFor shorter computation times.", 31 | "cell_type": "markdown" 32 | }, 33 | { 34 | "cell_type": "code", 35 | "language": "python", 36 | "outputs": [], 37 | "collapsed": true, 38 | "prompt_number": 16, 39 | "input": "n_samples = 1000\nn_features = 1000" 40 | }, 41 | { 42 | "source": "## Vectorize to compute word frequencies for each document\n\nRestrict to the most common word frequency and use TF-IDF weighting (without top 5% stop words)", 43 | "cell_type": "markdown" 44 | }, 45 | { 46 | "cell_type": "code", 47 | "language": "python", 48 | "outputs": [], 49 | "collapsed": false, 50 | "prompt_number": 17, 51 | "input": "from sklearn.feature_extraction import text\n\nvectorizer = text.CountVectorizer(max_df=0.95, max_features=n_features)\ncounts = vectorizer.fit_transform(dataset.data[:n_samples])\n\ntfidf = text.TfidfTransformer().fit_transform(counts)\ntfidf" 52 | }, 53 | { 54 | "source": "Convert from a `scipy.sparse.csr_matrix` representation to a dense `numpy` array and remove negative values.", 55 | "cell_type": "markdown" 56 | }, 57 | { 58 | "cell_type": "code", 59 | "language": "python", 60 | "outputs": [], 61 | "collapsed": false, 62 | "prompt_number": 18, 63 | "input": "tfidf.toarray()" 64 | }, 65 | { 66 | "source": "## Extract some topics with Non-negative Matrix Factorization", 67 | "cell_type": "markdown" 68 | }, 69 | { 70 | "cell_type": "code", 71 | "language": "python", 72 | "outputs": [], 73 | "collapsed": true, 74 | "prompt_number": 19, 75 | "input": "from sklearn import decomposition\nn_topics = 5\n\nnmf = decomposition.NMF(n_components=n_topics).fit(tfidf)" 76 | }, 77 | { 78 | "cell_type": "code", 79 | "language": "python", 80 | "outputs": [], 81 | "collapsed": false, 82 | "prompt_number": 20, 83 | "input": "print nmf" 84 | }, 85 | { 86 | "cell_type": "code", 87 | "language": "python", 88 | "outputs": [], 89 | "collapsed": false, 90 | "prompt_number": 21, 91 | "input": "print nmf.components_" 92 | }, 93 | { 94 | "source": "## Display the most important words for each extracted topic\n\nReuse the vocabulary of the vectorizer to find the words names from the matrix positions.", 95 | "cell_type": "markdown" 96 | }, 97 | { 98 | "cell_type": "code", 99 | "language": "python", 100 | "outputs": [], 101 | "collapsed": false, 102 | "prompt_number": 22, 103 | "input": "n_top_words = 12\ninverse_vocabulary = dict((v, k) for k, v in vectorizer.vocabulary.iteritems())\n\nfor topic_idx, topic in enumerate(nmf.components_):\n print \"Topic #%d: \" % topic_idx,\n print \" \".join([inverse_vocabulary[i]\n for i in 
topic.argsort()[:-(n_top_words + 1):-1]])\n print" 104 | }, 105 | { 106 | "input": "", 107 | "cell_type": "code", 108 | "collapsed": true, 109 | "language": "python", 110 | "outputs": [] 111 | } 112 | ] 113 | } 114 | ], 115 | "metadata": { 116 | "name": "nmf_topics" 117 | }, 118 | "nbformat": 2 119 | } -------------------------------------------------------------------------------- /screenshots/digits.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/notebooks/09692cc11f4d75cc31e4817e053a0b011b76680f/screenshots/digits.png -------------------------------------------------------------------------------- /screenshots/topics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ogrisel/notebooks/09692cc11f4d75cc31e4817e053a0b011b76680f/screenshots/topics.png -------------------------------------------------------------------------------- /sklearn_demos/Language Classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:2f7a72cbc8c7a4f9909a0ab9a42a77c3b1259c4fbbbcc4e2dfb3324fdfe6ab7e" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "code", 13 | "collapsed": false, 14 | "input": [ 15 | "%matplotlib inline\n", 16 | "import numpy as np\n", 17 | "import matplotlib.pyplot as plt\n", 18 | "import sys\n", 19 | "from sklearn.datasets import load_files\n", 20 | "from sklearn.cross_validation import train_test_split" 21 | ], 22 | "language": "python", 23 | "metadata": {}, 24 | "outputs": [], 25 | "prompt_number": 19 26 | }, 27 | { 28 | "cell_type": "heading", 29 | "level": 2, 30 | "metadata": {}, 31 | "source": [ 32 | "Dataset collection (from Wikipedia)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "collapsed": false, 38 | "input": [ 39 | "dataset = load_files('language/paragraphs')\n", 40 | "docs_train, docs_test, y_train, y_test = train_test_split(\n", 41 | " dataset.data, dataset.target, test_size=0.5, random_state=0)" 42 | ], 43 | "language": "python", 44 | "metadata": {}, 45 | "outputs": [], 46 | "prompt_number": 20 47 | }, 48 | { 49 | "cell_type": "code", 50 | "collapsed": false, 51 | "input": [ 52 | "for example, lang_code in list(zip(docs_train, y_train))[:3]:\n", 53 | " print(example.decode('utf-8'))\n", 54 | " print(\"=> %s\\n\" % dataset.target_names[lang_code])" 55 | ], 56 | "language": "python", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "output_type": "stream", 61 | "stream": "stdout", 62 | "text": [ 63 | "In 2005 publiceerde het natuurwetenschappelijke tijdschrift Nature de resultaten van een vergelijkend onderzoek naar de kwaliteit van artikelen in de Engelse Wikipedia (WP) en de Encyclop\u00e6dia Britannica.[18] Universitaire deskundigen bogen zich over natuurwetenschappelijke teksten, zonder te weten uit welke encyclopedie ze kwamen. In 42 paren van overeenkomstige artikelen uit beide encyclopedie\u00ebn vonden ze zowel in WP als in EB totaal acht ernstige fouten. Gemiddeld bevatte een WP-artikel vier en een EB-artikel drie foutjes, weglatingen of misleidende beweringen. Nature concludeerde dat hoewel de schrijfstijl van de Brittanica veel beter was, de Wikipedia op natuurwetenschappelijk gebied bijna net zo goed was als de Britannica. 
Orlowski formuleerde het als volgt: de kwaliteit van informatie in deze artikelen was in Wikipedia daarmee 31 procent minder dan in de Britannica.[19] Het betrof hier een betrekkelijk kleine selectie van artikelen over exacte wetenschappen en techniek, en veel van de aangetroffen 'fouten' betroffen meningsverschillen tussen de onderzoekers van Nature en de redacteuren van de Britannica over welke feiten vermeld zouden moeten worden in een encyclopedie.[20] In maart 2006 publiceerde de Encyclop\u00e6dia Britannica onder de titel \"Fatally Flawed\" een weerlegging van de onderzoeksresultaten van Nature.[21]\n", 64 | "=> nl\n", 65 | "\n", 66 | "Il existe \u00e9galement un classement qualitatif fond\u00e9 sur l'existence et la taille des articles d'une liste arbitraire d'environ 1\u00a0000 articles que toute \u00e9dition de Wikip\u00e9dia devrait avoir[note 18].\n", 67 | "=> fr\n", 68 | "\n", 69 | "On January 18, 2012, the English Wikipedia participated in a series of coordinated protests against two proposed laws in the United States Congress\u2014the Stop Online Piracy Act (SOPA) and the PROTECT IP Act (PIPA)\u2014by blacking out its pages for 24 hours.[144] More than 162 million people viewed the blackout explanation page that temporarily replaced Wikipedia content.[145][146]\n", 70 | "=> en\n", 71 | "\n" 72 | ] 73 | } 74 | ], 75 | "prompt_number": 21 76 | }, 77 | { 78 | "cell_type": "heading", 79 | "level": 2, 80 | "metadata": {}, 81 | "source": [ 82 | "Model fitting" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 90 | "from sklearn.linear_model import Perceptron\n", 91 | "from sklearn.pipeline import make_pipeline\n", 92 | "\n", 93 | "vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',\n", 94 | " use_idf=False)\n", 95 | "\n", 96 | "clf = make_pipeline(vectorizer, Perceptron())\n", 97 | "clf.fit(docs_train, y_train);" 98 | ], 99 | "language": "python", 100 | "metadata": {}, 101 | "outputs": [], 102 | "prompt_number": 22 103 | }, 104 | { 105 | "cell_type": "heading", 106 | "level": 2, 107 | "metadata": {}, 108 | "source": [ 109 | "Model evaluation" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "collapsed": false, 115 | "input": [ 116 | "sentences = [\n", 117 | " \"This is a language detection test.\",\n", 118 | " \"Ceci est un test de d\u00e9tection de la langue.\",\n", 119 | " \"Das ist eine Spracherkennungstest.\",\n", 120 | " \"Je suis au S\u00e9nat pour pr\u00e9senter l'analyse pr\u00e9dictive de donn\u00e9es.\"\n", 121 | "]\n", 122 | "predicted = clf.predict(sentences)\n", 123 | "\n", 124 | "for s, p in zip(sentences, predicted):\n", 125 | " print(u'The language of \"%s\" is \"%s\"' % (s, dataset.target_names[p]))" 126 | ], 127 | "language": "python", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "stream": "stdout", 133 | "text": [ 134 | "The language of \"This is a language detection test.\" is \"en\"\n", 135 | "The language of \"Ceci est un test de d\u00e9tection de la langue.\" is \"fr\"\n", 136 | "The language of \"Das ist eine Spracherkennungstest.\" is \"de\"\n", 137 | "The language of \"Je suis au S\u00e9nat pour pr\u00e9senter l'analyse pr\u00e9dictive de donn\u00e9es.\" is \"fr\"\n" 138 | ] 139 | } 140 | ], 141 | "prompt_number": 23 142 | }, 143 | { 144 | "cell_type": "code", 145 | "collapsed": false, 146 | "input": [ 147 | "from sklearn.metrics import classification_report\n", 148 | "\n", 149 | 
"y_predicted = clf.predict(docs_test)\n", 150 | "print(classification_report(y_test, y_predicted,\n", 151 | " target_names=dataset.target_names))" 152 | ], 153 | "language": "python", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "output_type": "stream", 158 | "stream": "stdout", 159 | "text": [ 160 | " precision recall f1-score support\n", 161 | "\n", 162 | " ar 1.00 1.00 1.00 14\n", 163 | " de 0.98 1.00 0.99 47\n", 164 | " en 1.00 1.00 1.00 77\n", 165 | " es 1.00 1.00 1.00 45\n", 166 | " fr 1.00 0.98 0.99 59\n", 167 | " it 0.98 1.00 0.99 45\n", 168 | " ja 1.00 0.97 0.99 35\n", 169 | " nl 1.00 1.00 1.00 18\n", 170 | " pl 1.00 0.95 0.97 20\n", 171 | " pt 0.98 1.00 0.99 47\n", 172 | " ru 1.00 1.00 1.00 26\n", 173 | "\n", 174 | "avg / total 0.99 0.99 0.99 433\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "prompt_number": 18 180 | }, 181 | { 182 | "cell_type": "code", 183 | "collapsed": false, 184 | "input": [ 185 | "from sklearn.decomposition import TruncatedSVD\n", 186 | "from itertools import cycle\n", 187 | "\n", 188 | "\n", 189 | "X_train = vectorizer.fit_transform(docs_train)\n", 190 | "X_pca = TruncatedSVD(50).fit_transform(X_train)\n", 191 | "\n", 192 | "for i, c in zip(np.unique(y_train)[:5],\n", 193 | " cycle(['r', 'g', 'b', 'c', 'm', 'y'])):\n", 194 | " mask = y_train == i\n", 195 | " language = dataset.target_names[i]\n", 196 | " plt.scatter(X_pca[mask, 0], X_pca[mask, 1], color=c, label=language)\n", 197 | " \n", 198 | "plt.legend(loc='best');" 199 | ], 200 | "language": "python", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "metadata": {}, 205 | "output_type": "display_data", 206 | "png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEACAYAAABI5zaHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl4lOW9//H3PQmQsAQSUBAcEQQVDSJLUKTWFCSxeIlS\nRbBaty500eOpOedXFUWEQ2k9J7081f56qQe05Yj08LNQpepAPYKH4xaCpAYXiGgMsmgYVrNn7t8f\nz0xmySSZMFknn9d1zcU8yzzPPQ/wzZ3vvRlrLSIiklhcnV0AERFpewruIiIJSMFdRCQBKbiLiCQg\nBXcRkQSk4C4ikoDiDu7GmKuMMR8ZY/YYY34R5fi1xpgiY8x7xphCY8yMeO8pIiLNM/H0czfGJAEf\nA1cCXwAFwE3W2g9Dzulnrf3a/348sN5aOyauUouISLPirblPBUqstZ9Za2uBtcC1oScEArtff6A8\nznuKiEgL4g3uI4CykO19/n1hjDHXGWM+BF4B/iHOe4qISAviDe4x5XSstRusteOAa4DVcd5TRERa\nkBzn578A3CHbbpzae1TW2v8xxiQbYwZbaw+HHjPGaJIbEZFWstaaaPvjrblvB8YaY842xvQG5gMv\nhp5gjDnHGGP87yf5C3O40ZWc/Xq18Hr44Yc7vQzd4aXnpGfVE55Tc+KquVtr64wxdwEeIAlYaa39\n0Biz0H/8SeB64FZjTC1wElgQzz1FRKRl8aZlsNa+gtNQGrrvyZD3jwKPxnsfERGJnUaodjPZ2dmd\nXYRuQc8pdnpWseluzymuQUxtyRhju0pZRES6A2MMtokG1bjTMiIincnfXyPhtbbyq+AuIt1eov/W\nfyo/wJRzFxFJQAruIiIJSMFdRCQBKbiLiCQgBXcRkQSk3jIi0nN5vfDss3D8OFx9NWRldVpR6uvr\nSUpKarPrqeYuIomrqAjy82HVKqioCD/m9cL48fDAA7BsGWRnw0svtXkRfvWrXzFmzBjS0tK48MIL\n2bBhAwDPPvss06dP595772XIkCE88sgjbXpfBXcRSUwvvQTTpjnB++67YfLk8AD/1FNQXg7V1eDz\nOcfuuSf8GqWlcMkl0LcvnHsu7NjR6mKMGTOGbdu2cfz4cR5++GFuueUWDh48CMC7777LOeecw5df\nfskDDzwQz7dtRMG9LXg8kJPjvDyezi6NiAAsXAiVlVBT4wTuzz+H1SFrBR096hwLdeJE8H19vVOb\nLyx0rrNnD8yYAYejzljepBtuuIFhw4YBcOONNzJ27FjeffddAIYPH87PfvYzXC4XKSkpp/Itm6Tg\nHi+PB+bOhc2bndfcuQrwIl3B8ePh29XV4YH5mmucGnlAaipcG7IE9L598OWXTpAPVVjYqmL88Y9/\nZOLEiaSnp5Oenk5xcTHl5eUYY3C73S1f4BQpuMcrP9/5qR5QWensE5HONXMm9OkT3O7d29kXMH26\nk4sfMQLS02H+fHjiieDxgQOhri78mnV1MGhQzEUoLS3lRz/6Eb/73e/wer0cOXKEzMzMhukS2nNe\nHAX3WCjtItL9rF4NV17pBPjBg2HlSid/Hmr+fKeG7vXCM89AaGpk0CC4917o1w9cLufPmTNb1aPm\n66+/xhjDkCFD8Pl8PPPMMxQXFwPtPx+OukI2xeNxauDl5bBrVzA3t20brF8PubnOdl6esy9Qe09N\ndfaJSOdKS4ONG+O7xooVcPnlTkPq6NGwYAG0orZ9wQUXkJeXx7Rp03C5XNx666184xvfwBjT8Gov\nms89muXLYfFipwU9mlmzYNOm4HbgBwE4gT0Q+EWk3fnnNO/sYrSrpr5jc/O5K7hH8njg29+G5soS\nGdxFpNMouEcP7sq5R7r//uYDu9IuItINKLhH
[remainder of the base64-encoded PNG output omitted: scatter plot of the first two TruncatedSVD components of the training documents, one color per language]\n", 207 | "text":
[ 208 | "" 209 | ] 210 | } 211 | ], 212 | "prompt_number": 27 213 | }, 214 | { 215 | "cell_type": "code", 216 | "collapsed": false, 217 | "input": [], 218 | "language": "python", 219 | "metadata": {}, 220 | "outputs": [] 221 | } 222 | ], 223 | "metadata": {} 224 | } 225 | ] 226 | } -------------------------------------------------------------------------------- /sklearn_demos/fastText.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebooks is an experiment to see if a pure scikit-learn implementation of the fastText model can work better than a linear model on a small text classification problem: 20 newsgroups.\n", 8 | "\n", 9 | "http://arxiv.org/abs/1607.01759\n", 10 | "\n", 11 | "Those models are very similar to Deep Averaging Network (with only 1 hidden layer with a linear activation function):\n", 12 | "\n", 13 | "https://www.cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf\n", 14 | "\n", 15 | "\n", 16 | "Note that scikit-learn does not provide a hierarchical softmax implementation (but we don't need it on 20 newsgroups anyways)." 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "from sklearn.datasets import fetch_20newsgroups\n", 28 | "from sklearn.feature_extraction.text import CountVectorizer\n", 29 | "from sklearn.feature_extraction.text import HashingVectorizer\n", 30 | "\n", 31 | "from sklearn.model_selection import train_test_split" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "twentyng_train = fetch_20newsgroups(\n", 43 | " subset='train',\n", 44 | " #remove=('headers', 'footers'),\n", 45 | ")\n", 46 | "docs_train, target_train = twentyng_train.data, twentyng_train.target\n", 47 | "\n", 48 | "\n", 49 | "twentyng_test = fetch_20newsgroups(\n", 50 | " subset='test',\n", 51 | " #remove=('headers', 'footers'),\n", 52 | ")\n", 53 | "\n", 54 | "docs_test, target_test = twentyng_test.data, twentyng_test.target" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 18, 60 | "metadata": { 61 | "collapsed": false 62 | }, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "262144" 68 | ] 69 | }, 70 | "execution_count": 18, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "2 ** 18" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "The following uses the hashing tricks on unigrams and bigrams. `binary=True` makes us ignore repeated words in a document. The `l1` normalization ensures that we \"average\" the embeddings of the tokens in the document instead of summing them." 
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 17, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "CPU times: user 16.8 s, sys: 116 ms, total: 16.9 s\n", 98 | "Wall time: 16.9 s\n" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "%%time\n", 104 | "vec = HashingVectorizer(\n", 105 | " encoding='latin-1', binary=True, ngram_range=(1, 2),\n", 106 | " norm='l1', n_features=2 ** 18)\n", 107 | "\n", 108 | "X_train = vec.transform(docs_train)\n", 109 | "X_test = vec.transform(docs_test)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 19, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "array([[ 0., 0., 0., ..., 0., 0., 0.],\n", 123 | " [ 0., 0., 0., ..., 0., 0., 0.],\n", 124 | " [ 0., 0., 0., ..., 0., 0., 0.]])" 125 | ] 126 | }, 127 | "execution_count": 19, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "first_doc_vectors = X_train[:3].toarray()\n", 134 | "first_doc_vectors" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 20, 140 | "metadata": { 141 | "collapsed": false 142 | }, 143 | "outputs": [ 144 | { 145 | "data": { 146 | "text/plain": [ 147 | "array([ 0., 0., 0.])" 148 | ] 149 | }, 150 | "execution_count": 20, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "first_doc_vectors.min(axis=1)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 21, 162 | "metadata": { 163 | "collapsed": false 164 | }, 165 | "outputs": [ 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "array([ 0.0049505 , 0.00469484, 0.00200401])" 170 | ] 171 | }, 172 | "execution_count": 21, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "first_doc_vectors.max(axis=1)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 22, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "array([ 1., 1., 1.])" 192 | ] 193 | }, 194 | "execution_count": 22, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "first_doc_vectors.sum(axis=1)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Baseline: OvR logistic regression (the multinomial logistic regression loss is currently not implemented in scikit-learn). In practice, the OvR reduction seems to work well enough." 
208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 86, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "CPU times: user 1min 46s, sys: 6.69 s, total: 1min 53s\n", 222 | "Wall time: 11.1 s\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "%%time\n", 228 | "from sklearn.linear_model import SGDClassifier\n", 229 | "\n", 230 | "lr = SGDClassifier(loss='log', alpha=1e-10, n_iter=50, n_jobs=-1)\n", 231 | "lr.fit(X_train, target_train)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 87, 237 | "metadata": { 238 | "collapsed": false 239 | }, 240 | "outputs": [ 241 | { 242 | "name": "stdout", 243 | "output_type": "stream", 244 | "text": [ 245 | "train score: 1.000\n", 246 | "test score: 0.827\n", 247 | "CPU times: user 588 ms, sys: 289 ms, total: 877 ms\n", 248 | "Wall time: 602 ms\n" 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "%%time\n", 254 | "print(\"train score: %0.3f\" % lr.score(X_train, target_train))\n", 255 | "print(\"test score: %0.3f\" % lr.score(X_test, target_test))" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "Let's now use the MLPClassifier of scikit-learn to add a single hidden layer with a small number of hidden units.\n", 263 | "\n", 264 | "Note: instead of tanh or relu we would rather like to use a linear / identity activation function for the hidden layer but this is not (yet) implemented in scikit-learn.\n", 265 | "\n", 266 | "In that respect the following model is closer to a Deep Averaging Network (without dropout) than fastText." 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 90, 272 | "metadata": { 273 | "collapsed": false 274 | }, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "Iteration 1, loss = 2.94108225\n", 281 | "Validation score: 0.464664\n", 282 | "Iteration 2, loss = 2.49072336\n", 283 | "Validation score: 0.639576\n", 284 | "Iteration 3, loss = 1.63266821\n", 285 | "Validation score: 0.810954\n", 286 | "Iteration 4, loss = 0.90327443\n", 287 | "Validation score: 0.869258\n", 288 | "Iteration 5, loss = 0.48531751\n", 289 | "Validation score: 0.893993\n", 290 | "Iteration 6, loss = 0.27329257\n", 291 | "Validation score: 0.909894\n", 292 | "Iteration 7, loss = 0.16704835\n", 293 | "Validation score: 0.911661\n", 294 | "Iteration 8, loss = 0.11122343\n", 295 | "Validation score: 0.918728\n", 296 | "Iteration 9, loss = 0.07885910\n", 297 | "Validation score: 0.918728\n", 298 | "Iteration 10, loss = 0.05876991\n", 299 | "Validation score: 0.924028\n", 300 | "Iteration 11, loss = 0.04566916\n", 301 | "Validation score: 0.920495\n", 302 | "Iteration 12, loss = 0.03644058\n", 303 | "Validation score: 0.915194\n", 304 | "Iteration 13, loss = 0.02982519\n", 305 | "Validation score: 0.922261\n", 306 | "Validation score did not improve more than tol=0.000100 for two consecutive epochs. 
Stopping.\n", 307 | "CPU times: user 1min 21s, sys: 187 ms, total: 1min 21s\n", 308 | "Wall time: 1min 21s\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "%%time\n", 314 | "from sklearn.neural_network import MLPClassifier\n", 315 | "\n", 316 | "mlp = MLPClassifier(algorithm='adam', learning_rate_init=0.01,\n", 317 | " hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,\n", 318 | " early_stopping=True, validation_fraction=0.05, alpha=1e-10)\n", 319 | "mlp.fit(X_train, target_train)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 92, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [ 329 | { 330 | "name": "stdout", 331 | "output_type": "stream", 332 | "text": [ 333 | "train score: 0.996\n", 334 | "test score: 0.801\n", 335 | "CPU times: user 304 ms, sys: 54 µs, total: 304 ms\n", 336 | "Wall time: 303 ms\n" 337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "%%time\n", 342 | "print(\"train score: %0.3f\" % mlp.score(X_train, target_train))\n", 343 | "print(\"test score: %0.3f\" % mlp.score(X_test, target_test))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.5.0" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 0 377 | } 378 | -------------------------------------------------------------------------------- /sklearn_demos/language/fetch_data.py: -------------------------------------------------------------------------------- 1 | 2 | # simple python script to collect text paragraphs from various languages on the 3 | # same topic namely the Wikipedia encyclopedia itself 4 | 5 | import os 6 | try: 7 | # Python 2 compat 8 | from urllib2 import Request, build_opener 9 | except ImportError: 10 | # Python 3 11 | from urllib.request import Request, build_opener 12 | 13 | import lxml.html 14 | from lxml.etree import ElementTree 15 | import numpy as np 16 | 17 | pages = { 18 | u'ar': u'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7', 19 | u'de': u'http://de.wikipedia.org/wiki/Wikipedia', 20 | u'en': u'http://en.wikipedia.org/wiki/Wikipedia', 21 | u'es': u'http://es.wikipedia.org/wiki/Wikipedia', 22 | u'fr': u'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia', 23 | u'it': u'http://it.wikipedia.org/wiki/Wikipedia', 24 | u'ja': u'http://ja.wikipedia.org/wiki/Wikipedia', 25 | u'nl': u'http://nl.wikipedia.org/wiki/Wikipedia', 26 | u'pl': u'http://pl.wikipedia.org/wiki/Wikipedia', 27 | u'pt': u'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia', 28 | u'ru': u'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F', 29 | # u'zh': u'http://zh.wikipedia.org/wiki/Wikipedia', 30 | } 31 | 32 | html_folder = u'html' 33 | text_folder = u'paragraphs' 34 | short_text_folder = u'short_paragraphs' 35 | n_words_per_short_text = 5 36 | 37 | 38 | if not os.path.exists(html_folder): 39 | os.makedirs(html_folder) 40 | 41 | for lang, page in pages.items(): 42 | 43 | text_lang_folder = os.path.join(text_folder, lang) 44 | if not 
os.path.exists(text_lang_folder): 45 | os.makedirs(text_lang_folder) 46 | 47 | short_text_lang_folder = os.path.join(short_text_folder, lang) 48 | if not os.path.exists(short_text_lang_folder): 49 | os.makedirs(short_text_lang_folder) 50 | 51 | opener = build_opener() 52 | html_filename = os.path.join(html_folder, lang + '.html') 53 | if not os.path.exists(html_filename): 54 | print("Downloading %s" % page) 55 | request = Request(page) 56 | # change the User Agent to avoid being blocked by Wikipedia 57 | # downloading a couple of articles once should not be abusive 58 | request.add_header('User-Agent', 'OpenAnything/1.0') 59 | html_content = opener.open(request).read() 60 | open(html_filename, 'wb').write(html_content) 61 | 62 | # decode the payload explicitly as UTF-8 since lxml is confused for some 63 | # reason 64 | html_content = open(html_filename).read() 65 | if hasattr(html_content, 'decode'): 66 | html_content = html_content.decode('utf-8') 67 | tree = ElementTree(lxml.html.document_fromstring(html_content)) 68 | i = 0 69 | j = 0 70 | for p in tree.findall('//p'): 71 | content = p.text_content() 72 | if len(content) < 100: 73 | # skip paragraphs that are too short - probably too noisy and not 74 | # representative of the actual language 75 | continue 76 | 77 | text_filename = os.path.join(text_lang_folder, 78 | '%s_%04d.txt' % (lang, i)) 79 | print("Writing %s" % text_filename) 80 | open(text_filename, 'wb').write(content.encode('utf-8', 'ignore')) 81 | i += 1 82 | 83 | # split the paragraph into fake smaller paragraphs to make the 84 | # problem harder e.g. more similar to tweets 85 | if lang in ('zh', 'ja'): 86 | # FIXME: whitespace tokenizing does not work on Chinese and Japanese 87 | continue 88 | words = content.split() 89 | n_groups = len(words) // n_words_per_short_text 90 | if n_groups < 1: 91 | continue 92 | groups = np.array_split(words, n_groups) 93 | 94 | for group in groups: 95 | small_content = u" ".join(group) 96 | 97 | short_text_filename = os.path.join(short_text_lang_folder, 98 | '%s_%04d.txt' % (lang, j)) 99 | print("Writing %s" % short_text_filename) 100 | open(short_text_filename, 'wb').write( 101 | small_content.encode('utf-8', 'ignore')) 102 | j += 1 103 | if j >= 1000: 104 | break 105 | 106 | -------------------------------------------------------------------------------- /structure_digits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "worksheets": [ 3 | { 4 | "cells": [ 5 | { 6 | "source": "# Intrinsic structure of the Western-Arabic numeral glyphs\n\nUsing scikit-learn Spectral clustering.", 7 | "cell_type": "markdown" 8 | }, 9 | { 10 | "cell_type": "code", 11 | "language": "python", 12 | "outputs": [], 13 | "collapsed": true, 14 | "prompt_number": 22, 15 | "input": "import numpy as np\nimport pylab as pl" 16 | }, 17 | { 18 | "source": "## Load the digits dataset and plot the first elements", 19 | "cell_type": "markdown" 20 | }, 21 | { 22 | "source": "Small utility function to display a gallery of images:", 23 | "cell_type": "markdown" 24 | }, 25 | { 26 | "cell_type": "code", 27 | "language": "python", 28 | "outputs": [], 29 | "collapsed": true, 30 | "prompt_number": 23, 31 | "input": "def plot_images(images):\n pl.gray()\n pl.figure()\n for i, img in enumerate(images[:25]):\n pl.subplot(5, 5, i + 1)\n pl.imshow(img, interpolation=\"nearest\")\n pl.xticks(())\n pl.yticks(())\n " 32 | }, 33 | { 34 | "source": "Let's load the digits dataset that comes with scikit-learn (as a CSV file with gray level pixel
values). Let's shuffle the dataset to make sure that the algorithm cannot exploit any ordering information. ", 35 | "cell_type": "markdown" 36 | }, 37 | { 38 | "cell_type": "code", 39 | "language": "python", 40 | "outputs": [], 41 | "collapsed": true, 42 | "prompt_number": 24, 43 | "input": "from sklearn import datasets\nfrom sklearn.utils import shuffle\n\ndigits = datasets.load_digits()\nimages, data, target = shuffle(\n digits.images, digits.data, digits.target)\n\nplot_images(images)\n" 44 | }, 45 | { 46 | "source": "## Group the pictures in 10 groups using Spectral Clustering", 47 | "cell_type": "markdown" 48 | }, 49 | { 50 | "cell_type": "code", 51 | "language": "python", 52 | "outputs": [], 53 | "collapsed": true, 54 | "prompt_number": 25, 55 | "input": "from sklearn import cluster, neighbors\n\nn_clusters = 10\nS = neighbors.kneighbors_graph(data, 10)\nsc = cluster.SpectralClustering(n_clusters, mode='arpack', n_init=50)\nsc.fit(S)\nsc.labels_" 56 | }, 57 | { 58 | "cell_type": "code", 59 | "language": "python", 60 | "outputs": [], 61 | "collapsed": true, 62 | "prompt_number": 26, 63 | "input": "for i in range(n_clusters):\n plot_images(images[sc.labels_ == i])" 64 | }, 65 | { 66 | "source": "\n## Profiling the clustering algorithm\n\nThe following will run the `cProfile` tool from the Python stdlib and display the output in a paged, tiled panel.", 67 | "cell_type": "markdown" 68 | }, 69 | { 70 | "cell_type": "code", 71 | "language": "python", 72 | "outputs": [], 73 | "collapsed": true, 74 | "prompt_number": 27, 75 | "input": "%prun cluster.SpectralClustering(10, mode='arpack').fit(S)" 76 | }, 77 | { 78 | "source": "## Supervised learning: learning to classify digits", 79 | "cell_type": "markdown" 80 | }, 81 | { 82 | "cell_type": "code", 83 | "language": "python", 84 | "outputs": [], 85 | "collapsed": true, 86 | "prompt_number": 28, 87 | "input": "from sklearn import svm, metrics\nX_train, y_train, X_test, y_test = data[:500], target[:500], data[500:], target[500:]\n\nclf = svm.SVC(gamma=0.001).fit(X_train, y_train)\n\nprint metrics.classification_report(y_test, clf.predict(X_test))" 88 | }, 89 | { 90 | "cell_type": "code", 91 | "language": "python", 92 | "outputs": [], 93 | "collapsed": true, 94 | "prompt_number": 29, 95 | "input": "cm = metrics.confusion_matrix(target[500:], clf.predict(data[500:]))\nprint cm" 96 | }, 97 | { 98 | "cell_type": "code", 99 | "language": "python", 100 | "outputs": [], 101 | "collapsed": true, 102 | "prompt_number": 30, 103 | "input": "pl.imshow(cm)" 104 | } 105 | ] 106 | } 107 | ], 108 | "metadata": { 109 | "name": "structure_digits" 110 | }, 111 | "nbformat": 2 112 | } -------------------------------------------------------------------------------- /test.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /ubuntu-quickstart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Script to quickly set up an ipython notebook env on a stock Ubuntu 13.04 3 | 4 | set -ex 5 | 6 | 7 | sudo apt-get install -y \ 8 | python-numpy python-scipy python-dev libatlas-dev \ 9 | python-zmq python-pip python-virtualenv \ 10 | git libnuma-dev numactl htop vim python-matplotlib libevent-dev 11 | 12 | sudo update-alternatives --set editor /usr/bin/vim.basic 13 | 14 | cd $HOME 15 | if [ !
-d "venv" ]; then 16 | virtualenv --system-site-packages venv 17 | fi 18 | . venv/bin/activate 19 | 20 | pip install scikit-learn ipython[notebook] blosc apache-libcloud gevent numa 21 | pip install git+https://github.com/esc/bloscpack 22 | 23 | git config --global user.name "Olivier Grisel" 24 | git config --global user.email olivier.grisel@ensta.org 25 | 26 | if [ ! -x "$HOME/.ssh/config" ]; then 27 | if f [ ! -d "$HOME/.ssh" ]; then 28 | mkdir $HOME/.ssh 29 | fi 30 | echo "Host github.com" >> $HOME/.ssh/config 31 | echo " StrictHostKeyChecking no" >> $HOME/.ssh/config 32 | fi 33 | 34 | if [ ! -d "$HOME/notebooks" ]; then 35 | git clone git@github.com:ogrisel/notebooks.git 36 | fi 37 | 38 | if [ -d "/mnt/resource" ]; then 39 | # Azure 40 | DATA_ROOT=/mnt/resource 41 | else 42 | # EC2 43 | DATA_ROOT=/mnt 44 | fi 45 | 46 | if [ ! -d "$DATA_ROOT/$USER" ]; then 47 | sudo mkdir $DATA_ROOT/$USER 48 | sudo chown -R $USER. $DATA_ROOT/$USER 49 | 50 | mkdir $DATA_ROOT/$USER/data 51 | ln -s $DATA_ROOT/$USER/data $HOME/data 52 | fi 53 | 54 | # (Re)start the notebook process 55 | cd $HOME/notebooks 56 | pkill -9 -f "disabled-ipython-browser" || echo "Nothing to kill" 57 | nohup ~/venv/bin/ipython notebook \ 58 | --ip="*" \ 59 | --browser="disabled-ipython-browser" & 60 | 61 | --------------------------------------------------------------------------------