├── .gitignore
├── Adjusted Variable Importances with Randomized Trees.ipynb
├── Attention
    └── Keras Attention.ipynb
├── Bootstrap.ipynb
├── Clustering Model Selection.ipynb
├── Data Preprocessing for the  Learning to Rank example.ipynb
├── Distributed Aggregate and Join.ipynb
├── Distributed Learning of Extra Trees with IPython.parallel.ipynb
├── Explained variances.ipynb
├── Function Approximation.ipynb
├── GP overfitting.ipynb
├── Gradient.ipynb
├── Labeled Faces in the Wild recognition.ipynb
├── Learning to Rank.ipynb
├── MNIST8M Chunking and Upload to Cloud Blob Storage.ipynb
├── Non IID cross-validation.ipynb
├── Numa-aware computation experiments.ipynb
├── Numba Parakeet Cython.ipynb
├── Numpy intro.ipynb
├── Overfitting
    └── linear-model-overfitting.ipynb
├── Parameter search for Extra Trees on the MNIST classificationt task.ipynb
├── Patch-Based Feature Extraction for Image Classification.ipynb
├── README.md
├── Reinforcement Learning
    └── Random walk policy evaluation.ipynb
├── SGD stuff.ipynb
├── Saddle Point LBFGS.ipynb
├── Semi-supervised Extra Trees.ipynb
├── Text Classification.ipynb
├── Time Series.ipynb
├── Untitled Diagram.drawio
├── Variable Importance with Completely Randomized Trees.ipynb
├── cloudstorage.ini.example
├── dask
    └── fold_learn.ipynb
├── environment.yml
├── fmri_vae
    ├── fmri_autoencoder.ipynb
    └── fmri_model.py
├── generalization
    └── run_mnist.py
├── gmm
    ├── GMM with PyTorch.ipynb
    ├── GMM with SGD.ipynb
    ├── Gaussian likelihood landscape.ipynb
    ├── Model Selection for GMM.ipynb
    └── gmmsgd.py
├── gradients
    └── custom optim.ipynb
├── letor_cluster
    ├── MSLR Grid Search.ipynb
    ├── letor_gridpoint.py
    ├── letor_gridresults.json
    ├── letor_gridresults.py
    └── letor_gridsearch.py
├── nmf_topics.ipynb
├── quantile_regression_as_classification.ipynb
├── representations
    ├── Autoencoder ELMs.ipynb
    ├── Entangled Manifolds.ipynb
    ├── MNIST experiments.ipynb
    ├── Sparse non-linear random projections.ipynb
    └── Unsupervised feature extraction with the Breiman trick.ipynb
├── screenshots
    ├── digits.png
    └── topics.png
├── sklearn_demos
    ├── Classifier calibration.ipynb
    ├── Face recognition.ipynb
    ├── Feature Importances.ipynb
    ├── Gradient Boosting.ipynb
    ├── Income classification - Column Transformer Edition.ipynb
    ├── Income classification.ipynb
    ├── Language Classification.ipynb
    ├── Large Scale 2D Clustering-1M.ipynb
    ├── Large Scale 2D Clustering.ipynb
    ├── Permutation Importances.ipynb
    ├── ames_gbrt_search_results.json
    ├── ames_housing.ipynb
    ├── fastText.ipynb
    ├── gbdt_vs_neural_nets_on_tabular_data.ipynb
    ├── iris.ipynb
    ├── language
    │   └── fetch_data.py
    └── splines_overfitting.ipynb
├── structure_digits.ipynb
├── test.drawio
└── ubuntu-quickstart.sh


/.gitignore:
--------------------------------------------------------------------------------
 1 | .ipynb_checkpoints
 2 | *.f
 3 | *.so
 4 | nohup.out
 5 | *.ini
 6 | *.pkl
 7 | *.npy
 8 | joblib
 9 | *.nii
10 | *_pkl
11 | nilearn_data
12 | Untitled*.ipynb
13 | *.dat
14 | *.lprof
15 | *.html
16 | *.pyc
17 | adult.data
18 | *.txt
19 | sparse_chunks
20 | *.hdf5
21 | 


--------------------------------------------------------------------------------
/Attention/Keras Attention.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 72,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import theano.tensor as tt\n",
 12 |     "from keras.layers.recurrent import GRU\n",
 13 |     "from keras.layers.core import Dense, MaskedLayer, Layer, Merge\n",
 14 |     "from keras.models import Graph\n",
 15 |     "from keras.utils.theano_utils import shared_zeros"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 46,
 21 |    "metadata": {
 22 |     "collapsed": false
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "class SoftSequentialAttentionLayer(MaskedLayer):\n",
 27 |     "    \n",
 28 |     "    def __init__(self, memmory_dim, driver_dim, inner_dim=128, init='glorot_uniform', inner_activation='relu'):\n",
 29 |     "        super(SoftSequentialAttentionLayer, self).__init__()\n",
 30 |     "        self.init = initializations.get(init)\n",
 31 |     "        self.W_m = self.init((memory_dim, inner_dim))\n",
 32 |     "        self.W_d = self.init((driver_dim, inner_dim))\n",
 33 |     "        self.W_a = self.init((inner_dim, 1))\n",
 34 |     "        self.inner_activation = activations.get(inner_activation)\n",
 35 |     "        self.b_inner = shared_zeros(inner_dim)\n",
 36 |     "        self.b_out = shared_zeros(1)\n",
 37 |     "    \n",
 38 |     "    def set_previous(self, *previous_layers):\n",
 39 |     "        type_name = self.__class__.__name__\n",
 40 |     "        if len(previous_layers) != 2:\n",
 41 |     "            raise ValueError(\"{}.set_previous expects 2 input layers, got {}\".format(\n",
 42 |     "                type_name, previous_layers))\n",
 43 |     "        sequential_memory, attention_driver = previous_layers\n",
 44 |     "        if not sequential_memory.return_sequences:\n",
 45 |     "            raise ValueError(\"The first input of {} should be a recurrent layer with\"\n",
 46 |     "                             \" return_sequences=True\".format(type_name))\n",
 47 |     "        self.sequential_memory = sequential_memory\n",
 48 |     "        self.attention_driver = attention_driver\n",
 49 |     "        \n",
 50 |     "    def get_input(self, train=False):\n",
 51 |     "        return [self.sequential_memory.get_output(train=train),\n",
 52 |     "                self.attention_driver.get_output(train=train)]\n",
 53 |     "        \n",
 54 |     "    def get_output(self, train=False):\n",
 55 |     "        sequential_memory, attention_driver = self.get_input(train=train)\n",
 56 |     "        # sequential_memory shape: (nb_samples, time (padded with zeros), input_dim)\n",
 57 |     "        # attentin_driver shape: (nb_samples, input_dim)\n",
 58 |     "        # new shape: (time, nb_samples, input_dim) -> because theano.scan iterates over main dimension\n",
 59 |     "        padded_mask = self.get_padded_shuffled_mask(train, sequential_memory, pad=1)\n",
 60 |     "        sequential_memory = sequential_memory.dimshuffle((1, 0, 2))\n",
 61 |     "        h = self.inner_activation(tt.dot(sequential_memory, self.W_m)\n",
 62 |     "                                  + tt.dot(driver, self.W_d)\n",
 63 |     "                                  + self.b_inner)\n",
 64 |     "        a = tt.exp(tt.dot(h, self.W_a) + self.b_out)\n",
 65 |     "        \n",
 66 |     "        \n",
 67 |     "        output = None  #XXX: TODO\n",
 68 |     "        return output\n",
 69 |     "    \n",
 70 |     "    def _variable_length_softmax_step(self, a_t, sum_t):\n",
 71 |     "        return )"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 47,
 77 |    "metadata": {
 78 |     "collapsed": false
 79 |    },
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "class CustomGraph(Graph):\n",
 83 |     "\n",
 84 |     "    def add_node(self, layer, name, input=None, inputs=[], merge_mode='concat', create_output=False):\n",
 85 |     "        if hasattr(layer, 'set_name'):\n",
 86 |     "            layer.set_name(name)\n",
 87 |     "        if name in self.namespace:\n",
 88 |     "            raise Exception('Duplicate node identifier: ' + name)\n",
 89 |     "        if input:\n",
 90 |     "            if input not in self.namespace:\n",
 91 |     "                raise Exception('Unknown node/input identifier: ' + input)\n",
 92 |     "            if input in self.nodes:\n",
 93 |     "                layer.set_previous(self.nodes[input])\n",
 94 |     "            elif input in self.inputs:\n",
 95 |     "                layer.set_previous(self.inputs[input])\n",
 96 |     "        if inputs:\n",
 97 |     "            to_merge = []\n",
 98 |     "            for n in inputs:\n",
 99 |     "                if n in self.nodes:\n",
100 |     "                    to_merge.append(self.nodes[n])\n",
101 |     "                elif n in self.inputs:\n",
102 |     "                    to_merge.append(self.inputs[n])\n",
103 |     "                else:\n",
104 |     "                    raise Exception('Unknown identifier: ' + n)\n",
105 |     "            # XXX: here is the change\n",
106 |     "            if merge_mode == 'distinct':\n",
107 |     "                layer.set_previous(*to_merge)\n",
108 |     "            else:\n",
109 |     "                merge = Merge(to_merge, mode=merge_mode)\n",
110 |     "                layer.set_previous(merge)\n",
111 |     "\n",
112 |     "        self.namespace.add(name)\n",
113 |     "        self.nodes[name] = layer\n",
114 |     "        self.node_config.append({'name': name,\n",
115 |     "                                 'input': input,\n",
116 |     "                                 'inputs': inputs,\n",
117 |     "                                 'merge_mode': merge_mode})\n",
118 |     "        layer.init_updates()\n",
119 |     "        params, regularizers, constraints, updates = layer.get_params()\n",
120 |     "        self.params += params\n",
121 |     "        self.regularizers += regularizers\n",
122 |     "        self.constraints += constraints\n",
123 |     "        self.updates += updates\n",
124 |     "\n",
125 |     "        if create_output:\n",
126 |     "            self.add_output(name, input=name)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 54,
132 |    "metadata": {
133 |     "collapsed": false
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "graph = CustomGraph()\n",
138 |     "graph.add_input(name='context_sequences', ndim=3)\n",
139 |     "graph.add_node(GRU(32, return_sequences=True), name='dense1', input='context_sequences')\n",
140 |     "graph.add_node(Dense(32, 4), name='dense2', input='context_sequences')\n",
141 |     "graph.add_node(SoftSequentialAttentionLayer(),\n",
142 |     "               name='attention', inputs=['dense1', 'dense2'],\n",
143 |     "               merge_mode='distinct')\n",
144 |     "graph.add_output(name='output1', input='dense2')\n",
145 |     "graph.add_output(name='output2', input='attention')"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": 55,
151 |    "metadata": {
152 |     "collapsed": false
153 |    },
154 |    "outputs": [
155 |     {
156 |      "data": {
157 |       "text/plain": [
158 |        "{'attention': <__main__.SoftSequentialAttentionLayer at 0x10873d630>,\n",
159 |        " 'dense1': <keras.layers.recurrent.GRU at 0x1085caeb8>,\n",
160 |        " 'dense2': <keras.layers.core.Dense at 0x10873f438>}"
161 |       ]
162 |      },
163 |      "execution_count": 55,
164 |      "metadata": {},
165 |      "output_type": "execute_result"
166 |     }
167 |    ],
168 |    "source": [
169 |     "graph.nodes"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 56,
175 |    "metadata": {
176 |     "collapsed": false
177 |    },
178 |    "outputs": [
179 |     {
180 |      "data": {
181 |       "text/plain": [
182 |        "{'attention', 'context_sequences', 'dense1', 'dense2'}"
183 |       ]
184 |      },
185 |      "execution_count": 56,
186 |      "metadata": {},
187 |      "output_type": "execute_result"
188 |     }
189 |    ],
190 |    "source": [
191 |     "graph.namespace"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {
198 |     "collapsed": true
199 |    },
200 |    "outputs": [],
201 |    "source": []
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": 62,
206 |    "metadata": {
207 |     "collapsed": true
208 |    },
209 |    "outputs": [],
210 |    "source": [
211 |     "import numpy as np"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 69,
217 |    "metadata": {
218 |     "collapsed": false
219 |    },
220 |    "outputs": [],
221 |    "source": [
222 |     "x = np.arange(3 * 4 * 5).reshape(5, 3, 4)\n",
223 |     "a = np.arange(4 * 2).reshape(4, 2)"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 71,
229 |    "metadata": {
230 |     "collapsed": false
231 |    },
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "(5, 3, 2)"
237 |       ]
238 |      },
239 |      "execution_count": 71,
240 |      "metadata": {},
241 |      "output_type": "execute_result"
242 |     }
243 |    ],
244 |    "source": [
245 |     "np.dot(x, a).shape"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "code",
250 |    "execution_count": null,
251 |    "metadata": {
252 |     "collapsed": true
253 |    },
254 |    "outputs": [],
255 |    "source": []
256 |   }
257 |  ],
258 |  "metadata": {
259 |   "kernelspec": {
260 |    "display_name": "Python 3",
261 |    "language": "python",
262 |    "name": "python3"
263 |   },
264 |   "language_info": {
265 |    "codemirror_mode": {
266 |     "name": "ipython",
267 |     "version": 3
268 |    },
269 |    "file_extension": ".py",
270 |    "mimetype": "text/x-python",
271 |    "name": "python",
272 |    "nbconvert_exporter": "python",
273 |    "pygments_lexer": "ipython3",
274 |    "version": "3.4.3"
275 |   }
276 |  },
277 |  "nbformat": 4,
278 |  "nbformat_minor": 0
279 | }
280 | 


--------------------------------------------------------------------------------
/Clustering Model Selection.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "metadata": {
 3 |   "name": "Clustering Model Selection"
 4 |  },
 5 |  "nbformat": 3,
 6 |  "nbformat_minor": 0,
 7 |  "worksheets": [
 8 |   {
 9 |    "cells": [
10 |     {
11 |      "cell_type": "code",
12 |      "collapsed": false,
13 |      "input": [],
14 |      "language": "python",
15 |      "metadata": {},
16 |      "outputs": []
17 |     }
18 |    ],
19 |    "metadata": {}
20 |   }
21 |  ]
22 | }


--------------------------------------------------------------------------------
/Data Preprocessing for the  Learning to Rank example.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "metadata": {
 3 |   "name": ""
 4 |  },
 5 |  "nbformat": 3,
 6 |  "nbformat_minor": 0,
 7 |  "worksheets": [
 8 |   {
 9 |    "cells": [
10 |     {
11 |      "cell_type": "heading",
12 |      "level": 2,
13 |      "metadata": {},
14 |      "source": [
15 |       "Svmlight formatted file parsing with sklearn for Learning to Rank data"
16 |      ]
17 |     },
18 |     {
19 |      "cell_type": "code",
20 |      "collapsed": false,
21 |      "input": [
22 |       "import numpy as np\n",
23 |       "\n",
24 |       "from os.path import expanduser\n",
25 |       "from sklearn.datasets import load_svmlight_file\n",
26 |       "from sklearn.externals import joblib"
27 |      ],
28 |      "language": "python",
29 |      "metadata": {},
30 |      "outputs": []
31 |     },
32 |     {
33 |      "cell_type": "code",
34 |      "collapsed": false,
35 |      "input": [
36 |       "memory = joblib.Memory(cachedir='.', mmap_mode='r')\n",
37 |       "\n",
38 |       "@memory.cache\n",
39 |       "def load_fold(dataset, subset, fold_idx=1, dtype=np.float32):\n",
40 |       "    DATA_FOLDER = expanduser('~/data')\n",
41 |       "    filepath = join(DATA_FOLDER, dataset, 'Fold%d' % fold_idx, subset + '.txt')\n",
42 |       "    X, y, qid = load_svmlight_file(filepath, dtype=dtype, query_id=True)\n",
43 |       "    return X.toarray(), y, qid"
44 |      ],
45 |      "language": "python",
46 |      "metadata": {},
47 |      "outputs": []
48 |     },
49 |     {
50 |      "cell_type": "code",
51 |      "collapsed": false,
52 |      "input": [
53 |       "X_train, y_train, qid_train = load_fold('MSLR-WEB10K','train', fold_idx=1)\n",
54 |       "X_vali, y_vali, qid_vali = load_fold('MSLR-WEB10K', 'vali', fold_idx=1)\n",
55 |       "X_test, y_test, qid_test = load_fold('MSLR-WEB10K', 'test', fold_idx=1)"
56 |      ],
57 |      "language": "python",
58 |      "metadata": {},
59 |      "outputs": []
60 |     },
61 |     {
62 |      "cell_type": "code",
63 |      "collapsed": false,
64 |      "input": [
65 |       "%%time\n",
66 |       "\n",
67 |       "np.savez_compressed(expanduser('~/data/MSLR-WEB10K/mslr_web10k_fold1.npz'),\n",
68 |       "                    X_train=X_train, y_train=y_train, qid_train=qid_train,\n",
69 |       "                    X_vali=X_vali, y_vali=y_vali, qid_vali=qid_vali,\n",
70 |       "                    X_test=X_test, y_test=y_test, qid_test=qid_test)"
71 |      ],
72 |      "language": "python",
73 |      "metadata": {},
74 |      "outputs": []
75 |     }
76 |    ],
77 |    "metadata": {}
78 |   }
79 |  ]
80 | }


--------------------------------------------------------------------------------
/Function Approximation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 307,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import numpy as np\n",
 12 |     "\n",
 13 |     "from keras.layers import Dense\n",
 14 |     "from keras.models import Sequential\n",
 15 |     "from keras import optimizers"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": 916,
 21 |    "metadata": {
 22 |     "collapsed": false
 23 |    },
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "X = np.random.uniform(low=0, high=1, size=(300, 30))\n",
 27 |     "y = np.array([0, 1] * (X.shape[0] // 2))"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 917,
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "# n_hidden = X.shape[0]\n",
 39 |     "# W0 = np.ones_like(X.T)\n",
 40 |     "# b0 =  -X.ravel() + 0.001\n",
 41 |     "# weights_0 = [W0, b0]"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 918,
 47 |    "metadata": {
 48 |     "collapsed": false
 49 |    },
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n",
 53 |     "# np.linalg.matrix_rank(hidden_activations)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 919,
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "n_hidden = X.shape[0]\n",
 65 |     "W0 = X / (np.linalg.norm(X, axis=1, keepdims=True) ** 2 + 1e-8)\n",
 66 |     "W0 = W0.T\n",
 67 |     "b0 = -0.98\n",
 68 |     "weights_0 = [W0, b0]"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 920,
 74 |    "metadata": {
 75 |     "collapsed": false
 76 |    },
 77 |    "outputs": [
 78 |     {
 79 |      "data": {
 80 |       "text/plain": [
 81 |        "300"
 82 |       ]
 83 |      },
 84 |      "execution_count": 920,
 85 |      "metadata": {},
 86 |      "output_type": "execute_result"
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n",
 91 |     "np.linalg.matrix_rank(hidden_activations)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 921,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "text/plain": [
104 |        "0.0"
105 |       ]
106 |      },
107 |      "execution_count": 921,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "np.linalg.det(hidden_activations)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 931,
119 |    "metadata": {
120 |     "collapsed": true
121 |    },
122 |    "outputs": [],
123 |    "source": [
124 |     "n_hidden = X.shape[0]\n",
125 |     "W0 = X\n",
126 |     "W0 = W0.T\n",
127 |     "b0 = -0.98 * np.linalg.norm(X, axis=1) ** 2\n",
128 |     "weights_0 = [W0, b0]"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 932,
134 |    "metadata": {
135 |     "collapsed": false
136 |    },
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "300"
142 |       ]
143 |      },
144 |      "execution_count": 932,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "hidden_activations = np.maximum(np.dot(X, W0) + b0, 0)\n",
151 |     "np.linalg.matrix_rank(hidden_activations)"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": 933,
157 |    "metadata": {
158 |     "collapsed": false
159 |    },
160 |    "outputs": [
161 |     {
162 |      "data": {
163 |       "text/plain": [
164 |        "5.278958284816851e-214"
165 |       ]
166 |      },
167 |      "execution_count": 933,
168 |      "metadata": {},
169 |      "output_type": "execute_result"
170 |     }
171 |    ],
172 |    "source": [
173 |     "np.linalg.det(hidden_activations)"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 934,
179 |    "metadata": {
180 |     "collapsed": false
181 |    },
182 |    "outputs": [
183 |     {
184 |      "data": {
185 |       "text/plain": [
186 |        "array([[ 0.20354264,  0.        ,  0.        ,  0.        ,  0.        ,\n",
187 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
188 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
189 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
190 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
191 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
192 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
193 |        "         0.        ,  0.        ,  0.        ,  0.05563168,  0.        ,\n",
194 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
195 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
196 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
197 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
198 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
199 |        "         0.01919508,  0.06842773,  0.        ,  0.        ,  0.        ,\n",
200 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
201 |        "         0.        ,  0.        ,  0.        ,  0.2154047 ,  0.        ,\n",
202 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
203 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
204 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
205 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
206 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
207 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
208 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
209 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
210 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
211 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
212 |        "         0.09565226,  0.        ,  0.        ,  0.        ,  0.        ,\n",
213 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
214 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
215 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
216 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
217 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
218 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
219 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
220 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
221 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
222 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
223 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
224 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
225 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
226 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
227 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
228 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
229 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
230 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
231 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
232 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
233 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
234 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
235 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
236 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
237 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
238 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
239 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
240 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
241 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
242 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
243 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
244 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
245 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],\n",
246 |        "       [ 0.        ,  0.18351664,  0.        ,  0.        ,  0.        ,\n",
247 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
248 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
249 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
250 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
251 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
252 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
253 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
254 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
255 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
256 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
257 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
258 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
259 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
260 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
261 |        "         0.        ,  0.        ,  0.        ,  0.28104634,  0.        ,\n",
262 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
263 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
264 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
265 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
266 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
267 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
268 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
269 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
270 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
271 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
272 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
273 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
274 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
275 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
276 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
277 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
278 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
279 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
280 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
281 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
282 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
283 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
284 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
285 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
286 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
287 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
288 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
289 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
290 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
291 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
292 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
293 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
294 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
295 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
296 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
297 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
298 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
299 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
300 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
301 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
302 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
303 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
304 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
305 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],\n",
306 |        "       [ 0.        ,  0.        ,  0.16006979,  0.        ,  0.        ,\n",
307 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
308 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
309 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
310 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
311 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
312 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
313 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
314 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
315 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
316 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
317 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
318 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
319 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
320 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
321 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
322 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
323 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
324 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
325 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
326 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
327 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
328 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
329 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
330 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
331 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
332 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
333 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
334 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
335 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
336 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
337 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
338 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
339 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
340 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
341 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
342 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
343 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
344 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
345 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
346 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
347 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
348 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
349 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
350 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
351 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
352 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
353 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
354 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
355 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
356 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
357 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
358 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
359 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
360 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
361 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
362 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
363 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
364 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,\n",
365 |        "         0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])"
366 |       ]
367 |      },
368 |      "execution_count": 934,
369 |      "metadata": {},
370 |      "output_type": "execute_result"
371 |     }
372 |    ],
373 |    "source": [
374 |     "hidden_activations[:3]"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": 935,
380 |    "metadata": {
381 |     "collapsed": false
382 |    },
383 |    "outputs": [],
384 |    "source": [
385 |     "# W1 = np.linalg.solve(hidden_activations, y)[:, np.newaxis]\n",
386 |     "W1 = np.dot(np.linalg.pinv(hidden_activations), y)[:, np.newaxis]\n",
387 |     "b1 = np.zeros(1)\n",
388 |     "weights_1 = [W1, b1]"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 936,
394 |    "metadata": {
395 |     "collapsed": false
396 |    },
397 |    "outputs": [
398 |     {
399 |      "data": {
400 |       "text/plain": [
401 |        "array([[  8.12960810e-14],\n",
402 |        "       [  1.00000000e+00],\n",
403 |        "       [  1.53071833e-13],\n",
404 |        "       [  1.00000000e+00],\n",
405 |        "       [  5.55215829e-15]])"
406 |       ]
407 |      },
408 |      "execution_count": 936,
409 |      "metadata": {},
410 |      "output_type": "execute_result"
411 |     }
412 |    ],
413 |    "source": [
414 |     "np.dot(hidden_activations, W1)[:5]"
415 |    ]
416 |   },
417 |   {
418 |    "cell_type": "code",
419 |    "execution_count": 937,
420 |    "metadata": {
421 |     "collapsed": false
422 |    },
423 |    "outputs": [
424 |     {
425 |      "name": "stdout",
426 |      "output_type": "stream",
427 |      "text": [
428 |       "Epoch 1/5\n",
429 |       "1s - loss: 2.0071e-08 - acc: 1.0000\n",
430 |       "Epoch 2/5\n",
431 |       "0s - loss: 2.0071e-08 - acc: 1.0000\n",
432 |       "Epoch 3/5\n",
433 |       "0s - loss: 2.0071e-08 - acc: 1.0000\n",
434 |       "Epoch 4/5\n",
435 |       "0s - loss: 2.0071e-08 - acc: 1.0000\n",
436 |       "Epoch 5/5\n",
437 |       "0s - loss: 2.0071e-08 - acc: 1.0000\n"
438 |      ]
439 |     },
440 |     {
441 |      "data": {
442 |       "text/plain": [
443 |        "<keras.callbacks.History at 0x7f065c88ac50>"
444 |       ]
445 |      },
446 |      "execution_count": 937,
447 |      "metadata": {},
448 |      "output_type": "execute_result"
449 |     }
450 |    ],
451 |    "source": [
452 |     "model = Sequential()\n",
453 |     "first_layer = Dense(output_dim=n_hidden, input_dim=X.shape[1], activation='relu',\n",
454 |     "                    weights=weights_0)\n",
455 |     "first_layer.trainable = False\n",
456 |     "model.add(first_layer)\n",
457 |     "second_layer = Dense(output_dim=1, activation='linear', weights=weights_1)\n",
458 |     "second_layer.trainable = False\n",
459 |     "model.add(second_layer)\n",
460 |     "\n",
461 |     "optimizer = optimizers.Adam(lr=0.001)\n",
462 |     "model.compile(optimizer=optimizer, loss='mse',\n",
463 |     "              metrics=['accuracy'])\n",
464 |     "\n",
465 |     "model.fit(X, y, nb_epoch=5, verbose=2)"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": null,
471 |    "metadata": {
472 |     "collapsed": true
473 |    },
474 |    "outputs": [],
475 |    "source": []
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": null,
480 |    "metadata": {
481 |     "collapsed": true
482 |    },
483 |    "outputs": [],
484 |    "source": []
485 |   }
486 |  ],
487 |  "metadata": {
488 |   "kernelspec": {
489 |    "display_name": "Python 3",
490 |    "language": "python",
491 |    "name": "python3"
492 |   },
493 |   "language_info": {
494 |    "codemirror_mode": {
495 |     "name": "ipython",
496 |     "version": 3
497 |    },
498 |    "file_extension": ".py",
499 |    "mimetype": "text/x-python",
500 |    "name": "python",
501 |    "nbconvert_exporter": "python",
502 |    "pygments_lexer": "ipython3",
503 |    "version": "3.5.2"
504 |   }
505 |  },
506 |  "nbformat": 4,
507 |  "nbformat_minor": 1
508 | }
509 | 


--------------------------------------------------------------------------------
/MNIST8M Chunking and Upload to Cloud Blob Storage.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Chunking the MNIST8M dataset and store the chunks in the cloud"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "This notebook is an example to demonstrate how to preprocess a large dataset in the svmlight format to convert into chunked, dense numpy arrays that are them compressed individually and stored in a cloud object store on Amazon S3 or Azure Blob Store for later consumption by machine learning models."
 23 |      ]
 24 |     },
 25 |     {
 26 |      "cell_type": "code",
 27 |      "collapsed": false,
 28 |      "input": [
 29 |       "import re\n",
 30 |       "import bz2\n",
 31 |       "import os\n",
 32 |       "from os.path import expanduser, join, exists\n",
 33 |       "from configparser import ConfigParser\n",
 34 |       "from time import time\n",
 35 |       "\n",
 36 |       "import numpy as np\n",
 37 |       "from concurrent.futures import ThreadPoolExecutor\n",
 38 |       "\n",
 39 |       "from libcloud.storage.types import Provider\n",
 40 |       "from libcloud.storage.types import ContainerDoesNotExistError\n",
 41 |       "from libcloud.storage.types import ObjectDoesNotExistError\n",
 42 |       "from libcloud.storage.providers import get_driver\n",
 43 |       "\n",
 44 |       "\n",
 45 |       "DATA_FOLDER = expanduser('~/data/mnist8m')\n",
 46 |       "SVMLIGHT_DATA_FOLDER = join(DATA_FOLDER, 'svmlight')\n",
 47 |       "NUMPY_DATA_FOLDER = join(DATA_FOLDER, 'numpy')\n",
 48 |       "\n",
 49 |       "MNIST8M_SRC_URL = ('http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/'\n",
 50 |       "                   'datasets/multiclass/mnist8m.bz2')\n",
 51 |       "MNIST8M_SRC_FILENAME = MNIST8M_SRC_URL.rsplit('/', 1)[1]\n",
 52 |       "MNIST8M_SRC_FILEPATH = join(DATA_FOLDER, MNIST8M_SRC_FILENAME)\n",
 53 |       "\n",
 54 |       "\n",
 55 |       "CHUNK_FILENAME_PREFIX = \"mnist8m-chunk-\"\n",
 56 |       "\n",
 57 |       "CHUNK_SIZE = 100000"
 58 |      ],
 59 |      "language": "python",
 60 |      "metadata": {},
 61 |      "outputs": [],
 62 |      "prompt_number": 15
 63 |     },
 64 |     {
 65 |      "cell_type": "heading",
 66 |      "level": 2,
 67 |      "metadata": {},
 68 |      "source": [
 69 |       "Decompressing and chunking the source dataset"
 70 |      ]
 71 |     },
 72 |     {
 73 |      "cell_type": "markdown",
 74 |      "metadata": {},
 75 |      "source": [
 76 |       "Download the `mnist8m.bz2` source file into the data folder if not previously downloaded:"
 77 |      ]
 78 |     },
 79 |     {
 80 |      "cell_type": "code",
 81 |      "collapsed": false,
 82 |      "input": [
 83 |       "if not exists(DATA_FOLDER):\n",
 84 |       "    os.makedirs(DATA_FOLDER)\n",
 85 |       "\n",
 86 |       "if not exists(MNIST8M_SRC_FILEPATH):\n",
 87 |       "    cmd = \"(cd '%s' && wget -c '%s')\" % (DATA_FOLDER, MNIST8M_SRC_URL)\n",
 88 |       "    print(cmd)\n",
 89 |       "    os.system(cmd)"
 90 |      ],
 91 |      "language": "python",
 92 |      "metadata": {},
 93 |      "outputs": [],
 94 |      "prompt_number": 16
 95 |     },
 96 |     {
 97 |      "cell_type": "markdown",
 98 |      "metadata": {},
 99 |      "source": [
100 |       "Decompress the big bz2 source file and chunk the source svmlight formatted data file to make it easier to process it in parallel:"
101 |      ]
102 |     },
103 |     {
104 |      "cell_type": "code",
105 |      "collapsed": false,
106 |      "input": [
107 |       "if not exists(SVMLIGHT_DATA_FOLDER):\n",
108 |       "    os.makedirs(SVMLIGHT_DATA_FOLDER)\n",
109 |       "\n",
110 |       "chunk_filenames = [fn for fn in os.listdir(SVMLIGHT_DATA_FOLDER)\n",
111 |       "                   if (fn.startswith(CHUNK_FILENAME_PREFIX)\n",
112 |       "                       and fn.endswith('.svmlight'))]\n",
113 |       "chunk_filenames.sort()\n",
114 |       "\n",
115 |       "\n",
116 |       "def get_svmlight_filename(chunk_idx):\n",
117 |       "    chunk_filename = \"%s%03d.svmlight\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n",
118 |       "    return join(SVMLIGHT_DATA_FOLDER, chunk_filename)\n",
119 |       "\n",
120 |       "\n",
121 |       "if not chunk_filenames:\n",
122 |       "    chunk_filenames = []\n",
123 |       "    with bz2.BZ2File(MNIST8M_SRC_FILEPATH) as source:\n",
124 |       "        target, line_no, chunk_idx = None, 0, 0\n",
125 |       "        for line in source:\n",
126 |       "            line_no += 1\n",
127 |       "            if target is None:\n",
128 |       "                chunk_filename = get_svmlight_filename(chunk_idx)\n",
129 |       "                target = open(chunk_filename, 'wb')\n",
130 |       "                chunk_idx += 1\n",
131 |       "                chunk_filenames.append(chunk_filename)\n",
132 |       "                \n",
133 |       "            target.write(line)\n",
134 |       "                \n",
135 |       "            if line_no >= CHUNK_SIZE:\n",
136 |       "                target.close()\n",
137 |       "                target, line_no = None, 0\n",
138 |       "        if target is not None:\n",
139 |       "            target.close()"
140 |      ],
141 |      "language": "python",
142 |      "metadata": {},
143 |      "outputs": [],
144 |      "prompt_number": 22
145 |     },
146 |     {
147 |      "cell_type": "heading",
148 |      "level": 2,
149 |      "metadata": {},
150 |      "source": [
151 |       "Parsing the svmlight format in parallel and compressing the resulting chunks locally"
152 |      ]
153 |     },
154 |     {
155 |      "cell_type": "markdown",
156 |      "metadata": {},
157 |      "source": [
158 |       "Parse the svmlight formatted chunks into dense numpy arrays and store the resulting chunks as compressed binary files using NumPy own format."
159 |      ]
160 |     },
161 |     {
162 |      "cell_type": "code",
163 |      "collapsed": false,
164 |      "input": [
165 |       "from IPython.parallel import Client\n",
166 |       "client = Client()\n",
167 |       "lb_view = client.load_balanced_view()\n",
168 |       "len(lb_view)"
169 |      ],
170 |      "language": "python",
171 |      "metadata": {},
172 |      "outputs": [
173 |       {
174 |        "metadata": {},
175 |        "output_type": "pyout",
176 |        "prompt_number": 23,
177 |        "text": [
178 |         "4"
179 |        ]
180 |       }
181 |      ],
182 |      "prompt_number": 23
183 |     },
184 |     {
185 |      "cell_type": "code",
186 |      "collapsed": false,
187 |      "input": [
188 |       "def parse_svmlight_chunk(input_chunk_filename, output_chunk_filename,\n",
189 |       "                         output_chunk_labels_filename,\n",
190 |       "                         n_features, chunk_size=CHUNK_SIZE):\n",
191 |       "    # Import dependencies lazily to be able to run this function\n",
192 |       "    # on remote nodes of the cluster in parallel with IPython\n",
193 |       "    from sklearn.datasets import load_svmlight_file\n",
194 |       "\n",
195 |       "    if (not exists(output_chunk_filename)\n",
196 |       "        or not exists(output_chunk_labels_filename)):\n",
197 |       "        X, y = load_svmlight_file(input_chunk_filename, n_features=n_features)\n",
198 |       "        np.savez_compressed(output_chunk_filename, X.toarray() / 255.)\n",
199 |       "        np.savez_compressed(output_chunk_labels_filename, y)\n",
200 |       "\n",
201 |       "\n",
202 |       "def get_numpy_filenames(i):\n",
203 |       "    data = \"%s%03d_data.npz\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n",
204 |       "    labels = \"%s%03d_labels.npz\" % (CHUNK_FILENAME_PREFIX, chunk_idx)\n",
205 |       "    return (\n",
206 |       "        join(NUMPY_DATA_FOLDER, data),\n",
207 |       "        join(NUMPY_DATA_FOLDER, labels),\n",
208 |       "    )\n",
209 |       "\n",
210 |       "    \n",
211 |       "tasks = []\n",
212 |       "n_features = 28 ** 2 # hardcoded for now\n",
213 |       "\n",
214 |       "for i in range(81): # 8100000 lines // 100000 lines per chunk:\n",
215 |       "    svmlight_chunk_name = get_svmlight_filename(i)\n",
216 |       "    data_chunk_name, label_chunk_name = get_numpy_filenames(i)\n",
217 |       "    tasks.append(lb_view.apply(parse_svmlight_chunk,\n",
218 |       "                               svmlight_chunk_name,\n",
219 |       "                               data_chunk_name,\n",
220 |       "                               label_chunk_name,\n",
221 |       "                               n_features))"
222 |      ],
223 |      "language": "python",
224 |      "metadata": {},
225 |      "outputs": [],
226 |      "prompt_number": 24
227 |     },
228 |     {
229 |      "cell_type": "code",
230 |      "collapsed": false,
231 |      "input": [
232 |       "sum(t.ready() for t in tasks), len(tasks)"
233 |      ],
234 |      "language": "python",
235 |      "metadata": {},
236 |      "outputs": [
237 |       {
238 |        "metadata": {},
239 |        "output_type": "pyout",
240 |        "prompt_number": 30,
241 |        "text": [
242 |         "(0, 81)"
243 |        ]
244 |       }
245 |      ],
246 |      "prompt_number": 30
247 |     },
248 |     {
249 |      "cell_type": "heading",
250 |      "level": 2,
251 |      "metadata": {},
252 |      "source": [
253 |       "Uploading the results to a cloud store"
254 |      ]
255 |     },
256 |     {
257 |      "cell_type": "code",
258 |      "collapsed": false,
259 |      "input": [
260 |       "CONFIGFILE_PATH = 'cloudstorage.ini'"
261 |      ],
262 |      "language": "python",
263 |      "metadata": {},
264 |      "outputs": [],
265 |      "prompt_number": 112
266 |     },
267 |     {
268 |      "cell_type": "markdown",
269 |      "metadata": {},
270 |      "source": [
271 |       "Let's use [Apache Libcloud](http://libcloud.apache.org) to upload the chunk objects to a permanent store for later usage in ephemeral VMs. We will store the credential in a configuration file named `cloudstorage.ini`. Here is the expected content for the Windows Azure Cloud:\n",
272 |       "\n",
273 |       "```\n",
274 |       "[account]\n",
275 |       "libcloud_provider = azure_blobs\n",
276 |       "account_name = myacount\n",
277 |       "account_secret = primarykey\n",
278 |       "```\n",
279 |       "\n",
280 |       "On Amazon S3, the config file would look like:\n",
281 |       "\n",
282 |       "```\n",
283 |       "[account]\n",
284 |       "libcloud_provider = s3\n",
285 |       "account_name = aws_key_id\n",
286 |       "account_secret = aws_secret_key\n",
287 |       "```\n",
288 |       "\n",
289 |       "Apache Libcloud supports many more [Cloud Object Store providers](https://ci.apache.org/projects/libcloud/docs/storage/supported_providers.html).\n",
290 |       "\n",
291 |       "The objects will be stored in a specific container. On some providers, the container name must be globally unique (such as is the case for bucket names on S3). On others like Azure, the container names are local to the cloud storage account. In case of conflict, just change the container name: "
292 |      ]
293 |     },
294 |     {
295 |      "cell_type": "code",
296 |      "collapsed": false,
297 |      "input": [
298 |       "CONTAINER_NAME = \"mnist8m\""
299 |      ],
300 |      "language": "python",
301 |      "metadata": {},
302 |      "outputs": [],
303 |      "prompt_number": 110
304 |     },
305 |     {
306 |      "cell_type": "markdown",
307 |      "metadata": {},
308 |      "source": [
309 |       "The following function parse the `cloudstorage.ini` file and build a Libcloud driver instance. This instance is not thread safe, hence we wrap the driver instanciation in a function to be reused in individual threads."
310 |      ]
311 |     },
312 |     {
313 |      "cell_type": "code",
314 |      "collapsed": false,
315 |      "input": [
316 |       "def build_driver(configfile_path=CONFIGFILE_PATH, section='account'):\n",
317 |       "    config = ConfigParser()\n",
318 |       "    config.read(configfile_path)\n",
319 |       "    provider_name = config.get(section, 'libcloud_provider')\n",
320 |       "    driver_type = get_driver(provider_name)\n",
321 |       "    account_name = config.get(section, 'account_name')\n",
322 |       "    account_secret = config.get(section, 'account_secret')\n",
323 |       "    return driver_type(account_name, account_secret)\n",
324 |       "\n",
325 |       "driver = build_driver()"
326 |      ],
327 |      "language": "python",
328 |      "metadata": {},
329 |      "outputs": [],
330 |      "prompt_number": 103
331 |     },
332 |     {
333 |      "cell_type": "markdown",
334 |      "metadata": {},
335 |      "source": [
336 |       "The following utility function checks that a container with a specific name exits on the Cloud Storage provider, otherwise it creates it:"
337 |      ]
338 |     },
339 |     {
340 |      "cell_type": "code",
341 |      "collapsed": false,
342 |      "input": [
343 |       "def get_or_create_container(driver, container_name=CONTAINER_NAME):\n",
344 |       "    try:\n",
345 |       "        return driver.get_container(container_name)\n",
346 |       "    except ContainerDoesNotExistError:\n",
347 |       "        return driver.create_container(container_name)\n",
348 |       "    \n",
349 |       "container = get_or_create_container(driver)"
350 |      ],
351 |      "language": "python",
352 |      "metadata": {},
353 |      "outputs": [],
354 |      "prompt_number": 104
355 |     },
356 |     {
357 |      "cell_type": "markdown",
358 |      "metadata": {},
359 |      "source": [
360 |       "We can now write a function that uploads invidual local files to a target object container. As this function will be called in parallel in various threads we instanciate a dedicated driver inside."
361 |      ]
362 |     },
363 |     {
364 |      "cell_type": "code",
365 |      "collapsed": false,
366 |      "input": [
367 |       "def upload_object(local_folder, object_name, container_name=CONTAINER_NAME, skip_if_exists=True):\n",
368 |       "    driver = build_driver()  # libcloud drivers are not thread-safe\n",
369 |       "    container = get_or_create_container(driver, container_name)\n",
370 |       "    filepath = os.path.join(local_folder, object_name)\n",
371 |       "    if skip_if_exists:\n",
372 |       "        try:\n",
373 |       "            # Check the size to deal with partially uploaded files\n",
374 |       "            ob =  container.get_object(object_name)\n",
375 |       "            if ob.size == os.stat(filepath).st_size:\n",
376 |       "                return ob\n",
377 |       "        except ObjectDoesNotExistError:\n",
378 |       "            pass\n",
379 |       "    return container.upload_object(filepath, object_name,\n",
380 |       "        extra={'content_type': 'application/octet-stream'})"
381 |      ],
382 |      "language": "python",
383 |      "metadata": {},
384 |      "outputs": [],
385 |      "prompt_number": 105
386 |     },
387 |     {
388 |      "cell_type": "markdown",
389 |      "metadata": {},
390 |      "source": [
391 |       "Finally let us upload all the chunks and labels from the MNIST8M dataset in parallel to speedup the upload. As IPython does not seem to be fully compatible with gevent monkeypatching we will use Python threads to upload data in parallel: "
392 |      ]
393 |     },
394 |     {
395 |      "cell_type": "code",
396 |      "collapsed": false,
397 |      "input": [
398 |       "n_workers = 10\n",
399 |       "filenames = os.listdir(NUMPY_DATA_FOLDER)\n",
400 |       "\n",
401 |       "tic = time()\n",
402 |       "with ThreadPoolExecutor(max_workers=n_workers) as e:\n",
403 |       "    for f in filenames:\n",
404 |       "        e.submit(upload_object, local_folder, f)\n",
405 |       "print(\"Uploaded {} files with {} workers in {:0.3f}s\".format(\n",
406 |       "      len(filenames), n_workers, time() - tic))"
407 |      ],
408 |      "language": "python",
409 |      "metadata": {},
410 |      "outputs": [
411 |       {
412 |        "output_type": "stream",
413 |        "stream": "stdout",
414 |        "text": [
415 |         "Uploaded 83 files with 10 workers in 281.750s\n"
416 |        ]
417 |       }
418 |      ],
419 |      "prompt_number": 106
420 |     }
421 |    ],
422 |    "metadata": {}
423 |   }
424 |  ]
425 | }


--------------------------------------------------------------------------------
/Non IID cross-validation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 2,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Impact of the dependency between samples on cross-validation test score estimates"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "code",
 20 |      "collapsed": false,
 21 |      "input": [
 22 |       "import numpy as np\n",
 23 |       "from sklearn.datasets import load_digits"
 24 |      ],
 25 |      "language": "python",
 26 |      "metadata": {},
 27 |      "outputs": [],
 28 |      "prompt_number": 1
 29 |     },
 30 |     {
 31 |      "cell_type": "code",
 32 |      "collapsed": false,
 33 |      "input": [
 34 |       "digits = load_digits()\n",
 35 |       "X, y = digits.data, digits.target"
 36 |      ],
 37 |      "language": "python",
 38 |      "metadata": {},
 39 |      "outputs": [],
 40 |      "prompt_number": 2
 41 |     },
 42 |     {
 43 |      "cell_type": "markdown",
 44 |      "metadata": {},
 45 |      "source": [
 46 |       "The digits dataset of scikit-learn is the test set of the [UCI optdigits dataset](http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/). Apparently consecutive samples are more likely to stem from the same writer on this dataset. Hence the samples are not independent and identically distributed (iid) as different writing styles grouped togethers effectively introduce a dependency. Unfortunately the exact per-sample authorship metadata has not be kept in the optdigits dataset.\n",
 47 |       "\n",
 48 |       "This is highlighted by the fact that shuffling the data significantly affects the test score estimated by K-Fold cross-validation. Let us build a model with non-optimal parameters to highlight the impact of dependent samples:"
 49 |      ]
 50 |     },
 51 |     {
 52 |      "cell_type": "code",
 53 |      "collapsed": false,
 54 |      "input": [
 55 |       "from sklearn.svm  import SVC\n",
 56 |       "\n",
 57 |       "model = SVC(C=10, gamma=0.005)"
 58 |      ],
 59 |      "language": "python",
 60 |      "metadata": {},
 61 |      "outputs": [],
 62 |      "prompt_number": 3
 63 |     },
 64 |     {
 65 |      "cell_type": "code",
 66 |      "collapsed": false,
 67 |      "input": [
 68 |       "from sklearn.cross_validation import cross_val_score\n",
 69 |       "\n",
 70 |       "def print_cv_score_summary(model, X, y, cv):\n",
 71 |       "    scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)\n",
 72 |       "    print(\"mean: {:3f}, stdev: {:3f}\".format(\n",
 73 |       "        np.mean(scores), np.std(scores)))"
 74 |      ],
 75 |      "language": "python",
 76 |      "metadata": {},
 77 |      "outputs": [],
 78 |      "prompt_number": 4
 79 |     },
 80 |     {
 81 |      "cell_type": "markdown",
 82 |      "metadata": {},
 83 |      "source": [
 84 |       "KFold does not shuffle the data by default hence takes the dependency structure of the dataset into account for small number of folds such as k=5:"
 85 |      ]
 86 |     },
 87 |     {
 88 |      "cell_type": "code",
 89 |      "collapsed": false,
 90 |      "input": [
 91 |       "from sklearn.cross_validation import KFold\n",
 92 |       "\n",
 93 |       "cv = KFold(len(y), 5)\n",
 94 |       "print_cv_score_summary(model, X, y, cv)"
 95 |      ],
 96 |      "language": "python",
 97 |      "metadata": {},
 98 |      "outputs": [
 99 |       {
100 |        "output_type": "stream",
101 |        "stream": "stdout",
102 |        "text": [
103 |         "mean: 0.901543, stdev: 0.037016\n"
104 |        ]
105 |       }
106 |      ],
107 |      "prompt_number": 5
108 |     },
109 |     {
110 |      "cell_type": "markdown",
111 |      "metadata": {},
112 |      "source": [
113 |       "If we shuffle the data, the estimated test score is much higher as we hide the dependency structure to the model hence we cannot detect the overfitting caused by the author writing styles:"
114 |      ]
115 |     },
116 |     {
117 |      "cell_type": "code",
118 |      "collapsed": false,
119 |      "input": [
120 |       "cv = KFold(len(y), 5, shuffle=True, random_state=0)\n",
121 |       "print_cv_score_summary(model, X, y, cv)"
122 |      ],
123 |      "language": "python",
124 |      "metadata": {},
125 |      "outputs": [
126 |       {
127 |        "output_type": "stream",
128 |        "stream": "stdout",
129 |        "text": [
130 |         "mean: 0.968836, stdev: 0.007350\n"
131 |        ]
132 |       }
133 |      ],
134 |      "prompt_number": 6
135 |     },
136 |     {
137 |      "cell_type": "code",
138 |      "collapsed": false,
139 |      "input": [
140 |       "cv = KFold(len(y), 5, shuffle=True, random_state=1)\n",
141 |       "print_cv_score_summary(model, X, y, cv)"
142 |      ],
143 |      "language": "python",
144 |      "metadata": {},
145 |      "outputs": [
146 |       {
147 |        "output_type": "stream",
148 |        "stream": "stdout",
149 |        "text": [
150 |         "mean: 0.967725, stdev: 0.004847\n"
151 |        ]
152 |       }
153 |      ],
154 |      "prompt_number": 7
155 |     },
156 |     {
157 |      "cell_type": "code",
158 |      "collapsed": false,
159 |      "input": [
160 |       "cv = KFold(len(y), 5, shuffle=True, random_state=2)\n",
161 |       "print_cv_score_summary(model, X, y, cv)"
162 |      ],
163 |      "language": "python",
164 |      "metadata": {},
165 |      "outputs": [
166 |       {
167 |        "output_type": "stream",
168 |        "stream": "stdout",
169 |        "text": [
170 |         "mean: 0.966622, stdev: 0.010217\n"
171 |        ]
172 |       }
173 |      ],
174 |      "prompt_number": 8
175 |     },
176 |     {
177 |      "cell_type": "markdown",
178 |      "metadata": {},
179 |      "source": [
180 |       "There is almost **7% discrepancy between the estimated score** probably caused by the dependency between samples.\n",
181 |       "\n",
182 |       "Those shuffled KFold cv scores are in-line with equivalent `ShuffleSplit`:"
183 |      ]
184 |     },
185 |     {
186 |      "cell_type": "code",
187 |      "collapsed": false,
188 |      "input": [
189 |       "from sklearn.cross_validation import ShuffleSplit\n",
190 |       "\n",
191 |       "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=0)\n",
192 |       "print_cv_score_summary(model, X, y, cv)"
193 |      ],
194 |      "language": "python",
195 |      "metadata": {},
196 |      "outputs": [
197 |       {
198 |        "output_type": "stream",
199 |        "stream": "stdout",
200 |        "text": [
201 |         "mean: 0.971667, stdev: 0.007115\n"
202 |        ]
203 |       }
204 |      ],
205 |      "prompt_number": 9
206 |     },
207 |     {
208 |      "cell_type": "code",
209 |      "collapsed": false,
210 |      "input": [
211 |       "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=1)\n",
212 |       "print_cv_score_summary(model, X, y, cv)"
213 |      ],
214 |      "language": "python",
215 |      "metadata": {},
216 |      "outputs": [
217 |       {
218 |        "output_type": "stream",
219 |        "stream": "stdout",
220 |        "text": [
221 |         "mean: 0.973333, stdev: 0.003333\n"
222 |        ]
223 |       }
224 |      ],
225 |      "prompt_number": 10
226 |     },
227 |     {
228 |      "cell_type": "code",
229 |      "collapsed": false,
230 |      "input": [
231 |       "cv = ShuffleSplit(len(y), n_iter=5, test_size=0.2, random_state=2)\n",
232 |       "print_cv_score_summary(model, X, y, cv)"
233 |      ],
234 |      "language": "python",
235 |      "metadata": {},
236 |      "outputs": [
237 |       {
238 |        "output_type": "stream",
239 |        "stream": "stdout",
240 |        "text": [
241 |         "mean: 0.958333, stdev: 0.008784\n"
242 |        ]
243 |       }
244 |      ],
245 |      "prompt_number": 11
246 |     },
247 |     {
248 |      "cell_type": "markdown",
249 |      "metadata": {},
250 |      "source": [
251 |       "Note that `StratifiedKFold` sorts the samples by classes prior to computing the folds hence breaks the dependency too (at least in scikit-learn 0.14):"
252 |      ]
253 |     },
254 |     {
255 |      "cell_type": "code",
256 |      "collapsed": false,
257 |      "input": [
258 |       "from sklearn.cross_validation import StratifiedKFold\n",
259 |       "\n",
260 |       "cv = StratifiedKFold(y, 5)\n",
261 |       "print_cv_score_summary(model, X, y, cv)"
262 |      ],
263 |      "language": "python",
264 |      "metadata": {},
265 |      "outputs": [
266 |       {
267 |        "output_type": "stream",
268 |        "stream": "stdout",
269 |        "text": [
270 |         "mean: 0.969404, stdev: 0.010674\n"
271 |        ]
272 |       }
273 |      ],
274 |      "prompt_number": 12
275 |     }
276 |    ],
277 |    "metadata": {}
278 |   }
279 |  ]
280 | }


--------------------------------------------------------------------------------
/Numba Parakeet Cython.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "Numba Parakeet Cython"
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "heading",
 12 |      "level": 1,
 13 |      "metadata": {},
 14 |      "source": [
 15 |       "Numba vs. Parakeet vs. Cython"
 16 |      ]
 17 |     },
 18 |     {
 19 |      "cell_type": "markdown",
 20 |      "metadata": {},
 21 |      "source": [
 22 |       "*This notebook is derived from a blog *\n",
 23 |       "[*post*](http://jakevdp.github.io/blog/2012/08/24/numba-vs-cython/)\n",
 24 |       "*by Jake Vanderplas on the blog*\n",
 25 |       "[*Pythonic Perambulations*](http://jakevdp.github.io) and updated by Olivier Grisel to add Parakeet."
 26 |      ]
 27 |     },
 28 |     {
 29 |      "cell_type": "code",
 30 |      "collapsed": false,
 31 |      "input": [
 32 |       "import numpy as np\n",
 33 |       "\n",
 34 |       "X = np.random.random((1000, 3))\n",
 35 |       "X_wide = np.random.random((1000, 100))"
 36 |      ],
 37 |      "language": "python",
 38 |      "metadata": {},
 39 |      "outputs": [],
 40 |      "prompt_number": 1
 41 |     },
 42 |     {
 43 |      "cell_type": "heading",
 44 |      "level": 2,
 45 |      "metadata": {},
 46 |      "source": [
 47 |       "Numpy Function With Broadcasting"
 48 |      ]
 49 |     },
 50 |     {
 51 |      "cell_type": "code",
 52 |      "collapsed": false,
 53 |      "input": [
 54 |       "def pairwise_numpy(X):\n",
 55 |       "    return np.sqrt(((X[:, None, :] - X) ** 2).sum(-1))\n",
 56 |       "%timeit pairwise_numpy(X)"
 57 |      ],
 58 |      "language": "python",
 59 |      "metadata": {},
 60 |      "outputs": [
 61 |       {
 62 |        "output_type": "stream",
 63 |        "stream": "stdout",
 64 |        "text": [
 65 |         "10 loops, best of 3: 64.7 ms per loop\n"
 66 |        ]
 67 |       }
 68 |      ],
 69 |      "prompt_number": 2
 70 |     },
 71 |     {
 72 |      "cell_type": "heading",
 73 |      "level": 2,
 74 |      "metadata": {},
 75 |      "source": [
 76 |       "Pure Python Function"
 77 |      ]
 78 |     },
 79 |     {
 80 |      "cell_type": "code",
 81 |      "collapsed": false,
 82 |      "input": [
 83 |       "def pairwise_python(X):\n",
 84 |       "    M = X.shape[0]\n",
 85 |       "    N = X.shape[1]\n",
 86 |       "    D = np.empty((M, M), dtype=np.float)\n",
 87 |       "    for i in range(M):\n",
 88 |       "        for j in range(M):\n",
 89 |       "            d = 0.0\n",
 90 |       "            for k in range(N):\n",
 91 |       "                tmp = X[i, k] - X[j, k]\n",
 92 |       "                d += tmp * tmp\n",
 93 |       "            D[i, j] = np.sqrt(d)\n",
 94 |       "    return D"
 95 |      ],
 96 |      "language": "python",
 97 |      "metadata": {},
 98 |      "outputs": [],
 99 |      "prompt_number": 3
100 |     },
101 |     {
102 |      "cell_type": "code",
103 |      "collapsed": false,
104 |      "input": [
105 |       "%timeit pairwise_python(X)"
106 |      ],
107 |      "language": "python",
108 |      "metadata": {},
109 |      "outputs": [
110 |       {
111 |        "output_type": "stream",
112 |        "stream": "stdout",
113 |        "text": [
114 |         "1 loops, best of 3: 9.51 s per loop\n"
115 |        ]
116 |       }
117 |      ],
118 |      "prompt_number": 4
119 |     },
120 |     {
121 |      "cell_type": "markdown",
122 |      "metadata": {},
123 |      "source": [
124 |       "Alternative python / numpy implementation closer to the parakeet example from the `examples` folder of its git repo to be fair."
125 |      ]
126 |     },
127 |     {
128 |      "cell_type": "code",
129 |      "collapsed": false,
130 |      "input": [
131 |       "def pairwise_python2(X):\n",
132 |       "    n_samples = X.shape[0]\n",
133 |       "    result = np.zeros((n_samples, n_samples), dtype=X.dtype)\n",
134 |       "    for i in xrange(X.shape[0]):\n",
135 |       "        for j in xrange(X.shape[0]):\n",
136 |       "            result[i, j] = np.sqrt(np.sum((X[i, :] - X[j, :]) ** 2))\n",
137 |       "    return result"
138 |      ],
139 |      "language": "python",
140 |      "metadata": {},
141 |      "outputs": [],
142 |      "prompt_number": 5
143 |     },
144 |     {
145 |      "cell_type": "code",
146 |      "collapsed": false,
147 |      "input": [
148 |       "%timeit pairwise_python2(X)"
149 |      ],
150 |      "language": "python",
151 |      "metadata": {},
152 |      "outputs": [
153 |       {
154 |        "output_type": "stream",
155 |        "stream": "stdout",
156 |        "text": [
157 |         "1 loops, best of 3: 18.2 s per loop\n"
158 |        ]
159 |       }
160 |      ],
161 |      "prompt_number": 6
162 |     },
163 |     {
164 |      "cell_type": "code",
165 |      "collapsed": false,
166 |      "input": [
167 |       "#np.allclose(pairwise_python(X), pairwise_python2(X))"
168 |      ],
169 |      "language": "python",
170 |      "metadata": {},
171 |      "outputs": [],
172 |      "prompt_number": 7
173 |     },
174 |     {
175 |      "cell_type": "heading",
176 |      "level": 2,
177 |      "metadata": {},
178 |      "source": [
179 |       "Numba Wrapper"
180 |      ]
181 |     },
182 |     {
183 |      "cell_type": "markdown",
184 |      "metadata": {},
185 |      "source": [
186 |       "Note: I did not use master as I get a `TypeError: 'numba.numbawrapper.NumbaCompiledWrapper' object is not callable` when calling it."
187 |      ]
188 |     },
189 |     {
190 |      "cell_type": "code",
191 |      "collapsed": false,
192 |      "input": [
193 |       "import numba\n",
194 |       "\n",
195 |       "numba.__version__"
196 |      ],
197 |      "language": "python",
198 |      "metadata": {},
199 |      "outputs": [
200 |       {
201 |        "metadata": {},
202 |        "output_type": "pyout",
203 |        "prompt_number": 8,
204 |        "text": [
205 |         "'0.9.0'"
206 |        ]
207 |       }
208 |      ],
209 |      "prompt_number": 8
210 |     },
211 |     {
212 |      "cell_type": "code",
213 |      "collapsed": false,
214 |      "input": [
215 |       "from numba import double\n",
216 |       "from numba.decorators import jit, autojit\n",
217 |       "\n",
218 |       "pairwise_numba = autojit(pairwise_python)\n",
219 |       "\n",
220 |       "%timeit pairwise_numba(X)"
221 |      ],
222 |      "language": "python",
223 |      "metadata": {},
224 |      "outputs": [
225 |       {
226 |        "output_type": "stream",
227 |        "stream": "stdout",
228 |        "text": [
229 |         "1 loops, best of 3: 6.72 ms per loop\n"
230 |        ]
231 |       }
232 |      ],
233 |      "prompt_number": 9
234 |     },
235 |     {
236 |      "cell_type": "code",
237 |      "collapsed": false,
238 |      "input": [
239 |       "%timeit pairwise_numba(X_wide)"
240 |      ],
241 |      "language": "python",
242 |      "metadata": {},
243 |      "outputs": [
244 |       {
245 |        "output_type": "stream",
246 |        "stream": "stdout",
247 |        "text": [
248 |         "10 loops, best of 3: 97.3 ms per loop\n"
249 |        ]
250 |       }
251 |      ],
252 |      "prompt_number": 10
253 |     },
254 |     {
255 |      "cell_type": "code",
256 |      "collapsed": false,
257 |      "input": [
258 |       "pairwise_numba2 = autojit(pairwise_python2)\n",
259 |       "\n",
260 |       "%timeit pairwise_numba2(X)"
261 |      ],
262 |      "language": "python",
263 |      "metadata": {},
264 |      "outputs": [
265 |       {
266 |        "output_type": "stream",
267 |        "stream": "stdout",
268 |        "text": [
269 |         "1 loops, best of 3: 13.9 s per loop"
270 |        ]
271 |       },
272 |       {
273 |        "output_type": "stream",
274 |        "stream": "stdout",
275 |        "text": [
276 |         "\n"
277 |        ]
278 |       }
279 |      ],
280 |      "prompt_number": 11
281 |     },
282 |     {
283 |      "cell_type": "heading",
284 |      "level": 2,
285 |      "metadata": {},
286 |      "source": [
287 |       "Parakeet Wrapper"
288 |      ]
289 |     },
290 |     {
291 |      "cell_type": "markdown",
292 |      "metadata": {},
293 |      "source": [
294 |       "Parakeet is installed from the master branch of the git repo on Jul. 3 2013"
295 |      ]
296 |     },
297 |     {
298 |      "cell_type": "code",
299 |      "collapsed": false,
300 |      "input": [
301 |       "from parakeet import jit\n",
302 |       "\n",
303 |       "pairwise_parakeet = jit(pairwise_python)\n",
304 |       "\n",
305 |       "%timeit pairwise_parakeet(X)"
306 |      ],
307 |      "language": "python",
308 |      "metadata": {},
309 |      "outputs": [
310 |       {
311 |        "output_type": "stream",
312 |        "stream": "stdout",
313 |        "text": [
314 |         "100 loops, best of 3: 12.3 ms per loop\n"
315 |        ]
316 |       }
317 |      ],
318 |      "prompt_number": 12
319 |     },
320 |     {
321 |      "cell_type": "code",
322 |      "collapsed": false,
323 |      "input": [
324 |       "%timeit pairwise_parakeet(X_wide)"
325 |      ],
326 |      "language": "python",
327 |      "metadata": {},
328 |      "outputs": [
329 |       {
330 |        "output_type": "stream",
331 |        "stream": "stdout",
332 |        "text": [
333 |         "10 loops, best of 3: 101 ms per loop\n"
334 |        ]
335 |       }
336 |      ],
337 |      "prompt_number": 13
338 |     },
339 |     {
340 |      "cell_type": "code",
341 |      "collapsed": false,
342 |      "input": [
343 |       "pairwise_parakeet2 = jit(pairwise_python2)\n",
344 |       "%timeit pairwise_parakeet2(X)"
345 |      ],
346 |      "language": "python",
347 |      "metadata": {},
348 |      "outputs": [
349 |       {
350 |        "output_type": "stream",
351 |        "stream": "stdout",
352 |        "text": [
353 |         "1 loops, best of 3: 13 ms per loop\n"
354 |        ]
355 |       }
356 |      ],
357 |      "prompt_number": 14
358 |     },
359 |     {
360 |      "cell_type": "code",
361 |      "collapsed": false,
362 |      "input": [
363 |       "%timeit pairwise_parakeet2(X_wide)"
364 |      ],
365 |      "language": "python",
366 |      "metadata": {},
367 |      "outputs": [
368 |       {
369 |        "output_type": "stream",
370 |        "stream": "stdout",
371 |        "text": [
372 |         "10 loops, best of 3: 103 ms per loop\n"
373 |        ]
374 |       }
375 |      ],
376 |      "prompt_number": 15
377 |     },
378 |     {
379 |      "cell_type": "code",
380 |      "collapsed": false,
381 |      "input": [
382 |       "np.allclose(pairwise_parakeet(X), pairwise_parakeet2(X))"
383 |      ],
384 |      "language": "python",
385 |      "metadata": {},
386 |      "outputs": [
387 |       {
388 |        "metadata": {},
389 |        "output_type": "pyout",
390 |        "prompt_number": 16,
391 |        "text": [
392 |         "True"
393 |        ]
394 |       }
395 |      ],
396 |      "prompt_number": 16
397 |     },
398 |     {
399 |      "cell_type": "code",
400 |      "collapsed": false,
401 |      "input": [
402 |       "np.allclose(pairwise_parakeet(X_wide), pairwise_parakeet2(X_wide))"
403 |      ],
404 |      "language": "python",
405 |      "metadata": {},
406 |      "outputs": [
407 |       {
408 |        "metadata": {},
409 |        "output_type": "pyout",
410 |        "prompt_number": 17,
411 |        "text": [
412 |         "True"
413 |        ]
414 |       }
415 |      ],
416 |      "prompt_number": 17
417 |     },
418 |     {
419 |      "cell_type": "heading",
420 |      "level": 2,
421 |      "metadata": {},
422 |      "source": [
423 |       "Optimized Cython Function"
424 |      ]
425 |     },
426 |     {
427 |      "cell_type": "code",
428 |      "collapsed": false,
429 |      "input": [
430 |       "!cython --version"
431 |      ],
432 |      "language": "python",
433 |      "metadata": {},
434 |      "outputs": [
435 |       {
436 |        "output_type": "stream",
437 |        "stream": "stdout",
438 |        "text": [
439 |         "Cython version 0.19.1\r\n"
440 |        ]
441 |       }
442 |      ],
443 |      "prompt_number": 18
444 |     },
445 |     {
446 |      "cell_type": "code",
447 |      "collapsed": false,
448 |      "input": [
449 |       "%load_ext cythonmagic"
450 |      ],
451 |      "language": "python",
452 |      "metadata": {},
453 |      "outputs": [],
454 |      "prompt_number": 19
455 |     },
456 |     {
457 |      "cell_type": "code",
458 |      "collapsed": false,
459 |      "input": [
460 |       "%%cython\n",
461 |       "\n",
462 |       "import numpy as np\n",
463 |       "cimport cython\n",
464 |       "from libc.math cimport sqrt\n",
465 |       "\n",
466 |       "@cython.boundscheck(False)\n",
467 |       "@cython.wraparound(False)\n",
468 |       "def pairwise_cython(double[:, ::1] X):\n",
469 |       "    cdef int M = X.shape[0]\n",
470 |       "    cdef int N = X.shape[1]\n",
471 |       "    cdef double tmp, d\n",
472 |       "    cdef double[:, ::1] D = np.empty((M, M), dtype=np.float64)\n",
473 |       "    for i in range(M):\n",
474 |       "        for j in range(M):\n",
475 |       "            d = 0.0\n",
476 |       "            for k in range(N):\n",
477 |       "                tmp = X[i, k] - X[j, k]\n",
478 |       "                d += tmp * tmp\n",
479 |       "            D[i, j] = sqrt(d)\n",
480 |       "    return np.asarray(D)"
481 |      ],
482 |      "language": "python",
483 |      "metadata": {},
484 |      "outputs": [],
485 |      "prompt_number": 20
486 |     },
487 |     {
488 |      "cell_type": "code",
489 |      "collapsed": false,
490 |      "input": [
491 |       "%timeit pairwise_cython(X)"
492 |      ],
493 |      "language": "python",
494 |      "metadata": {},
495 |      "outputs": [
496 |       {
497 |        "output_type": "stream",
498 |        "stream": "stdout",
499 |        "text": [
500 |         "100 loops, best of 3: 6.57 ms per loop\n"
501 |        ]
502 |       }
503 |      ],
504 |      "prompt_number": 21
505 |     },
506 |     {
507 |      "cell_type": "code",
508 |      "collapsed": false,
509 |      "input": [
510 |       "%timeit pairwise_cython(X_wide)"
511 |      ],
512 |      "language": "python",
513 |      "metadata": {},
514 |      "outputs": [
515 |       {
516 |        "output_type": "stream",
517 |        "stream": "stdout",
518 |        "text": [
519 |         "10 loops, best of 3: 95.5 ms per loop\n"
520 |        ]
521 |       }
522 |      ],
523 |      "prompt_number": 22
524 |     },
525 |     {
526 |      "cell_type": "heading",
527 |      "level": 2,
528 |      "metadata": {},
529 |      "source": [
530 |       "Fortran/F2Py"
531 |      ]
532 |     },
533 |     {
534 |      "cell_type": "code",
535 |      "collapsed": false,
536 |      "input": [
537 |       "%%file pairwise_fortran.f\n",
538 |       "\n",
539 |       "      subroutine pairwise_fortran(X,D,m,n)\n",
540 |       "          integer :: n,m\n",
541 |       "          double precision, intent(in) :: X(m,n)\n",
542 |       "          double precision, intent(out) :: D(m,m) \n",
543 |       "          integer :: i,j,k\n",
544 |       "          double precision :: r \n",
545 |       "          do i = 1,m \n",
546 |       "              do j = 1,m \n",
547 |       "                  r = 0\n",
548 |       "                  do k = 1,n \n",
549 |       "                      r = r + (X(i,k) - X(j,k)) * (X(i,k) - X(j,k)) \n",
550 |       "                  end do \n",
551 |       "                  D(i,j) = sqrt(r) \n",
552 |       "              end do \n",
553 |       "          end do \n",
554 |       "      end subroutine pairwise_fortran"
555 |      ],
556 |      "language": "python",
557 |      "metadata": {},
558 |      "outputs": [
559 |       {
560 |        "output_type": "stream",
561 |        "stream": "stdout",
562 |        "text": [
563 |         "Overwriting pairwise_fortran.f\n"
564 |        ]
565 |       }
566 |      ],
567 |      "prompt_number": 23
568 |     },
569 |     {
570 |      "cell_type": "code",
571 |      "collapsed": false,
572 |      "input": [
573 |       "# Compile the Fortran with f2py.\n",
574 |       "# We'll direct the output into /dev/null so it doesn't fill the screen\n",
575 |       "!f2py -c pairwise_fortran.f -m pairwise_fortran > /dev/null"
576 |      ],
577 |      "language": "python",
578 |      "metadata": {},
579 |      "outputs": [],
580 |      "prompt_number": 24
581 |     },
582 |     {
583 |      "cell_type": "code",
584 |      "collapsed": false,
585 |      "input": [
586 |       "from pairwise_fortran import pairwise_fortran\n",
587 |       "XF = np.asarray(X, order='F')\n",
588 |       "%timeit pairwise_fortran(XF)"
589 |      ],
590 |      "language": "python",
591 |      "metadata": {},
592 |      "outputs": [
593 |       {
594 |        "output_type": "stream",
595 |        "stream": "stdout",
596 |        "text": [
597 |         "100 loops, best of 3: 10.8 ms per loop\n"
598 |        ]
599 |       }
600 |      ],
601 |      "prompt_number": 25
602 |     },
603 |     {
604 |      "cell_type": "code",
605 |      "collapsed": false,
606 |      "input": [
607 |       "XF_wide = np.asarray(X_wide, order='F')\n",
608 |       "%timeit pairwise_fortran(XF_wide)"
609 |      ],
610 |      "language": "python",
611 |      "metadata": {},
612 |      "outputs": [
613 |       {
614 |        "output_type": "stream",
615 |        "stream": "stdout",
616 |        "text": [
617 |         "10 loops, best of 3: 111 ms per loop\n"
618 |        ]
619 |       }
620 |      ],
621 |      "prompt_number": 26
622 |     },
623 |     {
624 |      "cell_type": "heading",
625 |      "level": 2,
626 |      "metadata": {},
627 |      "source": [
628 |       "Scipy Pairwise Distances"
629 |      ]
630 |     },
631 |     {
632 |      "cell_type": "code",
633 |      "collapsed": false,
634 |      "input": [
635 |       "from scipy.spatial.distance import cdist\n",
636 |       "%timeit cdist(X, X)"
637 |      ],
638 |      "language": "python",
639 |      "metadata": {},
640 |      "outputs": [
641 |       {
642 |        "output_type": "stream",
643 |        "stream": "stdout",
644 |        "text": [
645 |         "100 loops, best of 3: 7.37 ms per loop\n"
646 |        ]
647 |       }
648 |      ],
649 |      "prompt_number": 27
650 |     },
651 |     {
652 |      "cell_type": "code",
653 |      "collapsed": false,
654 |      "input": [
655 |       "%timeit cdist(X_wide, X_wide)"
656 |      ],
657 |      "language": "python",
658 |      "metadata": {},
659 |      "outputs": [
660 |       {
661 |        "output_type": "stream",
662 |        "stream": "stdout",
663 |        "text": [
664 |         "10 loops, best of 3: 97.6 ms per loop\n"
665 |        ]
666 |       }
667 |      ],
668 |      "prompt_number": 28
669 |     },
670 |     {
671 |      "cell_type": "heading",
672 |      "level": 2,
673 |      "metadata": {},
674 |      "source": [
675 |       "Scikit-learn Pairwise Distances"
676 |      ]
677 |     },
678 |     {
679 |      "cell_type": "code",
680 |      "collapsed": false,
681 |      "input": [
682 |       "from sklearn.metrics import euclidean_distances\n",
683 |       "%timeit euclidean_distances(X, X)"
684 |      ],
685 |      "language": "python",
686 |      "metadata": {},
687 |      "outputs": [
688 |       {
689 |        "output_type": "stream",
690 |        "stream": "stdout",
691 |        "text": [
692 |         "100 loops, best of 3: 16.2 ms per loop\n"
693 |        ]
694 |       }
695 |      ],
696 |      "prompt_number": 29
697 |     },
698 |     {
699 |      "cell_type": "code",
700 |      "collapsed": false,
701 |      "input": [
702 |       "%timeit euclidean_distances(X_wide, X_wide)"
703 |      ],
704 |      "language": "python",
705 |      "metadata": {},
706 |      "outputs": [
707 |       {
708 |        "output_type": "stream",
709 |        "stream": "stdout",
710 |        "text": [
711 |         "10 loops, best of 3: 22.4 ms per loop\n"
712 |        ]
713 |       }
714 |      ],
715 |      "prompt_number": 30
716 |     },
717 |     {
718 |      "cell_type": "heading",
719 |      "level": 2,
720 |      "metadata": {},
721 |      "source": [
722 |       "Remarks and analysis"
723 |      ]
724 |     },
725 |     {
726 |      "cell_type": "markdown",
727 |      "metadata": {},
728 |      "source": [
729 |       "- This was run on a macbook air 2012 2Ghz Core i7 with the default system blas implementation (no MKL) for numpy\n",
730 |       "- Some of the timings vary quite a lot from Jake's original post.\n",
731 |       "- Numba seems to be able to go twice faster than Parakeet when `n_features` is small (e.g. 3 in Jake's original setting)\n",
732 |       "- Numba fails to optimize the python version that uses the numpy notation to compute distances on pairs of rows\n",
733 |       "- Maybe calling numba `nopython=True` would catch this but I did not understand how to add this option and make the first example work so I am not sure how to use that option correctly \n",
734 |       "- Parakeet is almost as fast as numba when `n_features` grows to more realistic sizes (e.g. 100)\n",
735 |       "- Parakeet can work as efficiently with the numpy row slice expression without any issue which allow for a more natural and concise syntax.\n",
736 |       "- Blas (as used in the scikit-learn implementation) is still a killer as soon as all the dimensions are not small (note: the scikit-learn implementation can be less numerically stable though)"
737 |      ]
738 |     }
739 |    ],
740 |    "metadata": {}
741 |   }
742 |  ]
743 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ogrisel's notebook
2 | 
3 | This is a bunch of IPython notebooks documents with mostly unfinished ML related experiments.
4 | 
5 | Some of them can be executed in a basic numpy / scipy / pandas / matplotlib / scikit-learn
6 | environment for instance using:
7 | 
8 | [![Binder](http://mybinder.org/badge.svg)](http://mybinder.org/repo/ogrisel/notebooks)
9 | 


--------------------------------------------------------------------------------
/Semi-supervised Extra Trees.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": ""
  4 |  },
  5 |  "nbformat": 3,
  6 |  "nbformat_minor": 0,
  7 |  "worksheets": [
  8 |   {
  9 |    "cells": [
 10 |     {
 11 |      "cell_type": "code",
 12 |      "collapsed": false,
 13 |      "input": [
 14 |       "%matplotlib inline\n",
 15 |       "import numpy as np\n",
 16 |       "import matplotlib.pyplot as plt\n",
 17 |       "\n",
 18 |       "from sklearn.ensemble import ExtraTreesClassifier\n",
 19 |       "from sklearn.datasets import fetch_covtype\n",
 20 |       "from sklearn.cross_validation import train_test_split\n",
 21 |       "from sklearn.cross_validation import cross_val_score\n",
 22 |       "from sklearn.utils import shuffle\n",
 23 |       "from sklearn.base import BaseEstimator\n",
 24 |       "from sklearn.base import clone"
 25 |      ],
 26 |      "language": "python",
 27 |      "metadata": {},
 28 |      "outputs": [],
 29 |      "prompt_number": 99
 30 |     },
 31 |     {
 32 |      "cell_type": "code",
 33 |      "collapsed": false,
 34 |      "input": [
 35 |       "covtype = fetch_covtype()\n",
 36 |       "X, y = covtype.data, covtype.target\n",
 37 |       "\n",
 38 |       "print(X.shape)\n",
 39 |       "print(np.unique(y))"
 40 |      ],
 41 |      "language": "python",
 42 |      "metadata": {},
 43 |      "outputs": [
 44 |       {
 45 |        "output_type": "stream",
 46 |        "stream": "stdout",
 47 |        "text": [
 48 |         "(581012, 54)\n",
 49 |         "[1 2 3 4 5 6 7]\n"
 50 |        ]
 51 |       }
 52 |      ],
 53 |      "prompt_number": 37
 54 |     },
 55 |     {
 56 |      "cell_type": "code",
 57 |      "collapsed": false,
 58 |      "input": [
 59 |       "X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.1)"
 60 |      ],
 61 |      "language": "python",
 62 |      "metadata": {},
 63 |      "outputs": [],
 64 |      "prompt_number": 13
 65 |     },
 66 |     {
 67 |      "cell_type": "code",
 68 |      "collapsed": false,
 69 |      "input": [
 70 |       "X_small, X_unlabeled, y_small, _ = train_test_split(X_dev, y_dev, train_size=10000)"
 71 |      ],
 72 |      "language": "python",
 73 |      "metadata": {},
 74 |      "outputs": [],
 75 |      "prompt_number": 30
 76 |     },
 77 |     {
 78 |      "cell_type": "code",
 79 |      "collapsed": false,
 80 |      "input": [
 81 |       "%%time\n",
 82 |       "\n",
 83 |       "etrees = ExtraTreesClassifier(n_estimators=80, n_jobs=4)\n",
 84 |       "scores = cross_val_score(etrees, X_small, y_small, cv=5)\n",
 85 |       "\n",
 86 |       "print(\"5-folds cv score: %0.3f+/-%0.3f\" % (np.mean(scores), np.std(scores)))"
 87 |      ],
 88 |      "language": "python",
 89 |      "metadata": {},
 90 |      "outputs": [
 91 |       {
 92 |        "output_type": "stream",
 93 |        "stream": "stdout",
 94 |        "text": [
 95 |         "5-folds cv score: 0.836+/-0.005\n",
 96 |         "CPU times: user 14.5 s, sys: 1.99 s, total: 16.5 s\n",
 97 |         "Wall time: 6.12 s\n"
 98 |        ]
 99 |       }
100 |      ],
101 |      "prompt_number": 36
102 |     },
103 |     {
104 |      "cell_type": "code",
105 |      "collapsed": false,
106 |      "input": [
107 |       "np.random.uniform(size=(3, 4))"
108 |      ],
109 |      "language": "python",
110 |      "metadata": {},
111 |      "outputs": [
112 |       {
113 |        "metadata": {},
114 |        "output_type": "pyout",
115 |        "prompt_number": 68,
116 |        "text": [
117 |         "array([[ 0.29906499,  0.61963946,  0.07382687,  0.51198165],\n",
118 |         "       [ 0.75008411,  0.32665691,  0.38846908,  0.26959562],\n",
119 |         "       [ 0.56896242,  0.1422773 ,  0.06123208,  0.77610519]])"
120 |        ]
121 |       }
122 |      ],
123 |      "prompt_number": 68
124 |     },
125 |     {
126 |      "cell_type": "code",
127 |      "collapsed": false,
128 |      "input": [
129 |       "def shuffle_columns(X, copy=True, seed=0):\n",
130 |       "    rng = np.random.RandomState(seed)\n",
131 |       "    if copy:\n",
132 |       "        X = X.copy()\n",
133 |       "    for i in range(X.shape[1]):\n",
134 |       "        rng.shuffle(X[:, i])\n",
135 |       "    return X\n",
136 |       "\n",
137 |       "\n",
138 |       "def corrupt(X, copy=True, rate=0.1, seed=0):\n",
139 |       "    rng = np.random.RandomState(seed)\n",
140 |       "    if copy:\n",
141 |       "        X = X.copy()\n",
142 |       "    X_shuffled = shuffle_columns(X, seed=0)\n",
143 |       "    mask = rng.uniform(size=X.shape) < rate\n",
144 |       "    X[mask] = X_shuffled[mask]\n",
145 |       "    return X\n",
146 |       "\n",
147 |       "\n",
148 |       "def make_normality_problem(X, seed=0):\n",
149 |       "    data = np.vstack([X, shuffle_columns(X, seed=seed)])\n",
150 |       "    target = np.zeros(X.shape[0] * 2, dtype=np.int)\n",
151 |       "    target[:X.shape[0]] = 1\n",
152 |       "    return shuffle(data, target, random_state=seed)\n",
153 |       "\n",
154 |       "\n",
155 |       "X_normal, y_normal = make_normality_problem(X_small)"
156 |      ],
157 |      "language": "python",
158 |      "metadata": {},
159 |      "outputs": [],
160 |      "prompt_number": 74
161 |     },
162 |     {
163 |      "cell_type": "code",
164 |      "collapsed": false,
165 |      "input": [
166 |       "%%time\n",
167 |       "\n",
168 |       "etrees_normality = ExtraTreesClassifier(n_estimators=80, n_jobs=4)\n",
169 |       "scores = cross_val_score(etrees_normality, X_normal, y_normal, cv=5)\n",
170 |       "\n",
171 |       "print(\"5-folds cv score: %0.3f+/-%0.3f\" % (np.mean(scores), np.std(scores)))"
172 |      ],
173 |      "language": "python",
174 |      "metadata": {},
175 |      "outputs": [
176 |       {
177 |        "output_type": "stream",
178 |        "stream": "stdout",
179 |        "text": [
180 |         "5-folds cv score: 0.978+/-0.001\n",
181 |         "CPU times: user 32 s, sys: 464 ms, total: 32.5 s\n",
182 |         "Wall time: 10 s\n"
183 |        ]
184 |       }
185 |      ],
186 |      "prompt_number": 75
187 |     },
188 |     {
189 |      "cell_type": "code",
190 |      "collapsed": false,
191 |      "input": [
192 |       "%%time\n",
193 |       "\n",
194 |       "_ = etrees_normality.fit(X_normal, y_normal)"
195 |      ],
196 |      "language": "python",
197 |      "metadata": {},
198 |      "outputs": [
199 |       {
200 |        "output_type": "stream",
201 |        "stream": "stdout",
202 |        "text": [
203 |         "CPU times: user 8.47 s, sys: 55.8 ms, total: 8.53 s\n",
204 |         "Wall time: 2.38 s\n"
205 |        ]
206 |       }
207 |      ],
208 |      "prompt_number": 76
209 |     },
210 |     {
211 |      "cell_type": "code",
212 |      "collapsed": false,
213 |      "input": [
214 |       "X_corrupted = corrupt(X_small, rate=0.2)\n",
215 |       "\n",
216 |       "predicted_normality = etrees_normality.predict_proba(X_corrupted)[:, 1]\n",
217 |       "_ = plt.hist(predicted_normality, bins=30)\n",
218 |       "\n",
219 |       "X_new_unlabeled = X_corrupted[predicted_normality > 0.5]\n",
220 |       "print(X_new_unlabeled.shape)"
221 |      ],
222 |      "language": "python",
223 |      "metadata": {},
224 |      "outputs": [
225 |       {
226 |        "metadata": {},
227 |        "output_type": "display_data",
228 |        "png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEACAYAAABbMHZzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAEwpJREFUeJzt3X+MHOVhh/Fni+0WWh/GdWVsnyu7xm4xIlFCsEkimkuT\nWhfU2lYrYWhDncSqKrlN0qpNgymC4w8oSdU0RJGpGmQwUbnKJcgyKVgYEquoCTghhBgO1z8Ut75L\nfSSBYrdqwJa3f7zv5cbH3e3M7N3s3r7PRxp29t13Z17Gt999552ZHZAkSZIkSZIkSZIkSZIkSdIM\ntQMYBg6OKf848DLwIvCZTPk24AhwCFiXKb8qLuMIcM90NVaSNDWuBd7B+eH/fmAfMDs+/6X4uBr4\nbixfBhwFavG1A8CaOP8Y0DttLZYkTYllnB/+u4DfGKfeNuDTmed7gWuARYS9hBE3AH8/tU2UJBXx\nMyXesxL4deAZYD/wrli+GBjM1BsEloxTPhTLJUktMqvkey4h9OqvJuwJ/MpUNkqSNL3KhP8g8Eic\n/xZwDlhA6NEvzdTrjnWH4ny2fGi8Ba9YsaJ+7NixEk2SpKQdAy4r8oYywz67GR3zXwXMAX4E7CGM\n588BlhOGhw4AJ4FTwFrCAeCb4jLe4tixY9Trdad6ndtvv73lbWiXyW3htnBbTD4BK4oGeaOefz/w\nPuAXgRPAbYTTP3cQDgK/CfxBrDtAGAIaAM4CW4F6fG0r8ABwIeFsn71FGypJmjqNwv/GCcpvmqD8\nrjiN9RxwZd5GSZKmV5lhH1Wgp6en1U1oG26LUW6LUW6L5tQaV6lUPY5fSZJyqtVqUDDP7flLUoIM\nf0lKkOEvSW2mq2s+tVot91SGY/6S1GZCoBfJQsf8JUk5GP6SlCDDX5ISZPhLUoIMf0lKkOEvSQky\n/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSlCj8N8BDBNu1j7WnwPngPmZsm3AEeAQsC5T\nflVcxhHgnrKNlSRNjUbhfz/QO075UuA3gf/IlK0GNsXHXmA7oz8xei+wBVgZp/GWKUmqSKPwfxp4\nbZzyzwF/OaZsA9APnAGOA0eBtcAiYC5wINZ7ENhYrrmSpKlQZsx/AzAIfG9M+eJYPmIQWDJO+VAs\nlyS1yKyC9S8CbiEM+Yxot7uBSZIaKBr+K4BlwAvxeTfwHGF4Z4hwLIDMa4OxvHtM+dBEK+jr6/vp\nfE9PDz09PQWbKEmdbn+cysvTa18GPApcOc5r3yecyfMq4UDvQ8AawrDOk8BlhBtRPgt8gjDu/y/A\nF4C94yzPe/hKSl473MO3H/gGsAo4AXx0zOvZ1g0Au+Lj48DWzOtbgfsIp3oeZfzglyRVpN3G6+35\nS0peO/T8JUkdyPCXpAQZ/pKUIMNfkhJk+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlCDD\nX5ISVPT3/Kfd448/nqve1VdfzYIFC6a5NZLUmdruVz0vvrjxvd1/8pPD3HbbFm655ZYKmiRJ1ari\nVz3bruf/+uuNe/612q2cO3eugtZIUmdyzF+SEmT4S1KCDH9JSpDhL0kJahT+O4Bh4GCm7G+Al4EX\ngEeAizOvbSPcpP0QsC5TflVcxhHgnuaaLElqVqPwvx8Ye+7lE8AVwNuBw4TAB1gNbIqPvcB2Rk89\nuhfYAqyMU+PzOSVJ06ZR+D8NvDambB8wcp7ls0B3nN8A9ANngOPAUWAtsAiYCxyI9R4ENjbTaElS\nc5od8/8Y8FicXwwMZl4bBJaMUz4UyyVJLdLMRV5/BbwJPDRFbYn6MvM9cZIkjdofp/LKhv9HgOuA\nD2TKhoClmefdhB7/EKNDQyPlQxMvuq9kkyQpFT2c3zG+o/ASygz79AKfIozx/yRTvge4AZgDLCcc\n2D0AnAROEcb/a8BNwO4S65UkTZFGPf9+4H3AAuAEcDvh7J45hAO/AN8EtgIDwK74eDaWjfwy0Vbg\nAeBCwjGCvVP1PyBJKq5R+N84TtmOSerfFaexngOuzNsoSdL08gpfSUqQ4S9JCTL8JSlBhr8kJcjw\nl6QEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9J\nSpDhL0kJMvwlKUGNwn8HMAwczJTNJ9y8/TDwBDAv89o24AhwCFiXKb8qLuMIcE9zTZYkNatR+N8P\n9I4pu5kQ/quAp+JzgNXApvjYC2wHavG1e4EtwMo4jV2mJKlCjcL/aeC1MWXrgZ1xfiewMc5vAPqB\nM8Bx4CiwFlgEzAUOxHoPZt4jSWqBMmP+CwlDQcTHhXF+MTCYqTcILBmnfCiWS5JaZFaT76/HaQr1\nZeZ74iRJGrU/TuWVCf9h4FLgJGFI55VYPgQszdTrJvT4h+J8tnxo4sX3lWiSJKWkh/M7xncUXkKZ\nYZ89wOY4vxnYnSm/AZgDLCcc2D1A+JI4RRj/rwE3Zd4jSWqBRj3/fuB9wALgBHAbcDewi3D2znHg\n+lh3IJYPAGeBrYwOCW0FHgAuBB4D9k5R+yVJJTQK/xsnKP/gBOV3xWms54Ar8zZKkjS9vMJXkhJk\n+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky/CUpQYa/\nJCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSlAz4b8NeAk4CDwE/CwwH9gHHAaeAOaNqX8EOASsa2K9\nkqQmlQ3/ZcAfAu8k3Jj9AuAG4GZC+K8CnorPAVYDm+JjL7C9iXVLkppUNoBPAWeAi4BZ8fEHwHpg\nZ6yzE9gY5zcA/fE9x4GjwJqS65YkNals+L8K/C3wn4TQ/29Cj38hMBzrDMfnAIuBwcz7B4ElJdct\nSWrSrJLvWwH8KWH453Xgn4EPj6lTj9NEJnitLzPfEydJ0qj9cSqvbPi/C/gG8OP4/BHg3cBJ4NL4\nuAh4Jb4+BCzNvL87lo2jr2STJCkVPZzfMb6j8BLKDvscAq4BLgRqwAeBAeBRYHOssxnYHef3EA4I\nzwGWAyuBAyXXLUlqUtme/wvAg8C3gXPAd4B/AOYCu4AthAO718f6A7F8ADgLbGXyISFJ0jQqG/4A\nn41T1quEvYDx3BUnSVKLea69JCWo1uoGjFHPMxpUq93K7Nmf5803/zfXQufOvYRTp15ttm2SVIla\nrUaxkfHaT/+TVzPDPi0Vgj/fxjl9ut2+4ySptRz2kaQEGf6SlCDDX5ISZPhLUoIMf0lKkOEvSQky\n/CUpQYa/JCXI8JekBBn+kpQgw1+SEmT4S1KCDH9JSpDhL0kJMvwlKUGGvyQlqJnwnwc8DLxMuDH7\nWmA+sA84DDwR64zYBhwBDgHrmlivJKlJzYT/PcBjwOXA2wihfjMh/FcBT8XnAKuBTfGxF9je5Lol\nSU0oG8AXA9cCO+Lzs8DrwHpgZyzbCWyM8xuAfuAMcBw4CqwpuW5JUpPKhv9y4IfA/cB3gC8BPw8s\nBIZjneH4HGAxMJh5/yCwpOS6JUlNKnsD91nAO4E/Ab4FfJ7RIZ4RdSa/w/oEr/Vl5nviJEkatT9O\n5ZUN/8E4fSs+f5hwQPckcGl8XAS8El8fApZm3t8dy8bRV7JJkpSKHs7vGN9ReAllh31OAicIB3YB\nPgi8BDwKbI5lm4HdcX4PcAMwhzBktBI4UHLdkqQmle35A3wc+EdCoB8DPgpcAOwCthAO7F4f6w7E\n8gHCweGtTD4kJEmaRrVWN2CMep7vhFrtVur1O8n//VGjXve7RtLMUKvVKNY/rv30P3l5rr0kJcjw\nl6QEGf6SlCDDX5Iq0NU1n1qtlmuqQjNn+0iScjp9+jWKnKQy3ez5S1KCDH9JSpDhL0kJMvwlKUGG\nvyQlyPCXpAQZ/pKUIMN/jCIXYnR1zW91cyWpFC/yGqPIhRinT7fbj6JKUj72/CUpQYa/JCXI8Jek\nBBn+kpQgw1+SEtRs+F8APA88Gp/PB/YBh4EngHmZutuAI8AhYF2T65UkNaHZ8P8kMMDouZE3E8J/\nFfBUfA6wGtgUH3uB7VOwbklSSc0EcDdwHXAfo3ceWA/sjPM7gY1xfgPQD5wBjgNHgTVNrFuS1IRm\nwv/vgE8B5zJlC4HhOD8cnwMsBgYz9QaBJU2sW5Jart1uzVhE2St8fwt4hTDe3zNBnTqTXyo7wWt9\nmfmeSRYvSa3Vulsz7o9TeWXD/z2EIZ7rgJ8DuoAvE3r7lwIngUWELwiAIWBp5v3dsWwcfSWbJEmp\n6OH8jvEdhZdQdtjnFkKYLwduAL4G3ATsATbHOpuB3XF+T6w3J75nJXCg5LolSU2aqh92G9nvuRvY\nBWwhHNi9PpYPxPIB4Cywlfz7SpKkKdZuRyHqeb4TarVbqdfvpMhYW72er244MDP1y5XUeYrmxfTU\nHalfLM89174ps3If6ff3/yW1E3/PvylnKfLt7O//S2oX9vwlKUGJ9PxnteVFFpLUKomEf5HhGb8k\nJHU+h30kKUGGvyQlyPCXpAQZ/pKUIMNfkhJk+EuakYr8lr5X179VIqd6Suo0RX5L//Tp2bmv9Zk7\n9xJOnXq1iZbNDIa/pATkv9YnlZ9hcdinTblLK2k62fNvU8V2adPoqUiaOvb8JU2bInuw7sVWy56/\npGlT7Abn7sVWyZ6/JCWobPgvBb4OvAS8CHwils8H9gGHgSeAeZn3bAOOAIeAdSXXq3Hlv6OYu9WS\noHz4nwH+DLgCuAb4Y+By4GZC+K8CnorPAVYDm+JjL7C9iXXrLUZOY2s8hd1wSRPL35maycoG8Eng\nu3H+f4CXgSXAemBnLN8JbIzzG4B+wpfGceAosKbkuiVpGuXvTM1kU9H7Xga8A3gWWAgMx/Lh+Bxg\nMTCYec8g4csiMWn0KNQevFZEk2n2bJ9fAL4CfBI4Pea1Rl+NM/trsxTvKKbqeK2IJtNM+M8mBP+X\ngd2xbBi4lDAstAh4JZYPEQ4Sj+iOZePoy8z3xEmt0NU1P/cxglR+D0XF/i40XfbHqbyyX/c1wpj+\njwkHfkd8NpZ9hnCwd158XA08RBjnXwI8CVzGW7sl9Tw9lVrtVur1O8d5+2TNbXXddmlHjXo9X90w\n/JR3ubMJezb5+GVRTvHgbfXfRdHPSLG/o3b4PLW+7kj9Ynletuf/XuDDwPeA52PZNuBuYBewhXBg\n9/r42kAsHyD8y24lyWGfTlZkSKvYMIN7IKOKXTQ1E4dyHBqtSrttPXv+FdRtjx7e9LUj73KLKNbj\nnk04sS2fIl9Y0/dv0j5/F+3wGZlZdUfqV9Pzl5JSvMftTxqovXmhlSQlyPCXpAQ57KMOM8vb9ZWS\nf7upMxj+yen0D7m36yvHs2xSY/gnxw+5JMNfagOdvjemdmT4K2HtErrujal6hr8SZugqXYa/WqRd\net1Smgx/tYi9bqmVvMhLkhJk+EtSggx/SUqQ4S9JCTL8JSlBhr8kJcjwl6QEGf6SlKCqw78XOAQc\nAT5d8bolSVGV4X8B8EXCF8Bq4Ebg8grXL0mKqgz/NcBR4DhwBvgnYEOF65ckRVWG/xLgROb5YCyT\nJFWsyh92y/UrXl1dv92wzhtvHOKNN5pujyQlq8rwHwKWZp4vJfT+s46dOvXVFfkXWeTXHtuhbru0\nox3qtks72qFuu7SjHeq2SztmWl2OFalctVmEBi4D5gDfxQO+kpSEDwH/Tjjwu63FbZEkSZI03fJc\n7PWF+PoLwDsqalcrNNoWv0/YBt8D/g14W3VNq1TeCwCvJtwG7HeqaFSL5NkWPcDzwIvA/kpa1RqN\ntsUCYC9hGPlF4COVtax6O4Bh4OAkddo6Ny8gDPssA2Yz/tj/dcBjcX4t8ExVjatYnm3xbuDiON9L\nZ26LPNthpN7XgK8Cv1tV4yqWZ1vMA14CuuPzBVU1rmJ5tkUf8NdxfgHwYzr39rTXEgJ9ovAvlJut\n+G2fPBd7rQd2xvlnCX/sCytqX5XybItvAq/H+WcZ/cB3krwXAH4ceBj4YWUtq16ebfF7wFcYPVvu\nR1U1rmJ5tsV/AV1xvosQ/mcral/VngZem+T1QrnZivDPc7HXeHU6MfSKXvi2hdFv9k6S929iA3Bv\nfJ737u8zTZ5tsRKYD3wd+DZwUzVNq1yebfEl4ArgB4Shjk9W07S2VCg3W7F7lPdDO/Yk1078sBf5\nf3o/8DHgvdPUllbKsx0+D9wc69YofvL4TJFnW8wG3gl8ALiIsHf4DGGst5Pk2Ra3EIaDeoAVwD7g\n7cDp6WtWW8udm60I/zwXe42t0x3LOk2ebQHhIO+XCGP+k+32zVR5tsNVhN1+CGO7HyIMBeyZ9tZV\nK8+2OEEY6vm/OP0rIfA6LfzzbIv3AHfG+WPA94FfJewRpabtczPPxV7ZAxfX0JkHOSHftvhlwrjn\nNZW2rFpFLwC8n8492yfPtvg14EnCAdGLCAcAV1fXxMrk2RafA26P8wsJXw7zK2pfKywj3wHfts3N\n8S72+qM4jfhifP0Fwi5up2q0Le4jHMR6Pk4Hqm5gRfL8TYzo5PCHfNviLwhn/BwEPlFp66rVaFss\nAB4l5MRBwsHwTtVPOLbxJmHv72Okm5uSJEmSJEmSJEmSJEmSJEmSJEmSNLP9P5m/8m5+bQgcAAAA\nAElFTkSuQmCC\n",
229 |        "text": [
230 |         "<matplotlib.figure.Figure at 0x10f7a2310>"
231 |        ]
232 |       }
233 |      ],
234 |      "prompt_number": 90
235 |     },
236 |     {
237 |      "cell_type": "code",
238 |      "collapsed": false,
239 |      "input": [
240 |       "predicted_normality = etrees_normality.predict_proba(X_unlabeled)[:, 1]\n",
241 |       "_ = plt.hist(predicted_normality, bins=30)"
242 |      ],
243 |      "language": "python",
244 |      "metadata": {},
245 |      "outputs": [
246 |       {
247 |        "metadata": {},
248 |        "output_type": "display_data",
249 |        "png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEACAYAAABPiSrXAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFPVJREFUeJzt3VuMVdd9x/HvsTEEN1w8psJcBowc3Ia0qR0acJu0OREy\nHvcBcIsMaYtpgypLtI7VqlJNHswQt039kDhEkXmIibm0JaCgGtxQLgGjRFUxToRtbEK5yEjMYHCK\nzSVKL6CcPqz/MJvJMbPmds6Z8fcjbfY6/32ZdXac/WPvtTcDkiRJkiRJkiRJkiRJkiRJkvrgQ8DL\nwKvAEeDLUW8F2oBDMT1Y2GYFcBw4Cswt1GcCh2PZ6kJ9BLA56geAqYVlS4FjMT3SD99HkjTAbo35\nMNJJ/dPASuCvqqw7gxQwtwB3AieAUiw7CMyK9g6gJdrLgWejvQj4drSbgJPA2Jg62pKkOrkpY52f\nxXw4cDPwXnwuVVl3PrAJuAKcIoXGbGACMIoUHAAbgAXRngesj/ZWYE60HwB2Axdi2kNn0EiS6iAn\nNG4iXT2cA14C3oz6Y8BrwFo6rwAmkm5bdWgDJlWpt0edmJ+O9lXgInD7DfYlSaqTnND4OXAPMBn4\nXaAMrAGmRf1t4CsD1D9JUgMZ1oN1LwLfBX4T2F+oPwe8GO12oLmwbDLpCqE92l3rHdtMAc5Ef8YA\n56NeLmzTDOzr2qm77rqrcvLkyR58DUkSaZz4Iz3dqLsrjXF03noaCdxPelrqjsI6D5GeigLYDiwm\njX9MA6aTxjHOApdI4xslYAmwrbDN0mgvBPZGezfp6auxwG3xs3d17eDJkyepVCpOlQorV66sex8a\nZfJYeCw8FjeegLu6Of9X1d2VxgTSIPVNMW2Mk/oG0q2pCvAW8GisfwTYEvOrpCejKrFsObCOFD47\ngJ1RXxv7PU66wlgc9XeBp4BX4vMq0oC4JKlOuguNw8AnqtRv9M7E38fU1Y+AX69S/1/g4ffZ1/Mx\nSZIaQM5AuAaJcrlc7y40DI9FJ49FJ49F31V712KwqcT9OUlSplKpBL3IAK80JEnZDA1JUjZDQ5KU\nzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KU\nzdCQpCFi9OgmSqVS1tRb/hImSRoiUhjkng/9JUySpAFmaEiSshkakqRshoYkKVt3ofEh4GXgVeAI\n8OWoNwF7gGPAbmBsYZsVwHHgKDC3UJ8JHI5lqwv1EcDmqB8AphaWLY2fcQx4JPM7SZIGSHeh8T/A\nZ4F7gI9H+9PAE6TQuBvYG58BZgCLYt4CPEvn6PwaYBkwPaaWqC8DzkftGeDpqDcBTwKzYlrJ9eEk\nSaqxnNtTP4v5cOBm4D1gHrA+6uuBBdGeD2wCrgCngBPAbGACMAo4GOttKGxT3NdWYE60HyBdxVyI\naQ+dQSNJqoOc0LiJdHvqHPAS8CYwPj4T8/HRngi0FbZtAyZVqbdHnZifjvZV4CJw+w32JUmqk2EZ\n6/ycdHtqDLCLdIuqqEL+2yQDorW19Vq7XC5TLpfr1hdJakz7Y+qbnNDocBH4LmlA+xxwB3CWdOvp\nnVinHWgubDOZdIXQHu2u9Y5tpgBnoj9jSGMc7UC5sE0zsK9ax4qhIUmqpsz1p9RVvdpLd7enxtE5\n+DwSuB84BGwnPdlEzF+I9nZgMWn8YxppcPsgKVwukcY3SsASYFthm459LSQNrEMaz5gbP/+2+Nm7\nevj9JEn9qLsrjQmkQeqbYtpIOqkfAraQnnw6BTwc6x+J+hHS+MRyOm9dLQfWkcJnB7Az6mtjv8dJ\nVxiLo/4u8BTwSnxeRRoQlyTVif9goSQNEf6DhZKkhmJoSJKyGRqSpGyGhiQpm6EhScpmaEiSshka\nkqRshoYkKZuhIUnKZmhIkrIZGpKkbIaGJCmboSFJymZoSJKyGRqSpGyGhiQpm6EhScpmaEiSshka\nkqRshoYkKZuhIUnKZmhIkrIZGpKkbN2FRjPwEvAm8Abwhai3Am3AoZgeLGyzAjgOHAXmFuozgcOx\nbHWhPgLYHPUDwNTCsqXAsZgeyftKkqSBUupm+R0xvQp8GPgRsAB4GLgMfLXL+jOAfwY+CUwCvgdM\nByrAQeAvYr4D+DqwE1gO/FrMFwEPAYuBJuAVUtgQP3smcKHLz6xUKpXMrytJQ1epVCKdbrPWvvZH\nT3R3pXGWFBgAPwV+TAqD9/th84FNwBXgFHACmA1MAEaRAgNgAyl8AOYB66O9FZgT7QeA3aSQuADs\nAVq6/0qSpIHSkzGNO4F7SbeQAB4DXgPWAmOjNpF026pDGylkutbb6QyfScDpaF8FLgK332BfkqQ6\nGZa53oeB7wCPk6441gBfimVPAV8BlvV77zK1trZea5fLZcrlcr26IkkNan9MfZMTGreQbhv9I/BC\n1N4pLH8OeDHa7aTB8w6TSVcI7dHuWu/YZgpwJvozBjgf9XJhm2ZgX7UOFkNDklRNmetPqat6tZfu\nbk+VSLefjgBfK9QnFNoPkZ6KAthOGsQeDkwjDYIfJI2NXCKNb5SAJcC2wjZLo70Q2Bvt3aSnr8YC\ntwH3A7uyv5kkqd91d6XxKeCPgddJj9YCfBH4HHAPaZj+LeDRWHYE2BLzq6QnojqG8pcD64CRpKen\ndkZ9LbCR9MjteVLoALxLuvX1SnxexS8+OSVJqqEeP27VgHzkVpJojEduJUm6xtCQJGUzNCRJ2QwN\nSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2QwN\nSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZuguNZuAl4E3gDeALUW8C9gDHgN3A\n2MI2K4DjwFFgbqE+Ezgcy1YX6iOAzVE/AEwtLFsaP+MY8Ejmd5IkDZDuQuMK8JfAx4D7gD8HPgo8\nQQqNu4G98RlgBrAo5i3As0Aplq0BlgHTY2qJ+jLgfNSeAZ6OehPwJDArppVcH06SpBrrLjTOAq9G\n+6fAj4FJwDxgfdTXAwuiPR/YRAqbU8AJYDYwARgFHIz1NhS2Ke5rKzAn2g+QrmIuxLSHzqCRJNVB\nT8Y07gTuBV4GxgPnon4uPgNMBNoK27SRQqZrvT3qxPx0tK8CF4Hbb7AvSVKdDMtc78Okq4DHgctd\nllViqpvW1tZr7XK5TLlcrltfJKkx7Y+pb3JC4xZSYGwEXojaOeAO0u2rCcA7UW8nDZ53mEy6QmiP\ndtd6xzZTgDPRnzGkMY52oFzYphnYV62DxdCQJFVT5vpT6qpe7aW721MlYC1wBPhaob6d9GQTMX+h\nUF8MDAemkQa3D5LC5RJpfKMELAG2VdnXQtLAOqTxjLmkwe/bgPuBXT35cpKk/lXqZvmnge8Dr9N5\nC2oFKQi2kK4QTgEPkwarAb4IfJ40PvE4nSf6mcA6YCSwg87Hd0eQrmLuJV1hLI59Avxp7A/gb+kc\nMC+qVCp1vTsmSQ2hVCqRP1pQuvZHj35GTzdoQIaGJFGb0PCNcElSNkNDkpTN0JAkZTM0JEnZDA1J\nUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQkSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1J\nUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlywmNbwHngMOFWivQBhyK6cHCshXAceAoMLdQ\nnxn7OA6sLtRHAJujfgCYWli2FDgW0yMZfZUkDaCc0HgeaOlSqwBfBe6N6d+iPgNYFPMW4FmgFMvW\nAMuA6TF17HMZcD5qzwBPR70JeBKYFdNKYGz2N5Mk9buc0PgB8F6VeqlKbT6wCbgCnAJOALOBCcAo\n4GCstwFYEO15wPpobwXmRPsBYDdwIaY9/GJ4SZJqqC9jGo8BrwFr6bwCmEi6bdWhDZhUpd4edWJ+\nOtpXgYvA7TfYlySpTob1crs1wJei/RTwFdJtprpobW291i6Xy5TL5Xp1RZIa1P6Y+qa3ofFOof0c\n8GK024HmwrLJpCuE9mh3rXdsMwU4E/0ZQxrjaAfKhW2agX3VOlMMDUlSNWWuP6Wu6tVeent7akKh\n/RCdT1ZtBxYDw4FppMHtg8BZ4BJpfKMELAG2FbZZGu2FwN5o7yY9fTUWuA24H9jVy/5KkvpBzpXG\nJuAzwDjS2MNKUlzdQ3qK6i3g0Vj3CLAl5leB5bEO0V4HjAR2ADujvhbYSHrk9jwpdADeJd36eiU+\nryINiEuS6qTaE1CDTaVSqXS/liQNcaVSic6/p3e79rU/esI3wiVJ2QwNSVI2Q0OSlM3QkCRlMzQk\nSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2QwNSVI2Q0OSlM3QkCRlMzQk\nSdkMDUlSNkNDkpTN0JAkZTM0JEnZDA1JUjZDQ5KULSc0vgWcAw4Xak3AHuAYsBsYW1i2AjgOHAXm\nFuozYx/HgdWF+ghgc9QPAFMLy5bGzzgGPJLRV0nSAMoJjeeBli61J0ihcTewNz4DzAAWxbwFeBYo\nxbI1wDJgekwd+1wGnI/aM8DTUW8CngRmxbSS68NJklRjOaHxA+C9LrV5wPporwcWRHs+sAm4ApwC\nTgCzgQnAKOBgrLehsE1xX1uBOdF+gHQVcyGmPfxieEmSaqi3YxrjSbesiPn4aE8E2grrtQGTqtTb\no07MT0f7KnARuP0G+5Ik1cmwfthHJaa6aW1tvdYul8uUy+W69UWSGtP+mPqmt6FxDrgDOEu69fRO\n1NuB5sJ6k0lXCO3R7lrv2GYKcCb6M4Y0xtEOlAvbNAP7qnWmGBqSpGrKXH9KXdWrvfT29tR20pNN\nxPyFQn0xMByYRhrcPkgKl0uk8Y0SsATYVmVfC0kD65DGM+aSBr9vA+4HdvWyv5KkfpBzpbEJ+Aww\njjT28CTwD8AW0pNPp4CHY90jUT9CGp9YTuetq+XAOmAksAPYGfW1wEbSI7fnSaED8C7wFPBKfF5F\nGhCXJNVJqftVGl6lUqnrkIokNYRSqUT+EHPp2h894RvhkqRshoYkKZuhIUnKZmhIkrIZGpKkbIaG\nJCmboSFJymZoSJKyGRqSpGyGhiQpm6EhScpmaEiSshkakqRshoYkNbjRo5solUrdTrXgP40uSQ0u\n/588959GlyQ1EENDkpTN0JAkZTM0JEnZDA1JUjZDQ5KUzdCQJGUzNCRJ2foaGqeA14FDwMGoNQF7\ngGPAbmBsYf0VwHHgKDC3UJ8JHI5lqwv1EcDmqB8Apvaxv5KkPuhraFSAMnAvMCtqT5BC425gb3wG\nmAEsinkL8CydbyOuAZYB02Nqifoy4HzUngGe7mN/JUl90B+3p7q+hj4PWB/t9cCCaM8HNgFXSFco\nJ4DZwARgFJ1XKhsK2xT3tRWY0w/9laSG0Ej/plSu/rjS+B7wQ+DPojYeOBftc/EZYCLQVti2DZhU\npd4edWJ+OtpXgYuk21+SNOhdvvwe6TTa3dQ4hvVx+08BbwO/TLoldbTL8pp849bW1mvtcrlMuVwe\n6B8pSYPM/pj6pq+h8XbMfwL8C2lc4xxwB3CWdOvpnVinHWgubDuZdIXRHu2u9Y5tpgBnoq9jgHe7\ndqIYGpKkasoxdVjVq7305fbUraSxCIBfIj0NdRjYDiyN+lLghWhvBxYDw4FppMHtg6RwuUQa3ygB\nS4BthW069rWQNLAuSaqTvlxpjCddXXTs559Ij9j+ENhCevLpFPBwrHMk6kdI4xPL6bx1tRxYB4wE\ndgA7o74W2Eh65PY8KXQkSXXSWMPyveMvYZI0KPX/L1fylzBJkhqIoSFJymZoSJKyGRqSpGyGhiQp\nm6EhScpmaEiSshkakqRshoYkKZuhIUnKZmhIkrIZGpKkbIaGJCmboSFJymZoSJKy9fXXvUrSB8Lo\n0U1cvvxexpq3AFcGujt14y9hkqQM/f8Lk3qyrr+ESZI0CBkakqRshoYkKZuhIUnKZmhIkrINhtBo\nAY4Cx4G/qXNfJA0So0c3USqVMqbhWespafTQuBn4Bik4ZgCfAz5a1x41sP3799e7Cw3DY9Hpg3os\n0jsVlS7TS1VqV6rUqk2Cxg+NWcAJ4BTpf9lvA/Pr2aFG9kE9OVTjseg0GI5Ff18VvP+Vwf5afq0h\nqdHfCJ8EnC58bgNm16kvkujJm9HQs7ejB+TlNfWzRr/SqPs14YEDB7L/ZrNkyZJ6d1eD3ED8jXvV\nqqf6dZ/Vb/u83+Stn6Gm0aP4PqCVNKYBsAL4OfB0YZ0TwF217ZYkDXongY/UuxP9bRjpi90JDAde\nxYFwSdINPAj8J+mKYkWd+yJJkiRpqMl5ye/rsfw14N4a9aseujsWf0Q6Bq8D/w58vHZdq7nclz8/\nCVwFfr8WnaqDnONQBg4BbzC0nz3t7liMA3aSbne/AfxJzXpWe98CzgGHb7DOkDxv3ky6PXUn6Rm+\namMbvwfsiPZs4ECtOldjOcfit4Ax0W7hg30sOtbbB/wr8Ae16lwN5RyHscCbwOT4PK5WnauxnGPR\nCnw52uOA8zT+6we99TukIHi/0OjxebPRH7ntkPOS3zxgfbRfJv2fZHyN+ldLOcfiP4CL0X6ZzhPF\nUJP78udjwHeAn9SsZ7WVcxz+ENhKetcJ4L9q1bkayzkWbwOjoz2aFBpXa9S/WvsBcKOXanp83hws\noVHtJb9JGesMxZNlzrEoWkbn3ySGmtz/LuYDa+LzUHwpIOc4TAeaSP+Oxg+BofpSUc6x+CbwMeAM\n6ZbM47XpWkPq8XlzsFyS9fYV0KF4gujJd/os8HngUwPUl3rLORZfA56IdUs0/rtJvZFzHG4BPgHM\nAW4lXY0eIN3LHkpyjsUXSbetyqR3vPYAvwFcHrhuNbQenTcHS2i0A82Fz810Xma/3zqTozbU5BwL\nSIPf3ySNaeT+mw+DTc6xmEm6RQHp/vWDpNsW2we8d7WTcxxOk25J/XdM3yedKIdaaOQci98G/i7a\nJ4G3gF8hXYF90AzZ82bOS37FAZ37GLqDvznHYgrpvu59Ne1Z7fX05c/nGZpPT+Uch18FvkcaKL6V\nNDA6o3ZdrJmcY/FVYGW0x5NCpalG/auHO8kbCB9y581qL/k9GlOHb8Ty10iX4kNVd8fiOdLg3qGY\nDta6gzWU899Fh6EaGpB3HP6a9ATVYeALNe1dbXV3LMYBL5LOE4dJDwkMVZtIYzf/R7ra/Dwf3POm\nJEmSJEmSJEmSJEmSJEmSJEmSJEmD0/8DKGxa/p8Kz/YAAAAASUVORK5CYII=\n",
250 |        "text": [
251 |         "<matplotlib.figure.Figure at 0x10f1954d0>"
252 |        ]
253 |       }
254 |      ],
255 |      "prompt_number": 138
256 |     },
257 |     {
258 |      "cell_type": "code",
259 |      "collapsed": false,
260 |      "input": [
261 |       "class SelfTrainingClassifier(BaseEstimator):\n",
262 |       "    \n",
263 |       "    def __init__(self, base_estimator=None, n_iter=10, clamp_true_target=False):\n",
264 |       "        self.base_estimator = base_estimator\n",
265 |       "        self.n_iter = n_iter\n",
266 |       "        self.clamp_true_target = clamp_true_target\n",
267 |       "        \n",
268 |       "    def fit(self, X, y, X_unlabeled=None, X_val=None, y_val=None):\n",
269 |       "        if self.base_estimator is None:\n",
270 |       "            model = ExtraTreesClassifier(n_estimators=100)\n",
271 |       "        else:\n",
272 |       "            model = clone(self.base_estimator)\n",
273 |       "        \n",
274 |       "        X_train, y_train = X, y\n",
275 |       "               \n",
276 |       "        for i in range(self.n_iter):\n",
277 |       "            model.fit(X_train, y_train)\n",
278 |       "            \n",
279 |       "            if X_val is not None and y_val is not None:\n",
280 |       "                print(model.score(X_val, y_val))\n",
281 |       "\n",
282 |       "            if self.clamp_true_target:\n",
283 |       "                y_predicted = y\n",
284 |       "            else:\n",
285 |       "                y_predicted = model.predict(X)\n",
286 |       "            \n",
287 |       "            X_train = np.vstack([X, X_unlabeled])\n",
288 |       "            y_train = np.concatenate([y, model.predict(X_unlabeled)])\n",
289 |       "\n",
290 |       "        self.estimator_ = model\n",
291 |       "        \n",
292 |       "    def predict(self, X):\n",
293 |       "        return self.estimator_.predict(X)\n",
294 |       "    \n",
295 |       "    def score(self, X, y):\n",
296 |       "        return self.estimator_.score(X, y)\n",
297 |       "    "
298 |      ],
299 |      "language": "python",
300 |      "metadata": {},
301 |      "outputs": [],
302 |      "prompt_number": 133
303 |     },
304 |     {
305 |      "cell_type": "code",
306 |      "collapsed": false,
307 |      "input": [
308 |       "ssc = SelfTrainingClassifier(etrees).fit(X_small, y_small, X_new_unlabeled, X_val=X_test, y_val=y_test)"
309 |      ],
310 |      "language": "python",
311 |      "metadata": {},
312 |      "outputs": [
313 |       {
314 |        "output_type": "stream",
315 |        "stream": "stdout",
316 |        "text": [
317 |         "0.840573474235\n",
318 |         "0.834170940759"
319 |        ]
320 |       },
321 |       {
322 |        "output_type": "stream",
323 |        "stream": "stdout",
324 |        "text": [
325 |         "\n",
326 |         "0.835203607449"
327 |        ]
328 |       },
329 |       {
330 |        "output_type": "stream",
331 |        "stream": "stdout",
332 |        "text": [
333 |         "\n",
334 |         "0.83325875185"
335 |        ]
336 |       },
337 |       {
338 |        "output_type": "stream",
339 |        "stream": "stdout",
340 |        "text": [
341 |         "\n",
342 |         "0.832897318509"
343 |        ]
344 |       },
345 |       {
346 |        "output_type": "stream",
347 |        "stream": "stdout",
348 |        "text": [
349 |         "\n",
350 |         "0.835117551892"
351 |        ]
352 |       },
353 |       {
354 |        "output_type": "stream",
355 |        "stream": "stdout",
356 |        "text": [
357 |         "\n",
358 |         "0.833620185192"
359 |        ]
360 |       },
361 |       {
362 |        "output_type": "stream",
363 |        "stream": "stdout",
364 |        "text": [
365 |         "\n",
366 |         "0.833275962962"
367 |        ]
368 |       },
369 |       {
370 |        "output_type": "stream",
371 |        "stream": "stdout",
372 |        "text": [
373 |         "\n",
374 |         "0.833585762969"
375 |        ]
376 |       },
377 |       {
378 |        "output_type": "stream",
379 |        "stream": "stdout",
380 |        "text": [
381 |         "\n",
382 |         "0.833293174073"
383 |        ]
384 |       },
385 |       {
386 |        "output_type": "stream",
387 |        "stream": "stdout",
388 |        "text": [
389 |         "\n"
390 |        ]
391 |       }
392 |      ],
393 |      "prompt_number": 137
394 |     },
395 |     {
396 |      "cell_type": "code",
397 |      "collapsed": false,
398 |      "input": [],
399 |      "language": "python",
400 |      "metadata": {},
401 |      "outputs": []
402 |     }
403 |    ],
404 |    "metadata": {}
405 |   }
406 |  ]
407 | }


--------------------------------------------------------------------------------
/Untitled Diagram.drawio:
--------------------------------------------------------------------------------
1 | <mxfile host="app.diagrams.net" modified="2021-06-10T14:35:29.841Z" agent="5.0 (Macintosh)" version="14.6.13" etag="vsY3mkUJGtxC_jE36hh5" type="github"><diagram id="7yUGQr6HZjsoyeyFMjNX">UzV2zq1wL0osyPDNT0nNUTV2VTV2LsrPL4GwciucU3NyVI0MMlNUjV1UjYwMgFjVyA2HrCFY1qAgsSg1rwSLBiADYTaQg2Y1AA==</diagram></mxfile>


--------------------------------------------------------------------------------
/cloudstorage.ini.example:
--------------------------------------------------------------------------------
1 | [account]
2 | libcloud_provider = azure_blobs
3 | account_name = TODO
4 | account_secret = deadcafe==
5 | 


--------------------------------------------------------------------------------
/dask/fold_learn.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 48,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/html": [
 11 |        "<table style=\"border: 2px solid white;\">\n",
 12 |        "<tr>\n",
 13 |        "<td style=\"vertical-align: top; border: 0px solid white\">\n",
 14 |        "<h3>Client</h3>\n",
 15 |        "<ul>\n",
 16 |        "  <li><b>Scheduler: </b>tcp://127.0.0.1:45838\n",
 17 |        "  <li><b>Dashboard: </b><a href='http://127.0.0.1:8787' target='_blank'>http://127.0.0.1:8787</a>\n",
 18 |        "</ul>\n",
 19 |        "</td>\n",
 20 |        "<td style=\"vertical-align: top; border: 0px solid white\">\n",
 21 |        "<h3>Cluster</h3>\n",
 22 |        "<ul>\n",
 23 |        "  <li><b>Workers: </b>4</li>\n",
 24 |        "  <li><b>Cores: </b>4</li>\n",
 25 |        "  <li><b>Memory: </b>10.00 GB</li>\n",
 26 |        "</ul>\n",
 27 |        "</td>\n",
 28 |        "</tr>\n",
 29 |        "</table>"
 30 |       ],
 31 |       "text/plain": [
 32 |        "<Client: scheduler='tcp://127.0.0.1:45838' processes=4 cores=4>"
 33 |       ]
 34 |      },
 35 |      "execution_count": 48,
 36 |      "metadata": {},
 37 |      "output_type": "execute_result"
 38 |     }
 39 |    ],
 40 |    "source": [
 41 |     "import os\n",
 42 |     "import numpy as np\n",
 43 |     "import scipy.sparse as sp\n",
 44 |     "import pandas as pd\n",
 45 |     "from glob import glob\n",
 46 |     "\n",
 47 |     "import dask\n",
 48 |     "import dask.bag as db\n",
 49 |     "import joblib\n",
 50 |     "\n",
 51 |     "from distributed import Client\n",
 52 |     "client = Client()\n",
 53 |     "client"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 37,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "rm -rf sparse_chunks/"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 38,
 68 |    "metadata": {},
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "folder = 'sparse_chunks'\n",
 72 |     "n_features = int(1e5)\n",
 73 |     "n_informative = int(1e4)\n",
 74 |     "\n",
 75 |     "n_chunks = int(1e1)\n",
 76 |     "chunk_size = int(1e2)\n",
 77 |     "\n",
 78 |     "rng = np.random.RandomState(42)\n",
 79 |     "true_coef = rng.randn(n_features)\n",
 80 |     "true_coef[n_informative:] = 0\n",
 81 |     "\n",
 82 |     "\n",
 83 |     "def make_chunk(n_samples, true_coef, chunk_idx, format='csr',\n",
 84 |     "               density=1e-3, noise=1e-1):\n",
 85 |     "    rng = np.random.RandomState(chunk_idx)\n",
 86 |     "    n_features = true_coef.shape[0]\n",
 87 |     "    input_data = sp.rand(n_samples, n_features, format=format,\n",
 88 |     "                         density=density, random_state=rng)\n",
 89 |     "    noise = rng.normal(loc=0, scale=noise, size=n_samples)\n",
 90 |     "    target = input_data.dot(true_coef).ravel() + noise\n",
 91 |     "    return chunk_idx, input_data, (target > 0).astype(np.int32)\n",
 92 |     "\n",
 93 |     "\n",
 94 |     "def save_to_disk(chunk_idx, X, y, folder='sparse_chunks'):\n",
 95 |     "    os.makedirs(folder, exist_ok=True)\n",
 96 |     "    filename = \"sparse_chunk_{:04d}.pkl\".format(chunk_idx)\n",
 97 |     "    joblib.dump((X, y), os.path.join(folder, filename))\n",
 98 |     "    return filename\n",
 99 |     "\n",
100 |     "\n",
101 |     "def load_from_disk(chunk_idx, filename):\n",
102 |     "    X, y = joblib.load(filename)\n",
103 |     "    return chunk_idx, X, y"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 49,
109 |    "metadata": {},
110 |    "outputs": [
111 |     {
112 |      "name": "stdout",
113 |      "output_type": "stream",
114 |      "text": [
115 |       "Lazy loading chunks from sparse_chunks\n"
116 |      ]
117 |     }
118 |    ],
119 |    "source": [
120 |     "if not os.path.exists(folder):\n",
121 |     "    print(\"Generating chunks of sparse data into\", folder)\n",
122 |     "    b = db.from_sequence([(chunk_size, true_coef, i)\n",
123 |     "                          for i in range(n_chunks)])\n",
124 |     "    b = b.starmap(make_chunk).starmap(save_to_disk).compute()\n",
125 |     "\n",
126 |     "\n",
127 |     "    \n",
128 |     "print(\"Lazy loading chunks from\", folder)\n",
129 |     "b = db.from_sequence(enumerate(sorted(glob('sparse_chunks/*.pkl'))))\n",
130 |     "b = b.starmap(load_from_disk)"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": 50,
136 |    "metadata": {},
137 |    "outputs": [
138 |     {
139 |      "name": "stdout",
140 |      "output_type": "stream",
141 |      "text": [
142 |       "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n",
143 |       "Wall time: 9.53 ms\n"
144 |      ]
145 |     }
146 |    ],
147 |    "source": [
148 |     "%time b = b.persist()"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 51,
154 |    "metadata": {},
155 |    "outputs": [
156 |     {
157 |      "data": {
158 |       "text/plain": [
159 |        "10"
160 |       ]
161 |      },
162 |      "execution_count": 51,
163 |      "metadata": {},
164 |      "output_type": "execute_result"
165 |     }
166 |    ],
167 |    "source": [
168 |     "len(b.compute())"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 52,
174 |    "metadata": {},
175 |    "outputs": [
176 |     {
177 |      "name": "stdout",
178 |      "output_type": "stream",
179 |      "text": [
180 |       "CPU times: user 16 ms, sys: 4 ms, total: 20 ms\n",
181 |       "Wall time: 25.8 ms\n"
182 |      ]
183 |     }
184 |    ],
185 |    "source": [
186 |     "%%time\n",
187 |     "chunk_idx, X_0, y_0 = b.take(1)[0]"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": 53,
193 |    "metadata": {},
194 |    "outputs": [
195 |     {
196 |      "data": {
197 |       "text/plain": [
198 |        "0"
199 |       ]
200 |      },
201 |      "execution_count": 53,
202 |      "metadata": {},
203 |      "output_type": "execute_result"
204 |     }
205 |    ],
206 |    "source": [
207 |     "chunk_idx"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 54,
213 |    "metadata": {},
214 |    "outputs": [
215 |     {
216 |      "data": {
217 |       "text/plain": [
218 |        "<100x100000 sparse matrix of type '<class 'numpy.float64'>'\n",
219 |        "\twith 10000 stored elements in Compressed Sparse Row format>"
220 |       ]
221 |      },
222 |      "execution_count": 54,
223 |      "metadata": {},
224 |      "output_type": "execute_result"
225 |     }
226 |    ],
227 |    "source": [
228 |     "X_0"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 55,
234 |    "metadata": {},
235 |    "outputs": [
236 |     {
237 |      "data": {
238 |       "text/plain": [
239 |        "0.97999999999999998"
240 |       ]
241 |      },
242 |      "execution_count": 55,
243 |      "metadata": {},
244 |      "output_type": "execute_result"
245 |     }
246 |    ],
247 |    "source": [
248 |     "np.mean((X_0.dot(true_coef).ravel() > 0) == y_0)"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "## L1-penalized Logistic Regression with SGD"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 56,
261 |    "metadata": {},
262 |    "outputs": [
263 |     {
264 |      "name": "stdout",
265 |      "output_type": "stream",
266 |      "text": [
267 |       "CPU times: user 760 ms, sys: 64 ms, total: 824 ms\n",
268 |       "Wall time: 2.75 s\n"
269 |      ]
270 |     },
271 |     {
272 |      "data": {
273 |       "text/plain": [
274 |        "(0.53333333333333333, 0.032998316455372205, 0.46405999999999997)"
275 |       ]
276 |      },
277 |      "execution_count": 56,
278 |      "metadata": {},
279 |      "output_type": "execute_result"
280 |     }
281 |    ],
282 |    "source": [
283 |     "from sklearn.linear_model import SGDClassifier\n",
284 |     "from sklearn.model_selection import train_test_split\n",
285 |     "from dask import delayed\n",
286 |     "\n",
287 |     "CLASSES = np.array([0, 1])\n",
288 |     "\n",
289 |     "\n",
290 |     "def scan_fit(model, chunk):\n",
291 |     "    return model.partial_fit(*chunk, classes=CLASSES)\n",
292 |     "\n",
293 |     "\n",
294 |     "def score(model, chunk):\n",
295 |     "    return model.score(*chunk)\n",
296 |     "\n",
297 |     "\n",
298 |     "all_filenames = sorted(glob('sparse_chunks/*.pkl'))\n",
299 |     "train_filenames, test_filenames = train_test_split(\n",
300 |     "    all_filenames, random_state=0)\n",
301 |     "\n",
302 |     "model = SGDClassifier(loss='log', alpha=1e-3, penalty='elasticnet', tol=0)\n",
303 |     "\n",
304 |     "for i in range(20):\n",
305 |     "    for filename in train_filenames:\n",
306 |     "        chunk = delayed(joblib.load)(filename)\n",
307 |     "        model = delayed(scan_fit)(model, chunk)\n",
308 |     "\n",
309 |     "\n",
310 |     "scores = [delayed(score)(model, delayed(joblib.load)(filename))\n",
311 |     "          for filename in test_filenames]\n",
312 |     "    \n",
313 |     "%time scores, model = dask.compute(scores, model)\n",
314 |     "np.mean(scores), np.std(scores), np.mean(model.coef_ != 0)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 57,
320 |    "metadata": {},
321 |    "outputs": [
322 |     {
323 |      "ename": "TypeError",
324 |      "evalue": "'Future' object is not iterable",
325 |      "output_type": "error",
326 |      "traceback": [
327 |       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
328 |       "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
329 |       "\u001b[0;32m<ipython-input-57-4e7666067cab>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpartial_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscan_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_delayed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
330 |       "\u001b[0;32m~/code/dask/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     96\u001b[0m             \u001b[0mExtra\u001b[0m \u001b[0mkeywords\u001b[0m \u001b[0mto\u001b[0m \u001b[0mforward\u001b[0m \u001b[0mto\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mscheduler\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mget\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0mfunction\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     97\u001b[0m         \"\"\"\n\u001b[0;32m---> 98\u001b[0;31m         \u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcompute\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtraverse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     99\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    100\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
331 |       "\u001b[0;32m~/code/dask/dask/base.py\u001b[0m in \u001b[0;36mcompute\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    203\u001b[0m     \u001b[0mdsk\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcollections_to_dsk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvariables\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moptimize_graph\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    204\u001b[0m     \u001b[0mkeys\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mvar\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_keys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mvar\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mvariables\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 205\u001b[0;31m     \u001b[0mresults\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    206\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    207\u001b[0m     \u001b[0mresults_iter\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresults\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
332 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mget_sync\u001b[0;34m(dsk, keys, **kwargs)\u001b[0m\n\u001b[1;32m    560\u001b[0m     \"\"\"\n\u001b[1;32m    561\u001b[0m     \u001b[0mkwargs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'num_workers'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m    \u001b[0;31m# if num_workers present, remove it\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 562\u001b[0;31m     \u001b[0;32mreturn\u001b[0m \u001b[0mget_async\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mapply_sync\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdsk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    563\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    564\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
333 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mget_async\u001b[0;34m(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)\u001b[0m\n\u001b[1;32m    506\u001b[0m             \u001b[0;31m# Seed initial tasks into the thread pool\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    507\u001b[0m             \u001b[0;32mwhile\u001b[0m \u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'ready'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstate\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'running'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0mnum_workers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m                 \u001b[0mfire_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    510\u001b[0m             \u001b[0;31m# Main loop, wait on tasks to finish, insert new ones\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
334 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mfire_task\u001b[0;34m()\u001b[0m\n\u001b[1;32m    502\u001b[0m                             args=(key, dumps((dsk[key], data)),\n\u001b[1;32m    503\u001b[0m                                   dumps, loads, get_id, pack_exception),\n\u001b[0;32m--> 504\u001b[0;31m                             callback=queue.put)\n\u001b[0m\u001b[1;32m    505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    506\u001b[0m             \u001b[0;31m# Seed initial tasks into the thread pool\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
335 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mapply_sync\u001b[0;34m(func, args, kwds, callback)\u001b[0m\n\u001b[1;32m    549\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mapply_sync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    550\u001b[0m     \u001b[0;34m\"\"\" A naive synchronous version of apply_async \"\"\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 551\u001b[0;31m     \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    552\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mcallback\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    553\u001b[0m         \u001b[0mcallback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mres\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
336 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m(key, task_info, dumps, loads, get_id, pack_exception)\u001b[0m\n\u001b[1;32m    293\u001b[0m         \u001b[0mfailed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    294\u001b[0m     \u001b[0;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 295\u001b[0;31m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpack_exception\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0me\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    296\u001b[0m         \u001b[0mfailed\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    297\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfailed\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
337 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36mexecute_task\u001b[0;34m(key, task_info, dumps, loads, get_id, pack_exception)\u001b[0m\n\u001b[1;32m    288\u001b[0m     \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    289\u001b[0m         \u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask_info\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 290\u001b[0;31m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtask\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    291\u001b[0m         \u001b[0mid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_id\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    292\u001b[0m         \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdumps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
338 |       "\u001b[0;32m~/code/dask/dask/local.py\u001b[0m in \u001b[0;36m_execute_task\u001b[0;34m(arg, cache, dsk)\u001b[0m\n\u001b[1;32m    269\u001b[0m         \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    270\u001b[0m         \u001b[0margs2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0m_execute_task\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0ma\u001b[0m \u001b[0;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 271\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    272\u001b[0m     \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mishashable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    273\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
339 |       "\u001b[0;32m~/code/dask/dask/bag/core.py\u001b[0m in \u001b[0;36maccumulate_part\u001b[0;34m(binop, seq, initial, is_first)\u001b[0m\n\u001b[1;32m   1273\u001b[0m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbinop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1274\u001b[0m     \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1275\u001b[0;31m         \u001b[0mres\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccumulate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbinop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mseq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minitial\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   1276\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mis_first\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1277\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mres\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mres\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
340 |       "\u001b[0;32m~/.virtualenvs/py36/lib/python3.6/site-packages/toolz/itertoolz.py\u001b[0m in \u001b[0;36maccumulate\u001b[0;34m(binop, seq, initial)\u001b[0m\n\u001b[1;32m     56\u001b[0m         \u001b[0mitertools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maccumulate\u001b[0m \u001b[0;34m:\u001b[0m  \u001b[0mIn\u001b[0m \u001b[0mstandard\u001b[0m \u001b[0mitertools\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mPython\u001b[0m \u001b[0;36m3.2\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     57\u001b[0m     \"\"\"\n\u001b[0;32m---> 58\u001b[0;31m     \u001b[0mseq\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     59\u001b[0m     \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseq\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minitial\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mno_default\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0minitial\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     60\u001b[0m     \u001b[0;32myield\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
341 |       "\u001b[0;31mTypeError\u001b[0m: 'Future' object is not iterable"
342 |      ]
343 |     }
344 |    ],
345 |    "source": [
346 |     "model = SGDClassifier(loss='log', penalty='l1', max_iter=1)\n",
347 |     "\n",
348 |     "def scan_fit(model, next_chunk):\n",
349 |     "    chunk_idx, X, y = next_chunk\n",
350 |     "    return model.partial_fit(X, y, classes=[0, 1])\n",
351 |     "\n",
352 |     "b.accumulate(scan_fit, initial=model).to_delayed()[-1].compute(get=dask.get)[0]"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": null,
358 |    "metadata": {},
359 |    "outputs": [],
360 |    "source": []
361 |   }
362 |  ],
363 |  "metadata": {
364 |   "kernelspec": {
365 |    "display_name": "Python 3",
366 |    "language": "python",
367 |    "name": "python3"
368 |   },
369 |   "language_info": {
370 |    "codemirror_mode": {
371 |     "name": "ipython",
372 |     "version": 3
373 |    },
374 |    "file_extension": ".py",
375 |    "mimetype": "text/x-python",
376 |    "name": "python",
377 |    "nbconvert_exporter": "python",
378 |    "pygments_lexer": "ipython3",
379 |    "version": "3.6.1"
380 |   }
381 |  },
382 |  "nbformat": 4,
383 |  "nbformat_minor": 2
384 | }
385 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | dependencies:
2 | - scikit-learn>=0.21
3 | - matplotlib
4 | - pandas
5 | 


--------------------------------------------------------------------------------
/fmri_vae/fmri_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from nilearn import datasets, image
  3 | from keras.layers import Conv3D, BatchNormalization, Flatten, Dense
  4 | from keras.layers import Dropout, Reshape, Conv3DTranspose, Lambda
  5 | from keras.models import Sequential
  6 | from keras.optimizers import Adam
  7 | from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
  8 | 
  9 | 
 10 | def crop_5_8_8(data):
 11 |     return data[:, :5, :8, :8]
 12 | 
 13 | 
 14 | def make_models(input_shape=(40, 64, 64, 1), latent_dim=256,
 15 |                 low_res_shape=(2, 2, 2, 128), dropout=0.2):
 16 |     encoder = Sequential([
 17 |         Conv3D(16, kernel_size=3, activation='relu',
 18 |                padding="same", input_shape=input_shape),
 19 |         BatchNormalization(),
 20 |         Conv3D(32, kernel_size=3, activation='relu',
 21 |                padding="same", strides=2),
 22 |         BatchNormalization(),
 23 |         Conv3D(32, kernel_size=3, activation='relu',
 24 |                padding="same"),
 25 |         BatchNormalization(),
 26 |         Conv3D(64, kernel_size=3, activation='relu',
 27 |                padding="same", strides=2),
 28 |         BatchNormalization(),
 29 |         Conv3D(64, kernel_size=3, activation='relu',
 30 |                padding="same"),
 31 |         BatchNormalization(),
 32 |         Conv3D(128, kernel_size=3, activation='relu',
 33 |                padding="same", strides=2),
 34 |         BatchNormalization(),
 35 |         Conv3D(128, kernel_size=3, activation='relu',
 36 |                padding="same", strides=2),
 37 |         BatchNormalization(),
 38 |         Conv3D(latent_dim, kernel_size=3, padding="same",
 39 |                strides=2, activation='relu'),
 40 |         Flatten(),
 41 |         Dropout(dropout),
 42 |         Dense(latent_dim),
 43 |     ], name="encoder")
 44 | 
 45 |     decoder = Sequential([
 46 |         Dense(np.prod(low_res_shape), input_shape=(latent_dim,)),
 47 |         Dropout(dropout),
 48 |         Reshape(low_res_shape),
 49 |         Conv3DTranspose(128, kernel_size=3, strides=2, activation='relu',
 50 |                         padding="same"),
 51 |         BatchNormalization(),
 52 |         Conv3D(128, kernel_size=3, activation='relu', padding="same"),
 53 |         BatchNormalization(),
 54 |         Conv3DTranspose(128, kernel_size=3, strides=2, activation='relu',
 55 |                         padding="same"),
 56 |         Lambda(function=crop_5_8_8),
 57 |         BatchNormalization(),
 58 |         Conv3D(64, kernel_size=3, activation='relu', padding="same"),
 59 |         BatchNormalization(),
 60 |         Conv3DTranspose(64, kernel_size=3, strides=2, activation='relu',
 61 |                         padding="same"),
 62 |         BatchNormalization(),
 63 |         Conv3D(32, kernel_size=3, activation='relu', padding="same"),
 64 |         BatchNormalization(),
 65 |         Conv3DTranspose(32, kernel_size=3, strides=2, activation='relu',
 66 |                         padding="same"),
 67 |         BatchNormalization(),
 68 |         Conv3D(16, kernel_size=3, activation='relu', padding="same"),
 69 |         BatchNormalization(),
 70 |         Conv3DTranspose(16, kernel_size=3, strides=2, activation='relu',
 71 |                         padding="same"),
 72 |         BatchNormalization(),
 73 |         Conv3D(1, kernel_size=3, activation=None, padding="same"),
 74 |     ], name="decoder")
 75 |     autoencoder = Sequential([encoder, decoder], name="autoencoder")
 76 |     return encoder, decoder, autoencoder
 77 | 
 78 | 
 79 | 
 80 | if __name__ == "__main__":
 81 |     data = datasets.fetch_haxby(subjects=(2,))
 82 |     fmri_filename = data.func[0]
 83 |     smoothed_img = image.smooth_img(fmri_filename, 2)
 84 |     
 85 |     smoothed_data = smoothed_img.get_data().transpose(3, 0, 1, 2)
 86 |     #mean = smoothed_data.mean(axis=0)
 87 |     #smoothed_data -= mean
 88 |     #scale = smoothed_data.std(axis=0) + 1e-6
 89 |     scale = smoothed_data.std()  # global scale
 90 |     smoothed_data /= scale
 91 |     smoothed_data = smoothed_data[:, :, :, :, None]
 92 |     input_shape = smoothed_data.shape[1:]
 93 |     smoothed_data_train = smoothed_data[:1200]
 94 |     smoothed_data_test = smoothed_data[1200:]
 95 |     
 96 |     encoder, decoder, autoencoder = make_models(input_shape=input_shape)
 97 |     autoencoder.compile(optimizer=Adam(lr=0.001), loss="mse")
 98 | 
 99 |     
100 |     filename = "haxby_autoencoder.{epoch:02d}-{val_loss:.4f}.hdf5"
101 |     ckpt_cb = ModelCheckpoint(filename, monitor='val_loss',
102 |                               verbose=1, save_best_only=False)
103 |     filename = "haxby_autoencoder_best.hdf5"
104 |     ckpt_best_cb = ModelCheckpoint(filename, monitor='val_loss',
105 |                                    verbose=1, save_best_only=True)
106 |     es_cb = EarlyStopping(monitor='val_loss', patience=20, min_delta=0.0001,
107 |                           verbose=1)
108 |     lr_schedule_cb = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10,
109 |                                        cooldown=5, epsilon=0.0001, verbose=1)
110 |     autoencoder.fit(smoothed_data_train, smoothed_data_train, 
111 |                     validation_data=(smoothed_data_test, smoothed_data_test),
112 |                     epochs=500, batch_size=32,
113 |                     callbacks=[ckpt_cb, ckpt_best_cb, lr_schedule_cb, es_cb])


--------------------------------------------------------------------------------
/generalization/run_mnist.py:
--------------------------------------------------------------------------------
  1 | from __future__ import print_function
  2 | import argparse
  3 | import torch
  4 | import torch.nn as nn
  5 | import torch.nn.functional as F
  6 | import torch.optim as optim
  7 | from torch.optim.lr_scheduler import ReduceLROnPlateau
  8 | from torchvision import datasets, transforms
  9 | from torch.autograd import Variable
 10 | 
 11 | # Training settings
 12 | parser = argparse.ArgumentParser(description='Study of generalization in MLPs')
 13 | parser.add_argument('--batch-size', type=int, default=64, metavar='N',
 14 |                     help='input batch size for training (default: 64)')
 15 | parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
 16 |                     help='input batch size for testing (default: 1000)')
 17 | parser.add_argument('--epochs', type=int, default=10, metavar='N',
 18 |                     help='number of epochs to train (default: 10)')
 19 | parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
 20 |                     help='learning rate (default: 0.01)')
 21 | parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
 22 |                     help='SGD momentum (default: 0.5)')
 23 | parser.add_argument('--no-cuda', action='store_true', default=False,
 24 |                     help='disables CUDA training')
 25 | parser.add_argument('--seed', type=int, default=1, metavar='S',
 26 |                     help='random seed (default: 1)')
 27 | parser.add_argument('--log-interval', type=int, default=10, metavar='N',
 28 |                     help='how many batches to wait before logging training status')
 29 | parser.add_argument('--train-size', type=int, default=None,
 30 |                     help='size of the subsample used for training')
 31 | parser.add_argument('--test-size', type=int, default=None,
 32 |                     help='size of the subsample used for test evaluation')
 33 | parser.add_argument('--dropout', type=float, default=None,
 34 |                     help='dropout probability (no dropout by default)')
 35 | parser.add_argument('--mlp', action='store_true', default=False,
 36 |                     help='use an MLP instead of a ConvNet')
 37 | parser.add_argument('--hidden-dim', type=int, default=32,
 38 |                     help='dimension of the MLP hidden layers')
 39 | parser.add_argument('--depth', type=int, default=1,
 40 |                     help='number of hidden layers for the MLP')             
 41 | args = parser.parse_args()
 42 | args.cuda = not args.no_cuda and torch.cuda.is_available()
 43 | 
 44 | torch.manual_seed(args.seed)
 45 | if args.cuda:
 46 |     torch.cuda.manual_seed(args.seed)
 47 | 
 48 | 
 49 | loader_kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 50 | 
 51 | mnist_transformers = transforms.Compose([
 52 |     transforms.ToTensor(),
 53 |     transforms.Normalize((0.1307,), (0.3081,))
 54 | ])
 55 | 
 56 | 
 57 | def make_mnist_loader(train=True, subsample=None):
 58 |     dataset = datasets.MNIST('../data', train=train, download=True,
 59 |                              transform=mnist_transformers)
 60 |     if subsample is None:
 61 |         # Use the full training set
 62 |         loader = torch.utils.data.DataLoader(
 63 |             dataset, batch_size=args.batch_size, shuffle=True, **loader_kwargs)
 64 |     else:
 65 |         # Subsample a smaller training set at random
 66 |         mnist_loader = loader = torch.utils.data.DataLoader(
 67 |             dataset, batch_size=args.train_size, shuffle=True, **loader_kwargs)
 68 |         small_mnist_data, small_mnist_labels = next(iter(mnist_loader))
 69 |         small_mnist_dataset = torch.utils.data.TensorDataset(
 70 |             small_mnist_data, small_mnist_labels)
 71 |         loader = torch.utils.data.DataLoader(
 72 |             small_mnist_dataset, batch_size=args.batch_size, shuffle=True,
 73 |             **loader_kwargs
 74 |         )
 75 |     return loader
 76 | 
 77 | 
 78 | train_loader = make_mnist_loader(train=True, subsample=args.train_size)
 79 | test_loader = make_mnist_loader(train=False, subsample=args.test_size)
 80 | 
 81 | 
 82 | class ConvNet(nn.Module):
 83 |     def __init__(self):
 84 |         super(ConvNet, self).__init__()
 85 |         self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
 86 |         self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
 87 |         if args.dropout:
 88 |             self.conv2_drop = nn.Dropout2d(p=args.dropout)
 89 |         self.fc1 = nn.Linear(320, 50)
 90 |         self.fc2 = nn.Linear(50, 10)
 91 | 
 92 |     def forward(self, x):
 93 |         x = F.relu(F.max_pool2d(self.conv1(x), 2))
 94 |         x = self.conv2(x)
 95 |         if args.dropout:
 96 |             x = self.conv2_drop(x)
 97 |         x = F.relu(F.max_pool2d(x, 2))
 98 |         x = x.view(-1, 320)
 99 |         x = F.relu(self.fc1(x))
100 |         if args.dropout:
101 |             x = F.dropout(x, p=args.dropout, training=self.training)
102 |         x = self.fc2(x)
103 |         return F.log_softmax(x)
104 | 
105 | 
106 | class MLP(nn.Module):
107 |     def __init__(self, input_dim=784, output_dim=10, hidden=(32,)):
108 |         super(MLP, self).__init__()
109 |         self.hidden_layers = layers = []
110 |         for hidden_dim in hidden:
111 |             layers.append(nn.Linear(input_dim, hidden_dim))
112 |             input_dim = hidden_dim
113 |         self.output_linear = nn.Linear(input_dim, output_dim)
114 | 
115 |     def forward(self, x):
116 |         for h in self.hidden_layers:
117 |             x = F.relu(h(x))
118 |         return F.log_softmax(self.output_linear(x))
119 | 
120 | 
121 | if args.mlp:
122 |     model = MLP(hidden=[args.hidden_dim] * args.depth)
123 | else:
124 |     model = ConvNet()
125 | if args.cuda:
126 |     model.cuda()
127 | 
128 | optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
129 | scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5, cooldown=5,
130 |                               verbose=True)
131 | 
132 | 
133 | def train(epoch):
134 |     model.train()
135 |     for batch_idx, (data, target) in enumerate(train_loader):
136 |         if args.cuda:
137 |             data, target = data.cuda(), target.cuda()
138 |         if isinstance(model, MLP):
139 |             data = data.view(-1, 784)
140 |         data, target = Variable(data), Variable(target)
141 |         optimizer.zero_grad()
142 |         output = model(data)
143 |         loss = F.nll_loss(output, target)
144 |         loss.backward()
145 |         optimizer.step()
146 |         if batch_idx % args.log_interval == 0:
147 |             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:f}'
148 |                   .format(epoch, batch_idx * len(data),
149 |                           len(train_loader.dataset),
150 |                           100. * batch_idx / len(train_loader), loss.data[0],
151 |                           optimizer.param_groups[0]['lr']))
152 | 
153 | 
154 | def evaluate():
155 |     should_stop = False
156 |     model.eval()
157 | 
158 |     for name, loader in [('train', train_loader), ('test', test_loader)]:
159 |         loss = 0
160 |         correct = 0
161 |         for data, target in loader:
162 |             if args.cuda:
163 |                 data, target = data.cuda(), target.cuda()
164 |             if isinstance(model, MLP):
165 |                 data = data.view(-1, 784)
166 |             data, target = Variable(data, volatile=True), Variable(target)
167 |             output = model(data)
168 |             loss += F.nll_loss(output, target, size_average=False).data[0]
169 |             # get the index of the max log-probability
170 |             pred = output.data.max(1, keepdim=True)[1]
171 |             correct += pred.eq(target.data.view_as(pred)).cpu().sum()
172 | 
173 |         loss /= len(loader.dataset)
174 |         print('{} -- Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'
175 |               .format(name.ljust(5), loss, correct, len(loader.dataset),
176 |                       100. * correct / len(loader.dataset)))
177 |         if name == 'test':
178 |             scheduler.step(loss)
179 |             should_stop = should_stop or correct == len(loader.dataset)
180 |     return should_stop or optimizer.param_groups[0]['lr'] < args.lr / 1e2
181 | 
182 | 
183 | for epoch in range(1, args.epochs + 1):
184 |     train(epoch)
185 |     if evaluate():
186 |         break
187 | 


--------------------------------------------------------------------------------
/gmm/gmmsgd.py:
--------------------------------------------------------------------------------
  1 | from collections import defaultdict
  2 | from sklearn.model_selection import train_test_split
  3 | import tensorflow as tf
  4 | import numpy as np
  5 | from math import sqrt
  6 | 
  7 | 
  8 | class EpochSampler(object):
  9 |     """Helper function to cycle through a shuffled dataset by minibatches.
 10 | 
 11 |     The dataset is shuffled at the beginning of each epoch.
 12 |     """
 13 | 
 14 |     def __init__(self, *data, n_epochs=1, batch_size=100, random_seed=None):
 15 |         self.data = data
 16 |         self.n_epochs = n_epochs
 17 |         self.batch_size = batch_size
 18 |         self.random_seed = random_seed
 19 | 
 20 |     def __iter__(self):
 21 |         rng = np.random.RandomState(0)
 22 |         n_samples = self.data[0].shape[0]
 23 |         n_seen = 0
 24 |         batch_size = self.batch_size
 25 |         for epoch in range(self.n_epochs):
 26 |             permutation = rng.permutation(n_samples)
 27 |             data = tuple(d[permutation] for d in self.data)
 28 |             for i in range(0, n_samples, batch_size):
 29 |                 n_seen += len(data[0][i:i + batch_size])
 30 |                 yield n_seen, epoch, tuple(d[i:i + batch_size] for d in data)
 31 | 
 32 | 
 33 | class GaussianMixtureSGD(object):
 34 |     def __init__(self, n_components=5, learning_rate=0.1, patience=3,
 35 |                  batch_size=10, max_iter=1000, session=None,
 36 |                  means_init=None, random_seed=0):
 37 |         self.n_components = n_components
 38 |         self.random_seed = random_seed
 39 |         self.learning_rate = learning_rate
 40 |         self.patience = patience
 41 |         self.max_iter = max_iter
 42 |         self.batch_size = batch_size
 43 |         self.session = session
 44 |         self.means_init = means_init
 45 | 
 46 |     def _make_model(self, n_features, dtype=np.float32):
 47 |         self._component_variables = defaultdict(list)
 48 |         X = tf.placeholder(shape=(None, n_features), dtype=dtype, name='X')
 49 | 
 50 |         # Mixture weights
 51 |         w = tf.Variable(
 52 |             tf.zeros(shape=(1, self.n_components), dtype=dtype),
 53 |             name='w')
 54 |         self._normalized_weights = tf.reshape(
 55 |             tf.nn.softmax(w), (self.n_components,))
 56 |         logliks = []
 57 | 
 58 |         # TODO: instead of masking using a numpy initialized densed tensor, use
 59 |         # a sparse tensorflow tensor with the triangular structure built-in bu
 60 |         # this would equire tensorflow >= 0.9 which is not released at this
 61 |         # point.
 62 |         M = tf.constant(
 63 |             np.tril(
 64 |                 np.ones(shape=(n_features, n_features), dtype=dtype),
 65 |                 k=-1),
 66 |             name='triangular_mask')
 67 |         for k in range(self.n_components):
 68 |             with tf.variable_scope('component_%03d' % k):
 69 |                 if self.means_init is not None:
 70 |                     m = np.asarray(self.means_init[k], dtype=dtype)
 71 |                 else:
 72 |                     m = tf.zeros(shape=(n_features,), dtype=dtype)
 73 |                 mu = tf.Variable(m, name='mu_%03d' % k)
 74 |                 self._component_variables['mu'].append(mu)
 75 |                 d = tf.Variable(
 76 |                     -2 * tf.ones(shape=[n_features], dtype=dtype),
 77 |                     #tf.truncated_normal(shape=[n_features],
 78 |                     #                    stddev=1 / sqrt(n_features),
 79 |                     #                    dtype=dtype,
 80 |                     #                    seed=self.random_seed + k),
 81 |                     name='d_%03d' % k)
 82 | 
 83 |                 self._component_variables['d'].append(d)
 84 |                 H = tf.Variable(
 85 |                     tf.zeros(shape=(n_features, n_features), dtype=dtype),
 86 |                     #tf.truncated_normal(shape=(n_features, n_features),
 87 |                     #                    stddev=1 / sqrt(n_features),
 88 |                     #                    dtype=dtype,
 89 |                     #                    seed=self.random_seed + k),
 90 |                     name='H_%03d' % k)
 91 |                 # M is an element-wise mask to set all diagonal and triangular
 92 |                 # uppper entries of of H to zero:
 93 |                 L = tf.add(tf.diag(tf.exp(d)), tf.mul(M, H), name='L_%03d' % k)
 94 |                 P = tf.matmul(L, tf.transpose(L), name='P_%03d' % k)
 95 |                 self._component_variables['P'].append(P)
 96 | 
 97 |                 loglik = self._log_likelihood_one_gaussian(
 98 |                     n_features, X, mu, P, d)
 99 |                 logliks.append(loglik)
100 | 
101 |         # compute the log likelihood of the mixture
102 |         # TODO: would it be better to find a way to vectorize the computation
103 |         # of the log-likelihoods to avoid using tf.pack to make tensorflow
104 |         # run somehow faster?
105 | 
106 |         # XXX: the following is wrong! We cannot get the loglikelood of a mixture
107 |         # this way... I don't have time to fix it now though.
108 |         # It should use tf.reduce_logsumexp instead.
109 |         self._loglik = tf.reduce_sum(
110 |             tf.mul(tf.transpose(tf.pack(logliks)), self._normalized_weights),
111 |             reduction_indices=1)
112 |         self._loss = -tf.reduce_mean(self._loglik)
113 |         self._optimizer = tf.train.AdamOptimizer(
114 |             learning_rate=self.learning_rate)
115 |         train_op = self._optimizer.minimize(self._loss)
116 | 
117 |         if self.session is None:
118 |             session = tf.InteractiveSession()
119 |         else:
120 |             session = self.session
121 |         session.run(tf.initialize_all_variables())
122 |         for name, variables in self._component_variables.items():
123 |             print(name)
124 |             for var in variables:
125 |                 print(var.eval())
126 |             if name == 'P':
127 |                 print('C')
128 |                 for var in variables:
129 |                     print(np.linalg.inv(var.eval()))
130 |         self._train = lambda data: session.run(
131 |             train_op, feed_dict={X: data}
132 |         )
133 |         self.score_samples = lambda data: session.run(
134 |             self._loglik, feed_dict={X: data}
135 |         )
136 |         self._compute_loss = lambda data: session.run(
137 |             self._loss, feed_dict={X: data}
138 |         )
139 |         self.score = lambda data: -self._compute_loss(data)
140 | 
141 |     def _log_likelihood_one_gaussian(self, n_features, X, mu, P, d):
142 |         X_mu = X - mu
143 |         X_muTPX_mu = tf.reduce_sum(
144 |             tf.mul(X_mu, tf.matmul(X_mu, P)),
145 |             reduction_indices=1)
146 |         # logdet(C) = -logdet(P) as C is the inverse of P
147 |         # logdet(P) = 2 * logdet(L) = 2 * sum_i d_i
148 |         return (-0.5 * n_features * tf.log(2 * np.pi) + tf.reduce_sum(d) - 0.5
149 |                 * X_muTPX_mu)
150 | 
151 | 
152 |     def fit(self, X_train, X_val=None):
153 |         if X_val is None:
154 |             X_train, X_val = train_test_split(X_train, test_size=0.1,
155 |                                               random_state=self.random_seed)
156 |         n_samples, n_features = X_train.shape
157 |         self._make_model(n_features=n_features)
158 |         batch_sampler = EpochSampler(X_train, n_epochs=self.max_iter,
159 |                                      batch_size=self.batch_size,
160 |                                      random_seed=self.random_seed)
161 |         best_val_loss = self._compute_loss(X_val)
162 |         patience = self.patience
163 |         for n_seen, epoch, (X_batch,) in batch_sampler:
164 |             self._train(X_batch)
165 |             if n_seen % 100 == 0:
166 |                 # XXX: ensure that this is a multiple of batch_size
167 |                 val_loss = self._compute_loss(X_val)
168 |                 if val_loss < best_val_loss:
169 |                     best_val_loss = val_loss
170 |                     patience = self.patience
171 |                 else:
172 |                     patience -= 1
173 |                     if patience == 0:
174 |                         break
175 |         self.n_iter_ = epoch + 1
176 | 


--------------------------------------------------------------------------------
/letor_cluster/letor_gridpoint.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | import sys
  3 | import os
  4 | import json
  5 | from time import time
  6 | import numpy as np
  7 | 
  8 | from sklearn.externals import joblib
  9 | from sklearn.ensemble import GradientBoostingRegressor
 10 | 
 11 | 
 12 | def dcg(relevances, rank=10):
 13 |     """Discounted cumulative gain at rank (DCG)"""
 14 |     relevances = np.asarray(relevances)[:rank]
 15 |     n_relevances = len(relevances)
 16 |     if n_relevances == 0:
 17 |         return 0.
 18 | 
 19 |     discounts = np.log2(np.arange(n_relevances) + 2)
 20 |     return np.sum(relevances / discounts)
 21 |  
 22 |  
 23 | def ndcg(relevances, rank=10):
 24 |     """Normalized discounted cumulative gain (NDGC)"""
 25 |     best_dcg = dcg(sorted(relevances, reverse=True), rank)
 26 |     if best_dcg == 0:
 27 |         return 0.
 28 | 
 29 |     return dcg(relevances, rank) / best_dcg
 30 | 
 31 | 
 32 | def mean_ndcg(y_true, y_pred, query_ids, rank=10):
 33 |     y_true = np.asarray(y_true)
 34 |     y_pred = np.asarray(y_pred)
 35 |     query_ids = np.asarray(query_ids)
 36 |     # assume query_ids are sorted
 37 |     ndcg_scores = []
 38 |     previous_qid = query_ids[0]
 39 |     previous_loc = 0
 40 |     for loc, qid in enumerate(query_ids):
 41 |         if previous_qid != qid:
 42 |             chunk = slice(previous_loc, loc)
 43 |             ranked_relevances = y_true[chunk][np.argsort(y_pred[chunk])[::-1]]
 44 |             ndcg_scores.append(ndcg(ranked_relevances, rank=rank))
 45 |             previous_loc = loc
 46 |         previous_qid = qid
 47 | 
 48 |     chunk = slice(previous_loc, loc + 1)
 49 |     ranked_relevances = y_true[chunk][np.argsort(y_pred[chunk])[::-1]]
 50 |     ndcg_scores.append(ndcg(ranked_relevances, rank=rank))
 51 |     return np.mean(ndcg_scores)
 52 | 
 53 | 
 54 | job_folder = sys.argv[1]
 55 | with open(job_folder + '/parameters.json', 'r') as f:
 56 |     parameters = json.load(f)
 57 | 
 58 | with open(job_folder + '/data.json', 'r') as f:
 59 |     data_filenames = json.load(f)
 60 | 
 61 | 
 62 | print("Loading the data...")
 63 | tic = time()
 64 | X_train, y_train, qid_train = joblib.load(data_filenames['train'],
 65 |                                           mmap_mode='r')
 66 | X_vali, y_vali, qid_vali = joblib.load(data_filenames['validation'],
 67 |                                        mmap_mode='r')
 68 | # warm up (load the data from the drive)
 69 | X_train.max(), X_vali.max()
 70 | data_load_time = time() - tic
 71 | print("done in{:.3f}s".format(data_load_time))
 72 | 
 73 | print("Training the model with parameters:")
 74 | print(parameters)
 75 | tic = time()
 76 | model = GradientBoostingRegressor(random_state=0)
 77 | model.set_params(**parameters)
 78 | model.fit(X_train, y_train)
 79 | training_time = time() - tic
 80 | print("done in{:.3f}s".format(training_time))
 81 | 
 82 | print("Computing training NDGC@10...")
 83 | tic = time()
 84 | y_pred = model.predict(X_train)
 85 | prediction_time = time() - tic
 86 | train_score = mean_ndcg(y_train, y_pred, qid_train)
 87 | print("{:.3f}".format(train_score))
 88 | print("done in{:.3f}s".format(prediction_time))
 89 | 
 90 | print("Computing validation NDGC@10...")
 91 | y_pred = model.predict(X_vali)
 92 | validation_score = mean_ndcg(y_vali, y_pred, qid_vali)
 93 | print("{:.3f}".format(validation_score))
 94 | 
 95 | model_filename = job_folder + '/model.pkl'
 96 | print("Saving model to {}".format(model_filename))
 97 | tic = time()
 98 | model_filenames = joblib.dump(model, model_filename)
 99 | model_save_time = time() - tic
100 | print("done in{:.3f}s".format(model_save_time))
101 | model_size_bytes = 0
102 | for filename in model_filenames:
103 |     model_size_bytes += os.stat(filename).st_size
104 | 
105 | results = {
106 |     'data_load_time': data_load_time,
107 |     'training_time': training_time,
108 |     'prediction_time': prediction_time,
109 |     'model_save_time': model_save_time,
110 |     'model_size_bytes': model_size_bytes,
111 |     'train_score': train_score,
112 |     'validation_score': validation_score,
113 |     'model_filename': model_filename,
114 | }
115 | 
116 | with open(job_folder + '/results.json', 'wb') as f:
117 |     f.write(json.dumps(results).encode('utf-8'))
118 | 


--------------------------------------------------------------------------------
/letor_cluster/letor_gridresults.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import os
 3 | import os.path as op
 4 | import json
 5 | 
 6 | 
 7 | def collect_results(jobs_folder):
 8 |     entries = []
 9 | 
10 |     for job_folder in os.listdir(jobs_folder):
11 |         results_filename = op.join(
12 |             jobs_folder, job_folder, 'results.json')
13 |         parameters_filename = op.join(
14 |             jobs_folder, job_folder, 'parameters.json')
15 | 
16 |         if (not op.exists(parameters_filename)
17 |                 or not op.exists(results_filename)):
18 |             continue
19 | 
20 |         new_entry = dict()
21 | 
22 |         with open(parameters_filename, 'r') as f:
23 |             new_entry.update(json.load(f))
24 | 
25 |         with open(results_filename, 'r') as f:
26 |             new_entry.update(json.load(f))
27 | 
28 |         entries.append(new_entry)
29 | 
30 |     return pd.DataFrame(entries)
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     results = collect_results('/scratch/ogrisel/grid_jobs')
35 |     results.to_json('letor_gridresults.json')


--------------------------------------------------------------------------------
/letor_cluster/letor_gridsearch.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import os
 3 | from random import Random
 4 | import json
 5 | 
 6 | from sklearn.externals import joblib
 7 | from sklearn.grid_search import ParameterGrid
 8 | 
 9 | 
10 | MSLR_DATA = '/scratch/ogrisel/mslr-web10k_fold1.npz'
11 | DATA_FOLDER = '/home/parietal/ogrisel/data'
12 | TRAIN_SAMPLE_DATA = DATA_FOLDER + '/mslr-web10k_fold1_train_500.pkl'
13 | VALI_DATA = DATA_FOLDER + '/mslr-web10k_fold1_vali.pkl'
14 | GRID_JOBS_FOLDER = '/scratch/ogrisel/grid_jobs'
15 | 
16 | rng = Random(42)
17 | 
18 | 
19 | def subsample(X, y, qid, size, seed=None):
20 |     rng = np.random.RandomState(seed)
21 |     unique_qid = np.unique(qid)
22 |     qid_mask = rng.permutation(len(unique_qid))[:size]
23 |     subset_mask = np.in1d(qid_train, unique_qid[qid_mask])
24 |     return X[subset_mask], y[subset_mask], qid[subset_mask]
25 | 
26 | 
27 | if not os.path.exists(TRAIN_SAMPLE_DATA) or not os.path.exists(VALI_DATA):
28 |     if not os.path.exists(DATA_FOLDER):
29 |         os.makedirs(DATA_FOLDER)
30 | 
31 |     data = np.load(os.path.expanduser(MSLR_DATA))
32 |     X_train, y_train, qid_train = data['X_train'], data['y_train'], data['qid_train']
33 |     X_vali, y_vali, qid_vali = data['X_vali'], data['y_vali'], data['qid_vali']
34 | 
35 |     X_train_small, y_train_small, qid_train_small = subsample(
36 |         X_train, y_train, qid_train, 500, seed=0)
37 | 
38 |     joblib.dump((X_train_small, y_train_small, qid_train_small),
39 |                 TRAIN_SAMPLE_DATA)
40 |     joblib.dump((X_vali, y_vali, qid_vali), VALI_DATA)
41 | 
42 | 
43 | if not os.path.exists(GRID_JOBS_FOLDER):
44 |     os.makedirs(GRID_JOBS_FOLDER)
45 | 
46 | 
47 | params = {
48 |     'max_features': [10, 20, 50, 100],
49 |     'max_depth': [2, 3, 4, 5],
50 |     'subsample': [0.5, 0.8, 1.0],
51 |     'loss': ['ls', 'huber', 'quantile'],
52 |     'learning_rate': [0.05, 0.1, 0.5],
53 | }
54 | 
55 | for i, param in enumerate(ParameterGrid(params)):
56 |     params_description = json.dumps(param)
57 |     job_id = joblib.hash(params_description)
58 |     job_folder = GRID_JOBS_FOLDER + '/' + job_id
59 |     if not os.path.exists(job_folder):
60 |         os.makedirs(job_folder)
61 |     with open(job_folder + '/parameters.json', 'wb') as f:
62 |         f.write(params_description.encode('utf-8'))
63 | 
64 |     data_filenames = {'train': TRAIN_SAMPLE_DATA, 'validation': VALI_DATA}
65 |     with open(job_folder + '/data.json', 'wb') as f:
66 |         f.write(json.dumps(data_filenames).encode('utf-8'))
67 | 
68 |     cmd = 'qsub -V -cwd letor_gridpoint.py {}'.format(job_folder)
69 |     os.system(cmd)
70 | 
71 |     # if i > 100:
72 |     #     break


--------------------------------------------------------------------------------
/nmf_topics.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "worksheets": [
  3 |         {
  4 |             "cells": [
  5 |                 {
  6 |                     "source": "# Topics extraction with Non-Negative Matrix Factorization\n\nThis is a proof of concept application of Non Negative Matrix\nFactorization of the term frequency matrix of a corpus of documents so\nas to extract an additive model of the topic structure of the corpus.", 
  7 |                     "cell_type": "markdown"
  8 |                 }, 
  9 |                 {
 10 |                     "source": "## Load the 20 newsgroups dataset", 
 11 |                     "cell_type": "markdown"
 12 |                 }, 
 13 |                 {
 14 |                     "cell_type": "code", 
 15 |                     "language": "python", 
 16 |                     "outputs": [], 
 17 |                     "collapsed": false, 
 18 |                     "prompt_number": 14, 
 19 |                     "input": "from sklearn import datasets\ndataset = datasets.fetch_20newsgroups(shuffle=True, random_state=1)\nprint dataset.target_names[dataset.target[0]]"
 20 |                 }, 
 21 |                 {
 22 |                     "cell_type": "code", 
 23 |                     "language": "python", 
 24 |                     "outputs": [], 
 25 |                     "collapsed": false, 
 26 |                     "prompt_number": 15, 
 27 |                     "input": "print dataset.data[0]"
 28 |                 }, 
 29 |                 {
 30 |                     "source": "## Restrict the dimensions of the problem\n\nFor shorter computation times.", 
 31 |                     "cell_type": "markdown"
 32 |                 }, 
 33 |                 {
 34 |                     "cell_type": "code", 
 35 |                     "language": "python", 
 36 |                     "outputs": [], 
 37 |                     "collapsed": true, 
 38 |                     "prompt_number": 16, 
 39 |                     "input": "n_samples = 1000\nn_features = 1000"
 40 |                 }, 
 41 |                 {
 42 |                     "source": "## Vectorize to compute word frequencies for each document\n\nRestrict to the most common word frequency and use TF-IDF weighting (without top 5% stop words)", 
 43 |                     "cell_type": "markdown"
 44 |                 }, 
 45 |                 {
 46 |                     "cell_type": "code", 
 47 |                     "language": "python", 
 48 |                     "outputs": [], 
 49 |                     "collapsed": false, 
 50 |                     "prompt_number": 17, 
 51 |                     "input": "from sklearn.feature_extraction import text\n\nvectorizer = text.CountVectorizer(max_df=0.95, max_features=n_features)\ncounts = vectorizer.fit_transform(dataset.data[:n_samples])\n\ntfidf = text.TfidfTransformer().fit_transform(counts)\ntfidf"
 52 |                 }, 
 53 |                 {
 54 |                     "source": "Convert from a `scipy.sparse.csr_matrix` representation to a dense `numpy` array and remove negative values.", 
 55 |                     "cell_type": "markdown"
 56 |                 }, 
 57 |                 {
 58 |                     "cell_type": "code", 
 59 |                     "language": "python", 
 60 |                     "outputs": [], 
 61 |                     "collapsed": false, 
 62 |                     "prompt_number": 18, 
 63 |                     "input": "tfidf.toarray()"
 64 |                 }, 
 65 |                 {
 66 |                     "source": "## Extract some topics with Non-negative Matrix Factorization", 
 67 |                     "cell_type": "markdown"
 68 |                 }, 
 69 |                 {
 70 |                     "cell_type": "code", 
 71 |                     "language": "python", 
 72 |                     "outputs": [], 
 73 |                     "collapsed": true, 
 74 |                     "prompt_number": 19, 
 75 |                     "input": "from sklearn import decomposition\nn_topics = 5\n\nnmf = decomposition.NMF(n_components=n_topics).fit(tfidf)"
 76 |                 }, 
 77 |                 {
 78 |                     "cell_type": "code", 
 79 |                     "language": "python", 
 80 |                     "outputs": [], 
 81 |                     "collapsed": false, 
 82 |                     "prompt_number": 20, 
 83 |                     "input": "print nmf"
 84 |                 }, 
 85 |                 {
 86 |                     "cell_type": "code", 
 87 |                     "language": "python", 
 88 |                     "outputs": [], 
 89 |                     "collapsed": false, 
 90 |                     "prompt_number": 21, 
 91 |                     "input": "print nmf.components_"
 92 |                 }, 
 93 |                 {
 94 |                     "source": "## Display the most important words for each extracted topic\n\nReuse the vocabulary of the vectorizer to find the words names from the matrix positions.", 
 95 |                     "cell_type": "markdown"
 96 |                 }, 
 97 |                 {
 98 |                     "cell_type": "code", 
 99 |                     "language": "python", 
100 |                     "outputs": [], 
101 |                     "collapsed": false, 
102 |                     "prompt_number": 22, 
103 |                     "input": "n_top_words = 12\ninverse_vocabulary = dict((v, k) for k, v in vectorizer.vocabulary.iteritems())\n\nfor topic_idx, topic in enumerate(nmf.components_):\n    print \"Topic #%d: \" % topic_idx,\n    print \" \".join([inverse_vocabulary[i]\n                    for i in topic.argsort()[:-(n_top_words + 1):-1]])\n    print"
104 |                 }, 
105 |                 {
106 |                     "input": "", 
107 |                     "cell_type": "code", 
108 |                     "collapsed": true, 
109 |                     "language": "python", 
110 |                     "outputs": []
111 |                 }
112 |             ]
113 |         }
114 |     ], 
115 |     "metadata": {
116 |         "name": "nmf_topics"
117 |     }, 
118 |     "nbformat": 2
119 | }


--------------------------------------------------------------------------------
/screenshots/digits.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ogrisel/notebooks/09692cc11f4d75cc31e4817e053a0b011b76680f/screenshots/digits.png


--------------------------------------------------------------------------------
/screenshots/topics.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ogrisel/notebooks/09692cc11f4d75cc31e4817e053a0b011b76680f/screenshots/topics.png


--------------------------------------------------------------------------------
/sklearn_demos/Language Classification.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "metadata": {
  3 |   "name": "",
  4 |   "signature": "sha256:2f7a72cbc8c7a4f9909a0ab9a42a77c3b1259c4fbbbcc4e2dfb3324fdfe6ab7e"
  5 |  },
  6 |  "nbformat": 3,
  7 |  "nbformat_minor": 0,
  8 |  "worksheets": [
  9 |   {
 10 |    "cells": [
 11 |     {
 12 |      "cell_type": "code",
 13 |      "collapsed": false,
 14 |      "input": [
 15 |       "%matplotlib inline\n",
 16 |       "import numpy as np\n",
 17 |       "import matplotlib.pyplot as plt\n",
 18 |       "import sys\n",
 19 |       "from sklearn.datasets import load_files\n",
 20 |       "from sklearn.cross_validation import train_test_split"
 21 |      ],
 22 |      "language": "python",
 23 |      "metadata": {},
 24 |      "outputs": [],
 25 |      "prompt_number": 19
 26 |     },
 27 |     {
 28 |      "cell_type": "heading",
 29 |      "level": 2,
 30 |      "metadata": {},
 31 |      "source": [
 32 |       "Dataset collection (from Wikipedia)"
 33 |      ]
 34 |     },
 35 |     {
 36 |      "cell_type": "code",
 37 |      "collapsed": false,
 38 |      "input": [
 39 |       "dataset = load_files('language/paragraphs')\n",
 40 |       "docs_train, docs_test, y_train, y_test = train_test_split(\n",
 41 |       "    dataset.data, dataset.target, test_size=0.5, random_state=0)"
 42 |      ],
 43 |      "language": "python",
 44 |      "metadata": {},
 45 |      "outputs": [],
 46 |      "prompt_number": 20
 47 |     },
 48 |     {
 49 |      "cell_type": "code",
 50 |      "collapsed": false,
 51 |      "input": [
 52 |       "for example, lang_code in list(zip(docs_train, y_train))[:3]:\n",
 53 |       "    print(example.decode('utf-8'))\n",
 54 |       "    print(\"=> %s\\n\" % dataset.target_names[lang_code])"
 55 |      ],
 56 |      "language": "python",
 57 |      "metadata": {},
 58 |      "outputs": [
 59 |       {
 60 |        "output_type": "stream",
 61 |        "stream": "stdout",
 62 |        "text": [
 63 |         "In 2005 publiceerde het natuurwetenschappelijke tijdschrift Nature de resultaten van een vergelijkend onderzoek naar de kwaliteit van artikelen in de Engelse Wikipedia (WP) en de Encyclop\u00e6dia Britannica.[18] Universitaire deskundigen bogen zich over natuurwetenschappelijke teksten, zonder te weten uit welke encyclopedie ze kwamen. In 42 paren van overeenkomstige artikelen uit beide encyclopedie\u00ebn vonden ze zowel in WP als in EB totaal acht ernstige fouten. Gemiddeld bevatte een WP-artikel vier en een EB-artikel drie foutjes, weglatingen of misleidende beweringen. Nature concludeerde dat hoewel de schrijfstijl van de Brittanica veel beter was, de Wikipedia op natuurwetenschappelijk gebied bijna net zo goed was als de Britannica. Orlowski formuleerde het als volgt: de kwaliteit van informatie in deze artikelen was in Wikipedia daarmee 31 procent minder dan in de Britannica.[19] Het betrof hier een betrekkelijk kleine selectie van artikelen over exacte wetenschappen en techniek, en veel van de aangetroffen 'fouten' betroffen meningsverschillen tussen de onderzoekers van Nature en de redacteuren van de Britannica over welke feiten vermeld zouden moeten worden in een encyclopedie.[20] In maart 2006 publiceerde de Encyclop\u00e6dia Britannica onder de titel \"Fatally Flawed\" een weerlegging van de onderzoeksresultaten van Nature.[21]\n",
 64 |         "=> nl\n",
 65 |         "\n",
 66 |         "Il existe \u00e9galement un classement qualitatif fond\u00e9 sur l'existence et la taille des articles d'une liste arbitraire d'environ 1\u00a0000 articles que toute \u00e9dition de Wikip\u00e9dia devrait avoir[note 18].\n",
 67 |         "=> fr\n",
 68 |         "\n",
 69 |         "On January 18, 2012, the English Wikipedia participated in a series of coordinated protests against two proposed laws in the United States Congress\u2014the Stop Online Piracy Act (SOPA) and the PROTECT IP Act (PIPA)\u2014by blacking out its pages for 24 hours.[144] More than 162 million people viewed the blackout explanation page that temporarily replaced Wikipedia content.[145][146]\n",
 70 |         "=> en\n",
 71 |         "\n"
 72 |        ]
 73 |       }
 74 |      ],
 75 |      "prompt_number": 21
 76 |     },
 77 |     {
 78 |      "cell_type": "heading",
 79 |      "level": 2,
 80 |      "metadata": {},
 81 |      "source": [
 82 |       "Model fitting"
 83 |      ]
 84 |     },
 85 |     {
 86 |      "cell_type": "code",
 87 |      "collapsed": false,
 88 |      "input": [
 89 |       "from sklearn.feature_extraction.text import TfidfVectorizer\n",
 90 |       "from sklearn.linear_model import Perceptron\n",
 91 |       "from sklearn.pipeline import make_pipeline\n",
 92 |       "\n",
 93 |       "vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char',\n",
 94 |       "                             use_idf=False)\n",
 95 |       "\n",
 96 |       "clf = make_pipeline(vectorizer, Perceptron())\n",
 97 |       "clf.fit(docs_train, y_train);"
 98 |      ],
 99 |      "language": "python",
100 |      "metadata": {},
101 |      "outputs": [],
102 |      "prompt_number": 22
103 |     },
104 |     {
105 |      "cell_type": "heading",
106 |      "level": 2,
107 |      "metadata": {},
108 |      "source": [
109 |       "Model evaluation"
110 |      ]
111 |     },
112 |     {
113 |      "cell_type": "code",
114 |      "collapsed": false,
115 |      "input": [
116 |       "sentences = [\n",
117 |       "    \"This is a language detection test.\",\n",
118 |       "    \"Ceci est un test de d\u00e9tection de la langue.\",\n",
119 |       "    \"Das ist eine Spracherkennungstest.\",\n",
120 |       "    \"Je suis au S\u00e9nat pour pr\u00e9senter l'analyse pr\u00e9dictive de donn\u00e9es.\"\n",
121 |       "]\n",
122 |       "predicted = clf.predict(sentences)\n",
123 |       "\n",
124 |       "for s, p in zip(sentences, predicted):\n",
125 |       "    print(u'The language of \"%s\" is \"%s\"' % (s, dataset.target_names[p]))"
126 |      ],
127 |      "language": "python",
128 |      "metadata": {},
129 |      "outputs": [
130 |       {
131 |        "output_type": "stream",
132 |        "stream": "stdout",
133 |        "text": [
134 |         "The language of \"This is a language detection test.\" is \"en\"\n",
135 |         "The language of \"Ceci est un test de d\u00e9tection de la langue.\" is \"fr\"\n",
136 |         "The language of \"Das ist eine Spracherkennungstest.\" is \"de\"\n",
137 |         "The language of \"Je suis au S\u00e9nat pour pr\u00e9senter l'analyse pr\u00e9dictive de donn\u00e9es.\" is \"fr\"\n"
138 |        ]
139 |       }
140 |      ],
141 |      "prompt_number": 23
142 |     },
143 |     {
144 |      "cell_type": "code",
145 |      "collapsed": false,
146 |      "input": [
147 |       "from sklearn.metrics import classification_report\n",
148 |       "\n",
149 |       "y_predicted = clf.predict(docs_test)\n",
150 |       "print(classification_report(y_test, y_predicted,\n",
151 |       "                            target_names=dataset.target_names))"
152 |      ],
153 |      "language": "python",
154 |      "metadata": {},
155 |      "outputs": [
156 |       {
157 |        "output_type": "stream",
158 |        "stream": "stdout",
159 |        "text": [
160 |         "             precision    recall  f1-score   support\n",
161 |         "\n",
162 |         "         ar       1.00      1.00      1.00        14\n",
163 |         "         de       0.98      1.00      0.99        47\n",
164 |         "         en       1.00      1.00      1.00        77\n",
165 |         "         es       1.00      1.00      1.00        45\n",
166 |         "         fr       1.00      0.98      0.99        59\n",
167 |         "         it       0.98      1.00      0.99        45\n",
168 |         "         ja       1.00      0.97      0.99        35\n",
169 |         "         nl       1.00      1.00      1.00        18\n",
170 |         "         pl       1.00      0.95      0.97        20\n",
171 |         "         pt       0.98      1.00      0.99        47\n",
172 |         "         ru       1.00      1.00      1.00        26\n",
173 |         "\n",
174 |         "avg / total       0.99      0.99      0.99       433\n",
175 |         "\n"
176 |        ]
177 |       }
178 |      ],
179 |      "prompt_number": 18
180 |     },
181 |     {
182 |      "cell_type": "code",
183 |      "collapsed": false,
184 |      "input": [
185 |       "from sklearn.decomposition import TruncatedSVD\n",
186 |       "from itertools import cycle\n",
187 |       "\n",
188 |       "\n",
189 |       "X_train = vectorizer.fit_transform(docs_train)\n",
190 |       "X_pca = TruncatedSVD(50).fit_transform(X_train)\n",
191 |       "\n",
192 |       "for i, c in zip(np.unique(y_train)[:5],\n",
193 |       "                cycle(['r', 'g', 'b', 'c', 'm', 'y'])):\n",
194 |       "    mask = y_train == i\n",
195 |       "    language = dataset.target_names[i]\n",
196 |       "    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], color=c, label=language)\n",
197 |       "    \n",
198 |       "plt.legend(loc='best');"
199 |      ],
200 |      "language": "python",
201 |      "metadata": {},
202 |      "outputs": [
203 |       {
204 |        "metadata": {},
205 |        "output_type": "display_data",
206 |        "png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEACAYAAABI5zaHAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl4lOW9//H3PQmQsAQSUBAcEQQVDSJLUKTWFCSxeIlS\nRbBaty500eOpOedXFUWEQ2k9J7081f56qQe05Yj08LNQpepAPYKH4xaCpAYXiGgMsmgYVrNn7t8f\nz0xmySSZMFknn9d1zcU8yzzPPQ/wzZ3vvRlrLSIiklhcnV0AERFpewruIiIJSMFdRCQBKbiLiCQg\nBXcRkQSk4C4ikoDiDu7GmKuMMR8ZY/YYY34R5fi1xpgiY8x7xphCY8yMeO8pIiLNM/H0czfGJAEf\nA1cCXwAFwE3W2g9Dzulnrf3a/348sN5aOyauUouISLPirblPBUqstZ9Za2uBtcC1oScEArtff6A8\nznuKiEgL4g3uI4CykO19/n1hjDHXGWM+BF4B/iHOe4qISAviDe4x5XSstRusteOAa4DVcd5TRERa\nkBzn578A3CHbbpzae1TW2v8xxiQbYwZbaw+HHjPGaJIbEZFWstaaaPvjrblvB8YaY842xvQG5gMv\nhp5gjDnHGGP87yf5C3O40ZWc/Xq18Hr44Yc7vQzd4aXnpGfVE55Tc+KquVtr64wxdwEeIAlYaa39\n0Biz0H/8SeB64FZjTC1wElgQzz1FRKRl8aZlsNa+gtNQGrrvyZD3jwKPxnsfERGJnUaodjPZ2dmd\nXYRuQc8pdnpWseluzymuQUxtyRhju0pZRES6A2MMtokG1bjTMiIincnfXyPhtbbyq+AuIt1eov/W\nfyo/wJRzFxFJQAruIiIJSMFdRCQBKbiLiCQgBXcRkQSk3jIi0nN5vfDss3D8OFx9NWRldVpR6uvr\nSUpKarPrqeYuIomrqAjy82HVKqioCD/m9cL48fDAA7BsGWRnw0svtXkRfvWrXzFmzBjS0tK48MIL\n2bBhAwDPPvss06dP595772XIkCE88sgjbXpfBXcRSUwvvQTTpjnB++67YfLk8AD/1FNQXg7V1eDz\nOcfuuSf8GqWlcMkl0LcvnHsu7NjR6mKMGTOGbdu2cfz4cR5++GFuueUWDh48CMC7777LOeecw5df\nfskDDzwQz7dtRMG9LXg8kJPjvDyezi6NiAAsXAiVlVBT4wTuzz+H1SFrBR096hwLdeJE8H19vVOb\nLyx0rrNnD8yYAYejzljepBtuuIFhw4YBcOONNzJ27FjeffddAIYPH87PfvYzXC4XKSkpp/Itm6Tg\nHi+PB+bOhc2bndfcuQrwIl3B8ePh29XV4YH5mmucGnlAaipcG7IE9L598OWXTpAPVVjYqmL88Y9/\nZOLEiaSnp5Oenk5xcTHl5eUYY3C73S1f4BQpuMcrP9/5qR5QWensE5HONXMm9OkT3O7d29kXMH26\nk4sfMQLS02H+fHjiieDxgQOhri78mnV1MGhQzEUoLS3lRz/6Eb/73e/wer0cOXKEzMzMhukS2nNe\nHAX3WCjtItL9rF4NV17pBPjBg2HlSid/Hmr+fKeG7vXCM89AaGpk0CC4917o1w9cLufPmTNb1aPm\n66+/xhjDkCFD8Pl8PPPMMxQXFwPtPx+OukI2xeNxauDl5bBrVzA3t20brF8PubnOdl6esy9Qe09N\ndfaJSOdKS4ONG+O7xooVcPnlTkPq6NGwYAG0orZ9wQUXkJeXx7Rp03C5XNx666184xvfwBjT8Gov\nms89muXLYfFipwU9mlmzYNOm4HbgBwE4gT0Q+EWk3fnnNO/sYrSrpr5jc/O5K7hH8njg29+G5soS\nGdxFpNMouEcP7sq5R7r//uYDu9IuItINKLhHKi1t+lhGRni+XUSki1JwjzRyZNPHJk9WYBeRbkHB\nPdL110ffr3SMiHQjCu6Rtm5tvE/pGBHpZhTcY6F0jIh0MwrukfLynBRMgNIxItINqZ97NBqUJNJt\ndLd+7rfffjtut5tly5bF/JlT6eeu6Qeiyc1VQBeRdtHe0w4E9Iy0jCb+EpEovJVefvPWb1iyZQkF\nXxR02H074jeNxA/umm9dpMcqOlhE/pv5rHpvFRW14cvseSu9jP/9eB547QGWbV1G9rPZvPRx2y+z\n99577zFp0iTS0tJYsGABVVVVDcc2btzIxRdfTHp6OtOnT+f9999vs/smfnDvbvOt67cMkTbx0scv\nMW3lNB747we4+5W7mfzU5LAA/1ThU5RXlFNdX40PHxV1Fdzzavgye6VHS7nk6Uvou7wv5z5+LjsO\ntG6ZvZqaGq677jpuu+02jhw5wrx583jhhRcwxvDee+/x/e9/n6effhqv18vChQuZM2cONZGrQ52i\nxA/u3Yl+yxBpMws3LqSyrpKa+hoqaiv4/OjnrC4KLrN3tOooNfXhgfREdXCZvXpfPdl/yKbwQCGV\ndZXs8e5hxh9mcLgi9mX23n77berq6rjnnntISkri+uuvJysrC2stTz/9NAsXLiQrKwtjDLfeeit9\n+vTh7bffjv/L0xOCe3fq2tjdfssQ6cKOV4cvs1ddX83hymBgvubca+jbK7jMXmpyKteeH1xmb9/x\nfXz59ZfU2/Bl9goPxL7M3v79+xkxYkTYvpH+KU5KS0vJz89vWH4vPT2dffv2ceDAgZiv35zED+65\nuc7o0lmznJdGmor0CDNHz6RPUnCZvd5JvZk5KrjM3vSzprNqzipGDBhBeko68y+czxOzg8vsDUwZ\nSJ0vfJm9Ol8dg1JiX2bvjDPO4IsvvgjbV+qfnNDtdrNo0SKOHDnS8Dp58iTz589v1fdskrW2S7yc\novRwr75qbWqqtc6kw877V1/t7FKJdGlNxY5jVcfs1c9dbfss62MH/3qwXfP3Na2+9n2b77P9lvez\nrkdctt/yfnbOmjnW5/PF/Pmamhp71lln2X//93+3NTU19oUXXrC9evWyDz30kN2+fbt1u932nXfe\nsT6fz548edJu3LjRnjhxIubv6N8fNaZqEFNXowFUIq3S3oOYXt7zMjsO7GB0+mgWZC7AZVqX8Cgs\nLOSHP/whJSUlzJ49G2MMY8eOZenSpXg8Hh566CH27NlDamoql19+OStXrqR///5h1+iUlZiMMVcB\njwFJwH9Ya38dcfxm4P8ABjgB/MRa+/co11FwF5FW624jVE9Fh6/EZIxJAp4ArgIuAG4yxoyLOG0v\n8E1r7UXAMuCpeO4pIiIti7dBdSpQYq39zFpbC6wFrg09wVr7lrX2mH/zHeDMOO8pIiItiDe4jwDK\nQrb3+fc15fvAy3HeU0REWhDvxGExJ7qMMd8C7gSmN3XOkiVLGt5nZ2eTnZ0dR9FERBLLli1b2LJl\nS0znxtWgaoy5FFhirb3Kv30/4IvSqHoR8GfgKmttSRPXUoOqiLSaGlTboUEV2A6MNcacbYzpDcwH\nXoy4+Vk4gf2WpgK7iIi0rbjSMtbaOmPMXYAHpyvkSmvth8aYhf7jTwKLgXTg9/45jGuttVPjK7aI\niDRHg5hEpFtTWqZ90jIiItIFKbiLiCQgraEqIj2W1wvPPgvHj8PVV0NWVmeXqO2o5i4iCauoyJmH\nb9UqqAhfZQ+vF8aPhwcegGXLIDsbXmr7VfbYv38/119/PaeffjqjR4/m8ccfB5xxPTfeeCO33XYb\naWlpZGZmUlgY+1zxLVFwF5GE9NJLMG2aE7zvvhsmTw4P8E89BeXlUF0NPp9z7J7wVfYoLYVLLoG+\nfeHcc2FH61bZw+fzcc011zBx4kT279/Pa6+9xmOPPcamTZv8ZXyJm266iWPHjjFnzhzuuuuuOL91\nkIJ7KK1fKpIwFi50FjOrqXEC9+efw+rgKnscPeocC3UiuMoe9fVObb6w0LnOnj0wYwYcjn2VPQoK\nCigvL+fBBx8kOTmZUaNG8YMf/IC1a9dijOHyyy/nqquuwhjDLbfcQlFRUVzfOZRy7gGB9UsDy9xt\n26ZVm0S6sePhq+xRXR0emK+5Bh5/PFibT02Fa0OmPdy3D7780gnyoQoLnfpfLEpLS9m/fz/p6ekN\n++rr6/nmN7/JyJEjGTp0aMP+vn37UlVVhc/nw+WKv96tmnuA1i8VSSgzZ0Kf4Cp79O7t7AuYPt3J\nxY8YAenpMH8+PBFcZY+BA6EufJU96upgUOyr7HHWWWcxatSosKX0jh8/zsaNG0/tS7WCgruIJKTV\nq+HKK50AP3gwrFzp5M9DzZ/v1NC9XnjmGUhJCR4bNAjuvRf69QOXy/lz5szW9aiZOnUqAwYM4NFH\nH6WyspL6+nqKi4vZvn1723zJZigtE5CX56RiArX31FRnn4h0S2lpEG8FecUKuPxypyF19GhYsABM\n1PGg0blcLjZu3EheXh6jR4+murqa888/n2XLlgHOCNNQkdvx0PQDobR+qUi3o+kH2mkN1bbS7sFd\ngVskISm49+TgHtkTJjU1vCeMAr9It6Xg3pODe04ObN4cvm/WLNi0qeXALyJdmoK7ZoWMTl0gRSQB\n9Yzgnpfn1MgD1BNGRBJczwjuublOqmXWLOcVmnZR4BeRBNQzcu4tUYOqSLelnHtPblAVkYSl4K4G\nVRGRHkPBXUQkAWluGRHpsby1tTx78CDH6+q4evBgstLSOrtIbUY1dxFJWEUnT5JfVsaqAweoiJiY\n3Vtby/iCAh7Yu5dlpaVk79zJS+XlbV6GppbZe/fdd5kyZQoDBw5k2LBh5LVxLz3V3EUkIb1UXs78\nDz6g3lqSjeFfy8oonDyZvklJADy1fz/ltbXU+BsqK3w+7ikp4ZohQxquUVpVxY27dvH+119zZp8+\nrL3gAiYNGBBzGQLL7M2dO5c//elPlJWVceWVV3Leeefx8MMP8/Of/5ybb76ZiooK3n///Tb9/qq5\ni0hCWrh7N5U+HzXWUuHz8XlVFasPHWo4frSuriGwB5wIWZ2j3lqyd+6k8MQJKn0+9lRWMmPnTg7X\n1sZchqaW2Xv++efp3bs3e/bsoby8nL59+3JJ5GTzcVJwF5GEdDxiGaVqny8sMF8zZAh9Q5azS3W5\nuDak1r6vupova2qIWGWPwtCFVlsQusxe4LVixQq++uorVq1axe7duxk3bhxTp07lr3/9a+u+YAuU\nlhGRhDQzPR2P10u1v3be2+ViZshaptMHDmTVeeeR98knVPh8XDt4ME+MHdtwfGBSEnURNfs6axmU\nHHvYDCyzt3v37qjH16xZA8ALL7zADTfcgNfrJTV0xHwcVHMXkYS0etw4rkxPp48xDE5OZuV553FJ\nRG+Y+UOHsu+yy/B+4xs8M24cKf58PMCgXr2498wz6edy4QL6+X84ZLUi597cMnvPPfccX331FQAD\nBw7EGNMmC2MHaISqiHRr7T1C9eXDh9lx4gSjU1NZcPrpuFq5FN6BAwfIy8vj9ddfD1tm79lnn2XT\npk1UVFRw9tlns3z5cubMmRP1Gpp+QER6HE0/oOkHRER6DAV3EZEEpOAuIpKAFNxFRBKQgruISAKK\nO7gbY64yxnxkjNljjPlFlOPnG2PeMsZUGWO0fp2ISAeIa4SqMSYJeAK4EvgCKDDGvGit/TDktMPA\n3cB18dxLRERiF2/NfSpQYq39zFpbC6wFrg09wVr7lbV2OxD7bDsiIhKXeIP7CKAsZHuff5+IiAAf\nf/wxF198MWlpaTzxxBMddt94Jw5L7GFhIiJxevTRR5k5cyY7d+7s0PvGG9y/ANwh226c2vspWbJk\nScP77OxssrOzT/VSIiItqvXWcvDZg9Qdr2Pw1YNJy2r7ZfZKS0u57LLLoh7z+Xytmixsy5YtbNmy\nJaZz45pbxhiTDHwMzAT2A+8CN0U0qAbOXQKcsNbmN3EtzS0jIq3W3NwyJ4tOcuRvR0hOT+b0BaeT\n1Dc462Ott5aC8QXUHq7F1lpcKS4uWHsBQ64ZEvVap2LGjBm88cYb9OrVi+TkZObMmUNaWhqlpaW8\n8cYbvPjii8yYMeOUv2O7ThxmjPk28BiQBKy01q4wxiwEsNY+aYwZBhQAaYAPOAFcYK09GXEdBXcR\nabWmAl/5S+V8MP8DbL3FJBtSzkphcuHkhgBf+qtSPnv4M2xN8LMpo1K4dO+lDdtVpVXsunEXX7//\nNX3O7MMFay9gwKTYp/wF+Na3vsX3vvc97rzzTm6//XY2bNjAK6+8wrRp06iurqZPnz6n/B2bC+5x\nL9ZhrX0FeCVi35Mh7w8SnroREWl3uxfuxlfpA8DWWKo+r+LQ6kMMXzgcgLqjdWGBHaDuRHD1Jltv\n2Zm9k6qyKqiHyj2V7Jyxk0s/uZReg3udcrmuu+46pk2bBhBTYD9VGqEqIgmp7nj4Mnu+ah+1h4M9\nsodcMwRX32AIdKW6GHJtMCVTva+ami9riFxn70Rh7MvsRTLG4HZ3TF1XwV1EElL6zHRMn2DGwtXb\nRfrM4DJ7A6cP5LxV59F7RG+S05M5bf5pjH0iuMxe0sAkbF14zd7WWZIHdY/VSbtHKUVEWmnc6nF8\n8N0POPK3IyT1T2Ls42NJuyS8N8zQ+UMZOn9o1M/3GtSLM+89ky8e/wJfpQ9XqvPDYUBW63LuoTqy\nXVHBXUQSUnJaMhdtvCiua5yz4hwGXT6IEztOkDo6ldMXnI5p5TJ7oYwxcX2+VffqKj1U1FtGRE6F\nltnTMnsiIj2GgruISAJScBcRSUAK7iIiCUjBXUQkASm4i4gkIPVzF5Fur6P6jncnCu4i0q0leh/3\nU6W0jIhIAlJwFxFJQAruIiIJSMFdRCQBKbiLiCQgBXcRkQSk4C4ikoAU3EVEEpCCu4hIAlJwFxFJ\nQAruItKmvB4vRTlFFOUU4fV4O7s4PZbWUBWRNuP1eCmeW4yv0geAK9VF5vpMMnIzOrlkiUlrqIpI\nhyjLL2sI7AC+Sh9l+WWdWKKeS8FdRCQBKbiLSJtx57lxpQbDiivVhTvP3Ykl6rmUcxeRNuX1eBtS\nMe48t/Lt7ai5nLuCu4hIN6UGVRHpcjweyMlxXh5PZ5cm8Si4i0iH83hg7lzYvNl5zZ4Ny5fH/ln9\nUGiZ0jIi0uFycpygHsrlgpdfhtzc8P0er5f8MieHf8UhN8uvy6Cy0jmWmgrr1zf+TE/RXFpGC2SL\nSJfg80F+fnig9ni9zC0uptLn9J0/WnCUR3r1g8perMNNQWVGo8+IQ2kZEekwgZRKeTmYKPXNwsLw\nVEt+WVlDYM96F5Yus2QdP0kWR1hKMVloeoOmqOYuIh0ikGcPpFSSk6G+HkKzsV6vc060VMu8/4KU\n6uB2Cj4WuMq4ME9dLaOJu+ZujLnKGPORMWaPMeYXTZzzW//xImPMxHjvKSLdT35+MLAD1NXBxRfD\njP5eHqWIRykiCy+Vlc65AHluN67apsPUxROCPwTU0Bourpq7MSYJeAK4EvgCKDDGvGit/TDknNnA\nGGvtWGPMJcDvgUvjua+IJIYsvNxQUUwvnNTLeI6xmEzAqY3nZmSw1GayeEcZ6y6uZfyOr0nxV/Vd\nqS4uWuGMfo38rWDbNli0CLZudbbz8npeXj7etMxUoMRa+xmAMWYtcC3wYcg5c4A/AFhr3zHGDDLG\nDLXWHorz3iLSjeTlOUE3tKfLfMpw+YITjUVLtSzKyWCKzeDP/+nlSP+9DKOKAWNSGL1idMPo18jf\nCiorYfFip5EWnPv2tF418aZlRgChU77t8+9r6Zwz47yviHSSU01/5OY6AXbWLOe1fj0MHtL4vH4T\nT0BWeEOp2e7lhqJizjhxEnOijoqPKlq8X8jPjLBUT08Rb8091o7pke3i6tAu0g1FS3+0pkacmxt+\nrhc3x7Yda5gmuKoPrLyxjuLiYtZnZpKbkcHy5dDnwTKm0Hgq4UDN/Yor/P3ms7wwz1+XXOeGgp7b\n2BpvcP8CCJ3yzY1TM2/unDP9+xpZsmRJw/vs7Gyys7PjLJ6ItKVo6Y9T7Wfu8UB+fgajzs9kkusD\njlPHuhuhYCrg8zkDlwoyWLwYfhXl8ycKT1CUU4Q7z83WrRlOYF9aDCn+HwLjj8HiTCjIIDXVSQt1\nd1u2bGHLli0xnRvXCFVjTDLwMTAT2A+8C9wUpUH1LmvtbGPMpcBj1tpGDaoaoSrS9UUbWTprFmza\n1LrrRP4G4MovwjfpSPh109PhnyewebPT8LqUYlL8tXdLMB3gSnXx3PmZPHVTGWSFX6P/gf5Me2ZK\nwjaottvEYdbaOuAuwAN8APzJWvuhMWahMWah/5yXgb3GmBLgSeCn8dxTRDpPXp7TEBrQXI24ubVU\n778//DcA39rwLo/Ttxvuu7WSf3x9G0+yHYDFZFJAOsdIDsvz+ip9TsNsUuMyVJxxkry1XsjyklNU\nRE5RER5vzxj4pLllRKRVnHSK8z5ajdjj9fLnP+3lhp+fpJd/0FHoWqoejzNRWGiDJ8DEH3kZ8tMy\nRr1Zy4K7T2Lqg8dqMDzIeApdGfxl6Hb6HzgZ9tkDA/qTt/A0Dsz+tFEL38T+/fmooqJhpGuqy9WQ\nz+/uNOWv9FieEg85q3PIWZ2Dp0QjW9pCbq6Thtm0KXpgn1tczJiVwcAO4Wup5uc3DuwA15+VwaYJ\nE/jh04QFdoDeWH6cUcbLL8OwYY0/ezy5mgNXNQ7sAO9/WdUQ2AEqA/n8BKfpByRheUo8zP3TXCrr\nnN//t32+jfXz15M7JgGTr11E6FwwrbV8OUyZAv1Kq6IenzwZJuRCUX6vRseOj6qFKGkZLNS5Tq08\n3Z1q7pKw8t/KbwjsAJV1leS/1cM6O3eSdTc63RoDQtdSjczbBwR63qSMTGl80MChK9zk5MD/LXdT\nnxwMXVW9YN384KlZ78Kj/+S8sgpwes+EZHxTXS7y3Im/rquCu4i0mTy3m1SXi4KpsHgp7JgCvuz+\nDfl2CA5mairlPXrFaEzvkPyKgdpbR3Hd8gw2b4ankuH+2/tSMDaZgrT+LL61v9N9Ev/MkYshq9B5\nLV3s7MMAFs442T9h8u0tUVpGElbetDy2fb6tofaempxK3rQE6OzchXhKPA2/DeVNyyN3TC7rMzOd\nnHYuXPgDNzOiBNLcXFizxukOmVnpZR5lGKBwr5vfG/j8sX5c8scqzk5J4eL7RrMg379Ah78ve0GK\nj4KbgaoKeP00sCfBRJk5strZVzAVMHDgC9j++wxyF3XE0+lcCu6S0M4fcj6lx0oZOXAkK2auUL69\nDTXXphFLzTg3FzYs8sJDxfS2Tl48c+9RFhdAwXTLUysg1VXB+kzn/Cy8zDv6ATzoCw52SvHBZYej\nNqRGNayK3zzgTCqW6BTcJSFFBp7K2soWPiEtCV3uLs/tbrJNozU/QIduLeOIDZk4zFrmrYeC6f5r\n+nu2XDsczqGYlD3+2SOLnbRPIB0TsO5G51ig9l7Vx9nX4FCUfH6CUnCXhNQWgUeCIpe723bsGOen\njOqQe7/1QS2znytrGJ0KwXQLdTBvTT1p1U7l/dhA+M+bYUKRc15DDR+gzoCFPr8tYtJCN0M+zUjY\nkaug4C4iMYjs4ljp88GZ80ndvZrKjy6HN/NwmSSuWDSwxUFOodx5brxbj2Fq/BOHGcO674QPZjw5\n+Gv69u4DdeGfHfilYekjlpSa8POj1uot4LJw3kkOAAduPALPjGLb3JEJOxWwgrskJDWmtr8hfQez\naPgbLF6aia/GqVcv/bFzrKbG+XPbNli0wcvWocF0Tm5GRjDFMwz+6bmRJP/yKJ+Vwjsj3ey7aC8Q\nHIGatdNydk143/caDLbeNArsENGIGmAIz8snAXd8SuXuAeTnZyi4i3QXuWNyWT9/faOeHHJq8txu\nth07FjaEP8/tJv+fM/DVBM+rqQn/XGWml8WmGN+RYDpn0ciRLC8tDaZ4Tj/G+r9lcmdGBiM88HRx\nGUx2Pp/1Liz6F+gdUWuvbaEF9Yz9MXypJJzpgTcnZrdIzS0jIjGJbFDNzciIOktkmEeLGs3UmFyR\nTF3f8Gg90ZfOjhkTnOsddbo7Zv3dx9LF4V0bQ9X4q6aRgR+cLMy+M+Dxf2zc6BrKFKbzSuaEbltz\nb25uGQV3ETllkVP39u7t/BmowUebypdjyTDQichZ7zopFHMimUOnfcCInePo+3UqNr2aYXW1DDzR\n/P13nwPHBsCkIkiKEj5qesGD/9JEgK+Hc/7jIkqe77419+aCu9IyInLKAqNNQxtQIbh9Raab5a5g\nOocql7NC0i2lETXzOuzusRjqgZMQ8fOgKccGwboZvZiyszbq8d618IOnmwjun/Rn9OHuG9hbouAu\nIkD0tEssIpfOC+xzZDDFm8l3N5ThPUxw6bvdA5h39ANSqoM5FdNCHr0mCXA5ARuCfdjnrWk+Az/0\nUJSdVS6Snx1N3sPNf7fuTMFdRKL2Y2+rOVhyMzJYMyKDuXeFLNCxoy/U9wOOtfj5YwNg95ku1t3m\na5hiAPx92KfAbf+VBNQ3+fmDQ/1vaoFP+8OxXrDOzfi6xOwlE6DgLiJR+7Hnl5XFFdwj+7uvXw/f\nzSvEW3kYLstn3f5RjH99Pin+lLHFNqq9V/WB5fdZCgb3hfOc7pGhKZZUm8TY+86i5rufNtToQ/kM\nrLwTzP4U7GPnOjvnlTmvEgClZUREYhbZ0LptmxPcJ//ifjbvdbrXFIyBxcN38OP3fszZg87mlcGv\nMLR4KGkVadjkvhzPGMC6OQcpeHs0XB6cwz3QCAuw47Ze5Nw9kpeW7KP3rvDoboFVd0DBNMBXhevK\nQ/i++VXDAtq7phzD403cGSLVW0ZEGqVlWlqKLnQ2yH+q+yeGPu/kPtx5bjJyw7tIZuHM+jg4A878\nt0Nct/+6sMFli4a/wdbnp1C4vxDvxAdgTMRq2yU5UPIMPLKHrJ02rHukNXCwX38qMio45/PwRTk+\nHgM/fjpkR+iq2n6z0tPZNGFC6x5WF6LeMiLSrNyMjOBUvTTfoBo6KVtWSRZ2reVIndO95di2Y2Su\nzySQ7sjCy1KKnXlhvOD6WQobfreBf0v+NwCuqPsly382xV/Dnwwf/gVO3wV9ndQNYzbR+9wt+Fw/\nou6Je5i3N5mU6mAsMxbOOHmS+pPhsbuqD6z8YUTBY505MkEouIv0YJHzsW+a0HILY+ikbPPenEef\nuuCSS4Gy/tdJAAANkUlEQVS1UvPyMti2DeZVhk/45av0MfT5oWza5NTOc3JCGlkBfClw0D889fNv\ncM5P/pk7bhhO8VfFXPLmLsbuHwv0a1Sm0BX2LPD6FVCQ1fz3cEFCr8ik4C7SQ0VOi7y1dCsXnnYh\nQ/oOiW26hvQs6H9uo92Ha2vJH1bE+Rsh7dZa+CK28gTSNwDrcFNQl0Hajkd45fh1PPjcg6TUOdP1\nRmt4DWWAy95yltgL61kT0dd9Qv/+CZtvB+XcRXqsnNU5DY2bkVKTU5tcTNxT4mGOZxk15z9IVmFK\neA48xfDQMvjfKc7/5enbDcseAlPlbLtSXWFL7gUaXr9TWcodfNpQA6/CxWIy+eTCT7mv8r/J2hte\nDfe5oLl1r79OhSRf+LzuoTNFttSm0F00l3PXGqoi0ki0xcS9Hi9FOUUM++kwvlOzFJJSGtZKLZgM\ney9NZu1v+jUEdnCC/Nrf9CN9Vjrps9KpWpTJgnynwdXjCa7GdGdIYAdIwceN535A7cOfQq+BjcpX\nMhp8Jnpl0ALeQdGX28tITmZWenpCBPaWKC0j0kNFTovcHK/HS/HcYnyVTnX5zjdcfOKvCQdes9IH\nOCdHTB3w6WW9mPCTCU12j6zYsJV00hvd0w6o48RpQ1h3exrjF9eQUutMXFPVB/7jh3Dux4Y7nrEk\nNfSTh4pUeP4mZ7EO94HG32PygAHdundMayi4i/RQodMil1eUs+vLXdT45++NnP++LL+sIbAD9KqG\nBevC0xyBxsloUwODM6AptPG0shJ+9uBeFnpLyCI87VJvgsvjFVzam39dfIBr/jyQWlffhvx5wVTY\nPXw389Ych/7nsu7mgQ3l2X0eXFQMfULSMn9ZYHgogRtQIym4S6eK7K2hOdc7Vu6Y3IZn3tq/i4v7\n92dWujO4KLTrZKxdKgE+O/oZ6y5bx/jPxzc0mNYbyzN3mLAG0KRrLmDcT91hffGpr6bgtJUU3FoA\n7pth1PcJ9HcsmGopeWoY4575ms+qqnjn1hQemj864VMxodSgKp0msrdGc4140rki0zKRDaOx2LTc\nS/HiMup9Tm+Y4tQMqm+Yje+cV8g6eDPz3pwHwLpv76YgdzIYp0kwtPEzdHKzK5IOsfU9p7/8XvdP\n+CQitdPdByjFQvO5S5cUrbfGrNGz2PS9TU18QjqT1+OlLN8JrIGRqK35bOgPh1qXC7s0k58OmMon\nZMCFTgMtANaC8ccrW89tA308O2lWs9fPKSpi85HwZH9PD+7qLSMiMcnIzWDCpglM2DShVYHdU+Jh\nQ96G8Jy9z8fQrWX8bvbvwD0/GNghGNgBTBLPle0iZ3UOnhJPk/fIc7tJdQXDWWiuv6dScJdOkzct\nj9Tk1IZtLWKdeAKpt8OVh6Mezx2Tyznpo5u9Rl19LZv3bmb2c7MZ89sxTHpyUqNgH5g+YVZ6eo/p\n6tgSpWWkU6lBtXuJnMa3pfnQA6m3rJIslq5d2tBoGpqzn7RuIe9lzI2elqmvgl2L4UhBo2urjUY5\ndxFpA5H91FNTnX7qzQX40HaVrJIs5r05j8Gpg7ku/zoycjNYXryJh0t2UZ/UFzBQewyOFsEgf658\n37qogT2gp7fRaFZIEYlbtH7q+fnNB/fQgVIFYwooPr+Y9fPXkzHGCewPHvLBoInOyaG19LLn2vfL\n9ADKuYtIuwkMlJo1ehazRs8KS6P8uvTT8IbUpBQ4c17M11YbTfNUcxeRmOTlOVMGhKZl8mKIraED\npULFMu0BOEE88twBvQewbt66Hp1vb8kp19yNMRnGmM3GmN3GmE3GmEFNnLfKGHPIGPP+qRdTRDpb\nbq6TY581y3m1lG9viqfEw6QnJ+H7fK2Tigmor3Jy7BFqfY0XR+2V1EuBvQWn3KBqjHkUKLfWPmqM\n+QWQbq29L8p5lwMngT9aa8c3cz01qIp0Ae3Zg8lT4mHO2jnU1Dtz2JCeFUzFRGk8TU1OZfiA4Xxy\n5JOw/ROHTWTHwh1tVq7uql16yxhjPgKusNYeMsYMA7ZYa89v4tyzgZcU3EW6tsgpIVzGxYShE1gx\nc0WbBPnm5pCP5MLF0m8tZcrwKcx5fk7DpGa9Xb158aYXVXOn/UaoDrXWHvK/PwQMjeNaItIFhC6h\nB+CzPt47+B5z/zS32RGizfGUeMhZnUPO6hzKK8obn1CSA3/0kPSfrznvA/fGx9bSreSOyeXFm15s\naJRVYI9Nsw2qxpjNwLAohxaFblhrrTFNzJzfCkuWLGl4n52dTXZ2dryXFJE2EFi8o7mg6inxcP9r\n91N6rJSRA0eyYuYKgLDfBJJdESGnJAfWroe6vtQDfHYpLJgLY8L7rjfVKNvTbNmyhS1btsR0brxp\nmWxr7UFjzBnA60rLiHRvkWmZUM0NGPKUeMJSJ+AE8pTkFE7WnGz6hn/0wN6c8H2jN8GtuRqBGoP2\nSsu8CNzmf38bsCGOa4lIFxDolz5x2ERcIeGhpT7l+W/lhwV2gDpfXfOBvQkD+qQ16hMvrRdPcP8V\nMMsYsxuY4d/GGDPcGPPXwEnGmOeBN4FzjTFlxpg74imwiLSv3DG57Fi4g5dvfjnq4KN49E7qjSGk\nonlZPiRXBLeTK6ie+kvNM9QGNLeMiMQtWlomUkZqBmu+s4b7X7uf9w6+FzxQkgNv+n8ruCwfxmzq\n8XPGxEpzy4hIuwr0aAk0qKanpFN2rCxsTdY131nTUBsP62459m/4xiiQtzXV3EWkXTQ3GCr02BUj\nr2D5/yzXcounQFP+ikiXpnn9T42Cu4hIAtIaqiIiPYyCu4hIAlJwFxFJQAruItImQicIO9VJxqTt\nqEFVROIWOSeNujN2DDWoiki7ipwqODCLpHQeBXcRkQSk4C4iccublkdqcmrDdkuzSEr7U85dRNqE\nRpl2PI1QFRFJQGpQFRHpYRTcRUQSkIK7iEgCUnAXEUlACu4iIglIwV1EJAEpuIuIJCAFdxGRBKTg\nLiKSgBTcRUQSkIK7iEgCUnAXEUlACu4iIglIwV1EJAEpuIuIJCAFdxGRBKTgLiKSgBTcRUQSkIK7\niEgCUnAXEUlACu4iIgnolIO7MSbDGLPZGLPbGLPJGDMoyjluY8zrxphdxphiY8w/xFdcERGJRTw1\n9/uAzdbac4HX/NuRaoGfW2svBC4FfmaMGRfHPXu8LVu2dHYRugU9p9jpWcWmuz2neIL7HOAP/vd/\nAK6LPMFae9Bau9P//iTwITA8jnv2eN3tH1hn0XOKnZ5VbLrbc4onuA+11h7yvz8EDG3uZGPM2cBE\n4J047ikiIjFIbu6gMWYzMCzKoUWhG9Zaa4yxzVynP/D/gHv8NXgREWlHxtomY3LzHzTmIyDbWnvQ\nGHMG8Lq19vwo5/UCNgKvWGsfa+Z6p1YQEZEezFprou1vtubegheB24Bf+//cEHmCMcYAK4EPmgvs\nzRVQRERaL56aewbwX8BZwGfAjdbao8aY4cDT1tqrjTHfAN4A/g4EbnS/tfbVuEsuIiJNOuXgLiIi\nXVeHjlA1xlxljPnIGLPHGPOLKMdvNsYUGWP+boz5X2PMRR1Zvq6kpWcVcl6WMabOGPOdjixfVxHL\nczLGZBtj3vMPpNvSwUXsEmL4vzfEGPOqMWan/znd3gnF7HTGmFXGmEPGmPebOee3/udYZIyZ2JHl\naxVrbYe8gCSgBDgb6AXsBMZFnDMNGOh/fxXwdkeVryu9YnlWIef9N06D9fWdXe6u+JyAQcAu4Ez/\n9pDOLncXfU5LgBWBZwQcBpI7u+yd8Kwux+my/X4Tx2cDL/vfX9KVY1RH1tynAiXW2s+stbXAWuDa\n0BOstW9Za4/5N98BzuzA8nUlLT4rv7txuph+1ZGF60JieU7fBV6w1u4DsNaWd3AZu4JYntMBIM3/\nPg04bK2t68AydgnW2v8BjjRzSsPgTWvtO8AgY0yzY3w6S0cG9xFAWcj2Pv++pnwfeLldS9R1tfis\njDEjcP6D/t6/qyc2nsTyb2oskOGf42i7MeZ7HVa6riOW5/Q0cKExZj9QBNzTQWXrbqI9yy5ZCY2n\nK2RrxRx8jDHfAu4Eprdfcbq0WJ7VY8B91lrr73LaE7uSxvKcegGTgJlAX+AtY8zb1to97VqyriWW\n5/QAsNNam22MOQfYbIyZYK090c5l644i/691yYpVRwb3LwB3yLYb56deGH8j6tPAVdba5n49SmSx\nPKvJwFonrjME+LYxptZa+2LHFLFLiOU5lQHl1tpKoNIY8wYwAehJwT2W53QZsBzAWvuJMeZT4Dxg\ne4eUsPuIfJZn+vd1OR2ZltkOjDXGnG2M6Q3MxxkI1cAYcxbwZ+AWa21JB5atq2nxWVlrR1trR1lr\nR+Hk3X/SwwI7xPCcgL8A3zDGJBlj+uI0gn3QweXsbLE8p4+AKwH8OeTzgL0dWsru4UXgVgBjzKXA\nURucY6tL6bCau7W2zhhzF+DBab1faa390Biz0H/8SWAxkA783l8jrbXWTu2oMnYVMT6rHi+W52St\n/cgY8yrOQDofzgC7HhXcY/z39EvgGWNMEU6l7/9Ya72dVuhOYox5HrgCGGKMKQMexkntBf49vWyM\nmW2MKQG+Bu7ovNI2T4OYREQSkJbZExFJQAruIiIJSMFdRCQBKbiLiCQgBXcRkQSk4C4ikoAU3EVE\nEpCCu4hIAvr/eZkdM74bzcEAAAAASUVORK5CYII=\n",
207 |        "text": [
208 |         "<matplotlib.figure.Figure at 0x109550160>"
209 |        ]
210 |       }
211 |      ],
212 |      "prompt_number": 27
213 |     },
214 |     {
215 |      "cell_type": "code",
216 |      "collapsed": false,
217 |      "input": [],
218 |      "language": "python",
219 |      "metadata": {},
220 |      "outputs": []
221 |     }
222 |    ],
223 |    "metadata": {}
224 |   }
225 |  ]
226 | }


--------------------------------------------------------------------------------
/sklearn_demos/fastText.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "This notebooks is an experiment to see if a pure scikit-learn implementation of the fastText model can work better than a linear model on a small text classification problem: 20 newsgroups.\n",
  8 |     "\n",
  9 |     "http://arxiv.org/abs/1607.01759\n",
 10 |     "\n",
 11 |     "Those models are very similar to Deep Averaging Network (with only 1 hidden layer with a linear activation function):\n",
 12 |     "\n",
 13 |     "https://www.cs.umd.edu/~miyyer/pubs/2015_acl_dan.pdf\n",
 14 |     "\n",
 15 |     "\n",
 16 |     "Note that scikit-learn does not provide a hierarchical softmax implementation (but we don't need it on 20 newsgroups anyways)."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "metadata": {
 23 |     "collapsed": false
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "from sklearn.datasets import fetch_20newsgroups\n",
 28 |     "from sklearn.feature_extraction.text import CountVectorizer\n",
 29 |     "from sklearn.feature_extraction.text import HashingVectorizer\n",
 30 |     "\n",
 31 |     "from sklearn.model_selection import train_test_split"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {
 38 |     "collapsed": false
 39 |    },
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "twentyng_train = fetch_20newsgroups(\n",
 43 |     "    subset='train',\n",
 44 |     "    #remove=('headers', 'footers'),\n",
 45 |     ")\n",
 46 |     "docs_train, target_train = twentyng_train.data, twentyng_train.target\n",
 47 |     "\n",
 48 |     "\n",
 49 |     "twentyng_test = fetch_20newsgroups(\n",
 50 |     "    subset='test',\n",
 51 |     "    #remove=('headers', 'footers'),\n",
 52 |     ")\n",
 53 |     "\n",
 54 |     "docs_test, target_test = twentyng_test.data, twentyng_test.target"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 18,
 60 |    "metadata": {
 61 |     "collapsed": false
 62 |    },
 63 |    "outputs": [
 64 |     {
 65 |      "data": {
 66 |       "text/plain": [
 67 |        "262144"
 68 |       ]
 69 |      },
 70 |      "execution_count": 18,
 71 |      "metadata": {},
 72 |      "output_type": "execute_result"
 73 |     }
 74 |    ],
 75 |    "source": [
 76 |     "2 ** 18"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "The following uses the hashing tricks on unigrams and bigrams. `binary=True` makes us ignore repeated words in a document. The `l1` normalization ensures that we \"average\" the embeddings of the tokens in the document instead of summing them."
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "code",
 88 |    "execution_count": 17,
 89 |    "metadata": {
 90 |     "collapsed": false
 91 |    },
 92 |    "outputs": [
 93 |     {
 94 |      "name": "stdout",
 95 |      "output_type": "stream",
 96 |      "text": [
 97 |       "CPU times: user 16.8 s, sys: 116 ms, total: 16.9 s\n",
 98 |       "Wall time: 16.9 s\n"
 99 |      ]
100 |     }
101 |    ],
102 |    "source": [
103 |     "%%time\n",
104 |     "vec = HashingVectorizer(\n",
105 |     "    encoding='latin-1', binary=True, ngram_range=(1, 2),\n",
106 |     "    norm='l1', n_features=2 ** 18)\n",
107 |     "\n",
108 |     "X_train = vec.transform(docs_train)\n",
109 |     "X_test = vec.transform(docs_test)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 19,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [
119 |     {
120 |      "data": {
121 |       "text/plain": [
122 |        "array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
123 |        "       [ 0.,  0.,  0., ...,  0.,  0.,  0.],\n",
124 |        "       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
125 |       ]
126 |      },
127 |      "execution_count": 19,
128 |      "metadata": {},
129 |      "output_type": "execute_result"
130 |     }
131 |    ],
132 |    "source": [
133 |     "first_doc_vectors = X_train[:3].toarray()\n",
134 |     "first_doc_vectors"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 20,
140 |    "metadata": {
141 |     "collapsed": false
142 |    },
143 |    "outputs": [
144 |     {
145 |      "data": {
146 |       "text/plain": [
147 |        "array([ 0.,  0.,  0.])"
148 |       ]
149 |      },
150 |      "execution_count": 20,
151 |      "metadata": {},
152 |      "output_type": "execute_result"
153 |     }
154 |    ],
155 |    "source": [
156 |     "first_doc_vectors.min(axis=1)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 21,
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [
166 |     {
167 |      "data": {
168 |       "text/plain": [
169 |        "array([ 0.0049505 ,  0.00469484,  0.00200401])"
170 |       ]
171 |      },
172 |      "execution_count": 21,
173 |      "metadata": {},
174 |      "output_type": "execute_result"
175 |     }
176 |    ],
177 |    "source": [
178 |     "first_doc_vectors.max(axis=1)"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 22,
184 |    "metadata": {
185 |     "collapsed": false
186 |    },
187 |    "outputs": [
188 |     {
189 |      "data": {
190 |       "text/plain": [
191 |        "array([ 1.,  1.,  1.])"
192 |       ]
193 |      },
194 |      "execution_count": 22,
195 |      "metadata": {},
196 |      "output_type": "execute_result"
197 |     }
198 |    ],
199 |    "source": [
200 |     "first_doc_vectors.sum(axis=1)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "Baseline: OvR logistic regression (the multinomial logistic regression loss is currently not implemented in scikit-learn). In practice, the OvR reduction seems to work well enough."
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 86,
213 |    "metadata": {
214 |     "collapsed": false
215 |    },
216 |    "outputs": [
217 |     {
218 |      "name": "stdout",
219 |      "output_type": "stream",
220 |      "text": [
221 |       "CPU times: user 1min 46s, sys: 6.69 s, total: 1min 53s\n",
222 |       "Wall time: 11.1 s\n"
223 |      ]
224 |     }
225 |    ],
226 |    "source": [
227 |     "%%time\n",
228 |     "from sklearn.linear_model import SGDClassifier\n",
229 |     "\n",
230 |     "lr = SGDClassifier(loss='log', alpha=1e-10, n_iter=50, n_jobs=-1)\n",
231 |     "lr.fit(X_train, target_train)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 87,
237 |    "metadata": {
238 |     "collapsed": false
239 |    },
240 |    "outputs": [
241 |     {
242 |      "name": "stdout",
243 |      "output_type": "stream",
244 |      "text": [
245 |       "train score: 1.000\n",
246 |       "test score: 0.827\n",
247 |       "CPU times: user 588 ms, sys: 289 ms, total: 877 ms\n",
248 |       "Wall time: 602 ms\n"
249 |      ]
250 |     }
251 |    ],
252 |    "source": [
253 |     "%%time\n",
254 |     "print(\"train score: %0.3f\" % lr.score(X_train, target_train))\n",
255 |     "print(\"test score: %0.3f\" % lr.score(X_test, target_test))"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "markdown",
260 |    "metadata": {},
261 |    "source": [
262 |     "Let's now use the MLPClassifier of scikit-learn to add a single hidden layer with a small number of hidden units.\n",
263 |     "\n",
264 |     "Note: instead of tanh or relu we would rather like to use a linear / identity activation function for the hidden layer but this is not (yet) implemented in scikit-learn.\n",
265 |     "\n",
266 |     "In that respect the following model is closer to a Deep Averaging Network (without dropout) than fastText."
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 90,
272 |    "metadata": {
273 |     "collapsed": false
274 |    },
275 |    "outputs": [
276 |     {
277 |      "name": "stdout",
278 |      "output_type": "stream",
279 |      "text": [
280 |       "Iteration 1, loss = 2.94108225\n",
281 |       "Validation score: 0.464664\n",
282 |       "Iteration 2, loss = 2.49072336\n",
283 |       "Validation score: 0.639576\n",
284 |       "Iteration 3, loss = 1.63266821\n",
285 |       "Validation score: 0.810954\n",
286 |       "Iteration 4, loss = 0.90327443\n",
287 |       "Validation score: 0.869258\n",
288 |       "Iteration 5, loss = 0.48531751\n",
289 |       "Validation score: 0.893993\n",
290 |       "Iteration 6, loss = 0.27329257\n",
291 |       "Validation score: 0.909894\n",
292 |       "Iteration 7, loss = 0.16704835\n",
293 |       "Validation score: 0.911661\n",
294 |       "Iteration 8, loss = 0.11122343\n",
295 |       "Validation score: 0.918728\n",
296 |       "Iteration 9, loss = 0.07885910\n",
297 |       "Validation score: 0.918728\n",
298 |       "Iteration 10, loss = 0.05876991\n",
299 |       "Validation score: 0.924028\n",
300 |       "Iteration 11, loss = 0.04566916\n",
301 |       "Validation score: 0.920495\n",
302 |       "Iteration 12, loss = 0.03644058\n",
303 |       "Validation score: 0.915194\n",
304 |       "Iteration 13, loss = 0.02982519\n",
305 |       "Validation score: 0.922261\n",
306 |       "Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.\n",
307 |       "CPU times: user 1min 21s, sys: 187 ms, total: 1min 21s\n",
308 |       "Wall time: 1min 21s\n"
309 |      ]
310 |     }
311 |    ],
312 |    "source": [
313 |     "%%time\n",
314 |     "from sklearn.neural_network import MLPClassifier\n",
315 |     "\n",
316 |     "mlp = MLPClassifier(algorithm='adam', learning_rate_init=0.01,\n",
317 |     "                    hidden_layer_sizes=10, max_iter=100, activation='tanh', verbose=100,\n",
318 |     "                    early_stopping=True, validation_fraction=0.05, alpha=1e-10)\n",
319 |     "mlp.fit(X_train, target_train)"
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": 92,
325 |    "metadata": {
326 |     "collapsed": false
327 |    },
328 |    "outputs": [
329 |     {
330 |      "name": "stdout",
331 |      "output_type": "stream",
332 |      "text": [
333 |       "train score: 0.996\n",
334 |       "test score: 0.801\n",
335 |       "CPU times: user 304 ms, sys: 54 µs, total: 304 ms\n",
336 |       "Wall time: 303 ms\n"
337 |      ]
338 |     }
339 |    ],
340 |    "source": [
341 |     "%%time\n",
342 |     "print(\"train score: %0.3f\" % mlp.score(X_train, target_train))\n",
343 |     "print(\"test score: %0.3f\" % mlp.score(X_test, target_test))"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "metadata": {
350 |     "collapsed": true
351 |    },
352 |    "outputs": [],
353 |    "source": []
354 |   }
355 |  ],
356 |  "metadata": {
357 |   "kernelspec": {
358 |    "display_name": "Python 3",
359 |    "language": "python",
360 |    "name": "python3"
361 |   },
362 |   "language_info": {
363 |    "codemirror_mode": {
364 |     "name": "ipython",
365 |     "version": 3
366 |    },
367 |    "file_extension": ".py",
368 |    "mimetype": "text/x-python",
369 |    "name": "python",
370 |    "nbconvert_exporter": "python",
371 |    "pygments_lexer": "ipython3",
372 |    "version": "3.5.0"
373 |   }
374 |  },
375 |  "nbformat": 4,
376 |  "nbformat_minor": 0
377 | }
378 | 


--------------------------------------------------------------------------------
/sklearn_demos/language/fetch_data.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # simple python script to collect text paragraphs from various languages on the
  3 | # same topic namely the Wikipedia encyclopedia itself
  4 | 
  5 | import os
  6 | try:
  7 |     # Python 2 compat
  8 |     from urllib2 import Request, build_opener
  9 | except ImportError:
 10 |     # Python 3
 11 |     from urllib.request import Request, build_opener
 12 | 
 13 | import lxml.html
 14 | from lxml.etree import ElementTree
 15 | import numpy as np
 16 | 
 17 | pages = {
 18 |     u'ar': u'http://ar.wikipedia.org/wiki/%D9%88%D9%8A%D9%83%D9%8A%D8%A8%D9%8A%D8%AF%D9%8A%D8%A7',
 19 |     u'de': u'http://de.wikipedia.org/wiki/Wikipedia',
 20 |     u'en': u'http://en.wikipedia.org/wiki/Wikipedia',
 21 |     u'es': u'http://es.wikipedia.org/wiki/Wikipedia',
 22 |     u'fr': u'http://fr.wikipedia.org/wiki/Wikip%C3%A9dia',
 23 |     u'it': u'http://it.wikipedia.org/wiki/Wikipedia',
 24 |     u'ja': u'http://ja.wikipedia.org/wiki/Wikipedia',
 25 |     u'nl': u'http://nl.wikipedia.org/wiki/Wikipedia',
 26 |     u'pl': u'http://pl.wikipedia.org/wiki/Wikipedia',
 27 |     u'pt': u'http://pt.wikipedia.org/wiki/Wikip%C3%A9dia',
 28 |     u'ru': u'http://ru.wikipedia.org/wiki/%D0%92%D0%B8%D0%BA%D0%B8%D0%BF%D0%B5%D0%B4%D0%B8%D1%8F',
 29 | #    u'zh': u'http://zh.wikipedia.org/wiki/Wikipedia',
 30 | }
 31 | 
 32 | html_folder = u'html'
 33 | text_folder = u'paragraphs'
 34 | short_text_folder = u'short_paragraphs'
 35 | n_words_per_short_text = 5
 36 | 
 37 | 
 38 | if not os.path.exists(html_folder):
 39 |     os.makedirs(html_folder)
 40 | 
 41 | for lang, page in pages.items():
 42 | 
 43 |     text_lang_folder = os.path.join(text_folder, lang)
 44 |     if not os.path.exists(text_lang_folder):
 45 |         os.makedirs(text_lang_folder)
 46 | 
 47 |     short_text_lang_folder = os.path.join(short_text_folder, lang)
 48 |     if not os.path.exists(short_text_lang_folder):
 49 |         os.makedirs(short_text_lang_folder)
 50 | 
 51 |     opener = build_opener()
 52 |     html_filename = os.path.join(html_folder, lang + '.html')
 53 |     if not os.path.exists(html_filename):
 54 |         print("Downloading %s" % page)
 55 |         request = Request(page)
 56 |         # change the User Agent to avoid being blocked by Wikipedia
 57 |         # downloading a couple of articles ones should not be abusive
 58 |         request.add_header('User-Agent', 'OpenAnything/1.0')
 59 |         html_content = opener.open(request).read()
 60 |         open(html_filename, 'wb').write(html_content)
 61 | 
 62 |     # decode the payload explicitly as UTF-8 since lxml is confused for some
 63 |     # reason
 64 |     html_content = open(html_filename).read()
 65 |     if hasattr(html_content, 'decode'):
 66 |         html_content = html_content.decode('utf-8')
 67 |     tree = ElementTree(lxml.html.document_fromstring(html_content))
 68 |     i = 0
 69 |     j = 0
 70 |     for p in tree.findall('//p'):
 71 |         content = p.text_content()
 72 |         if len(content) < 100:
 73 |             # skip paragraphs that are too short - probably too noisy and not
 74 |             # representative of the actual language
 75 |             continue
 76 | 
 77 |         text_filename = os.path.join(text_lang_folder,
 78 |                                      '%s_%04d.txt' % (lang, i))
 79 |         print("Writing %s" % text_filename)
 80 |         open(text_filename, 'wb').write(content.encode('utf-8', 'ignore'))
 81 |         i += 1
 82 | 
 83 |         # split the paragraph into fake smaller paragraphs to make the
 84 |         # problem harder e.g. more similar to tweets
 85 |         if lang in ('zh', 'ja'):
 86 |         # FIXME: whitespace tokenizing does not work on chinese and japanese
 87 |             continue
 88 |         words = content.split()
 89 |         n_groups = len(words) / n_words_per_short_text
 90 |         if n_groups < 1:
 91 |             continue
 92 |         groups = np.array_split(words, n_groups)
 93 | 
 94 |         for group in groups:
 95 |             small_content = u" ".join(group)
 96 | 
 97 |             short_text_filename = os.path.join(short_text_lang_folder,
 98 |                                                '%s_%04d.txt' % (lang, j))
 99 |             print("Writing %s" % short_text_filename)
100 |             open(short_text_filename, 'wb').write(
101 |                 small_content.encode('utf-8', 'ignore'))
102 |             j += 1
103 |             if j >= 1000:
104 |                 break
105 | 
106 | 


--------------------------------------------------------------------------------
/structure_digits.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |     "worksheets": [
  3 |         {
  4 |             "cells": [
  5 |                 {
  6 |                     "source": "# Intrinsic structure of the Western-Arabic numerals scriptures\n\nUsing scikit-learn Spectral clustering.", 
  7 |                     "cell_type": "markdown"
  8 |                 }, 
  9 |                 {
 10 |                     "cell_type": "code", 
 11 |                     "language": "python", 
 12 |                     "outputs": [], 
 13 |                     "collapsed": true, 
 14 |                     "prompt_number": 22, 
 15 |                     "input": "import numpy as np\nimport pylab as pl"
 16 |                 }, 
 17 |                 {
 18 |                     "source": "## Load the digits dataset and plot the first elements", 
 19 |                     "cell_type": "markdown"
 20 |                 }, 
 21 |                 {
 22 |                     "source": "Small utility function to display a gallery of images:", 
 23 |                     "cell_type": "markdown"
 24 |                 }, 
 25 |                 {
 26 |                     "cell_type": "code", 
 27 |                     "language": "python", 
 28 |                     "outputs": [], 
 29 |                     "collapsed": true, 
 30 |                     "prompt_number": 23, 
 31 |                     "input": "def plot_images(images):\n    pl.gray()\n    pl.figure()\n    for i, img in enumerate(images[:25]):\n        pl.subplot(5, 5, i)\n        pl.imshow(img, interpolation=\"nearest\")\n        pl.xticks(())\n        pl.yticks(())\n    "
 32 |                 }, 
 33 |                 {
 34 |                     "source": "Lest load the digits dataset that comes with scikit learn (as a CSV file with gray level pixel values. Let's shuffle the dataset to make shure that the algorithm cannot exploit any ordering information. ", 
 35 |                     "cell_type": "markdown"
 36 |                 }, 
 37 |                 {
 38 |                     "cell_type": "code", 
 39 |                     "language": "python", 
 40 |                     "outputs": [], 
 41 |                     "collapsed": true, 
 42 |                     "prompt_number": 24, 
 43 |                     "input": "from sklearn import datasets\nfrom sklearn.utils import shuffle\n\ndigits = datasets.load_digits()\nimages, data, target = shuffle(\n    digits.images, digits.data, digits.target)\n\nplot_images(images)\n"
 44 |                 }, 
 45 |                 {
 46 |                     "source": "## Group the pictures in 10 groups using Spectral Clustering", 
 47 |                     "cell_type": "markdown"
 48 |                 }, 
 49 |                 {
 50 |                     "cell_type": "code", 
 51 |                     "language": "python", 
 52 |                     "outputs": [], 
 53 |                     "collapsed": true, 
 54 |                     "prompt_number": 25, 
 55 |                     "input": "from sklearn import cluster, neighbors\n\nn_clusters = 10\nS = neighbors.kneighbors_graph(data, 10)\nsc = cluster.SpectralClustering(n_clusters, mode='arpack', n_init=50)\nsc.fit(S)\nsc.labels_"
 56 |                 }, 
 57 |                 {
 58 |                     "cell_type": "code", 
 59 |                     "language": "python", 
 60 |                     "outputs": [], 
 61 |                     "collapsed": true, 
 62 |                     "prompt_number": 26, 
 63 |                     "input": "for i in range(n_clusters):\n    plot_images(images[sc.labels_ == i])"
 64 |                 }, 
 65 |                 {
 66 |                     "source": "\n## Profiling clustering algorithm\n\nThe following will runt the `cProfile` tool from the Python stdlib and display the output in a paged, tiled panel.", 
 67 |                     "cell_type": "markdown"
 68 |                 }, 
 69 |                 {
 70 |                     "cell_type": "code", 
 71 |                     "language": "python", 
 72 |                     "outputs": [], 
 73 |                     "collapsed": true, 
 74 |                     "prompt_number": 27, 
 75 |                     "input": "%prun cluster.SpectralClustering(10, mode='arpack').fit(S)"
 76 |                 }, 
 77 |                 {
 78 |                     "source": "## Supervised learning: learning to classify digits", 
 79 |                     "cell_type": "markdown"
 80 |                 }, 
 81 |                 {
 82 |                     "cell_type": "code", 
 83 |                     "language": "python", 
 84 |                     "outputs": [], 
 85 |                     "collapsed": true, 
 86 |                     "prompt_number": 28, 
 87 |                     "input": "from sklearn import svm, metrics\nX_train, y_train, X_test, y_test = data[:500], target[:500], data[500:], target[500:]\n\nclf = svm.SVC(gamma=0.001).fit(X_train, y_train)\n\nprint metrics.classification_report(y_test, clf.predict(X_test))"
 88 |                 }, 
 89 |                 {
 90 |                     "cell_type": "code", 
 91 |                     "language": "python", 
 92 |                     "outputs": [], 
 93 |                     "collapsed": true, 
 94 |                     "prompt_number": 29, 
 95 |                     "input": "cm = metrics.confusion_matrix(target[500:], clf.predict(data[500:]))\nprint cm"
 96 |                 }, 
 97 |                 {
 98 |                     "cell_type": "code", 
 99 |                     "language": "python", 
100 |                     "outputs": [], 
101 |                     "collapsed": true, 
102 |                     "prompt_number": 30, 
103 |                     "input": "pl.imshow(cm)"
104 |                 }
105 |             ]
106 |         }
107 |     ], 
108 |     "metadata": {
109 |         "name": "structure_digits"
110 |     }, 
111 |     "nbformat": 2
112 | }


--------------------------------------------------------------------------------
/test.drawio:
--------------------------------------------------------------------------------
 1 | <mxfile host="app.diagrams.net" modified="2021-03-10T10:58:52.625Z" agent="5.0 (Macintosh)" etag="_Y5496tpfcfYtsVGTi3J" version="14.4.6" type="github">
 2 |   <diagram id="BAbo_2kxki6wbcmEBhtL" name="Page-1">
 3 |     <mxGraphModel dx="946" dy="626" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="1" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
 4 |       <root>
 5 |         <mxCell id="0" />
 6 |         <mxCell id="1" parent="0" />
 7 |         <mxCell id="BHyEHo-JyhSXneRQvZ70-1" value="" style="shape=image;html=1;verticalLabelPosition=bottom;verticalAlign=top;imageAspect=1;aspect=fixed;image=img/clipart/Gear_128x128.png" vertex="1" parent="1">
 8 |           <mxGeometry x="400" y="290" width="52" height="61" as="geometry" />
 9 |         </mxCell>
10 |       </root>
11 |     </mxGraphModel>
12 |   </diagram>
13 | </mxfile>
14 | 


--------------------------------------------------------------------------------
/ubuntu-quickstart.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Script to quickly setup an ipython notebook env on a stock Ubuntu 13.04
 3 | 
 4 | set -ex
 5 | 
 6 | 
 7 | sudo apt-get install -y \
 8 |     python-numpy python-scipy python-dev libatlas-dev \
 9 |     python-zmq python-pip python-virtualenv \
10 |     git libnuma-dev numactl htop vim python-matplotlib libevent-dev
11 | 
12 | sudo update-alternatives --set editor /usr/bin/vim.basic
13 | 
14 | cd $HOME
15 | if [ ! -d "venv" ]; then
16 |     virtualenv --system-site-packages venv
17 | fi
18 | . venv/bin/activate
19 | 
20 | pip install scikit-learn ipython[notebook] blosc apache-libcloud gevent numa
21 | pip install git+https://github.com/esc/bloscpack
22 | 
23 | git config --global user.name "Olivier Grisel"
24 | git config --global user.email olivier.grisel@ensta.org
25 | 
26 | if [ ! -x "$HOME/.ssh/config" ]; then
27 |     if f [ ! -d "$HOME/.ssh" ]; then
28 |         mkdir $HOME/.ssh
29 |     fi
30 |     echo "Host github.com" >> $HOME/.ssh/config
31 |     echo "    StrictHostKeyChecking no" >> $HOME/.ssh/config
32 | fi
33 | 
34 | if [ ! -d "$HOME/notebooks" ]; then
35 |     git clone git@github.com:ogrisel/notebooks.git
36 | fi
37 | 
38 | if [ -d "/mnt/resource" ]; then
39 |     # Azure
40 |     DATA_ROOT=/mnt/resource
41 | else
42 |     # EC2
43 |     DATA_ROOT=/mnt
44 | fi
45 | 
46 | if [ ! -d "$DATA_ROOT/$USER" ]; then
47 |     sudo mkdir $DATA_ROOT/$USER
48 |     sudo chown -R $USER. $DATA_ROOT/$USER
49 | 
50 |     mkdir $DATA_ROOT/$USER/data
51 |     ln -s $DATA_ROOT/$USER/data $HOME/data
52 | fi
53 | 
54 | # (Re)start the notebook process
55 | cd $HOME/notebooks
56 | pkill -9 -f "disabled-ipython-browser" || echo "Nothing to kill"
57 | nohup ~/venv/bin/ipython notebook \
58 |     --ip="*" \
59 |     --browser="disabled-ipython-browser" &
60 | 
61 | 


--------------------------------------------------------------------------------