├── 4740
    ├── Gradient descent.ipynb
    └── QR.ipynb
├── .gitignore
├── AutoML
    ├── AutoML_in_class_demo.ipynb
    ├── README.md
    └── oboe
    │   ├── LICENSE.txt
    │   ├── README.md
    │   ├── automl
    │       ├── README.md
    │       ├── __init__.py
    │       ├── auto_learner.py
    │       ├── convex_opt.py
    │       ├── defaults
    │       │   ├── classification.json
    │       │   ├── dataset_sizes.csv
    │       │   ├── error_matrix.csv
    │       │   ├── regression.json
    │       │   ├── runtime_matrix.csv
    │       │   └── runtime_predictor.pkl
    │       ├── generate_matrix.sh
    │       ├── generate_vector.py
    │       ├── linalg.py
    │       ├── model.py
    │       ├── preprocessing.py
    │       └── util.py
    │   └── examples
    │       ├── README.md
    │       ├── classification.ipynb
    │       └── error_matrix_generation
    │           ├── README.md
    │           ├── dataset_11_features_and_labels.csv
    │           ├── dataset_18_features_and_labels.csv
    │           ├── generate.sh
    │           └── merge.sh
├── Bootstrap.ipynb
├── QR.ipynb
├── README.md
├── RegularizedRegression.ipynb
├── SVD.ipynb
├── eda.ipynb
├── ensembles.ipynb
├── feature_engineering.ipynb
├── forecasting.ipynb
├── gradient_descent.ipynb
├── great_embedder.py
├── julia
    ├── Classification.ipynb
    ├── Crime.ipynb
    ├── Fairness-Income.ipynb
    ├── GitHub Tutorials.ipynb
    ├── Gradient descent.ipynb
    ├── Julia Syntax Tutorial.ipynb
    ├── Linear models.ipynb
    ├── LossFunctions_multiclass.ipynb
    ├── LowRankModelsDemo-long.ipynb
    ├── LowRankModelsDemo.ipynb
    ├── Multiclass and Ordinal.ipynb
    ├── Predicting COVID.ipynb
    ├── Predicting crime.ipynb
    ├── ProximalGradient.ipynb
    ├── QR.ipynb
    ├── RegularizedRegression.ipynb
    ├── Robust regression.ipynb
    ├── SIR.ipynb
    ├── SVD.ipynb
    ├── Scikit-learn.ipynb
    ├── Section-Regularization+Scaling.ipynb
    ├── Sklearn_demo.ipynb
    ├── Untitled.ipynb
    ├── double-descent.ipynb
    ├── eda.ipynb
    ├── proxgrad-starter-code.ipynb
    └── spectralGraphTheory.ipynb
├── linear_models.ipynb
├── python-refresher.ipynb
├── robust_regression.ipynb
└── trees.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .ipynb_checkpoints
3 | 


--------------------------------------------------------------------------------
/4740/QR.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "# generate random data matrix\n",
 12 |     "n,d = 6,4\n",
 13 |     "X = randn(n,d)\n",
 14 |     "\n",
 15 |     "# optional: give it linearly dependent columns\n",
 16 |     "# X[:,3] = X[:,2]"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "metadata": {
 23 |     "collapsed": false
 24 |    },
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "Q,R = qr(X)"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "metadata": {
 34 |     "collapsed": false
 35 |    },
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "Q"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {
 45 |     "collapsed": false
 46 |    },
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "R"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {
 56 |     "collapsed": false
 57 |    },
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "Q'*Q"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": null,
 66 |    "metadata": {
 67 |     "collapsed": true
 68 |    },
 69 |    "outputs": [],
 70 |    "source": [
 71 |     "# form data from noisy linear model\n",
 72 |     "β♮ = randn(d)\n",
 73 |     "y = X*β♮ + .1*randn(n);"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": false
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "# solve least squares problem to estimate w\n",
 85 |     "β = R \\ (Q'*y)"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {
 92 |     "collapsed": false
 93 |    },
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "# how good is our estimate?\n",
 97 |     "norm(β - β♮)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {
104 |     "collapsed": false
105 |    },
106 |    "outputs": [],
107 |    "source": [
108 |     "# compute mean square error\n",
109 |     "mean((y - X*β).^2)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": null,
115 |    "metadata": {
116 |     "collapsed": false
117 |    },
118 |    "outputs": [],
119 |    "source": [
120 |     "# let's use the shorthand\n",
121 |     "β_backslash = X \\ y\n",
122 |     "norm(w_backslash - β)"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": null,
128 |    "metadata": {
129 |     "collapsed": false
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "β_backslash"
134 |    ]
135 |   }
136 |  ],
137 |  "metadata": {
138 |   "kernelspec": {
139 |    "display_name": "Julia 0.5.0",
140 |    "language": "julia",
141 |    "name": "julia-0.5"
142 |   },
143 |   "language_info": {
144 |    "file_extension": ".jl",
145 |    "mimetype": "application/julia",
146 |    "name": "julia",
147 |    "version": "0.5.0"
148 |   }
149 |  },
150 |  "nbformat": 4,
151 |  "nbformat_minor": 0
152 | }
153 | 


--------------------------------------------------------------------------------
/AutoML/AutoML_in_class_demo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "We use the Airbnb dataset from Homework 3 to illustrate how different AutoML frameworks work, by doing model selection on the training set and then evaluate on test set. The error metric we are using is balanced error rate, which is the average of false positive rate and false negative rate, and then take the average of those averages across classes."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "import sys\n",
 17 |     "import pandas as pd\n",
 18 |     "import os\n",
 19 |     "import time\n",
 20 |     "from datetime import datetime\n",
 21 |     "import numpy as np\n",
 22 |     "import multiprocessing as mp\n",
 23 |     "\n",
 24 |     "from sklearn.datasets import load_iris\n",
 25 |     "from sklearn.model_selection import train_test_split\n",
 26 |     "\n",
 27 |     "import autosklearn.classification\n",
 28 |     "from autosklearn.metrics import balanced_accuracy"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 2,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "automl_path = 'oboe/automl/'\n",
 38 |     "sys.path.append(automl_path)\n",
 39 |     "from auto_learner import AutoLearner\n",
 40 |     "import util\n",
 41 |     "\n",
 42 |     "# disable warnings\n",
 43 |     "import warnings\n",
 44 |     "warnings.filterwarnings('ignore')"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "Prepare the Airbnb dataset."
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 3,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "airbnb_dataset_size = 3000 # number of points to keep in subsampling\n",
 61 |     "\n",
 62 |     "df_airbnb = pd.read_csv(\"airbnb.csv\", index_col=None, header=0)\n",
 63 |     "df_airbnb.drop(df_airbnb[df_airbnb.price == np.nan].index, inplace=True)\n",
 64 |     "features_real = [\n",
 65 |     "  \"host_listings_count\",\n",
 66 |     "  \"host_total_listings_count\",\n",
 67 |     "  \"accommodates\",\n",
 68 |     "  \"bathrooms\",\n",
 69 |     "  \"bedrooms\",\n",
 70 |     "  \"guests_included\",\n",
 71 |     "  \"extra_people\",\n",
 72 |     "  \"minimum_nights\",\n",
 73 |     "  \"maximum_nights\",\n",
 74 |     "  \"availability_30\",\n",
 75 |     "  \"availability_60\",\n",
 76 |     "  \"availability_90\",\n",
 77 |     "  \"availability_365\",\n",
 78 |     "  \"number_of_reviews\",\n",
 79 |     "  \"review_scores_rating\",\n",
 80 |     "  \"review_scores_accuracy\",\n",
 81 |     "  \"review_scores_cleanliness\",\n",
 82 |     "  \"review_scores_checkin\",\n",
 83 |     "  \"review_scores_communication\",\n",
 84 |     "  \"review_scores_location\",\n",
 85 |     "  \"price\"\n",
 86 |     "]\n",
 87 |     "\n",
 88 |     "label = [\"review_scores_value\"]\n",
 89 |     "x = df_airbnb[features_real].values\n",
 90 |     "y = df_airbnb[label].values.flatten()\n",
 91 |     "\n",
 92 |     "np.random.seed(0)\n",
 93 |     "idx_to_keep = np.random.choice(np.arange(y.shape[0]), size=airbnb_dataset_size, replace=False)\n",
 94 |     "x = x[idx_to_keep]\n",
 95 |     "y = y[idx_to_keep]\n",
 96 |     "    \n",
 97 |     "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "# Part I: auto-sklearn"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "We may restrict the estimator search space to only search for a good classifier among these models."
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 4,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "include_estimators = [\"adaboost\",\"gaussian_nb\", \"extra_trees\", \"gradient_boosting\", \n",
121 |     "                                 \"liblinear_svc\", \"libsvm_svc\",\"random_forest\",\n",
122 |     "                                 \"k_nearest_neighbors\",\"decision_tree\"]"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "We may also specify a running time limit."
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 5,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "runtime_limit = 120"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 6,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "# A wrapper class for the auto-sklearn learner.\n",
148 |     "def AutoSklearn(total_runtime, train_features, train_labels):\n",
149 |     "    clf = autosklearn.classification.AutoSklearnClassifier(\n",
150 |     "            time_left_for_this_task=total_runtime,\n",
151 |     "            tmp_folder='tmp/autosklearn_tmp_'+str(datetime.now()), \n",
152 |     "            output_folder='tmp/autosklearn_output_'+str(datetime.now()),\n",
153 |     "            metric=balanced_accuracy,\n",
154 |     "            include_estimators = include_estimators,\n",
155 |     "    )\n",
156 |     "        \n",
157 |     "    clf.fit(train_features, train_labels)    \n",
158 |     "    return clf"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "Run auto-sklearn for 120 seconds."
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 7,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "runtime = 120\n",
175 |     "clf = AutoSklearn(runtime, x_train, y_train)"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "markdown",
180 |    "metadata": {},
181 |    "source": [
182 |     "Get predicted training and test labels."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 8,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "y_train_pred_autosklearn = clf.predict(x_train)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 9,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "y_test_pred_autosklearn = clf.predict(x_test)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "markdown",
205 |    "metadata": {},
206 |    "source": [
207 |     "Show which models the learner has picked."
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 10,
213 |    "metadata": {
214 |     "scrolled": true
215 |    },
216 |    "outputs": [
217 |     {
218 |      "data": {
219 |       "text/plain": [
220 |        "\"[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'polynomial', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.21794354428393548, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 2, 'classifier:random_forest:min_samples_split': 16, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0025451910134387575, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 1477, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:polynomial:degree': 2, 'feature_preprocessor:polynomial:include_bias': 'True', 'feature_preprocessor:polynomial:interaction_only': 'False'},\\ndataset_properties={\\n  'task': 2,\\n  'sparse': False,\\n  'multilabel': False,\\n  'multiclass': True,\\n  'target_type': 'classification',\\n  'signed': False})),\\n]\""
221 |       ]
222 |      },
223 |      "execution_count": 10,
224 |      "metadata": {},
225 |      "output_type": "execute_result"
226 |     }
227 |    ],
228 |    "source": [
229 |     "clf.show_models()"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "markdown",
234 |    "metadata": {},
235 |    "source": [
236 |     "Show the error on test dataset."
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 11,
242 |    "metadata": {},
243 |    "outputs": [
244 |     {
245 |      "data": {
246 |       "text/plain": [
247 |        "0.10065950071453614"
248 |       ]
249 |      },
250 |      "execution_count": 11,
251 |      "metadata": {},
252 |      "output_type": "execute_result"
253 |     }
254 |    ],
255 |    "source": [
256 |     "util.error(y_train, y_train_pred_autosklearn, 'classification')"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 12,
262 |    "metadata": {
263 |     "scrolled": true
264 |    },
265 |    "outputs": [
266 |     {
267 |      "data": {
268 |       "text/plain": [
269 |        "0.18850923114927096"
270 |       ]
271 |      },
272 |      "execution_count": 12,
273 |      "metadata": {},
274 |      "output_type": "execute_result"
275 |     }
276 |    ],
277 |    "source": [
278 |     "util.error(y_test, y_test_pred_autosklearn, 'classification')"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "# Part II: TPOT"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "TPOT is an AutoML tool that optimizes machine learning pipelines by genetic programming."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 13,
298 |    "metadata": {
299 |     "scrolled": true
300 |    },
301 |    "outputs": [],
302 |    "source": [
303 |     "from tpot import TPOTClassifier"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "markdown",
308 |    "metadata": {},
309 |    "source": [
310 |     "Run TPOT for 120 seconds."
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 14,
316 |    "metadata": {},
317 |    "outputs": [
318 |     {
319 |      "data": {
320 |       "application/vnd.jupyter.widget-view+json": {
321 |        "model_id": "681aa82e4f3645c4a8e3dc17a89200a3",
322 |        "version_major": 2,
323 |        "version_minor": 0
324 |       },
325 |       "text/plain": [
326 |        "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=20.0, style=ProgressStyle(des…"
327 |       ]
328 |      },
329 |      "metadata": {},
330 |      "output_type": "display_data"
331 |     },
332 |     {
333 |      "name": "stdout",
334 |      "output_type": "stream",
335 |      "text": [
336 |       "\r\n",
337 |       "\r",
338 |       "0.58 minutes have elapsed. TPOT will close down.\n",
339 |       "TPOT closed during evaluation in one generation.\n",
340 |       "WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.\n",
341 |       "\r\n",
342 |       "\r\n",
343 |       "TPOT closed prematurely. Will use the current best pipeline.\n",
344 |       "\r\n",
345 |       "Best pipeline: DecisionTreeClassifier(SelectFwe(input_matrix, alpha=0.042), criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=7)\n"
346 |      ]
347 |     },
348 |     {
349 |      "data": {
350 |       "text/plain": [
351 |        "TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,\n",
352 |        "               disable_update_check=False, early_stop=None, generations=5,\n",
353 |        "               log_file=<ipykernel.iostream.OutStream object at 0x7f26b2870780>,\n",
354 |        "               max_eval_time_mins=5, max_time_mins=0.5, memory=None,\n",
355 |        "               mutation_rate=0.9, n_jobs=1, offspring_size=None,\n",
356 |        "               periodic_checkpoint_folder=None, population_size=20,\n",
357 |        "               random_state=None, scoring=None, subsample=1.0, template=None,\n",
358 |        "               use_dask=False, verbosity=2, warm_start=False)"
359 |       ]
360 |      },
361 |      "execution_count": 14,
362 |      "metadata": {},
363 |      "output_type": "execute_result"
364 |     }
365 |    ],
366 |    "source": [
367 |     "tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, max_time_mins=.5)\n",
368 |     "tpot.fit(x_train, y_train)"
369 |    ]
370 |   },
371 |   {
372 |    "cell_type": "code",
373 |    "execution_count": 15,
374 |    "metadata": {},
375 |    "outputs": [],
376 |    "source": [
377 |     "y_train_pred_tpot = tpot.predict(x_train)\n",
378 |     "y_test_pred_tpot = tpot.predict(x_test)"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "markdown",
383 |    "metadata": {},
384 |    "source": [
385 |     "Show the error on test dataset."
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": 16,
391 |    "metadata": {},
392 |    "outputs": [
393 |     {
394 |      "data": {
395 |       "text/plain": [
396 |        "0.3397069433952885"
397 |       ]
398 |      },
399 |      "execution_count": 16,
400 |      "metadata": {},
401 |      "output_type": "execute_result"
402 |     }
403 |    ],
404 |    "source": [
405 |     "#tpot training error\n",
406 |     "util.error(y_train, y_train_pred_tpot, 'classification')"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 18,
412 |    "metadata": {},
413 |    "outputs": [
414 |     {
415 |      "data": {
416 |       "text/plain": [
417 |        "0.3557644506500882"
418 |       ]
419 |      },
420 |      "execution_count": 18,
421 |      "metadata": {},
422 |      "output_type": "execute_result"
423 |     }
424 |    ],
425 |    "source": [
426 |     "#tpot test error\n",
427 |     "util.error(y_test, y_test_pred_tpot, 'classification')"
428 |    ]
429 |   },
430 |   {
431 |    "cell_type": "markdown",
432 |    "metadata": {},
433 |    "source": [
434 |     "# Part III: Oboe (still under development)"
435 |    ]
436 |   },
437 |   {
438 |    "cell_type": "markdown",
439 |    "metadata": {},
440 |    "source": [
441 |     "## Oboe Example 1: build an ensemble of models"
442 |    ]
443 |   },
444 |   {
445 |    "cell_type": "code",
446 |    "execution_count": 19,
447 |    "metadata": {},
448 |    "outputs": [],
449 |    "source": [
450 |     "#experimental settings\n",
451 |     "VERBOSE = False #whether to print out information indicating current fitting progress\n",
452 |     "N_CORES = 1 #number of cores\n",
453 |     "RUNTIME_BUDGET = 30"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 20,
459 |    "metadata": {},
460 |    "outputs": [],
461 |    "source": [
462 |     "#optional: limit the types of algorithms\n",
463 |     "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "code",
468 |    "execution_count": 21,
469 |    "metadata": {},
470 |    "outputs": [],
471 |    "source": [
472 |     "#autolearner arguments\n",
473 |     "autolearner_kwargs = {\n",
474 |     "    'p_type': 'classification',\n",
475 |     "    'runtime_limit': RUNTIME_BUDGET,\n",
476 |     "    'verbose': VERBOSE,\n",
477 |     "    'selection_method': 'min_variance',\n",
478 |     "    'algorithms': s,\n",
479 |     "    'stacking_alg': 'greedy',\n",
480 |     "    'n_cores': N_CORES,\n",
481 |     "    'build_ensemble': True,\n",
482 |     "}"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": 22,
488 |    "metadata": {},
489 |    "outputs": [],
490 |    "source": [
491 |     "#intialize the autolearner class\n",
492 |     "m = AutoLearner(**autolearner_kwargs)"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": 23,
498 |    "metadata": {},
499 |    "outputs": [],
500 |    "source": [
501 |     "# fit autolearner on training set and record runtime\n",
502 |     "start = time.time()\n",
503 |     "m.fit(x_train, y_train)\n",
504 |     "elapsed_time = time.time() - start"
505 |    ]
506 |   },
507 |   {
508 |    "cell_type": "code",
509 |    "execution_count": 24,
510 |    "metadata": {},
511 |    "outputs": [
512 |     {
513 |      "name": "stdout",
514 |      "output_type": "stream",
515 |      "text": [
516 |       "prediction error: 0.3139487158460067\n",
517 |       "elapsed time: 27.216959714889526\n",
518 |       "individual accuracies of selected models: [0.31651432260061413, 0.32036420549246775, 0.26796126609153437, 0.42171443806106124, 0.31651432260061413]\n"
519 |      ]
520 |     }
521 |    ],
522 |    "source": [
523 |     "# use the fitted autolearner for prediction on test set\n",
524 |     "y_predicted = m.predict(x_test)\n",
525 |     "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n",
526 |     "print(\"elapsed time: {}\".format(elapsed_time))\n",
527 |     "print(\"individual accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": 25,
533 |    "metadata": {
534 |     "scrolled": true
535 |    },
536 |    "outputs": [
537 |     {
538 |      "data": {
539 |       "text/plain": [
540 |        "{'ensemble method': 'greedy selection',\n",
541 |        " 'base learners': {'DT': [{'min_samples_split': 0.0001},\n",
542 |        "   {'min_samples_split': 4},\n",
543 |        "   {'min_samples_split': 1024},\n",
544 |        "   {'min_samples_split': 1e-05}],\n",
545 |        "  'GNB': [{}]}}"
546 |       ]
547 |      },
548 |      "execution_count": 25,
549 |      "metadata": {},
550 |      "output_type": "execute_result"
551 |     }
552 |    ],
553 |    "source": [
554 |     "# get names of the selected machine learning models\n",
555 |     "m.get_models()"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "markdown",
560 |    "metadata": {},
561 |    "source": [
562 |     "## Oboe Example 2: just select a collection of promising models without building an ensemble afterwards"
563 |    ]
564 |   },
565 |   {
566 |    "cell_type": "code",
567 |    "execution_count": 26,
568 |    "metadata": {},
569 |    "outputs": [],
570 |    "source": [
571 |     "#experimental settings\n",
572 |     "VERBOSE = False #whether to print out information indicating current fitting progress\n",
573 |     "N_CORES = 1 #number of cores\n",
574 |     "RUNTIME_BUDGET = 30"
575 |    ]
576 |   },
577 |   {
578 |    "cell_type": "code",
579 |    "execution_count": 27,
580 |    "metadata": {},
581 |    "outputs": [],
582 |    "source": [
583 |     "#optional: limit the types of algorithms\n",
584 |     "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']"
585 |    ]
586 |   },
587 |   {
588 |    "cell_type": "code",
589 |    "execution_count": 28,
590 |    "metadata": {},
591 |    "outputs": [],
592 |    "source": [
593 |     "#autolearner arguments\n",
594 |     "autolearner_kwargs = {\n",
595 |     "    'p_type': 'classification',\n",
596 |     "    'runtime_limit': RUNTIME_BUDGET,\n",
597 |     "    'verbose': VERBOSE,\n",
598 |     "    'selection_method': 'min_variance',\n",
599 |     "    'algorithms': s,\n",
600 |     "    'stacking_alg': 'greedy',\n",
601 |     "    'n_cores': N_CORES,\n",
602 |     "    'build_ensemble': False,\n",
603 |     "}"
604 |    ]
605 |   },
606 |   {
607 |    "cell_type": "code",
608 |    "execution_count": 29,
609 |    "metadata": {},
610 |    "outputs": [],
611 |    "source": [
612 |     "#intialize the autolearner class\n",
613 |     "m = AutoLearner(**autolearner_kwargs)"
614 |    ]
615 |   },
616 |   {
617 |    "cell_type": "code",
618 |    "execution_count": 30,
619 |    "metadata": {},
620 |    "outputs": [],
621 |    "source": [
622 |     "# fit autolearner on training set and record runtime\n",
623 |     "start = time.time()\n",
624 |     "m.fit(x_train, y_train)\n",
625 |     "elapsed_time = time.time() - start"
626 |    ]
627 |   },
628 |   {
629 |    "cell_type": "code",
630 |    "execution_count": 31,
631 |    "metadata": {},
632 |    "outputs": [
633 |     {
634 |      "name": "stdout",
635 |      "output_type": "stream",
636 |      "text": [
637 |       "elapsed time: 10.797972679138184\n",
638 |       "accuracies of selected models: [0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.3124772208748183, 0.32036420549246775, 0.35449564337367473, 0.3495441004342578, 0.39048041301086595, 0.26796126609153437, 0.42171443806106124, 0.3001999674360773, 0.3515055898002219, 0.2644854823131578, 0.32215416955204695, 0.31565880270077473, 0.377873248684861, 0.297898906174363]\n"
639 |      ]
640 |     }
641 |    ],
642 |    "source": [
643 |     "# use the fitted autolearner for prediction on test set\n",
644 |     "y_predicted = m.predict(x_test)\n",
645 |     " \n",
646 |     "print(\"elapsed time: {}\".format(elapsed_time))\n",
647 |     "print(\"accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))"
648 |    ]
649 |   },
650 |   {
651 |    "cell_type": "markdown",
652 |    "metadata": {},
653 |    "source": [
654 |     "Note that we do not have a single accuracy value here if we do not build an ensemble, instead, we just have a collection of fitted models with individual accuracies reported.\n",
655 |     "\n",
656 |     "The following shows which models we have picked."
657 |    ]
658 |   },
659 |   {
660 |    "cell_type": "code",
661 |    "execution_count": 32,
662 |    "metadata": {},
663 |    "outputs": [
664 |     {
665 |      "data": {
666 |       "text/plain": [
667 |        "{'DT': [{'min_samples_split': 1e-05},\n",
668 |        "  {'min_samples_split': 1e-05},\n",
669 |        "  {'min_samples_split': 0.0001},\n",
670 |        "  {'min_samples_split': 2},\n",
671 |        "  {'min_samples_split': 0.001},\n",
672 |        "  {'min_samples_split': 4},\n",
673 |        "  {'min_samples_split': 64},\n",
674 |        "  {'min_samples_split': 128},\n",
675 |        "  {'min_samples_split': 256},\n",
676 |        "  {'min_samples_split': 1024},\n",
677 |        "  {'min_samples_split': 8},\n",
678 |        "  {'min_samples_split': 16},\n",
679 |        "  {'min_samples_split': 32},\n",
680 |        "  {'min_samples_split': 0.01}],\n",
681 |        " 'GNB': [{}],\n",
682 |        " 'AB': [{'n_estimators': 50, 'learning_rate': 1},\n",
683 |        "  {'n_estimators': 50, 'learning_rate': 1.5},\n",
684 |        "  {'n_estimators': 100, 'learning_rate': 1}]}"
685 |       ]
686 |      },
687 |      "execution_count": 32,
688 |      "metadata": {},
689 |      "output_type": "execute_result"
690 |     }
691 |    ],
692 |    "source": [
693 |     "m.get_models()"
694 |    ]
695 |   }
696 |  ],
697 |  "metadata": {
698 |   "kernelspec": {
699 |    "display_name": "Python 3",
700 |    "language": "python",
701 |    "name": "python3"
702 |   },
703 |   "language_info": {
704 |    "codemirror_mode": {
705 |     "name": "ipython",
706 |     "version": 3
707 |    },
708 |    "file_extension": ".py",
709 |    "mimetype": "text/x-python",
710 |    "name": "python",
711 |    "nbconvert_exporter": "python",
712 |    "pygments_lexer": "ipython3",
713 |    "version": "3.7.3"
714 |   }
715 |  },
716 |  "nbformat": 4,
717 |  "nbformat_minor": 2
718 | }
719 | 


--------------------------------------------------------------------------------
/AutoML/README.md:
--------------------------------------------------------------------------------
 1 | # AutoML demo
 2 | 
 3 | Please find the demo as the Jupyter notebook. Before running the demo, you would need to download the Airbnb dataset we used in HW3 from <https://www.kaggle.com/c/airbnblala/data#>, and also install `auto-sklearn` and `TPOT`. The installation guides are:
 4 | 
 5 | - auto-sklearn (may only work on Linux): <https://automl.github.io/auto-sklearn/master/installation.html#installation>
 6 | 
 7 | - TPOT: <http://epistasislab.github.io/tpot/installing/>
 8 | 
 9 | The Oboe system is developed by us and is still in progress of development (at <https://github.com/udellgroup/oboe>). Suggestions are welcomed!
10 | 
11 | ## References
12 | [1] Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum, Frank Hutter. Efficient and robust automated machine learning. NIPS 2015.
13 | 
14 | [2] Randal S Olson, Jason H Moore. TPOT: A tree-based pipeline opti-mization tool for automating machine learning. Automated Machine Learning 2019.
15 | 
16 | [3] Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell. OBOE: Collaborative filtering for AutoML model selection. KDD 2019.


--------------------------------------------------------------------------------
/AutoML/oboe/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2019, Chengrun Yang
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 |     * Redistributions of source code must retain the above copyright
 7 |       notice, this list of conditions and the following disclaimer.
 8 |     * Redistributions in binary form must reproduce the above copyright
 9 |       notice, this list of conditions and the following disclaimer in the
10 |       documentation and/or other materials provided with the distribution.
11 |     * Neither the name of the <organization> nor the
12 |       names of its contributors may be used to endorse or promote products
13 |       derived from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/AutoML/oboe/README.md:
--------------------------------------------------------------------------------
 1 | # Oboe
 2 | 
 3 | In an orchestra, the oboe plays an initial note which the other instruments use to tune to the right frequency before the performance begins; this package, Oboe, is an automated machine learning/model selection system that uses collaborative filtering to find good models for supervised learning tasks within a user-specified time limit. Further hyperparameter tuning can be performed afterwards.
 4 | 
 5 | Oboe is based on matrix factorization and classical experiment design. For a complete description, refer to our paper at KDD 2019: [OBOE: Collaborative Filtering for AutoML Model Selection](https://arxiv.org/abs/1808.03233).
 6 | 
 7 | This system is still under developement and subjects to change.
 8 | 
 9 | ## Installation
10 | 
11 | #### Dependencies
12 | oboe requires:
13 | * Python (>= 3.5)
14 | * numpy  (>= 1.8.2)
15 | * scipy  (>= 0.13.3)
16 | * pandas (>=0.22.0)
17 | * scikit-learn  (>= 0.18)
18 | * multiprocessing (>=0.70.5)
19 | * OpenML (>=0.7.0)
20 | * mkl (>=1.0.0)
21 | * re
22 | * os
23 | * json
24 | * util
25 | 
26 | #### User Installation
27 | This part is currently under development; an example for code usage is in the `example` folder. The package will be pip installable in the future.
28 | 
29 | ## Usage
30 | 
31 | ### Online Phase (AutoML model selection)
32 | Given a new dataset, we want to select promising models and hyperparameters. Denote features and labels of the training set as `x_train` and `y_train`, and features of the test set as `x_test`, a short example of training and testing is
33 | ```
34 | from auto_learner import AutoLearner
35 | m = AutoLearner(runtime_limit=20) #set the time limit for model fitting to be 20 seconds
36 | m.fit(x_train, y_train)
37 | m.predict(x_test)
38 | ```
39 | Additional arguments can be applied to customize the `AutoLearner` instance, including:
40 | 1. Basics
41 | * p_type (str): Problem type, which is one of {'classification', 'regression'}. By default, 'classification'.
42 | * verbose (Boolean): Whether or not to generate print statements that showcase the progress. By default, false.
43 | * n_cores (int): Maximum number of CPU cores to use. The default value 'None' means no limit, i.e., up to all the CPU cores of the machine.
44 | * runtime_limit (int): Maximum runtime for AutoLearner fitting, in seconds. By default, 512 seconds as the timeout limit.
45 | * scalarization (str): Scalarization of the covariance matrix for mininum variance selection. One of {'D', 'A', 'E'}. 'D', as default, enjoys best performance and fastest speed in practice.
46 | * build_ensemble (Boolean): Whether to build an ensemble of promising models.
47 | * stacking_alg (str): The method used for ensemble construction. One of {'greedy', 'stacking'}. By default, 'greedy'.
48 | * dataset_ratio_threshold (float): The threshold of dataset ratio for dataset subsampling, if the training set is tall and skinny (i.e., number of data points much larger than number of features).
49 | 
50 | 2. Advanced customization
51 | * algorithms (list): A list of algorithm types to be considered, in strings, e.g. ['KNN', 'lSVM']. By default, all the algorithms in the error matrix. The supported classification algorithms are: 'AB' (Adaboost), 'DT' (decision tree), 'ExtraTrees' (extra trees), 'GBT' (gradient boosting), 'GNB' (Gaussian naive Bayes), 'KNN' (kNN), 'Logit' (logistic regression), 'MLP' (multilayer perceptron), 'Perceptron' (perceptron), 'RF' (random forest), 'kSVM' (kernel SVM), 'lSVM' (linear SVM).
52 | * hyperparameters (dict): A nested dict of hyperparameters to be considered. By default, all the model hyperparameters in the error matrix.
53 | * error_matrix (DataFrame): Error matrix to use for imputation, includes indices and headers. The one in `defaults` folder is used by default.
54 | * runtime_matrix (DataFrame): Runtime matrix to use for runtime prediction, includes indices and headers. The one in `defaults` folder is used by default.
55 | * new_row (np.ndarray): Predicted row of error matrix; corresponds to the new dataset. By default, 'None'.
56 | * selection_method (str): Method of selecting entries of new row to sample. One of {'min_variance', 'qr'}. 'min_variance' corresponds to the selection approach via classic experiment design; 'qr' selects the pivot columns in the error matrix and thus does not provide the functionality of maximizing performance within given runtime budget. By default, 'min_variance'.
57 | * runtime_predictor (str): Model for runtime prediction. One of {'LinearRegression', 'KNeighborsRegressor'}. By default, 'LinearRegression'. Dataset sizes (number of data points and number of features) are used as feature vectors for both runtime predictor models.
58 | 
59 | For executable and more detailed examples, please refer to the `example` folder.
60 | 
61 | ### Offline Phase
62 | 
63 | ##### Error Matrix Generation
64 | Please refer to `examples/error_matrix_generation` for an error matrix generation example.


--------------------------------------------------------------------------------
/AutoML/oboe/automl/README.md:
--------------------------------------------------------------------------------
 1 | # Oboe
 2 | Technical & implementation details of the Oboe
 3 | package.
 4 | 
 5 | #### Classification Algorithms
 6 | * K-Nearest Neighbors
 7 | * Decision Tree
 8 | * Random Forest
 9 | * Gradient Boosting Tree
10 | * Adaboost Tree
11 | * Linear SVM
12 | * Kernel SVM
13 | * Logistic Regression
14 | * Perceptron
15 | * Gaussian Naive Bayes
16 | 
17 | #### Regression Algorithms
18 | * (under development)
19 | 
20 | #### Notes on usage
21 | * Oboe currently only supports datasets that
22 | are saved as .csv files. Additionally, it is assumed that
23 | these .csv files contain *only* the data, i.e. there are
24 | no row or column names included in the file.
25 | 
26 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/__init__.py:
--------------------------------------------------------------------------------
1 | from auto_learner import AutoLearner
2 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/convex_opt.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Find columns of error matrix to minimize variance of predicted latent features.
  3 | Solves convex optimization problem as described in chapter 7.5 in https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf
  4 | """
  5 | 
  6 | import numpy as np
  7 | import os
  8 | import pandas as pd
  9 | import pickle
 10 | import openml
 11 | import subprocess
 12 | from scipy.optimize import minimize
 13 | from sklearn.preprocessing import PolynomialFeatures
 14 | from sklearn.linear_model import LinearRegression
 15 | from sklearn.neighbors import KNeighborsRegressor
 16 | 
 17 | 
 18 | def solve(t_predicted, t_max, n_cores, Y, scalarization='D'):
 19 |     """Solve the following optimization problem:
 20 |     minimize -log(det(sum_i v[i]*Y[:, i]*Y[:, i].T)) subject to 0 <= v[i] <= 1 and t_predicted.T * v <= t_max
 21 |     The optimal vector v is an approximation of a boolean vector indicating which entries to sample.
 22 | 
 23 |     Args:
 24 |          t_predicted (np.ndarray): 1-d array specifying predicted runtime for each model setting
 25 |          t_max (float):            maximum runtime of sampled model
 26 |          n_cores (int):            number of cores to use
 27 |          Y (np.ndarray):           matrix representing latent variable weights of error matrix
 28 |          scalarization (str):      scalarization method in experimental design.
 29 |     Returns:
 30 |         np.ndarray:                optimal vector v (not truncated to binary values)
 31 |     """
 32 | 
 33 |     n = len(t_predicted)
 34 | 
 35 |     if scalarization == 'D':
 36 |         def objective(v):
 37 |             sign, log_det = np.linalg.slogdet(Y @ np.diag(v) @ Y.T)
 38 |             return -1 * sign * log_det
 39 |     elif scalarization == 'A':
 40 |         def objective(v):
 41 |             return np.trace(np.linalg.pinv(Y @ np.diag(v) @ Y.T))
 42 |     elif scalarization == 'E':
 43 |         def objective(v):
 44 |             return np.linalg.norm(np.linalg.pinv(Y @ np.diag(v) @ Y.T), ord=2)
 45 |     def constraint(v):
 46 |         return t_max * n_cores- t_predicted @ v
 47 |     v0 = np.full((n, ), 0.5)
 48 |     constraints = {'type': 'ineq', 'fun': constraint}
 49 |     v_opt = minimize(objective, v0, method='SLSQP', bounds=[(0, 1)] * n, options={'maxiter': 30},
 50 |                      constraints=constraints)
 51 |     
 52 |     return v_opt.x
 53 | 
 54 | def predict_runtime(size, runtime_matrix=None, saved_model=None, model_name='LinearRegression', save=False):
 55 |     """Predict the runtime for each model setting on a dataset with given shape.
 56 | 
 57 |     Args:
 58 |         size (tuple):               tuple specifying dataset size as [n_rows, n_columns]
 59 |         runtime_matrix (DataFrame): the DataFame containing runtime.
 60 |         saved_model (str):          path to pre-trained model; defaults to None
 61 |         save (bool):                whether to save pre-trained model
 62 |     Returns:
 63 |         np.ndarray:        1-d array of predicted runtimes
 64 |     """
 65 |     assert len(size) == 2, "Dataset must be 2-dimensional."
 66 |     shape = np.array(size)
 67 | 
 68 |     if saved_model:
 69 |         with open(saved_model, 'rb') as file:
 70 |             model = pickle.load(file)
 71 |         return model.predict(shape)
 72 | 
 73 |     defaults_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'defaults')
 74 |     try:
 75 |         dataset_sizes = pd.read_csv(os.path.join(defaults_path, 'dataset_sizes.csv'), index_col=0)
 76 |         sizes_index = np.array(dataset_sizes.index)
 77 |         sizes = dataset_sizes.values
 78 |     except FileNotFoundError:
 79 |         sizes_index = []
 80 |         sizes = []
 81 |     if runtime_matrix is None:
 82 |         runtime_matrix = pd.read_csv(os.path.join(defaults_path, 'runtime_matrix.csv'), index_col=0)
 83 |     runtimes_index = np.array(runtime_matrix.index)
 84 |     runtimes = runtime_matrix.values
 85 |     model = RuntimePredictor(3, sizes, sizes_index, np.log(runtimes), runtimes_index, model_name=model_name)
 86 |     if save:
 87 |         with open(os.path.join(defaults_path, 'runtime_predictor.pkl'), 'wb') as file:
 88 |             pickle.dump(model, file)
 89 | 
 90 |     return np.exp(model.predict(shape))
 91 | 
 92 | 
 93 | class RuntimePredictor:
 94 |     """Model that predicts the runtime for each model setting on a dataset with given shape. Performs polynomial
 95 |     regression on n (# samples), p (# features), and log(n).
 96 | 
 97 |     Attributes:
 98 |         degree (int):   degree of polynomial basis function
 99 |         n_models (int): number of model settings
100 |         models (list):  list of scikit-learn regression models
101 |     """
102 |     def __init__(self, degree, sizes, sizes_index, runtimes, runtimes_index, model_name='LinearRegression'):
103 |         self.degree = degree
104 |         self.n_models = runtimes.shape[1]
105 |         self.model_name = model_name
106 |         self.models = [None] * self.n_models
107 |         self.fit(sizes, sizes_index, runtimes, runtimes_index)
108 | 
109 |     def fit(self, sizes, sizes_index, runtimes, runtimes_index):
110 |         """Fit polynomial regression on pre-recorded runtimes on datasets."""
111 |         # assert sizes.shape[0] == runtimes.shape[0], "Dataset sizes and runtimes must be recorded on same datasets."
112 |         for i in set(runtimes_index).difference(set(sizes_index)):
113 |             dataset = openml.datasets.get_dataset(i)
114 |             data_numeric, data_labels, categorical = dataset.get_data(target=dataset.default_target_attribute,
115 |                                                                       return_categorical_indicator=True)
116 |             if len(sizes) == 0:
117 |                 sizes = np.array([data_numeric.shape])
118 |                 sizes_index = np.array(i)
119 |             else:
120 |                 sizes = np.concatenate((sizes, np.array([data_numeric.shape])))
121 |                 sizes_index = np.append(sizes_index, i)
122 | 
123 |         sizes_train = np.array([sizes[list(sizes_index).index(i), :] for i in runtimes_index])
124 |         sizes_log = np.concatenate((sizes_train, np.log(sizes_train[:, 0]).reshape(-1, 1)), axis=1)
125 |         sizes_train_poly = PolynomialFeatures(self.degree).fit_transform(sizes_log)
126 | 
127 |         # train independent regression model to predict each runtime of each model setting
128 |         for i in range(self.n_models):
129 |             runtime = runtimes[:, i]
130 |             if self.model_name == 'LinearRegression':
131 |                 self.models[i] = LinearRegression().fit(sizes_train_poly, runtime)
132 |             elif self.model_name == 'KNeighborsRegressor':
133 |                 def metric(a, b):
134 |                     coefficients = [1, 100]
135 |                     return np.sum(np.multiply((a - b) ** 2, coefficients))
136 |                         
137 |                 def weights(distances):
138 |                     return distances
139 | 
140 |                 neigh = KNeighborsRegressor(n_neighbors=5, metric=metric, weights=weights)
141 |                 self.models[i] = neigh.fit(sizes_train, runtime)
142 | #            print(self.models[i].coef_)
143 | #            print(self.models[i].intercept_)
144 |             # self.models[i] = Lasso().fit(sizes_train_poly, runtime)
145 | 
146 |     def predict(self, size):
147 |         """Predict runtime of all model settings on a dataset of given size.
148 |         
149 |         Args:
150 |             size(np.array): Size of the dataset to fit runtime onto.
151 |         Returns:
152 |             predictions (np.array): The predicted runtime.
153 |         """
154 |         if self.model_name == 'LinearRegression':
155 |             size_test = np.append(size, np.log(size[0]))
156 |             size_test_poly = PolynomialFeatures(self.degree).fit_transform([size_test])
157 |             predictions = np.zeros(self.n_models)
158 |             for i in range(self.n_models):
159 |                 predictions[i] = self.models[i].predict(size_test_poly)[0]
160 |     
161 |         elif self.model_name == 'KNeighborsRegressor':
162 |             predictions = np.zeros(self.n_models)
163 |             for i in range(self.n_models):
164 |                 predictions[i] = self.models[i].predict(np.array(size).reshape(1, -1))[0]
165 |         
166 | #        # TO BE REMOVED: sanity check
167 | #
168 | #        size_check = (1000, 10)
169 | #        size_check = np.append(size, np.log(size[0]))
170 | #        size_check_poly = PolynomialFeatures(self.degree).fit_transform([size_check])
171 | #        print(size_check_poly)
172 | #        for i in range(self.n_models):
173 | #            print(self.models[i].predict(size_check_poly)[0])
174 | 
175 |         return predictions
176 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/defaults/classification.json:
--------------------------------------------------------------------------------
 1 | {"algorithms": ["KNN", "DT", "RF", "GBT", "AB", "lSVM", "kSVM", "Logit", "Perceptron", "GNB", "MLP", "ExtraTrees"], 
 2 |  "hyperparameters": {
 3 |      "KNN": {"n_neighbors": [1, 3, 5, 7, 9, 11, 13, 15], "p": [1, 2]}, 
 4 |      "DT": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.01,0.001,0.0001,1e-05]}, 
 5 |      "RF": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.1,0.01,0.001,0.0001,1e-05], "criterion": ["gini", "entropy"]}, 
 6 |      "GBT": {"learning_rate": [0.001,0.01,0.025,0.05,0.1,0.25,0.5], "max_depth": [3, 6], "max_features": [null, "log2"]}, 
 7 |      "AB": {"n_estimators": [50, 100], "learning_rate": [1.0, 1.5, 2.0, 2.5, 3.0]}, 
 8 |      "lSVM": {"C": [0.125,0.25,0.5,0.75,1,2,4,8,16]}, 
 9 |      "kSVM": {"C": [0.125,0.25,0.5,0.75,1,2,4,8,16], "kernel": ["rbf", "poly"], "coef0": [0, 10]}, 
10 |      "Logit": {"C": [0.25,0.5,0.75,1,1.5,2,3,4], "solver": ["liblinear", "saga"], "penalty": ["l1", "l2"]}, 
11 |      "Perceptron": {}, 
12 |      "GNB": {}, 
13 |      "MLP": {"learning_rate_init": [0.0001,0.001,0.01], "learning_rate": ["adaptive"], "solver": ["sgd", "adam"], "alpha": [0.0001, 0.01]}, 
14 |      "ExtraTrees": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.1,0.01,0.001,0.0001,1e-05], "criterion": ["gini", "entropy"]}}}


--------------------------------------------------------------------------------
/AutoML/oboe/automl/defaults/dataset_sizes.csv:
--------------------------------------------------------------------------------
  1 | ,0,1
  2 | 1121,294,14
  3 | 1005,214,10
  4 | 1011,336,8
  5 | 1442,253,38
  6 | 1054,161,40
  7 | 1026,155,49
  8 | 1449,253,38
  9 | 1448,194,40
 10 | 1073,274,9
 11 | 1167,320,11
 12 | 1048,369,9
 13 | 1025,400,19
 14 | 1012,194,121
 15 | 1446,296,38
 16 | 1447,327,38
 17 | 1071,403,38
 18 | 1063,522,22
 19 | 1065,458,40
 20 | 1115,151,58
 21 | 1004,600,61
 22 | 1452,745,37
 23 | 1488,195,23
 24 | 1443,661,38
 25 | 1016,990,28
 26 | 1462,1372,5
 27 | 1014,797,22
 28 | 1451,705,38
 29 | 1490,182,13
 30 | 1495,250,19
 31 | 1467,540,21
 32 | 11,625,5
 33 | 1480,583,12
 34 | 1068,1109,22
 35 | 1100,478,23
 36 | 1444,1043,38
 37 | 1500,210,8
 38 | 1499,210,8
 39 | 1498,462,11
 40 | 1453,1077,38
 41 | 1506,470,38
 42 | 1511,440,11
 43 | 1464,748,5
 44 | 1454,1458,38
 45 | 1510,569,31
 46 | 1049,1458,38
 47 | 1524,310,7
 48 | 1494,1055,42
 49 | 1508,403,6
 50 | 1050,1563,38
 51 | 1512,200,14
 52 | 1523,310,7
 53 | 1520,164,91
 54 | 1067,2109,22
 55 | 1020,2000,65
 56 | 1547,1000,21
 57 | 1021,5473,11
 58 | 1529,1521,4
 59 | 1487,2534,73
 60 | 1069,5589,37
 61 | 1530,1515,4
 62 | 1553,700,19
 63 | 1554,500,19
 64 | 1551,400,45
 65 | 1482,340,16
 66 | 1525,5456,3
 67 | 1600,267,45
 68 | 1565,294,14
 69 | 1526,5456,5
 70 | 1489,5404,6
 71 | 1527,3252,4
 72 | 1552,1100,19
 73 | 1544,1277,4
 74 | 1545,1252,4
 75 | 1543,1080,4
 76 | 1542,1183,4
 77 | 187,178,14
 78 | 1466,2126,36
 79 | 1549,750,45
 80 | 1546,1112,4
 81 | 23499,277,42
 82 | 1056,9466,39
 83 | 1560,2126,36
 84 | 1570,4839,6
 85 | 311,937,50
 86 | 1460,5300,3
 87 | 1507,7400,21
 88 | 285,194,121
 89 | 329,160,5
 90 | 333,556,18
 91 | 335,554,18
 92 | 334,601,18
 93 | 1566,1212,101
 94 | 336,267,45
 95 | 31,1000,62
 96 | 337,349,45
 97 | 1131,193,10936
 98 | 338,155,49
 99 | 1555,1000,45
100 | 1496,7400,21
101 | 23,1473,25
102 | 37,768,9
103 | 1558,4521,52
104 | 1557,4177,11
105 | 1497,5456,25
106 | 18,2000,7
107 | 307,990,28
108 | 377,600,61
109 | 316,2417,130
110 | 1472,768,47
111 | 30,5473,11
112 | 1155,195,10936
113 | 1514,360,1301
114 | 36,2310,20
115 | 39,336,8
116 | 3,3196,74
117 | 1154,187,10936
118 | 1152,267,10936
119 | 1458,200,10001
120 | 1132,203,10936
121 | 22,2000,48
122 | 1116,6598,269
123 | 14,2000,77
124 | 312,2407,305
125 | 16,2000,65
126 | 1548,2500,227
127 | 1124,201,10936
128 | 40496,500,8
129 | 40474,2800,47
130 | 40475,2800,47
131 | 1144,329,10936
132 | 1136,250,10936
133 | 40647,1600,59
134 | 40497,3772,22
135 | 1147,337,10936
136 | 40649,1600,60
137 | 1164,185,10936
138 | 40646,1600,61
139 | 40669,160,13
140 | 1022,2000,1649
141 | 40665,476,169
142 | 40648,1600,61
143 | 40682,215,6
144 | 40663,399,132
145 | 40476,2800,47
146 | 40650,1600,60
147 | 40671,327,8
148 | 40680,1324,21
149 | 40690,512,19
150 | 1501,1593,257
151 | 40686,315,35
152 | 40477,2800,47
153 | 40693,973,19
154 | 40700,392,12
155 | 40702,1066,32
156 | 40710,303,31
157 | 12,2000,217
158 | 1157,321,10936
159 | 40664,1728,43
160 | 40705,959,45
161 | 40704,2201,4
162 | 40478,2800,47
163 | 40994,540,19
164 | 40916,158,20
165 | 1485,2600,501
166 | 1135,355,10936
167 | 40711,303,21
168 | 40,208,61
169 | 40687,1066,43
170 | 313,531,105
171 | 40999,2351,70
172 | 40691,1599,12
173 | 40706,1124,21
174 | 1129,384,10936
175 | 40713,3772,55
176 | 41007,2352,72
177 | 4329,470,38
178 | 41,214,10
179 | 4153,180,67
180 | 4340,383,6
181 | 43,306,15
182 | 446,200,9
183 | 41005,3660,72
184 | 450,264,6
185 | 1159,259,10936
186 | 463,180,77
187 | 40707,3772,55
188 | 40678,3200,15
189 | 464,250,3
190 | 40708,3772,55
191 | 48,151,8
192 | 458,841,71
193 | 475,400,16
194 | 40701,5000,34
195 | 53,270,14
196 | 50,958,28
197 | 40601,333,2
198 | 512,2001,2
199 | 61,150,5
200 | 1133,347,10936
201 | 59,351,35
202 | 469,797,22
203 | 40666,6598,169
204 | 721,200,11
205 | 717,508,11
206 | 1163,386,10936
207 | 694,310,9
208 | 724,468,12
209 | 54,846,19
210 | 730,250,6
211 | 40997,4704,73
212 | 715,1000,26
213 | 40998,4704,73
214 | 41000,4704,71
215 | 733,209,7
216 | 732,250,51
217 | 723,1000,26
218 | 375,9961,15
219 | 741,1024,6
220 | 41004,4704,73
221 | 744,250,6
222 | 745,159,21
223 | 747,167,20
224 | 44,4601,58
225 | 746,250,26
226 | 748,163,8
227 | 28,5620,65
228 | 743,1000,6
229 | 718,1000,101
230 | 749,500,6
231 | 740,1000,11
232 | 756,159,16
233 | 742,500,101
234 | 753,194,33
235 | 728,4052,8
236 | 763,250,11
237 | 764,450,12
238 | 750,500,8
239 | 765,475,12
240 | 767,475,12
241 | 769,250,51
242 | 720,4177,11
243 | 388,204,5833
244 | 776,250,6
245 | 773,250,26
246 | 770,625,7
247 | 778,252,15
248 | 751,1000,11
249 | 766,500,51
250 | 774,662,4
251 | 788,186,61
252 | 793,250,11
253 | 794,250,26
254 | 792,500,6
255 | 796,209,37
256 | 801,185,3
257 | 779,500,26
258 | 795,662,4
259 | 811,264,14
260 | 737,3107,7
261 | 818,310,17
262 | 814,468,3
263 | 799,1000,6
264 | 820,235,13
265 | 813,1000,6
266 | 830,250,11
267 | 772,2178,4
268 | 832,250,26
269 | 827,662,4
270 | 826,576,37
271 | 183,4177,11
272 | 797,1000,51
273 | 40971,1000,20
274 | 834,250,101
275 | 838,500,26
276 | 806,1000,51
277 | 841,950,10
278 | 860,380,3
279 | 853,506,15
280 | 863,250,11
281 | 851,240,125
282 | 845,1000,11
283 | 855,500,11
284 | 40677,3200,49
285 | 849,1000,26
286 | 870,500,6
287 | 873,250,51
288 | 880,284,11
289 | 877,250,51
290 | 869,500,11
291 | 879,500,26
292 | 895,222,3
293 | 886,500,8
294 | 900,400,7
295 | 896,500,26
296 | 866,1000,51
297 | 888,500,51
298 | 907,400,8
299 | 906,400,8
300 | 909,400,8
301 | 908,400,8
302 | 911,250,6
303 | 903,1000,26
304 | 725,8192,9
305 | 1160,410,10936
306 | 904,1000,51
307 | 1156,275,10936
308 | 915,315,19
309 | 912,1000,6
310 | 910,1000,11
311 | 918,250,51
312 | 925,323,5
313 | 933,250,26
314 | 935,250,11
315 | 913,1000,11
316 | 926,500,26
317 | 931,662,4
318 | 8,345,6
319 | 914,2001,2
320 | 936,500,11
321 | 941,189,24
322 | 1143,363,10936
323 | 917,1000,26
324 | 947,559,47
325 | 943,500,11
326 | 1140,324,10936
327 | 934,1156,40
328 | 735,8192,13
329 | 951,559,47
330 | 950,559,47
331 | 937,500,51
332 | 969,150,5
333 | 955,151,8
334 | 949,559,47
335 | 973,178,14
336 | 1149,458,10936
337 | 871,3848,6
338 | 952,214,10
339 | 962,2000,7
340 | 954,531,105
341 | 987,500,37
342 | 1123,405,10936
343 | 970,841,71
344 | 996,214,10
345 | 958,2310,20
346 | 948,2178,4
347 | 997,625,5
348 | 994,846,19
349 | 923,8641,6
350 | 60,5000,41
351 | 803,7129,6
352 | 991,1728,22
353 | 1126,412,10936
354 | 761,8192,22
355 | 983,1473,25
356 | 1515,571,1301
357 | 971,2000,77
358 | 995,2000,48
359 | 1162,322,10936
360 | 978,2000,217
361 | 40498,4898,12
362 | 847,6574,15
363 | 980,5620,65
364 | 976,9961,15
365 | 40499,5500,41
366 | 979,5000,41
367 | 1127,421,10936
368 | 1137,546,10936
369 | 807,8192,9
370 | 46,3190,288
371 | 953,3190,288
372 | 819,9517,7
373 | 816,8192,9
374 | 40670,3186,361
375 | 40910,3686,401
376 | 1148,468,10936
377 | 1150,470,10936
378 | 4537,9901,20
379 | 1153,484,10936
380 | 1165,542,10936
381 | 1158,604,10936
382 | 833,8192,33
383 | 1535,9989,4
384 | 397,313,5805
385 | 40645,1600,2970
386 | 1491,1600,65
387 | 1039,4229,1618
388 | 1493,1599,65
389 | 384,336,7903
390 | 1145,630,10936
391 | 4538,9873,33
392 | 4136,600,20001
393 | 1492,1600,65
394 | 1085,159,61360
395 | 394,918,3013
396 | 1084,220,22284
397 | 752,8192,33
398 | 386,913,3101
399 | 387,414,6430
400 | 1541,8654,4
401 | 1540,9285,4
402 | 4134,3751,1777
403 | 392,1003,3183
404 | 20,2000,1649
405 | 1106,190,16064
406 | 1134,1545,10936
407 | 1139,1545,10936
408 | 1130,1545,10936
409 | 383,690,8262
410 | 1128,1545,10936
411 | 391,1504,2887
412 | 400,878,7455
413 | 385,927,10129
414 | 1086,283,54622
415 | 1087,283,54622
416 | 4552,5665,105
417 | 1083,214,45102
418 | 389,2463,2001
419 | 1233,945,6374
420 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/defaults/regression.json:
--------------------------------------------------------------------------------
1 | {"algorithms": ["Lasso", "Ridge", "ElasticNet"], "hyperparameters": {}}


--------------------------------------------------------------------------------
/AutoML/oboe/automl/defaults/runtime_predictor.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ORIE4741/demos/1172fdf1fd6bb53998a300361473bf8f974f8d20/AutoML/oboe/automl/defaults/runtime_predictor.pkl


--------------------------------------------------------------------------------
/AutoML/oboe/automl/generate_matrix.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Shell script to generate error matrix (and merge results), parallelized across datasets.
  4 | 
  5 | usage () {
  6 | cat <<HELP_USAGE
  7 | Usage:
  8 | $0  [-m] mode [-s] SAVE_DIR [-d] DATA_DIR [-p] P_TYPE [-j] JSON_FILE [-e] ERROR_MATRIX [-n] MAX_PROCS [-a] AUC [-f] FULLNAME [-h]
  9 | 
 10 | -m:         mode in which to run, either "generate" or "merge"
 11 | -s:         where to save results, or in the merge mode, where results are saved
 12 | -d:         path to directory containing training datasets are located
 13 | -p:         problem type, either "classification" or "regression"
 14 | -j:         path to model configurations json file
 15 | -e:         error matrix already generated
 16 | -n:         maximum number of processes assigned to error matrix generation
 17 | -a:         whether to use AUC instead of BER
 18 | -f:         whether to use dataset file full name as dataset name
 19 | -h:         show this help information
 20 | HELP_USAGE
 21 | }
 22 | 
 23 | # parse user arguments
 24 | while getopts ":hm:s:d:p:j:e:n:a:f:" opt; do
 25 |     case ${opt} in
 26 |     h)
 27 |         usage
 28 |         exit 1
 29 |         ;;
 30 |     m)
 31 |         if [ ${OPTARG} != "generate" ] && [ ${OPTARG} != "merge" ]
 32 |         then
 33 |             echo "Invalid mode."
 34 |             usage
 35 |             exit 1
 36 |         fi
 37 |         echo "Running in ${OPTARG} mode..." >&2
 38 |         mode=${OPTARG}
 39 |         ;;
 40 |     s)
 41 |         SAVE_DIR=$OPTARG
 42 |         ;;
 43 |     d)
 44 |         DATA_DIR=$OPTARG
 45 |         ;;
 46 |     p)
 47 |         P_TYPE=$OPTARG
 48 |         ;;
 49 |     j)
 50 |         JSON_FILE=$OPTARG
 51 |         ;;
 52 |     e)
 53 |         ERROR_MATRIX=$OPTARG
 54 |         ;;
 55 |     n)
 56 |         MAX_PROCS=$OPTARG
 57 |         ;;
 58 |     a)
 59 |         AUC=$OPTARG
 60 |         ;;
 61 |     f)
 62 |         FULLNAME=$OPTARG
 63 |         ;;
 64 |     \?)
 65 |         echo "Invalid option: -${OPTARG}" >&2
 66 |         usage
 67 |         exit 1
 68 |         ;;
 69 | esac
 70 | done
 71 | 
 72 | #if [ "$1" == "" ]
 73 | #then
 74 | #  echo "Must specify mode."
 75 | #  usage
 76 | #  exit 1
 77 | #fi
 78 | 
 79 | # no limit for maximum number of processes if no number is given
 80 | if [ "${MAX_PROCS}" == "" ]
 81 | then
 82 |     MAX_PROCS="0"
 83 | fi
 84 | 
 85 | # default to not using AUC
 86 | if [ "${AUC}" == "" ]
 87 | then
 88 |     AUC="False"
 89 | fi
 90 | 
 91 | # default to not using fullname
 92 | if [ "${FULLNAME}" == "" ]
 93 | then
 94 |     FULLNAME="False"
 95 | fi
 96 | 
 97 | # strip '/' from end of file path (if there is one)
 98 | #SAVE_DIR=${3%/}
 99 | #DATA_DIR=${4%/}
100 | 
101 | # location of this script
102 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
103 | 
104 | # generate mode
105 | if [ "${mode}" == "generate" ]
106 | then
107 |   time=`date +%Y%m%d%H%M`
108 |   mkdir -p ${SAVE_DIR}/${time}
109 |   echo -e "SAVE_DIR=${SAVE_DIR}\nDATA_DIR=${DATA_DIR}\nP_TYPE=${P_TYPE}\nJSON_FILE=${JSON_FILE}\nAUC=${AUC}\nERROR_MATRIX=${ERROR_MATRIX}\n" >> ${SAVE_DIR}/${time}/configurations.txt
110 |   echo "Error matrix generation started at ${time}" >> ${SAVE_DIR}/${time}/log_${time}.txt
111 | 
112 |   ls ${DATA_DIR}/*.csv | xargs -i --max-procs=${MAX_PROCS} bash -c \
113 |   "python ${DIR}/generate_vector.py '${P_TYPE}' {} --file=${JSON_FILE} --save_dir=${SAVE_DIR}/${time} \
114 |   --error_matrix=${ERROR_MATRIX} --auc=${AUC} --fullname=${FULLNAME} &>> ${SAVE_DIR}/${time}/warnings_and_errors.txt"
115 | fi
116 | 
117 | # merge mode
118 | if [ "${mode}" == "merge" ]
119 | then
120 |   python ${DIR}/util.py ${SAVE_DIR}
121 | fi
122 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/generate_vector.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Generate a row of the error matrix for a given dataset. Records cross-validation error & elapsed time for each
  3 | algorithm & hyperparameter combination.
  4 | 
  5 | Note the difference between model "configurations" and "settings": configurations is a nested dictionary, containing
  6 | a list of algorithms, and a dictionary of lists of hyperparameters; settings is a list of dictionaries, with one
  7 | algorithm and a dictionary of hyperparameters. Below is an example of each:
  8 | 
  9 | Config: {'algorithms': ['KNN', 'DT'],
 10 |          'hyperparameters': {'KNN': {'n_neighbors': [1, 3, 5, 7], 'p': [1, 2]},
 11 |                              'DT':  {'min_samples_split': [0.01, 0.001]}
 12 |                             }
 13 |         }
 14 | 
 15 | Settings: [{'algorithm': 'KNN', 'hyperparameters': {'n_neighbors': 1, 'p': 1}},
 16 |            {'algorithm': 'KNN', 'hyperparameters': {'n_neighbors': 3, 'p': 2}},
 17 |            {'algorithm': 'DT',  'hyperparameters': {'min_samples_split': 0.01}}
 18 |           ]
 19 | """
 20 | 
 21 | import argparse
 22 | import numpy as np
 23 | import pandas as pd
 24 | import json
 25 | import os
 26 | import sys
 27 | import re
 28 | import time
 29 | import util
 30 | from model import Model
 31 | import mkl
 32 | 
 33 | mkl.set_num_threads(1)
 34 | RANDOM_STATE = 0
 35 | 
 36 | def main(args):
 37 |     # load selected algorithms & hyperparameters from string or JSON file
 38 |     assert (args.string is None) != (args.file is None), 'Exactly one of --string and --file must be specified.'
 39 |     if args.string:
 40 |         configs = json.loads(args.string)
 41 |     elif args.file:
 42 |         with open(args.file) as f:
 43 |             configs = json.load(f)
 44 |     assert set(configs.keys()) == {'algorithms', 'hyperparameters'}, 'Invalid arguments.'
 45 | 
 46 |     # load training dataset
 47 |     dataset = pd.read_csv(args.data, header=None).values
 48 |     filename = args.data.split('/')[-1].split('.')[0]
 49 |     # whether to use dataset filename as error matrix vector filename 
 50 |     if args.fullname:
 51 |         dataset_id = filename        
 52 |     else:
 53 |         dataset_id = int(re.findall("\\d+", filename)[0])        
 54 | 
 55 |     # do not generate error matrices twice on one dataset
 56 |     if args.error_matrix != None:
 57 |         if args.error_matrix.endswith('.csv'):
 58 |             generated_datasets = pd.read_csv(args.error_matrix, index_col=0).index.tolist()
 59 |             assert dataset_id not in generated_datasets, 'Already generated.'
 60 | 
 61 |     t0 = time.time()
 62 |     x = dataset[:, :-1]
 63 |     y = dataset[:, -1]
 64 | 
 65 |     settings = util.generate_settings(configs['algorithms'], configs['hyperparameters'])
 66 |     headings = [str(s) for s in settings]
 67 |     results = np.full((2, len(settings)), np.nan)
 68 | 
 69 |     # generate error matrix entries, i.e. compute k-fold cross validation error
 70 |     log_file = [file for file in os.listdir(args.save_dir) if file.startswith('log')][0]
 71 |     for i, setting in enumerate(settings):
 72 |         model = Model(args.p_type, setting['algorithm'], setting['hyperparameters'], args.auc, args.verbose)
 73 |         start = time.time()
 74 |         try:
 75 |             cv_errors, _ = model.kfold_fit_validate(x, y, n_folds=args.n_folds, random_state=RANDOM_STATE)
 76 |         except (ZeroDivisionError, KeyError, TypeError, ValueError) as e:
 77 |             with open(os.path.join(args.save_dir, log_file), 'a') as log:
 78 |                 line = '\nID={}, model={}, {}'.format(dataset_id, setting, e)
 79 |                 log.write(line)
 80 |         results[:, i] = np.array([cv_errors.mean(), time.time() - start])
 81 |         if args.fullname:
 82 |             save_path = os.path.join(args.save_dir, str(dataset_id) + '.csv')
 83 |         else:
 84 |             save_path = os.path.join(args.save_dir, str(dataset_id).zfill(5) + '.csv')
 85 |         pd.DataFrame(results, columns=headings, index=['Error', 'Time']).to_csv(save_path)
 86 | 
 87 |     # log results
 88 |     elapsed = time.time() - t0
 89 |     line = '\nID={}, Size={}, Time={:.0f}s, Avg. Error={:.3f}'\
 90 |            .format(dataset_id, dataset.shape, elapsed, results[0, :].mean())
 91 |     with open(os.path.join(args.save_dir, log_file), 'a') as log:
 92 |         log.write(line)
 93 |     print(line)
 94 | 
 95 | 
 96 | def parse_args(argv):
 97 |     parser = argparse.ArgumentParser()
 98 |     parser.add_argument('p_type', type=str, help='Problem type. Either classification or regression.')
 99 |     parser.add_argument('data', type=str, help='File path to training dataset.')
100 |     parser.add_argument('--string', type=str,
101 |                         help='JSON-style string listing all algorithm types and hyperparameters. '
102 |                              'See automl/util.py for example.')
103 |     parser.add_argument('--file', type=str,
104 |                         help='JSON file listing all algorithm types and hyperparameters. '
105 |                              'See automl/defaults/models.json for example.')
106 |     parser.add_argument('--save_dir', type=str, default='./custom',
107 |                         help='Directory in which to save new error matrix.')
108 |     parser.add_argument('--n_folds', type=int, default=5, help='Number of folds to use for k-fold cross validation.')
109 |     parser.add_argument('--verbose', type=lambda x: x == 'True', default=False,
110 |                         help='Whether to generate print statements on completion.')
111 |     parser.add_argument('--error_matrix', type=str, default=None,
112 |                         help='Existing error matrix. Avoid re-generate its rows.')
113 |     parser.add_argument('--auc', type=lambda x: x == 'True', default=False, help='Whether to use AUC instead of BER')
114 |     parser.add_argument('--fullname', type=lambda x: x == 'True', default=False,
115 |                         help='Whether to use the full name of dataset as corresponding error matrix vectors.')
116 |     return parser.parse_args(argv)
117 | 
118 | 
119 | if __name__ == '__main__':
120 |     main(parse_args(sys.argv[1:]))
121 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/linalg.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Linear algebra helper functions.
 3 | """
 4 | 
 5 | import numpy as np
 6 | from scipy.sparse.linalg import svds
 7 | from scipy.linalg import qr
 8 | 
 9 | 
10 | def approx_rank(a, threshold=0.03):
11 |     """Compute approximate rank of a matrix.
12 | 
13 |     Args:
14 |         a (np.ndarray):    Matrix for which to compute rank.
15 |         threshold (float): All singular values less than threshold * (largest singular value) will be set to 0
16 |     Returns:
17 |         int: The approximate rank of a.
18 |     """
19 |     s = np.linalg.svd(a, compute_uv=False)
20 |     rank = s[s >= threshold * s[0]]
21 |     return len(rank)
22 | 
23 | 
24 | def pivot_columns(a, rank=None, threshold=None):
25 |     """Computes the QR decomposition of a matrix with column pivoting, i.e. solves the equation AP=QR such that Q is
26 |     orthogonal, R is upper triangular, and P is a permutation matrix.
27 | 
28 |     Args:
29 |         a (np.ndarray):    Matrix for which to compute QR decomposition.
30 |         threshold (float): Threshold specifying approximate rank of a. All singular values less than threshold * (largest singular value) will be set to 0
31 |         rank (int):        The approximate rank.
32 |     Returns:
33 |         np.array: The permutation p.
34 |     """
35 |     assert (threshold is None) != (rank is None), "Exactly one of threshold and rank should be specified."
36 |     if threshold is not None:
37 |         rank = approx_rank(a, threshold)
38 |     return qr(a, pivoting=True)[2][:rank]
39 | 
40 | 
41 | def pca(a, rank=None, threshold=None):
42 |     """Solves: minimize ||A_XY||^2 where ||.|| is the Frobenius norm.
43 | 
44 |     Args:
45 |         a (np.ndarray):    Matrix for which to compute PCA.
46 |         threshold (float): Threshold specifying approximate rank of a.
47 |         rank (int):        The approximate rank.
48 |     Returns:
49 |         x, y (np.ndarray): The solutions to the PCA problem.
50 |         vt (np.ndarray):   Transpose of V as specified in the singular value decomposition.
51 |     """
52 |     assert (threshold is None) != (rank is None), "Exactly one of threshold and rank should be specified."
53 |     if threshold is not None:
54 |         rank = approx_rank(a, threshold)
55 |     # std = np.std(a, axis=0)
56 |     u, s, vt = svds(a, k=rank)
57 | 
58 |     nonzero_pos = np.where(s > 0)[0]
59 |     s = s[nonzero_pos]
60 |     u = u[:, nonzero_pos]
61 |     vt = vt[nonzero_pos, :]
62 | 
63 |     u = np.fliplr(u)
64 |     s = np.flipud(s)
65 |     vt = np.flipud(vt)
66 |     # sigma_sqrt = np.diag(np.sqrt(s))
67 |     # x = np.dot(u, sigma_sqrt).T
68 |     # # y = np.dot(np.dot(sigma_sqrt, vt), np.diag(std))
69 |     # y = np.dot(sigma_sqrt, vt)
70 | 
71 |     sigma = np.diag(s)
72 |     x = np.dot(u, sigma).T
73 |     y = vt
74 |     return x, y, vt
75 | 
76 | 
77 | def impute(A, a, known_indices, rank=None):
78 |     """Imputes the missing entries of a vector a, given a fully observed matrix A of which a forms a new row.
79 | 
80 |     Args:
81 |         A (np.ndarray):           Fully observed matrix.
82 |         a (np.ndarray):           1xn partially observed array.
83 |         known_indices (np.array): Array of observed entries; from the set {1,...,n}
84 |         rank (int):               Approximate rank of A.
85 |     Returns:
86 |         np.ndarray: 1xn imputed array.
87 |     """
88 |     rank = rank or len(known_indices)
89 |     x, y, _ = pca(A, rank=rank)
90 |     # find x using matrix division using known portion of a, corresponding columns of A
91 |     x = np.linalg.lstsq(y[:, known_indices].T, a[:, known_indices].T, rcond=None)[0].T
92 |     # approximate full a as x*Y
93 |     return np.dot(x, y)
94 | 
95 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Parent class for all ML models.
  3 | """
  4 | 
  5 | import numpy as np
  6 | import util
  7 | from scipy.stats import mode
  8 | from sklearn.model_selection import StratifiedKFold, train_test_split
  9 | 
 10 | 
 11 | RANDOM_STATE = 0
 12 | 
 13 | 
 14 | class Model:
 15 |     """An object representing a machine learning model.
 16 | 
 17 |     Attributes:
 18 |         p_type (str):           Either 'classification' or 'regression'.
 19 |         algorithm (str):        Algorithm type (e.g. 'KNN').
 20 |         hyperparameters (dict): Hyperparameters (e.g. {'n_neighbors': 5}).
 21 |         model (object):         A scikit-learn object for the model.
 22 |         fitted (bool):          Whether or not the model has been trained.
 23 |         verbose (bool):         Whether or not to generate print statements when fitting complete.
 24 |     """
 25 | 
 26 |     def __init__(self, p_type, algorithm, hyperparameters={}, verbose=False, index=None):
 27 |         self.p_type = p_type
 28 |         self.algorithm = algorithm
 29 |         self.hyperparameters = hyperparameters
 30 |         self.model = self.instantiate()
 31 |         self.cv_error = np.nan
 32 |         self.cv_predictions = None
 33 |         self.sampled = False
 34 |         self.fitted = False
 35 |         self.verbose = verbose
 36 |         self.index = index
 37 | 
 38 |     def instantiate(self):
 39 |         """Creates a scikit-learn object of specified algorithm type and with specified hyperparameters.
 40 | 
 41 |         Returns:
 42 |             object: A scikit-learn object.
 43 |         """
 44 |         if self.algorithm == None or self.algorithm.lower() == 'greedy':
 45 |             return None
 46 |         try:
 47 |             return getattr(util, self.algorithm)(random_state=RANDOM_STATE, **self.hyperparameters)
 48 |         except TypeError:
 49 |             return getattr(util, self.algorithm)(**self.hyperparameters)
 50 | 
 51 |     def fit(self, x_train, y_train, runtime_limit=None):
 52 |         """Fits the model on training data. Note that this function is only used once a model has been identified as a
 53 |         model to be included in the final ensemble.
 54 | 
 55 |         Args:
 56 |             x_train (np.ndarray):   Features of the training dataset.
 57 |             y_train (np.ndarray):   Labels of the training dataset.
 58 |             runtime_limit (float):  Maximum amount of time to allocate to fitting.
 59 |         """
 60 |         self.model.fit(x_train, y_train)
 61 |         self.fitted = True
 62 |         if self.verbose:
 63 |             print("{} {} complete.".format(self.algorithm, self.hyperparameters))
 64 | 
 65 |     def predict(self, x_test):
 66 |         """Predicts labels on a new dataset.
 67 | 
 68 |         Args:
 69 |             x_test (np.ndarray): Features of the test dataset.
 70 | 
 71 |         Returns:
 72 |             np.array: Predicted features of the test dataset.
 73 |         """
 74 |         return self.model.predict(x_test)
 75 | 
 76 |     def kfold_fit_validate(self, x_train, y_train, n_folds, random_state=None):
 77 |         """Performs k-fold cross validation on a training dataset. Note that this is the function used to fill entries
 78 |         of the error matrix.
 79 | 
 80 |         Args:
 81 |             x_train (np.ndarray): Features of the training dataset.
 82 |             y_train (np.ndarray): Labels of the training dataset.
 83 |             n_folds (int):        Number of folds to use for cross validation.
 84 | 
 85 |         Returns:
 86 |             float: Mean of k-fold cross validation error.
 87 |             np.ndarray: Predictions on the training dataset from cross validation.
 88 |         """
 89 |         y_predicted = np.empty(y_train.shape)
 90 |         cv_errors = np.empty(n_folds)
 91 |         kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
 92 | 
 93 |         for i, (train_idx, test_idx) in enumerate(kf.split(x_train, y_train)):
 94 |             x_tr = x_train[train_idx, :]
 95 |             y_tr = y_train[train_idx]
 96 |             x_te = x_train[test_idx, :]
 97 |             y_te = y_train[test_idx]
 98 | 
 99 |             model = self.instantiate()
100 |             if len(np.unique(y_tr)) > 1:
101 |                 model.fit(x_tr, y_tr)
102 |                 y_predicted[test_idx] = model.predict(x_te)
103 |             else:
104 |                 y_predicted[test_idx] = y_tr[0]
105 |             cv_errors[i] = self.error(y_te, y_predicted[test_idx])
106 | 
107 |         self.cv_error = cv_errors.mean()
108 |         self.cv_predictions = y_predicted
109 |         self.sampled = True
110 |         if self.verbose:
111 |             print("{} {} complete.".format(self.algorithm, self.hyperparameters))
112 | 
113 |         return cv_errors, y_predicted
114 | 
115 |     def kfold_fit_validate_testing(self, x_train, y_train, n_folds, random_state=None):
116 |         """Performs k-fold cross validation on a training dataset, with fitting on a portion of the training fold and testing on the test fold. 
117 |         
118 |         Args:
119 |         x_train (np.ndarray): Features of the training dataset.
120 |         y_train (np.ndarray): Labels of the training dataset.
121 |         n_folds (int):        Number of folds to use for cross validation.
122 |         
123 |         Returns:
124 |         float: Mean of k-fold cross validation error.
125 |         np.ndarray: Predictions on the training dataset from cross validation.
126 |         """
127 |         y_predicted = np.empty(y_train.shape)
128 |         cv_errors = np.empty(n_folds)
129 |         kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
130 |             
131 |         for i, (train_idx, test_idx) in enumerate(kf.split(x_train, y_train)):
132 |             x_tr_val = x_train[train_idx, :]
133 |             y_tr_val = y_train[train_idx]
134 |             x_te = x_train[test_idx, :]
135 |             y_te = y_train[test_idx]
136 |             # split data into training and validation sets
137 |             try:
138 |                 x_tr, x_va, y_tr, y_va = train_test_split(x_tr_val, y_tr_val, test_size=0.15, stratify=y_tr_val, random_state=random_state)
139 |             except ValueError:
140 |                 x_tr, x_va, y_tr, y_va = train_test_split(x_tr_val, y_tr_val, test_size=0.15, random_state=random_state)
141 |                 
142 |             model = self.instantiate()
143 |             if len(np.unique(y_tr)) > 1:
144 |                 model.fit(x_tr, y_tr)
145 |                 y_predicted[test_idx] = model.predict(x_te)
146 |             else:
147 |                 y_predicted[test_idx] = y_tr[0]
148 |             cv_errors[i] = self.error(y_te, y_predicted[test_idx])
149 |                 
150 |         self.cv_error = cv_errors.mean()
151 |         self.cv_predictions = y_predicted
152 |         self.sampled = True
153 |         if self.verbose:
154 |             print("{} {} complete.".format(self.algorithm, self.hyperparameters))
155 | 
156 |         return cv_errors, y_predicted
157 | 
158 | 
159 |     def error(self, y_true, y_predicted):
160 |         """Compute error metric for the model.
161 | 
162 |         Args:
163 |             y_true (np.ndarray):      Observed labels.
164 |             y_predicted (np.ndarray): Predicted labels.
165 |         Returns:
166 |             float: Error metric
167 |         """
168 |         return util.error(y_true, y_predicted, self.p_type)
169 | 
170 | 
171 | class Ensemble(Model):
172 |     """An object representing an ensemble of machine learning models.
173 | 
174 |     Attributes:
175 |         p_type (str):           Either 'classification' or 'regression'.
176 |         algorithm (str):        Algorithm type (e.g. 'Logit').
177 |         hyperparameters (dict): Hyperparameters (e.g. {'C': 1.0}).
178 |         model (object):         A scikit-learn object for the model.
179 |     """
180 | 
181 |     def __init__(self, p_type, algorithm, hyperparameters={}):
182 |         super().__init__(p_type, algorithm, hyperparameters)
183 |         self.candidate_learners = []
184 |         self.base_learners = []
185 |         self.second_layer_features = None
186 | 
187 |     def select_base_learners(self, y_train, fitted_base_learners):
188 |         """Select base learners from candidate learners based on ensembling algorithm.
189 |         """
190 |         cv_errors = np.array([m.cv_error for m in self.candidate_learners])
191 |         # greedy ensemble forward selection
192 |         assert self.algorithm in {'greedy', 'stacking'}, "The ensemble selection method must be either greedy forward selection (by Caruana et al.) or stacking."
193 |         if self.algorithm == 'greedy':
194 |             x_tr = ()
195 |             # initial number of models in ensemble
196 |             n_initial = 3
197 |             for i in np.argsort(cv_errors)[:n_initial]:
198 |                 x_tr += (self.candidate_learners[i].cv_predictions.reshape(-1, 1), )
199 |                 if fitted_base_learners is None:
200 |                     pre_fitted = None
201 |                 else:
202 |                     pre_fitted = fitted_base_learners[self.candidate_learners[i].index]
203 |                 if pre_fitted is not None:
204 |                     self.base_learners.append(pre_fitted)
205 |                 else:
206 |                     self.base_learners.append(self.candidate_learners[i])
207 | 
208 |             x_tr = np.hstack(x_tr)
209 |             candidates = list(np.argsort(cv_errors))
210 |             error = util.error(y_train, mode(x_tr, axis=1)[0], self.p_type)
211 | 
212 |             while True:
213 |                 looped = True
214 |                 for i, idx in enumerate(candidates):
215 |                     slm = np.hstack((x_tr, self.candidate_learners[i].cv_predictions.reshape(-1, 1)))
216 |                     err = util.error(y_train, mode(slm, axis=1)[0], self.p_type)
217 |                     if err < error:
218 |                         error = err
219 |                         x_tr = slm
220 |                         if fitted_base_learners is None:
221 |                             pre_fitted = None
222 |                         else:
223 |                             pre_fitted = fitted_base_learners[self.candidate_learners[i].index]
224 |                         if pre_fitted is not None:
225 |                             self.base_learners.append(pre_fitted)
226 |                         else:
227 |                             self.base_learners.append(self.candidate_learners[i])
228 |                         looped = False
229 |                         break
230 |                 if looped:
231 |                     break
232 |             self.second_layer_features = x_tr
233 |         elif self.algorithm == 'stacking':
234 |             self.base_learners = self.candidate_learners
235 |             x_tr = [m.cv_predictions.reshape(-1, 1) for m in self.candidate_learners]
236 |             self.second_layer_features = np.hstack(tuple(x_tr))
237 | 
238 |     def fit(self, x_train, y_train, runtime_limit=None, fitted_base_learners=None):
239 |         """Add models to the ensemble and fit the ensemble on training data.
240 | 
241 |         Args:
242 |             x_train (np.ndarray):        Features of the training dataset.
243 |             y_train (np.ndarray):        Labels of the training dataset.
244 |             fitted_base_learners (list): A list of already fitted models.
245 |             
246 |         Args to be implemented:
247 |             runtime_limit (float):       Maximum runtime to be allocated to fitting.
248 |         """
249 |         self.select_base_learners(y_train, fitted_base_learners)
250 |         # TODO: parallelize training over base learners
251 |         for model in self.base_learners:
252 |             if not model.fitted:
253 |                 model.fit(x_train, y_train)
254 |         if self.algorithm != 'greedy':
255 |             self.model.fit(self.second_layer_features, y_train)
256 |         self.fitted = True
257 | 
258 |     def refit(self, x_train, y_train):
259 |         """Fit ensemble model on training data with base learners already added and unchanged.
260 |             
261 |         Args:
262 |             x_train (np.ndarray):        Features of the training dataset.
263 |             y_train (np.ndarray):        Labels of the training dataset.
264 |             
265 |         Args to be implemented:
266 |             runtime_limit (float):       Maximum runtime to be allocated to fitting.
267 |         """
268 |         # TODO: parallelize training over base learners
269 |         for model in self.base_learners:
270 |             if not model.fitted:
271 |                 model.fit(x_train, y_train)
272 |         if self.algorithm == 'stacking':
273 |             self.model.fit(self.second_layer_features, y_train)
274 | 
275 |     def predict(self, x_test):
276 |         """Generate predictions of the ensemble model on test data.
277 | 
278 |         Args:
279 |             x_test (np.ndarray): Features of the test dataset.
280 |         Returns:
281 |             np.array: Predicted labels of the test dataset.
282 |         """
283 |         assert len(self.base_learners) > 0, "Ensemble size must be greater than zero."
284 | 
285 |         base_learner_predictions = ()
286 |         for model in self.base_learners:
287 |             y_predicted = np.reshape(model.predict(x_test), [-1, 1])
288 |             base_learner_predictions += (y_predicted, )
289 |         self.x_te = np.hstack(base_learner_predictions)
290 |         if self.algorithm == 'greedy':
291 |             return mode(self.x_te, axis=1)[0].reshape((1, -1))
292 |         else:
293 |             return self.model.predict(self.x_te)
294 | 
295 |     def get_models(self):
296 |         """Get details of the selected machine learning models and the ensemble.
297 |         """
298 |         base_learner_names = {}
299 |         for model in self.base_learners:
300 |             if model.algorithm in base_learner_names.keys():
301 |                 base_learner_names[model.algorithm].append(model.hyperparameters)
302 |             else:
303 |                 base_learner_names[model.algorithm] = [model.hyperparameters]
304 |         if self.algorithm == 'greedy':
305 |             return {'ensemble method': 'greedy selection', 'base learners': base_learner_names}
306 |         elif self.algorithm == 'stacking':
307 |             ensemble_learner_name = {}
308 |             ensemble_learner_name[self.model.algorithm] = self.model.hyperparameters
309 |             return {'ensemble method': 'stacking', 'ensemble learner': ensemble_learner_name, 'base learners': base_learner_names}
310 | 
311 |     def get_model_accuracy(self, y_test):
312 |         """ Get prediction accuracies of each base learner when the true test labels are provided.
313 |             
314 |             Args:
315 |                 y_test (np.array):      True labels of the test set.
316 |                 
317 |             Returns:
318 |                 accuracies (list):      A numerical list of individual model accuracies on the test set.
319 |         """
320 |         accuracies = []
321 |         for iter in range(self.x_te.shape[1]):
322 |             accuracies.append(util.error(y_test, self.x_te[:, iter], self.p_type))
323 |         return accuracies
324 | 
325 | 
326 | class Model_collection(Ensemble):
327 |     """An object representing a collection of individual machine learning models.
328 |         
329 |         Attributes:
330 |             p_type (str):           Either 'classification' or 'regression'.
331 |     """
332 |     def __init__(self, p_type):
333 |         super().__init__(p_type=p_type, algorithm=None, hyperparameters=None)
334 | 
335 |     def select_base_learners(self):
336 |         """ Set inidividual learners to be all the learners added to the collection.
337 |         """
338 |         self.base_learners = self.candidate_learners
339 | 
340 |     def fit(self, x_train, y_train, runtime_limit=None, fitted_base_learners=None):
341 |         """ Fit inidividual learners in the model collection on training dataset.
342 |         
343 |         Args:
344 |             x_train (np.ndarray):        Features of the training dataset.
345 |             y_train (np.ndarray):        Labels of the training dataset.
346 |         """
347 |         self.select_base_learners()
348 |         super().refit(x_train=x_train, y_train=y_train)
349 |         self.fitted = True
350 | 
351 |     def predict(self, x_test):
352 |         """Generate predictions of the individual learners on test data.
353 |             
354 |         Args:
355 |             x_test (np.ndarray): Features of the test dataset.
356 |             
357 |         Returns:
358 |             np.ndarray: A 2-dimensional array containing predicted labels of the test dataset. Each column corresponds to the predictions of one single base learner.
359 |         """
360 |         assert len(self.base_learners) > 0, "Ensemble size must be greater than zero."
361 |         
362 |         base_learner_predictions = ()
363 |         for model in self.base_learners:
364 |             y_predicted = np.reshape(model.predict(x_test), [-1, 1])
365 |             base_learner_predictions += (y_predicted, )
366 |         # concatenation of predictions of each base learner
367 |         self.x_te = np.hstack(base_learner_predictions)
368 |         return self.x_te
369 | 
370 |     def get_models(self):
371 |         """Get details of the selected machine learning models and the ensemble.
372 |         """
373 |         base_learner_names = {}
374 |         for model in self.base_learners:
375 |             if model.algorithm in base_learner_names.keys():
376 |                 base_learner_names[model.algorithm].append(model.hyperparameters)
377 |             else:
378 |                 base_learner_names[model.algorithm] = [model.hyperparameters]
379 |         return base_learner_names
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | 
388 | 
389 | 
390 | 
391 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/preprocessing.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pre-process datasets.
 3 | """
 4 | 
 5 | import numpy as np
 6 | from sklearn.preprocessing import scale
 7 | from sklearn.preprocessing import OneHotEncoder
 8 | from sklearn.preprocessing import Imputer
 9 | 
10 | 
11 | def pre_process(raw_data, categorical, impute=True, standardize=True, one_hot_encode=True):
12 |     """
13 |     Pre-process one dataset.
14 | 
15 |     Args:
16 |         raw_data (np.ndarray):    raw features of the n-by-d dataset, without indices and headings.
17 |         categorical (list):       a boolean list of length d indicating whether each raw feature is categorical.
18 |         impute (bool):            whether to impute missing entries or not.
19 |         standardize (bool):       whether to standardize each feature or not.
20 |         one_hot_encode (bool):    whether to use one hot encoding to pre-process categorical features or not.
21 |     Returns:
22 |         np.ndarray:               pre-processed dataset.
23 |     """
24 |     # list of pre-processed arrays (sub-portions of dataset)
25 |     processed = []
26 | 
27 |     # whether to impute missing entries
28 |     if impute:
29 |         # if there are any categorical features
30 |         if np.array(categorical).any():
31 |             raw_categorical = raw_data[:, categorical]
32 |             # impute missing entries in categorical features using the most frequent number
33 |             imp_categorical = Imputer(missing_values='NaN', strategy='most_frequent', axis=0, copy=False)
34 |             processed.append(imp_categorical.fit_transform(raw_categorical))
35 | 
36 |         # if there are any numeric features
37 |         if np.invert(categorical).any():
38 |             raw_numeric = raw_data[:, np.invert(categorical)]
39 |             # impute missing entries in non-categorical features using mean
40 |             imp_numeric = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False)
41 |             processed.append(imp_numeric.fit_transform(raw_numeric))
42 | 
43 |         # data has now been re-ordered so all categorical features appear first
44 |         categorical = np.array(sorted(categorical, reverse=True))
45 |         processed_data = np.hstack(tuple(processed))
46 | 
47 |     else:
48 |         processed_data = raw_data
49 | 
50 |     # one-hot encoding for categorical features (only if there exist any)
51 |     if one_hot_encode and np.array(categorical).any():
52 |         encoder = OneHotEncoder(categorical_features=categorical)
53 |         processed_data = encoder.fit_transform(processed_data).toarray()
54 |         categorical = np.zeros(processed_data.shape[1], dtype=bool)
55 |             
56 |     # standardize all numeric and one-hot encoded categorical features
57 |     if standardize:
58 |         processed_data[:, np.invert(categorical)] = scale(processed_data[:, np.invert(categorical)])
59 |         
60 |     print('Data pre-processing finished')
61 |     return processed_data, categorical
62 | 


--------------------------------------------------------------------------------
/AutoML/oboe/automl/util.py:
--------------------------------------------------------------------------------
  1 | #requires a log file in the folder that contains csv files.
  2 | 
  3 | """
  4 | Miscellaneous helper functions.
  5 | """
  6 | 
  7 | import inspect
  8 | import itertools
  9 | import json
 10 | import numpy as np
 11 | import os
 12 | import pandas as pd
 13 | import pkg_resources
 14 | import re
 15 | import sys
 16 | import glob
 17 | from math import isclose
 18 | from sklearn.metrics import mean_squared_error
 19 | 
 20 | # Classification algorithms
 21 | from sklearn.neighbors import KNeighborsClassifier as KNN
 22 | from sklearn.tree import DecisionTreeClassifier as DT
 23 | from sklearn.ensemble import RandomForestClassifier as RF
 24 | from sklearn.ensemble import ExtraTreesClassifier as ExtraTrees
 25 | from sklearn.ensemble import GradientBoostingClassifier as GBT
 26 | from sklearn.ensemble import AdaBoostClassifier as AB
 27 | from sklearn.svm import LinearSVC as lSVM
 28 | from sklearn.svm import SVC as kSVM
 29 | from sklearn.linear_model import LogisticRegression as Logit
 30 | from sklearn.linear_model import Perceptron
 31 | from sklearn.naive_bayes import GaussianNB as GNB
 32 | from sklearn.neural_network import MLPClassifier as MLP
 33 | 
 34 | # Regression algorithms
 35 | from sklearn.linear_model import Lasso
 36 | from sklearn.linear_model import Ridge
 37 | from sklearn.linear_model import ElasticNet
 38 | # TODO: include more regression algorithms
 39 | 
 40 | 
 41 | defaults_path = pkg_resources.resource_filename(__name__, 'defaults')
 42 | with open(os.path.join(defaults_path, 'classification.json'), 'r') as f:
 43 |     CLS = json.load(f)
 44 | with open(os.path.join(defaults_path, 'regression.json'), 'r') as f:
 45 |     REG = json.load(f)
 46 | 
 47 | ALGORITHMS_C = dict(zip(CLS['algorithms'], list(map(lambda name: eval(name), CLS['algorithms']))))
 48 | ALGORITHMS_R = dict(zip(REG['algorithms'], list(map(lambda name: eval(name), REG['algorithms']))))
 49 | 
 50 | DEFAULTS = {'algorithms':       {'classification': ALGORITHMS_C,           'regression': ALGORITHMS_R},
 51 |             'hyperparameters': {'classification': CLS['hyperparameters'],  'regression': REG['hyperparameters']}}
 52 | 
 53 | 
 54 | def extract_columns(df, algorithms=None, hyperparameters=None):
 55 |     """
 56 |     Extract certain columns of the error matrix.
 57 |     
 58 |     Args:
 59 |         error_matrix (DataFrame):    The error matrix to be extracted.
 60 |         algorithms (string or list): One or a list of algorithms as search space.
 61 |         
 62 |     Args to be implemented:
 63 |         hyperparameters (list):      A list of hyperparameters as search space.
 64 |         
 65 |     Returns:
 66 |         DataFrame:                   A DataFrame consisting of corresponding columns.
 67 |     """
 68 |     assert algorithms is not None or hyperparameters is not None, \
 69 |     "At least one of the 'algorithms' and 'hyperparameters' need to be specified!"
 70 |     sampled_columns = []
 71 |     for item in list(df):
 72 |         to_sample_this_column = False
 73 |         if algorithms is None:
 74 |             to_sample_this_column = True
 75 |         elif eval(item)['algorithm'] in algorithms:
 76 |             if hyperparameters is None:
 77 |                 to_sample_this_column = True
 78 |             else:
 79 |                 to_sample_this_column = True
 80 |                 hyperparameter_column = eval(item)['algorithm']
 81 |                 hyperparameter_allowed = hyperparameters[eval(item)['algorithm']]
 82 |                 for key in hyperparameter_column:
 83 |                     if not key in hyperparameter_allowed.keys():
 84 |                         continue
 85 |                     else:
 86 |                         if hyperparameter_column[key] in hyperparameter_allowed[key]:
 87 |                             continue
 88 |                         else:
 89 |                             to_sample_this_column = False
 90 |                             break        
 91 |         if to_sample_this_column == True:
 92 |             sampled_columns.append(item)
 93 |     return df[sampled_columns]
 94 | 
 95 | def extract_column_names(df, algorithms=None, hyperparameters=None):
 96 |     """
 97 |     Extract names of certain columns of the error matrix.
 98 |     
 99 |     Args:
100 |         error_matrix (DataFrame):    The error matrix to be extracted.
101 |         algorithms (string or list): One or a list of algorithms as search space.
102 |         
103 |     Args to be implemented:
104 |         hyperparameters (list):      A list of hyperparameters as search space.
105 |         
106 |     Returns:
107 |         list:                        A list of column names.
108 |     """
109 |     return list(extract_columns(df, algorithms=algorithms, hyperparameters=hyperparameters))
110 | 
111 | def error(y_true, y_predicted, p_type):
112 |     """Compute error metric for the model; varies based on classification/regression and algorithm type.
113 |     BER (Balanced Error Rate): For classification.
114 |                               1/n * sum (0.5*(true positives/predicted positives + true negatives/predicted negatives))
115 |     MSE (Mean Squared Error): For regression. 1/n * sum(||y_pred - y_obs||^2).
116 | 
117 |     Args:
118 |         y_true (np.ndarray):      Observed labels.
119 |         y_predicted (np.ndarray): Predicted labels.
120 |         p_type (str):             Type of problem. One of {'classification', 'regression'}
121 |     Returns:
122 |         float: Error metric.
123 |     """
124 | 
125 |     assert p_type in {'classification', 'regression'}, "Please specify a valid type."
126 |     y_true = np.squeeze(y_true)
127 |     y_predicted = np.squeeze(y_predicted)
128 | 
129 |     if p_type == 'classification':
130 |         errors = []
131 |         epsilon = 1e-15
132 |         for i in np.unique(y_true):
133 |             tp = ((y_true == i) & (y_predicted == i)).sum()
134 |             tn = ((y_true != i) & (y_predicted != i)).sum()
135 |             fp = ((y_true != i) & (y_predicted == i)).sum()
136 |             fn = ((y_true == i) & (y_predicted != i)).sum()
137 |             errors.append(1 - 0.5*(tp / np.maximum(tp + fn, epsilon)) - 0.5*(tn / np.maximum(tn + fp, epsilon)))
138 |         return np.mean(errors)
139 | 
140 |     elif p_type == 'regression':
141 |         return mean_squared_error(y_true, y_predicted)
142 | 
143 | 
144 | def invalid_args(func, arglist):
145 |     """Check if args is a valid list of arguments to be passed to the function func.
146 | 
147 |     Args:
148 |         func (function): Function to check arguments for
149 |         arglist (list):  Proposed arguments
150 |     Returns:
151 |         set: Set of arguments in args that are invalid (returns empty set if there are none).
152 |     """
153 |     args = inspect.getfullargspec(func)[0]
154 |     return set(arglist) - set(args)
155 | 
156 | 
157 | def check_arguments(p_type, algorithms, hyperparameters, defaults=DEFAULTS):
158 |     """Check if arguments to constructor of AutoLearner object are valid, and default error matrix can be used.
159 | 
160 |     Args:
161 |         p_type (str):           Problem type. One of {'classification', 'regression'}
162 |         algorithms (list):      List of selected algorithms as strings. (e.g. ['KNN', 'lSVM', 'kSVM']
163 |         hyperparameters (dict): Nested dict of selected hyperparameters.
164 |         defaults (dict):        Nested dict of default algorithms & hyperparameters.
165 |     Returns:
166 |         bool: Whether or not the default error matrix can be used.
167 |     """
168 |     # check if valid problem type
169 |     assert p_type.lower() in ['classification', 'regression'], "Please specify a valid type."
170 | 
171 |     # set selected algorithms to default set if not specified
172 |     all_algs = list(defaults['algorithms'][p_type].keys())
173 |     if algorithms is None:
174 |         algorithms = all_algs
175 | 
176 |     # check if selected algorithms are a subset of supported algorithms for given problem type
177 |     assert set(algorithms).issubset(set(all_algs)), \
178 |         "Unsupported algorithm(s) {}.".format(set(algorithms) - set(all_algs))
179 | 
180 |     # set selected hyperparameters to default set if not specified
181 |     all_hyp = defaults['hyperparameters'][p_type]
182 |     if hyperparameters is None:
183 |         hyperparameters = all_hyp
184 | 
185 |     # check if selected hyperparameters are valid arguments to scikit-learn models
186 |     invalid = [invalid_args(defaults['algorithms'][p_type][alg], hyperparameters[alg].keys())
187 |                for alg in hyperparameters.keys()]
188 |     for i, args in enumerate(invalid):
189 |         assert len(args) == 0, "Unsupported hyperparameter(s) {} for algorithm {}" \
190 |             .format(args, list(hyperparameters.keys())[i])
191 | 
192 |     # check if it is necessary to generate new error matrix, i.e. are all hyperparameters in default error matrix
193 |     compatible_columns = []
194 |     new_columns = []
195 |     default_settings = generate_settings(defaults['algorithms'][p_type].keys(), defaults['hyperparameters'][p_type])
196 |     for alg in hyperparameters.keys():
197 |         for values in itertools.product(*hyperparameters[alg].values()):
198 |             setting = {'algorithm': alg, 'hyperparameters': dict(zip(hyperparameters[alg].keys(), list(values)))}
199 |             if setting in default_settings:
200 |                 compatible_columns.append(setting)
201 |             else:
202 |                 new_columns.append(setting)
203 |     return compatible_columns, new_columns
204 | 
205 | 
206 | def knapsack(weights, values, capacity):
207 |     """Solve the knapsack problem; maximize sum_i v[i]*x[i] subject to sum_i w[i]*x[i] <= W and x[i] in {0, 1}
208 | 
209 |     Args:
210 |         weights (np.ndarray): "weights" of each item
211 |         values (np.ndarray):  "values" of each item
212 |         capacity (int):       maximum "weight" allowed
213 |     Returns:
214 |         set: list of selected indices
215 |     """
216 |     assert len(weights) == len(values), "Weights & values must have same shape."
217 |     assert type(capacity) == int, "Capacity must be an integer."
218 |     n = len(weights)
219 |     m = np.zeros((n+1, capacity+1)).astype(int)
220 | 
221 |     for i in range(n+1):
222 |         for w in range(capacity+1):
223 |             if i == 0 or w == 0:
224 |                 pass
225 |             elif weights[i-1] <= w:
226 |                 m[i, w] = max(values[i-1] + m[i-1, w-weights[i-1]], m[i-1, w])
227 |             else:
228 |                 m[i, w] = m[i-1, w]
229 | 
230 |     def find_selected(j, v):
231 |         if j == 0:
232 |             return set()
233 |         if m[j, v] > m[j-1, v]:
234 |             return {j-1}.union(find_selected(j-1, v - weights[j-1]))
235 |         else:
236 |             return find_selected(j-1, v)
237 | 
238 |     return find_selected(n, capacity)
239 | 
240 | 
241 | def check_dataframes(m1, m2):
242 |     """Check if 2 dataframes have the same shape and share the same index column.
243 | 
244 |     Args:
245 |         m1 (DataFrame): first dataframe
246 |         m2 (DataFrame): second dataframe
247 |     Returns:
248 |         bool:           Whether the conditions are satisfied
249 |     """
250 |     assert m1.shape == m2.shape
251 |     assert set(m1.index) == set(m2.index)
252 |     return True
253 | 
254 | 
255 | def generate_settings(algorithms, hyperparameters, sort=True):
256 |     """Generate column headings of error matrix.
257 | 
258 |     Args:
259 |         algorithms (list):      A list of algorithms in strings (e.g. ['KNN', 'RF', 'lSVM'])
260 |         hyperparameters (dict): A nested dictionary of hyperparameters. First key is algorithm type (str), second key
261 |                                 is hyperparameter name (str); argument to pass to scikit-learn constructor with array
262 |                                 of values
263 |                                 (e.g. {'KNN': {'n_neighbors': np.array([1, 3, 5, 7]),
264 |                                                'p':           np.array([1, 2])}}).
265 |         sort (bool):            Whether to sort settings in alphabetical order with respect to algorithm name.
266 |     Returns:
267 |         list: List of nested dictionaries, one entry for each model setting.
268 |               (e.g. [{'algorithm': 'KNN',  'hyperparameters': {'n_neighbors': 1, 'p': 1}},
269 |                      {'algorithm': 'lSVM', 'hyperparameters': {'C': 1.0}}])
270 |     """
271 |     settings = []
272 |     for alg in algorithms:
273 |         hyperparams = hyperparameters[alg]
274 |         for values in itertools.product(*hyperparams.values()):
275 |             configs = dict(zip(hyperparams.keys(), list(values)))
276 |             for key, val in configs.items():
277 |                 if isinstance(val, (int, float)):
278 |                     if isclose(val, round(val)):
279 |                         configs[key] = int(round(val))
280 |             settings.append({'algorithm': alg, 'hyperparameters': configs})
281 |     if sort:
282 |         settings = sorted(settings, key=lambda k: k['algorithm'])
283 |     return settings
284 | 
285 | 
286 | def merge_rows(save_dir):
287 |     """Merge rows of error matrix. Creates two CSV files: one error matrix and one runtime matrix.
288 | 
289 |     Args:
290 |         save_dir (str): Directory containing per-dataset CSV files of cross-validation errors & time for each model.
291 |     """
292 |     if not os.path.isdir(save_dir):
293 |         print('Invalid path.')
294 |         return
295 | 
296 |     # find files to concatenate (all .csv files; may contain previously merged results)
297 |     files = [file for file in os.listdir(save_dir) if file.endswith('.csv') and 'sizes' not in file]
298 |     em, rm = 'error_matrix.csv', 'runtime_matrix.csv'
299 |     headers, ids, error_matrix_rows, runtime_matrix_rows = None, [], (), ()
300 | 
301 |     if (em in files) and (rm in files):
302 |         errors = pd.read_csv(os.path.join(save_dir, files.pop(files.index(em))), index_col=0)
303 |         runtimes = pd.read_csv(os.path.join(save_dir, files.pop(files.index(rm))), index_col=0)
304 |         assert set(errors.index) == set(runtimes.index), "Previous results must share index column."
305 |         assert set(list(errors)) == set(list(runtimes)), "Previous results must share headers."
306 |         ids += list(errors.index)
307 |         headers = list(errors)
308 |         error_matrix_rows += (errors.values, )
309 |         runtime_matrix_rows += (runtimes.values, )
310 | 
311 |     # concatenate new results
312 |     # TODO: only load files corresponding to completed files in log.txt
313 |     for file in files:
314 |         file_path = os.path.join(save_dir, file)
315 |         dataframe = pd.read_csv(file_path, index_col=0)
316 |         if headers is None:
317 |             headers = list(dataframe)
318 |         else:
319 |             assert set(headers) == set(list(dataframe)), "All results must share same headers."
320 |         if np.isnan(dataframe.values).any():
321 |             # if values contain NaNs, generation has not yet finished
322 |             pass
323 |         else:
324 |             permutation = [headers.index(h) for h in list(dataframe)]
325 |             error_matrix_rows += (np.expand_dims(dataframe.values[0, permutation], 0), )
326 |             runtime_matrix_rows += (np.expand_dims(dataframe.values[1, permutation], 0), )
327 |             ids.append(file.split('.')[0])
328 |             try:
329 |                 os.mkdir(os.path.join(save_dir, "merged_csv_files"))
330 |             except:
331 |                 pass
332 |             os.rename(file_path, os.path.join(save_dir, "merged_csv_files", file))
333 | #             os.remove(file_path)
334 |             if len(error_matrix_rows) % 50 == 0:
335 |                 print('Merging {} files...'.format(len(error_matrix_rows)))
336 | 
337 |     # get dataset sizes
338 |     # openml_datasets = openml.datasets.list_datasets()
339 |     # openml_datasets = pd.DataFrame.from_dict(openml_datasets, orient='index')
340 |     # dataset_sizes = openml_datasets[['NumberOfInstances', 'NumberOfFeatures']]
341 | 
342 | #     #find the log file
343 | #     for f in glob.glob('{}/log*.txt'.format(save_dir)):
344 | #          log_path = f
345 | #     # save dataset sizes
346 | #     with open(log_path, 'r') as file:
347 | #         lines = file.readlines()
348 | #     dataset_ids, sizes = [], []
349 | #     for line in lines:
350 | #         if 'Size' in line:
351 | #             log_ids = [int(n) for n in re.findall(r'ID=(\d+)', line)]
352 | #             size = [eval(n) for n in re.findall(r'Size=\((\d+, \d+)\)', line)]
353 | #             if len(log_ids) == 1 and len(size) == 1:
354 | #                 dataset_ids.append(log_ids[0])
355 | #                 sizes.append(size[0])
356 | 
357 |     # save results
358 |     pd.DataFrame(np.vstack(error_matrix_rows), index=ids, columns=headers).to_csv(os.path.join(save_dir, em))
359 |     pd.DataFrame(np.vstack(runtime_matrix_rows), index=ids, columns=headers).to_csv(os.path.join(save_dir, rm))
360 | #     pd.DataFrame(np.vstack(sizes), index=dataset_ids).to_csv(os.path.join(save_dir, 'dataset_sizes.csv'))    
361 |     # dataset_sizes.to_csv(os.path.join(save_dir, 'dataset_sizes.csv'))
362 | 
363 |     
364 | if __name__ == '__main__':
365 |     merge_rows(sys.argv[1])
366 | 


--------------------------------------------------------------------------------
/AutoML/oboe/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Examples
 2 | Examples of how to use the Oboe system.
 3 | 
 4 | 1. `error_matrix_generation`
 5 | 
 6 | This directory contains an example of the offline error matrix generation.
 7 | 
 8 | 2. `classification`
 9 | 
10 | This Jupyter notebook contains examples of the online AutoML fitting and prediction.
11 | 


--------------------------------------------------------------------------------
/AutoML/oboe/examples/classification.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "This is a classification example to show how to use Oboe for training and testing, in the context of AutoML, i.e., do model selection on the training set and then evaluate the performance of the selected model on the test set."
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "# necessary modules\n",
 17 |     "import sys\n",
 18 |     "import pandas as pd\n",
 19 |     "import os\n",
 20 |     "import time\n",
 21 |     "import numpy as np\n",
 22 |     "import multiprocessing\n",
 23 |     "\n",
 24 |     "#Oboe modules; this will be simplified when Oboe becomes pip installable\n",
 25 |     "automl_path = '../automl/'\n",
 26 |     "sys.path.append(automl_path)\n",
 27 |     "from auto_learner import AutoLearner\n",
 28 |     "import util\n",
 29 |     "\n",
 30 |     "#import scikit-learn modules\n",
 31 |     "from sklearn.datasets import load_iris\n",
 32 |     "from sklearn.model_selection import train_test_split\n",
 33 |     "from sklearn.metrics import accuracy_score\n",
 34 |     "\n",
 35 |     "# disable warnings\n",
 36 |     "import warnings\n",
 37 |     "warnings.filterwarnings('ignore')"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "code",
 42 |    "execution_count": 2,
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "#load and split dataset into training and test folds\n",
 47 |     "data = load_iris()\n",
 48 |     "x = np.array(data['data'])\n",
 49 |     "y = np.array(data['target'])\n",
 50 |     "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "# Example 1: a no-brainer use"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 11,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "# initialize the autolearner class\n",
 67 |     "m = AutoLearner(p_type='classification', runtime_limit=10, verbose=False)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 12,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "# fit autolearner on training set and record runtime\n",
 77 |     "start = time.time()\n",
 78 |     "m.fit(x_train, y_train)\n",
 79 |     "elapsed_time = time.time() - start"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 13,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "name": "stdout",
 89 |      "output_type": "stream",
 90 |      "text": [
 91 |       "prediction error: 0.05087719298245613\n",
 92 |       "elapsed time: 7.21905517578125\n"
 93 |      ]
 94 |     }
 95 |    ],
 96 |    "source": [
 97 |     "# use the fitted autolearner for prediction on test set\n",
 98 |     "y_predicted = m.predict(x_test)\n",
 99 |     "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))    \n",
100 |     "print(\"elapsed time: {}\".format(elapsed_time))"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 14,
106 |    "metadata": {},
107 |    "outputs": [
108 |     {
109 |      "data": {
110 |       "text/plain": [
111 |        "{'ensemble method': 'greedy selection',\n",
112 |        " 'base learners': {'kSVM': [{'C': 0.25, 'kernel': 'poly', 'coef0': 10}],\n",
113 |        "  'AB': [{'n_estimators': 100, 'learning_rate': 3}],\n",
114 |        "  'GBT': [{'learning_rate': 0.001, 'max_depth': 3, 'max_features': 'log2'}]}}"
115 |       ]
116 |      },
117 |      "execution_count": 14,
118 |      "metadata": {},
119 |      "output_type": "execute_result"
120 |     }
121 |    ],
122 |    "source": [
123 |     "# get names of the selected machine learning models\n",
124 |     "m.get_models()"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "# Example 2: build an ensemble of models"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 15,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "#experimental settings\n",
141 |     "VERBOSE = False #whether to print out information indicating current fitting progress\n",
142 |     "N_CORES = 1 #number of cores\n",
143 |     "RUNTIME_BUDGET = 15"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 16,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "#optional: limit the types of algorithms\n",
153 |     "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 17,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "#autolearner arguments\n",
163 |     "autolearner_kwargs = {\n",
164 |     "    'p_type': 'classification',\n",
165 |     "    'runtime_limit': RUNTIME_BUDGET,\n",
166 |     "    'verbose': VERBOSE,\n",
167 |     "    'selection_method': 'min_variance',\n",
168 |     "    'algorithms': s,\n",
169 |     "    'stacking_alg': 'greedy',\n",
170 |     "    'n_cores': N_CORES,\n",
171 |     "    'build_ensemble': True,\n",
172 |     "}"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 18,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "#intialize the autolearner class\n",
182 |     "m = AutoLearner(**autolearner_kwargs)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 19,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "# fit autolearner on training set and record runtime\n",
192 |     "start = time.time()\n",
193 |     "m.fit(x_train, y_train)\n",
194 |     "elapsed_time = time.time() - start"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 20,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "name": "stdout",
204 |      "output_type": "stream",
205 |      "text": [
206 |       "prediction error: 0.025438596491228094\n",
207 |       "elapsed time: 7.217111110687256\n",
208 |       "individual accuracies of selected models: [0.025438596491228094, 0.025438596491228094, 0.05087719298245613]\n"
209 |      ]
210 |     }
211 |    ],
212 |    "source": [
213 |     "# use the fitted autolearner for prediction on test set\n",
214 |     "y_predicted = m.predict(x_test)\n",
215 |     "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n",
216 |     "print(\"elapsed time: {}\".format(elapsed_time))\n",
217 |     "print(\"individual accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 21,
223 |    "metadata": {
224 |     "scrolled": true
225 |    },
226 |    "outputs": [
227 |     {
228 |      "data": {
229 |       "text/plain": [
230 |        "{'ensemble method': 'greedy selection',\n",
231 |        " 'base learners': {'KNN': [{'n_neighbors': 11, 'p': 1},\n",
232 |        "   {'n_neighbors': 13, 'p': 1},\n",
233 |        "   {'n_neighbors': 11, 'p': 2}]}}"
234 |       ]
235 |      },
236 |      "execution_count": 21,
237 |      "metadata": {},
238 |      "output_type": "execute_result"
239 |     }
240 |    ],
241 |    "source": [
242 |     "# get names of the selected machine learning models\n",
243 |     "m.get_models()"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "markdown",
248 |    "metadata": {},
249 |    "source": [
250 |     "# Example 3: just select a collection of promising models without building an ensemble afterwards"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 22,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "#experimental settings\n",
260 |     "VERBOSE = False #whether to print out information indicating current fitting progress\n",
261 |     "N_CORES = 1 #number of cores\n",
262 |     "RUNTIME_BUDGET = 15"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 23,
268 |    "metadata": {},
269 |    "outputs": [],
270 |    "source": [
271 |     "#optional: limit the types of algorithms\n",
272 |     "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 24,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "#autolearner arguments\n",
282 |     "autolearner_kwargs = {\n",
283 |     "    'p_type': 'classification',\n",
284 |     "    'runtime_limit': RUNTIME_BUDGET,\n",
285 |     "    'verbose': VERBOSE,\n",
286 |     "    'selection_method': 'min_variance',\n",
287 |     "    'algorithms': s,\n",
288 |     "    'stacking_alg': 'greedy',\n",
289 |     "    'n_cores': N_CORES,\n",
290 |     "    'build_ensemble': False,\n",
291 |     "}"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 25,
297 |    "metadata": {},
298 |    "outputs": [],
299 |    "source": [
300 |     "#intialize the autolearner class\n",
301 |     "m = AutoLearner(**autolearner_kwargs)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": 26,
307 |    "metadata": {},
308 |    "outputs": [],
309 |    "source": [
310 |     "# fit autolearner on training set and record runtime\n",
311 |     "start = time.time()\n",
312 |     "m.fit(x_train, y_train)\n",
313 |     "elapsed_time = time.time() - start"
314 |    ]
315 |   },
316 |   {
317 |    "cell_type": "code",
318 |    "execution_count": 27,
319 |    "metadata": {},
320 |    "outputs": [
321 |     {
322 |      "name": "stdout",
323 |      "output_type": "stream",
324 |      "text": [
325 |       "elapsed time: 5.099339008331299\n",
326 |       "accuracies of selected models: [0.0, 0.0, 0.0, 0.0, 0.0, 0.025438596491228094, 0.0, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.025438596491228094, 0.0, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.02348484848484848, 0.05087719298245613, 0.0, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.02348484848484848, 0.05087719298245613, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05087719298245613, 0.05087719298245613, 0.025438596491228094, 0.0, 0.025438596491228094, 0.05087719298245613, 0.025438596491228094, 0.025438596491228094, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613]\n"
327 |      ]
328 |     }
329 |    ],
330 |    "source": [
331 |     "# use the fitted autolearner for prediction on test set\n",
332 |     "y_predicted = m.predict(x_test)\n",
333 |     " \n",
334 |     "print(\"elapsed time: {}\".format(elapsed_time))\n",
335 |     "print(\"accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))"
336 |    ]
337 |   },
338 |   {
339 |    "cell_type": "markdown",
340 |    "metadata": {},
341 |    "source": [
342 |     "Note that we do not have a single accuracy value here if we do not build an ensemble, instead, we just have a collection of fitted models with individual accuracies reported."
343 |    ]
344 |   },
345 |   {
346 |    "cell_type": "code",
347 |    "execution_count": 28,
348 |    "metadata": {},
349 |    "outputs": [
350 |     {
351 |      "data": {
352 |       "text/plain": [
353 |        "{'KNN': [{'n_neighbors': 1, 'p': 2},\n",
354 |        "  {'n_neighbors': 1, 'p': 2},\n",
355 |        "  {'n_neighbors': 1, 'p': 1},\n",
356 |        "  {'n_neighbors': 3, 'p': 1},\n",
357 |        "  {'n_neighbors': 3, 'p': 2},\n",
358 |        "  {'n_neighbors': 5, 'p': 1},\n",
359 |        "  {'n_neighbors': 5, 'p': 2},\n",
360 |        "  {'n_neighbors': 7, 'p': 1},\n",
361 |        "  {'n_neighbors': 7, 'p': 2},\n",
362 |        "  {'n_neighbors': 9, 'p': 1},\n",
363 |        "  {'n_neighbors': 9, 'p': 2},\n",
364 |        "  {'n_neighbors': 11, 'p': 1},\n",
365 |        "  {'n_neighbors': 11, 'p': 2},\n",
366 |        "  {'n_neighbors': 13, 'p': 1},\n",
367 |        "  {'n_neighbors': 13, 'p': 2},\n",
368 |        "  {'n_neighbors': 15, 'p': 1},\n",
369 |        "  {'n_neighbors': 15, 'p': 2}],\n",
370 |        " 'DT': [{'min_samples_split': 0.0001},\n",
371 |        "  {'min_samples_split': 1e-05},\n",
372 |        "  {'min_samples_split': 2},\n",
373 |        "  {'min_samples_split': 0.001},\n",
374 |        "  {'min_samples_split': 4},\n",
375 |        "  {'min_samples_split': 8},\n",
376 |        "  {'min_samples_split': 0.01},\n",
377 |        "  {'min_samples_split': 16},\n",
378 |        "  {'min_samples_split': 64}],\n",
379 |        " 'AB': [{'n_estimators': 50, 'learning_rate': 2.5},\n",
380 |        "  {'n_estimators': 50, 'learning_rate': 2},\n",
381 |        "  {'n_estimators': 50, 'learning_rate': 3},\n",
382 |        "  {'n_estimators': 100, 'learning_rate': 2},\n",
383 |        "  {'n_estimators': 100, 'learning_rate': 2.5},\n",
384 |        "  {'n_estimators': 100, 'learning_rate': 3}],\n",
385 |        " 'GNB': [{}],\n",
386 |        " 'ExtraTrees': [{'min_samples_split': 2, 'criterion': 'gini'},\n",
387 |        "  {'min_samples_split': 2, 'criterion': 'entropy'},\n",
388 |        "  {'min_samples_split': 4, 'criterion': 'gini'},\n",
389 |        "  {'min_samples_split': 4, 'criterion': 'entropy'},\n",
390 |        "  {'min_samples_split': 8, 'criterion': 'gini'},\n",
391 |        "  {'min_samples_split': 8, 'criterion': 'entropy'},\n",
392 |        "  {'min_samples_split': 16, 'criterion': 'gini'},\n",
393 |        "  {'min_samples_split': 16, 'criterion': 'entropy'},\n",
394 |        "  {'min_samples_split': 0.1, 'criterion': 'gini'},\n",
395 |        "  {'min_samples_split': 0.1, 'criterion': 'entropy'},\n",
396 |        "  {'min_samples_split': 0.01, 'criterion': 'gini'},\n",
397 |        "  {'min_samples_split': 0.01, 'criterion': 'entropy'},\n",
398 |        "  {'min_samples_split': 0.001, 'criterion': 'gini'},\n",
399 |        "  {'min_samples_split': 0.001, 'criterion': 'entropy'},\n",
400 |        "  {'min_samples_split': 0.0001, 'criterion': 'gini'},\n",
401 |        "  {'min_samples_split': 0.0001, 'criterion': 'entropy'},\n",
402 |        "  {'min_samples_split': 1e-05, 'criterion': 'gini'},\n",
403 |        "  {'min_samples_split': 1e-05, 'criterion': 'entropy'}],\n",
404 |        " 'RF': [{'min_samples_split': 2, 'criterion': 'gini'},\n",
405 |        "  {'min_samples_split': 2, 'criterion': 'entropy'},\n",
406 |        "  {'min_samples_split': 4, 'criterion': 'gini'},\n",
407 |        "  {'min_samples_split': 4, 'criterion': 'entropy'},\n",
408 |        "  {'min_samples_split': 8, 'criterion': 'gini'},\n",
409 |        "  {'min_samples_split': 8, 'criterion': 'entropy'},\n",
410 |        "  {'min_samples_split': 16, 'criterion': 'gini'},\n",
411 |        "  {'min_samples_split': 16, 'criterion': 'entropy'},\n",
412 |        "  {'min_samples_split': 0.01, 'criterion': 'gini'},\n",
413 |        "  {'min_samples_split': 0.01, 'criterion': 'entropy'},\n",
414 |        "  {'min_samples_split': 0.001, 'criterion': 'gini'},\n",
415 |        "  {'min_samples_split': 0.001, 'criterion': 'entropy'},\n",
416 |        "  {'min_samples_split': 0.0001, 'criterion': 'gini'},\n",
417 |        "  {'min_samples_split': 0.0001, 'criterion': 'entropy'},\n",
418 |        "  {'min_samples_split': 1e-05, 'criterion': 'gini'},\n",
419 |        "  {'min_samples_split': 1e-05, 'criterion': 'entropy'}]}"
420 |       ]
421 |      },
422 |      "execution_count": 28,
423 |      "metadata": {},
424 |      "output_type": "execute_result"
425 |     }
426 |    ],
427 |    "source": [
428 |     "# get names of the selected machine learning models\n",
429 |     "m.get_models()"
430 |    ]
431 |   }
432 |  ],
433 |  "metadata": {
434 |   "kernelspec": {
435 |    "display_name": "Python 3",
436 |    "language": "python",
437 |    "name": "python3"
438 |   },
439 |   "language_info": {
440 |    "codemirror_mode": {
441 |     "name": "ipython",
442 |     "version": 3
443 |    },
444 |    "file_extension": ".py",
445 |    "mimetype": "text/x-python",
446 |    "name": "python",
447 |    "nbconvert_exporter": "python",
448 |    "pygments_lexer": "ipython3",
449 |    "version": "3.7.3"
450 |   }
451 |  },
452 |  "nbformat": 4,
453 |  "nbformat_minor": 2
454 | }
455 | 


--------------------------------------------------------------------------------
/AutoML/oboe/examples/error_matrix_generation/README.md:
--------------------------------------------------------------------------------
 1 | This is a quick example to show how to generate the error matrix from preprocessed datasets. It works on Unix and Linux but not on OS X for now.
 2 | 
 3 | # Dataset format
 4 | 
 5 | The datasets should be `csv` files. All the columns except the last are features; the last column is the class label.
 6 | 
 7 | # Recording model errors and runtime
 8 | Run
 9 | ```
10 | bash generate.sh
11 | ```
12 | It will create a `results` directory, with a subdirectory named by the start time of the generation procedure and containing results on individual datasets. We call this subdirectory the "csv directory".
13 | # Merging into the error and runtime matrices
14 | First, modify the directory name in angle brackets in `merge.sh`to be the name of the "csv directory". Then do
15 | ```
16 | bash merge.sh
17 | ```
18 | It will generate a `error_matrix.csv` and a `runtime_matrix.csv` in the "csv directory", and move the csv files already merged into these matrices into `merged_csv_files`.
19 | 


--------------------------------------------------------------------------------
/AutoML/oboe/examples/error_matrix_generation/generate.sh:
--------------------------------------------------------------------------------
1 | bash ../../automl/generate_matrix.sh -p classification -m generate -s results -d . -j ../../automl/defaults/classification.json -n 5
2 | 


--------------------------------------------------------------------------------
/AutoML/oboe/examples/error_matrix_generation/merge.sh:
--------------------------------------------------------------------------------
1 | bash ../../automl/generate_matrix.sh -m merge -s results/<directory_containing_csv_files_on_individual_datasets>
2 | 


--------------------------------------------------------------------------------
/QR.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from scipy.linalg import qr, pinv, solve, norm\n",
 10 |     "from numpy.random import randn\n",
 11 |     "from numpy.linalg import lstsq\n",
 12 |     "import numpy as np"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 3,
 18 |    "metadata": {},
 19 |    "outputs": [],
 20 |    "source": [
 21 |     "# generate random data matrix\n",
 22 |     "n,d = 6,4\n",
 23 |     "X = randn(n,d)\n",
 24 |     "\n",
 25 |     "# optional: give it linearly dependent columns\n",
 26 |     "# X[:,3] = X[:,2]"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "# Understanding the pseudoinverse"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 4,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "# form pseudoinverse\n",
 43 |     "Xd = pinv(X)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 5,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "data": {
 53 |       "text/plain": [
 54 |        "array([[ 1.00000000e+00,  3.26194267e-16, -2.32599523e-16,\n",
 55 |        "        -1.76538379e-16],\n",
 56 |        "       [ 1.23838370e-16,  1.00000000e+00, -7.15454140e-16,\n",
 57 |        "         3.20159051e-16],\n",
 58 |        "       [-2.99593137e-16, -2.24746860e-16,  1.00000000e+00,\n",
 59 |        "         1.57207172e-16],\n",
 60 |        "       [-8.35114587e-17, -2.19207791e-16,  3.92819212e-17,\n",
 61 |        "         1.00000000e+00]])"
 62 |       ]
 63 |      },
 64 |      "execution_count": 5,
 65 |      "metadata": {},
 66 |      "output_type": "execute_result"
 67 |     }
 68 |    ],
 69 |    "source": [
 70 |     "# X†X ≈ I_d\n",
 71 |     "Xd @ X"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 6,
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "data": {
 81 |       "text/plain": [
 82 |        "True"
 83 |       ]
 84 |      },
 85 |      "execution_count": 6,
 86 |      "metadata": {},
 87 |      "output_type": "execute_result"
 88 |     }
 89 |    ],
 90 |    "source": [
 91 |     "np.allclose(Xd @ X, np.identity(4))"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "data": {
101 |       "text/plain": [
102 |        "array([[ 0.92292723, -0.19207847, -0.02487266, -0.02034622,  0.15734442,\n",
103 |        "        -0.0919159 ],\n",
104 |        "       [-0.19207847,  0.29368018,  0.04785243, -0.24610263,  0.05077907,\n",
105 |        "        -0.32419612],\n",
106 |        "       [-0.02487266,  0.04785243,  0.93897132,  0.08772047,  0.21549265,\n",
107 |        "         0.01623931],\n",
108 |        "       [-0.02034622, -0.24610263,  0.08772047,  0.82689984, -0.25147931,\n",
109 |        "        -0.10592118],\n",
110 |        "       [ 0.15734442,  0.05077907,  0.21549265, -0.25147931,  0.16689241,\n",
111 |        "         0.04499583],\n",
112 |        "       [-0.0919159 , -0.32419612,  0.01623931, -0.10592118,  0.04499583,\n",
113 |        "         0.85062902]])"
114 |       ]
115 |      },
116 |      "execution_count": 7,
117 |      "metadata": {},
118 |      "output_type": "execute_result"
119 |     }
120 |    ],
121 |    "source": [
122 |     "# XX† !≈ I_n\n",
123 |     "X @ Xd"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 8,
129 |    "metadata": {},
130 |    "outputs": [
131 |     {
132 |      "data": {
133 |       "text/plain": [
134 |        "False"
135 |       ]
136 |      },
137 |      "execution_count": 8,
138 |      "metadata": {},
139 |      "output_type": "execute_result"
140 |     }
141 |    ],
142 |    "source": [
143 |     "np.allclose(X @ Xd, np.identity(6))"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 9,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "Q,R = qr(X)\n",
153 |     "Q,R = qr(X, mode='economic')"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 10,
159 |    "metadata": {},
160 |    "outputs": [
161 |     {
162 |      "data": {
163 |       "text/plain": [
164 |        "True"
165 |       ]
166 |      },
167 |      "execution_count": 10,
168 |      "metadata": {},
169 |      "output_type": "execute_result"
170 |     }
171 |    ],
172 |    "source": [
173 |     "np.allclose(X, Q @ R)"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 11,
179 |    "metadata": {},
180 |    "outputs": [
181 |     {
182 |      "data": {
183 |       "text/plain": [
184 |        "array([[-0.56201486,  0.63759906, -0.23057605, -0.38388624],\n",
185 |        "       [-0.14763691, -0.21633913,  0.46831834,  0.07588703],\n",
186 |        "       [-0.27210148, -0.70101235, -0.21471219, -0.57219968],\n",
187 |        "       [ 0.66267283,  0.14334266, -0.15547092, -0.58570148],\n",
188 |        "       [-0.38277792, -0.12482775, -0.06724561, -0.01641728],\n",
189 |        "       [ 0.05147032, -0.13826577, -0.80790972,  0.41969547]])"
190 |       ]
191 |      },
192 |      "execution_count": 11,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "Q"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": 12,
204 |    "metadata": {},
205 |    "outputs": [
206 |     {
207 |      "data": {
208 |       "text/plain": [
209 |        "array([[ 2.82036938, -0.4006528 ,  1.90362682,  0.34118195],\n",
210 |        "       [ 0.        ,  2.55806078, -0.13829193, -0.39913027],\n",
211 |        "       [ 0.        ,  0.        , -3.47705222,  0.39879886],\n",
212 |        "       [ 0.        ,  0.        ,  0.        , -1.72866855]])"
213 |       ]
214 |      },
215 |      "execution_count": 12,
216 |      "metadata": {},
217 |      "output_type": "execute_result"
218 |     }
219 |    ],
220 |    "source": [
221 |     "R"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": 13,
227 |    "metadata": {},
228 |    "outputs": [
229 |     {
230 |      "name": "stdout",
231 |      "output_type": "stream",
232 |      "text": [
233 |       "True\n"
234 |      ]
235 |     },
236 |     {
237 |      "data": {
238 |       "text/plain": [
239 |        "array([[ 1.00000000e+00, -7.06106860e-17,  1.63377044e-17,\n",
240 |        "        -4.64978195e-17],\n",
241 |        "       [-7.06106860e-17,  1.00000000e+00,  9.28390587e-17,\n",
242 |        "        -4.27972210e-17],\n",
243 |        "       [ 1.63377044e-17,  9.28390587e-17,  1.00000000e+00,\n",
244 |        "         3.18741738e-17],\n",
245 |        "       [-4.64978195e-17, -4.27972210e-17,  3.18741738e-17,\n",
246 |        "         1.00000000e+00]])"
247 |       ]
248 |      },
249 |      "execution_count": 13,
250 |      "metadata": {},
251 |      "output_type": "execute_result"
252 |     }
253 |    ],
254 |    "source": [
255 |     "print(np.allclose(Q.T @ Q, np.identity(Q.shape[1])))\n",
256 |     "Q.T @ Q"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 18,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "# form data from noisy linear model\n",
266 |     "wtrue = randn(d)\n",
267 |     "y = X.dot(wtrue) + .01*randn(n)"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 26,
273 |    "metadata": {},
274 |    "outputs": [
275 |     {
276 |      "data": {
277 |       "text/plain": [
278 |        "0.0015392303466796875"
279 |       ]
280 |      },
281 |      "execution_count": 26,
282 |      "metadata": {},
283 |      "output_type": "execute_result"
284 |     }
285 |    ],
286 |    "source": [
287 |     "# solve least squares problem to estimate w\n",
288 |     "Q,R = qr(X, mode='economic')\n",
289 |     "w = solve(R, Q.T @ y)"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 20,
295 |    "metadata": {},
296 |    "outputs": [
297 |     {
298 |      "data": {
299 |       "text/plain": [
300 |        "0.013572977104475286"
301 |       ]
302 |      },
303 |      "execution_count": 20,
304 |      "metadata": {},
305 |      "output_type": "execute_result"
306 |     }
307 |    ],
308 |    "source": [
309 |     "# how good is our estimate?\n",
310 |     "norm(w - wtrue)"
311 |    ]
312 |   },
313 |   {
314 |    "cell_type": "code",
315 |    "execution_count": 21,
316 |    "metadata": {},
317 |    "outputs": [
318 |     {
319 |      "data": {
320 |       "text/plain": [
321 |        "6.898214932602962e-05"
322 |       ]
323 |      },
324 |      "execution_count": 21,
325 |      "metadata": {},
326 |      "output_type": "execute_result"
327 |     }
328 |    ],
329 |    "source": [
330 |     "# compute mean square error\n",
331 |     "def mse(y,z):\n",
332 |     "    return sum((y-z)**2)/len(y)\n",
333 |     "    \n",
334 |     "mse(y,X.dot(w))"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "code",
339 |    "execution_count": 22,
340 |    "metadata": {},
341 |    "outputs": [
342 |     {
343 |      "data": {
344 |       "text/plain": [
345 |        "1.025079205553247e-15"
346 |       ]
347 |      },
348 |      "execution_count": 22,
349 |      "metadata": {},
350 |      "output_type": "execute_result"
351 |     }
352 |    ],
353 |    "source": [
354 |     "# we can use the numpy.lstsq call instead\n",
355 |     "w_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]\n",
356 |     "norm(w_lstsq - w)"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "markdown",
361 |    "metadata": {},
362 |    "source": [
363 |     "# Compute QR by hand"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 18,
369 |    "metadata": {},
370 |    "outputs": [
371 |     {
372 |      "data": {
373 |       "text/plain": [
374 |        "array([[-0.47534812,  0.        ,  0.        ,  0.        ,  0.        ,\n",
375 |        "         0.        ],\n",
376 |        "       [ 0.02906357,  0.        ,  0.        ,  0.        ,  0.        ,\n",
377 |        "         0.        ],\n",
378 |        "       [ 0.00071364,  0.        ,  0.        ,  0.        ,  0.        ,\n",
379 |        "         0.        ],\n",
380 |        "       [ 0.45358044,  0.        ,  0.        ,  0.        ,  0.        ,\n",
381 |        "         0.        ],\n",
382 |        "       [-0.26950649,  0.        ,  0.        ,  0.        ,  0.        ,\n",
383 |        "         0.        ],\n",
384 |        "       [-0.70344154,  0.        ,  0.        ,  0.        ,  0.        ,\n",
385 |        "         0.        ]])"
386 |       ]
387 |      },
388 |      "execution_count": 18,
389 |      "metadata": {},
390 |      "output_type": "execute_result"
391 |     }
392 |    ],
393 |    "source": [
394 |     "n,d = X.shape \n",
395 |     "X0 = X.copy()\n",
396 |     "R = np.zeros((n,d))\n",
397 |     "Q = np.zeros((n,n))\n",
398 |     "\n",
399 |     "# first column of Q points in direction of first column of X\n",
400 |     "r = norm(X[:,0])\n",
401 |     "Q[:,0] = X[:,0]/r\n",
402 |     "Q"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 19,
408 |    "metadata": {},
409 |    "outputs": [],
410 |    "source": [
411 |     "# ensure Q*R matches X on first column\n",
412 |     "R[0,0] = r"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": 20,
418 |    "metadata": {},
419 |    "outputs": [
420 |     {
421 |      "data": {
422 |       "text/plain": [
423 |        "array([0., 0., 0., 0., 0., 0.])"
424 |       ]
425 |      },
426 |      "execution_count": 20,
427 |      "metadata": {},
428 |      "output_type": "execute_result"
429 |     }
430 |    ],
431 |    "source": [
432 |     "# verify Q*R matches X in first column\n",
433 |     "(Q@R - X)[:,0]"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "code",
438 |    "execution_count": 21,
439 |    "metadata": {},
440 |    "outputs": [],
441 |    "source": [
442 |     "# now delete that part from X; we've covered it already\n",
443 |     "X[:,0] -= Q[:,0]*R[0,0]"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 22,
449 |    "metadata": {},
450 |    "outputs": [
451 |     {
452 |      "data": {
453 |       "text/plain": [
454 |        "array([[ True,  True,  True,  True],\n",
455 |        "       [ True,  True,  True,  True],\n",
456 |        "       [ True,  True,  True,  True],\n",
457 |        "       [ True,  True,  True,  True],\n",
458 |        "       [ True,  True,  True,  True],\n",
459 |        "       [ True,  True,  True,  True]])"
460 |       ]
461 |      },
462 |      "execution_count": 22,
463 |      "metadata": {},
464 |      "output_type": "execute_result"
465 |     }
466 |    ],
467 |    "source": [
468 |     "# verify Q*R + X = X0\n",
469 |     "np.isclose(Q@R + X, X0)"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "code",
474 |    "execution_count": 23,
475 |    "metadata": {},
476 |    "outputs": [
477 |     {
478 |      "data": {
479 |       "text/plain": [
480 |        "array([[ 3.74544392, -0.69063861,  0.96961492,  0.82682362],\n",
481 |        "       [ 0.        ,  0.        ,  0.        ,  0.        ],\n",
482 |        "       [ 0.        ,  0.        ,  0.        ,  0.        ],\n",
483 |        "       [ 0.        ,  0.        ,  0.        ,  0.        ],\n",
484 |        "       [ 0.        ,  0.        ,  0.        ,  0.        ],\n",
485 |        "       [ 0.        ,  0.        ,  0.        ,  0.        ]])"
486 |       ]
487 |      },
488 |      "execution_count": 23,
489 |      "metadata": {},
490 |      "output_type": "execute_result"
491 |     }
492 |    ],
493 |    "source": [
494 |     "# eliminate component of other columns in direction of first column of Q \n",
495 |     "for j in range(1,d):\n",
496 |     "    R[0,j] = Q[:,0].dot(X[:,j])\n",
497 |     "    X[:,j] -= Q[:,0]*R[0,j]\n",
498 |     "R"
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "code",
503 |    "execution_count": 24,
504 |    "metadata": {},
505 |    "outputs": [
506 |     {
507 |      "data": {
508 |       "text/plain": [
509 |        "array([[ True,  True,  True,  True],\n",
510 |        "       [ True,  True,  True,  True],\n",
511 |        "       [ True,  True,  True,  True],\n",
512 |        "       [ True,  True,  True,  True],\n",
513 |        "       [ True,  True,  True,  True],\n",
514 |        "       [ True,  True,  True,  True]])"
515 |       ]
516 |      },
517 |      "execution_count": 24,
518 |      "metadata": {},
519 |      "output_type": "execute_result"
520 |     }
521 |    ],
522 |    "source": [
523 |     "# verify Q*R + X = X0\n",
524 |     "np.isclose(Q@R + X, X0)"
525 |    ]
526 |   },
527 |   {
528 |    "cell_type": "code",
529 |    "execution_count": 25,
530 |    "metadata": {},
531 |    "outputs": [
532 |     {
533 |      "name": "stdout",
534 |      "output_type": "stream",
535 |      "text": [
536 |       "iteration 0 : QR + X = X0? True\n",
537 |       "iteration 1 : QR + X = X0? True\n",
538 |       "iteration 2 : QR + X = X0? True\n",
539 |       "iteration 3 : QR + X = X0? True\n"
540 |      ]
541 |     }
542 |    ],
543 |    "source": [
544 |     "# now for all the columns!\n",
545 |     "X = X0.copy()\n",
546 |     "Q *= 0\n",
547 |     "R *= 0\n",
548 |     "\n",
549 |     "# compute the QR decomposition\n",
550 |     "for i in range(d):\n",
551 |     "    r = norm(X[:,i])\n",
552 |     "    Q[:,i] = X[:,i]/r\n",
553 |     "    for j in range(i,d):\n",
554 |     "        R[i,j] = Q[:,i].dot(X[:,j])\n",
555 |     "        X[:,j] -= Q[:,i]*R[i,j]\n",
556 |     "    print(\"iteration\",i,\": QR + X = X0?\", np.isclose(Q@R + X, X0).all())"
557 |    ]
558 |   },
559 |   {
560 |    "cell_type": "code",
561 |    "execution_count": 26,
562 |    "metadata": {},
563 |    "outputs": [],
564 |    "source": [
565 |     "\"\"\"Our very own QR function to compute the economy QR\"\"\"\n",
566 |     "def ourQR(X0):\n",
567 |     "    X = X0.copy()\n",
568 |     "    n,d = X.shape\n",
569 |     "    R = np.zeros((n,d))\n",
570 |     "    Q = np.zeros((n,n))\n",
571 |     "\n",
572 |     "    # compute the QR decomposition\n",
573 |     "    for i in range(d):\n",
574 |     "        r = norm(X[:,i])\n",
575 |     "        Q[:,i] = X[:,i]/r\n",
576 |     "        for j in range(i,d):\n",
577 |     "            R[i,j] = Q[:,i].dot(X[:,j])\n",
578 |     "            X[:,j] -= Q[:,i]*R[i,j]\n",
579 |     "    return Q,R"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "code",
584 |    "execution_count": 31,
585 |    "metadata": {},
586 |    "outputs": [],
587 |    "source": [
588 |     "# solve least squares problem to estimate w\n",
589 |     "Q,R = ourQR(X0)\n",
590 |     "w_byhand = solve(R[:d,:d], (Q.T @ y)[:d])"
591 |    ]
592 |   },
593 |   {
594 |    "cell_type": "code",
595 |    "execution_count": 32,
596 |    "metadata": {},
597 |    "outputs": [
598 |     {
599 |      "data": {
600 |       "text/plain": [
601 |        "1.3822165187958571e-15"
602 |       ]
603 |      },
604 |      "execution_count": 32,
605 |      "metadata": {},
606 |      "output_type": "execute_result"
607 |     }
608 |    ],
609 |    "source": [
610 |     "norm(w_byhand - w)"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": []
619 |   }
620 |  ],
621 |  "metadata": {
622 |   "@webio": {
623 |    "lastCommId": null,
624 |    "lastKernelId": null
625 |   },
626 |   "kernelspec": {
627 |    "display_name": "Python 3",
628 |    "language": "python",
629 |    "name": "python3"
630 |   },
631 |   "language_info": {
632 |    "codemirror_mode": {
633 |     "name": "ipython",
634 |     "version": 3
635 |    },
636 |    "file_extension": ".py",
637 |    "mimetype": "text/x-python",
638 |    "name": "python",
639 |    "nbconvert_exporter": "python",
640 |    "pygments_lexer": "ipython3",
641 |    "version": "3.8.3"
642 |   }
643 |  },
644 |  "nbformat": 4,
645 |  "nbformat_minor": 4
646 | }
647 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # demos
2 | Demos and tutorials for ORIE 4741
3 | 


--------------------------------------------------------------------------------
/SVD.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "from scipy.linalg import svd, norm\n",
 11 |     "from numpy.random import randn, rand\n",
 12 |     "np.random.seed(0)"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 2,
 18 |    "metadata": {},
 19 |    "outputs": [
 20 |     {
 21 |      "data": {
 22 |       "text/plain": [
 23 |        "array([[ 1.76405235,  0.40015721,  0.97873798,  0.97873798],\n",
 24 |        "       [ 1.86755799, -0.97727788,  0.95008842,  0.95008842],\n",
 25 |        "       [-0.10321885,  0.4105985 ,  0.14404357,  0.14404357],\n",
 26 |        "       [ 0.76103773,  0.12167502,  0.44386323,  0.44386323],\n",
 27 |        "       [ 1.49407907, -0.20515826,  0.3130677 ,  0.3130677 ],\n",
 28 |        "       [-2.55298982,  0.6536186 ,  0.8644362 ,  0.8644362 ]])"
 29 |       ]
 30 |      },
 31 |      "execution_count": 2,
 32 |      "metadata": {},
 33 |      "output_type": "execute_result"
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "# generate random data matrix\n",
 38 |     "n,d = 6,4\n",
 39 |     "X = randn(n,d)\n",
 40 |     "\n",
 41 |     "# optional: give it linearly dependent columns\n",
 42 |     "X[:,3] = X[:,2]\n",
 43 |     "X"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 3,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "data": {
 53 |       "text/plain": [
 54 |        "array([0., 0., 0., 0., 0., 0.])"
 55 |       ]
 56 |      },
 57 |      "execution_count": 3,
 58 |      "metadata": {},
 59 |      "output_type": "execute_result"
 60 |     }
 61 |    ],
 62 |    "source": [
 63 |     "# find a vector w in the nullspace of X\n",
 64 |     "w = np.zeros(d)\n",
 65 |     "w[2] = -1\n",
 66 |     "w[3] = 1\n",
 67 |     "X@w"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 4,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "U,S,Vt = svd(X, full_matrices=False)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 5,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "data": {
 86 |       "text/plain": [
 87 |        "True"
 88 |       ]
 89 |      },
 90 |      "execution_count": 5,
 91 |      "metadata": {},
 92 |      "output_type": "execute_result"
 93 |     }
 94 |    ],
 95 |    "source": [
 96 |     "np.allclose(U@np.diag(S)@Vt, X)"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": 6,
102 |    "metadata": {},
103 |    "outputs": [
104 |     {
105 |      "data": {
106 |       "text/plain": [
107 |        "array([[ 1.00000000e+00, -7.38697485e-17,  3.23622199e-17,\n",
108 |        "        -3.99817880e-17],\n",
109 |        "       [-7.38697485e-17,  1.00000000e+00, -3.81412273e-17,\n",
110 |        "         5.93941004e-17],\n",
111 |        "       [ 3.23622199e-17, -3.81412273e-17,  1.00000000e+00,\n",
112 |        "        -4.16017539e-17],\n",
113 |        "       [-3.99817880e-17,  5.93941004e-17, -4.16017539e-17,\n",
114 |        "         1.00000000e+00]])"
115 |       ]
116 |      },
117 |      "execution_count": 6,
118 |      "metadata": {},
119 |      "output_type": "execute_result"
120 |     }
121 |    ],
122 |    "source": [
123 |     "U.T@U"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 7,
129 |    "metadata": {},
130 |    "outputs": [
131 |     {
132 |      "data": {
133 |       "text/plain": [
134 |        "True"
135 |       ]
136 |      },
137 |      "execution_count": 7,
138 |      "metadata": {},
139 |      "output_type": "execute_result"
140 |     }
141 |    ],
142 |    "source": [
143 |     "np.allclose(U.T@U, np.identity(d))"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 8,
149 |    "metadata": {},
150 |    "outputs": [
151 |     {
152 |      "data": {
153 |       "text/plain": [
154 |        "(6, 4)"
155 |       ]
156 |      },
157 |      "execution_count": 8,
158 |      "metadata": {},
159 |      "output_type": "execute_result"
160 |     }
161 |    ],
162 |    "source": [
163 |     "U.shape"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 9,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "data": {
173 |       "text/plain": [
174 |        "array([[ 0.80175178, -0.06399596, -0.00612596,  0.30096884,  0.24817008,\n",
175 |        "         0.05140258],\n",
176 |        "       [-0.06399596,  0.9701751 ,  0.00504366,  0.0348882 ,  0.15022773,\n",
177 |        "         0.03207643],\n",
178 |        "       [-0.00612596,  0.00504366,  0.9944328 ,  0.05699362, -0.04603767,\n",
179 |        "        -0.01027116],\n",
180 |        "       [ 0.30096884,  0.0348882 ,  0.05699362,  0.12011976,  0.0995328 ,\n",
181 |        "         0.02713897],\n",
182 |        "       [ 0.24817008,  0.15022773, -0.04603767,  0.0995328 ,  0.15300143,\n",
183 |        "        -0.18278125],\n",
184 |        "       [ 0.05140258,  0.03207643, -0.01027116,  0.02713897, -0.18278125,\n",
185 |        "         0.96051913]])"
186 |       ]
187 |      },
188 |      "execution_count": 9,
189 |      "metadata": {},
190 |      "output_type": "execute_result"
191 |     }
192 |    ],
193 |    "source": [
194 |     "U@U.T"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 10,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "data": {
204 |       "text/plain": [
205 |        "(4, 4)"
206 |       ]
207 |      },
208 |      "execution_count": 10,
209 |      "metadata": {},
210 |      "output_type": "execute_result"
211 |     }
212 |    ],
213 |    "source": [
214 |     "Vt.shape"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 11,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "data": {
224 |       "text/plain": [
225 |        "True"
226 |       ]
227 |      },
228 |      "execution_count": 11,
229 |      "metadata": {},
230 |      "output_type": "execute_result"
231 |     }
232 |    ],
233 |    "source": [
234 |     "np.allclose(Vt @ Vt.T, np.identity(d))"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 12,
240 |    "metadata": {},
241 |    "outputs": [
242 |     {
243 |      "data": {
244 |       "text/plain": [
245 |        "True"
246 |       ]
247 |      },
248 |      "execution_count": 12,
249 |      "metadata": {},
250 |      "output_type": "execute_result"
251 |     }
252 |    ],
253 |    "source": [
254 |     "np.allclose(Vt.T @ Vt, np.identity(d))"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 13,
260 |    "metadata": {},
261 |    "outputs": [
262 |     {
263 |      "data": {
264 |       "text/plain": [
265 |        "array([4.15760175e+00, 2.28949949e+00, 1.01350732e+00, 1.48389401e-16])"
266 |       ]
267 |      },
268 |      "execution_count": 13,
269 |      "metadata": {},
270 |      "output_type": "execute_result"
271 |     }
272 |    ],
273 |    "source": [
274 |     "S"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 14,
280 |    "metadata": {},
281 |    "outputs": [
282 |     {
283 |      "name": "stdout",
284 |      "output_type": "stream",
285 |      "text": [
286 |       "Error of rank 0 approximation:  4.853314053310529\n",
287 |       "Error of rank 1 approximation:  2.5037981161489284\n",
288 |       "Error of rank 2 approximation:  1.0135073191135213\n",
289 |       "Error of rank 3 approximation:  2.0282945925593685e-15\n",
290 |       "Error of rank 4 approximation:  2.0404123996834285e-15\n"
291 |      ]
292 |     }
293 |    ],
294 |    "source": [
295 |     "# if we have a linearly dependent column, \n",
296 |     "# decomposition is just as good if we ignore the 0 in sigma and reduce r by 1\n",
297 |     "for k in range(d+1):\n",
298 |     "    print(f\"Error of rank {k} approximation: \", \n",
299 |     "          np.linalg.norm(X - U[:,:k]@np.diag(S[:k])@(Vt[:k,:])))\n",
300 |     "    "
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 15,
306 |    "metadata": {},
307 |    "outputs": [
308 |     {
309 |      "data": {
310 |       "text/plain": [
311 |        "array([[0., 0., 0., 0.],\n",
312 |        "       [0., 0., 0., 0.],\n",
313 |        "       [0., 0., 0., 0.],\n",
314 |        "       [0., 0., 0., 0.],\n",
315 |        "       [0., 0., 0., 0.],\n",
316 |        "       [0., 0., 0., 0.]])"
317 |       ]
318 |      },
319 |      "execution_count": 15,
320 |      "metadata": {},
321 |      "output_type": "execute_result"
322 |     }
323 |    ],
324 |    "source": [
325 |     "# what is a rank 0 approximation?\n",
326 |     "k = 0\n",
327 |     "U[:,:k]@np.diag(S[:k])@(Vt[:k,:])"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": 16,
333 |    "metadata": {},
334 |    "outputs": [
335 |     {
336 |      "data": {
337 |       "text/plain": [
338 |        "array([[ 1.63569576, -0.14072899,  1.04290218,  1.04290218],\n",
339 |        "       [ 2.03375744, -0.27692435,  0.86700695,  0.86700695],\n",
340 |        "       [-0.18022995,  0.08607883,  0.18254066,  0.18254066],\n",
341 |        "       [ 0.71793643, -0.05995099,  0.46540914,  0.46540914],\n",
342 |        "       [ 1.47764056, -0.27442906,  0.32128515,  0.32128515],\n",
343 |        "       [-2.51856443,  0.7986849 ,  0.84722729,  0.84722729]])"
344 |       ]
345 |      },
346 |      "execution_count": 16,
347 |      "metadata": {},
348 |      "output_type": "execute_result"
349 |     }
350 |    ],
351 |    "source": [
352 |     "# form rank 2 apx of X by zeroing last two singular values\n",
353 |     "S2 = S.copy()\n",
354 |     "S2[2:] = 0\n",
355 |     "U@np.diag(S2)@Vt"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 17,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "# form data from noisy linear model\n",
365 |     "wtrue = randn(d)\n",
366 |     "y = X@wtrue + .1*randn(n);"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 18,
372 |    "metadata": {},
373 |    "outputs": [
374 |     {
375 |      "data": {
376 |       "text/plain": [
377 |        "array([ 2.32981892e+00, -1.49417792e+00, -1.55210116e+14,  1.55210116e+14])"
378 |       ]
379 |      },
380 |      "execution_count": 18,
381 |      "metadata": {},
382 |      "output_type": "execute_result"
383 |     }
384 |    ],
385 |    "source": [
386 |     "# solve least squares problem to estimate w\n",
387 |     "w4 = Vt.T@np.diag(S**(-1))@U.T@y\n",
388 |     "w4"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 19,
394 |    "metadata": {},
395 |    "outputs": [
396 |     {
397 |      "name": "stdout",
398 |      "output_type": "stream",
399 |      "text": [
400 |       "residual given w4: 0.3224116203732873\n",
401 |       "residual given wtrue: 0.3063847433022363\n"
402 |      ]
403 |     }
404 |    ],
405 |    "source": [
406 |     "# it gives a low norm solution, but definitely not optimal...\n",
407 |     "print(\"residual given w4:\", norm(y - X.dot(w4)))\n",
408 |     "print(\"residual given wtrue:\", norm(y - X.dot(wtrue)))"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 20,
414 |    "metadata": {},
415 |    "outputs": [
416 |     {
417 |      "data": {
418 |       "text/plain": [
419 |        "0.7526118592612931"
420 |       ]
421 |      },
422 |      "execution_count": 20,
423 |      "metadata": {},
424 |      "output_type": "execute_result"
425 |     }
426 |    ],
427 |    "source": [
428 |     "# error in normal equations not zero! uh oh!\n",
429 |     "norm(X.T@X@w4 - X.T@y)"
430 |    ]
431 |   },
432 |   {
433 |    "cell_type": "code",
434 |    "execution_count": 21,
435 |    "metadata": {},
436 |    "outputs": [
437 |     {
438 |      "data": {
439 |       "text/plain": [
440 |        "array([ 2.32981892, -1.45396574, -0.07338535, -0.07338535])"
441 |       ]
442 |      },
443 |      "execution_count": 21,
444 |      "metadata": {},
445 |      "output_type": "execute_result"
446 |     }
447 |    ],
448 |    "source": [
449 |     "# use rank k approximation to design matrix X\n",
450 |     "# k=4 is full rank\n",
451 |     "# when design matrix X has rank 3, k=3 gives 0 error approximation\n",
452 |     "# while k=2 results in loss of accuracy\n",
453 |     "k = 3\n",
454 |     "w3 = Vt[:k,:].T@np.diag(S[:k]**(-1))@(U[:,:k]).T@y\n",
455 |     "w3"
456 |    ]
457 |   },
458 |   {
459 |    "cell_type": "code",
460 |    "execution_count": 22,
461 |    "metadata": {},
462 |    "outputs": [
463 |     {
464 |      "name": "stdout",
465 |      "output_type": "stream",
466 |      "text": [
467 |       "residual given w3: 0.1943391668277365\n",
468 |       "error in normal equations given w3: 2.175583928816829e-15\n"
469 |      ]
470 |     }
471 |    ],
472 |    "source": [
473 |     "print(\"residual given w3:\", norm(y - X.dot(w3)))\n",
474 |     "print(\"error in normal equations given w3:\", norm(X.T@X@w3 - X.T@y))"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": 23,
480 |    "metadata": {},
481 |    "outputs": [
482 |     {
483 |      "data": {
484 |       "text/plain": [
485 |        "1.2560739669470201e-15"
486 |       ]
487 |      },
488 |      "execution_count": 23,
489 |      "metadata": {},
490 |      "output_type": "execute_result"
491 |     }
492 |    ],
493 |    "source": [
494 |     "# add a vector in the nullspace to w3:\n",
495 |     "w = w3.copy()\n",
496 |     "w[2] += 1\n",
497 |     "w[3] -= 1\n",
498 |     "norm(X.T@X@w - X.T@y)"
499 |    ]
500 |   },
501 |   {
502 |    "cell_type": "markdown",
503 |    "metadata": {},
504 |    "source": [
505 |     "Poll:\n",
506 |     "* A) least squares residual norm(y-Xw) will be higher for w than w3\n",
507 |     "* B) least squares residual norm(y-Xw) will be lower for w than w3\n",
508 |     "* C) least squares residual norm(y-Xw) will be the same for w than w3"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": 24,
514 |    "metadata": {},
515 |    "outputs": [
516 |     {
517 |      "name": "stdout",
518 |      "output_type": "stream",
519 |      "text": [
520 |       "residual given w: 0.19433916682773625\n",
521 |       "error in normal equations given w: 1.2560739669470201e-15\n"
522 |      ]
523 |     }
524 |    ],
525 |    "source": [
526 |     "print(\"residual given w:\", norm(y - X.dot(w)))\n",
527 |     "print(\"error in normal equations given w:\", norm(X.T@X@w - X.T@y))"
528 |    ]
529 |   },
530 |   {
531 |    "cell_type": "code",
532 |    "execution_count": 25,
533 |    "metadata": {},
534 |    "outputs": [
535 |     {
536 |      "data": {
537 |       "text/plain": [
538 |        "array([ 2.32981892, -1.45396574,  0.92661465, -1.07338535])"
539 |       ]
540 |      },
541 |      "execution_count": 25,
542 |      "metadata": {},
543 |      "output_type": "execute_result"
544 |     }
545 |    ],
546 |    "source": [
547 |     "w"
548 |    ]
549 |   },
550 |   {
551 |    "cell_type": "markdown",
552 |    "metadata": {},
553 |    "source": [
554 |     "Poll:\n",
555 |     "* A) there is one global minimum of least squares\n",
556 |     "* B) there are two global minima of least squares\n",
557 |     "* C) there are many global minima of least squares\n",
558 |     "* D) there are infinitely many global minima of least squares"
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": 26,
564 |    "metadata": {},
565 |    "outputs": [
566 |     {
567 |      "data": {
568 |       "text/plain": [
569 |        "0.4628663965326436"
570 |       ]
571 |      },
572 |      "execution_count": 26,
573 |      "metadata": {},
574 |      "output_type": "execute_result"
575 |     }
576 |    ],
577 |    "source": [
578 |     "# how good is our estimate of w?\n",
579 |     "norm(w - wtrue) / norm(wtrue)"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "code",
584 |    "execution_count": 28,
585 |    "metadata": {},
586 |    "outputs": [
587 |     {
588 |      "data": {
589 |       "text/plain": [
590 |        "1.0990647210786425e-15"
591 |       ]
592 |      },
593 |      "execution_count": 28,
594 |      "metadata": {},
595 |      "output_type": "execute_result"
596 |     }
597 |    ],
598 |    "source": [
599 |     "# we can use the numpy.lstsq call instead\n",
600 |     "w_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]\n",
601 |     "norm(w_lstsq - w3)"
602 |    ]
603 |   },
604 |   {
605 |    "cell_type": "code",
606 |    "execution_count": null,
607 |    "metadata": {},
608 |    "outputs": [],
609 |    "source": []
610 |   }
611 |  ],
612 |  "metadata": {
613 |   "@webio": {
614 |    "lastCommId": null,
615 |    "lastKernelId": null
616 |   },
617 |   "kernelspec": {
618 |    "display_name": "Python 3",
619 |    "language": "python",
620 |    "name": "python3"
621 |   },
622 |   "language_info": {
623 |    "codemirror_mode": {
624 |     "name": "ipython",
625 |     "version": 3
626 |    },
627 |    "file_extension": ".py",
628 |    "mimetype": "text/x-python",
629 |    "name": "python",
630 |    "nbconvert_exporter": "python",
631 |    "pygments_lexer": "ipython3",
632 |    "version": "3.8.3"
633 |   }
634 |  },
635 |  "nbformat": 4,
636 |  "nbformat_minor": 1
637 | }
638 | 


--------------------------------------------------------------------------------
/ensembles.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# <font color='#B31B1'> Ensemble Methods </font>\n",
  8 |     "\n",
  9 |     "So far we've seen how to construct a single decision tree, now we'll see how to combine multiple trees together into a more powerful ensemble method."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 21,
 15 |    "metadata": {},
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from IPython.display import Image\n",
 19 |     "import pandas as pd\n",
 20 |     "import numpy as np\n",
 21 |     "import warnings\n",
 22 |     "warnings.simplefilter(\"ignore\")\n",
 23 |     "\n",
 24 |     "import seaborn as sns\n",
 25 |     "sns.set(rc={'figure.figsize':(6,6)}) "
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "## <font color='#B31B1'> California Housing Dataset </font>\n",
 33 |     "\n",
 34 |     "We'll use the boston housing dataset, the goal of which is to predict house prices in California from scikit-learn."
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 22,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "from sklearn.datasets import fetch_california_housing\n",
 44 |     "data = fetch_california_housing()\n",
 45 |     "X = data['data']\n",
 46 |     "Y = data['target']"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 23,
 52 |    "metadata": {},
 53 |    "outputs": [
 54 |     {
 55 |      "data": {
 56 |       "text/html": [
 57 |        "<div>\n",
 58 |        "<style scoped>\n",
 59 |        "    .dataframe tbody tr th:only-of-type {\n",
 60 |        "        vertical-align: middle;\n",
 61 |        "    }\n",
 62 |        "\n",
 63 |        "    .dataframe tbody tr th {\n",
 64 |        "        vertical-align: top;\n",
 65 |        "    }\n",
 66 |        "\n",
 67 |        "    .dataframe thead th {\n",
 68 |        "        text-align: right;\n",
 69 |        "    }\n",
 70 |        "</style>\n",
 71 |        "<table border=\"1\" class=\"dataframe\">\n",
 72 |        "  <thead>\n",
 73 |        "    <tr style=\"text-align: right;\">\n",
 74 |        "      <th></th>\n",
 75 |        "      <th>MedInc</th>\n",
 76 |        "      <th>HouseAge</th>\n",
 77 |        "      <th>AveRooms</th>\n",
 78 |        "      <th>AveBedrms</th>\n",
 79 |        "      <th>Population</th>\n",
 80 |        "      <th>AveOccup</th>\n",
 81 |        "      <th>Latitude</th>\n",
 82 |        "      <th>Longitude</th>\n",
 83 |        "      <th>Y</th>\n",
 84 |        "    </tr>\n",
 85 |        "  </thead>\n",
 86 |        "  <tbody>\n",
 87 |        "    <tr>\n",
 88 |        "      <th>0</th>\n",
 89 |        "      <td>8.3252</td>\n",
 90 |        "      <td>41.0</td>\n",
 91 |        "      <td>6.984127</td>\n",
 92 |        "      <td>1.023810</td>\n",
 93 |        "      <td>322.0</td>\n",
 94 |        "      <td>2.555556</td>\n",
 95 |        "      <td>37.88</td>\n",
 96 |        "      <td>-122.23</td>\n",
 97 |        "      <td>4.526</td>\n",
 98 |        "    </tr>\n",
 99 |        "    <tr>\n",
100 |        "      <th>1</th>\n",
101 |        "      <td>8.3014</td>\n",
102 |        "      <td>21.0</td>\n",
103 |        "      <td>6.238137</td>\n",
104 |        "      <td>0.971880</td>\n",
105 |        "      <td>2401.0</td>\n",
106 |        "      <td>2.109842</td>\n",
107 |        "      <td>37.86</td>\n",
108 |        "      <td>-122.22</td>\n",
109 |        "      <td>3.585</td>\n",
110 |        "    </tr>\n",
111 |        "    <tr>\n",
112 |        "      <th>2</th>\n",
113 |        "      <td>7.2574</td>\n",
114 |        "      <td>52.0</td>\n",
115 |        "      <td>8.288136</td>\n",
116 |        "      <td>1.073446</td>\n",
117 |        "      <td>496.0</td>\n",
118 |        "      <td>2.802260</td>\n",
119 |        "      <td>37.85</td>\n",
120 |        "      <td>-122.24</td>\n",
121 |        "      <td>3.521</td>\n",
122 |        "    </tr>\n",
123 |        "    <tr>\n",
124 |        "      <th>3</th>\n",
125 |        "      <td>5.6431</td>\n",
126 |        "      <td>52.0</td>\n",
127 |        "      <td>5.817352</td>\n",
128 |        "      <td>1.073059</td>\n",
129 |        "      <td>558.0</td>\n",
130 |        "      <td>2.547945</td>\n",
131 |        "      <td>37.85</td>\n",
132 |        "      <td>-122.25</td>\n",
133 |        "      <td>3.413</td>\n",
134 |        "    </tr>\n",
135 |        "    <tr>\n",
136 |        "      <th>4</th>\n",
137 |        "      <td>3.8462</td>\n",
138 |        "      <td>52.0</td>\n",
139 |        "      <td>6.281853</td>\n",
140 |        "      <td>1.081081</td>\n",
141 |        "      <td>565.0</td>\n",
142 |        "      <td>2.181467</td>\n",
143 |        "      <td>37.85</td>\n",
144 |        "      <td>-122.25</td>\n",
145 |        "      <td>3.422</td>\n",
146 |        "    </tr>\n",
147 |        "  </tbody>\n",
148 |        "</table>\n",
149 |        "</div>"
150 |       ],
151 |       "text/plain": [
152 |        "   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \\\n",
153 |        "0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   \n",
154 |        "1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   \n",
155 |        "2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   \n",
156 |        "3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   \n",
157 |        "4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   \n",
158 |        "\n",
159 |        "   Longitude      Y  \n",
160 |        "0    -122.23  4.526  \n",
161 |        "1    -122.22  3.585  \n",
162 |        "2    -122.24  3.521  \n",
163 |        "3    -122.25  3.413  \n",
164 |        "4    -122.25  3.422  "
165 |       ]
166 |      },
167 |      "execution_count": 23,
168 |      "metadata": {},
169 |      "output_type": "execute_result"
170 |     }
171 |    ],
172 |    "source": [
173 |     "data_df = (pd.DataFrame(X, columns = data['feature_names'])\n",
174 |     "           .assign(Y = Y))\n",
175 |     "\n",
176 |     "data_df.head()"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## <font color='#B31B1'> Bagging </font>\n",
184 |     "Bagging is the process of generating a set of weak learners by training on random bootstrapped samples of our dataset (i.e. sampling a dataset from our training data with replacement). To show the power of bagging, we can use random trees: these trees use a *random* feature and *random* threshold to generate the split at each node and then predict the most common value at the leaf."
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": 24,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "from sklearn.ensemble import ExtraTreesRegressor\n",
194 |     "from sklearn.tree import DecisionTreeRegressor\n",
195 |     "from sklearn.model_selection import cross_val_score\n",
196 |     "\n",
197 |     "# Random trees are usually used just in ensemble methods\n",
198 |     "# so we have to manually specify we only want one to start\n",
199 |     "random_tree = ExtraTreesRegressor(n_estimators = 1)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 25,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "ExtraTreesRegressor?"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "markdown",
213 |    "metadata": {},
214 |    "source": [
215 |     "We can see that on its own, the random tree has a mean squared error of:"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "code",
220 |    "execution_count": 26,
221 |    "metadata": {},
222 |    "outputs": [
223 |     {
224 |      "data": {
225 |       "text/plain": [
226 |        "-0.9024777591605524"
227 |       ]
228 |      },
229 |      "execution_count": 26,
230 |      "metadata": {},
231 |      "output_type": "execute_result"
232 |     }
233 |    ],
234 |    "source": [
235 |     "cross_val_score(random_tree, X, Y,\n",
236 |     "                scoring=\"neg_mean_squared_error\", \n",
237 |     "                cv=3).mean()"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "markdown",
242 |    "metadata": {},
243 |    "source": [
244 |     "We could bag by randomly generating the bootstrap samples ourselves... or we could use scikit-learn's BaggingRegressor or BaggingClassifier! We simply need to specify the number of weak learners."
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 7,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "from sklearn.ensemble import BaggingRegressor"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 31,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
263 |     "                                        n_estimators = 10\n",
264 |     "                                       )"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "markdown",
269 |    "metadata": {},
270 |    "source": [
271 |     "Bagging the random trees together leads to a big jump in performance... even though they're random trees!"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 32,
277 |    "metadata": {},
278 |    "outputs": [
279 |     {
280 |      "data": {
281 |       "text/plain": [
282 |        "-0.47476478556319784"
283 |       ]
284 |      },
285 |      "execution_count": 32,
286 |      "metadata": {},
287 |      "output_type": "execute_result"
288 |     }
289 |    ],
290 |    "source": [
291 |     "cross_val_score(bagged_random_trees, X, Y,\n",
292 |     "                scoring=\"neg_mean_squared_error\", \n",
293 |     "                cv=3).mean()"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "markdown",
298 |    "metadata": {},
299 |    "source": [
300 |     "We can also see how the performance changes as we change the number of estimators."
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 33,
306 |    "metadata": {},
307 |    "outputs": [
308 |     {
309 |      "data": {
310 |       "text/plain": [
311 |        "-0.4167362638559448"
312 |       ]
313 |      },
314 |      "execution_count": 33,
315 |      "metadata": {},
316 |      "output_type": "execute_result"
317 |     }
318 |    ],
319 |    "source": [
320 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
321 |     "                                        n_estimators = 100\n",
322 |     "                                       )\n",
323 |     "cross_val_score(bagged_random_trees, X, Y,\n",
324 |     "                scoring=\"neg_mean_squared_error\", \n",
325 |     "                cv=3).mean()"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 34,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "data": {
335 |       "text/plain": [
336 |        "-0.4118286946534265"
337 |       ]
338 |      },
339 |      "execution_count": 34,
340 |      "metadata": {},
341 |      "output_type": "execute_result"
342 |     }
343 |    ],
344 |    "source": [
345 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
346 |     "                                        n_estimators = 200\n",
347 |     "                                       )\n",
348 |     "cross_val_score(bagged_random_trees, X, Y,\n",
349 |     "                scoring=\"neg_mean_squared_error\", \n",
350 |     "                cv=3).mean()"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "markdown",
355 |    "metadata": {},
356 |    "source": [
357 |     "Here \n",
358 |     "* increasing from 10 to 100 estimators improves performance a lot!\n",
359 |     "* increasing from 100 to 200 estimators has almost no effect."
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {},
365 |    "source": [
366 |     "## <font color='#B31B1'> Random Forests </font>\n",
367 |     "\n",
368 |     "Random forests is a bagging approach for trees that also randomly selects the set of features each tree can use (to help decorrelate results). Scikit-learn offers a great implementation of random forests.\n",
369 |     "\n",
370 |     "In addition to all the decision tree hyperparameters, random forests also let us choose the number of trees, whether to use bootstrapped samples for each tree, and the max number of features every tree can use."
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 38,
376 |    "metadata": {},
377 |    "outputs": [
378 |     {
379 |      "data": {
380 |       "text/plain": [
381 |        "-0.46905893268797305"
382 |       ]
383 |      },
384 |      "execution_count": 38,
385 |      "metadata": {},
386 |      "output_type": "execute_result"
387 |     }
388 |    ],
389 |    "source": [
390 |     "from sklearn.ensemble import RandomForestRegressor\n",
391 |     "\n",
392 |     "random_forest = RandomForestRegressor(n_estimators = 100)\n",
393 |     "\n",
394 |     "cross_val_score(random_forest, X, Y,\n",
395 |     "                scoring=\"neg_mean_squared_error\", \n",
396 |     "                cv=3).mean()"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 36,
402 |    "metadata": {},
403 |    "outputs": [],
404 |    "source": [
405 |     "RandomForestRegressor?"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {},
411 |    "source": [
412 |     "## <font color='#B31B1'> Gradient Boosting </font>\n",
413 |     "Recall that boosting is the process of sequentially training weak learners to create a powerful prediction. In gradient boosting, each subsequent model is going to try to replicate the gradient of the loss function evaluated at the current model (almost mimicing gradient descent!). Let's try walking through a simple example manually."
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": 39,
419 |    "metadata": {},
420 |    "outputs": [],
421 |    "source": [
422 |     "#Start by splitting our data into training and testing\n",
423 |     "train_df = data_df.sample(frac=0.8)\n",
424 |     "test_df = data_df[~data_df.index.isin(train_df.index)]\n",
425 |     "\n",
426 |     "X_tr = train_df.drop('Y',axis=1)\n",
427 |     "Y_tr = train_df['Y']\n",
428 |     "\n",
429 |     "X_tst = test_df.drop('Y',axis=1)\n",
430 |     "Y_tst = test_df['Y']"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "We start by creating our initial predictions, here, by fitting a decision tree to our data."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 40,
443 |    "metadata": {},
444 |    "outputs": [
445 |     {
446 |      "name": "stdout",
447 |      "output_type": "stream",
448 |      "text": [
449 |       "Our initial training MSE is  0.4879983243722546\n"
450 |      ]
451 |     }
452 |    ],
453 |    "source": [
454 |     "# Start with our base prediction using a decision tree with only 5 layers\n",
455 |     "from sklearn.tree import DecisionTreeRegressor\n",
456 |     "\n",
457 |     "base_tree = DecisionTreeRegressor(max_depth=5)\n",
458 |     "\n",
459 |     "base_tree.fit(X_tr, Y_tr)\n",
460 |     "\n",
461 |     "#Current MSE\n",
462 |     "print('Our initial training MSE is ', np.mean((base_tree.predict(X_tr) - Y_tr)**2))"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "markdown",
467 |    "metadata": {},
468 |    "source": [
469 |     "Next, we want to compute the gradient so we can construct a training dataset for our second tree. Since our objective is mean squared error, our gradient is going to be $\\hat{y} - y$"
470 |    ]
471 |   },
472 |   {
473 |    "cell_type": "code",
474 |    "execution_count": 46,
475 |    "metadata": {},
476 |    "outputs": [
477 |     {
478 |      "data": {
479 |       "text/plain": [
480 |        "DecisionTreeRegressor(max_depth=5)"
481 |       ]
482 |      },
483 |      "execution_count": 46,
484 |      "metadata": {},
485 |      "output_type": "execute_result"
486 |     }
487 |    ],
488 |    "source": [
489 |     "residuals =  base_tree.predict(X_tr) - Y_tr\n",
490 |     "\n",
491 |     "second_tree = DecisionTreeRegressor(max_depth=5)\n",
492 |     "second_tree.fit(X_tr, residuals)"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "markdown",
497 |    "metadata": {},
498 |    "source": [
499 |     "Next we figure out the step size using line search (we'll just manually try gamma values)"
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "code",
504 |    "execution_count": 48,
505 |    "metadata": {},
506 |    "outputs": [
507 |     {
508 |      "name": "stdout",
509 |      "output_type": "stream",
510 |      "text": [
511 |       "The best step size was  1.0  for a new MSE of  0.35254102596712683\n"
512 |      ]
513 |     }
514 |    ],
515 |    "source": [
516 |     "best_mse = 99999\n",
517 |     "best_gamma = None\n",
518 |     "\n",
519 |     "for gamma in np.linspace(0, 1, 100):\n",
520 |     "    mse =  np.mean((base_tree.predict(X_tr) - gamma*second_tree.predict(X_tr) - Y_tr)**2)\n",
521 |     "    if mse < best_mse:\n",
522 |     "        best_gamma = gamma\n",
523 |     "        best_mse = mse\n",
524 |     "\n",
525 |     "print('The best step size was ', best_gamma,' for a new MSE of ', best_mse)"
526 |    ]
527 |   },
528 |   {
529 |    "cell_type": "markdown",
530 |    "metadata": {},
531 |    "source": [
532 |     "We could now continue this process and try to add in a third tree and so on. Instead, let's show how to do this with scikit-learn."
533 |    ]
534 |   },
535 |   {
536 |    "cell_type": "code",
537 |    "execution_count": 17,
538 |    "metadata": {},
539 |    "outputs": [],
540 |    "source": [
541 |     "from sklearn.ensemble import GradientBoostingRegressor"
542 |    ]
543 |   },
544 |   {
545 |    "cell_type": "code",
546 |    "execution_count": 51,
547 |    "metadata": {},
548 |    "outputs": [],
549 |    "source": [
550 |     "GradientBoostingRegressor?"
551 |    ]
552 |   },
553 |   {
554 |    "cell_type": "markdown",
555 |    "metadata": {},
556 |    "source": [
557 |     "The gradient boosted trees implementation allows us to pick a loss function, \n",
558 |     "a fixed learning rate, and all the usual decision tree hyperparameters."
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": 53,
564 |    "metadata": {},
565 |    "outputs": [
566 |     {
567 |      "name": "stdout",
568 |      "output_type": "stream",
569 |      "text": [
570 |       "The gradient boosted MSE is  0.16017001437539927\n"
571 |      ]
572 |     }
573 |    ],
574 |    "source": [
575 |     "grad_boost_tree = GradientBoostingRegressor(\n",
576 |     "                    loss = 'ls',\n",
577 |     "                    learning_rate = 1)\n",
578 |     "\n",
579 |     "grad_boost_tree.fit(X_tr, Y_tr)\n",
580 |     "\n",
581 |     "print('The gradient boosted MSE is ', np.mean((grad_boost_tree.predict(X_tr) - Y_tr)**2))"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "markdown",
586 |    "metadata": {},
587 |    "source": [
588 |     "We can also compare the test set error:"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 54,
594 |    "metadata": {},
595 |    "outputs": [
596 |     {
597 |      "name": "stdout",
598 |      "output_type": "stream",
599 |      "text": [
600 |       "The original tree MSE is  0.5142762445291711\n",
601 |       "The one-step boosted tree MSE is  0.9159771034480831\n",
602 |       "The gradient boosted test MSE is  0.2669416618696991\n"
603 |      ]
604 |     }
605 |    ],
606 |    "source": [
607 |     "print('The original tree MSE is ', np.mean((base_tree.predict(X_tst) - Y_tst)**2))\n",
608 |     "print('The one-step boosted tree MSE is ', np.mean((base_tree.predict(X_tst) + best_gamma*second_tree.predict(X_tst) - Y_tst)**2))\n",
609 |     "print('The gradient boosted test MSE is ', np.mean((grad_boost_tree.predict(X_tst) - Y_tst)**2))"
610 |    ]
611 |   },
612 |   {
613 |    "cell_type": "code",
614 |    "execution_count": null,
615 |    "metadata": {},
616 |    "outputs": [],
617 |    "source": []
618 |   }
619 |  ],
620 |  "metadata": {
621 |   "@webio": {
622 |    "lastCommId": null,
623 |    "lastKernelId": null
624 |   },
625 |   "kernelspec": {
626 |    "display_name": "Python 3 (ipykernel)",
627 |    "language": "python",
628 |    "name": "python3"
629 |   },
630 |   "language_info": {
631 |    "codemirror_mode": {
632 |     "name": "ipython",
633 |     "version": 3
634 |    },
635 |    "file_extension": ".py",
636 |    "mimetype": "text/x-python",
637 |    "name": "python",
638 |    "nbconvert_exporter": "python",
639 |    "pygments_lexer": "ipython3",
640 |    "version": "3.9.7"
641 |   }
642 |  },
643 |  "nbformat": 4,
644 |  "nbformat_minor": 2
645 | }
646 | 


--------------------------------------------------------------------------------
/great_embedder.py:
--------------------------------------------------------------------------------
  1 | # Works in py36.6, tt 1.15.0, tensorflow-hub 0.4.0
  2 | # Look here for help with installation: https://www.tensorflow.org/hub
  3 | # Look here for help with the universal sentence encoder: 
  4 | # https://towardsdatascience.com/use-cases-of-googles-universal-sentence-encoder-in-production-dd5aaab4fc15
  5 | # 
  6 | # It's possible that this is not the most efficient code . . .
  7 |  
  8 | import pandas as pd
  9 | import numpy as np
 10 | import tensorflow as tf
 11 | import tensorflow_hub as hub
 12 | import time
 13 | 
 14 | top_cutoff = 2 #29142
 15 | inds = [4, 5, 6, 9, 10, 11, 12, 13, 14]
 16 | 
 17 | data = pd.read_csv('airbnblala/analysisData.csv')
 18 | print(data.shape)
 19 | n = data.shape[0]
 20 | 
 21 | # f = open("All_embeddings.csv", 'w')
 22 | # f2 = open("All_embeddings2.csv", 'w')
 23 | 
 24 | s = []
 25 | for ind in inds:
 26 |   for i in range(512):
 27 |     s.append(str(ind+1) + ':' + str(i))
 28 | #f.write(','.join(s) + '\n')
 29 | #f2.write(','.join([str(ind + 1) for ind in inds]) + '\n')
 30 | 
 31 | def text(b, t):
 32 |     s = []
 33 |     for rownum in range(b, t):
 34 |         for i in inds:
 35 |             s.append(str(data.iloc[rownum, i]))
 36 |     print(len(s))
 37 |     return s
 38 | 
 39 | 
 40 | module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
 41 | 
 42 | #with tf.device('/job:localhost/replica:0/task:0/device:XLA_GPU:0'):
 43 | embed = hub.Module(module_url)
 44 | 
 45 | 
 46 | #word = "Elephant"
 47 | #sentence = "I am a sentence for which I would like to get its embedding."
 48 | #paragraph = (
 49 | #    "Universal Sentence Encoder embeddings also support short paragraphs. "
 50 | #    "There is no hard limit on how long the paragraph is. Roughly, the longer "
 51 | #    "the more 'diluted' the embedding will be.")
 52 | #messages = [word, sentence, paragraph]
 53 | 
 54 | 
 55 | tf.logging.set_verbosity(tf.logging.ERROR)
 56 | 
 57 | #    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
 58 | start = time.time()
 59 | step = 1000
 60 | 
 61 | for rownum in range(0, n, step):
 62 |     bot = rownum
 63 |     top = min(n-1, rownum + step)
 64 |     print(top)
 65 |     messages = text(bot, top)
 66 |     t1 = time.time()
 67 |     with tf.Session() as session:
 68 |         session.run([tf.global_variables_initializer(), tf.tables_initializer()])
 69 |         message_embeddings = np.array(session.run(embed(messages)))
 70 |     print(message_embeddings.shape)
 71 |     mb = message_embeddings.reshape(top-bot, len(inds)*512)
 72 |     mb2 = message_embeddings.reshape(top - bot, len(inds), 512)
 73 |     f.write('\n'.join([','.join([str(f) for f in row]) for row in mb]) + '\n')
 74 |     f2.write('\n'.join([','.join([str(arr) for arr in row]) for row in mb2]) + '\n')
 75 |    i print(time.time() - t1)
 76 | 
 77 | bot = n-1
 78 | top = n
 79 | print(top)
 80 | messages = text(bot, top)
 81 | t1 = time.time()
 82 | with tf.Session() as session:
 83 |     session.run([tf.global_variables_initializer(), tf.tables_initializer()])
 84 |     message_embeddings = np.array(session.run(embed(messages)))
 85 | print(message_embeddings.shape)
 86 | #mb = message_embeddings.reshape(top-bot, len(inds)*512)
 87 | mb2 = message_embeddings.reshape(len(inds), 512)
 88 | bigList = []
 89 | for arr in mb2:
 90 |     bigList += [str(val) for val in arr]
 91 | 
 92 | 
 93 | 
 94 | f = open("All_embeddings.csv", 'a')
 95 | f.write(','.join(bigList))
 96 | f.close()
 97 | 
 98 | 
 99 | #        row = []
100 | #        for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
101 | #            row.append(','.join([str(x) for x in message_embedding]))
102 | #        f.write(','.join(row) + '\n')
103 | stop = time.time()
104 | print(stop - start)
105 | 
106 | #f.close()
107 | 
108 | 
109 | 


--------------------------------------------------------------------------------
/julia/GitHub Tutorials.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# GitHub Tutorials\n",
 8 |     " * [GitHub Hello World](https://guides.github.com/activities/hello-world/)\n",
 9 |     " * [GitHub Desktop Tutorial](https://guides.github.com/introduction/getting-your-project-on-github/)\n",
10 |     " * [Git command line (optional)](https://git-scm.com/docs/gittutorial)"
11 |    ]
12 |   }
13 |  ],
14 |  "metadata": {
15 |   "kernelspec": {
16 |    "display_name": "Julia 0.5.0-rc4",
17 |    "language": "julia",
18 |    "name": "julia-0.5"
19 |   },
20 |   "language_info": {
21 |    "file_extension": ".jl",
22 |    "mimetype": "application/julia",
23 |    "name": "julia",
24 |    "version": "0.5.0"
25 |   }
26 |  },
27 |  "nbformat": 4,
28 |  "nbformat_minor": 0
29 | }
30 | 


--------------------------------------------------------------------------------
/julia/Julia Syntax Tutorial.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Julia Syntax Tutorial\n",
  8 |     "\n",
  9 |     "Everything covered today (and **even more**) can be found [here](https://docs.julialang.org/en/v1/). Code in this notebook has been tested to be compatible with Julia 1.0.3 and 1.2.0. \n",
 10 |     "\n",
 11 |     "To test a specific part of your code, you may create a new cell, paste the code there and run it."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "The following cell brings the linear algebra package to the main namespace: "
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": null,
 24 |    "metadata": {},
 25 |    "outputs": [],
 26 |    "source": [
 27 |     "using LinearAlgebra"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "# Variables"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "x = 3 \n",
 44 |     "x = 3.0\n",
 45 |     "words = \"Hello world!\"\n",
 46 |     "character = 'a'\n",
 47 |     "δ = 1e-5\n",
 48 |     "pi\n",
 49 |     "MathConstants.e\n",
 50 |     "α̂₁ = pi / 2"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "Then, the arrays."
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": null,
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "u = [1 3 5 7]\n",
 67 |     "v = ones(4)\n",
 68 |     "w = Vector{Float64}(undef, 2)\n",
 69 |     "\n",
 70 |     "X = rand(4, 4)\n",
 71 |     "Identity_matrix = Matrix{Float64}(I, 4, 4)\n",
 72 |     "column = [1,2,3]\n",
 73 |     "row = [1 2 3]\n",
 74 |     "A = [1 2 3; 4 5 6; 7 8 9]\n",
 75 |     "b = [[1 2 3] [4 5 6] [7 8 9]]\n",
 76 |     "B = reshape(b, 3, 3)\n",
 77 |     "C = fill(15, 2, 3)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {},
 83 |    "source": [
 84 |     "Check the number of entries in A:"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "length(A)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "Number of dimensions of A:"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "ndims(A)"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "Size of A:"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "size(A)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "**Exercise 1: Make an array consisting of the programming languages you know, and check its number of entries and dimensionality. The entries should be strings, e.g., \"Julia\".**"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "There are several ways to do value extraction and assignment on arrays."
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "A[3, 1]\n",
149 |     "A[1, :]\n",
150 |     "A[1:2, 2:end]\n",
151 |     "A[2, [1 3]]\n",
152 |     "\n",
153 |     "A[2, 3] = 10\n",
154 |     "A[3, 1:2] = [-2 -3]"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "metadata": {},
161 |    "outputs": [],
162 |    "source": [
163 |     "A"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "markdown",
168 |    "metadata": {},
169 |    "source": [
170 |     "Some Mathematical operations are listed below. You may create new cells to check the effect of each individual command."
171 |    ]
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "metadata": {},
177 |    "outputs": [],
178 |    "source": [
179 |     "2 + 5\n",
180 |     "3.5 ^ 2\n",
181 |     "[1 2] + [2 3]\n",
182 |     "[1 2] * 1.5\n",
183 |     "A[1, :]' * ones(3)\n",
184 |     "A .* B"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "A"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {},
200 |    "outputs": [],
201 |    "source": [
202 |     "B"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "3.0 > 2.0\n",
212 |     "9 ≤ 9\n",
213 |     "1 != 2\n",
214 |     "[1 2] .< [2 3]"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "markdown",
219 |    "metadata": {},
220 |    "source": [
221 |     "Control flows:"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "code",
226 |    "execution_count": null,
227 |    "metadata": {},
228 |    "outputs": [],
229 |    "source": [
230 |     "x = 2\n",
231 |     "y = 1\n",
232 |     "if x < y\n",
233 |     "    println(\"x is less than y: $x < $y\")\n",
234 |     "elseif x > y\n",
235 |     "  println(\"x is greater than y: $x > $y\")\n",
236 |     "else\n",
237 |     "  println(\"x is equal to y: $x = $y\")\n",
238 |     "end"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "i = 1\n",
248 |     "while i <= 5\n",
249 |     "    println(i)\n",
250 |     "    i += 1 \n",
251 |     "end"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "metadata": {},
258 |    "outputs": [],
259 |    "source": [
260 |     "for i = 1:5\n",
261 |     "    if i == 3\n",
262 |     "        continue\n",
263 |     "        #break\n",
264 |     "    end\n",
265 |     "    println(i)\n",
266 |     "end"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "u = [1,3,5,7]\n",
276 |     "for i in u\n",
277 |     "    println(i)\n",
278 |     "end"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "**Exercise 2: Make an array, with each entry being the number of lines of code you have written in the corresponding language in the array of Exercise 1. Then use a loop to sum them up.**"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "markdown",
290 |    "metadata": {},
291 |    "source": [
292 |     "# functions"
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "markdown",
297 |    "metadata": {},
298 |    "source": [
299 |     "The following three syntaxes all define the function f(x) = 2x:"
300 |    ]
301 |   },
302 |   {
303 |    "cell_type": "code",
304 |    "execution_count": null,
305 |    "metadata": {},
306 |    "outputs": [],
307 |    "source": [
308 |     "f1(x) = 2*x\n",
309 |     "function f1(x)\n",
310 |     "    return 2*x\n",
311 |     "end"
312 |    ]
313 |   },
314 |   {
315 |    "cell_type": "code",
316 |    "execution_count": null,
317 |    "metadata": {},
318 |    "outputs": [],
319 |    "source": [
320 |     "# default values for positional arguments\n",
321 |     "function f2(x, mult=2)\n",
322 |     "    return mult*x\n",
323 |     "end\n",
324 |     "f2(2)    # ==4\n",
325 |     "f2(2, 3) # ==6"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": null,
331 |    "metadata": {},
332 |    "outputs": [],
333 |    "source": [
334 |     "# default values for keyword arguments: use a semicolon instead of a comma\n",
335 |     "function f3(x; mult=2)\n",
336 |     "    return mult*x\n",
337 |     "end\n",
338 |     "f3(2)         # ==4\n",
339 |     "f3(2, mult=3) # ==6"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "markdown",
344 |    "metadata": {},
345 |    "source": [
346 |     "We will get an **error** if we don't specify the name of keyword argument:"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": null,
352 |    "metadata": {},
353 |    "outputs": [],
354 |    "source": [
355 |     "f3(2, 3)"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": null,
361 |    "metadata": {},
362 |    "outputs": [],
363 |    "source": [
364 |     "# a changeable number of arguments\n",
365 |     "function f4(x...)\n",
366 |     "    for xi in x\n",
367 |     "        println(xi)\n",
368 |     "    end\n",
369 |     "end\n",
370 |     "\n",
371 |     "f4(1,2,3,4,\"a\",\"b\",\"c\")"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "markdown",
376 |    "metadata": {},
377 |    "source": [
378 |     "**Exercise 3: Use a function to find the second largest number of lines of code in the vector you made in Exercise 2.**"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "markdown",
383 |    "metadata": {},
384 |    "source": [
385 |     "There are more kinds of data structures:"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": null,
391 |    "metadata": {},
392 |    "outputs": [],
393 |    "source": [
394 |     "# list comprehensions: shorthand for loops (not very memory efficient)\n",
395 |     "[x^2 for x in 1:5]\n",
396 |     "[x^2 for x in 1:5 if x>2]\n",
397 |     "[x^k for x in 1:10 for k in 1:5]"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": null,
403 |    "metadata": {},
404 |    "outputs": [],
405 |    "source": [
406 |     "# dictionaries and sets\n",
407 |     "\n",
408 |     "x = Set([1,2,3,3,4,1])\n",
409 |     "d = Dict()\n",
410 |     "d[4] = 7\n",
411 |     "# XXX add more here"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "markdown",
416 |    "metadata": {},
417 |    "source": [
418 |     "# advanced topic for people who like object-oriented programming: types and multiple dispatch"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": null,
424 |    "metadata": {},
425 |    "outputs": [],
426 |    "source": [
427 |     "# x::y is an assertion that x is a variable of type y\n",
428 |     "1.0::Float64"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": null,
434 |    "metadata": {},
435 |    "outputs": [],
436 |    "source": [
437 |     "# if we assert something false, we get a type error \n",
438 |     "1.0::Int"
439 |    ]
440 |   },
441 |   {
442 |    "cell_type": "code",
443 |    "execution_count": null,
444 |    "metadata": {},
445 |    "outputs": [],
446 |    "source": [
447 |     "# we can define many functions with the same name, which call arguments of different types\n",
448 |     "# julia decides which to call by looking at the type of the argument\n",
449 |     "\n",
450 |     "f(x::Int) = println(\"$x is an integer\")\n",
451 |     "f(x::Float64) = println(\"$x is an float\")\n",
452 |     "f(2)\n",
453 |     "f(2.0)"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "markdown",
458 |    "metadata": {},
459 |    "source": [
460 |     "We may define a composite type as follows:"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": null,
466 |    "metadata": {},
467 |    "outputs": [],
468 |    "source": [
469 |     "struct Student\n",
470 |     "    name::String    \n",
471 |     "    gpa::Float64\n",
472 |     "end"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": null,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "s1 = Student(\"Alice\", 3.9)\n",
482 |     "s2 = Student(\"Bob\", 3.2)"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "code",
487 |    "execution_count": null,
488 |    "metadata": {},
489 |    "outputs": [],
490 |    "source": [
491 |     "s1.name"
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": null,
497 |    "metadata": {},
498 |    "outputs": [],
499 |    "source": [
500 |     "f(s::Student) = println(\"$(s.name) has a $(s.gpa) gpa\")\n",
501 |     "\n",
502 |     "# we've defined the \"f\" function on three different types\n",
503 |     "# julia knows to call the write one by looking at the type of the argument\n",
504 |     "\n",
505 |     "f(s1)\n",
506 |     "f(s2)\n",
507 |     "f(4)\n",
508 |     "f(sqrt(2))"
509 |    ]
510 |   }
511 |  ],
512 |  "metadata": {
513 |   "kernelspec": {
514 |    "display_name": "Julia 1.2.0",
515 |    "language": "julia",
516 |    "name": "julia-1.2"
517 |   },
518 |   "language_info": {
519 |    "file_extension": ".jl",
520 |    "mimetype": "application/julia",
521 |    "name": "julia",
522 |    "version": "1.2.0"
523 |   }
524 |  },
525 |  "nbformat": 4,
526 |  "nbformat_minor": 1
527 | }
528 | 


--------------------------------------------------------------------------------
/julia/QR.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/plain": [
 11 |        "Plots.PyPlotBackend()"
 12 |       ]
 13 |      },
 14 |      "execution_count": 1,
 15 |      "metadata": {},
 16 |      "output_type": "execute_result"
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "using Random\n",
 21 |     "using LinearAlgebra\n",
 22 |     "using Statistics\n",
 23 |     "using Plots\n",
 24 |     "using LaTeXStrings\n",
 25 |     "pyplot()"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 2,
 31 |    "metadata": {},
 32 |    "outputs": [
 33 |     {
 34 |      "data": {
 35 |       "text/plain": [
 36 |        "6×4 Array{Float64,2}:\n",
 37 |        " -0.890685   0.764352    -0.244713    0.922554\n",
 38 |        "  1.09337    0.00810613   0.754116   -0.871233\n",
 39 |        "  1.36687    1.44676      0.582838    1.27395 \n",
 40 |        " -0.109862   1.23566      0.981122   -1.00478 \n",
 41 |        "  1.1635    -0.883671    -0.0394113   0.463097\n",
 42 |        "  0.458859   0.43612      0.693942   -0.874769"
 43 |       ]
 44 |      },
 45 |      "execution_count": 2,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "# generate random data matrix\n",
 52 |     "n,d = 6,4\n",
 53 |     "X = randn(n,d)\n",
 54 |     "\n",
 55 |     "# optional: give it linearly dependent columns\n",
 56 |     "# X[:,3] = X[:,2]"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "# Understanding the pseudoinverse"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 3,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "data": {
 73 |       "text/plain": [
 74 |        "4×6 Array{Float64,2}:\n",
 75 |        " -1.26168   0.00544988   0.715924  -0.900509  -1.1001    0.158552\n",
 76 |        " -1.20029  -0.246117     0.849552  -0.820945  -1.83688   0.187011\n",
 77 |        "  3.03502   0.567742    -1.36888    2.44695    3.70982  -0.204841\n",
 78 |        "  1.36334   0.047329    -0.327362   0.745252   1.55705  -0.260946"
 79 |       ]
 80 |      },
 81 |      "execution_count": 3,
 82 |      "metadata": {},
 83 |      "output_type": "execute_result"
 84 |     }
 85 |    ],
 86 |    "source": [
 87 |     "# form pseudoinverse\n",
 88 |     "Xd = pinv(X)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 4,
 94 |    "metadata": {},
 95 |    "outputs": [
 96 |     {
 97 |      "data": {
 98 |       "text/plain": [
 99 |        "4×4 Array{Float64,2}:\n",
100 |        "  1.0          -1.25141e-15  -1.04455e-15  -4.57249e-16\n",
101 |        " -6.39156e-16   1.0          -4.04297e-16  -2.45006e-16\n",
102 |        "  3.0523e-16    1.42441e-15   1.0           1.36557e-15\n",
103 |        "  6.93224e-16   3.62154e-16   7.95279e-16   1.0        "
104 |       ]
105 |      },
106 |      "execution_count": 4,
107 |      "metadata": {},
108 |      "output_type": "execute_result"
109 |     }
110 |    ],
111 |    "source": [
112 |     "# X†X ≈ I_d\n",
113 |     "Xd*X"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 5,
119 |    "metadata": {},
120 |    "outputs": [
121 |     {
122 |      "data": {
123 |       "text/plain": [
124 |        "6×6 Array{Float64,2}:\n",
125 |        "  0.721359   -0.288244    0.0446677   0.263312    0.104445   -0.188887 \n",
126 |        " -0.288244    0.390872    0.0425729   0.204752    0.22337     0.247742 \n",
127 |        "  0.0446677   0.0425729   0.992797   -0.0430009  -0.0153952   0.0354588\n",
128 |        "  0.263312    0.204752   -0.0430009   0.736462   -0.0736144   0.274883 \n",
129 |        "  0.104445    0.22337    -0.0153952  -0.0736144   0.918078   -0.0935505\n",
130 |        " -0.188887    0.247742    0.0354588   0.274883   -0.0935505   0.240431 "
131 |       ]
132 |      },
133 |      "execution_count": 5,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "# XX† !≈ I_n\n",
140 |     "X*Xd"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 6,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "Q,R = qr(X)\n",
150 |     "Q = Q[:,1:d];"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 7,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "6×4 Array{Float64,2}:\n",
162 |        " -1.11022e-16   1.11022e-16  -3.05311e-16   1.11022e-16\n",
163 |        "  0.0           1.19696e-16   3.33067e-16  -3.33067e-16\n",
164 |        "  0.0          -2.22045e-16   3.33067e-16  -8.88178e-16\n",
165 |        "  1.38778e-17   0.0           1.11022e-16  -2.22045e-16\n",
166 |        "  0.0          -1.11022e-16   8.32667e-17   5.55112e-17\n",
167 |        "  0.0           5.55112e-17   0.0           1.11022e-16"
168 |       ]
169 |      },
170 |      "execution_count": 7,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "X - Q*R"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 8,
182 |    "metadata": {},
183 |    "outputs": [
184 |     {
185 |      "data": {
186 |       "text/plain": [
187 |        "6×4 Array{Float64,2}:\n",
188 |        " -0.382108   -0.361369   -0.272415   -0.608732 \n",
189 |        "  0.46906     0.0267269   0.411939   -0.0211324\n",
190 |        "  0.586391   -0.599405   -0.517968    0.146167 \n",
191 |        " -0.0471311  -0.547338    0.569154   -0.332754 \n",
192 |        "  0.499148    0.421487   -0.0891341  -0.695222 \n",
193 |        "  0.196852   -0.17939     0.394873    0.116512 "
194 |       ]
195 |      },
196 |      "execution_count": 8,
197 |      "metadata": {},
198 |      "output_type": "execute_result"
199 |     }
200 |    ],
201 |    "source": [
202 |     "Q"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 9,
208 |    "metadata": {},
209 |    "outputs": [
210 |     {
211 |      "data": {
212 |       "text/plain": [
213 |        "4×4 Array{Float64,2}:\n",
214 |        " 2.33098   0.146637   0.859695   0.0921657\n",
215 |        " 0.0      -2.27021   -0.918873  -0.218209 \n",
216 |        " 0.0       0.0        0.911363  -2.22865  \n",
217 |        " 0.0       0.0        0.0       -0.446499 "
218 |       ]
219 |      },
220 |      "execution_count": 9,
221 |      "metadata": {},
222 |      "output_type": "execute_result"
223 |     }
224 |    ],
225 |    "source": [
226 |     "R"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 10,
232 |    "metadata": {},
233 |    "outputs": [
234 |     {
235 |      "data": {
236 |       "text/plain": [
237 |        "4×4 Array{Float64,2}:\n",
238 |        "  1.0          -1.53135e-16  -3.56308e-16  -1.06328e-16\n",
239 |        " -1.53135e-16   1.0           2.17932e-16   9.85739e-17\n",
240 |        " -3.56308e-16   2.17932e-16   1.0          -9.20757e-17\n",
241 |        " -1.06328e-16   9.85739e-17  -9.20757e-17   1.0        "
242 |       ]
243 |      },
244 |      "execution_count": 10,
245 |      "metadata": {},
246 |      "output_type": "execute_result"
247 |     }
248 |    ],
249 |    "source": [
250 |     "Q'*Q"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 11,
256 |    "metadata": {},
257 |    "outputs": [],
258 |    "source": [
259 |     "# form data from noisy linear model\n",
260 |     "w♮ = randn(d)\n",
261 |     "y = X*w♮ + .1*randn(n);"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 12,
267 |    "metadata": {},
268 |    "outputs": [
269 |     {
270 |      "data": {
271 |       "text/plain": [
272 |        "4-element Array{Float64,1}:\n",
273 |        " -0.10036720947429924\n",
274 |        "  0.1177262734257617 \n",
275 |        "  0.4330665815147097 \n",
276 |        "  0.10108585641128666"
277 |       ]
278 |      },
279 |      "execution_count": 12,
280 |      "metadata": {},
281 |      "output_type": "execute_result"
282 |     }
283 |    ],
284 |    "source": [
285 |     "# solve least squares problem to estimate w\n",
286 |     "w = R \\ (Q'*y)"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 13,
292 |    "metadata": {},
293 |    "outputs": [
294 |     {
295 |      "data": {
296 |       "text/plain": [
297 |        "0.22139501473072573"
298 |       ]
299 |      },
300 |      "execution_count": 13,
301 |      "metadata": {},
302 |      "output_type": "execute_result"
303 |     }
304 |    ],
305 |    "source": [
306 |     "# how good is our estimate?\n",
307 |     "norm(w - w♮)"
308 |    ]
309 |   },
310 |   {
311 |    "cell_type": "code",
312 |    "execution_count": 14,
313 |    "metadata": {},
314 |    "outputs": [
315 |     {
316 |      "data": {
317 |       "text/plain": [
318 |        "0.0054774834985621"
319 |       ]
320 |      },
321 |      "execution_count": 14,
322 |      "metadata": {},
323 |      "output_type": "execute_result"
324 |     }
325 |    ],
326 |    "source": [
327 |     "# compute mean square error\n",
328 |     "mean((y - X*w).^2)"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 15,
334 |    "metadata": {},
335 |    "outputs": [
336 |     {
337 |      "data": {
338 |       "text/plain": [
339 |        "3.9399232672427924e-16"
340 |       ]
341 |      },
342 |      "execution_count": 15,
343 |      "metadata": {},
344 |      "output_type": "execute_result"
345 |     }
346 |    ],
347 |    "source": [
348 |     "# let's use the shorthand\n",
349 |     "w_backslash = X \\ y\n",
350 |     "norm(w_backslash - w)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 16,
356 |    "metadata": {},
357 |    "outputs": [
358 |     {
359 |      "data": {
360 |       "text/plain": [
361 |        "4-element Array{Float64,1}:\n",
362 |        " -0.10036720947429938\n",
363 |        "  0.11772627342576154\n",
364 |        "  0.43306658151471006\n",
365 |        "  0.1010858564112867 "
366 |       ]
367 |      },
368 |      "execution_count": 16,
369 |      "metadata": {},
370 |      "output_type": "execute_result"
371 |     }
372 |    ],
373 |    "source": [
374 |     "w_backslash"
375 |    ]
376 |   },
377 |   {
378 |    "cell_type": "code",
379 |    "execution_count": null,
380 |    "metadata": {},
381 |    "outputs": [],
382 |    "source": []
383 |   }
384 |  ],
385 |  "metadata": {
386 |   "kernelspec": {
387 |    "display_name": "Julia 1.2.0",
388 |    "language": "julia",
389 |    "name": "julia-1.2"
390 |   },
391 |   "language_info": {
392 |    "file_extension": ".jl",
393 |    "mimetype": "application/julia",
394 |    "name": "julia",
395 |    "version": "1.2.0"
396 |   }
397 |  },
398 |  "nbformat": 4,
399 |  "nbformat_minor": 1
400 | }
401 | 


--------------------------------------------------------------------------------
/julia/SVD.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 14,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "using LinearAlgebra, Random, Statistics"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 16,
 15 |    "metadata": {},
 16 |    "outputs": [
 17 |     {
 18 |      "data": {
 19 |       "text/plain": [
 20 |        "6×4 Array{Float64,2}:\n",
 21 |        "  0.573077   0.468126    0.468126    0.518838\n",
 22 |        "  0.699222   0.0420342   0.0420342   0.12317\n",
 23 |        " -0.156645   0.374166    0.374166    0.0239568\n",
 24 |        "  0.310086  -0.764711   -0.764711   -0.360399\n",
 25 |        " -0.507746   0.543277    0.543277   -2.1375\n",
 26 |        "  3.18472   -0.58651    -0.58651    -1.86139"
 27 |       ]
 28 |      },
 29 |      "execution_count": 16,
 30 |      "metadata": {},
 31 |      "output_type": "execute_result"
 32 |     }
 33 |    ],
 34 |    "source": [
 35 |     "# generate random data matrix\n",
 36 |     "n,d = 6,4\n",
 37 |     "X = randn(n,d)\n",
 38 |     "\n",
 39 |     "# optional: give it linearly dependent columns\n",
 40 |     "X[:,3] = X[:,2]\n",
 41 |     "X"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 17,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "data": {
 51 |       "text/plain": [
 52 |        "SVD{Float64,Float64,Array{Float64,2}}\n",
 53 |        "U factor:\n",
 54 |        "6×4 Array{Float64,2}:\n",
 55 |        " -0.00539504  -0.190392   -0.642291  -0.0379604\n",
 56 |        " -0.122173    -0.169425   -0.241944  -0.701953\n",
 57 |        "  0.0663229    0.101954   -0.298173  -0.462216\n",
 58 |        " -0.177398    -0.0991203   0.651604  -0.500171\n",
 59 |        " -0.152621     0.95091    -0.081925  -0.117032\n",
 60 |        " -0.962228    -0.102945   -0.093368   0.168255\n",
 61 |        "singular values:\n",
 62 |        "4-element Array{Float64,1}:\n",
 63 |        " 3.918752252818832\n",
 64 |        " 2.3622345810143344\n",
 65 |        " 1.4211344525401286\n",
 66 |        " 7.032906093625886e-17\n",
 67 |        "Vt factor:\n",
 68 |        "4×4 Array{Float64,2}:\n",
 69 |        " -0.801491   0.161851   0.161851   0.552467\n",
 70 |        " -0.459291   0.251746   0.251746  -0.813818\n",
 71 |        " -0.382967  -0.640647  -0.640647  -0.180221\n",
 72 |        "  0.0       -0.707107   0.707107   5.55112e-17"
 73 |       ]
 74 |      },
 75 |      "execution_count": 17,
 76 |      "metadata": {},
 77 |      "output_type": "execute_result"
 78 |     }
 79 |    ],
 80 |    "source": [
 81 |     "U,σ,V = svd(X)"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 18,
 87 |    "metadata": {},
 88 |    "outputs": [
 89 |     {
 90 |      "data": {
 91 |       "text/plain": [
 92 |        "4×4 Array{Float64,2}:\n",
 93 |        " 1.0           8.68792e-17   6.79998e-17   6.41545e-17\n",
 94 |        " 8.68792e-17   1.0          -4.68924e-17  -1.55497e-16\n",
 95 |        " 6.79998e-17  -4.68924e-17   1.0          -1.31907e-16\n",
 96 |        " 6.41545e-17  -1.55497e-16  -1.31907e-16   1.0"
 97 |       ]
 98 |      },
 99 |      "execution_count": 18,
100 |      "metadata": {},
101 |      "output_type": "execute_result"
102 |     }
103 |    ],
104 |    "source": [
105 |     "U'*U"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": 19,
111 |    "metadata": {},
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "6×6 Array{Float64,2}:\n",
117 |        "  0.450257    0.214961    0.189291   -0.379704   -0.12316     0.0783736\n",
118 |        "  0.214961    0.594906    0.371219    0.231911   -0.0404891   0.0394824\n",
119 |        "  0.189291    0.371219    0.317344    0.0150248   0.165348   -0.124244\n",
120 |        " -0.379704    0.231911    0.0150248   0.716054   -0.0620265   0.0359058\n",
121 |        " -0.12316    -0.0404891   0.165348   -0.0620265   0.94793     0.0369224\n",
122 |        "  0.0783736   0.0394824  -0.124244    0.0359058   0.0369224   0.973508"
123 |       ]
124 |      },
125 |      "execution_count": 19,
126 |      "metadata": {},
127 |      "output_type": "execute_result"
128 |     }
129 |    ],
130 |    "source": [
131 |     "U*U'"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 20,
137 |    "metadata": {},
138 |    "outputs": [
139 |     {
140 |      "data": {
141 |       "text/plain": [
142 |        "4×4 Array{Float64,2}:\n",
143 |        "  1.0          8.68884e-17  1.27238e-17  -2.13768e-17\n",
144 |        "  8.68884e-17  1.0          5.38631e-17   5.32179e-17\n",
145 |        "  1.27238e-17  5.38631e-17  1.0           8.5981e-17\n",
146 |        " -2.13768e-17  5.32179e-17  8.5981e-17    1.0"
147 |       ]
148 |      },
149 |      "execution_count": 20,
150 |      "metadata": {},
151 |      "output_type": "execute_result"
152 |     }
153 |    ],
154 |    "source": [
155 |     "V'*V"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 21,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "data": {
165 |       "text/plain": [
166 |        "4×4 Array{Float64,2}:\n",
167 |        " 1.0          2.29502e-16  1.73991e-16  1.815e-16\n",
168 |        " 2.29502e-16  1.0          1.21169e-16  1.75265e-16\n",
169 |        " 1.73991e-16  1.21169e-16  1.0          7.33583e-17\n",
170 |        " 1.815e-16    1.75265e-16  7.33583e-17  1.0"
171 |       ]
172 |      },
173 |      "execution_count": 21,
174 |      "metadata": {},
175 |      "output_type": "execute_result"
176 |     }
177 |    ],
178 |    "source": [
179 |     "V*V'"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 22,
185 |    "metadata": {},
186 |    "outputs": [
187 |     {
188 |      "data": {
189 |       "text/plain": [
190 |        "4-element Array{Float64,1}:\n",
191 |        " 3.918752252818832\n",
192 |        " 2.3622345810143344\n",
193 |        " 1.4211344525401286\n",
194 |        " 7.032906093625886e-17"
195 |       ]
196 |      },
197 |      "execution_count": 22,
198 |      "metadata": {},
199 |      "output_type": "execute_result"
200 |     }
201 |    ],
202 |    "source": [
203 |     "σ"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 23,
209 |    "metadata": {},
210 |    "outputs": [
211 |     {
212 |      "name": "stdout",
213 |      "output_type": "stream",
214 |      "text": [
215 |       "Error of full rank svd: 3.222488743844374e-15\n",
216 |       "Error of rank 3 approximation: 3.221816314104331e-15\n",
217 |       "Error of rank 2 approximation: 1.4211344525401282\n",
218 |       "Error of rank 1 approximation: 2.7567690051827887\n"
219 |      ]
220 |     }
221 |    ],
222 |    "source": [
223 |     "# if we have a linearly dependent column, \n",
224 |     "# decomposition is just as good if we ignore the 0 in sigma and reduce r by 1\n",
225 |     "println(\"Error of full rank svd: \", norm(X - U*diagm(σ)*V'))\n",
226 |     "for k=3:-1:1\n",
227 |     "    println(\"Error of rank $k approximation: \", norm(X - U[:,1:k]*diagm(σ[1:k])*(V[:,1:k])'))\n",
228 |     "end"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 24,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "# form data from noisy linear model\n",
238 |     "w♮ = randn(d)\n",
239 |     "y = X*w♮ + .1*randn(n);"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 31,
245 |    "metadata": {},
246 |    "outputs": [
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "4-element Array{Float64,1}:\n",
251 |        " -0.24723376947937492\n",
252 |        "  1.2296831134048805e15\n",
253 |        " -1.2296831134048812e15\n",
254 |        " -1.4601005663102384"
255 |       ]
256 |      },
257 |      "execution_count": 31,
258 |      "metadata": {},
259 |      "output_type": "execute_result"
260 |     }
261 |    ],
262 |    "source": [
263 |     "# solve least squares problem to estimate w\n",
264 |     "w = V*diagm(σ.^(-1))*U'*y"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": 44,
270 |    "metadata": {},
271 |    "outputs": [
272 |     {
273 |      "data": {
274 |       "text/plain": [
275 |        "4-element Array{Float64,1}:\n",
276 |        " -0.24723376947937492\n",
277 |        " -0.37987541412463555\n",
278 |        " -0.3798754141246354\n",
279 |        " -1.3635647571638487"
280 |       ]
281 |      },
282 |      "execution_count": 44,
283 |      "metadata": {},
284 |      "output_type": "execute_result"
285 |     }
286 |    ],
287 |    "source": [
288 |     "# use rank k approximation to design matrix X\n",
289 |     "# k=4 is full rank\n",
290 |     "# when design matrix X has rank 3, k=3 gives 0 error approximation\n",
291 |     "# while k=2 results in some loss of accuracy - but not much!\n",
292 |     "k = 3\n",
293 |     "w = V[:,1:k]*diagm(σ[1:k].^(-1))*(U[:,1:k])'*y"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": 45,
299 |    "metadata": {},
300 |    "outputs": [
301 |     {
302 |      "data": {
303 |       "text/plain": [
304 |        "7.188211828540968e-15"
305 |       ]
306 |      },
307 |      "execution_count": 45,
308 |      "metadata": {},
309 |      "output_type": "execute_result"
310 |     }
311 |    ],
312 |    "source": [
313 |     "# error in normal equations \n",
314 |     "norm(X'*X*w - X'*y)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 46,
320 |    "metadata": {},
321 |    "outputs": [
322 |     {
323 |      "data": {
324 |       "text/plain": [
325 |        "7.222425309945365e-15"
326 |       ]
327 |      },
328 |      "execution_count": 46,
329 |      "metadata": {},
330 |      "output_type": "execute_result"
331 |     }
332 |    ],
333 |    "source": [
334 |     "w[2] += 1\n",
335 |     "w[3] -= 1\n",
336 |     "norm(X'*X*w - X'*y)"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 47,
342 |    "metadata": {},
343 |    "outputs": [
344 |     {
345 |      "data": {
346 |       "text/plain": [
347 |        "4-element Array{Float64,1}:\n",
348 |        " -0.24723376947937492\n",
349 |        "  0.6201245858753645\n",
350 |        " -1.3798754141246354\n",
351 |        " -1.3635647571638487"
352 |       ]
353 |      },
354 |      "execution_count": 47,
355 |      "metadata": {},
356 |      "output_type": "execute_result"
357 |     }
358 |    ],
359 |    "source": [
360 |     "w"
361 |    ]
362 |   },
363 |   {
364 |    "cell_type": "code",
365 |    "execution_count": 39,
366 |    "metadata": {},
367 |    "outputs": [
368 |     {
369 |      "data": {
370 |       "text/plain": [
371 |        "0.5997851656441274"
372 |       ]
373 |      },
374 |      "execution_count": 39,
375 |      "metadata": {},
376 |      "output_type": "execute_result"
377 |     }
378 |    ],
379 |    "source": [
380 |     "# how good is our estimate of w?\n",
381 |     "norm(w - w♮) / norm(w♮)"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 40,
387 |    "metadata": {},
388 |    "outputs": [
389 |     {
390 |      "data": {
391 |       "text/plain": [
392 |        "0.010609982025196208"
393 |       ]
394 |      },
395 |      "execution_count": 40,
396 |      "metadata": {},
397 |      "output_type": "execute_result"
398 |     }
399 |    ],
400 |    "source": [
401 |     "# compute mean square error\n",
402 |     "mean((y - X*w).^2)"
403 |    ]
404 |   },
405 |   {
406 |    "cell_type": "code",
407 |    "execution_count": 41,
408 |    "metadata": {},
409 |    "outputs": [
410 |     {
411 |      "data": {
412 |       "text/plain": [
413 |        "9.646809571707696e-16"
414 |       ]
415 |      },
416 |      "execution_count": 41,
417 |      "metadata": {},
418 |      "output_type": "execute_result"
419 |     }
420 |    ],
421 |    "source": [
422 |     "# let's use the shorthand\n",
423 |     "# backslash finds least norm solution to normal eqns\n",
424 |     "# using SVD when design matrix X is rank deficient\n",
425 |     "w_backslash = X \\ y\n",
426 |     "norm(w_backslash - w)"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "code",
431 |    "execution_count": null,
432 |    "metadata": {},
433 |    "outputs": [],
434 |    "source": []
435 |   }
436 |  ],
437 |  "metadata": {
438 |   "@webio": {
439 |    "lastCommId": null,
440 |    "lastKernelId": null
441 |   },
442 |   "kernelspec": {
443 |    "display_name": "Julia 1.5.1",
444 |    "language": "julia",
445 |    "name": "julia-1.5"
446 |   },
447 |   "language_info": {
448 |    "file_extension": ".jl",
449 |    "mimetype": "application/julia",
450 |    "name": "julia",
451 |    "version": "1.5.1"
452 |   }
453 |  },
454 |  "nbformat": 4,
455 |  "nbformat_minor": 1
456 | }
457 | 


--------------------------------------------------------------------------------
/julia/proxgrad-starter-code.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stderr",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "┌ Info: Precompiling LowRankModels [15d4e49f-4837-5ea3-a885-5b28bfa376dc]\n",
 13 |       "└ @ Base loading.jl:1278\n"
 14 |      ]
 15 |     },
 16 |     {
 17 |      "data": {
 18 |       "text/plain": [
 19 |        "proxgrad_const"
 20 |       ]
 21 |      },
 22 |      "execution_count": 1,
 23 |      "metadata": {},
 24 |      "output_type": "execute_result"
 25 |     }
 26 |    ],
 27 |    "source": [
 28 |     "using Plots, Random, LinearAlgebra, Statistics, SparseArrays\n",
 29 |     "include(\"proxgrad.jl\")"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "# Solving ERM problems\n",
 37 |     "\n",
 38 |     "The file `proxgrad.jl` contains code for solving regularized empirical risk minimization (ERM) problems. It provides the optimization function `proxgrad` together with a large number of predefined loss functions and regularizers.\n",
 39 |     "    \n",
 40 |     "The function `proxgrad` solves regularized ERM problems of the form\n",
 41 |     "$$\n",
 42 |     "\\mbox{minimize} \\quad \\sum_{i=1}^n \\ell(y_i, w^T x_i) + r(w).    \n",
 43 |     "$$\n",
 44 |     "It solves these with the proximal gradient method, which we will learn shortly.\n",
 45 |     "\n",
 46 |     "You can select from a range of losses. For real valued $y$, try:\n",
 47 |     "   * quadratic loss - `QuadLoss()`\n",
 48 |     "   * $\\ell_1$ loss - `L1Loss()`\n",
 49 |     "   * quantile loss (for $\\alpha$ quantile) - `QuantileLoss(α)`\n",
 50 |     " \n",
 51 |     "For Boolean $y$, try\n",
 52 |     "   * hinge loss - `HingeLoss()`\n",
 53 |     "   * logistic loss - `LogisticLoss()`\n",
 54 |     "   * weighted hinge loss - `WeightedHingeLoss()`\n",
 55 |     "\n",
 56 |     "For nominal $y$, try\n",
 57 |     "   * multinomial loss - `MultinomialLoss()`\n",
 58 |     "   * one vs all loss - `OvALoss()`\n",
 59 |     "       * (by default, it uses the logistic loss for the underlying binary classifier)\n",
 60 |     "\n",
 61 |     "For ordinal $y$, try\n",
 62 |     "   * ordinal hinge loss - `OrdinalHingeLoss()`\n",
 63 |     "   * bigger vs smaller loss - `BvSLoss()`\n",
 64 |     "       * (by default, it uses the logistic loss for the underlying binary classifier)\n",
 65 |     "       \n",
 66 |     "It also provides a few regularizers, including \n",
 67 |     "   * no regularization - `ZeroReg()`\n",
 68 |     "   * quadratic regularization - `QuadReg()`\n",
 69 |     "   * $\\ell_1$ regularization - `OneReg()`\n",
 70 |     "   * nonnegative constraint - `NonNegConstraint()`\n",
 71 |     "       \n",
 72 |     "Below, we provide some examples for how to use the proxgrad function to fit regularized ERM problems."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## generate random data set\n",
 80 |     "\n",
 81 |     "First (as usual), we'll generate some random data to try our methods on."
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 2,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "Random.seed!(0)\n",
 91 |     "n = 50\n",
 92 |     "d = 10\n",
 93 |     "X = randn(n,d)\n",
 94 |     "w♮ = randn(d)\n",
 95 |     "y = X*w♮ + .1*randn(n);"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "## Quadratic loss, quadratic regularizer\n",
103 |     "\n",
104 |     "$$\n",
105 |     "\\mbox{minimize} \\quad \\frac 1 n ||Xw - y||^2 + λ||w||^2\n",
106 |     "$$"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 3,
112 |    "metadata": {},
113 |    "outputs": [
114 |     {
115 |      "data": {
116 |       "text/plain": [
117 |        "0.11180045233465635"
118 |       ]
119 |      },
120 |      "execution_count": 3,
121 |      "metadata": {},
122 |      "output_type": "execute_result"
123 |     }
124 |    ],
125 |    "source": [
126 |     "# we form \\frac 1 n || ⋅ ||^2 by multiplying the QuadLoss() function by 1/n\n",
127 |     "loss = 1/n*QuadLoss()\n",
128 |     "\n",
129 |     "# we form λ|| ⋅ ||^2 by multiplying the QuadReg() function by λ\n",
130 |     "λ = .1\n",
131 |     "reg = λ*QuadReg()\n",
132 |     "\n",
133 |     "# minimize 1/n ||Xw - y||^2 + λ||w||^2\n",
134 |     "#w = proxgrad(loss, reg, X, y, maxiters=5, c=.1, stepsize=1, max_inner_iters=10000) \n",
135 |     "w = proxgrad(loss, reg, X, y, maxiters=5)\n",
136 |     "\n",
137 |     "norm(X*w-y) / norm(y)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "`maxiters`, the maximum number of iterations, controls how fully we converge.\n",
145 |     "You can try increasing it to see if the error improves.\n",
146 |     "\n",
147 |     "In the next code block, do you think the error will be \n",
148 |     "* A) higher \n",
149 |     "* B) lower\n",
150 |     "* C) the same"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 4,
156 |    "metadata": {},
157 |    "outputs": [
158 |     {
159 |      "data": {
160 |       "text/plain": [
161 |        "0.09941186768969058"
162 |       ]
163 |      },
164 |      "execution_count": 4,
165 |      "metadata": {},
166 |      "output_type": "execute_result"
167 |     }
168 |    ],
169 |    "source": [
170 |     "w = proxgrad(loss, reg, X, y, maxiters=100) \n",
171 |     "norm(X*w-y) / norm(y)"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "## Hinge loss, quadratic regularizer\n",
179 |     "\n",
180 |     "$$\n",
181 |     "\\mbox{minimize} \\quad \\frac 1 n \\sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2\n",
182 |     "$$"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 5,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "data": {
192 |       "text/plain": [
193 |        "0.1"
194 |       ]
195 |      },
196 |      "execution_count": 5,
197 |      "metadata": {},
198 |      "output_type": "execute_result"
199 |     }
200 |    ],
201 |    "source": [
202 |     "ybool = (y.>=0) # form a boolean target\n",
203 |     "\n",
204 |     "# we form \\frac 1 n \\sum_{i=1}^n (1 - ⋅ )_+ by multiplying the HingeLoss() function by 1/n\n",
205 |     "loss = 1/n*HingeLoss()\n",
206 |     "\n",
207 |     "# we form λ|| ⋅ ||^2 by multiplying the QuadReg() function by λ\n",
208 |     "λ = .1\n",
209 |     "reg = λ*QuadReg()\n",
210 |     "\n",
211 |     "# minimize 1/n \\frac 1 n \\sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2\n",
212 |     "w = proxgrad(loss, reg, X, ybool, maxiters=10) \n",
213 |     "\n",
214 |     "# predict output values using learned classifier\n",
215 |     "yhat = impute(loss, X*w)\n",
216 |     "\n",
217 |     "# misclassification error \n",
218 |     "(n - sum(yhat .== ybool)) / n"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "For nonsmooth problems (like the hinge loss), a smaller stepsize can also help.\n",
226 |     "\n",
227 |     "In the next code block, do you think the error will be\n",
228 |     "\n",
229 |     "* A) higher\n",
230 |     "* B) lower\n",
231 |     "* C) the same"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 7,
237 |    "metadata": {},
238 |    "outputs": [
239 |     {
240 |      "data": {
241 |       "text/plain": [
242 |        "0.1"
243 |       ]
244 |      },
245 |      "execution_count": 7,
246 |      "metadata": {},
247 |      "output_type": "execute_result"
248 |     }
249 |    ],
250 |    "source": [
251 |     "w = proxgrad(loss, reg, X, ybool, maxiters=100, stepsize=.1) \n",
252 |     "yhat = impute(loss, X*w)\n",
253 |     "\n",
254 |     "# misclassification error \n",
255 |     "(n - sum(yhat .== ybool)) / n"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "markdown",
260 |    "metadata": {},
261 |    "source": [
262 |     "# Homework question \n",
263 |     "\n",
264 |     "Use the proxgrad function to fit the following objective\n",
265 |     "    \n",
266 |     "$$\n",
267 |     "\\mbox{minimize} \\quad \\frac 1 n \\sum_{i=1}^n \\log(1 + \\exp(- \\text{ybool}_i w^T x_i)) + λ||w||^2\n",
268 |     "$$\n",
269 |     "for $\\lambda = .5$"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "metadata": {},
276 |    "outputs": [],
277 |    "source": []
278 |   }
279 |  ],
280 |  "metadata": {
281 |   "@webio": {
282 |    "lastCommId": null,
283 |    "lastKernelId": null
284 |   },
285 |   "kernelspec": {
286 |    "display_name": "Julia 1.5.1",
287 |    "language": "julia",
288 |    "name": "julia-1.5"
289 |   },
290 |   "language_info": {
291 |    "file_extension": ".jl",
292 |    "mimetype": "application/julia",
293 |    "name": "julia",
294 |    "version": "1.5.1"
295 |   }
296 |  },
297 |  "nbformat": 4,
298 |  "nbformat_minor": 2
299 | }
300 | 


--------------------------------------------------------------------------------
/python-refresher.ipynb:
--------------------------------------------------------------------------------
1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"python-refresher.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyO08ekooqoupVWgmdf1IIX/"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","metadata":{"id":"gFnSdOj4Vs45","executionInfo":{"status":"ok","timestamp":1630348520194,"user_tz":240,"elapsed":270,"user":{"displayName":"Madeleine Udell","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhUKEdymdlNMsHJJD3vNYOzwpzwn1GIGBL9AgAxnQ=s64","userId":"09278725283779960205"}}},"source":["# import all packages needed in one cell at the top \n","import numpy as np              # for linear algebra\n","import matplotlib.pyplot as plt # for plotting"],"execution_count":3,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_7x81X4PVdto"},"source":["# Basic python"]},{"cell_type":"code","metadata":{"id":"HMKKIgvEVdtp","outputId":"e3126c95-b1e2-44c6-a093-380ab325e39e"},"source":["# basic math \n","2+2+17"],"execution_count":null,"outputs":[{"data":{"text/plain":["21"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"Gl8dPQ0TVyER","outputId":"8b7467e0-437e-4ed8-c229-70f2a9fc3d33"},"source":["2*pi"],"execution_count":null,"outputs":[{"ename":"NameError","evalue":"name 'pi' is not defined","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)","\u001b[0;32m<ipython-input-4-a47d20d835e9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;36m2\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mNameError\u001b[0m: name 'pi' is not defined"]}]},{"cell_type":"code","metadata":{"id":"X-I1GMezVdtr","outputId":"bb5391c5-ff44-4500-8e38-f0d080b4c374"},"source":["2 * np.pi"],"execution_count":null,"outputs":[{"data":{"text/plain":["6.283185307179586"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"jNkTFsQLVdts","outputId":"ce98483f-45de-4013-b3f7-e0bf8e5e8fdc"},"source":["# lists\n","a = [1, 2, 3]\n","a"],"execution_count":null,"outputs":[{"data":{"text/plain":["[1, 2, 3]"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"_W5hgb9zVdtt","outputId":"6aca11b9-2f51-4fc4-ee0f-7f340276e17c"},"source":["a.append(17)\n","a"],"execution_count":null,"outputs":[{"data":{"text/plain":["[1, 2, 3, 17]"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"r8QeEe6TVdtt","outputId":"20460df9-c5dc-4618-d97d-64e045c4d8fa"},"source":["# python uses 0-based indexing \n","a[0]"],"execution_count":null,"outputs":[{"data":{"text/plain":["1"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"VvwJxd6tVdtu","outputId":"05abb6c4-4742-4ba3-e362-ebc982c2b799"},"source":["# dictionaries \n","d = {'a': 1, 'b': 2, 'c': 3}\n","d"],"execution_count":null,"outputs":[{"data":{"text/plain":["{'a': 1, 'b': 2, 'c': 3}"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"jLRqAAlvVdtu","outputId":"c5f7ff00-d086-47d9-b4cf-baa3d099f0f9"},"source":["d['b']"],"execution_count":null,"outputs":[{"data":{"text/plain":["2"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"BPYvLBSeVdtu","outputId":"637d0d90-0bcc-4bb9-e9f1-e5537795df50"},"source":["d['d']"],"execution_count":null,"outputs":[{"ename":"KeyError","evalue":"'d'","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)","\u001b[0;32m<ipython-input-11-0d700facf69e>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'd'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mKeyError\u001b[0m: 'd'"]}]},{"cell_type":"code","metadata":{"id":"fxrQl3HBVdtv","outputId":"04f3bba5-9503-49cf-f1de-85f6108e7ce2"},"source":["d['d'] = 4\n","d['d']"],"execution_count":null,"outputs":[{"data":{"text/plain":["4"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"JHjcpwclVdtv","outputId":"8f0b2d99-ff57-4976-8610-a890b910cf36"},"source":["# for loops\n","for i in range(10):\n","    print(\"hello\", i)\n","    print(\"hello\", i)"],"execution_count":null,"outputs":[{"name":"stdout","output_type":"stream","text":["hello 0\n","hello 0\n","hello 1\n","hello 1\n","hello 2\n","hello 2\n","hello 3\n","hello 3\n","hello 4\n","hello 4\n","hello 5\n","hello 5\n","hello 6\n","hello 6\n","hello 7\n","hello 7\n","hello 8\n","hello 8\n","hello 9\n","hello 9\n"]}]},{"cell_type":"code","metadata":{"id":"Z5W1GUjvVdtw","outputId":"5e63a893-5bad-45b5-e8a3-01ba195dc237"},"source":["# functions, if statements\n","def fibonacci(n=5):\n","    if n==0:\n","        return 1\n","    else:\n","        return n*fibonacci(n-1)\n","    \n","fibonacci()"],"execution_count":null,"outputs":[{"data":{"text/plain":["120"]},"execution_count":51,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"markdown","metadata":{"id":"Njo-8j9YVdtw"},"source":["# Jupyter workflow tips\n","\n","* run a cell with Shift-Enter\n","* Jupyter displays value of last expression in cell \n","* open a new cell below the current one (Alt-Enter) to see values of variables, test assumptions\n","* you can reorder cells using the arrows in the toolbar, or by copy-pasting cells up or down\n","\n","How to troubleshoot and debug in Jupyter\n","* find the line in the cell that's not working as expected \n","* test that every input to that line is what you expect by opening a new cell below the one that's not working, copy pasting code if needed\n","  * check the type (eg, integer? string? array? matrix?)\n","  * check the values (eg, are there NaNs or Infs? are there negative numbers where you expect positive?)"]}]}


--------------------------------------------------------------------------------