├── 4740 ├── Gradient descent.ipynb └── QR.ipynb ├── .gitignore ├── AutoML ├── AutoML_in_class_demo.ipynb ├── README.md └── oboe │ ├── LICENSE.txt │ ├── README.md │ ├── automl │ ├── README.md │ ├── __init__.py │ ├── auto_learner.py │ ├── convex_opt.py │ ├── defaults │ │ ├── classification.json │ │ ├── dataset_sizes.csv │ │ ├── error_matrix.csv │ │ ├── regression.json │ │ ├── runtime_matrix.csv │ │ └── runtime_predictor.pkl │ ├── generate_matrix.sh │ ├── generate_vector.py │ ├── linalg.py │ ├── model.py │ ├── preprocessing.py │ └── util.py │ └── examples │ ├── README.md │ ├── classification.ipynb │ └── error_matrix_generation │ ├── README.md │ ├── dataset_11_features_and_labels.csv │ ├── dataset_18_features_and_labels.csv │ ├── generate.sh │ └── merge.sh ├── Bootstrap.ipynb ├── QR.ipynb ├── README.md ├── RegularizedRegression.ipynb ├── SVD.ipynb ├── eda.ipynb ├── ensembles.ipynb ├── feature_engineering.ipynb ├── forecasting.ipynb ├── gradient_descent.ipynb ├── great_embedder.py ├── julia ├── Classification.ipynb ├── Crime.ipynb ├── Fairness-Income.ipynb ├── GitHub Tutorials.ipynb ├── Gradient descent.ipynb ├── Julia Syntax Tutorial.ipynb ├── Linear models.ipynb ├── LossFunctions_multiclass.ipynb ├── LowRankModelsDemo-long.ipynb ├── LowRankModelsDemo.ipynb ├── Multiclass and Ordinal.ipynb ├── Predicting COVID.ipynb ├── Predicting crime.ipynb ├── ProximalGradient.ipynb ├── QR.ipynb ├── RegularizedRegression.ipynb ├── Robust regression.ipynb ├── SIR.ipynb ├── SVD.ipynb ├── Scikit-learn.ipynb ├── Section-Regularization+Scaling.ipynb ├── Sklearn_demo.ipynb ├── Untitled.ipynb ├── double-descent.ipynb ├── eda.ipynb ├── proxgrad-starter-code.ipynb └── spectralGraphTheory.ipynb ├── linear_models.ipynb ├── python-refresher.ipynb ├── robust_regression.ipynb └── trees.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /4740/QR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# generate random data matrix\n", 12 | "n,d = 6,4\n", 13 | "X = randn(n,d)\n", 14 | "\n", 15 | "# optional: give it linearly dependent columns\n", 16 | "# X[:,3] = X[:,2]" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "Q,R = qr(X)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "collapsed": false 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "Q" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": { 45 | "collapsed": false 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "R" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": false 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "Q'*Q" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "collapsed": true 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# form data from noisy linear model\n", 72 | "β♮ = randn(d)\n", 73 | "y = X*β♮ + .1*randn(n);" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": false 81 | 
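},

Editor's aside: the cell opened above solves the least-squares problem minimize ‖Xβ − y‖² via the QR factorization, using β = R⁻¹(Qᵀy). For readers who want the same computation outside Julia, here is a minimal NumPy sketch of this notebook's workflow (an illustrative translation, not part of the original notebook):

```python
import numpy as np

# Illustrative NumPy translation of this Julia notebook's workflow.
n, d = 6, 4
rng = np.random.default_rng(0)
X = rng.standard_normal((n, d))
beta_true = rng.standard_normal(d)                 # plays the role of β♮
y = X @ beta_true + 0.1 * rng.standard_normal(n)

Q, R = np.linalg.qr(X)                             # thin QR: Q is n x d, R is d x d
beta = np.linalg.solve(R, Q.T @ y)                 # least squares: β = R \ (Q'y)

print(np.linalg.norm(beta - beta_true))            # how good is our estimate?
print(np.mean((y - X @ beta) ** 2))                # mean squared error
beta_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]  # the "backslash" shorthand
print(np.linalg.norm(beta_lstsq - beta))           # agrees with the QR solution
```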
82 | "outputs": [], 83 | "source": [ 84 | "# solve least squares problem to estimate β\n", 85 | "β = R \\ (Q'*y)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "# how good is our estimate?\n", 97 | "norm(β - β♮)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# compute mean squared error\n", 109 | "mean((y - X*β).^2)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "# let's use the shorthand\n", 121 | "β_backslash = X \\ y\n", 122 | "norm(β_backslash - β)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "β_backslash" 134 | ] 135 | } 136 | ], 137 | "metadata": { 138 | "kernelspec": { 139 | "display_name": "Julia 0.5.0", 140 | "language": "julia", 141 | "name": "julia-0.5" 142 | }, 143 | "language_info": { 144 | "file_extension": ".jl", 145 | "mimetype": "application/julia", 146 | "name": "julia", 147 | "version": "0.5.0" 148 | } 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 0 152 | } 153 | -------------------------------------------------------------------------------- /AutoML/AutoML_in_class_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "We use the Airbnb dataset from Homework 3 to illustrate how different AutoML frameworks work: we do model selection on the training set and then evaluate on the test set. The error metric is the balanced error rate: for each class, we average the false positive rate and the false negative rate, and we then average these per-class values across classes." 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import sys\n", 17 | "import pandas as pd\n", 18 | "import os\n", 19 | "import time\n", 20 | "from datetime import datetime\n", 21 | "import numpy as np\n", 22 | "import multiprocessing as mp\n", 23 | "\n", 24 | "from sklearn.datasets import load_iris\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "\n", 27 | "import autosklearn.classification\n", 28 | "from autosklearn.metrics import balanced_accuracy" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "automl_path = 'oboe/automl/'\n", 38 | "sys.path.append(automl_path)\n", 39 | "from auto_learner import AutoLearner\n", 40 | "import util\n", 41 | "\n", 42 | "# disable warnings\n", 43 | "import warnings\n", 44 | "warnings.filterwarnings('ignore')" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "Prepare the Airbnb dataset."
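Editor's aside: the balanced error rate described at the top of this notebook can be made concrete. Below is a minimal sketch of that metric (assuming the repository's `util.error` follows this class-averaged definition; its actual implementation may differ in detail):

```python
import numpy as np

def balanced_error_rate(y_true, y_pred):
    """For each class c (one-vs-rest): average the false positive rate and the
    false negative rate, then average those per-class values across classes."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    per_class = []
    for c in np.unique(y_true):
        pos, neg = y_true == c, y_true != c
        fnr = np.mean(y_pred[pos] != c)                         # missed members of class c
        fpr = np.mean(y_pred[neg] == c) if neg.any() else 0.0   # false alarms for class c
        per_class.append((fpr + fnr) / 2)
    return float(np.mean(per_class))
```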
52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "airbnb_dataset_size = 3000 # number of points to keep in subsampling\n", 61 | "\n", 62 | "df_airbnb = pd.read_csv(\"airbnb.csv\", index_col=None, header=0)\n", 63 | "df_airbnb.drop(df_airbnb[df_airbnb.price.isna()].index, inplace=True) # drop rows with missing price\n", 64 | "features_real = [\n", 65 | " \"host_listings_count\",\n", 66 | " \"host_total_listings_count\",\n", 67 | " \"accommodates\",\n", 68 | " \"bathrooms\",\n", 69 | " \"bedrooms\",\n", 70 | " \"guests_included\",\n", 71 | " \"extra_people\",\n", 72 | " \"minimum_nights\",\n", 73 | " \"maximum_nights\",\n", 74 | " \"availability_30\",\n", 75 | " \"availability_60\",\n", 76 | " \"availability_90\",\n", 77 | " \"availability_365\",\n", 78 | " \"number_of_reviews\",\n", 79 | " \"review_scores_rating\",\n", 80 | " \"review_scores_accuracy\",\n", 81 | " \"review_scores_cleanliness\",\n", 82 | " \"review_scores_checkin\",\n", 83 | " \"review_scores_communication\",\n", 84 | " \"review_scores_location\",\n", 85 | " \"price\"\n", 86 | "]\n", 87 | "\n", 88 | "label = [\"review_scores_value\"]\n", 89 | "x = df_airbnb[features_real].values\n", 90 | "y = df_airbnb[label].values.flatten()\n", 91 | "\n", 92 | "np.random.seed(0)\n", 93 | "idx_to_keep = np.random.choice(np.arange(y.shape[0]), size=airbnb_dataset_size, replace=False)\n", 94 | "x = x[idx_to_keep]\n", 95 | "y = y[idx_to_keep]\n", 96 | " \n", 97 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Part I: auto-sklearn" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "We may restrict the estimator search space so that auto-sklearn only searches for a good classifier among these models." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "include_estimators = [\"adaboost\",\"gaussian_nb\", \"extra_trees\", \"gradient_boosting\", \n", 121 | " \"liblinear_svc\", \"libsvm_svc\",\"random_forest\",\n", 122 | " \"k_nearest_neighbors\",\"decision_tree\"]" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "We may also specify a running time limit." 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "runtime_limit = 120" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# A wrapper function for the auto-sklearn learner.\n", 148 | "def AutoSklearn(total_runtime, train_features, train_labels):\n", 149 | " clf = autosklearn.classification.AutoSklearnClassifier(\n", 150 | " time_left_for_this_task=total_runtime,\n", 151 | " tmp_folder='tmp/autosklearn_tmp_'+str(datetime.now()), \n", 152 | " output_folder='tmp/autosklearn_output_'+str(datetime.now()),\n", 153 | " metric=balanced_accuracy,\n", 154 | " include_estimators = include_estimators,\n", 155 | " )\n", 156 | " \n", 157 | " clf.fit(train_features, train_labels) \n", 158 | " return clf" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "Run auto-sklearn for 120 seconds."
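Editor's aside: after the fit below completes, auto-sklearn can also summarize its search. A small sketch (assuming the installed auto-sklearn version exposes `sprint_statistics()`, as recent releases do):

```python
# after the next cell has produced clf:
print(clf.sprint_statistics())  # e.g. number of target algorithm runs, best validation score
```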
166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "runtime = 120\n", 175 | "clf = AutoSklearn(runtime, x_train, y_train)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "Get predicted training and test labels." 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "y_train_pred_autosklearn = clf.predict(x_train)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 9, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "y_test_pred_autosklearn = clf.predict(x_test)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "Show which models the learner has picked." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": { 214 | "scrolled": true 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "\"[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'weighting', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'one_hot_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'median', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'polynomial', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.21794354428393548, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 2, 'classifier:random_forest:min_samples_split': 16, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'data_preprocessing:categorical_transformer:category_coalescence:minority_coalescer:minimum_fraction': 0.0025451910134387575, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:n_quantiles': 1477, 'data_preprocessing:numerical_transformer:rescaling:quantile_transformer:output_distribution': 'uniform', 'feature_preprocessor:polynomial:degree': 2, 'feature_preprocessor:polynomial:include_bias': 'True', 'feature_preprocessor:polynomial:interaction_only': 'False'},\\ndataset_properties={\\n 'task': 2,\\n 'sparse': False,\\n 'multilabel': False,\\n 'multiclass': True,\\n 'target_type': 'classification',\\n 'signed': False})),\\n]\"" 221 | ] 222 | }, 223 | "execution_count": 10, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "clf.show_models()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Show the error on test dataset." 
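Editor's aside: the two cells that follow report the error on the training set and on the test set, respectively. Using the `balanced_error_rate` sketch from the top of this notebook, the equivalent explicit computation would be (assuming `util.error` matches that definition):

```python
train_ber = balanced_error_rate(y_train, y_train_pred_autosklearn)  # ~0.10 below
test_ber = balanced_error_rate(y_test, y_test_pred_autosklearn)     # ~0.19 below
```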
237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 11, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "0.10065950071453614" 248 | ] 249 | }, 250 | "execution_count": 11, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "util.error(y_train, y_train_pred_autosklearn, 'classification')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 12, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [ 266 | { 267 | "data": { 268 | "text/plain": [ 269 | "0.18850923114927096" 270 | ] 271 | }, 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "output_type": "execute_result" 275 | } 276 | ], 277 | "source": [ 278 | "util.error(y_test, y_test_pred_autosklearn, 'classification')" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "# Part II: TPOT" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "TPOT is an AutoML tool that optimizes machine learning pipelines by genetic programming." 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 13, 298 | "metadata": { 299 | "scrolled": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "from tpot import TPOTClassifier" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "Run TPOT for 120 seconds." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 14, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "application/vnd.jupyter.widget-view+json": { 321 | "model_id": "681aa82e4f3645c4a8e3dc17a89200a3", 322 | "version_major": 2, 323 | "version_minor": 0 324 | }, 325 | "text/plain": [ 326 | "HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=20.0, style=ProgressStyle(des…" 327 | ] 328 | }, 329 | "metadata": {}, 330 | "output_type": "display_data" 331 | }, 332 | { 333 | "name": "stdout", 334 | "output_type": "stream", 335 | "text": [ 336 | "\r\n", 337 | "\r", 338 | "0.58 minutes have elapsed. TPOT will close down.\n", 339 | "TPOT closed during evaluation in one generation.\n", 340 | "WARNING: TPOT may not provide a good pipeline if TPOT is stopped/interrupted in a early generation.\n", 341 | "\r\n", 342 | "\r\n", 343 | "TPOT closed prematurely. 
Will use the current best pipeline.\n", 344 | "\r\n", 345 | "Best pipeline: DecisionTreeClassifier(SelectFwe(input_matrix, alpha=0.042), criterion=entropy, max_depth=10, min_samples_leaf=20, min_samples_split=7)\n" 346 | ] 347 | }, 348 | { 349 | "data": { 350 | "text/plain": [ 351 | "TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,\n", 352 | " disable_update_check=False, early_stop=None, generations=5,\n", 353 | " log_file=,\n", 354 | " max_eval_time_mins=5, max_time_mins=0.5, memory=None,\n", 355 | " mutation_rate=0.9, n_jobs=1, offspring_size=None,\n", 356 | " periodic_checkpoint_folder=None, population_size=20,\n", 357 | " random_state=None, scoring=None, subsample=1.0, template=None,\n", 358 | " use_dask=False, verbosity=2, warm_start=False)" 359 | ] 360 | }, 361 | "execution_count": 14, 362 | "metadata": {}, 363 | "output_type": "execute_result" 364 | } 365 | ], 366 | "source": [ 367 | "tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2, max_time_mins=.5)\n", 368 | "tpot.fit(x_train, y_train)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 15, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "y_train_pred_tpot = tpot.predict(x_train)\n", 378 | "y_test_pred_tpot = tpot.predict(x_test)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "Show the error on test dataset." 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 16, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "0.3397069433952885" 397 | ] 398 | }, 399 | "execution_count": 16, 400 | "metadata": {}, 401 | "output_type": "execute_result" 402 | } 403 | ], 404 | "source": [ 405 | "#tpot training error\n", 406 | "util.error(y_train, y_train_pred_tpot, 'classification')" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 18, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "0.3557644506500882" 418 | ] 419 | }, 420 | "execution_count": 18, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "#tpot test error\n", 427 | "util.error(y_test, y_test_pred_tpot, 'classification')" 428 | ] 429 | }, 430 | { 431 | "cell_type": "markdown", 432 | "metadata": {}, 433 | "source": [ 434 | "# Part III: Oboe (still under development)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "markdown", 439 | "metadata": {}, 440 | "source": [ 441 | "## Oboe Example 1: build an ensemble of models" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 19, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "#experimental settings\n", 451 | "VERBOSE = False #whether to print out information indicating current fitting progress\n", 452 | "N_CORES = 1 #number of cores\n", 453 | "RUNTIME_BUDGET = 30" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 20, 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [ 462 | "#optional: limit the types of algorithms\n", 463 | "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 21, 469 | "metadata": {}, 470 | "outputs": [], 471 | "source": [ 472 | "#autolearner arguments\n", 473 | "autolearner_kwargs = {\n", 474 | " 'p_type': 'classification',\n", 475 | " 'runtime_limit': RUNTIME_BUDGET,\n", 476 | " 'verbose': VERBOSE,\n", 477 | " 'selection_method': 'min_variance',\n", 478 | 
" 'algorithms': s,\n", 479 | " 'stacking_alg': 'greedy',\n", 480 | " 'n_cores': N_CORES,\n", 481 | " 'build_ensemble': True,\n", 482 | "}" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 22, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "#intialize the autolearner class\n", 492 | "m = AutoLearner(**autolearner_kwargs)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 23, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "# fit autolearner on training set and record runtime\n", 502 | "start = time.time()\n", 503 | "m.fit(x_train, y_train)\n", 504 | "elapsed_time = time.time() - start" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 24, 510 | "metadata": {}, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "prediction error: 0.3139487158460067\n", 517 | "elapsed time: 27.216959714889526\n", 518 | "individual accuracies of selected models: [0.31651432260061413, 0.32036420549246775, 0.26796126609153437, 0.42171443806106124, 0.31651432260061413]\n" 519 | ] 520 | } 521 | ], 522 | "source": [ 523 | "# use the fitted autolearner for prediction on test set\n", 524 | "y_predicted = m.predict(x_test)\n", 525 | "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n", 526 | "print(\"elapsed time: {}\".format(elapsed_time))\n", 527 | "print(\"individual accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 25, 533 | "metadata": { 534 | "scrolled": true 535 | }, 536 | "outputs": [ 537 | { 538 | "data": { 539 | "text/plain": [ 540 | "{'ensemble method': 'greedy selection',\n", 541 | " 'base learners': {'DT': [{'min_samples_split': 0.0001},\n", 542 | " {'min_samples_split': 4},\n", 543 | " {'min_samples_split': 1024},\n", 544 | " {'min_samples_split': 1e-05}],\n", 545 | " 'GNB': [{}]}}" 546 | ] 547 | }, 548 | "execution_count": 25, 549 | "metadata": {}, 550 | "output_type": "execute_result" 551 | } 552 | ], 553 | "source": [ 554 | "# get names of the selected machine learning models\n", 555 | "m.get_models()" 556 | ] 557 | }, 558 | { 559 | "cell_type": "markdown", 560 | "metadata": {}, 561 | "source": [ 562 | "## Oboe Example 2: just select a collection of promising models without building an ensemble afterwards" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 26, 568 | "metadata": {}, 569 | "outputs": [], 570 | "source": [ 571 | "#experimental settings\n", 572 | "VERBOSE = False #whether to print out information indicating current fitting progress\n", 573 | "N_CORES = 1 #number of cores\n", 574 | "RUNTIME_BUDGET = 30" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 27, 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "#optional: limit the types of algorithms\n", 584 | "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 28, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "#autolearner arguments\n", 594 | "autolearner_kwargs = {\n", 595 | " 'p_type': 'classification',\n", 596 | " 'runtime_limit': RUNTIME_BUDGET,\n", 597 | " 'verbose': VERBOSE,\n", 598 | " 'selection_method': 'min_variance',\n", 599 | " 'algorithms': s,\n", 600 | " 'stacking_alg': 'greedy',\n", 601 | " 'n_cores': N_CORES,\n", 602 | " 'build_ensemble': False,\n", 
603 | "}" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 29, 609 | "metadata": {}, 610 | "outputs": [], 611 | "source": [ 612 | "#intialize the autolearner class\n", 613 | "m = AutoLearner(**autolearner_kwargs)" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 30, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "# fit autolearner on training set and record runtime\n", 623 | "start = time.time()\n", 624 | "m.fit(x_train, y_train)\n", 625 | "elapsed_time = time.time() - start" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 31, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "elapsed time: 10.797972679138184\n", 638 | "accuracies of selected models: [0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.31651432260061413, 0.3124772208748183, 0.32036420549246775, 0.35449564337367473, 0.3495441004342578, 0.39048041301086595, 0.26796126609153437, 0.42171443806106124, 0.3001999674360773, 0.3515055898002219, 0.2644854823131578, 0.32215416955204695, 0.31565880270077473, 0.377873248684861, 0.297898906174363]\n" 639 | ] 640 | } 641 | ], 642 | "source": [ 643 | "# use the fitted autolearner for prediction on test set\n", 644 | "y_predicted = m.predict(x_test)\n", 645 | " \n", 646 | "print(\"elapsed time: {}\".format(elapsed_time))\n", 647 | "print(\"accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": {}, 653 | "source": [ 654 | "Note that we do not have a single accuracy value here if we do not build an ensemble, instead, we just have a collection of fitted models with individual accuracies reported.\n", 655 | "\n", 656 | "The following shows which models we have picked." 
657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 32, 662 | "metadata": {}, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/plain": [ 667 | "{'DT': [{'min_samples_split': 1e-05},\n", 668 | " {'min_samples_split': 1e-05},\n", 669 | " {'min_samples_split': 0.0001},\n", 670 | " {'min_samples_split': 2},\n", 671 | " {'min_samples_split': 0.001},\n", 672 | " {'min_samples_split': 4},\n", 673 | " {'min_samples_split': 64},\n", 674 | " {'min_samples_split': 128},\n", 675 | " {'min_samples_split': 256},\n", 676 | " {'min_samples_split': 1024},\n", 677 | " {'min_samples_split': 8},\n", 678 | " {'min_samples_split': 16},\n", 679 | " {'min_samples_split': 32},\n", 680 | " {'min_samples_split': 0.01}],\n", 681 | " 'GNB': [{}],\n", 682 | " 'AB': [{'n_estimators': 50, 'learning_rate': 1},\n", 683 | " {'n_estimators': 50, 'learning_rate': 1.5},\n", 684 | " {'n_estimators': 100, 'learning_rate': 1}]}" 685 | ] 686 | }, 687 | "execution_count": 32, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "m.get_models()" 694 | ] 695 | } 696 | ], 697 | "metadata": { 698 | "kernelspec": { 699 | "display_name": "Python 3", 700 | "language": "python", 701 | "name": "python3" 702 | }, 703 | "language_info": { 704 | "codemirror_mode": { 705 | "name": "ipython", 706 | "version": 3 707 | }, 708 | "file_extension": ".py", 709 | "mimetype": "text/x-python", 710 | "name": "python", 711 | "nbconvert_exporter": "python", 712 | "pygments_lexer": "ipython3", 713 | "version": "3.7.3" 714 | } 715 | }, 716 | "nbformat": 4, 717 | "nbformat_minor": 2 718 | } 719 | -------------------------------------------------------------------------------- /AutoML/README.md: -------------------------------------------------------------------------------- 1 | # AutoML demo 2 | 3 | Please find the demo in the Jupyter notebook in this folder. Before running the demo, you will need to download the Airbnb dataset we used in HW3 from , and also install `auto-sklearn` and `TPOT`. The installation guides are: 4 | 5 | - auto-sklearn (may only work on Linux): 6 | 7 | - TPOT: 8 | 9 | The Oboe system is developed by us and is still under active development (at ). Suggestions are welcome! 10 | 11 | ## References 12 | [1] Matthias Feurer, Aaron Klein, Katharina Eggensperger, Jost Springenberg, Manuel Blum, Frank Hutter. Efficient and robust automated machine learning. NIPS 2015. 13 | 14 | [2] Randal S Olson, Jason H Moore. TPOT: A tree-based pipeline optimization tool for automating machine learning. Automated Machine Learning 2019. 15 | 16 | [3] Chengrun Yang, Yuji Akimoto, Dae Won Kim, Madeleine Udell. OBOE: Collaborative filtering for AutoML model selection. KDD 2019. -------------------------------------------------------------------------------- /AutoML/oboe/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019, Chengrun Yang 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | * Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | * Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution.
11 | * Neither the name of the nor the 12 | names of its contributors may be used to endorse or promote products 13 | derived from this software without specific prior written permission. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY 19 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -------------------------------------------------------------------------------- /AutoML/oboe/README.md: -------------------------------------------------------------------------------- 1 | # Oboe 2 | 3 | In an orchestra, the oboe plays an initial note which the other instruments use to tune to the right frequency before the performance begins; this package, Oboe, is an automated machine learning/model selection system that uses collaborative filtering to find good models for supervised learning tasks within a user-specified time limit. Further hyperparameter tuning can be performed afterwards. 4 | 5 | Oboe is based on matrix factorization and classical experiment design. For a complete description, refer to our paper at KDD 2019: [OBOE: Collaborative Filtering for AutoML Model Selection](https://arxiv.org/abs/1808.03233). 6 | 7 | This system is still under development and subject to change. 8 | 9 | ## Installation 10 | 11 | #### Dependencies 12 | oboe requires: 13 | * Python (>= 3.5) 14 | * numpy (>= 1.8.2) 15 | * scipy (>= 0.13.3) 16 | * pandas (>=0.22.0) 17 | * scikit-learn (>= 0.18) 18 | * multiprocessing (>=0.70.5) 19 | * OpenML (>=0.7.0) 20 | * mkl (>=1.0.0) 21 | * re 22 | * os 23 | * json 24 | * util 25 | 26 | #### User Installation 27 | This part is currently under development; an example of code usage is in the `examples` folder. The package will be pip installable in the future. 28 | 29 | ## Usage 30 | 31 | ### Online Phase (AutoML model selection) 32 | Given a new dataset, we want to select promising models and hyperparameters. Denote the features and labels of the training set by `x_train` and `y_train`, and the features of the test set by `x_test`. A short example of training and testing: 33 | ``` 34 | from auto_learner import AutoLearner 35 | m = AutoLearner(runtime_limit=20) #set the time limit for model fitting to be 20 seconds 36 | m.fit(x_train, y_train) 37 | m.predict(x_test) 38 | ``` 39 | Additional arguments can be applied to customize the `AutoLearner` instance, including: 40 | 1. Basics 41 | * p_type (str): Problem type, which is one of {'classification', 'regression'}. By default, 'classification'. 42 | * verbose (Boolean): Whether or not to generate print statements that showcase the progress. By default, false. 43 | * n_cores (int): Maximum number of CPU cores to use. The default value 'None' means no limit, i.e., up to all the CPU cores of the machine. 44 | * runtime_limit (int): Maximum runtime for AutoLearner fitting, in seconds. By default, 512 seconds.
45 | * scalarization (str): Scalarization of the covariance matrix for minimum variance selection. One of {'D', 'A', 'E'}. 'D' (the default) gives the best performance and fastest speed in practice. 46 | * build_ensemble (Boolean): Whether to build an ensemble of promising models. 47 | * stacking_alg (str): The method used for ensemble construction. One of {'greedy', 'stacking'}. By default, 'greedy'. 48 | * dataset_ratio_threshold (float): The threshold of dataset ratio for dataset subsampling, if the training set is tall and skinny (i.e., number of data points much larger than number of features). 49 | 50 | 2. Advanced customization 51 | * algorithms (list): A list of algorithm types to be considered, in strings, e.g. ['KNN', 'lSVM']. By default, all the algorithms in the error matrix. The supported classification algorithms are: 'AB' (Adaboost), 'DT' (decision tree), 'ExtraTrees' (extra trees), 'GBT' (gradient boosting), 'GNB' (Gaussian naive Bayes), 'KNN' (kNN), 'Logit' (logistic regression), 'MLP' (multilayer perceptron), 'Perceptron' (perceptron), 'RF' (random forest), 'kSVM' (kernel SVM), 'lSVM' (linear SVM). 52 | * hyperparameters (dict): A nested dict of hyperparameters to be considered. By default, all the model hyperparameters in the error matrix. 53 | * error_matrix (DataFrame): Error matrix to use for imputation, includes indices and headers. The one in `defaults` folder is used by default. 54 | * runtime_matrix (DataFrame): Runtime matrix to use for runtime prediction, includes indices and headers. The one in `defaults` folder is used by default. 55 | * new_row (np.ndarray): Predicted row of error matrix; corresponds to the new dataset. By default, 'None'. 56 | * selection_method (str): Method of selecting entries of new row to sample. One of {'min_variance', 'qr'}. 'min_variance' corresponds to the selection approach via classic experiment design; 'qr' selects the pivot columns in the error matrix and thus does not provide the functionality of maximizing performance within given runtime budget. By default, 'min_variance'. 57 | * runtime_predictor (str): Model for runtime prediction. One of {'LinearRegression', 'KNeighborsRegressor'}. By default, 'LinearRegression'. Dataset sizes (number of data points and number of features) are used as feature vectors for both runtime predictor models. 58 | 59 | For executable and more detailed examples, please refer to the `examples` folder. 60 | 61 | ### Offline Phase 62 | 63 | ##### Error Matrix Generation 64 | Please refer to `examples/error_matrix_generation` for an error matrix generation example. -------------------------------------------------------------------------------- /AutoML/oboe/automl/README.md: -------------------------------------------------------------------------------- 1 | # Oboe 2 | Technical & implementation details of the Oboe 3 | package. 4 | 5 | #### Classification Algorithms 6 | * K-Nearest Neighbors 7 | * Decision Tree 8 | * Random Forest 9 | * Gradient Boosting Tree 10 | * Adaboost Tree 11 | * Linear SVM 12 | * Kernel SVM 13 | * Logistic Regression 14 | * Perceptron 15 | * Gaussian Naive Bayes 16 | 17 | #### Regression Algorithms 18 | * (under development) 19 | 20 | #### Notes on usage 21 | * Oboe currently only supports datasets that 22 | are saved as .csv files. Additionally, it is assumed that 23 | these .csv files contain *only* the data, i.e. there are 24 | no row or column names included in the file.
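To make the expected input format concrete, here is a small sketch of exporting a pandas DataFrame into the headerless, label-last layout that the generation scripts below (`x = dataset[:, :-1]`, `y = dataset[:, -1]`) assume; file and column names are illustrative:

```python
import pandas as pd

df = pd.read_csv("airbnb.csv")                      # ordinary CSV with headers
features = ["accommodates", "bedrooms", "price"]    # illustrative feature subset
label = "review_scores_value"

# Oboe-style dataset: data only -- no header row, no index column,
# features first and the label in the last column.
df[features + [label]].to_csv("dataset_airbnb.csv", header=False, index=False)
```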
25 | 26 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/__init__.py: -------------------------------------------------------------------------------- 1 | from auto_learner import AutoLearner 2 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/convex_opt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Find columns of error matrix to minimize variance of predicted latent features. 3 | Solves convex optimization problem as described in chapter 7.5 in https://web.stanford.edu/~boyd/cvxbook/bv_cvxbook.pdf 4 | """ 5 | 6 | import numpy as np 7 | import os 8 | import pandas as pd 9 | import pickle 10 | import openml 11 | import subprocess 12 | from scipy.optimize import minimize 13 | from sklearn.preprocessing import PolynomialFeatures 14 | from sklearn.linear_model import LinearRegression 15 | from sklearn.neighbors import KNeighborsRegressor 16 | 17 | 18 | def solve(t_predicted, t_max, n_cores, Y, scalarization='D'): 19 | """Solve the following optimization problem: 20 | minimize -log(det(sum_i v[i]*Y[:, i]*Y[:, i].T)) subject to 0 <= v[i] <= 1 and t_predicted.T * v <= t_max 21 | The optimal vector v is an approximation of a boolean vector indicating which entries to sample. 22 | 23 | Args: 24 | t_predicted (np.ndarray): 1-d array specifying predicted runtime for each model setting 25 | t_max (float): maximum runtime of sampled model 26 | n_cores (int): number of cores to use 27 | Y (np.ndarray): matrix representing latent variable weights of error matrix 28 | scalarization (str): scalarization method in experimental design. 29 | Returns: 30 | np.ndarray: optimal vector v (not truncated to binary values) 31 | """ 32 | 33 | n = len(t_predicted) 34 | 35 | if scalarization == 'D': 36 | def objective(v): 37 | sign, log_det = np.linalg.slogdet(Y @ np.diag(v) @ Y.T) 38 | return -1 * sign * log_det 39 | elif scalarization == 'A': 40 | def objective(v): 41 | return np.trace(np.linalg.pinv(Y @ np.diag(v) @ Y.T)) 42 | elif scalarization == 'E': 43 | def objective(v): 44 | return np.linalg.norm(np.linalg.pinv(Y @ np.diag(v) @ Y.T), ord=2) 45 | def constraint(v): 46 | return t_max * n_cores- t_predicted @ v 47 | v0 = np.full((n, ), 0.5) 48 | constraints = {'type': 'ineq', 'fun': constraint} 49 | v_opt = minimize(objective, v0, method='SLSQP', bounds=[(0, 1)] * n, options={'maxiter': 30}, 50 | constraints=constraints) 51 | 52 | return v_opt.x 53 | 54 | def predict_runtime(size, runtime_matrix=None, saved_model=None, model_name='LinearRegression', save=False): 55 | """Predict the runtime for each model setting on a dataset with given shape. 56 | 57 | Args: 58 | size (tuple): tuple specifying dataset size as [n_rows, n_columns] 59 | runtime_matrix (DataFrame): the DataFame containing runtime. 60 | saved_model (str): path to pre-trained model; defaults to None 61 | save (bool): whether to save pre-trained model 62 | Returns: 63 | np.ndarray: 1-d array of predicted runtimes 64 | """ 65 | assert len(size) == 2, "Dataset must be 2-dimensional." 
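    # (Editor's note, added for clarity.) What follows: if a pickled predictor was
    # supplied via saved_model, load it and predict directly; otherwise fit a fresh
    # RuntimePredictor on the default dataset sizes and runtime matrix. Runtimes are
    # modeled on a log scale, hence the np.log when fitting and np.exp when predicting.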
66 | shape = np.array(size) 67 | 68 | if saved_model: 69 | with open(saved_model, 'rb') as file: 70 | model = pickle.load(file) 71 | return model.predict(shape) 72 | 73 | defaults_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'defaults') 74 | try: 75 | dataset_sizes = pd.read_csv(os.path.join(defaults_path, 'dataset_sizes.csv'), index_col=0) 76 | sizes_index = np.array(dataset_sizes.index) 77 | sizes = dataset_sizes.values 78 | except FileNotFoundError: 79 | sizes_index = [] 80 | sizes = [] 81 | if runtime_matrix is None: 82 | runtime_matrix = pd.read_csv(os.path.join(defaults_path, 'runtime_matrix.csv'), index_col=0) 83 | runtimes_index = np.array(runtime_matrix.index) 84 | runtimes = runtime_matrix.values 85 | model = RuntimePredictor(3, sizes, sizes_index, np.log(runtimes), runtimes_index, model_name=model_name) 86 | if save: 87 | with open(os.path.join(defaults_path, 'runtime_predictor.pkl'), 'wb') as file: 88 | pickle.dump(model, file) 89 | 90 | return np.exp(model.predict(shape)) 91 | 92 | 93 | class RuntimePredictor: 94 | """Model that predicts the runtime for each model setting on a dataset with given shape. Performs polynomial 95 | regression on n (# samples), p (# features), and log(n). 96 | 97 | Attributes: 98 | degree (int): degree of polynomial basis function 99 | n_models (int): number of model settings 100 | models (list): list of scikit-learn regression models 101 | """ 102 | def __init__(self, degree, sizes, sizes_index, runtimes, runtimes_index, model_name='LinearRegression'): 103 | self.degree = degree 104 | self.n_models = runtimes.shape[1] 105 | self.model_name = model_name 106 | self.models = [None] * self.n_models 107 | self.fit(sizes, sizes_index, runtimes, runtimes_index) 108 | 109 | def fit(self, sizes, sizes_index, runtimes, runtimes_index): 110 | """Fit polynomial regression on pre-recorded runtimes on datasets.""" 111 | # assert sizes.shape[0] == runtimes.shape[0], "Dataset sizes and runtimes must be recorded on same datasets." 
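        # (Editor's note, added for clarity.) For any dataset that has recorded
        # runtimes but no recorded size, fetch its shape from OpenML, so that every
        # row of the runtime matrix has an (n_points, n_features) vector to regress on.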
112 | for i in set(runtimes_index).difference(set(sizes_index)): 113 | dataset = openml.datasets.get_dataset(i) 114 | data_numeric, data_labels, categorical = dataset.get_data(target=dataset.default_target_attribute, 115 | return_categorical_indicator=True) 116 | if len(sizes) == 0: 117 | sizes = np.array([data_numeric.shape]) 118 | sizes_index = np.array(i) 119 | else: 120 | sizes = np.concatenate((sizes, np.array([data_numeric.shape]))) 121 | sizes_index = np.append(sizes_index, i) 122 | 123 | sizes_train = np.array([sizes[list(sizes_index).index(i), :] for i in runtimes_index]) 124 | sizes_log = np.concatenate((sizes_train, np.log(sizes_train[:, 0]).reshape(-1, 1)), axis=1) 125 | sizes_train_poly = PolynomialFeatures(self.degree).fit_transform(sizes_log) 126 | 127 | # train independent regression model to predict each runtime of each model setting 128 | for i in range(self.n_models): 129 | runtime = runtimes[:, i] 130 | if self.model_name == 'LinearRegression': 131 | self.models[i] = LinearRegression().fit(sizes_train_poly, runtime) 132 | elif self.model_name == 'KNeighborsRegressor': 133 | def metric(a, b): 134 | coefficients = [1, 100] 135 | return np.sum(np.multiply((a - b) ** 2, coefficients)) 136 | 137 | def weights(distances): 138 | return distances 139 | 140 | neigh = KNeighborsRegressor(n_neighbors=5, metric=metric, weights=weights) 141 | self.models[i] = neigh.fit(sizes_train, runtime) 142 | # print(self.models[i].coef_) 143 | # print(self.models[i].intercept_) 144 | # self.models[i] = Lasso().fit(sizes_train_poly, runtime) 145 | 146 | def predict(self, size): 147 | """Predict runtime of all model settings on a dataset of given size. 148 | 149 | Args: 150 | size(np.array): Size of the dataset to fit runtime onto. 151 | Returns: 152 | predictions (np.array): The predicted runtime. 
153 | """ 154 | if self.model_name == 'LinearRegression': 155 | size_test = np.append(size, np.log(size[0])) 156 | size_test_poly = PolynomialFeatures(self.degree).fit_transform([size_test]) 157 | predictions = np.zeros(self.n_models) 158 | for i in range(self.n_models): 159 | predictions[i] = self.models[i].predict(size_test_poly)[0] 160 | 161 | elif self.model_name == 'KNeighborsRegressor': 162 | predictions = np.zeros(self.n_models) 163 | for i in range(self.n_models): 164 | predictions[i] = self.models[i].predict(np.array(size).reshape(1, -1))[0] 165 | 166 | # # TO BE REMOVED: sanity check 167 | # 168 | # size_check = (1000, 10) 169 | # size_check = np.append(size, np.log(size[0])) 170 | # size_check_poly = PolynomialFeatures(self.degree).fit_transform([size_check]) 171 | # print(size_check_poly) 172 | # for i in range(self.n_models): 173 | # print(self.models[i].predict(size_check_poly)[0]) 174 | 175 | return predictions 176 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/defaults/classification.json: -------------------------------------------------------------------------------- 1 | {"algorithms": ["KNN", "DT", "RF", "GBT", "AB", "lSVM", "kSVM", "Logit", "Perceptron", "GNB", "MLP", "ExtraTrees"], 2 | "hyperparameters": { 3 | "KNN": {"n_neighbors": [1, 3, 5, 7, 9, 11, 13, 15], "p": [1, 2]}, 4 | "DT": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.01,0.001,0.0001,1e-05]}, 5 | "RF": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.1,0.01,0.001,0.0001,1e-05], "criterion": ["gini", "entropy"]}, 6 | "GBT": {"learning_rate": [0.001,0.01,0.025,0.05,0.1,0.25,0.5], "max_depth": [3, 6], "max_features": [null, "log2"]}, 7 | "AB": {"n_estimators": [50, 100], "learning_rate": [1.0, 1.5, 2.0, 2.5, 3.0]}, 8 | "lSVM": {"C": [0.125,0.25,0.5,0.75,1,2,4,8,16]}, 9 | "kSVM": {"C": [0.125,0.25,0.5,0.75,1,2,4,8,16], "kernel": ["rbf", "poly"], "coef0": [0, 10]}, 10 | "Logit": {"C": [0.25,0.5,0.75,1,1.5,2,3,4], "solver": ["liblinear", "saga"], "penalty": ["l1", "l2"]}, 11 | "Perceptron": {}, 12 | "GNB": {}, 13 | "MLP": {"learning_rate_init": [0.0001,0.001,0.01], "learning_rate": ["adaptive"], "solver": ["sgd", "adam"], "alpha": [0.0001, 0.01]}, 14 | "ExtraTrees": {"min_samples_split": [2,4,8,16,32,64,128,256,512,1024,0.1,0.01,0.001,0.0001,1e-05], "criterion": ["gini", "entropy"]}}} -------------------------------------------------------------------------------- /AutoML/oboe/automl/defaults/dataset_sizes.csv: -------------------------------------------------------------------------------- 1 | ,0,1 2 | 1121,294,14 3 | 1005,214,10 4 | 1011,336,8 5 | 1442,253,38 6 | 1054,161,40 7 | 1026,155,49 8 | 1449,253,38 9 | 1448,194,40 10 | 1073,274,9 11 | 1167,320,11 12 | 1048,369,9 13 | 1025,400,19 14 | 1012,194,121 15 | 1446,296,38 16 | 1447,327,38 17 | 1071,403,38 18 | 1063,522,22 19 | 1065,458,40 20 | 1115,151,58 21 | 1004,600,61 22 | 1452,745,37 23 | 1488,195,23 24 | 1443,661,38 25 | 1016,990,28 26 | 1462,1372,5 27 | 1014,797,22 28 | 1451,705,38 29 | 1490,182,13 30 | 1495,250,19 31 | 1467,540,21 32 | 11,625,5 33 | 1480,583,12 34 | 1068,1109,22 35 | 1100,478,23 36 | 1444,1043,38 37 | 1500,210,8 38 | 1499,210,8 39 | 1498,462,11 40 | 1453,1077,38 41 | 1506,470,38 42 | 1511,440,11 43 | 1464,748,5 44 | 1454,1458,38 45 | 1510,569,31 46 | 1049,1458,38 47 | 1524,310,7 48 | 1494,1055,42 49 | 1508,403,6 50 | 1050,1563,38 51 | 1512,200,14 52 | 1523,310,7 53 | 1520,164,91 54 | 1067,2109,22 55 | 1020,2000,65 56 | 1547,1000,21 57 | 1021,5473,11 58 | 
1529,1521,4 59 | 1487,2534,73 60 | 1069,5589,37 61 | 1530,1515,4 62 | 1553,700,19 63 | 1554,500,19 64 | 1551,400,45 65 | 1482,340,16 66 | 1525,5456,3 67 | 1600,267,45 68 | 1565,294,14 69 | 1526,5456,5 70 | 1489,5404,6 71 | 1527,3252,4 72 | 1552,1100,19 73 | 1544,1277,4 74 | 1545,1252,4 75 | 1543,1080,4 76 | 1542,1183,4 77 | 187,178,14 78 | 1466,2126,36 79 | 1549,750,45 80 | 1546,1112,4 81 | 23499,277,42 82 | 1056,9466,39 83 | 1560,2126,36 84 | 1570,4839,6 85 | 311,937,50 86 | 1460,5300,3 87 | 1507,7400,21 88 | 285,194,121 89 | 329,160,5 90 | 333,556,18 91 | 335,554,18 92 | 334,601,18 93 | 1566,1212,101 94 | 336,267,45 95 | 31,1000,62 96 | 337,349,45 97 | 1131,193,10936 98 | 338,155,49 99 | 1555,1000,45 100 | 1496,7400,21 101 | 23,1473,25 102 | 37,768,9 103 | 1558,4521,52 104 | 1557,4177,11 105 | 1497,5456,25 106 | 18,2000,7 107 | 307,990,28 108 | 377,600,61 109 | 316,2417,130 110 | 1472,768,47 111 | 30,5473,11 112 | 1155,195,10936 113 | 1514,360,1301 114 | 36,2310,20 115 | 39,336,8 116 | 3,3196,74 117 | 1154,187,10936 118 | 1152,267,10936 119 | 1458,200,10001 120 | 1132,203,10936 121 | 22,2000,48 122 | 1116,6598,269 123 | 14,2000,77 124 | 312,2407,305 125 | 16,2000,65 126 | 1548,2500,227 127 | 1124,201,10936 128 | 40496,500,8 129 | 40474,2800,47 130 | 40475,2800,47 131 | 1144,329,10936 132 | 1136,250,10936 133 | 40647,1600,59 134 | 40497,3772,22 135 | 1147,337,10936 136 | 40649,1600,60 137 | 1164,185,10936 138 | 40646,1600,61 139 | 40669,160,13 140 | 1022,2000,1649 141 | 40665,476,169 142 | 40648,1600,61 143 | 40682,215,6 144 | 40663,399,132 145 | 40476,2800,47 146 | 40650,1600,60 147 | 40671,327,8 148 | 40680,1324,21 149 | 40690,512,19 150 | 1501,1593,257 151 | 40686,315,35 152 | 40477,2800,47 153 | 40693,973,19 154 | 40700,392,12 155 | 40702,1066,32 156 | 40710,303,31 157 | 12,2000,217 158 | 1157,321,10936 159 | 40664,1728,43 160 | 40705,959,45 161 | 40704,2201,4 162 | 40478,2800,47 163 | 40994,540,19 164 | 40916,158,20 165 | 1485,2600,501 166 | 1135,355,10936 167 | 40711,303,21 168 | 40,208,61 169 | 40687,1066,43 170 | 313,531,105 171 | 40999,2351,70 172 | 40691,1599,12 173 | 40706,1124,21 174 | 1129,384,10936 175 | 40713,3772,55 176 | 41007,2352,72 177 | 4329,470,38 178 | 41,214,10 179 | 4153,180,67 180 | 4340,383,6 181 | 43,306,15 182 | 446,200,9 183 | 41005,3660,72 184 | 450,264,6 185 | 1159,259,10936 186 | 463,180,77 187 | 40707,3772,55 188 | 40678,3200,15 189 | 464,250,3 190 | 40708,3772,55 191 | 48,151,8 192 | 458,841,71 193 | 475,400,16 194 | 40701,5000,34 195 | 53,270,14 196 | 50,958,28 197 | 40601,333,2 198 | 512,2001,2 199 | 61,150,5 200 | 1133,347,10936 201 | 59,351,35 202 | 469,797,22 203 | 40666,6598,169 204 | 721,200,11 205 | 717,508,11 206 | 1163,386,10936 207 | 694,310,9 208 | 724,468,12 209 | 54,846,19 210 | 730,250,6 211 | 40997,4704,73 212 | 715,1000,26 213 | 40998,4704,73 214 | 41000,4704,71 215 | 733,209,7 216 | 732,250,51 217 | 723,1000,26 218 | 375,9961,15 219 | 741,1024,6 220 | 41004,4704,73 221 | 744,250,6 222 | 745,159,21 223 | 747,167,20 224 | 44,4601,58 225 | 746,250,26 226 | 748,163,8 227 | 28,5620,65 228 | 743,1000,6 229 | 718,1000,101 230 | 749,500,6 231 | 740,1000,11 232 | 756,159,16 233 | 742,500,101 234 | 753,194,33 235 | 728,4052,8 236 | 763,250,11 237 | 764,450,12 238 | 750,500,8 239 | 765,475,12 240 | 767,475,12 241 | 769,250,51 242 | 720,4177,11 243 | 388,204,5833 244 | 776,250,6 245 | 773,250,26 246 | 770,625,7 247 | 778,252,15 248 | 751,1000,11 249 | 766,500,51 250 | 774,662,4 251 | 788,186,61 252 | 793,250,11 253 | 794,250,26 254 | 792,500,6 255 
| 796,209,37 256 | 801,185,3 257 | 779,500,26 258 | 795,662,4 259 | 811,264,14 260 | 737,3107,7 261 | 818,310,17 262 | 814,468,3 263 | 799,1000,6 264 | 820,235,13 265 | 813,1000,6 266 | 830,250,11 267 | 772,2178,4 268 | 832,250,26 269 | 827,662,4 270 | 826,576,37 271 | 183,4177,11 272 | 797,1000,51 273 | 40971,1000,20 274 | 834,250,101 275 | 838,500,26 276 | 806,1000,51 277 | 841,950,10 278 | 860,380,3 279 | 853,506,15 280 | 863,250,11 281 | 851,240,125 282 | 845,1000,11 283 | 855,500,11 284 | 40677,3200,49 285 | 849,1000,26 286 | 870,500,6 287 | 873,250,51 288 | 880,284,11 289 | 877,250,51 290 | 869,500,11 291 | 879,500,26 292 | 895,222,3 293 | 886,500,8 294 | 900,400,7 295 | 896,500,26 296 | 866,1000,51 297 | 888,500,51 298 | 907,400,8 299 | 906,400,8 300 | 909,400,8 301 | 908,400,8 302 | 911,250,6 303 | 903,1000,26 304 | 725,8192,9 305 | 1160,410,10936 306 | 904,1000,51 307 | 1156,275,10936 308 | 915,315,19 309 | 912,1000,6 310 | 910,1000,11 311 | 918,250,51 312 | 925,323,5 313 | 933,250,26 314 | 935,250,11 315 | 913,1000,11 316 | 926,500,26 317 | 931,662,4 318 | 8,345,6 319 | 914,2001,2 320 | 936,500,11 321 | 941,189,24 322 | 1143,363,10936 323 | 917,1000,26 324 | 947,559,47 325 | 943,500,11 326 | 1140,324,10936 327 | 934,1156,40 328 | 735,8192,13 329 | 951,559,47 330 | 950,559,47 331 | 937,500,51 332 | 969,150,5 333 | 955,151,8 334 | 949,559,47 335 | 973,178,14 336 | 1149,458,10936 337 | 871,3848,6 338 | 952,214,10 339 | 962,2000,7 340 | 954,531,105 341 | 987,500,37 342 | 1123,405,10936 343 | 970,841,71 344 | 996,214,10 345 | 958,2310,20 346 | 948,2178,4 347 | 997,625,5 348 | 994,846,19 349 | 923,8641,6 350 | 60,5000,41 351 | 803,7129,6 352 | 991,1728,22 353 | 1126,412,10936 354 | 761,8192,22 355 | 983,1473,25 356 | 1515,571,1301 357 | 971,2000,77 358 | 995,2000,48 359 | 1162,322,10936 360 | 978,2000,217 361 | 40498,4898,12 362 | 847,6574,15 363 | 980,5620,65 364 | 976,9961,15 365 | 40499,5500,41 366 | 979,5000,41 367 | 1127,421,10936 368 | 1137,546,10936 369 | 807,8192,9 370 | 46,3190,288 371 | 953,3190,288 372 | 819,9517,7 373 | 816,8192,9 374 | 40670,3186,361 375 | 40910,3686,401 376 | 1148,468,10936 377 | 1150,470,10936 378 | 4537,9901,20 379 | 1153,484,10936 380 | 1165,542,10936 381 | 1158,604,10936 382 | 833,8192,33 383 | 1535,9989,4 384 | 397,313,5805 385 | 40645,1600,2970 386 | 1491,1600,65 387 | 1039,4229,1618 388 | 1493,1599,65 389 | 384,336,7903 390 | 1145,630,10936 391 | 4538,9873,33 392 | 4136,600,20001 393 | 1492,1600,65 394 | 1085,159,61360 395 | 394,918,3013 396 | 1084,220,22284 397 | 752,8192,33 398 | 386,913,3101 399 | 387,414,6430 400 | 1541,8654,4 401 | 1540,9285,4 402 | 4134,3751,1777 403 | 392,1003,3183 404 | 20,2000,1649 405 | 1106,190,16064 406 | 1134,1545,10936 407 | 1139,1545,10936 408 | 1130,1545,10936 409 | 383,690,8262 410 | 1128,1545,10936 411 | 391,1504,2887 412 | 400,878,7455 413 | 385,927,10129 414 | 1086,283,54622 415 | 1087,283,54622 416 | 4552,5665,105 417 | 1083,214,45102 418 | 389,2463,2001 419 | 1233,945,6374 420 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/defaults/regression.json: -------------------------------------------------------------------------------- 1 | {"algorithms": ["Lasso", "Ridge", "ElasticNet"], "hyperparameters": {}} -------------------------------------------------------------------------------- /AutoML/oboe/automl/defaults/runtime_predictor.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ORIE4741/demos/1172fdf1fd6bb53998a300361473bf8f974f8d20/AutoML/oboe/automl/defaults/runtime_predictor.pkl -------------------------------------------------------------------------------- /AutoML/oboe/automl/generate_matrix.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Shell script to generate error matrix (and merge results), parallelized across datasets. 4 | 5 | usage () { 6 | cat <&2 38 | mode=${OPTARG} 39 | ;; 40 | s) 41 | SAVE_DIR=$OPTARG 42 | ;; 43 | d) 44 | DATA_DIR=$OPTARG 45 | ;; 46 | p) 47 | P_TYPE=$OPTARG 48 | ;; 49 | j) 50 | JSON_FILE=$OPTARG 51 | ;; 52 | e) 53 | ERROR_MATRIX=$OPTARG 54 | ;; 55 | n) 56 | MAX_PROCS=$OPTARG 57 | ;; 58 | a) 59 | AUC=$OPTARG 60 | ;; 61 | f) 62 | FULLNAME=$OPTARG 63 | ;; 64 | \?) 65 | echo "Invalid option: -${OPTARG}" >&2 66 | usage 67 | exit 1 68 | ;; 69 | esac 70 | done 71 | 72 | #if [ "$1" == "" ] 73 | #then 74 | # echo "Must specify mode." 75 | # usage 76 | # exit 1 77 | #fi 78 | 79 | # no limit for maximum number of processes if no number is given 80 | if [ "${MAX_PROCS}" == "" ] 81 | then 82 | MAX_PROCS="0" 83 | fi 84 | 85 | # default to not using AUC 86 | if [ "${AUC}" == "" ] 87 | then 88 | AUC="False" 89 | fi 90 | 91 | # default to not using fullname 92 | if [ "${FULLNAME}" == "" ] 93 | then 94 | FULLNAME="False" 95 | fi 96 | 97 | # strip '/' from end of file path (if there is one) 98 | #SAVE_DIR=${3%/} 99 | #DATA_DIR=${4%/} 100 | 101 | # location of this script 102 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 103 | 104 | # generate mode 105 | if [ "${mode}" == "generate" ] 106 | then 107 | time=`date +%Y%m%d%H%M` 108 | mkdir -p ${SAVE_DIR}/${time} 109 | echo -e "SAVE_DIR=${SAVE_DIR}\nDATA_DIR=${DATA_DIR}\nP_TYPE=${P_TYPE}\nJSON_FILE=${JSON_FILE}\nAUC=${AUC}\nERROR_MATRIX=${ERROR_MATRIX}\n" >> ${SAVE_DIR}/${time}/configurations.txt 110 | echo "Error matrix generation started at ${time}" >> ${SAVE_DIR}/${time}/log_${time}.txt 111 | 112 | ls ${DATA_DIR}/*.csv | xargs -i --max-procs=${MAX_PROCS} bash -c \ 113 | "python ${DIR}/generate_vector.py '${P_TYPE}' {} --file=${JSON_FILE} --save_dir=${SAVE_DIR}/${time} \ 114 | --error_matrix=${ERROR_MATRIX} --auc=${AUC} --fullname=${FULLNAME} &>> ${SAVE_DIR}/${time}/warnings_and_errors.txt" 115 | fi 116 | 117 | # merge mode 118 | if [ "${mode}" == "merge" ] 119 | then 120 | python ${DIR}/util.py ${SAVE_DIR} 121 | fi 122 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/generate_vector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate a row of the error matrix for a given dataset. Records cross-validation error & elapsed time for each 3 | algorithm & hyperparameter combination. 4 | 5 | Note the difference between model "configurations" and "settings": configurations is a nested dictionary, containing 6 | a list of algorithms, and a dictionary of lists of hyperparameters; settings is a list of dictionaries, with one 7 | algorithm and a dictionary of hyperparameters. 
Below is an example of each: 8 | 9 | Config: {'algorithms': ['KNN', 'DT'], 10 | 'hyperparameters': {'KNN': {'n_neighbors': [1, 3, 5, 7], 'p': [1, 2]}, 11 | 'DT': {'min_samples_split': [0.01, 0.001]} 12 | } 13 | } 14 | 15 | Settings: [{'algorithm': 'KNN', 'hyperparameters': {'n_neighbors': 1, 'p': 1}}, 16 | {'algorithm': 'KNN', 'hyperparameters': {'n_neighbors': 3, 'p': 2}}, 17 | {'algorithm': 'DT', 'hyperparameters': {'min_samples_split': 0.01}} 18 | ] 19 | """ 20 | 21 | import argparse 22 | import numpy as np 23 | import pandas as pd 24 | import json 25 | import os 26 | import sys 27 | import re 28 | import time 29 | import util 30 | from model import Model 31 | import mkl 32 | 33 | mkl.set_num_threads(1) 34 | RANDOM_STATE = 0 35 | 36 | def main(args): 37 | # load selected algorithms & hyperparameters from string or JSON file 38 | assert (args.string is None) != (args.file is None), 'Exactly one of --string and --file must be specified.' 39 | if args.string: 40 | configs = json.loads(args.string) 41 | elif args.file: 42 | with open(args.file) as f: 43 | configs = json.load(f) 44 | assert set(configs.keys()) == {'algorithms', 'hyperparameters'}, 'Invalid arguments.' 45 | 46 | # load training dataset 47 | dataset = pd.read_csv(args.data, header=None).values 48 | filename = args.data.split('/')[-1].split('.')[0] 49 | # whether to use dataset filename as error matrix vector filename 50 | if args.fullname: 51 | dataset_id = filename 52 | else: 53 | dataset_id = int(re.findall("\\d+", filename)[0]) 54 | 55 | # do not generate error matrices twice on one dataset 56 | if args.error_matrix != None: 57 | if args.error_matrix.endswith('.csv'): 58 | generated_datasets = pd.read_csv(args.error_matrix, index_col=0).index.tolist() 59 | assert dataset_id not in generated_datasets, 'Already generated.' 60 | 61 | t0 = time.time() 62 | x = dataset[:, :-1] 63 | y = dataset[:, -1] 64 | 65 | settings = util.generate_settings(configs['algorithms'], configs['hyperparameters']) 66 | headings = [str(s) for s in settings] 67 | results = np.full((2, len(settings)), np.nan) 68 | 69 | # generate error matrix entries, i.e. compute k-fold cross validation error 70 | log_file = [file for file in os.listdir(args.save_dir) if file.startswith('log')][0] 71 | for i, setting in enumerate(settings): 72 | model = Model(args.p_type, setting['algorithm'], setting['hyperparameters'], args.auc, args.verbose) 73 | start = time.time() 74 | try: 75 | cv_errors, _ = model.kfold_fit_validate(x, y, n_folds=args.n_folds, random_state=RANDOM_STATE) 76 | except (ZeroDivisionError, KeyError, TypeError, ValueError) as e: 77 | with open(os.path.join(args.save_dir, log_file), 'a') as log: 78 | line = '\nID={}, model={}, {}'.format(dataset_id, setting, e) 79 | log.write(line) 80 | results[:, i] = np.array([cv_errors.mean(), time.time() - start]) 81 | if args.fullname: 82 | save_path = os.path.join(args.save_dir, str(dataset_id) + '.csv') 83 | else: 84 | save_path = os.path.join(args.save_dir, str(dataset_id).zfill(5) + '.csv') 85 | pd.DataFrame(results, columns=headings, index=['Error', 'Time']).to_csv(save_path) 86 | 87 | # log results 88 | elapsed = time.time() - t0 89 | line = '\nID={}, Size={}, Time={:.0f}s, Avg. Error={:.3f}'\ 90 | .format(dataset_id, dataset.shape, elapsed, results[0, :].mean()) 91 | with open(os.path.join(args.save_dir, log_file), 'a') as log: 92 | log.write(line) 93 | print(line) 94 | 95 | 96 | def parse_args(argv): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument('p_type', type=str, help='Problem type. 
Either classification or regression.')
99 | parser.add_argument('data', type=str, help='File path to training dataset.')
100 | parser.add_argument('--string', type=str,
101 | help='JSON-style string listing all algorithm types and hyperparameters. '
102 | 'See automl/util.py for example.')
103 | parser.add_argument('--file', type=str,
104 | help='JSON file listing all algorithm types and hyperparameters. '
105 | 'See automl/defaults/classification.json for example.')
106 | parser.add_argument('--save_dir', type=str, default='./custom',
107 | help='Directory in which to save new error matrix.')
108 | parser.add_argument('--n_folds', type=int, default=5, help='Number of folds to use for k-fold cross validation.')
109 | parser.add_argument('--verbose', type=lambda x: x == 'True', default=False,
110 | help='Whether to generate print statements on completion.')
111 | parser.add_argument('--error_matrix', type=str, default=None,
112 | help='Existing error matrix; its rows will not be re-generated.')
113 | parser.add_argument('--auc', type=lambda x: x == 'True', default=False, help='Whether to use AUC instead of BER.')
114 | parser.add_argument('--fullname', type=lambda x: x == 'True', default=False,
115 | help='Whether to use the full dataset filename to name the corresponding error matrix vectors.')
116 | return parser.parse_args(argv)
117 |
118 |
119 | if __name__ == '__main__':
120 | main(parse_args(sys.argv[1:]))
121 |
--------------------------------------------------------------------------------
/AutoML/oboe/automl/linalg.py:
--------------------------------------------------------------------------------
1 | """
2 | Linear algebra helper functions.
3 | """
4 |
5 | import numpy as np
6 | from scipy.sparse.linalg import svds
7 | from scipy.linalg import qr
8 |
9 |
10 | def approx_rank(a, threshold=0.03):
11 | """Compute approximate rank of a matrix.
12 |
13 | Args:
14 | a (np.ndarray): Matrix for which to compute rank.
15 | threshold (float): All singular values less than threshold * (largest singular value) are treated as zero.
16 | Returns:
17 | int: The approximate rank of a.
18 | """
19 | s = np.linalg.svd(a, compute_uv=False)
20 | significant = s[s >= threshold * s[0]]
21 | return len(significant)
22 |
23 |
24 | def pivot_columns(a, rank=None, threshold=None):
25 | """Computes the QR decomposition of a matrix with column pivoting, i.e. solves the equation AP=QR such that Q is
26 | orthogonal, R is upper triangular, and P is a permutation matrix.
27 |
28 | Args:
29 | a (np.ndarray): Matrix for which to compute QR decomposition.
30 | threshold (float): Threshold specifying approximate rank of a. All singular values less than threshold * (largest singular value) are treated as zero.
31 | rank (int): The approximate rank.
32 | Returns:
33 | np.array: Indices of the first `rank` pivot columns of a.
34 | """
35 | assert (threshold is None) != (rank is None), "Exactly one of threshold and rank should be specified."
36 | if threshold is not None:
37 | rank = approx_rank(a, threshold)
38 | return qr(a, pivoting=True)[2][:rank]
39 |
40 |
41 | def pca(a, rank=None, threshold=None):
42 | """Solves: minimize ||A - XY||^2 where ||.|| is the Frobenius norm, via a truncated SVD.
43 |
44 | Args:
45 | a (np.ndarray): Matrix for which to compute PCA.
46 | threshold (float): Threshold specifying approximate rank of a.
47 | rank (int): The approximate rank.
48 | Returns:
49 | x, y (np.ndarray): The solutions to the PCA problem.
50 | vt (np.ndarray): Transpose of V as specified in the singular value decomposition.
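    For instance (a shape check consistent with the code below): for a 10-by-6 matrix a,
        x, y, vt = pca(a, rank=2)    # x: (2, 10), y: (2, 6)
        a_hat = np.dot(x.T, y)       # rank-2 approximation of a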
51 | """ 52 | assert (threshold is None) != (rank is None), "Exactly one of threshold and rank should be specified." 53 | if threshold is not None: 54 | rank = approx_rank(a, threshold) 55 | # std = np.std(a, axis=0) 56 | u, s, vt = svds(a, k=rank) 57 | 58 | nonzero_pos = np.where(s > 0)[0] 59 | s = s[nonzero_pos] 60 | u = u[:, nonzero_pos] 61 | vt = vt[nonzero_pos, :] 62 | 63 | u = np.fliplr(u) 64 | s = np.flipud(s) 65 | vt = np.flipud(vt) 66 | # sigma_sqrt = np.diag(np.sqrt(s)) 67 | # x = np.dot(u, sigma_sqrt).T 68 | # # y = np.dot(np.dot(sigma_sqrt, vt), np.diag(std)) 69 | # y = np.dot(sigma_sqrt, vt) 70 | 71 | sigma = np.diag(s) 72 | x = np.dot(u, sigma).T 73 | y = vt 74 | return x, y, vt 75 | 76 | 77 | def impute(A, a, known_indices, rank=None): 78 | """Imputes the missing entries of a vector a, given a fully observed matrix A of which a forms a new row. 79 | 80 | Args: 81 | A (np.ndarray): Fully observed matrix. 82 | a (np.ndarray): 1xn partially observed array. 83 | known_indices (np.array): Array of observed entries; from the set {1,...,n} 84 | rank (int): Approximate rank of A. 85 | Returns: 86 | np.ndarray: 1xn imputed array. 87 | """ 88 | rank = rank or len(known_indices) 89 | x, y, _ = pca(A, rank=rank) 90 | # find x using matrix division using known portion of a, corresponding columns of A 91 | x = np.linalg.lstsq(y[:, known_indices].T, a[:, known_indices].T, rcond=None)[0].T 92 | # approximate full a as x*Y 93 | return np.dot(x, y) 94 | 95 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Parent class for all ML models. 3 | """ 4 | 5 | import numpy as np 6 | import util 7 | from scipy.stats import mode 8 | from sklearn.model_selection import StratifiedKFold, train_test_split 9 | 10 | 11 | RANDOM_STATE = 0 12 | 13 | 14 | class Model: 15 | """An object representing a machine learning model. 16 | 17 | Attributes: 18 | p_type (str): Either 'classification' or 'regression'. 19 | algorithm (str): Algorithm type (e.g. 'KNN'). 20 | hyperparameters (dict): Hyperparameters (e.g. {'n_neighbors': 5}). 21 | model (object): A scikit-learn object for the model. 22 | fitted (bool): Whether or not the model has been trained. 23 | verbose (bool): Whether or not to generate print statements when fitting complete. 24 | """ 25 | 26 | def __init__(self, p_type, algorithm, hyperparameters={}, verbose=False, index=None): 27 | self.p_type = p_type 28 | self.algorithm = algorithm 29 | self.hyperparameters = hyperparameters 30 | self.model = self.instantiate() 31 | self.cv_error = np.nan 32 | self.cv_predictions = None 33 | self.sampled = False 34 | self.fitted = False 35 | self.verbose = verbose 36 | self.index = index 37 | 38 | def instantiate(self): 39 | """Creates a scikit-learn object of specified algorithm type and with specified hyperparameters. 40 | 41 | Returns: 42 | object: A scikit-learn object. 43 | """ 44 | if self.algorithm == None or self.algorithm.lower() == 'greedy': 45 | return None 46 | try: 47 | return getattr(util, self.algorithm)(random_state=RANDOM_STATE, **self.hyperparameters) 48 | except TypeError: 49 | return getattr(util, self.algorithm)(**self.hyperparameters) 50 | 51 | def fit(self, x_train, y_train, runtime_limit=None): 52 | """Fits the model on training data. Note that this function is only used once a model has been identified as a 53 | model to be included in the final ensemble. 
54 |
55 | Args:
56 | x_train (np.ndarray): Features of the training dataset.
57 | y_train (np.ndarray): Labels of the training dataset.
58 | runtime_limit (float): Maximum amount of time to allocate to fitting.
59 | """
60 | self.model.fit(x_train, y_train)
61 | self.fitted = True
62 | if self.verbose:
63 | print("{} {} complete.".format(self.algorithm, self.hyperparameters))
64 |
65 | def predict(self, x_test):
66 | """Predicts labels on a new dataset.
67 |
68 | Args:
69 | x_test (np.ndarray): Features of the test dataset.
70 |
71 | Returns:
72 | np.array: Predicted labels of the test dataset.
73 | """
74 | return self.model.predict(x_test)
75 |
76 | def kfold_fit_validate(self, x_train, y_train, n_folds, random_state=None):
77 | """Performs k-fold cross validation on a training dataset. Note that this is the function used to fill entries
78 | of the error matrix.
79 |
80 | Args:
81 | x_train (np.ndarray): Features of the training dataset.
82 | y_train (np.ndarray): Labels of the training dataset.
83 | n_folds (int): Number of folds to use for cross validation.
84 |
85 | Returns:
86 | np.ndarray: Cross-validation error on each of the k folds.
87 | np.ndarray: Predictions on the training dataset from cross validation.
88 | """
89 | y_predicted = np.empty(y_train.shape)
90 | cv_errors = np.empty(n_folds)
91 | kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state)
92 |
93 | for i, (train_idx, test_idx) in enumerate(kf.split(x_train, y_train)):
94 | x_tr = x_train[train_idx, :]
95 | y_tr = y_train[train_idx]
96 | x_te = x_train[test_idx, :]
97 | y_te = y_train[test_idx]
98 |
99 | model = self.instantiate()
100 | if len(np.unique(y_tr)) > 1:
101 | model.fit(x_tr, y_tr)
102 | y_predicted[test_idx] = model.predict(x_te)
103 | else:
104 | y_predicted[test_idx] = y_tr[0]  # degenerate fold with a single class: predict that class
105 | cv_errors[i] = self.error(y_te, y_predicted[test_idx])
106 |
107 | self.cv_error = cv_errors.mean()
108 | self.cv_predictions = y_predicted
109 | self.sampled = True
110 | if self.verbose:
111 | print("{} {} complete.".format(self.algorithm, self.hyperparameters))
112 |
113 | return cv_errors, y_predicted
114 |
115 | def kfold_fit_validate_testing(self, x_train, y_train, n_folds, random_state=None):
116 | """Performs k-fold cross validation on a training dataset, with fitting on a portion of the training fold and testing on the test fold.
117 |
118 | Args:
119 | x_train (np.ndarray): Features of the training dataset.
120 | y_train (np.ndarray): Labels of the training dataset.
121 | n_folds (int): Number of folds to use for cross validation.
122 |
123 | Returns:
124 | np.ndarray: Cross-validation error on each of the k folds.
125 | np.ndarray: Predictions on the training dataset from cross validation.
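    Unlike kfold_fit_validate, each training fold is further split with train_test_split so the model is fit on only 85% of it; the remaining 15% is held out as a validation set.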
126 | """ 127 | y_predicted = np.empty(y_train.shape) 128 | cv_errors = np.empty(n_folds) 129 | kf = StratifiedKFold(n_folds, shuffle=True, random_state=random_state) 130 | 131 | for i, (train_idx, test_idx) in enumerate(kf.split(x_train, y_train)): 132 | x_tr_val = x_train[train_idx, :] 133 | y_tr_val = y_train[train_idx] 134 | x_te = x_train[test_idx, :] 135 | y_te = y_train[test_idx] 136 | # split data into training and validation sets 137 | try: 138 | x_tr, x_va, y_tr, y_va = train_test_split(x_tr_val, y_tr_val, test_size=0.15, stratify=y_tr_val, random_state=random_state) 139 | except ValueError: 140 | x_tr, x_va, y_tr, y_va = train_test_split(x_tr_val, y_tr_val, test_size=0.15, random_state=random_state) 141 | 142 | model = self.instantiate() 143 | if len(np.unique(y_tr)) > 1: 144 | model.fit(x_tr, y_tr) 145 | y_predicted[test_idx] = model.predict(x_te) 146 | else: 147 | y_predicted[test_idx] = y_tr[0] 148 | cv_errors[i] = self.error(y_te, y_predicted[test_idx]) 149 | 150 | self.cv_error = cv_errors.mean() 151 | self.cv_predictions = y_predicted 152 | self.sampled = True 153 | if self.verbose: 154 | print("{} {} complete.".format(self.algorithm, self.hyperparameters)) 155 | 156 | return cv_errors, y_predicted 157 | 158 | 159 | def error(self, y_true, y_predicted): 160 | """Compute error metric for the model. 161 | 162 | Args: 163 | y_true (np.ndarray): Observed labels. 164 | y_predicted (np.ndarray): Predicted labels. 165 | Returns: 166 | float: Error metric 167 | """ 168 | return util.error(y_true, y_predicted, self.p_type) 169 | 170 | 171 | class Ensemble(Model): 172 | """An object representing an ensemble of machine learning models. 173 | 174 | Attributes: 175 | p_type (str): Either 'classification' or 'regression'. 176 | algorithm (str): Algorithm type (e.g. 'Logit'). 177 | hyperparameters (dict): Hyperparameters (e.g. {'C': 1.0}). 178 | model (object): A scikit-learn object for the model. 179 | """ 180 | 181 | def __init__(self, p_type, algorithm, hyperparameters={}): 182 | super().__init__(p_type, algorithm, hyperparameters) 183 | self.candidate_learners = [] 184 | self.base_learners = [] 185 | self.second_layer_features = None 186 | 187 | def select_base_learners(self, y_train, fitted_base_learners): 188 | """Select base learners from candidate learners based on ensembling algorithm. 189 | """ 190 | cv_errors = np.array([m.cv_error for m in self.candidate_learners]) 191 | # greedy ensemble forward selection 192 | assert self.algorithm in {'greedy', 'stacking'}, "The ensemble selection method must be either greedy forward selection (by Caruana et al.) or stacking." 
193 | if self.algorithm == 'greedy':
194 | x_tr = ()
195 | # initial number of models in ensemble
196 | n_initial = 3
197 | for i in np.argsort(cv_errors)[:n_initial]:
198 | x_tr += (self.candidate_learners[i].cv_predictions.reshape(-1, 1), )
199 | if fitted_base_learners is None:
200 | pre_fitted = None
201 | else:
202 | pre_fitted = fitted_base_learners[self.candidate_learners[i].index]
203 | if pre_fitted is not None:
204 | self.base_learners.append(pre_fitted)
205 | else:
206 | self.base_learners.append(self.candidate_learners[i])
207 |
208 | x_tr = np.hstack(x_tr)
209 | candidates = list(np.argsort(cv_errors))
210 | error = util.error(y_train, mode(x_tr, axis=1)[0], self.p_type)
211 |
212 | while True:
213 | looped = True  # stays True if a full pass adds no model
214 | for idx in candidates:  # candidates in order of increasing CV error
215 | slm = np.hstack((x_tr, self.candidate_learners[idx].cv_predictions.reshape(-1, 1)))
216 | err = util.error(y_train, mode(slm, axis=1)[0], self.p_type)
217 | if err < error:
218 | error = err
219 | x_tr = slm
220 | if fitted_base_learners is None:
221 | pre_fitted = None
222 | else:
223 | pre_fitted = fitted_base_learners[self.candidate_learners[idx].index]
224 | if pre_fitted is not None:
225 | self.base_learners.append(pre_fitted)
226 | else:
227 | self.base_learners.append(self.candidate_learners[idx])
228 | looped = False
229 | break
230 | if looped:
231 | break
232 | self.second_layer_features = x_tr
233 | elif self.algorithm == 'stacking':
234 | self.base_learners = self.candidate_learners
235 | x_tr = [m.cv_predictions.reshape(-1, 1) for m in self.candidate_learners]
236 | self.second_layer_features = np.hstack(tuple(x_tr))
237 |
238 | def fit(self, x_train, y_train, runtime_limit=None, fitted_base_learners=None):
239 | """Add models to the ensemble and fit the ensemble on training data.
240 |
241 | Args:
242 | x_train (np.ndarray): Features of the training dataset.
243 | y_train (np.ndarray): Labels of the training dataset.
244 | fitted_base_learners (list): A list of already fitted models.
245 |
246 | Args to be implemented:
247 | runtime_limit (float): Maximum runtime to be allocated to fitting.
248 | """
249 | self.select_base_learners(y_train, fitted_base_learners)
250 | # TODO: parallelize training over base learners
251 | for model in self.base_learners:
252 | if not model.fitted:
253 | model.fit(x_train, y_train)
254 | if self.algorithm != 'greedy':
255 | self.model.fit(self.second_layer_features, y_train)
256 | self.fitted = True
257 |
258 | def refit(self, x_train, y_train):
259 | """Fit ensemble model on training data with base learners already added and unchanged.
260 |
261 | Args:
262 | x_train (np.ndarray): Features of the training dataset.
263 | y_train (np.ndarray): Labels of the training dataset.
264 |
265 | Args to be implemented:
266 | runtime_limit (float): Maximum runtime to be allocated to fitting.
267 | """
268 | # TODO: parallelize training over base learners
269 | for model in self.base_learners:
270 | if not model.fitted:
271 | model.fit(x_train, y_train)
272 | if self.algorithm == 'stacking':
273 | self.model.fit(self.second_layer_features, y_train)
274 |
275 | def predict(self, x_test):
276 | """Generate predictions of the ensemble model on test data.
277 |
278 | Args:
279 | x_test (np.ndarray): Features of the test dataset.
280 | Returns:
281 | np.array: Predicted labels of the test dataset.
282 | """
283 | assert len(self.base_learners) > 0, "Ensemble size must be greater than zero."
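        # Greedy ensembles predict by majority vote across base-learner
        # columns; stacking instead feeds those columns to the fitted
        # second-layer model.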
284 |
285 | base_learner_predictions = ()
286 | for model in self.base_learners:
287 | y_predicted = np.reshape(model.predict(x_test), [-1, 1])
288 | base_learner_predictions += (y_predicted, )
289 | self.x_te = np.hstack(base_learner_predictions)
290 | if self.algorithm == 'greedy':
291 | return mode(self.x_te, axis=1)[0].reshape((1, -1))
292 | else:
293 | return self.model.predict(self.x_te)
294 |
295 | def get_models(self):
296 | """Get details of the selected machine learning models and the ensemble.
297 | """
298 | base_learner_names = {}
299 | for model in self.base_learners:
300 | if model.algorithm in base_learner_names.keys():
301 | base_learner_names[model.algorithm].append(model.hyperparameters)
302 | else:
303 | base_learner_names[model.algorithm] = [model.hyperparameters]
304 | if self.algorithm == 'greedy':
305 | return {'ensemble method': 'greedy selection', 'base learners': base_learner_names}
306 | elif self.algorithm == 'stacking':
307 | ensemble_learner_name = {}
308 | ensemble_learner_name[self.model.algorithm] = self.model.hyperparameters
309 | return {'ensemble method': 'stacking', 'ensemble learner': ensemble_learner_name, 'base learners': base_learner_names}
310 |
311 | def get_model_accuracy(self, y_test):
312 | """ Get the prediction error of each base learner when the true test labels are provided.
313 |
314 | Args:
315 | y_test (np.array): True labels of the test set.
316 |
317 | Returns:
318 | errors (list): A numerical list of individual model errors (BER for classification, MSE for regression; lower is better) on the test set.
319 | """
320 | errors = []
321 | for col in range(self.x_te.shape[1]):
322 | errors.append(util.error(y_test, self.x_te[:, col], self.p_type))
323 | return errors
324 |
325 |
326 | class Model_collection(Ensemble):
327 | """An object representing a collection of individual machine learning models.
328 |
329 | Attributes:
330 | p_type (str): Either 'classification' or 'regression'.
331 | """
332 | def __init__(self, p_type):
333 | super().__init__(p_type=p_type, algorithm=None, hyperparameters=None)
334 |
335 | def select_base_learners(self):
336 | """ Set individual learners to be all the learners added to the collection.
337 | """
338 | self.base_learners = self.candidate_learners
339 |
340 | def fit(self, x_train, y_train, runtime_limit=None, fitted_base_learners=None):
341 | """ Fit individual learners in the model collection on the training dataset.
342 |
343 | Args:
344 | x_train (np.ndarray): Features of the training dataset.
345 | y_train (np.ndarray): Labels of the training dataset.
346 | """
347 | self.select_base_learners()
348 | super().refit(x_train=x_train, y_train=y_train)
349 | self.fitted = True
350 |
351 | def predict(self, x_test):
352 | """Generate predictions of the individual learners on test data.
353 |
354 | Args:
355 | x_test (np.ndarray): Features of the test dataset.
356 |
357 | Returns:
358 | np.ndarray: A 2-dimensional array containing predicted labels of the test dataset. Each column corresponds to the predictions of one single base learner.
359 | """
360 | assert len(self.base_learners) > 0, "Ensemble size must be greater than zero."
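        # Usage sketch (illustrative names): after collection.fit(x_train, y_train),
        #     preds = collection.predict(x_test)          # shape (n_samples, n_learners)
        #     errors = collection.get_model_accuracy(y_test)
        # gives one prediction column and one error value per fitted learner.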
361 |
362 | base_learner_predictions = ()
363 | for model in self.base_learners:
364 | y_predicted = np.reshape(model.predict(x_test), [-1, 1])
365 | base_learner_predictions += (y_predicted, )
366 | # concatenation of predictions of each base learner
367 | self.x_te = np.hstack(base_learner_predictions)
368 | return self.x_te
369 |
370 | def get_models(self):
371 | """Get details of the selected machine learning models in the collection.
372 | """
373 | base_learner_names = {}
374 | for model in self.base_learners:
375 | if model.algorithm in base_learner_names.keys():
376 | base_learner_names[model.algorithm].append(model.hyperparameters)
377 | else:
378 | base_learner_names[model.algorithm] = [model.hyperparameters]
379 | return base_learner_names
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
--------------------------------------------------------------------------------
/AutoML/oboe/automl/preprocessing.py:
--------------------------------------------------------------------------------
1 | """
2 | Pre-process datasets.
3 | """
4 |
5 | import numpy as np
6 | from sklearn.preprocessing import scale
7 | from sklearn.preprocessing import OneHotEncoder
8 | from sklearn.preprocessing import Imputer
9 |
10 |
11 | def pre_process(raw_data, categorical, impute=True, standardize=True, one_hot_encode=True):
12 | """
13 | Pre-process one dataset.
14 |
15 | Args:
16 | raw_data (np.ndarray): raw features of the n-by-d dataset, without indices and headings.
17 | categorical (list): a boolean list of length d indicating whether each raw feature is categorical.
18 | impute (bool): whether to impute missing entries or not.
19 | standardize (bool): whether to standardize each feature or not.
20 | one_hot_encode (bool): whether to use one hot encoding to pre-process categorical features or not.
21 | Returns:
22 | np.ndarray, np.ndarray: the pre-processed dataset, and a boolean mask indicating which of its columns are categorical.
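    For example (illustrative shapes): given raw_data of shape (100, 4) with
    categorical = [True, False, False, True], the two categorical columns are
    imputed with their most frequent value, moved to the front, and one-hot
    encoded; the numeric columns are imputed with their mean; all resulting
    columns are then standardized.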
23 | """ 24 | # list of pre-processed arrays (sub-portions of dataset) 25 | processed = [] 26 | 27 | # whether to impute missing entries 28 | if impute: 29 | # if there are any categorical features 30 | if np.array(categorical).any(): 31 | raw_categorical = raw_data[:, categorical] 32 | # impute missing entries in categorical features using the most frequent number 33 | imp_categorical = Imputer(missing_values='NaN', strategy='most_frequent', axis=0, copy=False) 34 | processed.append(imp_categorical.fit_transform(raw_categorical)) 35 | 36 | # if there are any numeric features 37 | if np.invert(categorical).any(): 38 | raw_numeric = raw_data[:, np.invert(categorical)] 39 | # impute missing entries in non-categorical features using mean 40 | imp_numeric = Imputer(missing_values='NaN', strategy='mean', axis=0, copy=False) 41 | processed.append(imp_numeric.fit_transform(raw_numeric)) 42 | 43 | # data has now been re-ordered so all categorical features appear first 44 | categorical = np.array(sorted(categorical, reverse=True)) 45 | processed_data = np.hstack(tuple(processed)) 46 | 47 | else: 48 | processed_data = raw_data 49 | 50 | # one-hot encoding for categorical features (only if there exist any) 51 | if one_hot_encode and np.array(categorical).any(): 52 | encoder = OneHotEncoder(categorical_features=categorical) 53 | processed_data = encoder.fit_transform(processed_data).toarray() 54 | categorical = np.zeros(processed_data.shape[1], dtype=bool) 55 | 56 | # standardize all numeric and one-hot encoded categorical features 57 | if standardize: 58 | processed_data[:, np.invert(categorical)] = scale(processed_data[:, np.invert(categorical)]) 59 | 60 | print('Data pre-processing finished') 61 | return processed_data, categorical 62 | -------------------------------------------------------------------------------- /AutoML/oboe/automl/util.py: -------------------------------------------------------------------------------- 1 | #requires a log file in the folder that contains csv files. 2 | 3 | """ 4 | Miscellaneous helper functions. 
5 | """ 6 | 7 | import inspect 8 | import itertools 9 | import json 10 | import numpy as np 11 | import os 12 | import pandas as pd 13 | import pkg_resources 14 | import re 15 | import sys 16 | import glob 17 | from math import isclose 18 | from sklearn.metrics import mean_squared_error 19 | 20 | # Classification algorithms 21 | from sklearn.neighbors import KNeighborsClassifier as KNN 22 | from sklearn.tree import DecisionTreeClassifier as DT 23 | from sklearn.ensemble import RandomForestClassifier as RF 24 | from sklearn.ensemble import ExtraTreesClassifier as ExtraTrees 25 | from sklearn.ensemble import GradientBoostingClassifier as GBT 26 | from sklearn.ensemble import AdaBoostClassifier as AB 27 | from sklearn.svm import LinearSVC as lSVM 28 | from sklearn.svm import SVC as kSVM 29 | from sklearn.linear_model import LogisticRegression as Logit 30 | from sklearn.linear_model import Perceptron 31 | from sklearn.naive_bayes import GaussianNB as GNB 32 | from sklearn.neural_network import MLPClassifier as MLP 33 | 34 | # Regression algorithms 35 | from sklearn.linear_model import Lasso 36 | from sklearn.linear_model import Ridge 37 | from sklearn.linear_model import ElasticNet 38 | # TODO: include more regression algorithms 39 | 40 | 41 | defaults_path = pkg_resources.resource_filename(__name__, 'defaults') 42 | with open(os.path.join(defaults_path, 'classification.json'), 'r') as f: 43 | CLS = json.load(f) 44 | with open(os.path.join(defaults_path, 'regression.json'), 'r') as f: 45 | REG = json.load(f) 46 | 47 | ALGORITHMS_C = dict(zip(CLS['algorithms'], list(map(lambda name: eval(name), CLS['algorithms'])))) 48 | ALGORITHMS_R = dict(zip(REG['algorithms'], list(map(lambda name: eval(name), REG['algorithms'])))) 49 | 50 | DEFAULTS = {'algorithms': {'classification': ALGORITHMS_C, 'regression': ALGORITHMS_R}, 51 | 'hyperparameters': {'classification': CLS['hyperparameters'], 'regression': REG['hyperparameters']}} 52 | 53 | 54 | def extract_columns(df, algorithms=None, hyperparameters=None): 55 | """ 56 | Extract certain columns of the error matrix. 57 | 58 | Args: 59 | error_matrix (DataFrame): The error matrix to be extracted. 60 | algorithms (string or list): One or a list of algorithms as search space. 61 | 62 | Args to be implemented: 63 | hyperparameters (list): A list of hyperparameters as search space. 64 | 65 | Returns: 66 | DataFrame: A DataFrame consisting of corresponding columns. 67 | """ 68 | assert algorithms is not None or hyperparameters is not None, \ 69 | "At least one of the 'algorithms' and 'hyperparameters' need to be specified!" 70 | sampled_columns = [] 71 | for item in list(df): 72 | to_sample_this_column = False 73 | if algorithms is None: 74 | to_sample_this_column = True 75 | elif eval(item)['algorithm'] in algorithms: 76 | if hyperparameters is None: 77 | to_sample_this_column = True 78 | else: 79 | to_sample_this_column = True 80 | hyperparameter_column = eval(item)['algorithm'] 81 | hyperparameter_allowed = hyperparameters[eval(item)['algorithm']] 82 | for key in hyperparameter_column: 83 | if not key in hyperparameter_allowed.keys(): 84 | continue 85 | else: 86 | if hyperparameter_column[key] in hyperparameter_allowed[key]: 87 | continue 88 | else: 89 | to_sample_this_column = False 90 | break 91 | if to_sample_this_column == True: 92 | sampled_columns.append(item) 93 | return df[sampled_columns] 94 | 95 | def extract_column_names(df, algorithms=None, hyperparameters=None): 96 | """ 97 | Extract names of certain columns of the error matrix. 
98 |
99 | Args:
100 | df (DataFrame): The error matrix to be extracted.
101 | algorithms (string or list): One or a list of algorithms as search space.
102 |
103 | Args to be implemented:
104 | hyperparameters (list): A list of hyperparameters as search space.
105 |
106 | Returns:
107 | list: A list of column names.
108 | """
109 | return list(extract_columns(df, algorithms=algorithms, hyperparameters=hyperparameters))
110 |
111 | def error(y_true, y_predicted, p_type):
112 | """Compute error metric for the model; varies based on classification/regression and algorithm type.
113 | BER (Balanced Error Rate): For classification. The mean over classes c of
114 | 1 - 0.5*(TP_c/(TP_c + FN_c)) - 0.5*(TN_c/(TN_c + FP_c)); e.g. always predicting one of two classes gives BER 0.5.
115 | MSE (Mean Squared Error): For regression. 1/n * sum((y_pred - y_obs)^2).
116 |
117 | Args:
118 | y_true (np.ndarray): Observed labels.
119 | y_predicted (np.ndarray): Predicted labels.
120 | p_type (str): Type of problem. One of {'classification', 'regression'}
121 | Returns:
122 | float: Error metric.
123 | """
124 |
125 | assert p_type in {'classification', 'regression'}, "Please specify a valid type."
126 | y_true = np.squeeze(y_true)
127 | y_predicted = np.squeeze(y_predicted)
128 |
129 | if p_type == 'classification':
130 | errors = []
131 | epsilon = 1e-15
132 | for i in np.unique(y_true):
133 | tp = ((y_true == i) & (y_predicted == i)).sum()
134 | tn = ((y_true != i) & (y_predicted != i)).sum()
135 | fp = ((y_true != i) & (y_predicted == i)).sum()
136 | fn = ((y_true == i) & (y_predicted != i)).sum()
137 | errors.append(1 - 0.5*(tp / np.maximum(tp + fn, epsilon)) - 0.5*(tn / np.maximum(tn + fp, epsilon)))
138 | return np.mean(errors)
139 |
140 | elif p_type == 'regression':
141 | return mean_squared_error(y_true, y_predicted)
142 |
143 |
144 | def invalid_args(func, arglist):
145 | """Check if args is a valid list of arguments to be passed to the function func.
146 |
147 | Args:
148 | func (function): Function to check arguments for
149 | arglist (list): Proposed arguments
150 | Returns:
151 | set: Set of arguments in args that are invalid (returns empty set if there are none).
152 | """
153 | args = inspect.getfullargspec(func)[0]
154 | return set(arglist) - set(args)
155 |
156 |
157 | def check_arguments(p_type, algorithms, hyperparameters, defaults=DEFAULTS):
158 | """Check if arguments to constructor of AutoLearner object are valid, and default error matrix can be used.
159 |
160 | Args:
161 | p_type (str): Problem type. One of {'classification', 'regression'}
162 | algorithms (list): List of selected algorithms as strings (e.g. ['KNN', 'lSVM', 'kSVM']).
163 | hyperparameters (dict): Nested dict of selected hyperparameters.
164 | defaults (dict): Nested dict of default algorithms & hyperparameters.
165 | Returns:
166 | tuple: (compatible_columns, new_columns) - the settings already present in the default error matrix, and the settings that require generating new columns.
167 | """
168 | # check if valid problem type
169 | assert p_type.lower() in ['classification', 'regression'], "Please specify a valid type."
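# For example (illustrative call): check_arguments('classification', ['KNN'], {'KNN': {'n_neighbors': [1, 3]}})
# splits the requested settings into those already covered by the default error matrix and those requiring new columns.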
170 | 171 | # set selected algorithms to default set if not specified 172 | all_algs = list(defaults['algorithms'][p_type].keys()) 173 | if algorithms is None: 174 | algorithms = all_algs 175 | 176 | # check if selected algorithms are a subset of supported algorithms for given problem type 177 | assert set(algorithms).issubset(set(all_algs)), \ 178 | "Unsupported algorithm(s) {}.".format(set(algorithms) - set(all_algs)) 179 | 180 | # set selected hyperparameters to default set if not specified 181 | all_hyp = defaults['hyperparameters'][p_type] 182 | if hyperparameters is None: 183 | hyperparameters = all_hyp 184 | 185 | # check if selected hyperparameters are valid arguments to scikit-learn models 186 | invalid = [invalid_args(defaults['algorithms'][p_type][alg], hyperparameters[alg].keys()) 187 | for alg in hyperparameters.keys()] 188 | for i, args in enumerate(invalid): 189 | assert len(args) == 0, "Unsupported hyperparameter(s) {} for algorithm {}" \ 190 | .format(args, list(hyperparameters.keys())[i]) 191 | 192 | # check if it is necessary to generate new error matrix, i.e. are all hyperparameters in default error matrix 193 | compatible_columns = [] 194 | new_columns = [] 195 | default_settings = generate_settings(defaults['algorithms'][p_type].keys(), defaults['hyperparameters'][p_type]) 196 | for alg in hyperparameters.keys(): 197 | for values in itertools.product(*hyperparameters[alg].values()): 198 | setting = {'algorithm': alg, 'hyperparameters': dict(zip(hyperparameters[alg].keys(), list(values)))} 199 | if setting in default_settings: 200 | compatible_columns.append(setting) 201 | else: 202 | new_columns.append(setting) 203 | return compatible_columns, new_columns 204 | 205 | 206 | def knapsack(weights, values, capacity): 207 | """Solve the knapsack problem; maximize sum_i v[i]*x[i] subject to sum_i w[i]*x[i] <= W and x[i] in {0, 1} 208 | 209 | Args: 210 | weights (np.ndarray): "weights" of each item 211 | values (np.ndarray): "values" of each item 212 | capacity (int): maximum "weight" allowed 213 | Returns: 214 | set: list of selected indices 215 | """ 216 | assert len(weights) == len(values), "Weights & values must have same shape." 217 | assert type(capacity) == int, "Capacity must be an integer." 218 | n = len(weights) 219 | m = np.zeros((n+1, capacity+1)).astype(int) 220 | 221 | for i in range(n+1): 222 | for w in range(capacity+1): 223 | if i == 0 or w == 0: 224 | pass 225 | elif weights[i-1] <= w: 226 | m[i, w] = max(values[i-1] + m[i-1, w-weights[i-1]], m[i-1, w]) 227 | else: 228 | m[i, w] = m[i-1, w] 229 | 230 | def find_selected(j, v): 231 | if j == 0: 232 | return set() 233 | if m[j, v] > m[j-1, v]: 234 | return {j-1}.union(find_selected(j-1, v - weights[j-1])) 235 | else: 236 | return find_selected(j-1, v) 237 | 238 | return find_selected(n, capacity) 239 | 240 | 241 | def check_dataframes(m1, m2): 242 | """Check if 2 dataframes have the same shape and share the same index column. 243 | 244 | Args: 245 | m1 (DataFrame): first dataframe 246 | m2 (DataFrame): second dataframe 247 | Returns: 248 | bool: Whether the conditions are satisfied 249 | """ 250 | assert m1.shape == m2.shape 251 | assert set(m1.index) == set(m2.index) 252 | return True 253 | 254 | 255 | def generate_settings(algorithms, hyperparameters, sort=True): 256 | """Generate column headings of error matrix. 257 | 258 | Args: 259 | algorithms (list): A list of algorithms in strings (e.g. ['KNN', 'RF', 'lSVM']) 260 | hyperparameters (dict): A nested dictionary of hyperparameters. 
First key is algorithm type (str), second key 261 | is hyperparameter name (str); argument to pass to scikit-learn constructor with array 262 | of values 263 | (e.g. {'KNN': {'n_neighbors': np.array([1, 3, 5, 7]), 264 | 'p': np.array([1, 2])}}). 265 | sort (bool): Whether to sort settings in alphabetical order with respect to algorithm name. 266 | Returns: 267 | list: List of nested dictionaries, one entry for each model setting. 268 | (e.g. [{'algorithm': 'KNN', 'hyperparameters': {'n_neighbors': 1, 'p': 1}}, 269 | {'algorithm': 'lSVM', 'hyperparameters': {'C': 1.0}}]) 270 | """ 271 | settings = [] 272 | for alg in algorithms: 273 | hyperparams = hyperparameters[alg] 274 | for values in itertools.product(*hyperparams.values()): 275 | configs = dict(zip(hyperparams.keys(), list(values))) 276 | for key, val in configs.items(): 277 | if isinstance(val, (int, float)): 278 | if isclose(val, round(val)): 279 | configs[key] = int(round(val)) 280 | settings.append({'algorithm': alg, 'hyperparameters': configs}) 281 | if sort: 282 | settings = sorted(settings, key=lambda k: k['algorithm']) 283 | return settings 284 | 285 | 286 | def merge_rows(save_dir): 287 | """Merge rows of error matrix. Creates two CSV files: one error matrix and one runtime matrix. 288 | 289 | Args: 290 | save_dir (str): Directory containing per-dataset CSV files of cross-validation errors & time for each model. 291 | """ 292 | if not os.path.isdir(save_dir): 293 | print('Invalid path.') 294 | return 295 | 296 | # find files to concatenate (all .csv files; may contain previously merged results) 297 | files = [file for file in os.listdir(save_dir) if file.endswith('.csv') and 'sizes' not in file] 298 | em, rm = 'error_matrix.csv', 'runtime_matrix.csv' 299 | headers, ids, error_matrix_rows, runtime_matrix_rows = None, [], (), () 300 | 301 | if (em in files) and (rm in files): 302 | errors = pd.read_csv(os.path.join(save_dir, files.pop(files.index(em))), index_col=0) 303 | runtimes = pd.read_csv(os.path.join(save_dir, files.pop(files.index(rm))), index_col=0) 304 | assert set(errors.index) == set(runtimes.index), "Previous results must share index column." 305 | assert set(list(errors)) == set(list(runtimes)), "Previous results must share headers." 306 | ids += list(errors.index) 307 | headers = list(errors) 308 | error_matrix_rows += (errors.values, ) 309 | runtime_matrix_rows += (runtimes.values, ) 310 | 311 | # concatenate new results 312 | # TODO: only load files corresponding to completed files in log.txt 313 | for file in files: 314 | file_path = os.path.join(save_dir, file) 315 | dataframe = pd.read_csv(file_path, index_col=0) 316 | if headers is None: 317 | headers = list(dataframe) 318 | else: 319 | assert set(headers) == set(list(dataframe)), "All results must share same headers." 
320 | if np.isnan(dataframe.values).any(): 321 | # if values contain NaNs, generation has not yet finished 322 | pass 323 | else: 324 | permutation = [headers.index(h) for h in list(dataframe)] 325 | error_matrix_rows += (np.expand_dims(dataframe.values[0, permutation], 0), ) 326 | runtime_matrix_rows += (np.expand_dims(dataframe.values[1, permutation], 0), ) 327 | ids.append(file.split('.')[0]) 328 | try: 329 | os.mkdir(os.path.join(save_dir, "merged_csv_files")) 330 | except: 331 | pass 332 | os.rename(file_path, os.path.join(save_dir, "merged_csv_files", file)) 333 | # os.remove(file_path) 334 | if len(error_matrix_rows) % 50 == 0: 335 | print('Merging {} files...'.format(len(error_matrix_rows))) 336 | 337 | # get dataset sizes 338 | # openml_datasets = openml.datasets.list_datasets() 339 | # openml_datasets = pd.DataFrame.from_dict(openml_datasets, orient='index') 340 | # dataset_sizes = openml_datasets[['NumberOfInstances', 'NumberOfFeatures']] 341 | 342 | # #find the log file 343 | # for f in glob.glob('{}/log*.txt'.format(save_dir)): 344 | # log_path = f 345 | # # save dataset sizes 346 | # with open(log_path, 'r') as file: 347 | # lines = file.readlines() 348 | # dataset_ids, sizes = [], [] 349 | # for line in lines: 350 | # if 'Size' in line: 351 | # log_ids = [int(n) for n in re.findall(r'ID=(\d+)', line)] 352 | # size = [eval(n) for n in re.findall(r'Size=\((\d+, \d+)\)', line)] 353 | # if len(log_ids) == 1 and len(size) == 1: 354 | # dataset_ids.append(log_ids[0]) 355 | # sizes.append(size[0]) 356 | 357 | # save results 358 | pd.DataFrame(np.vstack(error_matrix_rows), index=ids, columns=headers).to_csv(os.path.join(save_dir, em)) 359 | pd.DataFrame(np.vstack(runtime_matrix_rows), index=ids, columns=headers).to_csv(os.path.join(save_dir, rm)) 360 | # pd.DataFrame(np.vstack(sizes), index=dataset_ids).to_csv(os.path.join(save_dir, 'dataset_sizes.csv')) 361 | # dataset_sizes.to_csv(os.path.join(save_dir, 'dataset_sizes.csv')) 362 | 363 | 364 | if __name__ == '__main__': 365 | merge_rows(sys.argv[1]) 366 | -------------------------------------------------------------------------------- /AutoML/oboe/examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | Examples of how to use the Oboe system. 3 | 4 | 1. `error_matrix_generation` 5 | 6 | This directory contains an example of the offline error matrix generation. 7 | 8 | 2. `classification` 9 | 10 | This Jupyter notebook contains examples of the online AutoML fitting and prediction. 11 | -------------------------------------------------------------------------------- /AutoML/oboe/examples/classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This is a classification example to show how to use Oboe for training and testing, in the context of AutoML, i.e., do model selection on the training set and then evaluate the performance of the selected model on the test set." 
8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# necessary modules\n", 17 | "import sys\n", 18 | "import pandas as pd\n", 19 | "import os\n", 20 | "import time\n", 21 | "import numpy as np\n", 22 | "import multiprocessing\n", 23 | "\n", 24 | "#Oboe modules; this will be simplified when Oboe becomes pip installable\n", 25 | "automl_path = '../automl/'\n", 26 | "sys.path.append(automl_path)\n", 27 | "from auto_learner import AutoLearner\n", 28 | "import util\n", 29 | "\n", 30 | "#import scikit-learn modules\n", 31 | "from sklearn.datasets import load_iris\n", 32 | "from sklearn.model_selection import train_test_split\n", 33 | "from sklearn.metrics import accuracy_score\n", 34 | "\n", 35 | "# disable warnings\n", 36 | "import warnings\n", 37 | "warnings.filterwarnings('ignore')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 2, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "#load and split dataset into training and test folds\n", 47 | "data = load_iris()\n", 48 | "x = np.array(data['data'])\n", 49 | "y = np.array(data['target'])\n", 50 | "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "# Example 1: a no-brainer use" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 11, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# initialize the autolearner class\n", 67 | "m = AutoLearner(p_type='classification', runtime_limit=10, verbose=False)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 12, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# fit autolearner on training set and record runtime\n", 77 | "start = time.time()\n", 78 | "m.fit(x_train, y_train)\n", 79 | "elapsed_time = time.time() - start" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 13, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "prediction error: 0.05087719298245613\n", 92 | "elapsed time: 7.21905517578125\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# use the fitted autolearner for prediction on test set\n", 98 | "y_predicted = m.predict(x_test)\n", 99 | "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification'))) \n", 100 | "print(\"elapsed time: {}\".format(elapsed_time))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 14, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "{'ensemble method': 'greedy selection',\n", 112 | " 'base learners': {'kSVM': [{'C': 0.25, 'kernel': 'poly', 'coef0': 10}],\n", 113 | " 'AB': [{'n_estimators': 100, 'learning_rate': 3}],\n", 114 | " 'GBT': [{'learning_rate': 0.001, 'max_depth': 3, 'max_features': 'log2'}]}}" 115 | ] 116 | }, 117 | "execution_count": 14, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "# get names of the selected machine learning models\n", 124 | "m.get_models()" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "# Example 2: build an ensemble of models" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 15, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "#experimental settings\n", 141 | "VERBOSE = False 
#whether to print out information indicating current fitting progress\n", 142 | "N_CORES = 1 #number of cores\n", 143 | "RUNTIME_BUDGET = 15" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 16, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "#optional: limit the types of algorithms\n", 153 | "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 17, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "#autolearner arguments\n", 163 | "autolearner_kwargs = {\n", 164 | " 'p_type': 'classification',\n", 165 | " 'runtime_limit': RUNTIME_BUDGET,\n", 166 | " 'verbose': VERBOSE,\n", 167 | " 'selection_method': 'min_variance',\n", 168 | " 'algorithms': s,\n", 169 | " 'stacking_alg': 'greedy',\n", 170 | " 'n_cores': N_CORES,\n", 171 | " 'build_ensemble': True,\n", 172 | "}" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 18, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "#intialize the autolearner class\n", 182 | "m = AutoLearner(**autolearner_kwargs)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 19, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# fit autolearner on training set and record runtime\n", 192 | "start = time.time()\n", 193 | "m.fit(x_train, y_train)\n", 194 | "elapsed_time = time.time() - start" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 20, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "prediction error: 0.025438596491228094\n", 207 | "elapsed time: 7.217111110687256\n", 208 | "individual accuracies of selected models: [0.025438596491228094, 0.025438596491228094, 0.05087719298245613]\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "# use the fitted autolearner for prediction on test set\n", 214 | "y_predicted = m.predict(x_test)\n", 215 | "print(\"prediction error: {}\".format(util.error(y_test, y_predicted, 'classification')))\n", 216 | "print(\"elapsed time: {}\".format(elapsed_time))\n", 217 | "print(\"individual accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 21, 223 | "metadata": { 224 | "scrolled": true 225 | }, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "{'ensemble method': 'greedy selection',\n", 231 | " 'base learners': {'KNN': [{'n_neighbors': 11, 'p': 1},\n", 232 | " {'n_neighbors': 13, 'p': 1},\n", 233 | " {'n_neighbors': 11, 'p': 2}]}}" 234 | ] 235 | }, 236 | "execution_count": 21, 237 | "metadata": {}, 238 | "output_type": "execute_result" 239 | } 240 | ], 241 | "source": [ 242 | "# get names of the selected machine learning models\n", 243 | "m.get_models()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "# Example 3: just select a collection of promising models without building an ensemble afterwards" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 22, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "#experimental settings\n", 260 | "VERBOSE = False #whether to print out information indicating current fitting progress\n", 261 | "N_CORES = 1 #number of cores\n", 262 | "RUNTIME_BUDGET = 15" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 23, 268 | "metadata": 
{}, 269 | "outputs": [], 270 | "source": [ 271 | "#optional: limit the types of algorithms\n", 272 | "s = ['AB', 'ExtraTrees', 'GNB', 'KNN', 'RF', 'DT']" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 24, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "#autolearner arguments\n", 282 | "autolearner_kwargs = {\n", 283 | " 'p_type': 'classification',\n", 284 | " 'runtime_limit': RUNTIME_BUDGET,\n", 285 | " 'verbose': VERBOSE,\n", 286 | " 'selection_method': 'min_variance',\n", 287 | " 'algorithms': s,\n", 288 | " 'stacking_alg': 'greedy',\n", 289 | " 'n_cores': N_CORES,\n", 290 | " 'build_ensemble': False,\n", 291 | "}" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 25, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "#intialize the autolearner class\n", 301 | "m = AutoLearner(**autolearner_kwargs)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 26, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "# fit autolearner on training set and record runtime\n", 311 | "start = time.time()\n", 312 | "m.fit(x_train, y_train)\n", 313 | "elapsed_time = time.time() - start" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 27, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "elapsed time: 5.099339008331299\n", 326 | "accuracies of selected models: [0.0, 0.0, 0.0, 0.0, 0.0, 0.025438596491228094, 0.0, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.025438596491228094, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.025438596491228094, 0.0, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.02348484848484848, 0.05087719298245613, 0.0, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.02348484848484848, 0.05087719298245613, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05087719298245613, 0.05087719298245613, 0.025438596491228094, 0.0, 0.025438596491228094, 0.05087719298245613, 0.025438596491228094, 0.025438596491228094, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613, 0.05087719298245613]\n" 327 | ] 328 | } 329 | ], 330 | "source": [ 331 | "# use the fitted autolearner for prediction on test set\n", 332 | "y_predicted = m.predict(x_test)\n", 333 | " \n", 334 | "print(\"elapsed time: {}\".format(elapsed_time))\n", 335 | "print(\"accuracies of selected models: {}\".format(m.get_model_accuracy(y_test)))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": {}, 341 | "source": [ 342 | "Note that we do not have a single accuracy value here if we do not build an ensemble, instead, we just have a collection of fitted models with individual accuracies reported." 
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 28, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "{'KNN': [{'n_neighbors': 1, 'p': 2},\n", 354 | " {'n_neighbors': 1, 'p': 2},\n", 355 | " {'n_neighbors': 1, 'p': 1},\n", 356 | " {'n_neighbors': 3, 'p': 1},\n", 357 | " {'n_neighbors': 3, 'p': 2},\n", 358 | " {'n_neighbors': 5, 'p': 1},\n", 359 | " {'n_neighbors': 5, 'p': 2},\n", 360 | " {'n_neighbors': 7, 'p': 1},\n", 361 | " {'n_neighbors': 7, 'p': 2},\n", 362 | " {'n_neighbors': 9, 'p': 1},\n", 363 | " {'n_neighbors': 9, 'p': 2},\n", 364 | " {'n_neighbors': 11, 'p': 1},\n", 365 | " {'n_neighbors': 11, 'p': 2},\n", 366 | " {'n_neighbors': 13, 'p': 1},\n", 367 | " {'n_neighbors': 13, 'p': 2},\n", 368 | " {'n_neighbors': 15, 'p': 1},\n", 369 | " {'n_neighbors': 15, 'p': 2}],\n", 370 | " 'DT': [{'min_samples_split': 0.0001},\n", 371 | " {'min_samples_split': 1e-05},\n", 372 | " {'min_samples_split': 2},\n", 373 | " {'min_samples_split': 0.001},\n", 374 | " {'min_samples_split': 4},\n", 375 | " {'min_samples_split': 8},\n", 376 | " {'min_samples_split': 0.01},\n", 377 | " {'min_samples_split': 16},\n", 378 | " {'min_samples_split': 64}],\n", 379 | " 'AB': [{'n_estimators': 50, 'learning_rate': 2.5},\n", 380 | " {'n_estimators': 50, 'learning_rate': 2},\n", 381 | " {'n_estimators': 50, 'learning_rate': 3},\n", 382 | " {'n_estimators': 100, 'learning_rate': 2},\n", 383 | " {'n_estimators': 100, 'learning_rate': 2.5},\n", 384 | " {'n_estimators': 100, 'learning_rate': 3}],\n", 385 | " 'GNB': [{}],\n", 386 | " 'ExtraTrees': [{'min_samples_split': 2, 'criterion': 'gini'},\n", 387 | " {'min_samples_split': 2, 'criterion': 'entropy'},\n", 388 | " {'min_samples_split': 4, 'criterion': 'gini'},\n", 389 | " {'min_samples_split': 4, 'criterion': 'entropy'},\n", 390 | " {'min_samples_split': 8, 'criterion': 'gini'},\n", 391 | " {'min_samples_split': 8, 'criterion': 'entropy'},\n", 392 | " {'min_samples_split': 16, 'criterion': 'gini'},\n", 393 | " {'min_samples_split': 16, 'criterion': 'entropy'},\n", 394 | " {'min_samples_split': 0.1, 'criterion': 'gini'},\n", 395 | " {'min_samples_split': 0.1, 'criterion': 'entropy'},\n", 396 | " {'min_samples_split': 0.01, 'criterion': 'gini'},\n", 397 | " {'min_samples_split': 0.01, 'criterion': 'entropy'},\n", 398 | " {'min_samples_split': 0.001, 'criterion': 'gini'},\n", 399 | " {'min_samples_split': 0.001, 'criterion': 'entropy'},\n", 400 | " {'min_samples_split': 0.0001, 'criterion': 'gini'},\n", 401 | " {'min_samples_split': 0.0001, 'criterion': 'entropy'},\n", 402 | " {'min_samples_split': 1e-05, 'criterion': 'gini'},\n", 403 | " {'min_samples_split': 1e-05, 'criterion': 'entropy'}],\n", 404 | " 'RF': [{'min_samples_split': 2, 'criterion': 'gini'},\n", 405 | " {'min_samples_split': 2, 'criterion': 'entropy'},\n", 406 | " {'min_samples_split': 4, 'criterion': 'gini'},\n", 407 | " {'min_samples_split': 4, 'criterion': 'entropy'},\n", 408 | " {'min_samples_split': 8, 'criterion': 'gini'},\n", 409 | " {'min_samples_split': 8, 'criterion': 'entropy'},\n", 410 | " {'min_samples_split': 16, 'criterion': 'gini'},\n", 411 | " {'min_samples_split': 16, 'criterion': 'entropy'},\n", 412 | " {'min_samples_split': 0.01, 'criterion': 'gini'},\n", 413 | " {'min_samples_split': 0.01, 'criterion': 'entropy'},\n", 414 | " {'min_samples_split': 0.001, 'criterion': 'gini'},\n", 415 | " {'min_samples_split': 0.001, 'criterion': 'entropy'},\n", 416 | " {'min_samples_split': 0.0001, 'criterion': 'gini'},\n", 417 
| " {'min_samples_split': 0.0001, 'criterion': 'entropy'},\n", 418 | " {'min_samples_split': 1e-05, 'criterion': 'gini'},\n", 419 | " {'min_samples_split': 1e-05, 'criterion': 'entropy'}]}" 420 | ] 421 | }, 422 | "execution_count": 28, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "# get names of the selected machine learning models\n", 429 | "m.get_models()" 430 | ] 431 | } 432 | ], 433 | "metadata": { 434 | "kernelspec": { 435 | "display_name": "Python 3", 436 | "language": "python", 437 | "name": "python3" 438 | }, 439 | "language_info": { 440 | "codemirror_mode": { 441 | "name": "ipython", 442 | "version": 3 443 | }, 444 | "file_extension": ".py", 445 | "mimetype": "text/x-python", 446 | "name": "python", 447 | "nbconvert_exporter": "python", 448 | "pygments_lexer": "ipython3", 449 | "version": "3.7.3" 450 | } 451 | }, 452 | "nbformat": 4, 453 | "nbformat_minor": 2 454 | } 455 | -------------------------------------------------------------------------------- /AutoML/oboe/examples/error_matrix_generation/README.md: -------------------------------------------------------------------------------- 1 | This is a quick example to show how to generate the error matrix from preprocessed datasets. It works on Unix and Linux but not on OS X for now. 2 | 3 | # Dataset format 4 | 5 | The datasets should be `csv` files. All the columns except the last are features; the last column is the class label. 6 | 7 | # Recording model errors and runtime 8 | Run 9 | ``` 10 | bash generate.sh 11 | ``` 12 | It will create a `results` directory, with a subdirectory named by the start time of the generation procedure and containing results on individual datasets. We call this subdirectory the "csv directory". 13 | # Merging into the error and runtime matrices 14 | First, modify the directory name in angle brackets in `merge.sh`to be the name of the "csv directory". Then do 15 | ``` 16 | bash merge.sh 17 | ``` 18 | It will generate a `error_matrix.csv` and a `runtime_matrix.csv` in the "csv directory", and move the csv files already merged into these matrices into `merged_csv_files`. 19 | -------------------------------------------------------------------------------- /AutoML/oboe/examples/error_matrix_generation/generate.sh: -------------------------------------------------------------------------------- 1 | bash ../../automl/generate_matrix.sh -p classification -m generate -s results -d . 
-j ../../automl/defaults/classification.json -n 5 2 | -------------------------------------------------------------------------------- /AutoML/oboe/examples/error_matrix_generation/merge.sh: -------------------------------------------------------------------------------- 1 | bash ../../automl/generate_matrix.sh -m merge -s results/ 2 | -------------------------------------------------------------------------------- /QR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from scipy.linalg import qr, pinv, solve, norm\n", 10 | "from numpy.random import randn\n", 11 | "from numpy.linalg import lstsq\n", 12 | "import numpy as np" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# generate random data matrix\n", 22 | "n,d = 6,4\n", 23 | "X = randn(n,d)\n", 24 | "\n", 25 | "# optional: give it linearly dependent columns\n", 26 | "# X[:,3] = X[:,2]" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Understanding the pseudoinverse" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# form pseudoinverse\n", 43 | "Xd = pinv(X)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "array([[ 1.00000000e+00, 3.26194267e-16, -2.32599523e-16,\n", 55 | " -1.76538379e-16],\n", 56 | " [ 1.23838370e-16, 1.00000000e+00, -7.15454140e-16,\n", 57 | " 3.20159051e-16],\n", 58 | " [-2.99593137e-16, -2.24746860e-16, 1.00000000e+00,\n", 59 | " 1.57207172e-16],\n", 60 | " [-8.35114587e-17, -2.19207791e-16, 3.92819212e-17,\n", 61 | " 1.00000000e+00]])" 62 | ] 63 | }, 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "# X†X ≈ I_d\n", 71 | "Xd @ X" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "True" 83 | ] 84 | }, 85 | "execution_count": 6, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "np.allclose(Xd @ X, np.identity(4))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "data": { 101 | "text/plain": [ 102 | "array([[ 0.92292723, -0.19207847, -0.02487266, -0.02034622, 0.15734442,\n", 103 | " -0.0919159 ],\n", 104 | " [-0.19207847, 0.29368018, 0.04785243, -0.24610263, 0.05077907,\n", 105 | " -0.32419612],\n", 106 | " [-0.02487266, 0.04785243, 0.93897132, 0.08772047, 0.21549265,\n", 107 | " 0.01623931],\n", 108 | " [-0.02034622, -0.24610263, 0.08772047, 0.82689984, -0.25147931,\n", 109 | " -0.10592118],\n", 110 | " [ 0.15734442, 0.05077907, 0.21549265, -0.25147931, 0.16689241,\n", 111 | " 0.04499583],\n", 112 | " [-0.0919159 , -0.32419612, 0.01623931, -0.10592118, 0.04499583,\n", 113 | " 0.85062902]])" 114 | ] 115 | }, 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "# XX† !≈ I_n\n", 123 | "X @ Xd" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 8, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | 
"data": { 133 | "text/plain": [ 134 | "False" 135 | ] 136 | }, 137 | "execution_count": 8, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "np.allclose(X @ Xd, np.identity(6))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 9, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "Q,R = qr(X)\n", 153 | "Q,R = qr(X, mode='economic')" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "True" 165 | ] 166 | }, 167 | "execution_count": 10, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "np.allclose(X, Q @ R)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 11, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "array([[-0.56201486, 0.63759906, -0.23057605, -0.38388624],\n", 185 | " [-0.14763691, -0.21633913, 0.46831834, 0.07588703],\n", 186 | " [-0.27210148, -0.70101235, -0.21471219, -0.57219968],\n", 187 | " [ 0.66267283, 0.14334266, -0.15547092, -0.58570148],\n", 188 | " [-0.38277792, -0.12482775, -0.06724561, -0.01641728],\n", 189 | " [ 0.05147032, -0.13826577, -0.80790972, 0.41969547]])" 190 | ] 191 | }, 192 | "execution_count": 11, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "Q" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 12, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "array([[ 2.82036938, -0.4006528 , 1.90362682, 0.34118195],\n", 210 | " [ 0. , 2.55806078, -0.13829193, -0.39913027],\n", 211 | " [ 0. , 0. , -3.47705222, 0.39879886],\n", 212 | " [ 0. , 0. , 0. 
, -1.72866855]])" 213 | ] 214 | }, 215 | "execution_count": 12, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "R" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 13, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "True\n" 234 | ] 235 | }, 236 | { 237 | "data": { 238 | "text/plain": [ 239 | "array([[ 1.00000000e+00, -7.06106860e-17, 1.63377044e-17,\n", 240 | " -4.64978195e-17],\n", 241 | " [-7.06106860e-17, 1.00000000e+00, 9.28390587e-17,\n", 242 | " -4.27972210e-17],\n", 243 | " [ 1.63377044e-17, 9.28390587e-17, 1.00000000e+00,\n", 244 | " 3.18741738e-17],\n", 245 | " [-4.64978195e-17, -4.27972210e-17, 3.18741738e-17,\n", 246 | " 1.00000000e+00]])" 247 | ] 248 | }, 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "print(np.allclose(Q.T @ Q, np.identity(Q.shape[1])))\n", 256 | "Q.T @ Q" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 18, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# form data from noisy linear model\n", 266 | "wtrue = randn(d)\n", 267 | "y = X.dot(wtrue) + .01*randn(n)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 26, 273 | "metadata": {}, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "0.0015392303466796875" 279 | ] 280 | }, 281 | "execution_count": 26, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "# solve least squares problem to estimate w\n", 288 | "Q,R = qr(X, mode='economic')\n", 289 | "w = solve(R, Q.T @ y)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 20, 295 | "metadata": {}, 296 | "outputs": [ 297 | { 298 | "data": { 299 | "text/plain": [ 300 | "0.013572977104475286" 301 | ] 302 | }, 303 | "execution_count": 20, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "# how good is our estimate?\n", 310 | "norm(w - wtrue)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 21, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "6.898214932602962e-05" 322 | ] 323 | }, 324 | "execution_count": 21, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "# compute mean square error\n", 331 | "def mse(y,z):\n", 332 | " return sum((y-z)**2)/len(y)\n", 333 | " \n", 334 | "mse(y,X.dot(w))" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 22, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "data": { 344 | "text/plain": [ 345 | "1.025079205553247e-15" 346 | ] 347 | }, 348 | "execution_count": 22, 349 | "metadata": {}, 350 | "output_type": "execute_result" 351 | } 352 | ], 353 | "source": [ 354 | "# we can use the numpy.lstsq call instead\n", 355 | "w_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]\n", 356 | "norm(w_lstsq - w)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "metadata": {}, 362 | "source": [ 363 | "# Compute QR by hand" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 18, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "data": { 373 | "text/plain": [ 374 | "array([[-0.47534812, 0. , 0. , 0. , 0. ,\n", 375 | " 0. ],\n", 376 | " [ 0.02906357, 0. , 0. , 0. , 0. ,\n", 377 | " 0. 
],\n", 378 | " [ 0.00071364, 0. , 0. , 0. , 0. ,\n", 379 | " 0. ],\n", 380 | " [ 0.45358044, 0. , 0. , 0. , 0. ,\n", 381 | " 0. ],\n", 382 | " [-0.26950649, 0. , 0. , 0. , 0. ,\n", 383 | " 0. ],\n", 384 | " [-0.70344154, 0. , 0. , 0. , 0. ,\n", 385 | " 0. ]])" 386 | ] 387 | }, 388 | "execution_count": 18, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "n,d = X.shape \n", 395 | "X0 = X.copy()\n", 396 | "R = np.zeros((n,d))\n", 397 | "Q = np.zeros((n,n))\n", 398 | "\n", 399 | "# first column of Q points in direction of first column of X\n", 400 | "r = norm(X[:,0])\n", 401 | "Q[:,0] = X[:,0]/r\n", 402 | "Q" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 19, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "# ensure Q*R matches X on first column\n", 412 | "R[0,0] = r" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 20, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "text/plain": [ 423 | "array([0., 0., 0., 0., 0., 0.])" 424 | ] 425 | }, 426 | "execution_count": 20, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "# verify Q*R matches X in first column\n", 433 | "(Q@R - X)[:,0]" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 21, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# now delete that part from X; we've covered it already\n", 443 | "X[:,0] -= Q[:,0]*R[0,0]" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 22, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "array([[ True, True, True, True],\n", 455 | " [ True, True, True, True],\n", 456 | " [ True, True, True, True],\n", 457 | " [ True, True, True, True],\n", 458 | " [ True, True, True, True],\n", 459 | " [ True, True, True, True]])" 460 | ] 461 | }, 462 | "execution_count": 22, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "# verify Q*R + X = X0\n", 469 | "np.isclose(Q@R + X, X0)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 23, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "array([[ 3.74544392, -0.69063861, 0.96961492, 0.82682362],\n", 481 | " [ 0. , 0. , 0. , 0. ],\n", 482 | " [ 0. , 0. , 0. , 0. ],\n", 483 | " [ 0. , 0. , 0. , 0. ],\n", 484 | " [ 0. , 0. , 0. , 0. ],\n", 485 | " [ 0. , 0. , 0. , 0. 
]])" 486 | ] 487 | }, 488 | "execution_count": 23, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "# eliminate component of other columns in direction of first column of Q \n", 495 | "for j in range(1,d):\n", 496 | " R[0,j] = Q[:,0].dot(X[:,j])\n", 497 | " X[:,j] -= Q[:,0]*R[0,j]\n", 498 | "R" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 24, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "array([[ True, True, True, True],\n", 510 | " [ True, True, True, True],\n", 511 | " [ True, True, True, True],\n", 512 | " [ True, True, True, True],\n", 513 | " [ True, True, True, True],\n", 514 | " [ True, True, True, True]])" 515 | ] 516 | }, 517 | "execution_count": 24, 518 | "metadata": {}, 519 | "output_type": "execute_result" 520 | } 521 | ], 522 | "source": [ 523 | "# verify Q*R + X = X0\n", 524 | "np.isclose(Q@R + X, X0)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 25, 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "iteration 0 : QR + X = X0? True\n", 537 | "iteration 1 : QR + X = X0? True\n", 538 | "iteration 2 : QR + X = X0? True\n", 539 | "iteration 3 : QR + X = X0? True\n" 540 | ] 541 | } 542 | ], 543 | "source": [ 544 | "# now for all the columns!\n", 545 | "X = X0.copy()\n", 546 | "Q *= 0\n", 547 | "R *= 0\n", 548 | "\n", 549 | "# compute the QR decomposition\n", 550 | "for i in range(d):\n", 551 | " r = norm(X[:,i])\n", 552 | " Q[:,i] = X[:,i]/r\n", 553 | " for j in range(i,d):\n", 554 | " R[i,j] = Q[:,i].dot(X[:,j])\n", 555 | " X[:,j] -= Q[:,i]*R[i,j]\n", 556 | " print(\"iteration\",i,\": QR + X = X0?\", np.isclose(Q@R + X, X0).all())" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 26, 562 | "metadata": {}, 563 | "outputs": [], 564 | "source": [ 565 | "\"\"\"Our very own QR function to compute the economy QR\"\"\"\n", 566 | "def ourQR(X0):\n", 567 | " X = X0.copy()\n", 568 | " n,d = X.shape\n", 569 | " R = np.zeros((n,d))\n", 570 | " Q = np.zeros((n,n))\n", 571 | "\n", 572 | " # compute the QR decomposition\n", 573 | " for i in range(d):\n", 574 | " r = norm(X[:,i])\n", 575 | " Q[:,i] = X[:,i]/r\n", 576 | " for j in range(i,d):\n", 577 | " R[i,j] = Q[:,i].dot(X[:,j])\n", 578 | " X[:,j] -= Q[:,i]*R[i,j]\n", 579 | " return Q,R" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 31, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "# solve least squares problem to estimate w\n", 589 | "Q,R = ourQR(X0)\n", 590 | "w_byhand = solve(R[:d,:d], (Q.T @ y)[:d])" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 32, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "1.3822165187958571e-15" 602 | ] 603 | }, 604 | "execution_count": 32, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "norm(w_byhand - w)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": {}, 617 | "outputs": [], 618 | "source": [] 619 | } 620 | ], 621 | "metadata": { 622 | "@webio": { 623 | "lastCommId": null, 624 | "lastKernelId": null 625 | }, 626 | "kernelspec": { 627 | "display_name": "Python 3", 628 | "language": "python", 629 | "name": "python3" 630 | }, 631 | "language_info": { 632 | "codemirror_mode": { 633 | "name": "ipython", 634 | "version": 
3 635 | }, 636 | "file_extension": ".py", 637 | "mimetype": "text/x-python", 638 | "name": "python", 639 | "nbconvert_exporter": "python", 640 | "pygments_lexer": "ipython3", 641 | "version": "3.8.3" 642 | } 643 | }, 644 | "nbformat": 4, 645 | "nbformat_minor": 4 646 | } 647 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # demos 2 | Demos and tutorials for ORIE 4741 3 | -------------------------------------------------------------------------------- /SVD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from scipy.linalg import svd, norm\n", 11 | "from numpy.random import randn, rand\n", 12 | "np.random.seed(0)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "array([[ 1.76405235, 0.40015721, 0.97873798, 0.97873798],\n", 24 | " [ 1.86755799, -0.97727788, 0.95008842, 0.95008842],\n", 25 | " [-0.10321885, 0.4105985 , 0.14404357, 0.14404357],\n", 26 | " [ 0.76103773, 0.12167502, 0.44386323, 0.44386323],\n", 27 | " [ 1.49407907, -0.20515826, 0.3130677 , 0.3130677 ],\n", 28 | " [-2.55298982, 0.6536186 , 0.8644362 , 0.8644362 ]])" 29 | ] 30 | }, 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "# generate random data matrix\n", 38 | "n,d = 6,4\n", 39 | "X = randn(n,d)\n", 40 | "\n", 41 | "# optional: give it linearly dependent columns\n", 42 | "X[:,3] = X[:,2]\n", 43 | "X" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/plain": [ 54 | "array([0., 0., 0., 0., 0., 0.])" 55 | ] 56 | }, 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "output_type": "execute_result" 60 | } 61 | ], 62 | "source": [ 63 | "# find a vector w in the nullspace of X\n", 64 | "w = np.zeros(d)\n", 65 | "w[2] = -1\n", 66 | "w[3] = 1\n", 67 | "X@w" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "U,S,Vt = svd(X, full_matrices=False)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "data": { 86 | "text/plain": [ 87 | "True" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "np.allclose(U@np.diag(S)@Vt, X)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "array([[ 1.00000000e+00, -7.38697485e-17, 3.23622199e-17,\n", 108 | " -3.99817880e-17],\n", 109 | " [-7.38697485e-17, 1.00000000e+00, -3.81412273e-17,\n", 110 | " 5.93941004e-17],\n", 111 | " [ 3.23622199e-17, -3.81412273e-17, 1.00000000e+00,\n", 112 | " -4.16017539e-17],\n", 113 | " [-3.99817880e-17, 5.93941004e-17, -4.16017539e-17,\n", 114 | " 1.00000000e+00]])" 115 | ] 116 | }, 117 | "execution_count": 6, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "U.T@U" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 
7, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "True" 135 | ] 136 | }, 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "np.allclose(U.T@U, np.identity(d))" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "(6, 4)" 155 | ] 156 | }, 157 | "execution_count": 8, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "U.shape" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 9, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "array([[ 0.80175178, -0.06399596, -0.00612596, 0.30096884, 0.24817008,\n", 175 | " 0.05140258],\n", 176 | " [-0.06399596, 0.9701751 , 0.00504366, 0.0348882 , 0.15022773,\n", 177 | " 0.03207643],\n", 178 | " [-0.00612596, 0.00504366, 0.9944328 , 0.05699362, -0.04603767,\n", 179 | " -0.01027116],\n", 180 | " [ 0.30096884, 0.0348882 , 0.05699362, 0.12011976, 0.0995328 ,\n", 181 | " 0.02713897],\n", 182 | " [ 0.24817008, 0.15022773, -0.04603767, 0.0995328 , 0.15300143,\n", 183 | " -0.18278125],\n", 184 | " [ 0.05140258, 0.03207643, -0.01027116, 0.02713897, -0.18278125,\n", 185 | " 0.96051913]])" 186 | ] 187 | }, 188 | "execution_count": 9, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "U@U.T" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 10, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "(4, 4)" 206 | ] 207 | }, 208 | "execution_count": 10, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "Vt.shape" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 11, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "True" 226 | ] 227 | }, 228 | "execution_count": 11, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "np.allclose(Vt @ Vt.T, np.identity(d))" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 12, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "True" 246 | ] 247 | }, 248 | "execution_count": 12, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "np.allclose(Vt.T @ Vt, np.identity(d))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 13, 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "array([4.15760175e+00, 2.28949949e+00, 1.01350732e+00, 1.48389401e-16])" 266 | ] 267 | }, 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "S" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 14, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "Error of rank 0 approximation: 4.853314053310529\n", 287 | "Error of rank 1 approximation: 2.5037981161489284\n", 288 | "Error of rank 2 approximation: 1.0135073191135213\n", 289 | "Error of rank 3 approximation: 2.0282945925593685e-15\n", 290 | "Error of rank 4 approximation: 2.0404123996834285e-15\n" 291 | ] 292 | 
} 293 | ], 294 | "source": [ 295 | "# if we have a linearly dependent column, \n", 296 | "# decomposition is just as good if we ignore the 0 in sigma and reduce r by 1\n", 297 | "for k in range(d+1):\n", 298 | " print(f\"Error of rank {k} approximation: \", \n", 299 | " np.linalg.norm(X - U[:,:k]@np.diag(S[:k])@(Vt[:k,:])))\n", 300 | " " 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 15, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "array([[0., 0., 0., 0.],\n", 312 | " [0., 0., 0., 0.],\n", 313 | " [0., 0., 0., 0.],\n", 314 | " [0., 0., 0., 0.],\n", 315 | " [0., 0., 0., 0.],\n", 316 | " [0., 0., 0., 0.]])" 317 | ] 318 | }, 319 | "execution_count": 15, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "# what is a rank 0 approximation?\n", 326 | "k = 0\n", 327 | "U[:,:k]@np.diag(S[:k])@(Vt[:k,:])" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 16, 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "array([[ 1.63569576, -0.14072899, 1.04290218, 1.04290218],\n", 339 | " [ 2.03375744, -0.27692435, 0.86700695, 0.86700695],\n", 340 | " [-0.18022995, 0.08607883, 0.18254066, 0.18254066],\n", 341 | " [ 0.71793643, -0.05995099, 0.46540914, 0.46540914],\n", 342 | " [ 1.47764056, -0.27442906, 0.32128515, 0.32128515],\n", 343 | " [-2.51856443, 0.7986849 , 0.84722729, 0.84722729]])" 344 | ] 345 | }, 346 | "execution_count": 16, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "# form rank 2 apx of X by zeroing last two singular values\n", 353 | "S2 = S.copy()\n", 354 | "S2[2:] = 0\n", 355 | "U@np.diag(S2)@Vt" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 17, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "# form data from noisy linear model\n", 365 | "wtrue = randn(d)\n", 366 | "y = X@wtrue + .1*randn(n);" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 18, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "array([ 2.32981892e+00, -1.49417792e+00, -1.55210116e+14, 1.55210116e+14])" 378 | ] 379 | }, 380 | "execution_count": 18, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "# solve least squares problem to estimate w\n", 387 | "w4 = Vt.T@np.diag(S**(-1))@U.T@y\n", 388 | "w4" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 19, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "residual given w4: 0.3224116203732873\n", 401 | "residual given wtrue: 0.3063847433022363\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "# it gives a low norm solution, but definitely not optimal...\n", 407 | "print(\"residual given w4:\", norm(y - X.dot(w4)))\n", 408 | "print(\"residual given wtrue:\", norm(y - X.dot(wtrue)))" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 20, 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "0.7526118592612931" 420 | ] 421 | }, 422 | "execution_count": 20, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "# error in normal equations not zero! 
uh oh!\n", 429 | "norm(X.T@X@w4 - X.T@y)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 21, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "data": { 439 | "text/plain": [ 440 | "array([ 2.32981892, -1.45396574, -0.07338535, -0.07338535])" 441 | ] 442 | }, 443 | "execution_count": 21, 444 | "metadata": {}, 445 | "output_type": "execute_result" 446 | } 447 | ], 448 | "source": [ 449 | "# use rank k approximation to design matrix X\n", 450 | "# k=4 is full rank\n", 451 | "# when design matrix X has rank 3, k=3 gives 0 error approximation\n", 452 | "# while k=2 results in loss of accuracy\n", 453 | "k = 3\n", 454 | "w3 = Vt[:k,:].T@np.diag(S[:k]**(-1))@(U[:,:k]).T@y\n", 455 | "w3" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 22, 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "residual given w3: 0.1943391668277365\n", 468 | "error in normal equations given w3: 2.175583928816829e-15\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "print(\"residual given w3:\", norm(y - X.dot(w3)))\n", 474 | "print(\"error in normal equations given w3:\", norm(X.T@X@w3 - X.T@y))" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 23, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/plain": [ 485 | "1.2560739669470201e-15" 486 | ] 487 | }, 488 | "execution_count": 23, 489 | "metadata": {}, 490 | "output_type": "execute_result" 491 | } 492 | ], 493 | "source": [ 494 | "# add a vector in the nullspace to w3:\n", 495 | "w = w3.copy()\n", 496 | "w[2] += 1\n", 497 | "w[3] -= 1\n", 498 | "norm(X.T@X@w - X.T@y)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "Poll:\n", 506 | "* A) least squares residual norm(y-Xw) will be higher for w than w3\n", 507 | "* B) least squares residual norm(y-Xw) will be lower for w than w3\n", 508 | "* C) least squares residual norm(y-Xw) will be the same for w than w3" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 24, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "residual given w: 0.19433916682773625\n", 521 | "error in normal equations given w: 1.2560739669470201e-15\n" 522 | ] 523 | } 524 | ], 525 | "source": [ 526 | "print(\"residual given w:\", norm(y - X.dot(w)))\n", 527 | "print(\"error in normal equations given w:\", norm(X.T@X@w - X.T@y))" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 25, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "array([ 2.32981892, -1.45396574, 0.92661465, -1.07338535])" 539 | ] 540 | }, 541 | "execution_count": 25, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "w" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "Poll:\n", 555 | "* A) there is one global minimum of least squares\n", 556 | "* B) there are two global minima of least squares\n", 557 | "* C) there are many global minima of least squares\n", 558 | "* D) there are infinitely many global minima of least squares" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 26, 564 | "metadata": {}, 565 | "outputs": [ 566 | { 567 | "data": { 568 | "text/plain": [ 569 | "0.4628663965326436" 570 | ] 571 | }, 572 | "execution_count": 26, 
573 | "metadata": {}, 574 | "output_type": "execute_result" 575 | } 576 | ], 577 | "source": [ 578 | "# how good is our estimate of w?\n", 579 | "norm(w - wtrue) / norm(wtrue)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 28, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "1.0990647210786425e-15" 591 | ] 592 | }, 593 | "execution_count": 28, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "# we can use the numpy.lstsq call instead\n", 600 | "w_lstsq = np.linalg.lstsq(X, y, rcond=None)[0]\n", 601 | "norm(w_lstsq - w3)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [] 610 | } 611 | ], 612 | "metadata": { 613 | "@webio": { 614 | "lastCommId": null, 615 | "lastKernelId": null 616 | }, 617 | "kernelspec": { 618 | "display_name": "Python 3", 619 | "language": "python", 620 | "name": "python3" 621 | }, 622 | "language_info": { 623 | "codemirror_mode": { 624 | "name": "ipython", 625 | "version": 3 626 | }, 627 | "file_extension": ".py", 628 | "mimetype": "text/x-python", 629 | "name": "python", 630 | "nbconvert_exporter": "python", 631 | "pygments_lexer": "ipython3", 632 | "version": "3.8.3" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 1 637 | } 638 | -------------------------------------------------------------------------------- /ensembles.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Ensemble Methods \n", 8 | "\n", 9 | "So far we've seen how to construct a single decision tree, now we'll see how to combine multiple trees together into a more powerful ensemble method." 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 21, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "from IPython.display import Image\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "import warnings\n", 22 | "warnings.simplefilter(\"ignore\")\n", 23 | "\n", 24 | "import seaborn as sns\n", 25 | "sns.set(rc={'figure.figsize':(6,6)}) " 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## California Housing Dataset \n", 33 | "\n", 34 | "We'll use the boston housing dataset, the goal of which is to predict house prices in California from scikit-learn." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 22, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "from sklearn.datasets import fetch_california_housing\n", 44 | "data = fetch_california_housing()\n", 45 | "X = data['data']\n", 46 | "Y = data['target']" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 23, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "
\n", 58 | "\n", 71 | "\n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeY
08.325241.06.9841271.023810322.02.55555637.88-122.234.526
18.301421.06.2381370.9718802401.02.10984237.86-122.223.585
27.257452.08.2881361.073446496.02.80226037.85-122.243.521
35.643152.05.8173521.073059558.02.54794537.85-122.253.413
43.846252.06.2818531.081081565.02.18146737.85-122.253.422
\n", 149 | "
" 150 | ], 151 | "text/plain": [ 152 | " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", 153 | "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", 154 | "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", 155 | "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", 156 | "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", 157 | "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", 158 | "\n", 159 | " Longitude Y \n", 160 | "0 -122.23 4.526 \n", 161 | "1 -122.22 3.585 \n", 162 | "2 -122.24 3.521 \n", 163 | "3 -122.25 3.413 \n", 164 | "4 -122.25 3.422 " 165 | ] 166 | }, 167 | "execution_count": 23, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "data_df = (pd.DataFrame(X, columns = data['feature_names'])\n", 174 | " .assign(Y = Y))\n", 175 | "\n", 176 | "data_df.head()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Bagging \n", 184 | "Bagging is the process of generating a set of weak learners by training on random bootstrapped samples of our dataset (i.e. sampling a dataset from our training data with replacement). To show the power of bagging, we can use random trees: these trees use a *random* feature and *random* threshold to generate the split at each node and then predict the most common value at the leaf." 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 24, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "from sklearn.ensemble import ExtraTreesRegressor\n", 194 | "from sklearn.tree import DecisionTreeRegressor\n", 195 | "from sklearn.model_selection import cross_val_score\n", 196 | "\n", 197 | "# Random trees are usually used just in ensemble methods\n", 198 | "# so we have to manually specify we only want one to start\n", 199 | "random_tree = ExtraTreesRegressor(n_estimators = 1)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 25, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "ExtraTreesRegressor?" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "We can see that on its own, the random tree has a mean squared error of:" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 26, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "-0.9024777591605524" 227 | ] 228 | }, 229 | "execution_count": 26, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "cross_val_score(random_tree, X, Y,\n", 236 | " scoring=\"neg_mean_squared_error\", \n", 237 | " cv=3).mean()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": {}, 243 | "source": [ 244 | "We could bag by randomly generating the bootstrap samples ourselves... or we could use scikit-learn's BaggingRegressor or BaggingClassifier! We simply need to specify the number of weak learners." 
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 7,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "from sklearn.ensemble import BaggingRegressor"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": 31,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
263 |     "                                       n_estimators = 10\n",
264 |     "                                      )"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "markdown",
269 |    "metadata": {},
270 |    "source": [
271 |     "Bagging the random trees together leads to a big jump in performance... even though they're random trees!"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 32,
277 |    "metadata": {},
278 |    "outputs": [
279 |     {
280 |      "data": {
281 |       "text/plain": [
282 |        "-0.47476478556319784"
283 |       ]
284 |      },
285 |      "execution_count": 32,
286 |      "metadata": {},
287 |      "output_type": "execute_result"
288 |     }
289 |    ],
290 |    "source": [
291 |     "cross_val_score(bagged_random_trees, X, Y,\n",
292 |     "                scoring=\"neg_mean_squared_error\", \n",
293 |     "                cv=3).mean()"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "markdown",
298 |    "metadata": {},
299 |    "source": [
300 |     "We can also see how the performance changes as we change the number of estimators."
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 33,
306 |    "metadata": {},
307 |    "outputs": [
308 |     {
309 |      "data": {
310 |       "text/plain": [
311 |        "-0.4167362638559448"
312 |       ]
313 |      },
314 |      "execution_count": 33,
315 |      "metadata": {},
316 |      "output_type": "execute_result"
317 |     }
318 |    ],
319 |    "source": [
320 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
321 |     "                                       n_estimators = 100\n",
322 |     "                                      )\n",
323 |     "cross_val_score(bagged_random_trees, X, Y,\n",
324 |     "                scoring=\"neg_mean_squared_error\", \n",
325 |     "                cv=3).mean()"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 34,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "data": {
335 |       "text/plain": [
336 |        "-0.4118286946534265"
337 |       ]
338 |      },
339 |      "execution_count": 34,
340 |      "metadata": {},
341 |      "output_type": "execute_result"
342 |     }
343 |    ],
344 |    "source": [
345 |     "bagged_random_trees = BaggingRegressor(base_estimator = ExtraTreesRegressor(n_estimators = 1),\n",
346 |     "                                       n_estimators = 200\n",
347 |     "                                      )\n",
348 |     "cross_val_score(bagged_random_trees, X, Y,\n",
349 |     "                scoring=\"neg_mean_squared_error\", \n",
350 |     "                cv=3).mean()"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "markdown",
355 |    "metadata": {},
356 |    "source": [
357 |     "Here \n",
358 |     "* increasing from 10 to 100 estimators improves performance a lot!\n",
359 |     "* increasing from 100 to 200 estimators has almost no effect."
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "metadata": {},
365 |    "source": [
366 |     "## Random Forests \n",
367 |     "\n",
368 |     "Random forests are a bagging approach for trees that also randomly restricts the set of features considered at each split (to help decorrelate the trees). Scikit-learn offers a great implementation of random forests.\n",
369 |     "\n",
370 |     "In addition to all the decision tree hyperparameters, random forests also let us choose the number of trees, whether to use bootstrapped samples for each tree, and the max number of features considered at each split."
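,
    "\n",
    "For example, the knobs look like this (a minimal sketch; the values are illustrative, but `n_estimators`, `max_features`, `bootstrap`, and `max_depth` are all real `RandomForestRegressor` arguments):\n",
    "```python\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "\n",
    "forest = RandomForestRegressor(\n",
    "    n_estimators=100,     # number of trees in the forest\n",
    "    max_features='sqrt',  # random subset of features considered at each split\n",
    "    bootstrap=True,       # train each tree on a bootstrap sample\n",
    "    max_depth=None)       # plus the usual decision tree hyperparameters\n",
    "```"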
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": 38,
376 |    "metadata": {},
377 |    "outputs": [
378 |     {
379 |      "data": {
380 |       "text/plain": [
381 |        "-0.46905893268797305"
382 |       ]
383 |      },
384 |      "execution_count": 38,
385 |      "metadata": {},
386 |      "output_type": "execute_result"
387 |     }
388 |    ],
389 |    "source": [
390 |     "from sklearn.ensemble import RandomForestRegressor\n",
391 |     "\n",
392 |     "random_forest = RandomForestRegressor(n_estimators = 100)\n",
393 |     "\n",
394 |     "cross_val_score(random_forest, X, Y,\n",
395 |     "                scoring=\"neg_mean_squared_error\", \n",
396 |     "                cv=3).mean()"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 36,
402 |    "metadata": {},
403 |    "outputs": [],
404 |    "source": [
405 |     "RandomForestRegressor?"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "markdown",
410 |    "metadata": {},
411 |    "source": [
412 |     "## Gradient Boosting \n",
413 |     "Recall that boosting is the process of sequentially training weak learners to create a powerful predictor. In gradient boosting, each subsequent model is going to try to replicate the gradient of the loss function evaluated at the current model (almost mimicking gradient descent!). Let's try walking through a simple example manually."
414 |    ]
415 |   },
416 |   {
417 |    "cell_type": "code",
418 |    "execution_count": 39,
419 |    "metadata": {},
420 |    "outputs": [],
421 |    "source": [
422 |     "# Start by splitting our data into training and testing\n",
423 |     "train_df = data_df.sample(frac=0.8)\n",
424 |     "test_df = data_df[~data_df.index.isin(train_df.index)]\n",
425 |     "\n",
426 |     "X_tr = train_df.drop('Y',axis=1)\n",
427 |     "Y_tr = train_df['Y']\n",
428 |     "\n",
429 |     "X_tst = test_df.drop('Y',axis=1)\n",
430 |     "Y_tst = test_df['Y']"
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "We start by creating our initial predictions, here by fitting a decision tree to our data."
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "code",
442 |    "execution_count": 40,
443 |    "metadata": {},
444 |    "outputs": [
445 |     {
446 |      "name": "stdout",
447 |      "output_type": "stream",
448 |      "text": [
449 |       "Our initial training MSE is  0.4879983243722546\n"
450 |      ]
451 |     }
452 |    ],
453 |    "source": [
454 |     "# Start with our base prediction using a decision tree with only 5 layers\n",
455 |     "from sklearn.tree import DecisionTreeRegressor\n",
456 |     "\n",
457 |     "base_tree = DecisionTreeRegressor(max_depth=5)\n",
458 |     "\n",
459 |     "base_tree.fit(X_tr, Y_tr)\n",
460 |     "\n",
461 |     "# Current MSE\n",
462 |     "print('Our initial training MSE is ', np.mean((base_tree.predict(X_tr) - Y_tr)**2))"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "markdown",
467 |    "metadata": {},
468 |    "source": [
469 |     "Next, we want to compute the gradient so we can construct a training dataset for our second tree. 
Since our objective is mean squared error, our gradient is going to be $\\hat{y} - y$" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 46, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "DecisionTreeRegressor(max_depth=5)" 481 | ] 482 | }, 483 | "execution_count": 46, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "residuals = base_tree.predict(X_tr) - Y_tr\n", 490 | "\n", 491 | "second_tree = DecisionTreeRegressor(max_depth=5)\n", 492 | "second_tree.fit(X_tr, residuals)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "Next we figure out the step size using line search (we'll just manually try gamma values)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 48, 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "The best step size was 1.0 for a new MSE of 0.35254102596712683\n" 512 | ] 513 | } 514 | ], 515 | "source": [ 516 | "best_mse = 99999\n", 517 | "best_gamma = None\n", 518 | "\n", 519 | "for gamma in np.linspace(0, 1, 100):\n", 520 | " mse = np.mean((base_tree.predict(X_tr) - gamma*second_tree.predict(X_tr) - Y_tr)**2)\n", 521 | " if mse < best_mse:\n", 522 | " best_gamma = gamma\n", 523 | " best_mse = mse\n", 524 | "\n", 525 | "print('The best step size was ', best_gamma,' for a new MSE of ', best_mse)" 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": {}, 531 | "source": [ 532 | "We could now continue this process and try to add in a third tree and so on. Instead, let's show how to do this with scikit-learn." 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 17, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "from sklearn.ensemble import GradientBoostingRegressor" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 51, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "GradientBoostingRegressor?" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "The gradient boosted trees implementation allows us to pick a loss function, \n", 558 | "a fixed learning rate, and all the usual decision tree hyperparameters." 
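,
    "\n",
    "In symbols, with loss $L$ the $m$-th stage fits a tree $h_m$ to the gradient of $L$ at the current predictions and updates the model as $F_m(x) = F_{m-1}(x) - \\gamma_m h_m(x)$, where $\\gamma_m$ comes from the line search (or is replaced by a fixed learning rate), exactly as in the manual example above."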
559 |    ]
560 |   },
561 |   {
562 |    "cell_type": "code",
563 |    "execution_count": 53,
564 |    "metadata": {},
565 |    "outputs": [
566 |     {
567 |      "name": "stdout",
568 |      "output_type": "stream",
569 |      "text": [
570 |       "The gradient boosted MSE is  0.16017001437539927\n"
571 |      ]
572 |     }
573 |    ],
574 |    "source": [
575 |     "grad_boost_tree = GradientBoostingRegressor(\n",
576 |     "    loss = 'ls',\n",
577 |     "    learning_rate = 1)\n",
578 |     "\n",
579 |     "grad_boost_tree.fit(X_tr, Y_tr)\n",
580 |     "\n",
581 |     "print('The gradient boosted MSE is ', np.mean((grad_boost_tree.predict(X_tr) - Y_tr)**2))"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "markdown",
586 |    "metadata": {},
587 |    "source": [
588 |     "We can also compare the test set error:"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 54,
594 |    "metadata": {},
595 |    "outputs": [
596 |     {
597 |      "name": "stdout",
598 |      "output_type": "stream",
599 |      "text": [
600 |       "The original tree MSE is  0.5142762445291711\n",
601 |       "The one-step boosted tree MSE is  0.9159771034480831\n",
602 |       "The gradient boosted test MSE is  0.2669416618696991\n"
603 |      ]
604 |     }
605 |    ],
606 |    "source": [
607 |     "print('The original tree MSE is ', np.mean((base_tree.predict(X_tst) - Y_tst)**2))\n",
608 |     "# note: subtract the second tree's prediction, matching the update used in the line search above\n",
609 |     "print('The one-step boosted tree MSE is ', np.mean((base_tree.predict(X_tst) - best_gamma*second_tree.predict(X_tst) - Y_tst)**2))\n",
610 |     "print('The gradient boosted test MSE is ', np.mean((grad_boost_tree.predict(X_tst) - Y_tst)**2))"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {},
617 |    "outputs": [],
618 |    "source": []
619 |   }
620 |  ],
621 |  "metadata": {
622 |   "@webio": {
623 |    "lastCommId": null,
624 |    "lastKernelId": null
625 |   },
626 |   "kernelspec": {
627 |    "display_name": "Python 3 (ipykernel)",
628 |    "language": "python",
629 |    "name": "python3"
630 |   },
631 |   "language_info": {
632 |    "codemirror_mode": {
633 |     "name": "ipython",
634 |     "version": 3
635 |    },
636 |    "file_extension": ".py",
637 |    "mimetype": "text/x-python",
638 |    "name": "python",
639 |    "nbconvert_exporter": "python",
640 |    "pygments_lexer": "ipython3",
641 |    "version": "3.9.7"
642 |   }
643 |  },
644 |  "nbformat": 4,
645 |  "nbformat_minor": 2
646 | }
647 |
-------------------------------------------------------------------------------- /great_embedder.py: --------------------------------------------------------------------------------
1 | # Works in py3.6.6, tf 1.15.0, tensorflow-hub 0.4.0
2 | # Look here for help with installation: https://www.tensorflow.org/hub
3 | # Look here for help with the universal sentence encoder:
4 | # https://towardsdatascience.com/use-cases-of-googles-universal-sentence-encoder-in-production-dd5aaab4fc15
5 | #
6 | # It's possible that this is not the most efficient code . . . 
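#
# Overview: for each batch of `step` rows, the text columns indexed by `inds`
# are embedded with the Universal Sentence Encoder (512 dimensions per text
# column), and the embeddings are written out as one CSV row per listing.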
7 | 
8 | import pandas as pd
9 | import numpy as np
10 | import tensorflow as tf
11 | import tensorflow_hub as hub
12 | import time
13 | 
14 | top_cutoff = 2 #29142
15 | inds = [4, 5, 6, 9, 10, 11, 12, 13, 14]
16 | 
17 | data = pd.read_csv('airbnblala/analysisData.csv')
18 | print(data.shape)
19 | n = data.shape[0]
20 | 
21 | f = open("All_embeddings.csv", 'w')
22 | f2 = open("All_embeddings2.csv", 'w')
23 | 
24 | s = []
25 | for ind in inds:
26 |     for i in range(512):
27 |         s.append(str(ind+1) + ':' + str(i))
28 | f.write(','.join(s) + '\n')
29 | f2.write(','.join([str(ind + 1) for ind in inds]) + '\n')
30 | 
31 | def text(b, t):
32 |     s = []
33 |     for rownum in range(b, t):
34 |         for i in inds:
35 |             s.append(str(data.iloc[rownum, i]))
36 |     print(len(s))
37 |     return s
38 | 
39 | 
40 | module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
41 | 
42 | #with tf.device('/job:localhost/replica:0/task:0/device:XLA_GPU:0'):
43 | embed = hub.Module(module_url)
44 | 
45 | 
46 | #word = "Elephant"
47 | #sentence = "I am a sentence for which I would like to get its embedding."
48 | #paragraph = (
49 | #    "Universal Sentence Encoder embeddings also support short paragraphs. "
50 | #    "There is no hard limit on how long the paragraph is. Roughly, the longer "
51 | #    "the more 'diluted' the embedding will be.")
52 | #messages = [word, sentence, paragraph]
53 | 
54 | 
55 | tf.logging.set_verbosity(tf.logging.ERROR)
56 | 
57 | # session.run([tf.global_variables_initializer(), tf.tables_initializer()])
58 | start = time.time()
59 | step = 1000
60 | 
61 | for rownum in range(0, n, step):
62 |     bot = rownum
63 |     top = min(n-1, rownum + step)
64 |     print(top)
65 |     messages = text(bot, top)
66 |     t1 = time.time()
67 |     with tf.Session() as session:
68 |         session.run([tf.global_variables_initializer(), tf.tables_initializer()])
69 |         message_embeddings = np.array(session.run(embed(messages)))
70 |         print(message_embeddings.shape)
71 |         mb = message_embeddings.reshape(top-bot, len(inds)*512)
72 |         mb2 = message_embeddings.reshape(top - bot, len(inds), 512)
73 |         f.write('\n'.join([','.join([str(val) for val in row]) for row in mb]) + '\n')
74 |         f2.write('\n'.join([','.join([str(arr) for arr in row]) for row in mb2]) + '\n')
75 |     print(time.time() - t1)
76 | 
77 | bot = n-1
78 | top = n
79 | print(top)
80 | messages = text(bot, top)
81 | t1 = time.time()
82 | with tf.Session() as session:
83 |     session.run([tf.global_variables_initializer(), tf.tables_initializer()])
84 |     message_embeddings = np.array(session.run(embed(messages)))
85 |     print(message_embeddings.shape)
86 |     #mb = message_embeddings.reshape(top-bot, len(inds)*512)
87 |     mb2 = message_embeddings.reshape(len(inds), 512)
88 |     bigList = []
89 |     for arr in mb2:
90 |         bigList += [str(val) for val in arr]
91 | 
92 | 
93 | 
94 | # f is still open from the 'w' open above, so just append the last row
95 | f.write(','.join(bigList))
96 | f.close()
97 | 
98 | 
99 | # row = []
100 | # for i, message_embedding in enumerate(np.array(message_embeddings).tolist()):
101 | #    row.append(','.join([str(x) for x in message_embedding]))
102 | # f.write(','.join(row) + '\n')
103 | stop = time.time()
104 | print(stop - start)
105 | 
106 | f2.close()
107 | 
108 | 
109 |
-------------------------------------------------------------------------------- /julia/GitHub Tutorials.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# GitHub Tutorials\n",
8 |     " * [GitHub Hello 
World](https://guides.github.com/activities/hello-world/)\n", 9 | " * [GitHub Desktop Tutorial](https://guides.github.com/introduction/getting-your-project-on-github/)\n", 10 | " * [Git command line (optional)](https://git-scm.com/docs/gittutorial)" 11 | ] 12 | } 13 | ], 14 | "metadata": { 15 | "kernelspec": { 16 | "display_name": "Julia 0.5.0-rc4", 17 | "language": "julia", 18 | "name": "julia-0.5" 19 | }, 20 | "language_info": { 21 | "file_extension": ".jl", 22 | "mimetype": "application/julia", 23 | "name": "julia", 24 | "version": "0.5.0" 25 | } 26 | }, 27 | "nbformat": 4, 28 | "nbformat_minor": 0 29 | } 30 | -------------------------------------------------------------------------------- /julia/Julia Syntax Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Julia Syntax Tutorial\n", 8 | "\n", 9 | "Everything covered today (and **even more**) can be found [here](https://docs.julialang.org/en/v1/). Code in this notebook has been tested to be compatible with Julia 1.0.3 and 1.2.0. \n", 10 | "\n", 11 | "To test a specific part of your code, you may create a new cell, paste the code there and run it." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "The following cell brings the linear algebra package to the main namespace: " 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "using LinearAlgebra" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "# Variables" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "x = 3 \n", 44 | "x = 3.0\n", 45 | "words = \"Hello world!\"\n", 46 | "character = 'a'\n", 47 | "δ = 1e-5\n", 48 | "pi\n", 49 | "MathConstants.e\n", 50 | "α̂₁ = pi / 2" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "Then, the arrays." 
58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "u = [1 3 5 7]\n", 67 | "v = ones(4)\n", 68 | "w = Vector{Float64}(undef, 2)\n", 69 | "\n", 70 | "X = rand(4, 4)\n", 71 | "Identity_matrix = Matrix{Float64}(I, 4, 4)\n", 72 | "column = [1,2,3]\n", 73 | "row = [1 2 3]\n", 74 | "A = [1 2 3; 4 5 6; 7 8 9]\n", 75 | "b = [[1 2 3] [4 5 6] [7 8 9]]\n", 76 | "B = reshape(b, 3, 3)\n", 77 | "C = fill(15, 2, 3)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Check the number of entries in A:" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "length(A)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Number of dimensions of A:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "ndims(A)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Size of A:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "size(A)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "**Exercise 1: Make an array consisting of the programming languages you know, and check its number of entries and dimensionality. The entries should be strings, e.g., \"Julia\".**" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "There are several ways to do value extraction and assignment on arrays." 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "A[3, 1]\n", 149 | "A[1, :]\n", 150 | "A[1:2, 2:end]\n", 151 | "A[2, [1 3]]\n", 152 | "\n", 153 | "A[2, 3] = 10\n", 154 | "A[3, 1:2] = [-2 -3]" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "A" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "Some Mathematical operations are listed below. You may create new cells to check the effect of each individual command." 
171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "2 + 5\n", 180 | "3.5 ^ 2\n", 181 | "[1 2] + [2 3]\n", 182 | "[1 2] * 1.5\n", 183 | "A[1, :]' * ones(3)\n", 184 | "A .* B" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "A" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "B" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "3.0 > 2.0\n", 212 | "9 ≤ 9\n", 213 | "1 != 2\n", 214 | "[1 2] .< [2 3]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Control flows:" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "x = 2\n", 231 | "y = 1\n", 232 | "if x < y\n", 233 | " println(\"x is less than y: $x < $y\")\n", 234 | "elseif x > y\n", 235 | " println(\"x is greater than y: $x > $y\")\n", 236 | "else\n", 237 | " println(\"x is equal to y: $x = $y\")\n", 238 | "end" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "i = 1\n", 248 | "while i <= 5\n", 249 | " println(i)\n", 250 | " i += 1 \n", 251 | "end" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "for i = 1:5\n", 261 | " if i == 3\n", 262 | " continue\n", 263 | " #break\n", 264 | " end\n", 265 | " println(i)\n", 266 | "end" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "u = [1,3,5,7]\n", 276 | "for i in u\n", 277 | " println(i)\n", 278 | "end" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "**Exercise 2: Make an array, with each entry being the number of lines of code you have written in the corresponding language in the array of Exercise 1. 
Then use a loop to sum them up.**" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "# functions" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "The following three syntaxes all define the function f(x) = 2x:" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "f1(x) = 2*x\n", 309 | "function f1(x)\n", 310 | " return 2*x\n", 311 | "end" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "# default values for positional arguments\n", 321 | "function f2(x, mult=2)\n", 322 | " return mult*x\n", 323 | "end\n", 324 | "f2(2) # ==4\n", 325 | "f2(2, 3) # ==6" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "# default values for keyword arguments: use a semicolon instead of a comma\n", 335 | "function f3(x; mult=2)\n", 336 | " return mult*x\n", 337 | "end\n", 338 | "f3(2) # ==4\n", 339 | "f3(2, mult=3) # ==6" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "We will get an **error** if we don't specify the name of keyword argument:" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "f3(2, 3)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "# a changeable number of arguments\n", 365 | "function f4(x...)\n", 366 | " for xi in x\n", 367 | " println(xi)\n", 368 | " end\n", 369 | "end\n", 370 | "\n", 371 | "f4(1,2,3,4,\"a\",\"b\",\"c\")" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "**Exercise 3: Use a function to find the second largest number of lines of code in the vector you made in Exercise 2.**" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "There are more kinds of data structures:" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "# list comprehensions: shorthand for loops (not very memory efficient)\n", 395 | "[x^2 for x in 1:5]\n", 396 | "[x^2 for x in 1:5 if x>2]\n", 397 | "[x^k for x in 1:10 for k in 1:5]" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "# dictionaries and sets\n", 407 | "\n", 408 | "x = Set([1,2,3,3,4,1])\n", 409 | "d = Dict()\n", 410 | "d[4] = 7\n", 411 | "# XXX add more here" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "# advanced topic for people who like object-oriented programming: types and multiple dispatch" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# x::y is an assertion that x is a variable of type y\n", 428 | "1.0::Float64" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "# if we assert something false, we get a type error \n", 438 | 
"1.0::Int" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "# we can define many functions with the same name, which call arguments of different types\n", 448 | "# julia decides which to call by looking at the type of the argument\n", 449 | "\n", 450 | "f(x::Int) = println(\"$x is an integer\")\n", 451 | "f(x::Float64) = println(\"$x is an float\")\n", 452 | "f(2)\n", 453 | "f(2.0)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "metadata": {}, 459 | "source": [ 460 | "We may define a composite type as follows:" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "struct Student\n", 470 | " name::String \n", 471 | " gpa::Float64\n", 472 | "end" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "s1 = Student(\"Alice\", 3.9)\n", 482 | "s2 = Student(\"Bob\", 3.2)" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "s1.name" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "f(s::Student) = println(\"$(s.name) has a $(s.gpa) gpa\")\n", 501 | "\n", 502 | "# we've defined the \"f\" function on three different types\n", 503 | "# julia knows to call the write one by looking at the type of the argument\n", 504 | "\n", 505 | "f(s1)\n", 506 | "f(s2)\n", 507 | "f(4)\n", 508 | "f(sqrt(2))" 509 | ] 510 | } 511 | ], 512 | "metadata": { 513 | "kernelspec": { 514 | "display_name": "Julia 1.2.0", 515 | "language": "julia", 516 | "name": "julia-1.2" 517 | }, 518 | "language_info": { 519 | "file_extension": ".jl", 520 | "mimetype": "application/julia", 521 | "name": "julia", 522 | "version": "1.2.0" 523 | } 524 | }, 525 | "nbformat": 4, 526 | "nbformat_minor": 1 527 | } 528 | -------------------------------------------------------------------------------- /julia/QR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "Plots.PyPlotBackend()" 12 | ] 13 | }, 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "using Random\n", 21 | "using LinearAlgebra\n", 22 | "using Statistics\n", 23 | "using Plots\n", 24 | "using LaTeXStrings\n", 25 | "pyplot()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "6×4 Array{Float64,2}:\n", 37 | " -0.890685 0.764352 -0.244713 0.922554\n", 38 | " 1.09337 0.00810613 0.754116 -0.871233\n", 39 | " 1.36687 1.44676 0.582838 1.27395 \n", 40 | " -0.109862 1.23566 0.981122 -1.00478 \n", 41 | " 1.1635 -0.883671 -0.0394113 0.463097\n", 42 | " 0.458859 0.43612 0.693942 -0.874769" 43 | ] 44 | }, 45 | "execution_count": 2, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "# generate random data matrix\n", 52 | "n,d = 6,4\n", 53 | "X = randn(n,d)\n", 54 | "\n", 55 | "# optional: give it linearly dependent columns\n", 56 | "# X[:,3] = X[:,2]" 57 | ] 58 | }, 59 | { 60 | 
"cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Understanding the pseudoinverse" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 3, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "4×6 Array{Float64,2}:\n", 75 | " -1.26168 0.00544988 0.715924 -0.900509 -1.1001 0.158552\n", 76 | " -1.20029 -0.246117 0.849552 -0.820945 -1.83688 0.187011\n", 77 | " 3.03502 0.567742 -1.36888 2.44695 3.70982 -0.204841\n", 78 | " 1.36334 0.047329 -0.327362 0.745252 1.55705 -0.260946" 79 | ] 80 | }, 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "output_type": "execute_result" 84 | } 85 | ], 86 | "source": [ 87 | "# form pseudoinverse\n", 88 | "Xd = pinv(X)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/plain": [ 99 | "4×4 Array{Float64,2}:\n", 100 | " 1.0 -1.25141e-15 -1.04455e-15 -4.57249e-16\n", 101 | " -6.39156e-16 1.0 -4.04297e-16 -2.45006e-16\n", 102 | " 3.0523e-16 1.42441e-15 1.0 1.36557e-15\n", 103 | " 6.93224e-16 3.62154e-16 7.95279e-16 1.0 " 104 | ] 105 | }, 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "# X†X ≈ I_d\n", 113 | "Xd*X" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 5, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "6×6 Array{Float64,2}:\n", 125 | " 0.721359 -0.288244 0.0446677 0.263312 0.104445 -0.188887 \n", 126 | " -0.288244 0.390872 0.0425729 0.204752 0.22337 0.247742 \n", 127 | " 0.0446677 0.0425729 0.992797 -0.0430009 -0.0153952 0.0354588\n", 128 | " 0.263312 0.204752 -0.0430009 0.736462 -0.0736144 0.274883 \n", 129 | " 0.104445 0.22337 -0.0153952 -0.0736144 0.918078 -0.0935505\n", 130 | " -0.188887 0.247742 0.0354588 0.274883 -0.0935505 0.240431 " 131 | ] 132 | }, 133 | "execution_count": 5, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "# XX† !≈ I_n\n", 140 | "X*Xd" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "Q,R = qr(X)\n", 150 | "Q = Q[:,1:d];" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 7, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "6×4 Array{Float64,2}:\n", 162 | " -1.11022e-16 1.11022e-16 -3.05311e-16 1.11022e-16\n", 163 | " 0.0 1.19696e-16 3.33067e-16 -3.33067e-16\n", 164 | " 0.0 -2.22045e-16 3.33067e-16 -8.88178e-16\n", 165 | " 1.38778e-17 0.0 1.11022e-16 -2.22045e-16\n", 166 | " 0.0 -1.11022e-16 8.32667e-17 5.55112e-17\n", 167 | " 0.0 5.55112e-17 0.0 1.11022e-16" 168 | ] 169 | }, 170 | "execution_count": 7, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "X - Q*R" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 8, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "6×4 Array{Float64,2}:\n", 188 | " -0.382108 -0.361369 -0.272415 -0.608732 \n", 189 | " 0.46906 0.0267269 0.411939 -0.0211324\n", 190 | " 0.586391 -0.599405 -0.517968 0.146167 \n", 191 | " -0.0471311 -0.547338 0.569154 -0.332754 \n", 192 | " 0.499148 0.421487 -0.0891341 -0.695222 \n", 193 | " 0.196852 -0.17939 0.394873 0.116512 " 194 | ] 195 | }, 196 | "execution_count": 8, 197 | "metadata": {}, 198 | 
"output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "Q" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 9, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "data": { 212 | "text/plain": [ 213 | "4×4 Array{Float64,2}:\n", 214 | " 2.33098 0.146637 0.859695 0.0921657\n", 215 | " 0.0 -2.27021 -0.918873 -0.218209 \n", 216 | " 0.0 0.0 0.911363 -2.22865 \n", 217 | " 0.0 0.0 0.0 -0.446499 " 218 | ] 219 | }, 220 | "execution_count": 9, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "R" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 10, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "4×4 Array{Float64,2}:\n", 238 | " 1.0 -1.53135e-16 -3.56308e-16 -1.06328e-16\n", 239 | " -1.53135e-16 1.0 2.17932e-16 9.85739e-17\n", 240 | " -3.56308e-16 2.17932e-16 1.0 -9.20757e-17\n", 241 | " -1.06328e-16 9.85739e-17 -9.20757e-17 1.0 " 242 | ] 243 | }, 244 | "execution_count": 10, 245 | "metadata": {}, 246 | "output_type": "execute_result" 247 | } 248 | ], 249 | "source": [ 250 | "Q'*Q" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 11, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "# form data from noisy linear model\n", 260 | "w♮ = randn(d)\n", 261 | "y = X*w♮ + .1*randn(n);" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "4-element Array{Float64,1}:\n", 273 | " -0.10036720947429924\n", 274 | " 0.1177262734257617 \n", 275 | " 0.4330665815147097 \n", 276 | " 0.10108585641128666" 277 | ] 278 | }, 279 | "execution_count": 12, 280 | "metadata": {}, 281 | "output_type": "execute_result" 282 | } 283 | ], 284 | "source": [ 285 | "# solve least squares problem to estimate w\n", 286 | "w = R \\ (Q'*y)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 13, 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "data": { 296 | "text/plain": [ 297 | "0.22139501473072573" 298 | ] 299 | }, 300 | "execution_count": 13, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "# how good is our estimate?\n", 307 | "norm(w - w♮)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 14, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "0.0054774834985621" 319 | ] 320 | }, 321 | "execution_count": 14, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "# compute mean square error\n", 328 | "mean((y - X*w).^2)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 15, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "data": { 338 | "text/plain": [ 339 | "3.9399232672427924e-16" 340 | ] 341 | }, 342 | "execution_count": 15, 343 | "metadata": {}, 344 | "output_type": "execute_result" 345 | } 346 | ], 347 | "source": [ 348 | "# let's use the shorthand\n", 349 | "w_backslash = X \\ y\n", 350 | "norm(w_backslash - w)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 16, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "4-element Array{Float64,1}:\n", 362 | " -0.10036720947429938\n", 363 | " 0.11772627342576154\n", 364 | " 0.43306658151471006\n", 365 | " 0.1010858564112867 " 366 | ] 367 | }, 368 | 
"execution_count": 16, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "w_backslash" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "metadata": {}, 381 | "outputs": [], 382 | "source": [] 383 | } 384 | ], 385 | "metadata": { 386 | "kernelspec": { 387 | "display_name": "Julia 1.2.0", 388 | "language": "julia", 389 | "name": "julia-1.2" 390 | }, 391 | "language_info": { 392 | "file_extension": ".jl", 393 | "mimetype": "application/julia", 394 | "name": "julia", 395 | "version": "1.2.0" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 1 400 | } 401 | -------------------------------------------------------------------------------- /julia/SVD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "using LinearAlgebra, Random, Statistics" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 16, 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "data": { 19 | "text/plain": [ 20 | "6×4 Array{Float64,2}:\n", 21 | " 0.573077 0.468126 0.468126 0.518838\n", 22 | " 0.699222 0.0420342 0.0420342 0.12317\n", 23 | " -0.156645 0.374166 0.374166 0.0239568\n", 24 | " 0.310086 -0.764711 -0.764711 -0.360399\n", 25 | " -0.507746 0.543277 0.543277 -2.1375\n", 26 | " 3.18472 -0.58651 -0.58651 -1.86139" 27 | ] 28 | }, 29 | "execution_count": 16, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# generate random data matrix\n", 36 | "n,d = 6,4\n", 37 | "X = randn(n,d)\n", 38 | "\n", 39 | "# optional: give it linearly dependent columns\n", 40 | "X[:,3] = X[:,2]\n", 41 | "X" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 17, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "SVD{Float64,Float64,Array{Float64,2}}\n", 53 | "U factor:\n", 54 | "6×4 Array{Float64,2}:\n", 55 | " -0.00539504 -0.190392 -0.642291 -0.0379604\n", 56 | " -0.122173 -0.169425 -0.241944 -0.701953\n", 57 | " 0.0663229 0.101954 -0.298173 -0.462216\n", 58 | " -0.177398 -0.0991203 0.651604 -0.500171\n", 59 | " -0.152621 0.95091 -0.081925 -0.117032\n", 60 | " -0.962228 -0.102945 -0.093368 0.168255\n", 61 | "singular values:\n", 62 | "4-element Array{Float64,1}:\n", 63 | " 3.918752252818832\n", 64 | " 2.3622345810143344\n", 65 | " 1.4211344525401286\n", 66 | " 7.032906093625886e-17\n", 67 | "Vt factor:\n", 68 | "4×4 Array{Float64,2}:\n", 69 | " -0.801491 0.161851 0.161851 0.552467\n", 70 | " -0.459291 0.251746 0.251746 -0.813818\n", 71 | " -0.382967 -0.640647 -0.640647 -0.180221\n", 72 | " 0.0 -0.707107 0.707107 5.55112e-17" 73 | ] 74 | }, 75 | "execution_count": 17, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "U,σ,V = svd(X)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 18, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/plain": [ 92 | "4×4 Array{Float64,2}:\n", 93 | " 1.0 8.68792e-17 6.79998e-17 6.41545e-17\n", 94 | " 8.68792e-17 1.0 -4.68924e-17 -1.55497e-16\n", 95 | " 6.79998e-17 -4.68924e-17 1.0 -1.31907e-16\n", 96 | " 6.41545e-17 -1.55497e-16 -1.31907e-16 1.0" 97 | ] 98 | }, 99 | "execution_count": 18, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "U'*U" 106 | ] 107 | }, 108 | { 109 | 
"cell_type": "code", 110 | "execution_count": 19, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "6×6 Array{Float64,2}:\n", 117 | " 0.450257 0.214961 0.189291 -0.379704 -0.12316 0.0783736\n", 118 | " 0.214961 0.594906 0.371219 0.231911 -0.0404891 0.0394824\n", 119 | " 0.189291 0.371219 0.317344 0.0150248 0.165348 -0.124244\n", 120 | " -0.379704 0.231911 0.0150248 0.716054 -0.0620265 0.0359058\n", 121 | " -0.12316 -0.0404891 0.165348 -0.0620265 0.94793 0.0369224\n", 122 | " 0.0783736 0.0394824 -0.124244 0.0359058 0.0369224 0.973508" 123 | ] 124 | }, 125 | "execution_count": 19, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "U*U'" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 20, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "4×4 Array{Float64,2}:\n", 143 | " 1.0 8.68884e-17 1.27238e-17 -2.13768e-17\n", 144 | " 8.68884e-17 1.0 5.38631e-17 5.32179e-17\n", 145 | " 1.27238e-17 5.38631e-17 1.0 8.5981e-17\n", 146 | " -2.13768e-17 5.32179e-17 8.5981e-17 1.0" 147 | ] 148 | }, 149 | "execution_count": 20, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "V'*V" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 21, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "4×4 Array{Float64,2}:\n", 167 | " 1.0 2.29502e-16 1.73991e-16 1.815e-16\n", 168 | " 2.29502e-16 1.0 1.21169e-16 1.75265e-16\n", 169 | " 1.73991e-16 1.21169e-16 1.0 7.33583e-17\n", 170 | " 1.815e-16 1.75265e-16 7.33583e-17 1.0" 171 | ] 172 | }, 173 | "execution_count": 21, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "V*V'" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 22, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "4-element Array{Float64,1}:\n", 191 | " 3.918752252818832\n", 192 | " 2.3622345810143344\n", 193 | " 1.4211344525401286\n", 194 | " 7.032906093625886e-17" 195 | ] 196 | }, 197 | "execution_count": 22, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "σ" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 23, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "Error of full rank svd: 3.222488743844374e-15\n", 216 | "Error of rank 3 approximation: 3.221816314104331e-15\n", 217 | "Error of rank 2 approximation: 1.4211344525401282\n", 218 | "Error of rank 1 approximation: 2.7567690051827887\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "# if we have a linearly dependent column, \n", 224 | "# decomposition is just as good if we ignore the 0 in sigma and reduce r by 1\n", 225 | "println(\"Error of full rank svd: \", norm(X - U*diagm(σ)*V'))\n", 226 | "for k=3:-1:1\n", 227 | " println(\"Error of rank $k approximation: \", norm(X - U[:,1:k]*diagm(σ[1:k])*(V[:,1:k])'))\n", 228 | "end" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 24, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# form data from noisy linear model\n", 238 | "w♮ = randn(d)\n", 239 | "y = X*w♮ + .1*randn(n);" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 31, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 
248 | "data": { 249 | "text/plain": [ 250 | "4-element Array{Float64,1}:\n", 251 | " -0.24723376947937492\n", 252 | " 1.2296831134048805e15\n", 253 | " -1.2296831134048812e15\n", 254 | " -1.4601005663102384" 255 | ] 256 | }, 257 | "execution_count": 31, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "# solve least squares problem to estimate w\n", 264 | "w = V*diagm(σ.^(-1))*U'*y" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 44, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "data": { 274 | "text/plain": [ 275 | "4-element Array{Float64,1}:\n", 276 | " -0.24723376947937492\n", 277 | " -0.37987541412463555\n", 278 | " -0.3798754141246354\n", 279 | " -1.3635647571638487" 280 | ] 281 | }, 282 | "execution_count": 44, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "# use rank k approximation to design matrix X\n", 289 | "# k=4 is full rank\n", 290 | "# when design matrix X has rank 3, k=3 gives 0 error approximation\n", 291 | "# while k=2 results in some loss of accuracy - but not much!\n", 292 | "k = 3\n", 293 | "w = V[:,1:k]*diagm(σ[1:k].^(-1))*(U[:,1:k])'*y" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 45, 299 | "metadata": {}, 300 | "outputs": [ 301 | { 302 | "data": { 303 | "text/plain": [ 304 | "7.188211828540968e-15" 305 | ] 306 | }, 307 | "execution_count": 45, 308 | "metadata": {}, 309 | "output_type": "execute_result" 310 | } 311 | ], 312 | "source": [ 313 | "# error in normal equations \n", 314 | "norm(X'*X*w - X'*y)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 46, 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "data": { 324 | "text/plain": [ 325 | "7.222425309945365e-15" 326 | ] 327 | }, 328 | "execution_count": 46, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "w[2] += 1\n", 335 | "w[3] -= 1\n", 336 | "norm(X'*X*w - X'*y)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 47, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "4-element Array{Float64,1}:\n", 348 | " -0.24723376947937492\n", 349 | " 0.6201245858753645\n", 350 | " -1.3798754141246354\n", 351 | " -1.3635647571638487" 352 | ] 353 | }, 354 | "execution_count": 47, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "w" 361 | ] 362 | }, 363 | { 364 | "cell_type": "code", 365 | "execution_count": 39, 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "0.5997851656441274" 372 | ] 373 | }, 374 | "execution_count": 39, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "# how good is our estimate of w?\n", 381 | "norm(w - w♮) / norm(w♮)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 40, 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "0.010609982025196208" 393 | ] 394 | }, 395 | "execution_count": 40, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "# compute mean square error\n", 402 | "mean((y - X*w).^2)" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 41, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "9.646809571707696e-16" 414 | ] 415 | }, 
416 | "execution_count": 41, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "# let's use the shorthand\n", 423 | "# backslash finds least norm solution to normal eqns\n", 424 | "# using SVD when design matrix X is rank deficient\n", 425 | "w_backslash = X \\ y\n", 426 | "norm(w_backslash - w)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [] 435 | } 436 | ], 437 | "metadata": { 438 | "@webio": { 439 | "lastCommId": null, 440 | "lastKernelId": null 441 | }, 442 | "kernelspec": { 443 | "display_name": "Julia 1.5.1", 444 | "language": "julia", 445 | "name": "julia-1.5" 446 | }, 447 | "language_info": { 448 | "file_extension": ".jl", 449 | "mimetype": "application/julia", 450 | "name": "julia", 451 | "version": "1.5.1" 452 | } 453 | }, 454 | "nbformat": 4, 455 | "nbformat_minor": 1 456 | } 457 | -------------------------------------------------------------------------------- /julia/proxgrad-starter-code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "┌ Info: Precompiling LowRankModels [15d4e49f-4837-5ea3-a885-5b28bfa376dc]\n", 13 | "└ @ Base loading.jl:1278\n" 14 | ] 15 | }, 16 | { 17 | "data": { 18 | "text/plain": [ 19 | "proxgrad_const" 20 | ] 21 | }, 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "output_type": "execute_result" 25 | } 26 | ], 27 | "source": [ 28 | "using Plots, Random, LinearAlgebra, Statistics, SparseArrays\n", 29 | "include(\"proxgrad.jl\")" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Solving ERM problems\n", 37 | "\n", 38 | "The file `proxgrad.jl` contains code for solving regularized empirical risk minimization (ERM) problems. It provides the optimization function `proxgrad` together with a large number of predefined loss functions and regularizers.\n", 39 | " \n", 40 | "The function `proxgrad` solves regularized ERM problems of the form\n", 41 | "$$\n", 42 | "\\mbox{minimize} \\quad \\sum_{i=1}^n \\ell(y_i, w^T x_i) + r(w). \n", 43 | "$$\n", 44 | "It solves these with the proximal gradient method, which we will learn shortly.\n", 45 | "\n", 46 | "You can select from a range of losses. 
For real valued $y$, try:\n", 47 | " * quadratic loss - `QuadLoss()`\n", 48 | " * $\\ell_1$ loss - `L1Loss()`\n", 49 | " * quantile loss (for $\\alpha$ quantile) - `QuantileLoss(α)`\n", 50 | " \n", 51 | "For Boolean $y$, try\n", 52 | " * hinge loss - `HingeLoss()`\n", 53 | " * logistic loss - `LogisticLoss()`\n", 54 | " * weighted hinge loss - `WeightedHingeLoss()`\n", 55 | "\n", 56 | "For nominal $y$, try\n", 57 | " * multinomial loss - `MultinomialLoss()`\n", 58 | " * one vs all loss - `OvALoss()`\n", 59 | " * (by default, it uses the logistic loss for the underlying binary classifier)\n", 60 | "\n", 61 | "For ordinal $y$, try\n", 62 | " * ordinal hinge loss - `OrdinalHingeLoss()`\n", 63 | " * bigger vs smaller loss - `BvSLoss()`\n", 64 | " * (by default, it uses the logistic loss for the underlying binary classifier)\n", 65 | " \n", 66 | "It also provides a few regularizers, including \n", 67 | " * no regularization - `ZeroReg()`\n", 68 | " * quadratic regularization - `QuadReg()`\n", 69 | " * $\\ell_1$ regularization - `OneReg()`\n", 70 | " * nonnegative constraint - `NonNegConstraint()`\n", 71 | " \n", 72 | "Below, we provide some examples for how to use the proxgrad function to fit regularized ERM problems." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## generate random data set\n", 80 | "\n", 81 | "First (as usual), we'll generate some random data to try our methods on." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 2, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "Random.seed!(0)\n", 91 | "n = 50\n", 92 | "d = 10\n", 93 | "X = randn(n,d)\n", 94 | "w♮ = randn(d)\n", 95 | "y = X*w♮ + .1*randn(n);" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Quadratic loss, quadratic regularizer\n", 103 | "\n", 104 | "$$\n", 105 | "\\mbox{minimize} \\quad \\frac 1 n ||Xw - y||^2 + λ||w||^2\n", 106 | "$$" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 3, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "0.11180045233465635" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# we form \\frac 1 n || ⋅ ||^2 by multiplying the QuadLoss() function by 1/n\n", 127 | "loss = 1/n*QuadLoss()\n", 128 | "\n", 129 | "# we form λ|| ⋅ ||^2 by multiplying the QuadReg() function by λ\n", 130 | "λ = .1\n", 131 | "reg = λ*QuadReg()\n", 132 | "\n", 133 | "# minimize 1/n ||Xw - y||^2 + λ||w||^2\n", 134 | "#w = proxgrad(loss, reg, X, y, maxiters=5, c=.1, stepsize=1, max_inner_iters=10000) \n", 135 | "w = proxgrad(loss, reg, X, y, maxiters=5)\n", 136 | "\n", 137 | "norm(X*w-y) / norm(y)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "`maxiters`, the maximum number of iterations, controls how fully we converge.\n", 145 | "You can try increasing it to see if the error improves.\n", 146 | "\n", 147 | "In the next code block, do you think the error will be \n", 148 | "* A) higher \n", 149 | "* B) lower\n", 150 | "* C) the same" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "0.09941186768969058" 162 | ] 163 | }, 164 | "execution_count": 4, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 
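The proximal gradient method that `proxgrad` implements is deferred to lecture, but the update itself is compact enough to sketch: repeat w <- prox_{ηr}(w - η∇L(w)), where L(w) = Σᵢ ℓ(yᵢ, wᵀxᵢ) is the smooth loss term and the prox operator handles the regularizer r. The code below is a generic, self-contained illustration on the lasso; the names `proxgrad_sketch`, `gradL`, and `prox_l1` are inventions for this sketch, not the API of the course's `proxgrad.jl`.

```julia
using LinearAlgebra, Random

# generic proximal gradient sketch: w <- prox(w - η∇L(w), η)
# gradL and prox are assumed callables; these are not names from proxgrad.jl
function proxgrad_sketch(gradL, prox, w0; η = 0.1, iters = 200)
    w = copy(w0)
    for _ = 1:iters
        w = prox(w - η * gradL(w), η)   # gradient step on the loss, then prox step
    end
    return w
end

# example: lasso, minimize (1/n)‖Xw - y‖² + λ‖w‖₁
Random.seed!(0)
n, d = 50, 10
X = randn(n, d)
y = X * randn(d) + 0.1 * randn(n)
λ = 0.1

gradL(w) = (2 / n) * (X' * (X * w - y))                # gradient of the smooth part
prox_l1(z, η) = sign.(z) .* max.(abs.(z) .- η * λ, 0)  # soft-thresholding is the ℓ1 prox
w = proxgrad_sketch(gradL, prox_l1, zeros(d))
norm(X * w - y) / norm(y)
```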
170 | "w = proxgrad(loss, reg, X, y, maxiters=100) \n", 171 | "norm(X*w-y) / norm(y)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Hinge loss, quadratic regularizer\n", 179 | "\n", 180 | "$$\n", 181 | "\\mbox{minimize} \\quad \\frac 1 n \\sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2\n", 182 | "$$" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "data": { 192 | "text/plain": [ 193 | "0.1" 194 | ] 195 | }, 196 | "execution_count": 5, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "ybool = (y.>=0) # form a boolean target\n", 203 | "\n", 204 | "# we form \\frac 1 n \\sum_{i=1}^n (1 - ⋅ )_+ by multiplying the HingeLoss() function by 1/n\n", 205 | "loss = 1/n*HingeLoss()\n", 206 | "\n", 207 | "# we form λ|| ⋅ ||^2 by multiplying the QuadReg() function by λ\n", 208 | "λ = .1\n", 209 | "reg = λ*QuadReg()\n", 210 | "\n", 211 | "# minimize 1/n \\frac 1 n \\sum_{i=1}^n (1 - y_i w^T x_i)_+ + λ||w||^2\n", 212 | "w = proxgrad(loss, reg, X, ybool, maxiters=10) \n", 213 | "\n", 214 | "# predict output values using learned classifier\n", 215 | "yhat = impute(loss, X*w)\n", 216 | "\n", 217 | "# misclassification error \n", 218 | "(n - sum(yhat .== ybool)) / n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "For nonsmooth problems (like the hinge loss), a smaller stepsize can also help.\n", 226 | "\n", 227 | "In the next code block, do you think the error will be\n", 228 | "\n", 229 | "* A) higher\n", 230 | "* B) lower\n", 231 | "* C) the same" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 7, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "0.1" 243 | ] 244 | }, 245 | "execution_count": 7, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "w = proxgrad(loss, reg, X, ybool, maxiters=100, stepsize=.1) \n", 252 | "yhat = impute(loss, X*w)\n", 253 | "\n", 254 | "# misclassification error \n", 255 | "(n - sum(yhat .== ybool)) / n" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "# Homework question \n", 263 | "\n", 264 | "Use the proxgrad function to fit the following objective\n", 265 | " \n", 266 | "$$\n", 267 | "\\mbox{minimize} \\quad \\frac 1 n \\sum_{i=1}^n \\log(1 + \\exp(- \\text{ybool}_i w^T x_i)) + λ||w||^2\n", 268 | "$$\n", 269 | "for $\\lambda = .5$" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [] 278 | } 279 | ], 280 | "metadata": { 281 | "@webio": { 282 | "lastCommId": null, 283 | "lastKernelId": null 284 | }, 285 | "kernelspec": { 286 | "display_name": "Julia 1.5.1", 287 | "language": "julia", 288 | "name": "julia-1.5" 289 | }, 290 | "language_info": { 291 | "file_extension": ".jl", 292 | "mimetype": "application/julia", 293 | "name": "julia", 294 | "version": "1.5.1" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 2 299 | } 300 | -------------------------------------------------------------------------------- /python-refresher.ipynb: -------------------------------------------------------------------------------- 1 | 
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"python-refresher.ipynb","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyO08ekooqoupVWgmdf1IIX/"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"code","metadata":{"id":"gFnSdOj4Vs45","executionInfo":{"status":"ok","timestamp":1630348520194,"user_tz":240,"elapsed":270,"user":{"displayName":"Madeleine Udell","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GhUKEdymdlNMsHJJD3vNYOzwpzwn1GIGBL9AgAxnQ=s64","userId":"09278725283779960205"}}},"source":["# import all packages needed in one cell at the top \n","import numpy as np # for linear algebra\n","import matplotlib.pyplot as plt # for plotting"],"execution_count":3,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_7x81X4PVdto"},"source":["# Basic python"]},{"cell_type":"code","metadata":{"id":"HMKKIgvEVdtp","outputId":"e3126c95-b1e2-44c6-a093-380ab325e39e"},"source":["# basic math \n","2+2+17"],"execution_count":null,"outputs":[{"data":{"text/plain":["21"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"Gl8dPQ0TVyER","outputId":"8b7467e0-437e-4ed8-c229-70f2a9fc3d33"},"source":["2*pi"],"execution_count":null,"outputs":[{"ename":"NameError","evalue":"name 'pi' is not defined","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;36m2\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0mpi\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mNameError\u001b[0m: name 'pi' is not defined"]}]},{"cell_type":"code","metadata":{"id":"X-I1GMezVdtr","outputId":"bb5391c5-ff44-4500-8e38-f0d080b4c374"},"source":["2 * np.pi"],"execution_count":null,"outputs":[{"data":{"text/plain":["6.283185307179586"]},"execution_count":5,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"jNkTFsQLVdts","outputId":"ce98483f-45de-4013-b3f7-e0bf8e5e8fdc"},"source":["# lists\n","a = [1, 2, 3]\n","a"],"execution_count":null,"outputs":[{"data":{"text/plain":["[1, 2, 3]"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"_W5hgb9zVdtt","outputId":"6aca11b9-2f51-4fc4-ee0f-7f340276e17c"},"source":["a.append(17)\n","a"],"execution_count":null,"outputs":[{"data":{"text/plain":["[1, 2, 3, 17]"]},"execution_count":7,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"r8QeEe6TVdtt","outputId":"20460df9-c5dc-4618-d97d-64e045c4d8fa"},"source":["# python uses 0-based indexing \n","a[0]"],"execution_count":null,"outputs":[{"data":{"text/plain":["1"]},"execution_count":8,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"VvwJxd6tVdtu","outputId":"05abb6c4-4742-4ba3-e362-ebc982c2b799"},"source":["# dictionaries \n","d = {'a': 1, 'b': 2, 'c': 3}\n","d"],"execution_count":null,"outputs":[{"data":{"text/plain":["{'a': 1, 'b': 2, 'c': 
3}"]},"execution_count":9,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"jLRqAAlvVdtu","outputId":"c5f7ff00-d086-47d9-b4cf-baa3d099f0f9"},"source":["d['b']"],"execution_count":null,"outputs":[{"data":{"text/plain":["2"]},"execution_count":10,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"BPYvLBSeVdtu","outputId":"637d0d90-0bcc-4bb9-e9f1-e5537795df50"},"source":["d['d']"],"execution_count":null,"outputs":[{"ename":"KeyError","evalue":"'d'","output_type":"error","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)","\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'd'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m","\u001b[0;31mKeyError\u001b[0m: 'd'"]}]},{"cell_type":"code","metadata":{"id":"fxrQl3HBVdtv","outputId":"04f3bba5-9503-49cf-f1de-85f6108e7ce2"},"source":["d['d'] = 4\n","d['d']"],"execution_count":null,"outputs":[{"data":{"text/plain":["4"]},"execution_count":12,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"code","metadata":{"id":"JHjcpwclVdtv","outputId":"8f0b2d99-ff57-4976-8610-a890b910cf36"},"source":["# for loops\n","for i in range(10):\n"," print(\"hello\", i)\n"," print(\"hello\", i)"],"execution_count":null,"outputs":[{"name":"stdout","output_type":"stream","text":["hello 0\n","hello 0\n","hello 1\n","hello 1\n","hello 2\n","hello 2\n","hello 3\n","hello 3\n","hello 4\n","hello 4\n","hello 5\n","hello 5\n","hello 6\n","hello 6\n","hello 7\n","hello 7\n","hello 8\n","hello 8\n","hello 9\n","hello 9\n"]}]},{"cell_type":"code","metadata":{"id":"Z5W1GUjvVdtw","outputId":"5e63a893-5bad-45b5-e8a3-01ba195dc237"},"source":["# functions, if statements\n","def fibonacci(n=5):\n"," if n==0:\n"," return 1\n"," else:\n"," return n*fibonacci(n-1)\n"," \n","fibonacci()"],"execution_count":null,"outputs":[{"data":{"text/plain":["120"]},"execution_count":51,"metadata":{},"output_type":"execute_result"}]},{"cell_type":"markdown","metadata":{"id":"Njo-8j9YVdtw"},"source":["# Jupyter workflow tips\n","\n","* run a cell with Shift-Enter\n","* Jupyter displays value of last expression in cell \n","* open a new cell below the current one (Alt-Enter) to see values of variables, test assumptions\n","* you can reorder cells using the arrows in the toolbar, or by copy-pasting cells up or down\n","\n","How to troubleshoot and debug in Jupyter\n","* find the line in the cell that's not working as expected \n","* test that every input to that line is what you expect by opening a new cell below the one that's not working, copy pasting code if needed\n"," * check the type (eg, integer? string? array? matrix?)\n"," * check the values (eg, are there NaNs or Infs? are there negative numbers where you expect positive?)"]}]} --------------------------------------------------------------------------------