├── .gitignore ├── Fall2016 ├── README.md ├── install_R_Python │ └── installation_help.txt ├── intro.pptx ├── nov18_GBM │ ├── AutoML.ipynb │ ├── GBM.ipynb │ ├── R GBM walkthrough.Rmd │ └── R_GBM_walkthrough.html ├── nov4_LASSO │ ├── Python-RidgeLassoEN.ipynb │ ├── nov4_examples.rtf │ ├── penalized-regression.Rmd │ └── penalized-regression.html ├── oct13_decisionTrees │ ├── Horning 2016.pdf │ ├── Lewicki 2007.pdf │ ├── Python-DecisionTreesRF.ipynb │ └── r-decision-trees.Rmd └── oct21_randomForests │ ├── Python-RF.ipynb │ ├── R-random forests.Rmd │ └── tree.dot ├── Fall2017 ├── Fall2017info ├── Sep22-images-cnn │ ├── Intro_to_CNNs_in_Python.ipynb │ ├── images-cnn-R.Rmd │ ├── images-cnn-R.html │ ├── images-cnn-R │ │ ├── image_001.jpg │ │ ├── image_002.jpg │ │ ├── image_003.jpg │ │ ├── image_004.jpg │ │ ├── image_005.jpg │ │ ├── image_006.jpg │ │ ├── image_007.jpg │ │ ├── image_008.jpg │ │ ├── image_009.jpg │ │ ├── image_010.jpg │ │ ├── image_011.jpg │ │ ├── image_012.jpg │ │ ├── image_013.jpg │ │ ├── image_014.jpg │ │ ├── image_015.jpg │ │ ├── image_016.jpg │ │ ├── image_017.jpg │ │ ├── image_018.jpg │ │ ├── image_019.jpg │ │ └── image_020.jpg │ ├── imgs │ │ ├── alexnet.jpeg │ │ ├── cnn.jpeg │ │ ├── conv_box.gif.png │ │ ├── conv_gif copy.gif.png │ │ ├── conv_gif.gif │ │ ├── depthcol.jpeg │ │ ├── maxpool.jpeg │ │ ├── neural_net2.jpeg │ │ └── pool1.jpeg │ └── utils │ │ └── util.py └── Sep8-neural-nets │ ├── Neural Networks.ipynb │ ├── nn-from-scratch-3-layer-network.png │ └── r-neural-nets.Rmd ├── Fall2018 ├── 1-sep5-PCA │ ├── PCA-R.Rmd │ ├── PCA-R.html │ ├── PCA-python.ipynb │ └── iris.csv ├── 2-sep19-k-means │ ├── k-means-ucr.Rmd │ ├── k-means-ucr.html │ └── readme.md ├── 3-oct3-hier_agg_clust │ ├── Oct3-hier_agg_clust.Rmd │ └── Oct3-hier_agg_clust.html ├── 4-medoids │ ├── medoid-clustering.Rmd │ └── readme.md ├── 5-Oct30-tSNE │ ├── r-tSNE.Rmd │ └── r-tSNE.html └── 6-nov14-umap │ ├── UMAP- ML Working Group.ipynb │ ├── umap-r.Rmd │ └── umap-r.html ├── LICENSE ├── MachineLearningWG.Rproj ├── Math4ML_2017 ├── Math4ML notes July 19.docx ├── Math4ML notes July 5th .docx ├── Math4MLJune7.docx ├── Math4MLMay24.docx └── README.md ├── R and Python installation help.txt ├── README.md ├── Spring2017 ├── Apr14-svm │ ├── SVM basics.ipynb │ ├── proj.png │ └── r-svm.Rmd ├── Apr28-neural-nets │ ├── Neural Networks.ipynb │ ├── nn-from-scratch-3-layer-network.png │ └── r-neural-nets.Rmd ├── Feb17-stepwise │ ├── r-stepwise-selection.Rmd │ ├── r-stepwise-selection.html │ └── stepwise-regression.ipynb ├── Feb3-knn │ ├── Feb3kNN-R.Rmd │ └── KNN-python.ipynb ├── Mar17-gam and mars │ ├── Mar3-gamearth-R.Rmd │ ├── Mar3-gamearth-R.html │ ├── Splines_Take_Two.ipynb │ └── macro.csv ├── Mar3-reg and splines │ ├── Mar3-regsplines-R.Rmd │ ├── Mar3-regsplines-R_files │ │ └── figure-html │ │ │ └── unnamed-chunk-2-1.png │ ├── Mar3-regsplines-py.ipynb │ └── macro.csv ├── May12-lightning │ └── CNNs with Keras.ipynb ├── data │ ├── Boston.csv │ ├── BreastCancer.csv │ └── sleep_VIM.csv └── spring 2017 schedule.rtf ├── Spring2018 ├── Apr11-BoostingTrees │ ├── GBM.ipynb │ └── boosting-R.Rmd ├── Apr25 - Elastic Net │ ├── Elastic Net.ipynb │ └── elastic-net.Rmd ├── Feb28-randomForest │ ├── Random Forest Python.ipynb │ ├── Random Forest R.Rmd │ └── Random_Forest_R.html ├── Jan31-knn │ ├── Jan31knn-R.Rmd │ ├── Jan31knn-R.html │ └── kNN.ipynb └── decision-trees-feb14 │ ├── Python-DecisionTreesRF.ipynb │ └── decision-trees-r.Rmd ├── binder ├── apt.txt ├── binder.md ├── install.R └── runtime.txt └── intro.pptx /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rhistory 3 | .Rproj.user 4 | .Rhistory 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | # C extensions 10 | *.so 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | # Translations 47 | *.mo 48 | *.pot 49 | # Django stuff: 50 | *.log 51 | local_settings.py 52 | # Flask stuff: 53 | instance/ 54 | .webassets-cache 55 | # Scrapy stuff: 56 | .scrapy 57 | # Sphinx documentation 58 | docs/_build/ 59 | # PyBuilder 60 | target/ 61 | # IPython Notebook 62 | .ipynb_checkpoints 63 | # pyenv 64 | .python-version 65 | # celery beat schedule file 66 | celerybeat-schedule 67 | # dotenv 68 | .env 69 | # virtualenv 70 | venv/ 71 | ENV/ 72 | # Spyder project settings 73 | .spyderproject 74 | # Rope project settings 75 | .ropeproject 76 | mars2 plotmo.pdf 77 | -------------------------------------------------------------------------------- /Fall2016/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Working Group 2 | 3 | Fridays, 12-1pm in 356 Barrows Hall 4 | 5 | Fall 2016 Schedule 6 | 7 | * September 23 - Introductory meeting 8 | * October 7 - [Decision trees](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/oct13_decisionTrees) 9 | * October 21 - [Random forests](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/oct21_randomForests) 10 | * November 4 - [Penalized regression](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/nov4_LASSO) - lasso, ridge, elastic net 11 | * November 18 - [Evan's skull dataset and GBM](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/nov18_GBM) 12 | * December 2 13 | 14 | Spring 2017 Schedule - to be determined, topics welcome! 15 | 16 | More information on the [D-Lab website](http://dlab.berkeley.edu/working-groups/machine-learning-working-group) 17 | 18 | ## Resources 19 | 20 | Books: 21 | 22 | * Intro to Statistical Learning [(free pdf)](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) [(Amazon page)](https://smile.amazon.com/Introduction-Statistical-Learning-Applications-Statistics-ebook/dp/B01IBM7790/) by Gareth James et al. 23 | * [Applied Predictive Modeling](https://smile.amazon.com/Applied-Predictive-Modeling-Max-Kuhn-ebook/dp/B00K15TZU0/) by Max Kuhn 24 | * Elements of Statistical Learning 25 | * Many others (any recommendations?) 
26 | 27 | Courses at Berkeley: 28 | 29 | * Stat 154 - Statistical Learning 30 | * CS 189 / CS 289A - Machine Learning 31 | * PH 252D - Causal Inference 32 | * PH 295 - Big Data 33 | * PH 295 - Targeted Learning for Biomedical Big Data 34 | * INFO - TBD 35 | 36 | Coursera and other online classes 37 | 38 | * To add 39 | 40 | D-Lab Machine Learning Trainings 41 | 42 | * D-Lab - Intro to Machine Learning 43 | * Erin LeDell - h2o.ai 44 | * Rochelle Terman - scikit-learn 45 | 46 | [Specifics on the D-Lab calendar](http://dlab.berkeley.edu/calendar-node-field-date) 47 | 48 | Other Campus Groups 49 | 50 | * [Machine Learning @ Berkeley](https://ml.berkeley.edu/) 51 | * D-Lab's Cloud Computing Working Group 52 | * D-Lab's Computational Text Analysis Working Group 53 | * [The Hacker Within](http://www.thehackerwithin.org/berkeley/) / Berkeley Institute for Data Science 54 | -------------------------------------------------------------------------------- /Fall2016/install_R_Python/installation_help.txt: -------------------------------------------------------------------------------- 1 | Before class, please download and install R Studio: 2 | https://www.rstudio.com/products/rstudio/download3/ 3 | 4 | If your installation does not work and says you need to install the binary files, please do so here: 5 | https://cloud.r-project.org/ 6 | 7 | Also download and install Python by following these instructions: 8 | https://github.com/dlab-berkeley/python-intensive/blob/master/Install.md 9 | (you can also just pip install scikit-learn if you have Python but not Anaconda). 10 | -------------------------------------------------------------------------------- /Fall2016/intro.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/intro.pptx -------------------------------------------------------------------------------- /Fall2016/nov18_GBM/AutoML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Auto ML Regression:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "np.random.seed(1)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## auto-sklearn" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.datasets import load_boston\n", 38 | "from sklearn.cross_validation import train_test_split\n", 39 | "from sklearn import preprocessing\n", 40 | "\n", 41 | "boston = load_boston()\n", 42 | "\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,\n", 44 | " train_size=0.8, test_size=0.2)\n", 45 | "\n", 46 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 47 | "X_train = scaler.transform(X_train)\n", 48 | "X_test = scaler.transform(X_test)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from autosklearn.regression import AutoSklearnRegressor\n", 60 | "import sklearn.cross_validation\n", 61 | "import sklearn.metrics\n", 62 | "\n", 63 | "automl_r = 
AutoSklearnRegressor(time_left_for_this_task=100)\n", 64 | " #include_estimators={\"gradient_boosting\": ()}) # time_left_for_this_task=100\n", 65 | "automl_r.fit(X_train, y_train)\n", 66 | "y_hat = automl_r.predict(X_test)\n", 67 | "print(\"R2 score\", sklearn.metrics.r2_score(y_test, y_hat))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Get final ensemble:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "print(automl_r.show_models())" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Get iteration scores:\n", 93 | "From docs: `(list of named tuples) Contains scores for all parameter combinations in param_grid. Each entry corresponds to one parameter setting. Each named tuple has the attributes: * parameters, a dict of parameter settings * mean_validation_score, the mean score over the cross-validation folds * cv_validation_scores, the list of scores for each fold`" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "automl_r.grid_scores_" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Cross-validation results:\n", 112 | "\n", 113 | "From docs: `(dict of numpy (masked) ndarrays) A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame. This attribute is a backward port to already support the advanced output of scikit-learn 0.18. Not all keys returned by scikit-learn are supported yet.`" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "automl_r.cv_results_" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## TPOT" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from tpot import TPOTRegressor\n", 143 | "\n", 144 | "tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2) # generations for optimization, , pop size is models\n", 145 | "tpot.fit(X_train, y_train)\n", 146 | "print(tpot.score(X_test, y_test))\n", 147 | "tpot.export('tpot_boston_pipeline.py')" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "# AutoML Classification" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from sklearn.datasets import load_iris\n", 166 | "from sklearn.cross_validation import train_test_split\n", 167 | "\n", 168 | "iris = load_iris()\n", 169 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,\n", 170 | " train_size=0.75, test_size=0.25)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## auto-sklearn" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "from autosklearn.classification import AutoSklearnClassifier\n", 189 | "import 
sklearn.cross_validation\n", 190 | "import sklearn.metrics\n", 191 | "\n", 192 | "automl_cl = AutoSklearnClassifier() # time_left_for_this_task=100\n", 193 | "automl_cl.fit(X_train, y_train)\n", 194 | "y_hat = automl_cl.predict(X_test)\n", 195 | "print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, y_hat))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Get final ensemble:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "print(automl_cl.show_models())" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Get iteration scores:\n", 221 | "From docs: `(list of named tuples) Contains scores for all parameter combinations in param_grid. Each entry corresponds to one parameter setting. Each named tuple has the attributes: * parameters, a dict of parameter settings * mean_validation_score, the mean score over the cross-validation folds * cv_validation_scores, the list of scores for each fold`" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "automl_cl.grid_scores_" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Cross-validation results:\n", 240 | "\n", 241 | "From docs: `(dict of numpy (masked) ndarrays) A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame. This attribute is a backward port to already support the advanced output of scikit-learn 0.18. Not all keys returned by scikit-learn are supported yet.`" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": false 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "automl_cl.cv_results_" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## TPOT" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "from tpot import TPOTClassifier\n", 271 | "from sklearn.datasets import load_digits\n", 272 | "from sklearn.cross_validation import train_test_split\n", 273 | "\n", 274 | "tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)\n", 275 | "tpot.fit(X_train, y_train)\n", 276 | "print(tpot.score(X_test, y_test))\n", 277 | "tpot.export('tpot_iris_pipeline.py')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [] 288 | } 289 | ], 290 | "metadata": { 291 | "anaconda-cloud": {}, 292 | "kernelspec": { 293 | "display_name": "Python [conda root]", 294 | "language": "python", 295 | "name": "conda-root-py" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.2" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 1 312 | } 313 | -------------------------------------------------------------------------------- /Fall2016/nov18_GBM/R 
GBM walkthrough.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R GBM walkthrough" 3 | author: "Evan Muzzall" 4 | date: "November 18, 2016" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | # 0. Freund and Schapire 1999 - background 12 | From [Freund and Schapire 1999](https://cseweb.ucsd.edu/~yfreund/papers/IntroToBoosting.pdf). 13 | "Boosting is a general method for improving the accuracy of any given learning algorithm" and originated with AdaBoost and PAC learning (p. 1-2). Gradient boosted machines are ensemble decision tree methods that combine "weak" trees, each only slightly more accurate than random guessing, which are then "boosted" into "strong" learners. That is, the models don't have to be accurate over the entire feature space. 14 | 15 | The model first tries to predict each value in the dataset - the cases that can be predicted easily are _downweighted_ so that the algorithm does not have to try as hard to predict them. 16 | 17 | However, the cases that the model has difficulty predicting are _upweighted_ so that the model tries more assertively to predict them. This continues for multiple "boosting" iterations. A resample-based performance measure is produced at each iteration. Error is measured on the weak learners so that even performing slightly better than random guessing improves accuracy fast (p.2). This method can drive down generalization error, thus preventing overfitting (p. 5). While it is susceptible to noise, it can be used to identify outliers. 18 | 19 | # 1. install packages 20 | ```{r, eval=FALSE} 21 | install.packages("car", dependencies=TRUE) 22 | install.packages("caret", dependencies=TRUE) 23 | install.packages("pROC", dependencies=TRUE) 24 | ``` 25 | ```{r, eval=FALSE} 26 | library(car) 27 | library(caret) 28 | library(pROC) 29 | ``` 30 | 31 | # 2. load the Mroz dataset 32 | ```{r} 33 | library(car) 34 | data(Mroz) 35 | str(Mroz) 36 | ``` 37 | ### 2.1 See variable definitions with `?Mroz` 38 | 39 | # 3. use createDataPartition() to create a 75/25 stratified random split 40 | ```{r} 41 | library(caret) 42 | split <- createDataPartition(Mroz$lfp, p=0.75, list=FALSE) 43 | training.set <- Mroz[split,] 44 | test.set <- Mroz[-split,] 45 | nrow(training.set) + nrow(test.set) == nrow(Mroz) # sanity check 46 | ``` 47 | 48 | `createDataPartition` = creates a stratified random split 49 | training.set = train the model here 50 | test.set = does the trained model maintain its performance here? 51 | 52 | # 4. train() a GBM model 53 | ```{r} 54 | set.seed(1) 55 | gbm.fit1 <- train(lfp ~ ., data=training.set, method="gbm", verbose=FALSE) 56 | ``` 57 | 58 | `train()` holds the tuning parameters; it fits each candidate model and then calculates a resampling-based performance metric 59 | 60 | In the model formula, " . " stands for "all other variables in the data". 61 | 62 | ### 4.1 model summary 63 | View a model summary table by calling the object. caret shows us the optimal model based on its attributes. 64 | ```{r} 65 | gbm.fit1 66 | ``` 67 | 68 | interaction.depth = tree depth/complexity 69 | n.trees = number of boosting iterations 70 | Accuracy = overall agreement rate averaged over the cross-validated boosting iterations 71 | Kappa = Cohen's unweighted kappa averaged across resampling results (1 = perfect agreement) 72 | 73 | ### 4.2 Plot bargraph of variable relative influence with summary() 74 | ```{r} 75 | summary(gbm.fit1, las=2, main="GBM relative influence") 76 | gbm.fit1$times 77 | ``` 78 | 
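Not part of the original walkthrough, but a small optional sketch: caret's `train` object stores the selected tuning parameters and the full resampling table, so you can pull them out programmatically instead of reading the printed summary. `bestTune`, `results`, and `getTrainPerf()` are standard caret accessors; `gbm.fit1` is the object fit above.

```{r, eval=FALSE}
# Tuning parameter combination that caret selected
gbm.fit1$bestTune

# Resampling table: one row per tuning combination, with Accuracy and Kappa
head(gbm.fit1$results)

# Mean resampled performance of the selected model
getTrainPerf(gbm.fit1)
```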
79 | # 5. trainControl() and expand.grid() 80 | ### 5.1 define the parameters of the control mechanism with `trainControl()` 81 | ```{r} 82 | control <- trainControl(method="repeatedcv", 83 | repeats=5, 84 | classProbs=TRUE, 85 | summaryFunction=twoClassSummary) 86 | ``` 87 | 88 | method = "repeatedcv": repeated k-fold cross-validation to measure the predictive performance of the model 89 | repeats = number of times to repeat the cross-validation 90 | classProbs = this will calculate predicted class probabilities (ROC) within the resampling process (Kuhn, 2015:4) 91 | summaryFunction = uses observed versus predicted values to estimate performance (AUC, sensitivity, specificity) (Kuhn, 2015:4) 92 | 93 | ### 5.2 compare multiple models at once with `expand.grid()` 94 | ```{r} 95 | grid <- expand.grid(n.trees=seq(100,2100, by=100), 96 | interaction.depth=c(1,3,5), 97 | shrinkage=c(0.01,0.05, 0.1), 98 | n.minobsinnode=10) 99 | ``` 100 | 101 | n.trees = number of boosting iterations 102 | interaction.depth = tree depth/complexity 103 | shrinkage = learning rate of the algorithm; how quickly the model adapts to the data at each iteration 104 | n.minobsinnode = minimum number of observations needed to commence splitting 105 | 106 | 107 | 108 | 109 | ```{r} 110 | set.seed(1) 111 | gbm.fit2 <- train(lfp ~ ., data=training.set, 112 | method="gbm", 113 | metric="ROC", 114 | trControl=control, 115 | tuneGrid=grid, 116 | verbose=FALSE) 117 | gbm.fit2$times 118 | ``` 119 | 120 | verbose = whether to print lengthy output (`TRUE` or `FALSE`) 121 | 122 | ### 5.3 model summary table 123 | ```{r} 124 | gbm.fit2 125 | ``` 126 | 127 | ### 5.4 bargraph of variable relative influence 128 | ```{r} 129 | summary(gbm.fit2, las=2) 130 | ``` 131 | 132 | # 6. ggplot line graph of the tuned models 133 | ```{r} 134 | ggplot(gbm.fit2) + theme_grey() + ggtitle("Model comparisons") 135 | ``` 136 | 137 | Want to learn more about ggplot2 themes? :) See [the ggplot2 themes help page](http://docs.ggplot2.org/dev/vignettes/themes.html) 138 | 139 | # 7. generate GBM predicted values and probabilities with `predict()` 140 | ```{r} 141 | set.seed(1) 142 | gbm.pred <- predict(gbm.fit2, test.set) 143 | gbm.prob <- predict(gbm.fit2, test.set, type="prob") 144 | ``` 145 | 146 | `predict()` = generates predictions from fitted model objects 147 | 148 | ### 7.1 view the GBM confusion matrix 149 | ```{r} 150 | gbm.cm <- confusionMatrix(gbm.pred, test.set$lfp) 151 | gbm.cm 152 | ``` 153 | 154 | A confusion/error matrix is a cross-tabulation of observed versus predicted classes. 155 | 156 | # 8. plot GBM ROC curve 157 | ```{r} 158 | library(pROC) 159 | rocCurve <- roc(response=test.set$lfp, 160 | predictor = gbm.prob[, "yes"], 161 | levels = rev(levels(test.set$lfp)), 162 | auc=TRUE, ci=TRUE) 163 | ``` 164 | 165 | ```{r} 166 | plot(rocCurve, main="GBM", col="blue", col.main="blue", col.lab="blue") 167 | ``` 168 | 169 | # Help 170 | * The [caret help page](https://topepo.github.io/caret/) 171 | 172 | * [Package 'caret'](https://cran.r-project.org/web/packages/caret/caret.pdf) 173 | 174 | * Kuhn M. 2008. [Building predictive models in R using the caret package](https://www.jstatsoft.org/article/view/v028i05/v28i05.pdf). J Stat Softw 28:1-26. 175 | 176 | * Kuhn M. 2013. 
[Predictive modeling with R and the caret package](https://www.r-project.org/nosvn/conferences/useR-2013/Tutorials/kuhn/user_caret_2up.pdf). useR! The R User Conference, July 10-12, University of Castilla-La Mancha, Albacete, Spain 177 | 178 | * Kuhn M. 2015. [A Short Introduction to the caret Package](https://cran.r-project.org/web/packages/caret/vignettes/caret.pdf). 179 | -------------------------------------------------------------------------------- /Fall2016/nov4_LASSO/nov4_examples.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1404\cocoasubrtf470 2 | {\fonttbl\f0\fnil\fcharset0 Calibri;} 3 | {\colortbl;\red255\green255\blue255;\red0\green0\blue233;} 4 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 5 | \deftab720 6 | \pard\pardeftab720\sl280\partightenfactor0 7 | 8 | \f0\fs32 \cf0 \expnd0\expndtw0\kerning0 9 | Python example: \ 10 | \pard\pardeftab720\sl280\partightenfactor0 11 | {\field{\*\fldinst{HYPERLINK "https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/"}}{\fldrslt \cf2 \ul \ulc2 https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/}}\ 12 | \ 13 | R example:\ 14 | \pard\pardeftab720\sl280\partightenfactor0 15 | {\field{\*\fldinst{HYPERLINK "http://machinelearningmastery.com/penalized-regression-in-r/"}}{\fldrslt \cf2 \ul \ulc2 http://machinelearningmastery.com/penalized-regression-in-r/}}} -------------------------------------------------------------------------------- /Fall2016/nov4_LASSO/penalized-regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Penalized regression in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | # Data prep 8 | 9 | ```{r} 10 | library(MASS) 11 | data(Boston) 12 | help(Boston) 13 | str(Boston) 14 | summary(Boston) 15 | 16 | # Our outcome is median home value. 17 | outcome = "medv" 18 | 19 | # Divide into 80% training, 20% test split. 20 | library(caret) 21 | set.seed(1) 22 | train_index = caret::createDataPartition(Boston[, outcome], p = .8, 23 | list = F, 24 | times = 1) 25 | 26 | # Glmnet wants the data to be matrices, not data frames. 27 | X_train = as.matrix(Boston[train_index, !names(Boston) == outcome]) 28 | X_test = as.matrix(Boston[-train_index, !names(Boston) == outcome]) 29 | 30 | Y_train = Boston[train_index, outcome] 31 | Y_test = Boston[-train_index, outcome] 32 | 33 | dim(X_train) 34 | length(Y_train) 35 | 36 | dim(X_test) 37 | length(Y_test) 38 | ``` 39 | 40 | 41 | # Lasso 42 | 43 | Lasso penalizes coefficients and imposes sparsity, so some coefficients may be shrunk to 0 if they do not appear to be related to the outcome. 44 | 45 | ```{r} 46 | library(glmnet) 47 | # Fit the lasso to continuous Y 48 | reg = cv.glmnet(X_train, Y_train, family = "gaussian", alpha = 1) 49 | 50 | # Look at distribution of penalty term lambda. 51 | plot(reg) 52 | 53 | # Plot the underlying glmnet object, showing 54 | # coefficients for differnt lambda values. 55 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 56 | 57 | # Lambda with minimum mean-squared error. 58 | reg$lambda.min 59 | 60 | # Higher lambda within 1SE of performance of the minimum. 61 | # (the "one standard error" rule from Leo Breiman.) 62 | reg$lambda.1se 63 | 64 | # Review coeffients 65 | coef(reg, s = "lambda.1se") 66 | 67 | # What about for lambda.min? 68 | coef(reg, s = "lambda.min") 69 | 70 | # Predict on test set. 
71 | pred = predict(reg, s = reg$lambda.1se, newx = X_test) 72 | 73 | # Calculate mean-squared error. 74 | mean((pred - Y_test)^2) 75 | ``` 76 | 77 | # Ridge 78 | 79 | Ridge penalizes the coefficients but does not impose sparsity, so no coefficient will ever be 0. 80 | 81 | ```{r} 82 | 83 | # Fit the ridge to continuous Y 84 | # We just change alpha to 0 to get ridge regression. 85 | reg = cv.glmnet(X_train, Y_train, family = "gaussian", alpha = 0) 86 | 87 | # Look at distribution of penalty term lambda. 88 | plot(reg) 89 | 90 | # Plot the underlying glmnet object, showing 91 | # coefficients for differnt lambda values. 92 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 93 | 94 | # Predict on test set. 95 | pred = predict(reg, s = reg$lambda.1se, newx = X_test) 96 | 97 | # Calculate mean-squared error. 98 | mean((pred - Y_test)^2) 99 | ``` 100 | 101 | As expected, we do a little worse with ridge compared to lasso. 102 | 103 | # Elastic net 104 | 105 | ```{r} 106 | train_control = trainControl(method = "repeatedCV", 107 | number = 10, 108 | repeats = 3) 109 | 110 | set.seed(1) 111 | 112 | # Create a custom tuning grid. 113 | enet_grid = expand.grid(alpha = seq(0, 1, length.out = 5), 114 | lambda = 2^seq(-1, -7, length = 5)) 115 | 116 | # Review the grid. 117 | enet_grid 118 | 119 | # To be simpler we could just say e.g. tuneLength = 5. 120 | 121 | enet = train(X_train, Y_train, method = "glmnet", 122 | #tuneLength = 5, 123 | tuneGrid = enet_grid, 124 | trControl = train_control) 125 | 126 | print(enet) 127 | 128 | plot(enet) 129 | 130 | enet$bestTune 131 | 132 | # Predict on test. 133 | pred = predict(enet, X_test) 134 | 135 | # Review performance 136 | mean((pred - Y_test)^2) 137 | ``` 138 | 139 | # References 140 | 141 | Intro to Statistical Learning, Chapter 6 142 | 143 | [Glmnet vignette by Hastie and Qian](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) - lots of great code examples -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/Horning 2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/oct13_decisionTrees/Horning 2016.pdf -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/Lewicki 2007.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/oct13_decisionTrees/Lewicki 2007.pdf -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/r-decision-trees.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision trees for machine learning" 3 | output: html_notebook 4 | --- 5 | 6 | Topics 7 | 8 | * rpart 9 | * Caret 10 | * SuperLearner 11 | * h2o.ai 12 | * mlr 13 | * book 14 | 15 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Use the latest RStudio preview release to run within RStudio. 16 | 17 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 18 | 19 | ```{r} 20 | # Load iris dataset. 21 | data(iris) 22 | 23 | # Review data structure. 
24 | str(iris) 25 | 26 | # Review species distribution. 27 | table(iris$Species, useNA = "ifany") 28 | ``` 29 | 30 | ```{r} 31 | # install rpart first if you don't already have it. 32 | # rpart = recursive partitioning and regression trees (aka decision trees) 33 | library(rpart) 34 | 35 | # Review package help and vignette if available. 36 | # HINT: vignette covers all of this in much better detail. 37 | help(package = "rpart") 38 | 39 | # Review main decision tree function. 40 | ?rpart 41 | 42 | # Review the configuration options for trees. 43 | ?rpart.control 44 | 45 | # We need to set a seed due to randomness in the cross-validation. 46 | set.seed(1) 47 | 48 | # Fit a classification decision tree to predict Species using all other variables. 49 | # We don't need to specify method="class" because Species is a factor variable. 50 | # We specify 10 cross-validation folds to determine the best complexity. 51 | # Minbucket is the minimum number of observations in a node. 52 | tree_model = rpart(Species ~ ., data = iris, 53 | control = rpart.control(xval = 10, minbucket = 5, cp = 0)) 54 | 55 | # Display the decision tree in text form. 56 | tree_model 57 | 58 | # Plot tree graphically. 59 | plot(tree_model, compress = T) 60 | # We have to add the plot text manually for some reason. 61 | text(tree_model, use.n = T) 62 | ``` 63 | 64 | Wow, this is one of the worst plots I've ever seen! Hard to get much worse than that. 65 | 66 | Let's tree a better decision tree plotting package. 67 | 68 | ```{r} 69 | # Install from CRAN if you don't already have this: 70 | library(rpart.plot) 71 | 72 | rpart.plot(tree_model) 73 | 74 | # What other settings can we modify? 75 | ?rpart.plot 76 | 77 | # Review the vignette if interested. 78 | help(package = "rpart.plot") 79 | 80 | # Another way to plot it. 81 | library(partykit) 82 | plot(as.party(tree_model)) 83 | 84 | # fancyRpartPlot() in the rattle package is also good. 85 | 86 | ``` 87 | 88 | We can dig into the details of the tree a bit more. 89 | 90 | ```{r} 91 | # Review accuracy for different complexity parameters. 92 | # When nsplits = 0 we have 0 nodes and are just guessing the most common class. 93 | # When nsplits is large we have 1 + # splits nodes and each node is its own prediction. 94 | printcp(tree_model) 95 | 96 | # Save the complexit parameter table. 97 | cp_table = printcp(tree_model) 98 | 99 | # Review structure of the cp table. 100 | str(cp_table) 101 | 102 | # Which row has minimum cross-validation error? 103 | # Alternatively we could choose the tree within 1 SD of the minimum. 104 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 105 | best_row 106 | best_row["CP"] 107 | 108 | # Get all the details on the tree. 109 | summary(tree_model, cp = best_row["CP"]) 110 | 111 | # Prune to the optimal complexity parameter (no change in this case). 112 | tree_model = prune(tree_model, cp = best_row["CP"]) 113 | 114 | tree_model 115 | ``` 116 | 117 | We did not create a separate holdout or test set, so let's predict back on the original data. 118 | 119 | ```{r} 120 | predictions = predict(tree_model, iris) 121 | summary(predictions) 122 | 123 | # How do the predictions look compared to the outcome data? 124 | data.frame(iris$Species, predictions) 125 | 126 | # This is an optimistic view because the model was built on this same data. 127 | # With a random holdout set we would get a more realistic view of accuracy. 128 | 129 | ``` 130 | 131 | ## Regression 132 | 133 | Quick regression example. 
134 | ```{r} 135 | # This data is in the rpart package. 136 | data(car90) 137 | 138 | # Review structure of dataset. 139 | str(car90) 140 | 141 | # Set seed due to cross-validation randomness. 142 | set.seed(1) 143 | 144 | # Predict price using most other fields. 145 | # Remove a few fields that are too predictive (rim) or too many categories. 146 | reg_tree = rpart(Price ~ ., data = car90[, !names(car90) %in% c("Rim", "Tires", "Model2")]) 147 | 148 | # How'd it go? 149 | reg_tree 150 | 151 | # Review complexity parameter options. 152 | printcp(reg_tree) 153 | 154 | # Visualize results across complexity parameter. 155 | rsq.rpart(reg_tree) 156 | 157 | # Save the complexit parameter table. 158 | cp_table = printcp(reg_tree) 159 | 160 | # Which row has minimum cross-validation error? 161 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 162 | best_row 163 | best_row["CP"] 164 | 165 | # Review summary with the best complexity parameter. 166 | summary(reg_tree, cp = best_row["CP"]) 167 | 168 | # Prune our tree back to the best complexity parameter. 169 | # Note that in this case no real pruning is needed, because 170 | # the full tree is the best. 171 | reg_tree = prune(reg_tree, cp = best_row["CP"]) 172 | 173 | # Visualize our final tree. 174 | rpart.plot(reg_tree) 175 | 176 | ``` 177 | 178 | # Caret 179 | 180 | ```{r} 181 | library(caret) 182 | 183 | # Nice and simple - using default settings for everything. 184 | # caret tries 3 complexity parameters by default, but tuneLength customizes that. 185 | model = train(Species ~ ., data = iris, method = "rpart", tuneLength = 5) 186 | 187 | # We see again that cp= 0 gives us the best accuracy. 188 | model 189 | 190 | # Use the handle built-in caret plotting. 191 | plot(model) 192 | 193 | # Look at the final model object (rpart). 194 | model$finalModel 195 | ``` 196 | 197 | # SuperLearner 198 | 199 | SuperLearner unfortunately cannot do multiple-class classification (yet) so let's convert to a binary classification problem. 200 | 201 | ```{r} 202 | 203 | # Review 204 | table(iris$Species) 205 | 206 | # Copy into a new dataframe. 207 | data = iris 208 | 209 | # Convert Species to a binary indicator for setosa. 210 | data$Species = 1*(data$Species == "versicolor") 211 | 212 | # Confirm distribution of modified outcome variable. 213 | table(data$Species, iris$Species, useNA = "ifany") 214 | 215 | library(SuperLearner) 216 | 217 | set.seed(1) 218 | 219 | sl = SuperLearner(X = data[, -5], Y = data$Species, family = binomial(), 220 | SL.library = c("SL.mean", "SL.rpart")) 221 | sl 222 | 223 | # Review the raw rpart object. 224 | sl$fitLibrary$SL.rpart_All$object 225 | 226 | # Use our nice plotting library. 227 | rpart.plot::rpart.plot(sl$fitLibrary$SL.rpart_All$object) 228 | 229 | ``` 230 | 231 | # h2o.ai 232 | 233 | We can get close to a single decision tree by using randomForest in h2o. We set RF to fit a single decision tree and to search all variables at each split. It will not be exactly the same due to boostrap sampling but will be similar. 234 | 235 | ```{r} 236 | library(h2o) 237 | 238 | # Start h2o backend. 239 | h2o.init() 240 | 241 | # Load iris data into h2o. 242 | iris_h2o = h2o.uploadFile(path = system.file("extdata", "iris_wheader.csv", package="h2o"), 243 | destination_frame = "iris_h2o") 244 | 245 | # Confirm it loaded correctly. 246 | summary(iris_h2o) 247 | 248 | # Specify x and y by the column indices. 249 | # Set ntree to 1, and mtries to # of covariates. 250 | # Seed only reproducible when running single-threaded. 
251 | iris_tree = h2o.randomForest(y = 5, x = 1:4, training_frame = iris_h2o, 252 | ntrees = 1, mtries = 4, seed = 1) 253 | 254 | # Review results. 255 | iris_tree 256 | 257 | summary(iris_tree) 258 | 259 | # Review variable importance. 260 | h2o.varimp(iris_tree) 261 | 262 | # Plot variable importance - nice. 263 | h2o.varimp_plot(iris_tree) 264 | 265 | # Shutdown h2o backend. 266 | h2o.shutdown(prompt = F) 267 | ``` 268 | 269 | # mlr 270 | 271 | ```{r} 272 | library(mlr) 273 | 274 | # Generate the task for multiple classification (also works for binary). 275 | task = makeClassifTask(data = iris, target = "Species") 276 | 277 | # Get the number of observations 278 | n = getTaskSize(task) 279 | 280 | # Generate the learners. 281 | learners = list(makeLearner("classif.rpart", id = "rpart", predict.type = "prob")) 282 | 283 | # 5-fold cross-validation, stratifying on Y to ensure balance across folds. 284 | # could use stratify.cols to stratify on certain important covariates. 285 | rdesc = makeResampleDesc("CV", iters = 5, stratify = T) 286 | 287 | # Fit model across cross-validation folds and calculate the performance. 288 | result = benchmark(learners, task, rdesc, measures = list(acc, mmce)) 289 | 290 | # MMCE = mean misclassification error (i.e. 1 - accuracy) 291 | result 292 | 293 | # Plot the results. Generally we would plot multiple models here. 294 | plotBMRBoxplots(result, measure = acc) 295 | ``` 296 | 297 | 298 | # Decision tree references 299 | 300 | This book has nearly everything you would want to know about the theory of decision trees: 301 | 302 | Breiman, L., Friedman, J., Stone, C. J., & Olshen, R. A. (1984). Classification and regression trees. CRC press. 303 | 304 | The book has 32,000 citations according to Google Scholar. Not too shabby! Breiman and Stone were both Berkeley professors, and Breiman invented Random Forest, bagging, and some of the SuperLearner theory. Friedman is at Stanford and invented many other machine learning algorithms, particularly gradient boosted machines GBM) and multivariate adaptive regression splines (MARS). Olshen is also at Stanford. 305 | -------------------------------------------------------------------------------- /Fall2016/oct21_randomForests/R-random forests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "UC Berkeley D-Lab MLWG random forests in R" 3 | author: "Evan Muzzall" 4 | date: "October 21, 2016" 5 | output: 6 | html_document: 7 | toc: yes 8 | pdf_document: 9 | toc: yes 10 | word_document: 11 | toc: yes 12 | --- 13 | ```{r} 14 | rm(list=ls()) 15 | #options(scipen = 999) 16 | ``` 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE) 20 | ``` 21 | 22 | # 1. What are random forests? 23 | Random forests are ensemble classifier methods that use multiple decision tree models for classification and regression. 24 | 25 | Unlike decision trees/bagged trees, by default results generally do not require pruning and include accuracy and variable importance information. Furthermore, at each random forest tree split, only a small portion of the predictors are used (rather than the full suite). 26 | 27 | We will fit four different random forest models: 28 | 1. rf1: 'randomForest' package model 29 | 30 | 2. rf2: 'SuperLearner' package model 31 | 32 | 3. rf3: 'SuperLearnerL' package model compared to 'rpart' decision tree model and SL mean 33 | 34 | 4. 
rf4: 'SuperLearner' package model with external cross-validation for multi-model comparison and visualization of model differences 35 | 36 | ## 1.1 install packages 37 | Install and `library()` necessary packages. 38 | ```{r, eval=FALSE} 39 | library(car) 40 | library(caret) 41 | library(gbm) 42 | library(ggplot2) 43 | library(lattice) 44 | library(randomForest) 45 | library(rpart) 46 | library(ROCR) 47 | library(SuperLearner) 48 | library(survival) 49 | ``` 50 | 51 | ## 1.2 `data(Mroz)` 52 | Load and explore Mroz dataset. 53 | ```{r} 54 | library(car) 55 | data(Mroz) 56 | ?Mroz 57 | str(Mroz) 58 | head(Mroz) 59 | ``` 60 | 61 | ## 1.3 `lfp` 62 | Let's examine frequencies of the `lfp` variable (labor force participation), since it is the one we want to predict. 63 | ```{r, eval=FALSE} 64 | Mroz$lfp 65 | ``` 66 | ```{r} 67 | library(lattice) 68 | table(Mroz$lfp) 69 | barchart(table(Mroz$lfp), col="orange") 70 | ``` 71 | 72 | ## 1.4 stratified random split 73 | Now, we will use the `createDataPartition` command from the 'caret' package to perform a 75/25 stratified random split of the Mroz data into training and test sets. 74 | ```{r} 75 | library(caret) 76 | set.seed(1) 77 | split <- createDataPartition(Mroz$lfp, p=0.75, list=FALSE) 78 | training.set <- Mroz[split,] 79 | test.set <- Mroz[-split,] 80 | 81 | nrow(training.set) + nrow(test.set) == nrow(Mroz) # sanity check 82 | ``` 83 | 84 | ## 1.5 `randomForest()` model on 'training.set' 85 | Using the 'randomForest' package, let's fit a random forest model to predict the number of women who participated or did not participate in the labor force in 1975. 86 | ```{r} 87 | library(randomForest) 88 | ?randomForest 89 | set.seed(1) 90 | rf1 <- randomForest(lfp ~ ., 91 | data=training.set, 92 | ntree=500, 93 | mtry=2, 94 | importance=TRUE) 95 | #NOTE: notice that our response vector 'lfp' is a factor - this will assume classification models, otherwise regression will be assumed. If it is omitted entirely, randomForest becomes unsupervised. 96 | rf1 97 | 98 | # check accuracy on training set 99 | (189+247) / nrow(training.set) # training.set = 77% accuracy 100 | 101 | rf1$importance 102 | barchart(rf1$importance, main="rf barchart", col="blue", border="black") 103 | dotplot(rf1$importance, main="rf dotplot", col=c(1,4)) 104 | ``` 105 | 106 | ## 1.6 model performance on 'test.set' 107 | Now, let's see how our model performs on the test data. 108 | ```{r} 109 | set.seed(1) 110 | pred <- predict(rf1, newdata=test.set) 111 | table(pred, test.set$lfp) 112 | ``` 113 | 114 | ## 1.7 check model accuracy 115 | Of the 188 test.set observations, We have 56 true negatives (correct 'no' predictions), and 81 true positives (correct 'yes' predictions). 116 | 117 | Now, we can quickly check the accuracy of the model using the holdout dataset. 118 | ```{r} 119 | (56 + 81) / nrow(test.set) #test.set = 73% accuracy 120 | ``` 121 | 122 | # 2 Compare multiple models using the 'SuperLearner' R package 123 | 'SuperLearner' is an R package that allows you to easily compare multiple machine learning algorithms at once and/or the same algorithm with different settings. 124 | 125 | It then creates an optimal weighted average of those models, aka an "ensemble", using the test data performance. This approach has been proven to be asymptotically as accurate as the best possible prediction algorithm that is tested. 
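As a purely illustrative sketch (not from the original walkthrough) of what that weighted average looks like: each candidate algorithm receives a cross-validated risk and a non-negative weight, and the ensemble prediction is, approximately and depending on the combination method, the weighted combination of the candidates' predictions. The object name `sl_fit` and the data `new_X` are hypothetical placeholders; the walkthrough's real fits appear below as `rf2` and `rf3`.

```{r, eval=FALSE}
# Hypothetical fitted SuperLearner object `sl_fit` and hypothetical new data `new_X`
sl_fit$cvRisk   # cross-validated risk for each candidate algorithm
sl_fit$coef     # ensemble weights (non-negative; typically sum to 1)

# The ensemble prediction combines the candidates' predictions using those weights
pred <- predict(sl_fit, newdata = new_X)
manual_ensemble <- as.vector(pred$library.predict %*% sl_fit$coef)
head(cbind(superlearner = pred$pred, manual = manual_ensemble))
```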
126 | 127 | ## 2.1 Coerce `lfp` to integer type 128 | For binary classification, SuperLearner prefers that your categorical outcome is numeric/integer, rather than factor data type. 129 | 130 | Let's first coerce `lfp` from factor to integer type. 131 | ```{r} 132 | class(training.set$lfp) 133 | class(test.set$lfp) 134 | 135 | ?ifelse 136 | training.set$lfp <- ifelse(training.set$lfp=="yes", 1L, 0L) 137 | test.set$lfp <- ifelse(test.set$lfp=="yes", 1L, 0L) 138 | 139 | class(training.set$lfp) 140 | class(test.set$lfp) 141 | ``` 142 | ```{r, eval=FALSE} 143 | training.set$lfp 144 | test.set$lfp 145 | ``` 146 | 147 | ## 2.2 Assign Y variables 148 | Now, we should assign binary outcome variables for the training and test sets for the 'SuperLearner' computations. 149 | ```{r} 150 | Y <- training.set$lfp 151 | Y_test <- test.set$lfp 152 | table(Y) 153 | table(Y_test) 154 | ``` 155 | 156 | However, because we specify our outcome and predictor variables in SuperLearner, we must remove the outcome variable from our training and test sets because we do not want to include them as a predictor: 157 | ```{r} 158 | training.set2 <- training.set[,c(2:8)] 159 | test.set2 <- test.set[,c(2:8)] 160 | dim(training.set2) 161 | dim(test.set2) 162 | ``` 163 | 164 | ## 2.3 View code for randomForest and fit the second random forest model 165 | ```{r} 166 | library(SuperLearner) 167 | listWrappers() 168 | SL.randomForest 169 | ?SL.randomForest 170 | 171 | rf2 <- SuperLearner(Y = Y, X = training.set2, family = binomial(), SL.library = "SL.randomForest") 172 | 173 | rf2 174 | ``` 175 | In the output, Risk is an estimate of model accuracy/performance as estimated by cross-validation of risk on future data. By default it uses 10 folds. 176 | 177 | Coef is how much weight SuperLearner puts on that model in the weighted-average. If Coef = 0 it means that model is not used at all. 178 | 179 | ## 2.4 Compare multiple models simultaneously 180 | Now, let's compare our random forest model to a decision tree model from R package 'rpart' as well as the weighted mean of the models. 181 | 182 | Based on model performance (risk), SuperLearner will then tell us which model is the best (Discrete winner) and also create a weighted average of multiple models. 183 | 184 | We include the mean of Y ("SL.mean") as a benchmark algorithm - if it is the discrete winner, then we can assume that our model fits the data poorly. 185 | 186 | Fit the third random forest model along with the SL.mean and rpart decision tree models as well: 187 | ```{r} 188 | rf3 <- SuperLearner(Y = Y, X = training.set2, family = binomial(), SL.library = c("SL.mean", "SL.rpart", "SL.randomForest")) 189 | 190 | rf3 191 | ``` 192 | 193 | ## 2.5 Assess model performance on test.set 194 | Then, we want to assess the model performance on test.set and illustrate with a simple barplot. 195 | ```{r} 196 | pred2 <- predict(rf3, test.set2, onlySL=TRUE) 197 | 198 | summary(pred2$library.predict) 199 | qplot(pred2$pred) + theme_linedraw() + xlab("predicted values") 200 | ``` 201 | 202 | We can then check the area under the receiver operator characteristic (ROC) curve to see how accurate the model fits to test.set 203 | ```{r} 204 | library(ROCR) 205 | pred_rocr <- prediction(pred2$pred, Y_test) 206 | auc <- performance(pred_rocr, measure = "auc", x.measure = "cutoff")@y.values[[1]] 207 | auc # AUC = 0.79 - this is approximately consistent with our other accuracies! 208 | ``` 209 | 210 | # 3. 
Cross validation of random forest in SuperLearner and visualization 211 | External cross-validation with `CV.SuperLearner` defaults to 10 folds. If we want to change it to 5 folds, we can set `V = 5`. This makes plotting easy based on V-fold cross-validated risk estimation. 212 | ```{r} 213 | set.seed(1) 214 | rf4 <- CV.SuperLearner(Y = Y, X = training.set2, family = binomial(), V = 5, SL.library = c("SL.mean", "SL.rpart", "SL.randomForest")) 215 | 216 | summary(rf4) 217 | 218 | table(simplify2array(rf4$whichDiscreteSL)) 219 | plot(rf4) + theme_linedraw() 220 | ``` 221 | 222 | Acknowledgements: 223 | Chris Kennedy 224 | 225 | [James G, Witten D, Hastie T, Tibshirani R. 2013. An Introduction to Statistical Learning - with Applications in R. New York: Springer](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) 226 | [Package "SuperLearner"](https://cran.r-project.org/web/packages/SuperLearner/SuperLearner.pdf) -------------------------------------------------------------------------------- /Fall2016/oct21_randomForests/tree.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | -------------------------------------------------------------------------------- /Fall2017/Fall2017info: -------------------------------------------------------------------------------- 1 | Machine Learning Working Group Fall 2017 2 | 3 | This semester's topic is neural networks for image processing! 4 | 5 | [Click to find out where neural networks might fit in the data science universe, by Swami Chandrasekaran](http://nirvacana.com/thoughts/becoming-a-data-scientist/) 6 | 7 | [Check out the Neural Network Zoo, by Fjodor Van Veen](http://www.asimovinstitute.org/neural-network-zoo/) 8 | 9 | [View this post for a classic StackExchange response to the question "What does the hidden layer in a neural network compute?"](https://stats.stackexchange.com/questions/63152/what-does-the-hidden-layer-in-a-neural-network-compute) 10 | 11 | [Practice with quick tutorials thanks to fast.ai here](http://course.fast.ai/) and [here](https://github.com/fastai/courses/tree/master/deeplearning1/nbs) 12 | 13 | [View what a ten-week course on convolutional neural networks would look like here](http://cs231n.stanford.edu/syllabus.html) 14 | 15 | ### Dataset 16 | We will be using [The Nature Conservancy Fisheries Monitoring dataset](https://www.kaggle.com/c/the-nature-conservancy-fisheries-monitoring) for the walkthroughs this semester. 17 | 18 | ### Schedule 19 | Alternating Fridays 9/8 to 12/15, starting at 12:30pm, in the D-Lab Convening Room (356B Barrows). 20 | 21 | - September 8: BRC Savio lightning talk, introduction, resources, dataset; feed forward and deep feed forward neural networks 22 | 23 | - September 22: Benten lightning talk(?); convolutional neural networks 24 | 25 | - October 6: lightning talks!
26 | -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_001.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_002.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_003.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_004.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_005.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_006.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_007.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_008.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_009.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_010.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_010.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_011.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_012.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_013.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_014.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_015.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_016.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_016.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_017.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_018.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_019.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_019.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_020.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/alexnet.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/alexnet.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/cnn.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/cnn.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/conv_gif copy.gif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/conv_gif copy.gif.png -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/conv_gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/conv_gif.gif -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/depthcol.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/depthcol.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/maxpool.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/maxpool.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/neural_net2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/neural_net2.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/pool1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/pool1.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/utils/util.py: 
-------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import time 4 | 5 | # A Whole Bunch of Convenience Functions for Cleaning Up Plots 6 | def removeAxes(ax): 7 | ax.get_xaxis().set_visible(False) 8 | ax.get_yaxis().set_visible(False) 9 | 10 | def removeFrames(ax,sides=['top','right']): 11 | for side in sides: 12 | ax.spines[side].set_visible(False) 13 | 14 | def removeTicks(ax,axes): 15 | if 'x' in axes: 16 | ax.tick_params(axis='x', 17 | which='both', 18 | top='off', 19 | labeltop='off', 20 | bottom='off', 21 | labelbottom='off') 22 | if 'y' in axes: 23 | ax.tick_params(axis='y', 24 | which='both', 25 | left='off', 26 | labelleft='off', 27 | right='off', 28 | labelright='off') 29 | 30 | def addAxis(ax,axis='horizontal'): 31 | if axis == 'horizontal': 32 | xmin,xmax = ax.get_xlim() 33 | ax.hlines(0,xmin,xmax) 34 | elif axis == 'vertical': 35 | ymin,ymax = ax.get_ylim() 36 | ax.vlines(0,ymin,ymax) 37 | 38 | def cleanPlot(ax): 39 | removeFrames(plt.gca(),['top','right','bottom']); 40 | removeTicks(plt.gca(),['x','y']); 41 | 42 | def setLims(ax,xBounds,yBounds): 43 | ax.set_xlim(xBounds); ax.set_ylim(yBounds); 44 | 45 | def plot_across(imgs,cmap='Greys_r'): 46 | plt.figure(figsize=(12,3)) 47 | for i in range(len(imgs)): 48 | img = imgs[i] 49 | plt.subplot(1,len(imgs),i+1) 50 | plt.imshow(img,cmap=cmap) 51 | plt.grid(b=False) 52 | -------------------------------------------------------------------------------- /Fall2017/Sep8-neural-nets/nn-from-scratch-3-layer-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep8-neural-nets/nn-from-scratch-3-layer-network.png -------------------------------------------------------------------------------- /Fall2017/Sep8-neural-nets/r-neural-nets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural networks in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Single-layer networks 12 | * Multi-layer networks 13 | * Possibly more 14 | 15 | 16 | Before we dig in, we will install the R packages we'll be using. 17 | 18 | **R packages** 19 | ```{r} 20 | # List of packages we will use. 21 | packages = c("MASS", "nnet", "h2o", "devtools", "NeuralNetTools") 22 | 23 | github_packages = c( 24 | # Chris's tools package for plotting the SuperLearner. 25 | "ck37r" = "ck37/ck37r", 26 | # Use more up-to-date SuperLearner from github. 27 | "SuperLearner" = "ecpolley/SuperLearner") 28 | 29 | devtools::install_github(github_packages) 30 | 31 | # Load those github packages. 32 | ck37r::load_packages(names(github_packages)) 33 | 34 | # Load required non-github packages and install from CRAN if necessary. 35 | ck37r::load_packages(packages, auto_install = T, verbose = T) 36 | 37 | # Also install mxnet for potential usage. 38 | # This unfortunately is Mac/Windows only; probably will not work for Linux. 39 | # Actually not working for Mac either. 40 | if (F) { 41 | # Skip this for now. 42 | install.packages("drat", repos="https://cran.rstudio.com") 43 | drat:::addRepo("dmlc") 44 | install.packages("mxnet") 45 | } 46 | 47 | # Could install Keras, but this can get complicated. 
48 | if (F) { 49 | devtools::install_github("rstudio/keras") 50 | # One version: 51 | install_keras() 52 | # Or: 53 | install_keras(method = "conda") 54 | } 55 | 56 | # Clean up variables. 57 | rm(packages, github_packages) 58 | ``` 59 | 60 | # Background 61 | 62 | Please see Deb's python code for more details on neural network theory. 63 | 64 | # Software packages 65 | 66 | We'll be using `nnet` for simple neural networks and `h2o` for deep neural networks. 67 | 68 | # Data preparation 69 | 70 | ```{r} 71 | data(Boston, package = "MASS") 72 | 73 | # Remove our outcome variable from the covariate list. 74 | X_df = Boston[, -14] 75 | 76 | # Convert X from a dataframe to a matrix. 77 | X_mat = model.matrix(~ ., data = X_df) 78 | 79 | # Notice the extra intercept column added by model.matrix. 80 | colnames(X_mat) 81 | 82 | # Remove extra intercept term. 83 | X_mat = X_mat[, -1] 84 | 85 | # Regression (continuous) version of our outcome variable. 86 | Y_reg = Boston$medv 87 | 88 | # Review outcome distribution. 89 | summary(Y_reg) 90 | 91 | # Classification (binary) version of our outcome variable. 92 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 93 | 94 | # Review outcome distribution. 95 | table(Y_class) 96 | prop.table(table(Y_class)) 97 | 98 | ``` 99 | 100 | # Single-layer neural network 101 | 102 | 103 | Quick classification example 104 | 105 | ```{r} 106 | library(nnet) 107 | 108 | # Classification 109 | 110 | # Set seed because weights are initialized randomly. 111 | set.seed(1) 112 | 113 | # X can be a dataframe or matrix. 114 | # If Y is a factor we need to use this formula notation. 115 | fit = nnet(Y_class ~ X_mat, size = 2, decay = 5e-4, maxit = 200) 116 | 117 | # Review our neural network fit. 118 | fit 119 | 120 | # Plot our neural network. 121 | library(NeuralNetTools) 122 | plotnet(fit) 123 | 124 | # Predict back to our original data. 125 | pred = predict(fit, X_mat) 126 | 127 | # Review predictions. 128 | summary(pred) 129 | 130 | # 131 | ``` 132 | 133 | Quick regression example 134 | 135 | ```{r} 136 | library(nnet) 137 | 138 | # Set seed because weights are initialized randomly. 139 | set.seed(1) 140 | 141 | # Again, X can be a dataframe or matrix. 142 | fit = nnet(Y_reg ~ X_mat, size = 2, decay = 5e-4, maxit = 200, 143 | # Enable linear output to support regression. 144 | linout = T) 145 | 146 | # Challenge: try with linout = F (the default) and see what happens. 147 | 148 | # Review our neural network fit. 149 | fit 150 | 151 | # Visualize neural network. 152 | plotnet(fit) 153 | 154 | # Predict back to our original data. 155 | pred = predict(fit, X_mat) 156 | 157 | # Review predictions. 158 | summary(pred) 159 | 160 | # Calculate mean-squared error (MSE). 161 | mean((pred - Y_reg)^2) 162 | 163 | # And root mean squared error (RMSE), which is on the original scale 164 | # of the outcome variable (easier to interpret). 165 | sqrt(mean((pred - Y_reg)^2)) 166 | 167 | ``` 168 | 169 | # SuperLearner optimization 170 | 171 | These challenges can be done in pairs/groups to make it easier. 172 | 173 | Challenge 1: use SL.nnet wrapper to estimate performance of the neural network. 174 | 175 | Challenge 2: use create.Learner() to test 2, 3, 4, or 5 hidden units and create a weighted average ensemble. 176 | 177 | # Multi-layer neural network 178 | 179 | Challenge: use h2o to design this. 180 | 181 | ```{r} 182 | library(h2o) 183 | # Startup and connect to our existing h2o cluster. 184 | # Use all available threads. 185 | # Could increase ram with option (e.g.) 
max_mem_size = "8g" 186 | h2o.init(nthreads = -1) 187 | 188 | # Clean slate - just in case the cluster was already running. 189 | h2o.removeAll() 190 | 191 | # Load x data into h2o. 192 | data = as.h2o(cbind(X_df, `_outcome` = Y_reg)) 193 | dim(data) 194 | 195 | outcome = "_outcome" 196 | x = colnames(X_df) 197 | 198 | # Fit the deep learning model here. 199 | # key optional arguments: 200 | # hidden = c(200, 200) 201 | # epochs = 10 202 | # seed = -1 203 | # rate_decay = 1 204 | # reproducible = FALSE 205 | # See ?h2.deeplearning for more - huge variety of configurations 206 | model = h2o.deeplearning(x = x, y = outcome, 207 | training_frame = data, 208 | nfolds = 10) 209 | 210 | # Review model, in particular the cross-validation section. 211 | model 212 | 213 | # Estimate model performance on another data set. 214 | # Could be a test set but here it's just the resubstitution performance. 215 | # So this is more biased than the cross-validated results reported above. 216 | h2o.performance(model, data) 217 | 218 | # Shutdown server when we're done. 219 | # This will also happen automatically if we close RStudio, provided 220 | # the server was started within R. 221 | h2o.shutdown(prompt = F) 222 | ``` 223 | 224 | See also Erin LeDell's [excellent tutorial on deep learning](https://github.com/ledell/useR-machine-learning-tutorial/blob/master/deep-neural-networks.Rmd). 225 | 226 | ## To add: Keras and mxnet versions. 227 | -------------------------------------------------------------------------------- /Fall2018/1-sep5-PCA/PCA-python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MachineLearning Working Group\n", 8 | "\n", 9 | "### Python PCA - September 5, 2018\n", 10 | "\n", 11 | "As with the [R walkthrough](https://github.com/dlab-berkeley/MachineLearningWG/blob/master/Fall2018/sep5-PCA/PCA-R.Rmd), let's begin by replicating [another great example](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60) for conducting PCA in Python and then see a machine learning application. 
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import matplotlib as mpl\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "%matplotlib inline\n", 24 | "\n", 25 | "from sklearn.preprocessing import StandardScaler\n", 26 | "from sklearn.decomposition import PCA\n", 27 | "from sklearn.datasets import fetch_mldata\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "from sklearn.linear_model import LogisticRegression" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Load the iris dataset" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "iris = pd.read_csv('./iris.csv')\n", 46 | "print(type(iris))\n", 47 | "iris.head()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Define the nuemric features" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "Features = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]\n", 64 | "x = iris.loc[:, Features].values\n", 65 | "x" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# Standardize the numeric features" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "x = StandardScaler().fit_transform(x)\n", 82 | "x" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Extract the target variable" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "y = iris.loc[:,[\"Species\"]].values\n", 99 | "y" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Define the 2D PCA feature space" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "pca = PCA(n_components=2)\n", 116 | "principalComponents = pca.fit_transform(x)\n", 117 | "pca_df = pd.DataFrame(data = principalComponents\n", 118 | " , columns = ['principal component 1', 'principal component 2'])\n", 119 | "pca_df" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Concatenate the Species vector the principal component arrays" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "scrolled": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "iris_pca = pd.concat([iris[[\"Species\"]], pca_df], axis = 1)\n", 138 | "iris_pca.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Construct the scatterplot" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fig = plt.figure(figsize = (8,8))\n", 155 | "ax = fig.add_subplot(1,1,1) \n", 156 | "ax.set_xlabel(\"Principal Component 1\", fontsize = 15)\n", 157 | "ax.set_ylabel(\"Principal Component 2\", fontsize = 15)\n", 158 | "ax.set_title(\"PCA iris scatterplot\", fontsize = 20)\n", 159 | "targets = [\"setosa\", \"versicolor\", \"virginica\"]\n", 
160 | "colors = [\"r\", \"g\", \"b\"]\n", 161 | "for target, color in zip(targets,colors):\n", 162 | " indicesToKeep = iris_pca[\"Species\"] == target\n", 163 | " ax.scatter(iris_pca.loc[indicesToKeep, \"principal component 1\"]\n", 164 | " , iris_pca.loc[indicesToKeep, \"principal component 2\"]\n", 165 | " , c = color\n", 166 | " , s = 50)\n", 167 | "ax.legend(targets)\n", 168 | "ax.grid()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "pca.explained_variance_ratio_" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "# Proportions of variance are similar to R!\n", 185 | "The proportions of variance are virtually identical to those we obtained in R: \n", 186 | "\n", 187 | "- PC 1 = 0.7296245 \n", 188 | "- PC 2 = 0.2285076" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# Machine Learning example\n", 203 | "\n", 204 | "Now, let's use PCA to optimize a logistic regression model. " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# Load the mnist dataset\n", 214 | "mnist = fetch_mldata('MNIST original')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# Split the data with a 70/30 split\n", 224 | "# Define our training and test images and our training and test labels\n", 225 | "# random_state is like setting the seed in R and ensures reproducible results\n", 226 | "train_img, test_img, train_lbl, test_lbl = train_test_split(mnist.data, mnist.target, test_size=1/7.0, random_state=0)\n", 227 | "\n", 228 | "# Initialize the scaler to standardize the data (remember that PCA is grossly affected by scale!)\n", 229 | "scaler = StandardScaler()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "# Fit model to training set" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "scaler.fit(train_img)\n", 246 | "\n", 247 | "train_img = scaler.transform(train_img)\n", 248 | "\n", 249 | "test_img = scaler.transform(test_img)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Initialize the PCA model" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# change the value in the parentheses to tell the model how much variation should be retained. 
\n", 266 | "# We want 95% of it so we enter 0.95\n", 267 | "mnist_pca = PCA(0.95)\n", 268 | "mnist_pca.fit(train_img)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "# Do the transform on the training and test sets" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "train_img = mnist_pca.transform(train_img)\n", 285 | "test_img = mnist_pca.transform(test_img)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "# Initialize logistic regression\n", 293 | "... with default settings" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# all parameters not specified are set to their defaults\n", 303 | "# default solver is incredibly slow which is why it was changed to 'lbfgs'\n", 304 | "logisticRegr = LogisticRegression(solver = 'lbfgs')\n", 305 | "logisticRegr.fit(train_img, train_lbl)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# Predict for One Observation (image)\n", 315 | "logisticRegr.predict(test_img[0].reshape(1,-1))" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "# Predict for One Observation (image)\n", 325 | "logisticRegr.predict(test_img[0:10])" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "logisticRegr.score(test_img, test_lbl)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "View [this webpage](https://plot.ly/ipython-notebooks/principal-component-analysis/) for another great iris example. 
" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.6.5" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 2 373 | } 374 | -------------------------------------------------------------------------------- /Fall2018/1-sep5-PCA/iris.csv: -------------------------------------------------------------------------------- 1 | Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width 2 | setosa,5.1,3.5,1.4,0.2 3 | setosa,4.9,3,1.4,0.2 4 | setosa,4.7,3.2,1.3,0.2 5 | setosa,4.6,3.1,1.5,0.2 6 | setosa,5,3.6,1.4,0.2 7 | setosa,5.4,3.9,1.7,0.4 8 | setosa,4.6,3.4,1.4,0.3 9 | setosa,5,3.4,1.5,0.2 10 | setosa,4.4,2.9,1.4,0.2 11 | setosa,4.9,3.1,1.5,0.1 12 | setosa,5.4,3.7,1.5,0.2 13 | setosa,4.8,3.4,1.6,0.2 14 | setosa,4.8,3,1.4,0.1 15 | setosa,4.3,3,1.1,0.1 16 | setosa,5.8,4,1.2,0.2 17 | setosa,5.7,4.4,1.5,0.4 18 | setosa,5.4,3.9,1.3,0.4 19 | setosa,5.1,3.5,1.4,0.3 20 | setosa,5.7,3.8,1.7,0.3 21 | setosa,5.1,3.8,1.5,0.3 22 | setosa,5.4,3.4,1.7,0.2 23 | setosa,5.1,3.7,1.5,0.4 24 | setosa,4.6,3.6,1,0.2 25 | setosa,5.1,3.3,1.7,0.5 26 | setosa,4.8,3.4,1.9,0.2 27 | setosa,5,3,1.6,0.2 28 | setosa,5,3.4,1.6,0.4 29 | setosa,5.2,3.5,1.5,0.2 30 | setosa,5.2,3.4,1.4,0.2 31 | setosa,4.7,3.2,1.6,0.2 32 | setosa,4.8,3.1,1.6,0.2 33 | setosa,5.4,3.4,1.5,0.4 34 | setosa,5.2,4.1,1.5,0.1 35 | setosa,5.5,4.2,1.4,0.2 36 | setosa,4.9,3.1,1.5,0.2 37 | setosa,5,3.2,1.2,0.2 38 | setosa,5.5,3.5,1.3,0.2 39 | setosa,4.9,3.6,1.4,0.1 40 | setosa,4.4,3,1.3,0.2 41 | setosa,5.1,3.4,1.5,0.2 42 | setosa,5,3.5,1.3,0.3 43 | setosa,4.5,2.3,1.3,0.3 44 | setosa,4.4,3.2,1.3,0.2 45 | setosa,5,3.5,1.6,0.6 46 | setosa,5.1,3.8,1.9,0.4 47 | setosa,4.8,3,1.4,0.3 48 | setosa,5.1,3.8,1.6,0.2 49 | setosa,4.6,3.2,1.4,0.2 50 | setosa,5.3,3.7,1.5,0.2 51 | setosa,5,3.3,1.4,0.2 52 | versicolor,7,3.2,4.7,1.4 53 | versicolor,6.4,3.2,4.5,1.5 54 | versicolor,6.9,3.1,4.9,1.5 55 | versicolor,5.5,2.3,4,1.3 56 | versicolor,6.5,2.8,4.6,1.5 57 | versicolor,5.7,2.8,4.5,1.3 58 | versicolor,6.3,3.3,4.7,1.6 59 | versicolor,4.9,2.4,3.3,1 60 | versicolor,6.6,2.9,4.6,1.3 61 | versicolor,5.2,2.7,3.9,1.4 62 | versicolor,5,2,3.5,1 63 | versicolor,5.9,3,4.2,1.5 64 | versicolor,6,2.2,4,1 65 | versicolor,6.1,2.9,4.7,1.4 66 | versicolor,5.6,2.9,3.6,1.3 67 | versicolor,6.7,3.1,4.4,1.4 68 | versicolor,5.6,3,4.5,1.5 69 | versicolor,5.8,2.7,4.1,1 70 | versicolor,6.2,2.2,4.5,1.5 71 | versicolor,5.6,2.5,3.9,1.1 72 | versicolor,5.9,3.2,4.8,1.8 73 | versicolor,6.1,2.8,4,1.3 74 | versicolor,6.3,2.5,4.9,1.5 75 | versicolor,6.1,2.8,4.7,1.2 76 | versicolor,6.4,2.9,4.3,1.3 77 | versicolor,6.6,3,4.4,1.4 78 | versicolor,6.8,2.8,4.8,1.4 79 | versicolor,6.7,3,5,1.7 80 | versicolor,6,2.9,4.5,1.5 81 | versicolor,5.7,2.6,3.5,1 82 | versicolor,5.5,2.4,3.8,1.1 83 | versicolor,5.5,2.4,3.7,1 84 | versicolor,5.8,2.7,3.9,1.2 85 | versicolor,6,2.7,5.1,1.6 86 | versicolor,5.4,3,4.5,1.5 87 | versicolor,6,3.4,4.5,1.6 88 | versicolor,6.7,3.1,4.7,1.5 89 | versicolor,6.3,2.3,4.4,1.3 90 | versicolor,5.6,3,4.1,1.3 91 | versicolor,5.5,2.5,4,1.3 92 | versicolor,5.5,2.6,4.4,1.2 93 
| versicolor,6.1,3,4.6,1.4 94 | versicolor,5.8,2.6,4,1.2 95 | versicolor,5,2.3,3.3,1 96 | versicolor,5.6,2.7,4.2,1.3 97 | versicolor,5.7,3,4.2,1.2 98 | versicolor,5.7,2.9,4.2,1.3 99 | versicolor,6.2,2.9,4.3,1.3 100 | versicolor,5.1,2.5,3,1.1 101 | versicolor,5.7,2.8,4.1,1.3 102 | virginica,6.3,3.3,6,2.5 103 | virginica,5.8,2.7,5.1,1.9 104 | virginica,7.1,3,5.9,2.1 105 | virginica,6.3,2.9,5.6,1.8 106 | virginica,6.5,3,5.8,2.2 107 | virginica,7.6,3,6.6,2.1 108 | virginica,4.9,2.5,4.5,1.7 109 | virginica,7.3,2.9,6.3,1.8 110 | virginica,6.7,2.5,5.8,1.8 111 | virginica,7.2,3.6,6.1,2.5 112 | virginica,6.5,3.2,5.1,2 113 | virginica,6.4,2.7,5.3,1.9 114 | virginica,6.8,3,5.5,2.1 115 | virginica,5.7,2.5,5,2 116 | virginica,5.8,2.8,5.1,2.4 117 | virginica,6.4,3.2,5.3,2.3 118 | virginica,6.5,3,5.5,1.8 119 | virginica,7.7,3.8,6.7,2.2 120 | virginica,7.7,2.6,6.9,2.3 121 | virginica,6,2.2,5,1.5 122 | virginica,6.9,3.2,5.7,2.3 123 | virginica,5.6,2.8,4.9,2 124 | virginica,7.7,2.8,6.7,2 125 | virginica,6.3,2.7,4.9,1.8 126 | virginica,6.7,3.3,5.7,2.1 127 | virginica,7.2,3.2,6,1.8 128 | virginica,6.2,2.8,4.8,1.8 129 | virginica,6.1,3,4.9,1.8 130 | virginica,6.4,2.8,5.6,2.1 131 | virginica,7.2,3,5.8,1.6 132 | virginica,7.4,2.8,6.1,1.9 133 | virginica,7.9,3.8,6.4,2 134 | virginica,6.4,2.8,5.6,2.2 135 | virginica,6.3,2.8,5.1,1.5 136 | virginica,6.1,2.6,5.6,1.4 137 | virginica,7.7,3,6.1,2.3 138 | virginica,6.3,3.4,5.6,2.4 139 | virginica,6.4,3.1,5.5,1.8 140 | virginica,6,3,4.8,1.8 141 | virginica,6.9,3.1,5.4,2.1 142 | virginica,6.7,3.1,5.6,2.4 143 | virginica,6.9,3.1,5.1,2.3 144 | virginica,5.8,2.7,5.1,1.9 145 | virginica,6.8,3.2,5.9,2.3 146 | virginica,6.7,3.3,5.7,2.5 147 | virginica,6.7,3,5.2,2.3 148 | virginica,6.3,2.5,5,1.9 149 | virginica,6.5,3,5.2,2 150 | virginica,6.2,3.4,5.4,2.3 151 | virginica,5.9,3,5.1,1.8 -------------------------------------------------------------------------------- /Fall2018/2-sep19-k-means/readme.md: -------------------------------------------------------------------------------- 1 | # K-means clustering 2 | 3 | * [R Markdown File](k-means-ucr.Rmd) 4 | * [Python Jupyter File](https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html) 5 | * (Can open in Google Collab for a nice interactive experience) -------------------------------------------------------------------------------- /Fall2018/3-oct3-hier_agg_clust/Oct3-hier_agg_clust.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hierarchical clustering" 3 | author: "Evan" 4 | date: "10/2/2018" 5 | output: 6 | html_document: 7 | number_sections: yes 8 | toc: yes 9 | toc_float: yes 10 | --- 11 | 12 | ```{r set-options, echo = FALSE, cache = FALSE} 13 | options(width = 140) 14 | ``` 15 | 16 | # Hierarchical agglomerative clustering 17 | Hierarchical agglomerative clustering is a "bottom-up" method of clustering. Each observation begins as its own cluster and forms clusters with like items as it moves up the hierarchy. That is, all leaves are their own clusters to begin with and form clusters as we move up the trunk and various branches are formed. 18 | 19 | Distance and cluster method information are usually displayed at the bottom of the graph, while the vertical axis displays the height, which refers to the distance between two clusters. We are not concerned as much with distances along the horizontal axis. We can also "cut" the dendrogram to specify a number of clusters, which is similar to defining _k_ in k-means clustering (which is also equally problematic). 
20 | 21 | In a real-life research situation, you will likely want to scale the data. However, raw data are used in this example. 22 | # Package installation 23 | ```{r} 24 | if (FALSE) { 25 | # Run this line manually (once) to install the necessary packages. 26 | # Install packages from CRAN: 27 | install.packages(c("ape", "pvclust", "mclust")) 28 | } 29 | 30 | # fancy dendrogram options 31 | library(ape) 32 | # dendrograms with p-values 33 | library(pvclust) 34 | # model-based clustering 35 | library(mclust) 36 | ``` 37 | 38 | # Load data 39 | ```{r} 40 | data(mtcars) 41 | ?mtcars 42 | ``` 43 | 44 | Start by using the `hclust` built-in function from [{stats}](https://www.rdocumentation.org/packages/stats/versions/3.5.1). `hclust` prefers a dissimilarity matrix via the `dist` function, thus it plots rows as opposed to columns like the methods further below. 45 | 46 | # The `hclust` built-in function 47 | ```{r} 48 | # See the help files 49 | ?hclust 50 | 51 | # Create distance matrix 52 | mtcars_dist = dist(mtcars, method = "euclidean") 53 | 54 | # Fit hclust_model 55 | system.time({ 56 | hclust_model = hclust(mtcars_dist, method = "complete") 57 | }) 58 | 59 | # Plot hclust_model dendrogram 60 | plot(hclust_model, hang = -1) 61 | ``` 62 | 63 | Data are visualized in dendrograms, or branching tree-like structures similar to decision trees, albeit with less information displayed at each node. The most similar items are found lower in the dendrogram and fuse into $n-1$ clusters as we move up the tree; the next two items to fuse into a cluster produce $n-2$ clusters and so on as we move up the tree until there is just one overarching cluster. Thus, clusters become more inclusive as we move up the hierarchy. 64 | 65 | Dissimilarity is applied not just to single observations, but to groups as well (linkage). Thus the "Cadillac Fleetwood / Lincoln Continental" cluster fuses with "Chrysler Imperial" instead of "Maserati Bora" or something else. 66 | 67 | You can also cut the tree to see how the tree varies: 68 | ```{r} 69 | # If we want only 5 clusters, for example (must be a number between 1-32, since mtcars has only 32 observations): 70 | cutree(hclust_model, 5) 71 | ``` 72 | 73 | # The `ape` package 74 | 75 | The [`ape` package](https://cran.r-project.org/web/packages/ape/index.html) provides some great functionality for constructing and plotting clusters: 76 | ```{r} 77 | library(ape) 78 | # various plots 79 | plot(as.phylo(hclust_model)) 80 | plot(as.phylo(hclust_model), type = "cladogram") 81 | plot(as.phylo(hclust_model), type = "unrooted") 82 | 83 | # radial plot 84 | colors = c("red", "orange", "blue", "green", "purple") 85 | clus5 = cutree(hclust_model, 5) 86 | plot(as.phylo(hclust_model), type = "fan", tip.color = colors[clus5], lwd = 2, cex = 1) 87 | ``` 88 | 89 | > NOTE: the color settings for the radial plot apply to the other ape plots as well. 90 | 91 | # The `pvclust` package 92 | The [pvclust](http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/) package offers a straightforward way to perform hierarchical agglomerative clustering of columns with two types of p-values at each split: approximately unbiased **(AU)** and bootstrap probability **(BP)**.
93 | ```{r} 94 | library(pvclust) 95 | # Cluster features 96 | 97 | # Ward's method: minimum variance between clusters 98 | system.time({ 99 | pvclust_model_ward = pvclust(mtcars, 100 | method.hclust = "ward.D", 101 | method.dist = "euclidean", 102 | nboot = 1000, parallel = T) 103 | }) 104 | 105 | plot(pvclust_model_ward) 106 | 107 | # pvrect will draw rectangles around clusters with high or low p-values 108 | pvrect(pvclust_model_ward, alpha = 0.95) 109 | ``` 110 | 111 | ### Compare different dissimilarity measures 112 | ```{r} 113 | # Complete linkage: largest intercluster difference 114 | system.time({ 115 | pvclust_model_complete = pvclust(mtcars, 116 | method.hclust = "complete", 117 | method.dist = "euclidean", 118 | nboot = 1000, parallel = T) 119 | }) 120 | 121 | # Single linkage: smallest intercluster difference 122 | system.time({ 123 | pvclust_model_single = pvclust(mtcars, 124 | method.hclust = "single", 125 | method.dist = "euclidean", 126 | nboot = 1000, parallel = T) 127 | }) 128 | 129 | # Average linkage: mean intercluster difference 130 | system.time({ 131 | pvclust_model_average = pvclust(mtcars, 132 | method.hclust = "average", 133 | method.dist = "euclidean", 134 | nboot = 1000, parallel = T) 135 | }) 136 | 137 | # View summaries 138 | pvclust_model_ward 139 | pvclust_model_complete 140 | pvclust_model_single 141 | pvclust_model_average 142 | 143 | # Plot Euclidean distance linkages 144 | par(mfrow = c(2,2)) 145 | plot(pvclust_model_ward, main = "Ward", xlab = "", sub = "") 146 | pvrect(pvclust_model_ward) 147 | plot(pvclust_model_complete, main = "Complete", xlab = "", sub = "") 148 | pvrect(pvclust_model_complete) 149 | plot(pvclust_model_single, main = "Single", xlab = "", sub = "") 150 | pvrect(pvclust_model_single) 151 | plot(pvclust_model_average, main = "Average", xlab = "", sub = "") 152 | pvrect(pvclust_model_average) 153 | par(mfrow = c(1,1)) 154 | ``` 155 | 156 | ### View standard error plots: 157 | ```{r} 158 | par(mfrow=c(2,2)) 159 | seplot(pvclust_model_ward, main = "Ward") 160 | seplot(pvclust_model_complete, main = "Complete") 161 | seplot(pvclust_model_single, main = "Single") 162 | seplot(pvclust_model_average, main = "Average") 163 | par(mfrow=c(1,1)) 164 | ``` 165 | 166 | # Going further - the `mclust` package 167 | The [`mclust`](https://cran.r-project.org/web/packages/mclust/index.html) package provides "Gaussian finite mixture models fitted via EM algorithm for model-based clustering, classification, and density estimation, including Bayesian regularization, dimension reduction for visualisation, and resampling-based inference." 
168 | ```{r} 169 | library(mclust) 170 | 171 | # Fit model 172 | mclust_model = Mclust(mtcars) 173 | 174 | # View various plots 175 | plot(mclust_model, what = "BIC") 176 | plot(mclust_model, what = "classification") 177 | plot(mclust_model, what = "uncertainty") 178 | plot(mclust_model, what = "density") 179 | ``` 180 | 181 | ### Return best performing model 182 | ```{r} 183 | summary(mclust_model) 184 | ``` 185 | 186 | ### Cross-validated mclust 187 | ```{r} 188 | # sort mpg in decreasing order 189 | mtcars = mtcars[order(-mtcars$mpg),] 190 | mtcars 191 | 192 | # create a binary factor variable from mpg: "less than 20mpg" and "greater than 20mpg" 193 | mtcars$class = cut(mtcars$mpg, 194 | breaks = c(0, 20, 40), 195 | levels = c(1, 2), 196 | labels = c("less than 20mpg", "greater than 20mpg")) 197 | mtcars 198 | 199 | # define our predictors (X) and class labels (class) 200 | X = mtcars[ , -12] 201 | class = mtcars$class 202 | 203 | # fit the model (EEE covariance structure, basically the same as linear discriminant analysis) 204 | mclust_model2 = MclustDA(X, class = class, modelType = "EDDA", modelNames = "EEE") 205 | 206 | # cross-validate! 207 | set.seed(1) 208 | cv_mclust = cvMclustDA(mclust_model2, nfold = 20) 209 | 210 | # View cross-validation error and standard error of the cv error 211 | cv_mclust[c("error", "se")] 212 | ``` 213 | 214 | References and resources: 215 | - [Quick-R: Cluster Analysis](https://www.statmethods.net/advstats/cluster.html) 216 | - [James et al. Introduction to Statistical Learning, pp. 390-401](https://www-bcf.usc.edu/~gareth/ISL/) 217 | - [pvclust](http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/) 218 | - [STHDA: Beautiful dendrogram visualizations](http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning) 219 | - [Gaston Sanchez: Visualizing Dendrograms in R](https://rpubs.com/gaston/dendrograms) 220 | - [Analysis of Phylogenetics and Evolution](http://ape-package.ird.fr/) 221 | - [A Quick Tour of mclust](https://cran.r-project.org/web/packages/mclust/vignettes/mclust.html) 222 | - [mclust vignette (from 2012, but more detailed)](https://www.stat.washington.edu/sites/default/files/files/reports/2012/tr597.pdf) 223 | - A very [useful walkthrough](https://quantdev.ssri.psu.edu/sites/qdev/files/Unsupervised_Machine_Learning_The_mclust_Package_and_others.html) by Christian Lopez 224 | - [MoEClust:](https://cran.r-project.org/web/packages/MoEClust/vignettes/MoEClust.html) Gaussian Parsimonious Clustering Models with Gating and Expert Network Covariates 225 | - See the [cluster](https://cran.r-project.org/web/packages/cluster/cluster.pdf) R package to learn more about agnes, clara, daisy, diana, fanny, flower, mona, and pam cluster methods! 226 | 227 | 228 | -------------------------------------------------------------------------------- /Fall2018/4-medoids/medoid-clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Medoid clustering" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | Implementations: 11 | 12 | 1. [cluster::pam](https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/pam.html) 13 | 2. [kmed](https://cran.r-project.org/web/packages/kmed/vignettes/kmedoid.html) 14 | 3. 
[hopach](https://www.bioconductor.org/packages/release/bioc/html/hopach.html) 15 | 16 | ## Installation 17 | 18 | Run this section once manually - it will not be run when knitting the markdown file. 19 | 20 | ```{r eval=FALSE} 21 | # cluster is a built-in package. 22 | 23 | # kmed 24 | install.packages("kmed") 25 | 26 | # hopach 27 | ## try http:// if https:// URLs are not supported 28 | source("https://bioconductor.org/biocLite.R") 29 | biocLite("hopach") 30 | 31 | ``` 32 | 33 | ### Load libraries 34 | 35 | ```{r} 36 | library(cluster) 37 | library(kmed) 38 | library(hopach) 39 | ``` 40 | 41 | ## Data prep 42 | 43 | We're trying out a birth weight dataset. 44 | 45 | ```{r} 46 | data = MASS::birthwt 47 | summary(data) 48 | ?MASS::birthwt 49 | data$race = factor(data$race, labels = c("white", "black", "other")) 50 | str(data) 51 | 52 | # Create a list to hold different variables. 53 | vars = list( 54 | # Birth weight or low are generally our outcomes for supervised analyses. 55 | outcomes = c("bwt", "low"), 56 | 57 | # Variables we want to exclude from our analysis - none currently. 58 | exclude = NULL 59 | ) 60 | 61 | vars$covariates = setdiff(names(data), vars$outcomes) 62 | 63 | # Review our data structure. 64 | vars 65 | ``` 66 | 67 | 68 | ## K-med package 69 | 70 | ```{r kmed} 71 | 72 | # Review covariate structure 73 | str(data[, vars$covariates]) 74 | 75 | # Create distance matrix. 76 | # NOTE: perhaps we should center & scale data beforehand. 77 | dist_mat = 78 | # This function is for "mixed" variable data - numeric, binary, and/or categorical. 79 | distmix(data[, vars$covariates], 80 | # There are 6 options for method here. 81 | method = "gower", 82 | # method = "huang", 83 | # Harikumar seems to require all integer data. 84 | # method = "harikumar", 85 | # method = "wishart", 86 | # Provide column numbers for the numeric variables. 87 | idnum = which(vars$covariates %in% c("age", "lwt", "ptl", "ftv")), 88 | # Binary variables. 89 | idbin = which(vars$covariates %in% c("smoke", "ht", "ui")), 90 | # Categorical variables. 91 | idcat = which(vars$covariates %in% c("race"))) 92 | 93 | # 189 x 189. 94 | dim(dist_mat) 95 | # Same as the number of observations. 96 | nrow(data) 97 | 98 | # Conduct the medoids analysis with 3 clusters. 99 | # Other function options: rankkmed, stepkmed. 100 | result = fastkmed(dist_mat, ncluster = 3, iterate = 50) 101 | 102 | # Examine distribution of low birth weight across clusters. 103 | table("cluster" = result$cluster, "low wgt" = data$low) 104 | prop.table(table("cluster" = result$cluster, "low wgt" = data$low), margin = 1) 105 | ``` 106 | 107 | Does cluster help us predict birth weight? 108 | 109 | ```{r kmed_ols} 110 | # OLS 1: don't include cluster. 111 | reg1 = lm(bwt ~ ., 112 | data = data[, c(vars$covariates, vars$outcomes[1])]) 113 | summary(reg1) 114 | 115 | # OLS 2: with cluster included. 116 | reg2 = lm(bwt ~ ., 117 | data = cbind(data[, c(vars$covariates, vars$outcomes[1])], 118 | cluster = factor(result$cluster))) 119 | summary(reg2) 120 | ``` 121 | 122 | We have a reasonable increase in adjusted R-squared. How else could we examine the possible benefit of the cluster variable on our predictive accuracy? Are other methods preferable to gower? 
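One way to probe the first question, sketched below, is to compare cross-validated prediction error for the two OLS specifications instead of relying on in-sample adjusted R-squared. This is only an illustrative sketch in base R using the objects already created above (`data`, `vars`, and `result$cluster`); the fold count and seed are arbitrary choices, not recommendations.

```{r cv_sketch}
# Hedged sketch: k-fold cross-validation comparing OLS with and without the
# k-medoids cluster label. Assumes `data`, `vars`, and `result` from the
# chunks above are in memory.
set.seed(1)
k_folds = 5
folds = sample(rep(1:k_folds, length.out = nrow(data)))

cv_rmse = function(df) {
  rmse = numeric(k_folds)
  for (fold in 1:k_folds) {
    # Fit on all folds except the held-out one, then predict the held-out fold.
    fit = lm(bwt ~ ., data = df[folds != fold, ])
    pred = predict(fit, newdata = df[folds == fold, ])
    rmse[fold] = sqrt(mean((df$bwt[folds == fold] - pred)^2))
  }
  mean(rmse)
}

# Covariates plus the continuous outcome, without and with the cluster label.
df_base = data[, c(vars$covariates, "bwt")]
df_clust = cbind(df_base, cluster = factor(result$cluster))

# A lower cross-validated RMSE would suggest the cluster label helps prediction.
c(no_cluster = cv_rmse(df_base), with_cluster = cv_rmse(df_clust))
```

Repeating the comparison across different `ncluster` values, or across the other `distmix` methods ("huang", "wishart", etc.), would be one way to tackle the second question as well.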
123 | 124 | ```{r} 125 | # a simple and fast k-medoids function for bootstrap evaluation 126 | boot_kmed = function(distmat, nclust) { 127 | result = fastkmed(distmat, nclust, iterate = 50) 128 | return(result$cluster) 129 | } 130 | 131 | # k-means function for bootstrap evaluation 132 | boot_kmeans = function(x, nclust) { 133 | result = kmeans(x, nclust) 134 | return(result$cluster) 135 | } 136 | 137 | k = 3 138 | num_boots = 50 139 | fastkmedboot = clustboot(dist_mat, nclust = k, boot_kmed, nboot = num_boots) 140 | # For k-means we need to create a numeric matrix (i.e. convert factor to indicators) 141 | data_mat = model.matrix(~ ., data = data[, vars$covariates])[, -1] 142 | kmeansboot = clustboot(data_mat, nclust = k, boot_kmeans, 143 | nboot = num_boots, diss = FALSE) 144 | 145 | # Consensus matrix creation. 146 | 147 | wardorder <- function(x, nclust) { 148 | res <- hclust(x, method = "ward.D2") 149 | member <- cutree(res, nclust) 150 | return(member) 151 | } 152 | consensusfastkmed <- consensusmatrix(fastkmedboot, nclust = k, wardorder) 153 | 154 | clustheatmap(consensusfastkmed, "Clustering via Fast K-medoids") 155 | 156 | consensuskmeans <- consensusmatrix(kmeansboot, nclust = k, wardorder) 157 | clustheatmap(consensuskmeans, "Clustering via K-means") 158 | ``` 159 | 160 | 161 | ## Cluster: partitioning around medoids 162 | 163 | ```{r cluster_pam} 164 | # Maybe we can figure out during MLWG? 165 | 166 | result_pam = 167 | cluster::pam(data[, vars$covariates], k = 3, 168 | metric = "euclidean") 169 | #metric = "manhattan") 170 | 171 | # Output is a bit too verbose. 172 | summary(result_pam) 173 | 174 | # We get a PCA plot with ellipsoids, 175 | # Then a silhouette plot. 176 | plot(result_pam) 177 | ``` 178 | 179 | [sklearn info on silhouette plots](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html) 180 | 181 | ## HOPACH package 182 | 183 | ```{r hopach} 184 | # We use the numeric data matrix here, which has converted factors to indicators. 185 | dist = distancematrix(data_mat, 186 | d = "cosangle", 187 | # d = "cor", 188 | na.rm = TRUE) 189 | dim(dist) 190 | 191 | hobj = hopach(data_mat, dmat = dist) 192 | 193 | # Number of clusters identified. 194 | hobj$clust$k 195 | 196 | # Review sizes of each cluster. 197 | hobj$clust$sizes 198 | 199 | # This plot is recommended but does not seem that useful. 200 | dplot(dist, hobj, ord="final", main="Distance matrix", showclusters = FALSE) 201 | 202 | # Bootstrap analysis 203 | # TODO: identify how to set seed. 204 | bobj = boothopach(data_mat, hobj, B = 100) 205 | 206 | # Bootstrap plot of the resampling results. This call is adapted 207 | # from the hopach package vignette 208 | # (code chunk "bootplot", eval = FALSE there). 209 | bootplot(bobj, hobj, ord = "bootp", 210 | main = "Bootstrap plot", showclusters = FALSE) 211 | 212 | ``` 213 | 214 | ## Resources 215 | 216 | Please see the package references - several great articles in there, especially kmed. Kaufman and Rousseeuw (1990) is one of the classic textbooks.
217 | -------------------------------------------------------------------------------- /Fall2018/4-medoids/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2018/4-medoids/readme.md -------------------------------------------------------------------------------- /Fall2018/5-Oct30-tSNE/r-tSNE.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "tSNE" 3 | author: "Evan" 4 | date: "10/24/2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | ```{r set-options, echo = FALSE, cache = FALSE} 12 | options(width = 140) 13 | ``` 14 | 15 | # tSNE! 16 | t-distributed stochastic neighbor embedding (tSNE) is a nonlinear, nonparametric, and unsupervised dimension reduction machine learning algorithm. It is used to find patterns in high-dimensional data. 17 | 18 | Recall that dimension reduction techniques such as [PCA](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/1-sep5-PCA) help us reduce high-dimensional linear data into a reduced feature space, such as 2 or 3 main axes of "distilled" variation that can be efficiently visualized. 19 | 20 | These visualizations often look a little nicer than those for PCA because instead of plotting distances between observations, tSNE plots the _probabilities_! This is based on [Kullback-Leibler divergences](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) (the loss function). It becomes difficult to say what PCA data separation looks like in higher-dimensional space because it can be dubious to extrapolate lower-dimensional representations into higher ones. 21 | 22 | ### Some key hyperparameters include: 23 | * dims - the number of dimensions to be returned. 24 | * [Perplexity](https://en.wikipedia.org/wiki/Perplexity) - essentially the number of nearest neighbors, but in the curved/surface-like [manifold](https://stats.stackexchange.com/questions/289467/what-is-a-manifold) setting instead of straight-line distances. Should be less than the number of observations, but it is not that simple... 25 | * theta - the Barnes-Hut tradeoff, ranging from 0 to 1. This is the speed/accuracy tradeoff: lower values give slower but more accurate optimizations. 0.0 returns the exact tSNE value (defaults to 0.5). 26 | * eta - learning rate. 27 | * check_duplicates - should duplicate observations be removed? 28 | 29 | # Package installation 30 | Run these lines manually if you need to install or update the following packages: 31 | ```{r} 32 | if (FALSE) { 33 | install.packages(c( 34 | # train/test data splitting 35 | "caret", 36 | # Our sole ML algorithm this time around 37 | "randomForest", 38 | # tSNE algorithms 39 | "Rtsne", "tsne" 40 | )) 41 | } 42 | ``` 43 | 44 | Load the required packages: 45 | ```{r} 46 | library(caret) 47 | library(randomForest) 48 | library(Rtsne) 49 | library(tsne) 50 | ``` 51 | 52 | # Load the `iris` dataset 53 | ```{r} 54 | data(iris) 55 | 56 | # Learn about the data 57 | ?iris 58 | 59 | # View its structure 60 | str(iris) 61 | 62 | # How many of each species? 63 | table(iris$Species) 64 | ``` 65 | 66 | # Goals 67 | We will fit one model using the tsne package and one using the Rtsne package. Then, we will use the Rtsne model to add coordinates to our dataset and to train and evaluate a random forest algorithm on these new data.
68 | 69 | # `tsne` package 70 | Here, the help files outline a concise way to fit the tSNE algorithm via a brief plotting function: 71 | ```{r} 72 | # Define colors for plotting 73 | colors = rainbow(length(unique(iris$Species))) 74 | 75 | # Assign one color to each species 76 | names(colors) = unique(iris$Species) 77 | colors 78 | 79 | # Define the function 80 | ecb = function(x,y){ 81 | plot(x,t = 'n') 82 | text(x,labels = iris$Species, col = colors[iris$Species]) 83 | } 84 | 85 | # Fit 86 | set.seed(1) 87 | system.time({ 88 | tsne_iris = tsne::tsne(iris[, -5], epoch_callback = ecb, perplexity = 50) 89 | }) 90 | ``` 91 | 92 | ### `Rtsne` example 93 | Rtsne provides clearer hyperparameters, better help, and more flexibility compared to the tsne model. 94 | ```{r} 95 | # You might want to remove duplicate observations (even if they are stochastic)... (so that you are not computing distances between two identical points?) 96 | 97 | set.seed(1) 98 | Rtsne_iris <- Rtsne::Rtsne(as.matrix(iris[, -5]), 99 | # Return just the first two dimensions 100 | dims = 2, 101 | # Let's set perplexity to 5% of the number of rows 102 | # Try setting it to a larger value as well, like 25% 103 | perplexity = nrow(iris) * 0.05, 104 | # try changing theta to 0.0 to see what happens 105 | theta = 0.5, 106 | # change eta to 0 and see what happens! 107 | eta = 1, 108 | # Tell the algorithm it is okay to have duplicate rows 109 | check_duplicates = F) 110 | # Unpack! 111 | names(Rtsne_iris) 112 | 113 | # Plot first two dimensions 114 | plot(Rtsne_iris$Y[, 1:2],col = iris$Species) 115 | ``` 116 | 117 | # Visual comparison to PCA 118 | ```{r} 119 | pca_iris = princomp(iris[,1:4])$scores[,1:2] 120 | plot(pca_iris, t = 'n') 121 | text(pca_iris, labels = iris$Species, col = colors[iris$Species]) 122 | ``` 123 | 124 | # A machine learning example 125 | Let's recapitulate [Mark Borg's walkthrough here](https://mark-borg.github.io/blog/2016/tsne-ml/). Let's keep working with our `Rtsne_iris` model from above. cbind the tSNE coordinates into our dataset in order to fit a random forest on this new dataset! 126 | ```{r} 127 | # Add tSNE coordinates via cbind 128 | data = cbind(iris, Rtsne_iris$Y) 129 | 130 | # Rename the new columns 131 | colnames(data)[6] = "tSNE_Dim1" 132 | colnames(data)[7] = "tSNE_Dim2" 133 | 134 | # Check out the dataset 135 | head(data) 136 | 137 | # Split the data 138 | set.seed(1) 139 | split = caret::createDataPartition(data$Species, p = 0.75, list = FALSE) 140 | training_set = data[split,] 141 | test_set = data[-split,] 142 | 143 | # Identify species "target" variable and predictors for train and test sets 144 | X_train = training_set[, -5] 145 | Y_train = training_set$Species 146 | 147 | X_test = test_set[, -5] 148 | Y_test = test_set$Species 149 | ``` 150 | 151 | Fit the random forest: 152 | ```{r, echo = T, results = "hide"} 153 | set.seed(1) 154 | RF = randomForest(X_train, Y_train, X_test, Y_test, 155 | ntree = 500, 156 | proximity = T, 157 | importance = T, 158 | keep.forest = T, 159 | do.trace = T) 160 | ``` 161 | ```{r} 162 | predicted = predict(RF, X_test) 163 | table(predicted, Y_test) 164 | mean(predicted == Y_test) 165 | varImpPlot(RF) 166 | ``` 167 | 168 | # Resources 169 | [tSNE FAQ](https://lvdmaaten.github.io/tsne/). Laurens van der Maaten blog. 170 | 171 | Cao, Y and L Wang. 2017. [Automatic selection of t-SNE perplexity.](https://arxiv.org/pdf/1708.03229.pdf) Journal of Machine Learning Research: Workshop and Conference Proceedings 1:1-7. 172 | 173 | Linderman, GC and S. 
Stenerberger. 2017. [Clustering with t-SNE, provably.](https://arxiv.org/pdf/1706.02582.pdf) arXiv:1706.02582 [cs.LG]. 174 | 175 | Pezzotti et al. 2017. [Approximated and user steerable tSNE for progressive visual analytics.](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7473883&tag=1) IEEE Transactions on Visualization and Computer Graphics 23:1739-1752. 176 | 177 | Schubert E. and M. Gertz. 2017. [Intrinsic t-stochastic neighbor embedding for visualization and outlier detection: A remedy against the curse of dimensionality?](https://pdfs.semanticscholar.org/97a0/d8798aec210c68a8532d907e4e7c193754a6.pdf) In: Beecks C., Borutta F., Kröger P., Seidl T. (eds) Similarity Search and Applications (SISAP). Lecture Notes in Computer Science, Springer, 10609:188-203. 178 | 179 | Wattenberg et al. 2016. [How to use t-SNE effectively](https://distill.pub/2016/misread-tsne/) 180 | 181 | colah's blog. 2015. [Visualizing representations: Deep learning and human beings.](https://colah.github.io/posts/2015-01-Visualizing-Representations/) 182 | 183 | Wang W et al. 2015. [On deep multi-view representation learning.](http://proceedings.mlr.press/v37/wangb15.pdf) Journal of Machine Learning Research: Workshop and Conference Proceedings 37. 184 | 185 | van der Maaten, LJP. 2014. [Accelerating t-SNE using Tree-Based Algorithms.](http://jmlr.org/papers/volume15/vandermaaten14a/vandermaaten14a.pdf) Journal of Machine Learning Research, 15:3221-3245. 186 | 187 | Hamel, P and D. Eck. 2010. [Learning features from music audio with deep belief networks.](http://www.mirlab.org/conference_papers/international_conference/ISMIR%202010/ISMIR_2010_papers/ismir2010-58.pdf) 11th International Society for Music Information Retrieval Conference 339-344. 188 | 189 | Jamieson AR et al. 2010. [Exploring nonlinear feature space dimension reduction and data representation in breast CADx with Laplacian eigenmaps and t-SNE.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2807447/) Medical Physics 37:339-351. 190 | 191 | van der Maaten, LJP. 2009. [Learning a Parametric Embedding by Preserving Local Structure.](https://lvdmaaten.github.io/publications/papers/AISTATS_2009.pdf) In Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS), Journal of Machine Learning Research Workshop and Conference Proceedings 5:384-391. 192 | 193 | van der Maaten LJP and GE Hinton. 2008. [Visualizing Data Using t-SNE.](http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf) Journal of Machine Learning Research 9:2579-2605. 194 | 195 | Also check out [umapr](https://ropensci.org/blog/2018/08/01/umapr/) and [uwot](https://github.com/jlmelville/uwot). -------------------------------------------------------------------------------- /Fall2018/6-nov14-umap/umap-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "UMAP: Uniform Manifold Approximation and Projection" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## Background 11 | 12 | * Arxiv paper: (https://arxiv.org/abs/1802.03426) 13 | * PyData 2018 talk (PCA, tSNE, and UMAP): (https://www.youtube.com/watch?v=YPJQydzTLwQ) 14 | * PyCon 2018 talk: (https://www.youtube.com/watch?v=nq6iPZVUxZU) 15 | 16 | ## Data prep 17 | 18 | We're trying out a birth weight dataset. 
19 | 20 | ```{r} 21 | data = MASS::birthwt 22 | summary(data) 23 | ?MASS::birthwt 24 | data$race = factor(data$race, labels = c("white", "black", "other")) 25 | str(data) 26 | 27 | # Create a list to hold different variables. 28 | vars = list( 29 | # Birth weight or low are generally our outcomes for supervised analyses. 30 | outcomes = c("bwt", "low"), 31 | 32 | # Variables we want to exclude from our analysis - none currently. 33 | exclude = NULL 34 | ) 35 | 36 | vars$covariates = setdiff(names(data), vars$outcomes) 37 | 38 | # Review our data structure. 39 | vars 40 | 41 | dplyr::glimpse(data[vars$covariates]) 42 | sapply(data[vars$covariates], class) 43 | ``` 44 | 45 | ```{r} 46 | library(umap) 47 | class(data[vars$covariates]) 48 | # Convert factor to indicators and remove intercept column. 49 | data_mat = model.matrix(~ ., data[vars$covariates])[, -1] 50 | summary(data_mat) 51 | 52 | # Conduct UMAP analysis of our matrix data, setting a random seed. 53 | result = umap(data_mat, random_state = 1) 54 | ``` 55 | 56 | ## Plot UMAP 57 | 58 | ```{r umap_plot} 59 | dim(result) 60 | class(result) 61 | 62 | library(ggplot2) 63 | 64 | # Compile results into a dataframe. 65 | plot_data = data.frame(x = result$layout[, 1], 66 | y = result$layout[, 2], 67 | data[, vars$outcomes]) 68 | 69 | # Create an initial plot object. 70 | p = ggplot(data = plot_data, aes(x = x, y = y, color = low)) + 71 | theme_minimal() 72 | 73 | # Plot binary outcome 74 | p + geom_point() + ggtitle("Low birth weight = 1") 75 | 76 | # Compare to continuous outcome. 77 | p + geom_point(aes(color = bwt)) + ggtitle("Continuous birth weight") 78 | ``` 79 | 80 | ## Hyperparameters 81 | 82 | ```{r} 83 | # Review default settings. 84 | umap.defaults 85 | 86 | config = umap.defaults 87 | 88 | # Set a seed. 89 | config$random_state = 1 90 | config$n_neighbors = 30 91 | 92 | result2 = umap(data_mat, config) 93 | 94 | p + geom_point(aes(x = result2$layout[, 1], 95 | y = result2$layout[, 2])) 96 | 97 | # Try even more neighbors. 98 | config$n_neighbors = 60 99 | 100 | result3 = umap(data_mat, config) 101 | 102 | p + geom_point(aes(x = result3$layout[, 1], 103 | y = result3$layout[, 2])) 104 | ``` 105 | 106 | More info on hyperparameters on the [umap-learn python documentation page](https://umap-learn.readthedocs.io/en/latest/parameters.html). 107 | 108 | ```{r} 109 | ?umap 110 | ``` 111 | 112 | ## Challenge 113 | 114 | * Compare to tSNE using code from our last meeting. 115 | * Other datasets to try: MNIST, iris, your own dataset. 
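For the first challenge bullet, here is one possible starting point: a minimal sketch (not a worked solution) that runs tSNE on the same `data_mat` used for UMAP above and reuses the ggplot2 styling from earlier in this file. It assumes the Rtsne package from our tSNE meeting is installed; the perplexity and theta values below are arbitrary starting points, not recommendations.

```{r tsne_sketch}
# Hedged sketch: tSNE on the same matrix, for a side-by-side comparison with
# the UMAP layouts above. Assumes `data_mat`, `data`, and `vars` are in memory.
library(Rtsne)

set.seed(1)
tsne_result = Rtsne(data_mat, dims = 2,
                    perplexity = 15,   # arbitrary; try several values
                    theta = 0.5,
                    check_duplicates = FALSE)

# Compile the tSNE layout plus the outcome columns into a dataframe.
tsne_plot_data = data.frame(x = tsne_result$Y[, 1],
                            y = tsne_result$Y[, 2],
                            data[, vars$outcomes])

ggplot(tsne_plot_data, aes(x = x, y = y, color = low)) +
  geom_point() +
  theme_minimal() +
  ggtitle("tSNE layout on the same birthwt matrix (compare to UMAP above)")
```

The axes themselves are not comparable across methods; what is worth comparing is whether the low birth weight observations group together similarly in the tSNE and UMAP layouts.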
116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /MachineLearningWG.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /Math4ML_2017/Math4ML notes July 19.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4ML notes July 19.docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4ML notes July 5th .docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4ML notes July 5th .docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4MLJune7.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4MLJune7.docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4MLMay24.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4MLMay24.docx -------------------------------------------------------------------------------- /Math4ML_2017/README.md: -------------------------------------------------------------------------------- 1 | # Math for machine learning 2 | This is the summer 2017 reading group for the D-Lab Machine Learning Working Group. 3 | 4 | ### Dates: 5 | 6 | ### Starter resources: 7 | [Floridi L. 2017. Robots, jobs, taxes, and responsibilities. Philosophy & Technology 30:1-4](https://link.springer.com/article/10.1007/s13347-017-0257-3) 8 | 9 | [IBM Data Science Experience. The mathematics of machine learning](http://datascience.ibm.com/blog/the-mathematics-of-machine-learning/) 10 | 11 | [Jordan MI. 1986/1987. An introduction to linear algebra in parallel distributed processing. Parallel distributed processing 1: 365-422.](https://www.cs.cmu.edu/afs/cs/academic/class/15883-f15/readings/jordan-1986-ch9.pdf) 12 | 13 | [Li H. 2017. Which machine learning algorithm should I use? SAS blog (Subconscious Musings)](http://blogs.sas.com/content/subconsciousmusings/2017/04/12/machine-learning-algorithm-use/) 14 | 15 | [MIT 18.657 Mathematics of Machine Learning - course syllabus and resources](https://ocw.mit.edu/courses/mathematics/18-657-mathematics-of-machine-learning-fall-2015/syllabus/) 16 | 17 | [Rahm E, Do HH. 2000. Data cleaning: Problems and current approaches. IEEE Data Eng. Bull. 23.4:3-13](https://dbs.uni-leipzig.de/en/publication/title/data_cleaning_problems_and_current_approaches) 18 | 19 | [Valiant LG. 1984. A theory of the learnable. 
Communications of the ACM 27:1134-1142](https://people.mpi-inf.mpg.de/~mehlhorn/SeminarEvolvability/ValiantLearnable.pdf) -------------------------------------------------------------------------------- /R and Python installation help.txt: -------------------------------------------------------------------------------- 1 | Before class, please download and install R Studio: 2 | https://www.rstudio.com/products/rstudio/download3/ 3 | 4 | If your installation does not work and says you need to install the binary files, please do so here: 5 | https://cloud.r-project.org/ 6 | 7 | Also download and install Python by following these instructions: 8 | https://github.com/dlab-berkeley/python-intensive/blob/master/Install.md 9 | (you can also just pip install scikit-learn if you have Python but not Anaconda). 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Working Group, Fall 2018 2 | 3 | We meet on alternating Wednesdays from 3-5pm at D-Lab (Barrows 356). We have no expectation of prior machine learning experience, and simply go through one algorithm a meeting, with about 30 minutes each in R & Python. We also incorporate lightning talks and other guest presentations throughout our meetings. 4 | 5 | **Fall 2018 - unsupervised methods** 6 | - Sept. 5: [Principal component analysis (PCA)](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/1-sep5-PCA) 7 | - Sept. 19: [K-means clustering](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/2-sep19-k-means) 8 | - Oct. 3: [Hierarchical clustering](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/3-oct3-hier_agg_clust) 9 | - Oct. 17: [Medoid partitioning](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/4-medoids) 10 | - Oct. 31: tSNE 11 | - Nov. 14: [UMAP](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/6-nov14-umap) 12 | - Dec. 12: Lightning talks 13 | 14 | We are always looking for student/staff/faculty presenters. Please contact us if you are interested! 15 | 16 | More information on the [D-Lab MLWG website](http://dlab.berkeley.edu/working-groups/machine-learning-working-group-0) 17 | 18 | ## Previous Semesters 19 | 20 | * [Spring 2018](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Spring2018) 21 | - k-nearest neighbors 22 | - decision tree 23 | - random forest 24 | - gradient boosting 25 | - elastic net 26 | * [Fall 2017](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2017) 27 | - basics of neural networks for image processing 28 | * [Spring 2017](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Spring2017) 29 | - k-nearest neighbors 30 | - stepwise regression 31 | - linear and polynomial regression, smoothing splines 32 | - multivariate adaptive regression splines and generalized additive models 33 | - support vector machines 34 | - neural networks. 35 | * [Fall 2016](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2016) 36 | - decision trees, random forests, penalized regression, and boosting 37 | 38 | ## Resources 39 | 40 | Books: 41 | 42 | 1. **Intro to Statistical Learning** by James et al. [(free pdf)](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) [(Amazon)](https://smile.amazon.com/Introduction-Statistical-Learning-Applications-Statistics-ebook/dp/B01IBM7790/) 43 | 2. 
**Applied Predictive Modeling** by Max Kuhn [(Amazon)](https://smile.amazon.com/Applied-Predictive-Modeling-Max-Kuhn-ebook/dp/B00K15TZU0/) 44 | 3. **Python Data Science Handbook** by Jake VanderPlas [(online version)](https://jakevdp.github.io/PythonDataScienceHandbook/) 45 | 4. **Elements of Statistical Learning** by Hastie et al. [(free pdf)](http://statweb.stanford.edu/~tibs/ElemStatLearn/download.html) [(Amazon)](https://smile.amazon.com/Elements-Statistical-Learning-Prediction-Statistics-ebook/dp/B00475AS2E/) 46 | 5. **Modern Multivariate Statistical Techniques** by Alan Izenman [(Amazon)](https://smile.amazon.com/Modern-Multivariate-Statistical-Techniques-Classification-ebook/dp/B00HWUR9CS/) 47 | 6. **Differential Equations and Linear Algebra** by Stephen Goode and Scott Annin [(Amazon)](https://www.amazon.com/Differential-Equations-Linear-Algebra-Stephen-ebook/dp/B00HR7MR3W/ref=mt_kindle?_encoding=UTF8&me=) 48 | 7. **Statistical Learning with Sparsity: The Lasso and Generalizations** by Trevor Hastie, Robert Tibshirani, and Martin Wainwright [(free pdf)](https://web.stanford.edu/~hastie/StatLearnSparsity/) [(Amazon)](https://www.amazon.com/Statistical-Learning-Sparsity-Generalizations-Probability/dp/1498712169/ref=sr_1_fkmrnull_1?crid=2OQXF1KIQYDUX&keywords=statistical+learning+with+sparsity+the+lasso+and+generalizations&qid=1552196190&s=gateway&sprefix=Statistical+Learning+with+Sparsity%3A+the+Lasso+and+gener%2Caps%2C178&sr=8-1-fkmrnull) and 49 | 50 | Help: 51 | 52 | * [Getting Help with R](https://www.r-project.org/help.html) 53 | * [Stack Overflow](https://stackoverflow.com/questions/tagged/r) 54 | * [Quick-R](https://www.statmethods.net/) 55 | * [R-Bloggers](https://www.r-bloggers.com/) 56 | 57 | Courses at Berkeley: 58 | 59 | * Stat 154 - Statistical Learning 60 | * CS 189 / CS 289A - Machine Learning 61 | * COMPSCI x460 - [Practical Machine Learning with R](https://extension.berkeley.edu/search/publicCourseSearchDetails.do?method=load&courseId=17483923&selectedProgramAreaId=15499&selectedProgramStreamId=25856348) [UC Berkeley Extension] 62 | * PH 252D - Causal Inference 63 | * PH 295 - Big Data 64 | * PH 295 - Targeted Learning for Biomedical Big Data 65 | 66 | Online classes: 67 | 68 | * [Tibshirani and Hastie's Statistical Learning Free Course](https://lagunita.stanford.edu/courses/HumanitiesSciences/StatLearning/Winter2016/about) 69 | * [Coursera Data Science Specialization](https://www.coursera.org/specializations/jhu-data-science) 70 | * [edX - Principles of Machine Learning](https://www.edx.org/course/principles-machine-learning-microsoft-dat203-2x-2) 71 | * [edX - Applied Machine Learning](https://www.edx.org/course/applied-machine-learning-microsoft-dat203-3x-0) 72 | * [Coursera - Machine Learning](https://www.coursera.org/learn/machine-learning) 73 | 74 | Other Campus Groups: 75 | 76 | * [D-Lab's Cloud Computing Working Group](http://dlab.berkeley.edu/working-groups/cloud-working-group-0) 77 | * [D-Lab's Computational Text Analysis Working Group](http://dlabctawg.github.io/) 78 | * [The Hacker Within](http://www.thehackerwithin.org/berkeley/) / [Berkeley Institute for Data Science](https://bids.berkeley.edu/) 79 | * [Machine Learning @ Berkeley](https://ml.berkeley.edu/) 80 | * [Berkeley Statistics and Machine Learning Discussion Group](https://bids.berkeley.edu/news/bids-launches-new-berkeley-statistics-and-machine-learning-discussion-group) 81 | -------------------------------------------------------------------------------- /Spring2017/Apr14-svm/proj.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Apr14-svm/proj.png -------------------------------------------------------------------------------- /Spring2017/Apr14-svm/r-svm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Support vector machines in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Polynomial kernels 12 | * Radial basis function (RBF) kernel 13 | * Hyperparameter optimization 14 | 15 | Before we dig in, we will install the R packages we'll be using. 16 | 17 | **R packages** 18 | ```{r} 19 | # List of packages we will use. 20 | packages = c("MASS", "kernlab", "devtools") 21 | 22 | # Try to load each package and save the result. 23 | success = sapply(packages, require, character.only = T, quietly = T) 24 | 25 | # Check if any packages still need to be installed. 26 | if (sum(!success) > 0) { 27 | # Install any needed packages. 28 | install.packages(packages[!success]) 29 | 30 | # Load the newly installed packages. 31 | sapply(packages[!success], require, character.only = T, quietly = T) 32 | } 33 | 34 | github_packages = c( 35 | # Chris's tools package for plotting the SuperLearner. 36 | "ck37/ck37r", 37 | # Use more up-to-date SuperLearner from github. 38 | "ecpolley/SuperLearner") 39 | 40 | devtools::install_github(github_packages) 41 | 42 | # Clean up variables. 43 | rm(packages, success, github_packages) 44 | ``` 45 | 46 | # Background 47 | 48 | Deb has done a great job covering the background & theory for SVM in the python materials. See also Intro to Statistical Learning Chapter 9, Applied Predictive Modeling sections 7.3, 13.4, 13.7.4, or pretty much any other machine learning textbook. 49 | 50 | Motivational quote: "SVMs not only have a more solid foundation than artificial neural nets, but are able to serve as a replacement for neural nets." - Learning with Kernels, 2002. 51 | 52 | # Software packages 53 | 54 | There are many R packages that implement support vector machines. `e1071` is one of the oldest, but we will use `kernlab` because it implements more kernels and has a broader set of features. `klaR` and `svmPath` are other options; `svmPath` is particularly designed to be fast. 55 | 56 | # Data preparation 57 | 58 | ```{r} 59 | data(Boston, package = "MASS") 60 | 61 | # Remove our outcome variable from the covariate list. 62 | X_df = Boston[, -14] 63 | 64 | # Convert X from a dataframe to a matrix. 65 | X_mat = model.matrix(~ ., data = X_df) 66 | 67 | # Notice the extra intercept column added by model.matrix. 68 | colnames(X_mat) 69 | 70 | # Remove extra intercept term. 71 | X_mat = X_mat[, -1] 72 | 73 | # Regression (continuous) version of our outcome variable. 74 | Y_reg = Boston$medv 75 | 76 | # Review outcome distribution. 77 | summary(Y_reg) 78 | 79 | # Classification (binary) version of our outcome variable. 80 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 81 | 82 | # Review outcome distribution. 83 | table(Y_class) 84 | prop.table(table(Y_class)) 85 | 86 | ``` 87 | 88 | Also note that the SVM algorithms will internally center (mean 0) and scale all variables so that they can be compared to each other, as we did with K-nearest neighbors (lasso also does this). 
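To see what that internal standardization amounts to, here is a small illustrative sketch (not part of the original walkthrough) using base R's `scale()`; in `kernlab`, `ksvm()` exposes this behavior through its `scaled` argument, so you normally do not need to do it yourself.

```{r}
# Sketch: manual centering/scaling, just to see what the SVM does internally.
X_scaled = scale(X_mat, center = TRUE, scale = TRUE)

# After scaling, each column has mean ~0 and standard deviation 1.
round(colMeans(X_scaled), 3)
round(apply(X_scaled, 2, sd), 3)
```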
89 | 90 | # Polynomial kernels 91 | 92 | Polynomial kernels are the simplest kernel you would want to use with SVM. You can do a linear kernel (i.e. no polynomial expansion) but that's essentially equivalent to OLS and is only done if the data is so big that a better kernel is computationlly infeasible. 93 | 94 | ```{r errors=T} 95 | library(kernlab) 96 | 97 | # Regression version. 98 | # X should be a matrix, not a dataframe. 99 | fit = ksvm(x = X_mat, y = Y_reg, kernel = "polydot", kpar = list(degree = 3)) 100 | fit 101 | 102 | # Compare to using a dataframe rather than a matrix. 103 | # The error message is particularly unhelpful. 104 | fit = ksvm(x = X, y = Y_reg, kernel = "polydot") 105 | 106 | # Classification version. 107 | # Y should be a factor so that classification is run automatically. 108 | fit = ksvm(x = X_mat, y = Y_class, kernel = "polydot", kpar = list(degree = 3)) 109 | fit 110 | 111 | ``` 112 | 113 | # Radial basis function kernels 114 | 115 | RBF is the best kernel if you have to pick one. 116 | 117 | ```{r} 118 | 119 | # Regression version. 120 | fit = ksvm(x = X_mat, y = Y_reg, kernel = "rbfdot") 121 | fit 122 | 123 | # Classification 124 | fit = ksvm(x = X_mat, y = Y_class, kernel = "rbfdot") 125 | fit 126 | 127 | ``` 128 | 129 | # Hyperparameter optimization 130 | 131 | There are two hyperparameters that can potentially have major effects on the performance of SVM: the amount of regularization (called "C" often), and the bandwidth (scale parameter often called gamma or sigma). 132 | 133 | The regularization parameter C is the non-negative error budget for the number of misclassifications allowed, critical to establish bias-variance trade-off. When C is small we want low bias but high variance, and the reverse when C is large. Thorough C grid points: $C \in \{2^{−5},2^{−3},...,2^{15}\}$ 134 | 135 | The scale parameter Gamma ($\gamma$) aka Sigma ($\sigma$) is effectively the inverse bandwidth of the SVM kernel. So a large gamma/sigma corresponds to a wide bandwidth used to calculate proximity, meaning that a wider range of points are incorporated. When gamma/sigma is small only very nearby observations are used. Thorough grid points: $\gamma or \sigma \in \{2^{-15}, 2^{-13}, ..., 2^{3}\}$. Notably, a good initial guess is generated by kernlab's `sigest` function, which may allow one to skip optimizing this hyperparameter. 136 | 137 | ```{r} 138 | 139 | # Make sure you have the latest version from github. 140 | library(SuperLearner) 141 | 142 | tuning_list = list( 143 | # Try different kernels. 144 | # Unfortunately we cannot currently customize the degree for the polynomial kernel when 145 | # using create.Learner() - this will be fixed. We could make the functions manually though. 146 | kernel = c("polydot", "rbfdot", "laplacedot"), 147 | # Regularization parameter, could be 2^-5 to 2^15. 148 | C = 2^c(-4, -2, 0, 2, 4, 6, 8) 149 | ) 150 | 151 | # Review the C values we are testing. 152 | tuning_list$C 153 | 154 | svm_learners = create.Learner("SL.ksvm", detailed_names = T, 155 | tune = tuning_list) 156 | 157 | # Review the learners that were created. 158 | svm_learners$names 159 | 160 | sl_lib = c("SL.mean", "SL.glm", "SL.ksvm", svm_learners$names) 161 | 162 | set.seed(1) 163 | 164 | Y_num = as.numeric(Y_class) - 1 165 | table(Y_num, Y_class) 166 | 167 | # Currently displays some extra output unfortunately. 
168 | result = SuperLearner(Y = Y_num, X = Boston[, -14], family = "binomial", 169 | SL.library = sl_lib) 170 | result 171 | 172 | # Use plot.SuperLearner() from here. 173 | library(ck37r) 174 | 175 | # Plot the results. Use CV.SuperLearner to also plot the SL performance. 176 | # We need to pass in the outcome variable for this plot. 177 | plot(result, Y_num) 178 | ``` 179 | 180 | # Other notes 181 | 182 | SVM has to compute the kernel function for every pair of observations, so it is not ideal when you have many observations. It's best with a reasonable number of observations but possibly a large number of covariates. 183 | 184 | # Further reading on SVMs 185 | 186 | Scholkopf, B., & Smola, A. J. (2002). Learning with kernels: support vector machines, regularization, optimization, and beyond. MIT press. -------------------------------------------------------------------------------- /Spring2017/Apr28-neural-nets/nn-from-scratch-3-layer-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Apr28-neural-nets/nn-from-scratch-3-layer-network.png -------------------------------------------------------------------------------- /Spring2017/Apr28-neural-nets/r-neural-nets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural networks in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Single-layer networks 12 | * Multi-layer networks 13 | * Possibly more 14 | 15 | 16 | Before we dig in, we will install the R packages we'll be using. 17 | 18 | **R packages** 19 | ```{r} 20 | # List of packages we will use. 21 | packages = c("MASS", "nnet", "h2o", "devtools", "NeuralNetTools") 22 | 23 | github_packages = c( 24 | # Chris's tools package for plotting the SuperLearner. 25 | "ck37r" = "ck37/ck37r", 26 | # Use more up-to-date SuperLearner from github. 27 | "SuperLearner" = "ecpolley/SuperLearner") 28 | 29 | devtools::install_github(github_packages) 30 | 31 | # Load those github packages. 32 | ck37r::load_packages(names(github_packages)) 33 | 34 | # Load required non-github packages and install from CRAN if necessary. 35 | ck37r::load_packages(packages, install = T) 36 | 37 | # Also install mxnet for potential usage. 38 | # This unfortunately is Mac/Windows only; probably will not work for Linux. 39 | # Actually not working for Mac either. 40 | if (F) { 41 | # Skip this for now. 42 | install.packages("drat", repos="https://cran.rstudio.com") 43 | drat:::addRepo("dmlc") 44 | install.packages("mxnet") 45 | } 46 | 47 | # Clean up variables. 48 | rm(packages, success, github_packages) 49 | ``` 50 | 51 | # Background 52 | 53 | Please see Deb's python code for more details on neural network theory. 54 | 55 | # Software packages 56 | 57 | We'll be using `nnet` for simple neural networks and `h2o` for deep neural networks. 58 | 59 | # Data preparation 60 | 61 | ```{r} 62 | data(Boston, package = "MASS") 63 | 64 | # Remove our outcome variable from the covariate list. 65 | X_df = Boston[, -14] 66 | 67 | # Convert X from a dataframe to a matrix. 68 | X_mat = model.matrix(~ ., data = X_df) 69 | 70 | # Notice the extra intercept column added by model.matrix. 71 | colnames(X_mat) 72 | 73 | # Remove extra intercept term. 74 | X_mat = X_mat[, -1] 75 | 76 | # Regression (continuous) version of our outcome variable. 
77 | Y_reg = Boston$medv 78 | 79 | # Review outcome distribution. 80 | summary(Y_reg) 81 | 82 | # Classification (binary) version of our outcome variable. 83 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 84 | 85 | # Review outcome distribution. 86 | table(Y_class) 87 | prop.table(table(Y_class)) 88 | 89 | ``` 90 | 91 | # Single-layer neural network 92 | 93 | 94 | Quick classification example 95 | 96 | ```{r} 97 | library(nnet) 98 | 99 | # Classification 100 | 101 | # Set seed because weights are initialized randomly. 102 | set.seed(1) 103 | 104 | # X can be a dataframe or matrix. 105 | # If Y is a factor we need to use this formula notation. 106 | fit = nnet(Y_class ~ X_mat, size = 2, decay = 5e-4, maxit = 200) 107 | 108 | # Review our neural network fit. 109 | fit 110 | 111 | # Plot our neural network. 112 | library(NeuralNetTools) 113 | plotnet(fit) 114 | 115 | # Predict back to our original data. 116 | pred = predict(fit, X_mat) 117 | 118 | # Review predictions. 119 | summary(pred) 120 | 121 | # 122 | ``` 123 | 124 | Quick regression example 125 | 126 | ```{r} 127 | library(nnet) 128 | 129 | # Set seed because weights are initialized randomly. 130 | set.seed(1) 131 | 132 | # Again, X can be a dataframe or matrix. 133 | fit = nnet(Y_reg ~ X_mat, size = 2, decay = 5e-4, maxit = 200, 134 | # Enable linear output to support regression. 135 | linout = T) 136 | 137 | # Challenge: try with linout = F (the default) and see what happens. 138 | 139 | # Review our neural network fit. 140 | fit 141 | 142 | # Visualize neural network. 143 | plotnet(fit) 144 | 145 | # Predict back to our original data. 146 | pred = predict(fit, X_mat) 147 | 148 | # Review predictions. 149 | summary(pred) 150 | 151 | # Calculate mean-squared error (MSE). 152 | mean((pred - Y_reg)^2) 153 | 154 | # And root mean squared error (RMSE). 155 | sqrt(mean((pred - Y_reg)^2)) 156 | 157 | ``` 158 | 159 | # SuperLearner optimization 160 | 161 | These challenges can be done in pairs/groups to make it easier. 162 | 163 | Challenge 1: use SL.nnet wrapper to estimate performance of the neural network. 164 | 165 | Challenge 2: use create.Learner() to test 2, 3, 4, or 5 hidden units and create a weighted average ensemble. 166 | 167 | # Multi-layer neural network 168 | 169 | Challenge: use h2o to design this. 170 | 171 | ```{r} 172 | library(h2o) 173 | # Startup and connect to our existing h2o cluster. 174 | # Use all available threads. 175 | h2o.init(nthreads = -1) 176 | 177 | # Clean slate - just in case the cluster was already running. 178 | h2o.removeAll() 179 | 180 | # Load x data into h2o. 181 | data = as.h2o(cbind(X_df, `_outcome` = Y_reg)) 182 | dim(data) 183 | 184 | outcome = "_outcome" 185 | x = colnames(X_df) 186 | 187 | # Fit the deep learning model here. 188 | 189 | # Shutdown server when we're done. 190 | h2o.shutdown(prompt = F) 191 | ``` 192 | 193 | See also Erin LeDell's [excellent tutorial on deep learning](https://github.com/ledell/useR-machine-learning-tutorial/blob/master/deep-neural-networks.Rmd). 
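The multi-layer challenge above leaves the fitting step blank. Below is a hedged sketch of one way to fill in the `# Fit the deep learning model here.` line; it assumes the h2o cluster started in that chunk is still running (i.e., it belongs before `h2o.shutdown()`), and the hidden-layer sizes, epoch count, and seed are arbitrary illustrative choices rather than tuned values.

```{r eval=FALSE}
# Sketch: a small two-hidden-layer network on the Boston data loaded into h2o.
dl_fit = h2o.deeplearning(x = x, y = outcome,
                          training_frame = data,
                          hidden = c(32, 32), # two hidden layers of 32 units
                          epochs = 20,
                          seed = 1)

# Review training performance (MSE/RMSE, since the outcome is continuous).
h2o.performance(dl_fit)
```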
194 | -------------------------------------------------------------------------------- /Spring2017/Feb17-stepwise/r-stepwise-selection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '' 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | # Stepwise selection 9 | 10 | Topics to cover: 11 | 12 | * Best subset selection 13 | * Forward selection 14 | * Backward selection 15 | * Cross-validation 16 | 17 | Before we dig in, we will install the R packages we'll be using. 18 | 19 | **R packages** 20 | ```{r} 21 | # List of packages we will use. 22 | packages = c("MASS", "mlbench", "SuperLearner", "devtools") 23 | 24 | # Try to load each package and save the result. 25 | success = sapply(packages, require, character.only = T, quietly = T) 26 | 27 | # Check if any packages still need to be installed. 28 | if (sum(!success) > 0) { 29 | # Install any needed packages. 30 | install.packages(packages[!success]) 31 | 32 | # Load the newly installed packages. 33 | sapply(packages[!success], require, character.only = T, quietly = T) 34 | } 35 | 36 | # Install Chris K.'s tools package, which we'll use for imputing missing values. 37 | devtools::install_github("ck37/ck37r") 38 | 39 | # Clean up variables. 40 | rm(packages, success) 41 | ``` 42 | 43 | ## Background 44 | 45 | Stepwise selection, or [stepwise regression](https://en.wikipedia.org/wiki/Stepwise_regression), is a commonly used technique to include a subset of covariates in a regression model. The goal is to increase accuracy compared to including all covariates in the model, because we can often improve model performance by removing some covariates (as we did with lasso / elastic net). Stepwise is a simple form of **feature selection** - choosing a subset of variables for incorporation into a machine learning algorithm. 46 | 47 | ### Best subset selection 48 | 49 | Ideally we would test every possible combination of covariates and use the combination with the best performance. This is **best subset selection**. 50 | 51 | Consider the case of three covariates: X1, X2, and X3. We would estimate the accuracy of the following models: 52 | 53 | * All variables: X1, X2, X3 - our default regression 54 | * X1 and X2 (exclude X3) 55 | * X2 and X3 (exclude X1) 56 | * X1 and X3 (exclude X2) 57 | * X1-only 58 | * X2-only 59 | * X3-only 60 | * No variables (intercept only) 61 | 62 | The one with the best performance (e.g. cross-validated mean-squared error) is the one we would use. Stepwise algorithms are commonly used without cross-validation, and as a result they are usually overfitting the data - capturing random error in addition to true relationships in the data, resulting in worse performance on new data. 63 | 64 | To generalize to any model size, if we have p covariates we would have to check $2^p$ different combinations: each covariate is either included or not (2 possibilities), so combining that for all covariates we have the product of p twos: $2 * 2 * 2...$ which simplifies to $2^p$. With 10 covariates that is 1,024 models to check, with 20 covariates it's a million, etc. 65 | 66 | ### Stepwise selection 67 | 68 | Stepwise selection is a simplification of best subset selection to make it computationally feasible for any number of covariates. It comes in three forms: forward, backward, and combined forward & backward. Confusingly, sometimes "stepwise" is meant to refer specifically to the "both" approach. 
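As a small illustration of the $2^p$ best-subset count described above (not part of the original materials), the sketch below enumerates the include/exclude choices for three hypothetical covariates; the names `X1`, `X2`, `X3` are placeholders, not columns of the dataset used later.

```{r}
# Sketch: each of 3 covariates is either included (TRUE) or excluded (FALSE).
combos = expand.grid(X1 = c(FALSE, TRUE),
                     X2 = c(FALSE, TRUE),
                     X3 = c(FALSE, TRUE))
combos

# 2^3 = 8 candidate models, including the intercept-only (all FALSE) row.
nrow(combos)
```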
69 | 70 | **Forward selection** starts with just the intercept and considers which single variable to incorporate next. It loops over every variable, runs a regression with that variable plus the intercept, and chooses the variable with the best performance on a certain metric: adjusted $R^2$, [f-statistic](https://en.wikipedia.org/wiki/F-test#Regression_problems), [Aikake Information Criterion](https://en.wikipedia.org/wiki/Akaike_information_criterion), or other preferred performance estimate. It then adds that variable to the model and considers the next variable to add, continuing to repeat until no remaining variable improves performance. 71 | 72 | ## Clean dataset 73 | 74 | Let's try out some code. First we prep a demo dataset. 75 | ```{r} 76 | # Load a test dataset. 77 | data(PimaIndiansDiabetes2, package = "mlbench") 78 | 79 | data = PimaIndiansDiabetes2 80 | 81 | # Review data structure. 82 | str(data) 83 | 84 | # Do we have missing values? Yes. 85 | sum(is.na(data)) 86 | 87 | library(ck37r) 88 | 89 | outcome = "diabetes" 90 | 91 | # Impute missing data and add missingness indicators. 92 | # Don't impute the outcome though. 93 | result = ck37r::impute_missing_values(data, skip_vars = outcome) 94 | # Use the imputed dataframe. 95 | data = result$data 96 | 97 | str(data) 98 | 99 | # Now do we have missing values? 100 | sum(is.na(data)) 101 | 102 | # Create a vector just for the outcome variable. 103 | # Convert to numeric for glm(). 104 | Y = as.numeric(data[, outcome] == "pos") 105 | 106 | # Confirm our outcome vector is correct. 107 | table(data[, outcome], Y) 108 | 109 | # Remove the outcome variable from our covariate list. 110 | X = data[, !names(data) == outcome] 111 | 112 | # Confirm our covariates and dimensions are right. 113 | colnames(X) 114 | dim(X) 115 | length(Y) 116 | ``` 117 | 118 | ## Stepwise selection code 119 | 120 | Now let's look at stepwise selection. 121 | 122 | ```{r} 123 | # Fit the intercept-only model. Specify data because we will use later. 124 | initial_reg = glm(Y ~ 1, data = X, family = "binomial") 125 | summary(initial_reg) 126 | 127 | # Define the largest possible model specification. 128 | largest_model = glm(Y ~ ., data = X, family = "binomial") 129 | summary(largest_model) 130 | 131 | # Review step() 132 | ?step 133 | 134 | # Run stepwise forward selection. 135 | step_reg = step(initial_reg, formula(largest_model), 136 | direction = "forward", trace = 0) 137 | step_reg 138 | ``` 139 | 140 | **Backward selection** does the same thing but it starts with all variables in the model and considers which variable to first remove from the model. It checks the performance for each variable when it is removed and removes the variable that is least useful to the regression performance. It continues this until no variable yields an increase in performance upon removal. 141 | 142 | **Challenges** 143 | 144 | 1. How similar are stepwise results compared to the significant covariates from the standard OLS we ran first? Hint: compare `step_reg` with `summary(largest_model)`. 145 | 2. Try running with `trace = 1` to see more details in the stepwise process. 146 | 3. Try running with `direction = "backward"` and then `direction = "both"` - do you get the same variables selected? Hint: with backward you will need to change the first argument to use the full model rather than the intercept-only model. 
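For challenge 3 above, here is a hedged sketch of one possible starting point, reusing the `initial_reg`, `largest_model`, and `step_reg` objects defined earlier (try the challenge on your own first):

```{r}
# Sketch: backward selection starts from the full model and drops variables.
step_back = step(largest_model, direction = "backward", trace = 0)

# "both" can add or drop a variable at each step.
step_both = step(initial_reg, formula(largest_model),
                 direction = "both", trace = 0)

# Compare the variables selected by each approach.
formula(step_reg)
formula(step_back)
formula(step_both)
```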
147 | 148 | ## Cross-validated comparison 149 | 150 | As mentioned earlier, it is critical that we use cross-validation to estimate the accuracy of the stepwise procedure. If we don't we will definitely get an overly optimistic estimate of model performance. 151 | 152 | ```{r} 153 | 154 | sl_lib = c("SL.mean", "SL.glm", "SL.glmnet", "SL.step.forward", "SL.stepAIC") 155 | 156 | set.seed(1) 157 | sl = SuperLearner(Y, X, family = binomial(), SL.library = sl_lib) 158 | sl 159 | ``` 160 | 161 | **Challenges** 162 | 163 | 1. Add in one or two other algorithms we've used. How do they compare to stepwise? 164 | 2. Look at the code for `SL.step.forward` and `SL.stepAIC` - any questions on how they work? 165 | 166 | ## Further reading 167 | 168 | * Intro to Statistical Learning, section 6.1.2 169 | * Applied Predictive Modeling, chapter 19 "Feature Selection". 170 | * ["What are some of the problems with stepwise regression?"](http://www.stata.com/support/faqs/statistics/stepwise-regression-problems/) [CK: note that they are assuming no cross-validation.] 171 | * [Regression Modeling Strategies](https://smile.amazon.com/Regression-Modeling-Strategies-Applications-Statistics-ebook/dp/B0140XQAXI), section 4.3. 172 | * [Statistical Learning from a Regression Perspective](https://smile.amazon.com/Statistical-Learning-Regression-Perspective-Statistics-ebook/dp/B01M333153) section 1.4.6. -------------------------------------------------------------------------------- /Spring2017/Feb3-knn/Feb3kNN-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "k-Nearest Neighbor classification and regression" 3 | author: "Evan Muzzall" 4 | date: "2/3/2017" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE, fig.width=9, fig.height=6, width=160) 13 | ``` 14 | 15 | #0. Package installation 16 | Today we will use the following packages. Although we won't use it today, we recommend installing "SuperLearner" as well. 17 | ```{r, eval=FALSE} 18 | install.packages(c("caret", "chemometrics", "class", "FNN", "gmodels", "ggplot2", "MASS", "SuperLearner"), dependencies=TRUE) 19 | library(caret) 20 | library(chemometrics) 21 | library(class) 22 | library(FNN) 23 | library(gmodels) 24 | library(ggplot2) 25 | library(MASS) 26 | library(SuperLearner) 27 | ``` 28 | 29 | #1. What is k-Nearest Neighbors? (kNN) 30 | kNN is a form of "lazy" learning in which data are categorized based on similarities with their "nearest" neighbors. kNN can be thought of as non-parametric instance-based learning. Compared to other algorithms, KNN is simple and makes no assumptions about the underlying data structure. 31 | 32 | The data are treated as coordinates in a multidimensional feature space to organize the desired groups that are identified. kNN is distance-based and distills variation contained within multiple variables into a reduced number of principal axes. 33 | 34 | Euclidean (straight-line Cartesian) distance is one standard for KNN and the distance we will use today. However, many consider Mahalanobis distance a more appropriate multivariate distance for kNN and other statistical tests. See for example: 35 | 36 | [Weinberger et al. 2009. Distance Metric Learning for Large Margin 37 | Nearest Neighbor Classification. Journal of Machine Learning Reseach 10: 207-244](http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf) 38 | 39 | #2. 
The data 40 | Load the "Boston" housing dataset from the "MASS" R package and check it out: 41 | ```{r} 42 | library(MASS) 43 | data(Boston) 44 | ``` 45 | ```{r, eval=FALSE} 46 | head(Boston) 47 | ?Boston 48 | str(Boston) 49 | ``` 50 | 51 | Today we will walk through classification and then regression using kNN. For classification, we want the response variable to be a factor. For regression, we want the response variable to remain numeric. Thus, we will first make a copy of the "Boston" dataset for the regression, before we arbitrarily convert it to a factor for classification: 52 | ```{r} 53 | B_reg <- Boston 54 | ``` 55 | 56 | Time to get creative! Let's do a little data wrangling and coerce the "dis" variable (weighted mean of distances to five Boston employment centers) into a factor category. The distances will now be categorized as "short", "medium", or "long". 57 | ```{r} 58 | summary(Boston$dis) 59 | Boston$dis <- cut(Boston$dis, 60 | breaks=c(0, 3, 6, 13), 61 | levels=c(1,2,3), 62 | labels=c("short", "medium", "long")) 63 | table(Boston$dis) 64 | ``` 65 | ```{r, eval=FALSE} 66 | str(Boston) 67 | head(Boston, 10) 68 | levels(Boston$dis) 69 | ``` 70 | 71 | #3. Choosing a proper k 72 | The "k" in kNN represents the number of other "neighboring" data points used to classify the point in question. Consider the bias-variance tradeoff when choosing a proper "k". 73 | 74 | [Click here for Jason Brownlee's excellent introduction to the bias-variance tradeoff](http://machinelearningmastery.com/gentle-introduction-to-the-bias-variance-trade-off-in-machine-learning/) 75 | 76 | For example, **if we choose a large "k",** it is easy for the majority class to win because it will always get the most votes and the "nearest neighbors" would not exert their proper influence. Or, **if we choose a tiny "k",** noise and outliers could unduly influence the classification of the point being classified, again disregarding the influence of the other "nearest neighbors". 77 | 78 | For our example, we will set "k" to the square root of the number of training observations (see below). However, this might not result in the best "k". Thus, we will perform cross-validation on 1:50 "k's" to see how misclassification error varies across the different k-values. 79 | #4. Split the data 80 | Now, use caret's handy `createDataPartition` funciton to conduct a stratified random split and divide the Boston data into train and test sets. We choose to put 70% of the data into the training set, and the remaining 30% into the test set. Also create label vectors to be used as identifiers in the classification process: 81 | ```{r} 82 | library(caret) 83 | set.seed(1) 84 | split <- createDataPartition(Boston$dis, p=0.70, list=FALSE) 85 | train <- Boston[split, ] 86 | test <- Boston[-split, ] 87 | 88 | train_labels <- train[,8] 89 | test_labels <- test[,8] 90 | ``` 91 | 92 | #5. Train the model 93 | Time to classify! Fit the model to the training data using the `knn` function from the "class" package. This outputs a vector of the predicted classifications. However, let's first choose a "k" using the square root method: 94 | ```{r, eval=FALSE} 95 | ?knn # (click the option from the "class" library) 96 | ``` 97 | ```{r} 98 | round(sqrt(nrow(train)),2) # 18.84 99 | 100 | library(class) 101 | set.seed(1) 102 | Boston_p <- knn(train=train[,-8], test=test[,-8], cl=train_labels, k=19, prob=TRUE) 103 | ``` 104 | 105 | ###5.1. Evaluate its performance 106 | How did it do? 
Check out its performance on the test set using the `CrossTable` function from the "gmodels" package: 107 | ```{r} 108 | library(gmodels) 109 | CrossTable(x=test_labels, y=Boston_p, 110 | prop.chisq=FALSE, 111 | prop.r=FALSE, 112 | prop.c=FALSE, 113 | prop.t=FALSE) 114 | ``` 115 | How did it do? 116 | 117 | > NOTE: remember that the breaks specified in the cut function above were arbitrary for this toy example. You will probably want to make more informed decisions for your thesis, dissertation, and other professional work! 118 | 119 | ###5.2. Improve model performance 120 | #####1. Normalize the data 121 | We don't want larger values to indiscriminately affect results. Let's standardize the data to a normal range so that their contributions to the decision-making process become roughly equal. We can do this with `scale`: 122 | 123 | Let's name this scaled dataframe "B": 124 | ```{r} 125 | B <- Boston 126 | B[,-8] <- scale(Boston[,-8], center=TRUE, scale=TRUE) 127 | ``` 128 | ```{r, eval=FALSE} 129 | head(B, 10) 130 | ``` 131 | 132 | Re-split the data using this transformed "B" dataframe: 133 | ```{r} 134 | set.seed(1) 135 | split_scale <- createDataPartition(B$dis, p=0.70, list=FALSE) 136 | train_scale <- B[split_scale, ] 137 | test_scale <- B[-split_scale, ] 138 | 139 | train_labels_scale <- train_scale[,8] 140 | test_labels_scale <- test_scale[,8] 141 | ``` 142 | 143 | Fit the model again: 144 | ```{r} 145 | set.seed(1) 146 | B_p <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=19, prob=TRUE) 147 | 148 | CrossTable(x=test_labels_scale, y=B_p, 149 | prop.chisq=FALSE, 150 | prop.r=FALSE, 151 | prop.c=FALSE, 152 | prop.t=FALSE) 153 | ``` 154 | How did it do? 155 | 156 | #####5.3. Change "k" 157 | We can also change "k" to evaluate the performance of several models. Ideally, you would use the ["SuperLearner" R package](https://cran.r-project.org/web/packages/SuperLearner/index.html to examine a handful of kNN algorithms with different k-values simultaneously against other algorithms. Below we will examine a range of cross-validated k-values. For now, let's just try a few extremes: 158 | ```{r} 159 | B_p_k1 <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=1, prob=TRUE) # k=1 160 | B_p_k50 <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=50, prob=TRUE) # k=50 161 | 162 | CrossTable(x=test_labels_scale, y=B_p_k1, # k=1 163 | prop.chisq=FALSE, 164 | prop.r=FALSE, 165 | prop.c=FALSE, 166 | prop.t=FALSE) 167 | 168 | CrossTable(x=test_labels_scale, y=B_p_k50, # k=50 169 | prop.chisq=FALSE, 170 | prop.r=FALSE, 171 | prop.c=FALSE, 172 | prop.t=FALSE) 173 | 174 | ``` 175 | What happened? 176 | 177 | #6. Another method with a CV error plotting function 178 | Tidy the transformed "B" data a little: 179 | ```{r} 180 | grp <- B$dis 181 | X <- scale(B[-8], center=TRUE, scale=TRUE) 182 | k <- length(unique(grp)) 183 | dat <- data.frame(grp, X) 184 | n <- nrow(X) 185 | n_train <- round(n*2/3) 186 | 187 | set.seed(123) 188 | train_plot <- sample(1:n,n_train) 189 | ``` 190 | ###6.1 Plot the cross-validated errors 191 | ```{r} 192 | library(chemometrics) 193 | #pdf("kNN classification.pdf", 9, 6) 194 | knn_k <- knnEval(X, grp, train_plot, 195 | knnvec=seq(1,50, by=1), 196 | legpo="bottomright", las=2) 197 | title("kNN classification") 198 | #dev.off() 199 | ``` 200 | 201 | #7. 
Regression example 202 | For the regression example, let's return to the B_reg copy of the Boston dataset that we made at the beginning because it preserved the numeric class of the "dis" variable. 203 | 204 | First, scale the data: 205 | ```{r} 206 | B_reg <- as.data.frame(scale(B_reg, center=TRUE, scale=TRUE)) 207 | ``` 208 | 209 | Second, split the data again: 210 | ```{r} 211 | library(caret) 212 | set.seed(1) 213 | split_reg <- createDataPartition(B_reg$dis, p=0.70, list=FALSE) # split 214 | train_reg <- B_reg[split,] 215 | test_reg <- B_reg[-split,] 216 | ``` 217 | 218 | Third, fit the model and plot it! 219 | ```{r} 220 | library(FNN) 221 | set.seed(1) 222 | knn_reg <- knn.reg(train_reg[,-8], test=NULL, y=train_reg[,8], k=3) 223 | plot(train_reg[,8], knn_reg$pred, xlab="y", ylab=expression(hat(y))) 224 | ``` 225 | 226 | Or, use ggplot2 :) 227 | ```{r} 228 | library(ggplot2) 229 | gg_df <- data.frame(train_reg[,8], knn_reg$pred) 230 | colnames(gg_df) <- c("distance", "predictions") 231 | str(gg_df) 232 | 233 | gg <- ggplot(gg_df, aes(distance, predictions, color=distance)) + 234 | geom_point() + 235 | theme_bw() + 236 | xlab("y") + 237 | ylab(expression(hat("y"))) + 238 | ggtitle("kNN regression") + 239 | scale_color_continuous(low="yellow", high="red") 240 | 241 | c <- coef(lm(predictions ~ distance, data=gg_df)) # compute intercept and slope to plot ab line 242 | c 243 | class(c) 244 | 245 | #pdf("kNN regression.pdf", 9, 6) 246 | gg + geom_abline(intercept=c[1], slope=c[2], col="green3") 247 | #dev.off() 248 | ``` 249 | 250 | See the below links for information on plotting decision boundaries: 251 | [Stack Overflow - Variation on “How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?”](http://stackoverflow.com/questions/31234621/variation-on-how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-f) 252 | [Stack Overflow - How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?](http://stats.stackexchange.com/questions/21572/how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-from-elements-o/21602#21602) 253 | 254 | Materials compiled from: 255 | [-Lantz, Brett. 2013. Machine Learning with R. Birmingham, UK: Packt Publishing, Ltd.](https://www.amazon.com/Machine-Learning-Second-Brett-Lantz/dp/1784393908) 256 | [-James G, Witten D, Hastie T, Tibshirani R. 2015. An Introduction to Statistical Learning - with applications in R, 6th ed. Springer: New York](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) 257 | [-knnEval help page](https://artax.karlin.mff.cuni.cz/r-help/library/chemometrics/html/knnEval.html) -------------------------------------------------------------------------------- /Spring2017/Mar17-gam and mars/Mar3-gamearth-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Generalized additive models (GAMs) and Multivariate adaptive regression splines (MARS/EARTH) - rough draft" 3 | author: "Evan Muzzall" 4 | date: "3/17/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ##1. 
Package installation 13 | We will use the following packages for this example: 14 | ```{r} 15 | if (F) { 16 | install.packages(c("akima", "caret", "devtools", "earth", "gam", "ggplot2", "mgcv", "mlbench", "plotmo")) # run lines 16 and 17 manually if needed 17 | devtools::install_github("ck37/ck37r") 18 | } 19 | 20 | library(akima) 21 | library(caret) 22 | library(ck37r) 23 | library(devtools) 24 | library(gam) 25 | library(ggplot2) 26 | library(mgcv) 27 | library(mlbench) 28 | library(plotmo) 29 | library(earth) 30 | ``` 31 | 32 | ##2. Goals 33 | Use the "PimaIndiansDiabetes2" dataset to construct a generalized additive model (GAM) and multivariate additive regression model (MARS, aka EARTH). blood pressure will be the response variable. Missing data will be median-imputed and indicators will be created to document their missingness. 34 | 35 | ##3. Preprocess the data 36 | ```{r} 37 | # load the dataset 38 | data(PimaIndiansDiabetes2) 39 | ?PimaIndiansDiabetes2 40 | data <- PimaIndiansDiabetes2 # give the data a simpler name 41 | str(data) 42 | ``` 43 | 44 | Check for missing data: 45 | ```{r} 46 | # check for missing cases 47 | sum(is.na(data)) 48 | 49 | # how much of the data is missing? 50 | sum(is.na(data)) / (nrow(data)*ncol(data)) # about 9% 51 | ``` 52 | 53 | Recode the "diabetes" vector to numeric type: 54 | ```{r} 55 | data$diabetes <- ifelse(data$diabetes=="pos", 1, 0) 56 | ``` 57 | 58 | Use Chris K's handy median impute function to impute missing values: 59 | ```{r} 60 | # impute and add missingness indicators 61 | result = ck37r::impute_missing_values(data) 62 | 63 | # overwrite "data" with new imputed data frame 64 | data <- result$data 65 | ``` 66 | 67 | Double check that missing values have been imputed: 68 | ```{r} 69 | # no more NA values 70 | sum(is.na(data)) 71 | 72 | # check that missingness indicators have been added 73 | str(data) 74 | ``` 75 | 76 | ##4. Generalized additive models (GAMs) 77 | This semester, MLWG has explored linear, polynomial, and spline regression models using single predictors (March 3) as well as stepwise selection using multiple predictors (Feb 17). Deb also offered an informative take on splines earlier today (Mar 17). Last semester, we talked about improving linear regression models via penalized regression (LASSO and ridge) using multiple predictors (Nov 4). 78 | 79 | When considering multilple predictor variables, another extension of multiple linear regression can be used - generalized additive models. 80 | 81 | Generalized additive models (GAMs) are another extension of multiple linear regression. They are not bound by linear relationships between predictor and response variable and can instead incorporate smoothed, nonlinear relationships. Each relationships is computed and summed (thus making it "additive"). Smoothed splines are not the only constructs used to build GAMs, as they can be built using natural splines, local regression, polynomial regression, etc. 82 | 83 | "Backfitting", or updating the model as each predictor is approximated using penalized likelihood maximization, comprises the smoothed spline. 84 | 85 | See [Wood's book](https://www.crcpress.com/Generalized-Additive-Models-An-Introduction-with-R/Wood/p/book/9781584884743) for thorough walkthroughs of GAMs in R. 86 | 87 | As always, we also encourage [Introduction to Statistical Learning - Chapter 7](http://www-bcf.usc.edu/~gareth/ISL/) for a nice introductory overview and exercises. 
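To make the "additive" structure concrete, the model fit just below (`pressure ~ s(glucose) + s(insulin) + s(age) + diabetes`) corresponds, as a sketch with Gaussian errors, to

$$\text{pressure} = \beta_0 + f_1(\text{glucose}) + f_2(\text{insulin}) + f_3(\text{age}) + \beta_1 \cdot \text{diabetes} + \varepsilon,$$

where each $f_j$ is a smooth function estimated from the data (here via penalized smoothing splines through `s()`) rather than a single linear slope, and the smooth contributions are summed.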
88 | See [Faraway 2002](https://cran.r-project.org/doc/contrib/Faraway-PRA.pdf) for a great intro to regression and ANOVA 89 | 90 | Fit the GAM: 91 | ```{r} 92 | gam1 <- gam(pressure ~ s(glucose) + s(insulin) + s(age) + diabetes, 93 | family="gaussian", 94 | method="GCV.Cp", 95 | data=data) 96 | 97 | gam1 98 | # view summary output 99 | gam.check(gam1) 100 | 101 | names(gam1) 102 | gam1$aic 103 | gam1$sig2 104 | ``` 105 | 106 | Play with some basic plotting features 107 | ```{r} 108 | plot(gam1, se=T, 109 | shade=T, col="black", shade.col="gray80", 110 | residuals=F, 111 | pages=1) 112 | title("gam1") 113 | ``` 114 | 115 | ##5. Compare the GAM to other similar GAMs! 116 | Our plots suggest that "glucose" is fairly linear. What if we compare `gam1` to two other GAMs - one that _excludes_ the predictor glucose, and another that _assumes a linear relationship_ of glucose? 117 | ```{r} 118 | # model that excludes glucose 119 | gam2 <- gam(pressure ~ s(insulin) + s(age) + diabetes, 120 | family="gaussian", 121 | method="GCV.Cp", 122 | data=data) 123 | 124 | plot(gam2, pages=1) 125 | 126 | # model that assumes linear glucose 127 | gam3 <- gam(pressure ~ glucose + s(insulin) + s(age) + diabetes, 128 | family="gaussian", 129 | method="GCV.Cp", 130 | data=data) 131 | 132 | plot(gam3, pages=1) 133 | 134 | anova(gam1, gam2, gam3, test="F") # small p-value suggests that a non-linear function for glucose is preferable? 135 | 136 | AIC(gam1, gam2, gam3) # is this a multiple comparison problem? 137 | BIC(gam1, gam2, gam3) 138 | ``` 139 | 140 | What if we want to identify unhelpful predictors and remove them for better results? 141 | ```{r} 142 | table(data$diabetes, I(data$pregnant>14)) 143 | 144 | gam4 <- gam(pressure ~ s(glucose) + s(insulin) + s(age) + diabetes, 145 | family="gaussian", 146 | data=data, 147 | subset=(diabetes !=0)) 148 | 149 | plot(gam4, se=TRUE, seWithMean=TRUE, 150 | shade=TRUE, col="blue", shade.col="lightgreen", 151 | residuals=FALSE, 152 | pages=1) 153 | title("GAM - adjusted predictors") 154 | 155 | AIC(gam1, gam2, gam3, gam4) 156 | ``` 157 | 158 | ##6. plotmo 159 | The "plotmo" R package offers a great way to visualize regression splines in three dimensions: 160 | ```{r} 161 | plotmo(gam1, all2=TRUE) # show simplfied seWithMean plots AND three dimensional splines for all variable relationships 162 | 163 | # non-additive shapes have correlated effects in 3D plane surfaces. 164 | 165 | # plot partial dependencies (takes a few minutes) 166 | # plotmo(gam1, all2=TRUE, pmethod = "partdep") 167 | 168 | # faster version of pmethod="partdep" 169 | plotmo(gam1, all2=TRUE, pmethod = "apartdep", 170 | caption = "What have I gotten myself in to...") 171 | 172 | # let's play around with a few more parameters! 173 | plotmo(gam1, all2=TRUE, pt.col = "green3") 174 | plotmo(gam1, all2=TRUE, pt.col = "green3", smooth.col = "red") 175 | plotmo(gam1, all2=TRUE, 176 | pt.col = "green3", 177 | smooth.col = "red", 178 | grid.col="gray80") 179 | 180 | # return just some of the plots! 181 | plotmo(gam1, all2=TRUE, degree1 = c(1,2), degree2=0, col="tomato") # show just the first two predictor plots 182 | 183 | plotmo(gam1, all2=TRUE, degree1 = 0, degree2 = 1, # return just glucose v. pregnant perspective plot 184 | caption = "this is called a 'perspective plot'", 185 | persp.col="orange") 186 | ``` 187 | 188 | See [Wood S. 2006. Generalized additive models: An introduction with R](https://www.amazon.com/Generalized-Additive-Models-Introduction-Statistical/dp/1584884746) for expert explanations. 
189 | 190 | ["gam" R package](https://cran.r-project.org/web/packages/gam/index.html) 191 | 192 | ["mgcv" R package](https://cran.r-project.org/web/packages/mgcv/mgcv.pdf) 193 | 194 | Also check out [Stephen Milborrow's excellent instructions on the "plotmo" R package](http://www.milbo.org/doc/plotmo-notes.pdf) 195 | 196 | ##7. Multivariate adaptive regression splines (MARS) and (earth) 197 | Multivariate adaptive regression splines (MARS) are a technique developed by Jerome H. Friedman in 1991 and copyrighted by Salford Systems. Open source implementations are thusly referred to as "earth", but may not be identical to MARS. Also see the ["mda" R package](https://cran.r-project.org/web/packages/mda/index.html) and Friedman papers for specifics. 198 | 199 | earth = Enhanced Adaptive Regression Through Hinges 200 | 201 | These approaches use "surrogate features" (or, models of the models), usually versions of one or two predictors at a time. Each predictor is divided into two groups and each group models the outcome variable for each group. This creates a "piecewise linear model" where each new feature is some proportion of the data. 202 | 203 | Group definitions are provided via linear regression models! Those with the smallest error are used. See [Kuhn and Johnson, 2016:145 ](http://appliedpredictivemodeling.com/) for more information. 204 | 205 | Fit the earth model 206 | ```{r} 207 | # fit the model 208 | set.seed(1) 209 | earth1 <- earth(pressure ~ ., data=data, 210 | degree=1, nk=5, 211 | keepxy=TRUE, nprune=20, nfold=10, ncross=2, 212 | pmethod="cv", trace=4) 213 | 214 | # view summary output 215 | summary(earth1, details=TRUE) 216 | 217 | # view predictor importance 218 | evimp(earth1) 219 | 220 | # compute predicted values 221 | earth_pred <- predict(earth1) 222 | 223 | # print accuracy 224 | (mse <- mean((data$pressure - earth_pred)^2)) 225 | ``` 226 | 227 | Earth plots 228 | ```{r} 229 | # plot 230 | # png("earth1.png") 231 | plot(earth1) 232 | # dev.off() 233 | plot(earth1, info=T, type="response", trace=1) 234 | plotmo(earth1, info=T, type="response", trace=1)#, level=.9) 235 | 236 | # 3d MARS plots! 237 | # same syntactical rules apply here as well 238 | plotmo(earth1) 239 | 240 | plotmo(earth1, all2=TRUE, persp.col="azure") 241 | ``` 242 | 243 | We can also see the ideal number of terms 244 | ```{r} 245 | control <- trainControl(method = "repeatedcv", 246 | repeats = 1, number = 1) 247 | 248 | grid <- expand.grid(.degree = 1, .nprune = 2:25) 249 | 250 | earth_best_terms <- train(pressure ~ ., data = data, method = "earth", 251 | tuneGrid= grid) 252 | 253 | earth_best_terms 254 | plot(earth_best_terms) 255 | ``` 256 | 257 | TODO: 258 | - determine best value for nfold 259 | - explore the ncross argument 260 | - plot cross validation results 261 | - collect $R^2$ in different ways 262 | - use cross-validation to select the number of terms 263 | - better discuss partial dependence plots 264 | - include confidence intervals versus prediction intervals 265 | - investigate assumptions of prediction intervals 266 | - include text about interpretaiton of 3D plotmo regression surfaces 267 | - comprehensively discuss limitations 268 | 269 | [See Stephen Milborrow's excellent notes on earth here](http://www.milbo.org/doc/earth-notes.pdf) for lots of handy tips and tricks. 270 | 271 | [... 
and view his notes on variance models in earth here](http://www.milbo.org/doc/earth-varmod.pdf) 272 | 273 | ["earth" R package](https://cran.r-project.org/web/packages/earth/earth.pdf) 274 | 275 | [Friedman 1991 - MARS](https://projecteuclid.org/download/pdf_1/euclid.aos/1176347963) 276 | 277 | [Friedman 1993- Fast MARS](https://statistics.stanford.edu/sites/default/files/LCS%20110.pdf) 278 | 279 | -------------------------------------------------------------------------------- /Spring2017/Mar3-reg and splines/Mar3-regsplines-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "regression and splines (working version)" 3 | author: "Evan Muzzall" 4 | date: "3/3/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ##1. Package installations 13 | ```{r} 14 | if (FALSE) { 15 | install.packages("Zelig") 16 | devtools::install_github("ck37/ck37r") 17 | } 18 | 19 | library(splines) # call the base "splines" 20 | library(Zelig) # this contains the "macro" dataset we will use 21 | ``` 22 | 23 | ##2. Goals 24 | First, we will walk through linear regression, polynomial regression, polynomial splines, and smoothing splines using an incredibly simple example. 25 | 26 | Then, we will attempt to apply what we learned and do the same thing using the "macro" dataset from the "Zelig" package to see a more real life example. 27 | 28 | ##3. Simple linear regression 29 | Simple linear regression uses a single predictor/input/independent variable (X) to predict one target/outcome/response/dependent variably (Y). Ideally, we want to find the best estimates for B0 (intercept) and B1 (slope) that minimize the error terms when using X to predict Y. 30 | ```{r} 31 | ## generate toy predictors and responses 32 | X <- c(2, 4, 8, 12, 18, 20) 33 | Y <- c(1, 3, 5, 9, 19, 21) 34 | 35 | ## calculate means 36 | mean(X) 37 | mean(Y) 38 | 39 | ## calculate error for each observation 40 | X-mean(X) 41 | Y-mean(Y) 42 | 43 | ## plot the data 44 | plot(x=X, y=Y, main="example") 45 | 46 | ## estimate B1 coefficient (slope) 47 | B1 <- sum((X-mean(X)) * (Y-mean(Y))) / sum((X-mean(X))^2) 48 | B1 49 | 50 | ## now estimate B0 coefficient (intercept) 51 | B0 <- mean(Y) - (B1 * mean(X)) 52 | B0 53 | 54 | ## plot the abline 55 | abline(B0, B1, col="black", lwd=2) 56 | legend("topleft", inset=.0, c("linear"), lty=1, lwd=2, col="black", cex=.75) 57 | 58 | ## generate predicted values by plugging in our X values to the equation: 59 | Y_hat <- B0 + B1 * X 60 | Y_hat 61 | 62 | ## calculate root mean sqaure error (RMSE) for our predictions. First, calculatte the error for each observation by subracting it from the predicted value: 63 | Y_err <- Y_hat - Y 64 | Y_err 65 | 66 | ## then, calculate the square of each of these errors: 67 | Y_err_sq <- Y_err^2 68 | Y_err_sq 69 | 70 | ## sum these values 71 | sum_squared_err <- sum(Y_err_sq) 72 | sum_squared_err 73 | 74 | ## divide by n and take square root to produce the RMSE: 75 | RMSE <- sqrt(sum_squared_err / length(Y)) 76 | RMSE 77 | 78 | ## sanity check 79 | RMSE == sqrt(sum((Y_hat - Y)^2) / length(Y)) 80 | 81 | ## double sanity check 82 | ## fit the model using the lm function in R: 83 | lm_toy <- lm(Y ~ X) 84 | lm_toy 85 | B1 86 | B0 87 | summary(lm_toy) 88 | 89 | ## is our B1 the same as the slope generated by "lm" in R? 90 | round(B1, digits=5) == round(lm_toy$coefficients[2], digits=5) 91 | 92 | ## is our B0 the same as the intercept generated by "lm" in R? 
93 | round(B0, digits=5) == round(lm_toy$coefficients[1], digits=5) 94 | ``` 95 | 96 | ##4. Polynomial regression 97 | However, it is not always advisable to assume linear relationships within data. Although linear models are flexible, they might not best express the relationships between your predictor and response variable. Thus, your resulting p-values might not accurately reflect the null hypothesis that the variables are not associated. 98 | 99 | Polynomial regression raises the original predictor variable to the n^th^ degree. These scalars act as a means to increase the fit of the model by assuming the point distributions are more parabolic shaped than linear. 100 | 101 | We will fit a 3^rd^ degree (cubic) polynomial so that our series of equations looks like this: 102 | Y ~ X 103 | Y ~ X^2 104 | Y ~ X^3 105 | (plus the error term) 106 | 107 | The major drawback of polynomial regression however, is that the function is fit to the global feature space. That is, a single polynomial function is fit in an attempt to represent all data points. However, since this is essentially a linear regression model, coefficients can still be estimated using least squares. 108 | ```{r} 109 | poly1 <- lm(Y ~ X + I(X^2) + I(X^3)) 110 | poly1 111 | summary(poly1) 112 | ## so, what is really happening here? 113 | 114 | ## imagine we take our X variable and create a new column in a data frame that would look like this: 115 | X2 <- X^2 116 | X3 <- X^3 117 | 118 | toy_df <- data.frame(Y, X, X2, X3) 119 | toy_df 120 | 121 | ## the "poly" function produces the same results (when raw=TRUE) 122 | poly2 <- lm(Y ~ poly(X, 3, raw=TRUE)) 123 | poly2 124 | summary(poly2) 125 | lines(X ~ fitted(poly2), lty=2, lwd=2, col="red") 126 | legend("topleft", inset=.0, c("linear", "poly 3"), lty=c(1,2), lwd=2, col=c("black","red"), cex=.75) 127 | 128 | ## sanity check 129 | poly1$coefficients == poly2$coefficients 130 | ``` 131 | 132 | ##5. Polynomial splines 133 | Polynomial splines are [piecewise polynomial functions](https://www.khanacademy.org/math/algebra/algebra-functions/piecewise-functions/v/piecewise-function-example) that form smoothed curved shapes at their junctions (called "knots"). The X predictor is divided into K regions, and a polynomial function is fit to the data within each region. This allows for greater flexibility than linear or polynomial fits. This is a k^th^ order spline where coefficients can be estimated by least squares. Derivatives are fit between each knot, and each imposed constraint releases a degree of freedom, thus smoothing the polynomial fits. As such, polynomial splines are generally more flexible fits than polynomial regression models. 134 | 135 | "poly spline" (basis-spline) is the function that allows for continuous joins at the spline knots. It is the matrix that contains the information of the piecewise polynomial functions used to fit the spline. 136 | ```{r} 137 | ## create xy data frame using our X and Y variables 138 | xy <- data.frame(X, Y) 139 | xy 140 | 141 | bs(xy$X, df=3) 142 | summary(ps1 <- lm(Y ~ bs(X, df=3, knots=8), data = xy)) 143 | 144 | ## example of "safe" prediction (see Chambers JM, Hastie TJ (editors). 1992. Statistical Models in S. Pacific Grove, CA: Wadsworth and Brooks/Cole. 
p 288-289 for "smart" versus "safe" prediction) 145 | summary(xy$X) 146 | X_pred <- seq(min(xy$X), max(xy$X), len = 200) 147 | lines(X_pred, predict(ps1, data.frame(X=X_pred)), lty=3, lwd=2, col="green") 148 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline"), lty=c(1,2,3), lwd=2, col=c("black","red", "green"), cex=.75) 149 | ``` 150 | 151 | Consider using "SuperLearner" to find optimal number of knots! 152 | 153 | ##6. Smoothing splines 154 | Smoothing splines are similar to the poly splines above, except they produce knots at each data point and coefficients of the estimated function are shrunk via regularization, thus helping prevent overfitting. 155 | 156 | The goal is to minimize loss+penalty in addition to a small residual sum of squares - this forms the smoothing spline. However, by treating each X observation as a data point we might be concerned with getting an overfit model. Thus it is useful to talk about degrees of freedom in terms of their _effective degrees of freedom_, or a general representation of the flexibility of the smoothing spline of shrunken degrees of freedom. 157 | Also, intervals are allowed to overlap. 158 | ```{r} 159 | smooth1 <- smooth.spline(y=Y, x=X, cv=FALSE, keep.data=TRUE, spar=NULL, penalty=1) 160 | smooth1 161 | names(smooth1) 162 | smooth1$data 163 | 164 | lines(smooth1, col="blue", lty=4) 165 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline", "smooth1"), lty=c(1,2,3,4), lwd=2, col=c("black","red", "green", "blue"), cex=.75) 166 | ``` 167 | 168 | 169 | ##7. Repeat with the "macro" data from the "Zelig" package 170 | ```{r} 171 | data(macro, package = "Zelig") 172 | macro_lm <- lm(gdp ~ unem, data=macro) 173 | macro_lm 174 | summary(macro_lm) 175 | 176 | ## plot residuals 177 | hist(macro_lm$residuals) 178 | 179 | ## plot it 180 | plot(macro$unem, macro$gdp, col="gray80", 181 | main="'macro' gdp ~ unem", 182 | xlab="unem deficit", 183 | ylab=) 184 | 185 | abline(macro_lm$coefficients[1], macro_lm$coefficients[2], 186 | lwd=2, col="black") 187 | legend("topleft", inset=.0, c("linear"), lty=1, lwd=2, col="black", cex=.75) 188 | 189 | ## generate predicted values by plugging in our X values. 190 | macro_pred <- predict(macro_lm, macro) 191 | 192 | ## or, this is the same as our formula way from the toy example: 193 | macro_pred <- macro_lm$coefficients[1] + macro_lm$coefficients[2] * macro$unem 194 | 195 | ## check MSE on the predicted values 196 | MSE <- mean((macro$gdp - macro_pred)^2) 197 | MSE 198 | ``` 199 | 200 | ##9. Cubic polynomial regression 201 | ```{r} 202 | poly_macro <- lm(gdp ~ poly(unem, 3, raw=TRUE), data=macro) 203 | poly_macro 204 | summary(poly_macro) 205 | 206 | unem_lims <- range(macro$unem) 207 | unem_grid <- seq(from=unem_lims[1], to=unem_lims[2]) 208 | poly_preds <- predict(poly_macro, newdata=list(unem=unem_grid), se=TRUE) 209 | 210 | lines(unem_grid, poly_preds$fit, lty=2, lwd=2, col="red") 211 | legend("topleft", inset=.0, c("linear", "ploy 4"), lty=c(1,2), lwd=2, col=c("black", "red"), cex=.75) 212 | ``` 213 | 214 | ##10. poly spline 215 | ```{r} 216 | ps2 <- lm(gdp ~ bs(unem, df=55), data = macro) 217 | summary(ps2) 218 | 219 | ## example of "safe" prediction 220 | summary(macro$unem) 221 | X_pred2 <- seq(min(macro$unem), max(macro$unem), len = 200) 222 | lines(X_pred2, predict(ps2, data.frame(unem=X_pred2)), lty=3, lwd=2, col="green") 223 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline"), lty=c(1,2,3), lwd=2, col=c("black","red", "green"), cex=.75) 224 | ``` 225 | 226 | ##11. 
Smoothing spline 227 | ```{r} 228 | smooth2 <- smooth.spline(y=macro$gdp, x=macro$unem, cv=FALSE) 229 | smooth2 230 | 231 | lines(smooth2, col="blue", lty=4) 232 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline", "smooth1"), lty=c(1,2,3,4), lwd=2, col=c("black","red", "green", "blue"), cex=.75) 233 | ``` 234 | 235 | ## Acknowledgements 236 | James G, Witten D, Hastie T, Tibshirani R. 2015. An Introduction to Statistical Learning: With Applications in R (6th printing). New York: Springer. 237 | -------------------------------------------------------------------------------- /Spring2017/Mar3-reg and splines/Mar3-regsplines-R_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Mar3-reg and splines/Mar3-regsplines-R_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /Spring2017/data/sleep_VIM.csv: -------------------------------------------------------------------------------- 1 | "BodyWgt","BrainWgt","NonD","Dream","Sleep","Span","Gest","Pred","Exp","Danger" 2 | 6654,5712,NA,NA,3.3,38.6,645,3,5,3 3 | 1,6.6,6.3,2,8.3,4.5,42,3,1,3 4 | 3.385,44.5,NA,NA,12.5,14,60,1,1,1 5 | 0.92,5.7,NA,NA,16.5,NA,25,5,2,3 6 | 2547,4603,2.1,1.8,3.9,69,624,3,5,4 7 | 10.55,179.5,9.1,0.7,9.8,27,180,4,4,4 8 | 0.023,0.3,15.8,3.9,19.7,19,35,1,1,1 9 | 160,169,5.2,1,6.2,30.4,392,4,5,4 10 | 3.3,25.6,10.9,3.6,14.5,28,63,1,2,1 11 | 52.16,440,8.3,1.4,9.7,50,230,1,1,1 12 | 0.425,6.4,11,1.5,12.5,7,112,5,4,4 13 | 465,423,3.2,0.7,3.9,30,281,5,5,5 14 | 0.55,2.4,7.6,2.7,10.3,NA,NA,2,1,2 15 | 187.1,419,NA,NA,3.1,40,365,5,5,5 16 | 0.075,1.2,6.3,2.1,8.4,3.5,42,1,1,1 17 | 3,25,8.6,0,8.6,50,28,2,2,2 18 | 0.785,3.5,6.6,4.1,10.7,6,42,2,2,2 19 | 0.2,5,9.5,1.2,10.7,10.4,120,2,2,2 20 | 1.41,17.5,4.8,1.3,6.1,34,NA,1,2,1 21 | 60,81,12,6.1,18.1,7,NA,1,1,1 22 | 529,680,NA,0.3,NA,28,400,5,5,5 23 | 27.66,115,3.3,0.5,3.8,20,148,5,5,5 24 | 0.12,1,11,3.4,14.4,3.9,16,3,1,2 25 | 207,406,NA,NA,12,39.3,252,1,4,1 26 | 85,325,4.7,1.5,6.2,41,310,1,3,1 27 | 36.33,119.5,NA,NA,13,16.2,63,1,1,1 28 | 0.101,4,10.4,3.4,13.8,9,28,5,1,3 29 | 1.04,5.5,7.4,0.8,8.2,7.6,68,5,3,4 30 | 521,655,2.1,0.8,2.9,46,336,5,5,5 31 | 100,157,NA,NA,10.8,22.4,100,1,1,1 32 | 35,56,NA,NA,NA,16.3,33,3,5,4 33 | 0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4 34 | 0.01,0.25,17.9,2,19.9,24,50,1,1,1 35 | 62,1320,6.1,1.9,8,100,267,1,1,1 36 | 0.122,3,8.2,2.4,10.6,NA,30,2,1,1 37 | 1.35,8.1,8.4,2.8,11.2,NA,45,3,1,3 38 | 0.023,0.4,11.9,1.3,13.2,3.2,19,4,1,3 39 | 0.048,0.33,10.8,2,12.8,2,30,4,1,3 40 | 1.7,6.3,13.8,5.6,19.4,5,12,2,1,1 41 | 3.5,10.8,14.3,3.1,17.4,6.5,120,2,1,1 42 | 250,490,NA,1,NA,23.6,440,5,5,5 43 | 0.48,15.5,15.2,1.8,17,12,140,2,2,2 44 | 10,115,10,0.9,10.9,20.2,170,4,4,4 45 | 1.62,11.4,11.9,1.8,13.7,13,17,2,1,2 46 | 192,180,6.5,1.9,8.4,27,115,4,4,4 47 | 2.5,12.1,7.5,0.9,8.4,18,31,5,5,5 48 | 4.288,39.2,NA,NA,12.5,13.7,63,2,2,2 49 | 0.28,1.9,10.6,2.6,13.2,4.7,21,3,1,3 50 | 4.235,50.4,7.4,2.4,9.8,9.8,52,1,1,1 51 | 6.8,179,8.4,1.2,9.6,29,164,2,3,2 52 | 0.75,12.3,5.7,0.9,6.6,7,225,2,2,2 53 | 3.6,21,4.9,0.5,5.4,6,225,3,2,3 54 | 14.83,98.2,NA,NA,2.6,17,150,5,5,5 55 | 55.5,175,3.2,0.6,3.8,20,151,5,5,5 56 | 1.4,12.5,NA,NA,11,12.7,90,2,2,2 57 | 0.06,1,8.1,2.2,10.3,3.5,NA,3,1,2 58 | 0.9,2.6,11,2.3,13.3,4.5,60,2,1,2 59 | 2,12.3,4.9,0.5,5.4,7.5,200,3,1,3 60 | 0.104,2.5,13.2,2.6,15.8,2.3,46,3,2,2 61 | 4.19,58,9.7,0.6,10.3,24,210,4,3,4 
62 | 3.5,3.9,12.8,6.6,19.4,3,14,2,1,1 63 | 4.05,17,NA,NA,NA,13,38,3,1,1 64 | -------------------------------------------------------------------------------- /Spring2017/spring 2017 schedule.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf830 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 7 | 8 | \f0\fs24 \cf0 Spring 2017 schedule\ 9 | \ 10 | * February 3 - Introductory meeting and k-nearest neighbor classification and regression \ 11 | [click for KNN example in R](https://www.datacamp.com/community/tutorials/machine-learning-in-r#gs.GpuyCu0) \ 12 | [click for KNN example in Python](http://scikit-learn.org/stable/modules/neighbors.html)\ 13 | \ 14 | * February 17 - Stepwise regression \ 15 | [see Chapter 6 from An Introduction to Statistical Learning for an overview of stepwise regression](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) \ 16 | \ 17 | [click for an overview of regerssion analysis in Python](http://www.turingfinance.com/regression-analysis-using-python-statsmodels-and-quandl/) \ 18 | [click for an overview of generalized linear models in Python](http://scikit-learn.org/stable/modules/linear_model.html) \ 19 | \ 20 | * March 3 - Linear regression, polynomial regression, polynomial splines, smoothing splines \ 21 | [see Chapters 3 and 7 from an Introduction to Statistical Learning for overviews of regression and splines](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) \ 22 | \ 23 | [click here for a linear regression example in Python](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html) \ 24 | \ 25 | [click here for a spline walkthrough in Python](https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html) \ 26 | \ 27 | * March 17 - Multivariate adaptive regression splines, generalized additive models \ 28 | \ 29 | * April 14 - Support vector machines \ 30 | \ 31 | [click here for an explanation of why and when to use SVMs](http://blog.yhat.com/posts/why-support-vector-machine.html)\ 32 | \ 33 | * April 28 - Neural networks } -------------------------------------------------------------------------------- /Spring2018/Apr11-BoostingTrees/GBM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradient Boosting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Basic single model:\n", 15 | "Adapted from [here](http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html)." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from sklearn.datasets import load_boston\n", 25 | "import numpy as np\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "from sklearn import ensemble, preprocessing\n", 28 | "from sklearn.utils import shuffle\n", 29 | "from sklearn.metrics import mean_squared_error\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.ensemble import GradientBoostingRegressor" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "np.random.seed(1)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "boston = load_boston()\n", 50 | "\n", 51 | "X, y = shuffle(boston.data, boston.target, random_state=1)\n", 52 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)\n", 53 | "\n", 54 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 55 | "X_train = scaler.transform(X_train)\n", 56 | "X_test = scaler.transform(X_test)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,\n", 68 | " 'learning_rate': 0.01, 'loss': 'ls'}\n", 69 | "\n", 70 | "gb_r = ensemble.GradientBoostingRegressor(**params)\n", 71 | "\n", 72 | "gb_r.fit(X_train, y_train)\n", 73 | "mse = mean_squared_error(y_test, gb_r.predict(X_test))\n", 74 | "print(\"MSE: %.4f\" % mse)\n", 75 | "print(gb_r.score(X_train, y_train))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "% matplotlib inline\n", 85 | "\n", 86 | "# compute test set deviance\n", 87 | "test_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n", 88 | "\n", 89 | "for i, y_pred in enumerate(gb_r.staged_predict(X_test)):\n", 90 | " test_score[i] = gb_r.loss_(y_test, y_pred)\n", 91 | "\n", 92 | "plt.figure(figsize=(12, 6))\n", 93 | "plt.subplot(1, 2, 1)\n", 94 | "plt.title('Deviance')\n", 95 | "plt.plot(np.arange(params['n_estimators']) + 1, gb_r.train_score_, 'b-',\n", 96 | " label='Training Set Deviance')\n", 97 | "plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n", 98 | " label='Test Set Deviance')\n", 99 | "plt.legend(loc='upper right')\n", 100 | "plt.xlabel('Boosting Iterations')\n", 101 | "plt.ylabel('Deviance')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "feature_importance = gb_r.feature_importances_\n", 111 | "# make importances relative to max importance\n", 112 | "feature_importance = 100.0 * (feature_importance / feature_importance.max())\n", 113 | "sorted_idx = np.argsort(feature_importance)\n", 114 | "pos = np.arange(sorted_idx.shape[0]) + .5\n", 115 | "plt.subplot(1, 2, 2)\n", 116 | "plt.barh(pos, feature_importance[sorted_idx], align='center')\n", 117 | "plt.yticks(pos, boston.feature_names[sorted_idx])\n", 118 | "plt.xlabel('Relative Importance')\n", 119 | "plt.title('Variable Importance')\n", 120 | "plt.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Grid search:" 128 | ] 129 | }, 130 | { 131 | "cell_type": 
"code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from sklearn.model_selection import GridSearchCV\n", 137 | "\n", 138 | "param_grid = {'n_estimators': range(450, 551, 50),\n", 139 | " 'max_depth': range(1, 12, 5),\n", 140 | " 'min_samples_split': [2],\n", 141 | " 'learning_rate': np.arange(0.01, .22, .1),\n", 142 | " 'loss': ['ls']}\n", 143 | "\n", 144 | "gb_r = GridSearchCV(GradientBoostingRegressor(), param_grid)\n", 145 | "gb_r.fit(X_train, y_train)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "sorted(gb_r.cv_results_.keys())" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "gb_r.cv_results_[\"mean_test_score\"]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "best_index = np.argmax(gb_r.cv_results_[\"mean_test_score\"])" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "gb_r.cv_results_[\"params\"]" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "print(gb_r.cv_results_[\"params\"][best_index])\n", 191 | "print()\n", 192 | "print(max(gb_r.cv_results_[\"mean_test_score\"]))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "gb_r.score(X_test, \n", 202 | " y_test)" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "anaconda-cloud": {}, 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.6.4" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 1 228 | } 229 | -------------------------------------------------------------------------------- /Spring2018/Apr11-BoostingTrees/boosting-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R boosted trees walkthrough" 3 | author: "Evan Muzzall" 4 | date: "April 11, 2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | # Objectives 12 | #####1 introduction 13 | *tree based methods - quick review 14 | *install packages, load and split data 15 | #####2 gbm 16 | *train gbm_fit1 (no tuning) 17 | *train gbm_fit2 (tune with trainControl and expand.grid) 18 | *visualize gbm_fit2 models, generate predicted values, compute and plot AUC 19 | *trainControl and expand.grid functions; train the gbm again 20 | *generate predicted values and compute AUC 21 | #####3 xgboost 22 | *xgboost example 23 | #####4 SuperLearner 24 | *SuperLearner example 25 | 26 | # 1 introduction: Review tree based methods 27 | _(summarized from Chapter 8 or Introduction to Statistical Learning, 7th ed. by James et al. 2013)_ 28 | Recall that **decision trees** divide the predictor space (the set of possible predicted values) into simpler regions. 
Through recursive binary splitting, each tree splits based on minimizing RSS for regression trees or classification error for classification trees (% of training observations that do not belong to the most common class), using the mean and mode, respectively. Decision trees have high variance and single decision trees are likely to overfit. When a large, overly complex tree is grown, **pruning** can be used to prune it back to a subtree with the lowest test error. 29 | 30 | To improve predictive performance, **bagging**, or "bootstrap aggregating", will estimate each tree on a new dataset [sampled with replacement](https://en.wikipedia.org/wiki/Simple_random_sample) from the original dataset. Each bootstrap sample will include about two-thirds of observations, some included multiple times. We then average the predictions across each of these trees. The out-of-bag (**OOB**) error rate is often used to estimate test error of the bagged model and is computed on the remaining observations (roughly 1/3). Variable importance gets lost in the shuffle! 31 | 32 | Unlike bagging, **random forests** decorrelate the trees. They build bootstrap training samples but consider only a small random subset of the predictors at each split, then average the predictions of these trees. The out-of-bag prediction for each training observation uses only the trees that did not include that observation in their bootstrap sample. 33 | 34 | **Boosting** takes this even further - fit decision trees to residuals, add each tree's contribution to the fitted function, update residuals, and improve $\hat{f}$. 35 | 36 | From [Freund and Schapire 1999](https://cseweb.ucsd.edu/~yfreund/papers/IntroToBoosting.pdf): 37 | "Boosting is a general method for improving the accuracy of any given learning algorithm" and originated with AdaBoost and PAC learning (p. 1-2). Gradient boosted machines are ensemble decision tree methods built from "weak" trees - trees only slightly more accurate than random guessing - which are then "boosted" into a "strong" learner. That is, the models don't have to be accurate over the entire feature space. 38 | 39 | The model first tries to predict each value in the dataset - the cases that can be predicted easily are _downweighted_ so that the algorithm does not have to try as hard to predict them. 40 | 41 | However, the cases that the model has difficulty predicting are _upweighted_ so that the model tries more assertively to predict them. This continues for multiple "boosting" iterations. A resample-based performance measure is produced at each iteration. Error is measured on the weak learners so that even performing slightly better than random guessing improves accuracy quickly (p. 2). This method can drive down generalization error, thus helping prevent overfitting (p. 5). While it is susceptible to noise, it can be useful for detecting outliers. 42 | 43 | Boosted trees utilize three main hyperparameters: 44 | 1. B: number of trees to grow 45 | 2. $\lambda$: shrinkage (learning rate) 46 | 3. d: tree depth (number of splits) - see the short hand-rolled sketch below the research question for these three settings in action 47 | 48 | # research question 49 | How well can we predict "low" versus "high" median home values using the other variables from the "BostonHousing" dataset?
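(Added illustration, not part of the original walkthrough.) Before fitting the packaged implementations below, here is a minimal "boosting by hand" sketch for squared-error loss on a made-up toy dataset: shallow `rpart` trees are fit to the current residuals and added to the running prediction with a shrinkage factor. The toy data and variable names are invented for illustration only; the three settings correspond to B, $\lambda$, and d above.

```{r}
# Minimal hand-rolled boosting sketch (illustration only; toy data made up here).
library(rpart)

set.seed(1)
n <- 200
x <- runif(n, 0, 10)
y <- sin(x) + rnorm(n, sd = 0.3)
toy <- data.frame(x = x, y = y)

B <- 100        # B: number of trees to grow
lambda <- 0.1   # lambda: shrinkage (learning rate)
d <- 2          # d: maximum tree depth (a stand-in for the number of splits)

f_hat <- rep(0, n)   # running ensemble prediction
res <- y             # current residuals

for (b in 1:B) {
  # fit a small tree to the current residuals
  fit_b <- rpart(res ~ x, data = data.frame(x = x, res = res),
                 control = rpart.control(maxdepth = d, cp = 0, xval = 0))
  # add a shrunken copy of this tree's fit to the ensemble
  f_hat <- f_hat + lambda * predict(fit_b, newdata = toy)
  # update the residuals for the next round
  res <- y - f_hat
}

# training MSE: hand-rolled boosted ensemble vs. a single default rpart tree
mean((y - f_hat)^2)
single_tree <- rpart(y ~ x, data = toy)
mean((y - predict(single_tree, toy))^2)
```

Each boosting round only has to explain what the previous rounds missed, which is why many small, shrunken trees can outperform one large tree.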
50 | 51 | # Install packages 52 | ```{r, eval=FALSE} 53 | install.packages(c("car", "caret", "mlbench", "pROC", "randomForest", 54 | "ranger", "rpart", "SuperLearner", "xgboost")) 55 | 56 | ``` 57 | ```{r, eval=TRUE} 58 | library(car) 59 | library(caret) 60 | library(mlbench) 61 | library(pROC) 62 | library(randomForest) 63 | library(rpart) 64 | library(SuperLearner) 65 | library(xgboost) 66 | ``` 67 | 68 | load data 69 | ```{r} 70 | library(mlbench) 71 | data(BostonHousing) 72 | ?BostonHousing 73 | dat = BostonHousing 74 | str(dat) 75 | 76 | # convert medv to factor: less than or equal to 21.20 = "low", greater than 21.20 = "high" 77 | dat$medv = cut(dat$medv, 78 | breaks = c(0, 21.20, 50), 79 | levels = c(1,2), 80 | labels = c("low", "high")) 81 | ``` 82 | 83 | # split data 84 | ```{r} 85 | library(caret) 86 | set.seed(1) 87 | split <- createDataPartition(dat$medv, p = 0.70, list = FALSE) 88 | training_set <- dat[split,] # for gbm; response variable is included 89 | test_set <- dat[-split,] # for gbm; responses variable is included 90 | 91 | X_train = subset(training_set, select = -medv) # for xgboost; response variable is Y_train 92 | X_test = subset(test_set, select = -medv) # for xgboost; response variable is Y_test 93 | 94 | Y_train = dat$medv[split] # xgboost train response, but need to convert to numeric 95 | Y_test = dat$medv[-split] # xgboost test response, but need to convert to numeric 96 | 97 | Y_train = as.integer(Y_train == "low") # xgboost only allows numeric input; train response 98 | Y_test = as.integer(Y_test == "low") # xgboost only allows numeric input; test response 99 | 100 | X = subset(dat, select = -medv) # for SuperLearner, we can use all the data (minus our response medv) 101 | Y = subset(dat, select = medv) # include only medv response for SuperLearner! 
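# --- Added sanity check (not in the original walkthrough) ---
# Confirm that createDataPartition() kept the "low"/"high" balance of medv
# roughly equal in the training and test sets before we start training.
prop.table(table(training_set$medv))
prop.table(table(test_set$medv))
# Note: the SuperLearner section below passes Y as a numeric vector
# (as.integer(dat$medv == "low")) rather than this one-column data frame.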
102 | 103 | ``` 104 | 105 | # 2 gbm 106 | # train gbm_fit1 107 | ```{r} 108 | set.seed(1) 109 | gbm_fit1 <- train(medv ~ ., 110 | data = training_set, 111 | method="gbm", 112 | verbose = FALSE) 113 | gbm_fit1$times 114 | 115 | gbm_fit1 116 | 117 | summary(gbm_fit1, las=1, main="GBM relative influence") 118 | ``` 119 | 120 | # trainControl and expand.grid 121 | Define hyperparameters of the control mechanism via `trainControl` 122 | ```{r} 123 | control <- trainControl(method="repeatedcv", 124 | repeats=5, 125 | classProbs=TRUE, 126 | summaryFunction=twoClassSummary) 127 | ``` 128 | 129 | Compare multiple models at once with `expand.grid` 130 | ```{r} 131 | grid <- expand.grid(n.trees = seq(500, 2500, by = 500), 132 | interaction.depth = c(1, 3, 5), 133 | shrinkage = c(0.001, 0.01, 0.1), 134 | n.minobsinnode = 10) 135 | nrow(grid) 136 | ``` 137 | 138 | Train the gbm again with the control and grid in place: 139 | ```{r} 140 | set.seed(1) 141 | gbm_fit2 <- train(medv ~ ., data = training_set, 142 | method = "gbm", 143 | metric = "ROC", 144 | trControl = control, 145 | tuneGrid = grid, 146 | verbose = FALSE) 147 | gbm_fit2$times 148 | 149 | gbm_fit2 150 | 151 | summary(gbm_fit2, las = 2) 152 | ``` 153 | 154 | # ggplot line graph of the tuned models 155 | ```{r} 156 | library(ggplot2) 157 | ggplot(gbm_fit2) + theme_bw() + ggtitle("Model comparisons") + ylab("AUC") + theme(legend.position = "top") 158 | ``` 159 | 160 | # generate predicted values and probabilities 161 | ```{r} 162 | set.seed(1) 163 | gbm_predicted <- predict(gbm_fit2, test_set) 164 | gbm_prob <- predict(gbm_fit2, test_set, type="prob") 165 | ``` 166 | 167 | view final model 168 | ```{r} 169 | gbm_cm <- confusionMatrix(gbm_predicted, test_set$medv) 170 | gbm_cm 171 | ``` 172 | 173 | A confusion/error matrix is a cross-tabulation of observed versus predicted classes 174 | 175 | # plot AUC 176 | ```{r} 177 | library(pROC) 178 | rocCurve <- roc(response=test_set$medv, 179 | predictor = gbm_prob[, "low"], 180 | levels = rev(levels(test_set$medv)), 181 | auc=TRUE, ci=TRUE) 182 | ``` 183 | 184 | ```{r} 185 | plot(rocCurve, main="GBM", col="blue", col.main="blue", col.lab="blue") 186 | rocCurve$auc 187 | ``` 188 | 189 | # 3 xgboost 190 | # xgboost example 191 | ```{r} 192 | library(xgboost) 193 | bstSparse <- xgboost(data = data.matrix(X_train), label = Y_train, max.depth = 2, eta = 1, nthread = 2, nround = 20, objective = "binary:logistic") 194 | 195 | prediction_values <- predict(bstSparse, xgb.DMatrix(data.matrix(X_test))) 196 | options(scipen=999) 197 | prediction_values 198 | 199 | prediction_class = as.numeric(prediction_values > 0.5) 200 | prediction_class 201 | 202 | err <- mean(as.numeric(prediction_values >= 0.5) != Y_test) 203 | print(paste("test-error =", err)) 204 | ``` 205 | 206 | # 4 SuperLearner 207 | # Superlearner example 208 | 209 | ```{r superlearner} 210 | library(SuperLearner) 211 | cv_sl = CV.SuperLearner(X = dat[, -14], 212 | Y = as.integer(dat[, 14] == "low"), 213 | family = binomial(), 214 | SL.library = c("SL.xgboost","SL.rpart","SL.ranger","SL.mean"), 215 | V = 5) 216 | 217 | cv_sl 218 | 219 | summary(cv_sl) 220 | 221 | table(simplify2array(cv_sl$whichDiscreteSL)) # view best 222 | 223 | plot(cv_sl) + theme_bw() 224 | ``` 225 | 226 | # Help 227 | 228 | Examples were drawn from these helpful pages - check them out below! 
229 | *[caret help page](https://topepo.github.io/caret/) 230 | 231 | *[XGBoost R Tutorial](http://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html) 232 | 233 | *[SuperLearner example](https://github.com/ecpolley/SuperLearner) 234 | -------------------------------------------------------------------------------- /Spring2018/Apr25 - Elastic Net/elastic-net.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Penalized regression in R" 3 | date: "April 25, 2018" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | --- 9 | 10 | ```{r install, eval = F} 11 | # Run if needed. 12 | install.packages(c("caret", "glmnet", "ranger", "SuperLearner")) 13 | ``` 14 | 15 | # Background 16 | 17 | Penalized regression is a modification of ordinary least squares (OLS) or 18 | generalized linear models (notably logistic regression) to shrink the 19 | estimated coefficients closer to zero. This is because the default estimated 20 | coefficients from linear regression inherently contain some overfitting - they 21 | incorporate some random noise in the data that will not be the same for 22 | new, unseen data. 23 | 24 | Penalization addresses this inherent overfitting by changing the objective 25 | function used to choose the optimal estimated beta coefficients. It says: "I 26 | want to choose beta coefficients that minimize my loss function (often 27 | mean-squared error) but I also don't want the total magnitude of the coefficients (their L1 or L2 norm) to be too large." 28 | 29 | There are two types of penalization: L1 (sum of absolute values) or L2 (sum of squared values). L2 penalization was the first type of penalized regression, and is called **ridge regression**. It was first published by Hoerl & Kennard in 1970 and allows regression to be used when there are more covariates than observations (p > n). L1 penalization is called **LASSO** ([least absolute shrinkage and selection operator](https://en.wikipedia.org/wiki/Lasso_(statistics))) and was first published by Tibshirani in 1996. Lasso has the special property of **sparsity** - it assumes that only a subset of variables are related to the outcome and tends to zero out the coefficients on the least related covariates. 30 | 31 | Ridge and lasso can be combined into a single regression called **elastic net**, which takes a weighted average of the L1 and L2 penalties. Elastic net was first published in 2005 by Zou and Hastie; the weighting between L1 and L2 penalties is controlled by the $\alpha$ hyperparameter which ranges between 0 (ridge) and 1 (lasso). 32 | 33 | ## Data prep 34 | 35 | ```{r} 36 | library(MASS) 37 | data(Boston) 38 | help(Boston) 39 | str(Boston) 40 | summary(Boston) 41 | 42 | # Our outcome is median home value. 43 | outcome = "medv" 44 | 45 | # Divide into 80% training, 20% test split. 46 | # NOTE: this is a shortcut; we prefer to use cross-validation for real projects. 47 | library(caret) 48 | set.seed(1) 49 | train_index = caret::createDataPartition(Boston[, outcome], p = .8, 50 | list = F, 51 | times = 1) 52 | 53 | # Glmnet wants the data to be matrices, not data frames.
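# (Added reminder, not in the original walkthrough.)
# For family = "gaussian", glmnet minimizes
#   (1/(2n)) * sum((y - X %*% beta)^2) +
#     lambda * ((1 - alpha)/2 * sum(beta^2) + alpha * sum(abs(beta)))
# so alpha = 1 gives the lasso, alpha = 0 gives ridge, and values in between
# give the elastic net. All Boston covariates are numeric; with factor
# covariates we would build the design matrix with model.matrix() first.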
54 | x_train = as.matrix(Boston[train_index, !names(Boston) == outcome]) 55 | x_test = as.matrix(Boston[-train_index, !names(Boston) == outcome]) 56 | 57 | y_train = Boston[train_index, outcome] 58 | y_test = Boston[-train_index, outcome] 59 | 60 | dim(x_train) 61 | length(y_train) 62 | 63 | dim(x_test) 64 | length(y_test) 65 | ``` 66 | 67 | 68 | ## Lasso 69 | 70 | Lasso penalizes coefficients and imposes sparsity, so some coefficients may be shrunk to 0 if they do not appear to be related to the outcome. 71 | 72 | ```{r} 73 | library(glmnet) 74 | # Fit the lasso to continuous Y 75 | reg = cv.glmnet(x_train, y_train, family = "gaussian", alpha = 1) 76 | 77 | # Look at distribution of penalty term lambda. 78 | plot(reg) 79 | 80 | # Plot the underlying glmnet object, showing 81 | # coefficients for differnt lambda values. 82 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 83 | 84 | # Lambda with minimum mean-squared error. 85 | reg$lambda.min 86 | 87 | # Higher lambda within 1SE of performance of the minimum. 88 | # (the "one standard error" rule from Leo Breiman.) 89 | reg$lambda.1se 90 | 91 | # Review coeffients 92 | coef(reg, s = "lambda.1se") 93 | 94 | # What about for lambda.min? 95 | coef(reg, s = "lambda.min") 96 | 97 | # Predict on test set. 98 | pred = predict(reg, s = reg$lambda.1se, newx = x_test) 99 | 100 | # Calculate mean-squared error. 101 | mean((pred - y_test)^2) 102 | ``` 103 | 104 | ## Ridge 105 | 106 | Ridge penalizes the coefficients but does not impose sparsity, so no coefficient will ever be 0. 107 | 108 | ```{r} 109 | 110 | # Fit the ridge to continuous Y 111 | # We just change alpha to 0 to get ridge regression. 112 | reg = cv.glmnet(x_train, y_train, family = "gaussian", alpha = 0) 113 | 114 | # Look at distribution of penalty term lambda. 115 | plot(reg) 116 | 117 | # Plot the underlying glmnet object, showing 118 | # coefficients for differnt lambda values. 119 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 120 | 121 | # Predict on test set. 122 | pred = predict(reg, s = reg$lambda.1se, newx = x_test) 123 | 124 | # Calculate mean-squared error. 125 | mean((pred - y_test)^2) 126 | ``` 127 | 128 | As expected, we do a little worse with ridge compared to lasso. 129 | 130 | ## Elastic net 131 | 132 | ```{r} 133 | set.seed(1) 134 | train_control = trainControl(method = "repeatedcv", 135 | number = 10L, 136 | repeats = 3L) 137 | 138 | 139 | # Create a custom tuning grid. 140 | enet_grid = expand.grid(alpha = seq(0, 1, length.out = 5), 141 | lambda = 2^seq(-1, -7, length = 5)) 142 | 143 | # Review the grid. 144 | enet_grid 145 | 146 | # To be simpler we could just say e.g. tuneLength = 5. 147 | 148 | enet = train(x_train, y_train, method = "glmnet", 149 | #tuneLength = 5, 150 | tuneGrid = enet_grid, 151 | trControl = train_control) 152 | 153 | print(enet) 154 | 155 | plot(enet) 156 | 157 | enet$bestTune 158 | 159 | # Predict on test. 160 | pred = predict(enet, x_test) 161 | 162 | # Review performance 163 | mean((pred - y_test)^2) 164 | ``` 165 | 166 | ## SuperLearner 167 | 168 | ```{r} 169 | library(SuperLearner) 170 | 171 | enet = create.Learner("SL.glmnet", 172 | tune = list(alpha = c(0, 0.1, 0.5, 0.9, 1.0)), 173 | detailed_names = TRUE) 174 | 175 | sl_lib = c("SL.mean", "SL.lm", "SL.stepAIC", enet$names, "SL.ranger") 176 | 177 | set.seed(1, "L'Ecuyer-CMRG") 178 | 179 | # This will take a few minutes to execute - take a look at the .html file to see the output! 
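# (Added comments, not in the original walkthrough.)
# CV.SuperLearner() wraps the whole ensemble in an outer layer of V = 10-fold
# cross-validation, so the risks it reports are estimates for the ensemble
# itself, not just for the individual learners.
# Optional: if runtime is an issue, CV.SuperLearner() also has a parallel
# argument (e.g. parallel = "multicore" on Mac/Linux).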
cv_sl = CV.SuperLearner(Y = y_train, X = data.frame(x_train), verbose = TRUE, 181 | SL.library = sl_lib, family = gaussian(), 182 | cvControl = list(V = 10L)) 183 | 184 | summary(cv_sl) 185 | 186 | plot(cv_sl) + theme_bw() 187 | 188 | # devtools::install_github("ck37/ck37r") 189 | # library(ck37r) 190 | ``` 191 | 192 | # References 193 | 194 | Intro to Statistical Learning, Chapter 6 195 | 196 | [Glmnet vignette by Hastie and Qian](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) - lots of great code examples -------------------------------------------------------------------------------- /Spring2018/Feb28-randomForest/Random Forest R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "random forest" 3 | author: "Evan Muzzall" 4 | date: "February 28, 2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | ```{r} 11 | # clear environment 12 | rm(list=ls()) 13 | ``` 14 | 15 | ```{r setup, include=FALSE} 16 | knitr::opts_chunk$set(echo = TRUE) 17 | ``` 18 | 19 | # Quick review: decision trees and "bagging" (bootstrap aggregating) 20 | 21 | # What are random forests? 22 | Random forests are ensemble methods that combine multiple decision tree models for classification and regression. 23 | 24 | Unlike single decision trees/bagged trees, the default results generally do not require pruning and include accuracy and variable importance information. Furthermore, at each random forest tree split, only a small portion of the predictors are used (rather than the full suite). 25 | 26 | We will fit four different random forest models: 27 | 1. `rf1`: `randomForest` package model 28 | 29 | 2. `rf2`: fit this same model within the `SuperLearner` package via `SL.randomForest` 30 | 31 | 3. `rf3`: `SuperLearner` package model compared to `SL.rpart`, `SL.xgboost`, and `SL.mean` models 32 | 33 | 4. `rf4`: `SuperLearner` package model with external cross-validation for multi-model comparison and visualization of model differences compared to `SL.rpart`, `SL.xgboost`, and `SL.mean` 34 | 35 | # Install packages 36 | Install and `library()` necessary packages. 37 | ```{r, eval=FALSE} 38 | # install.packages(c("car", "caret", "ggplot2", "lattice", "plotmo", "randomForest", "rpart", "ROCR", "SuperLearner", "survival", "xgboost"), dependencies = F) 39 | library(car) 40 | library(caret) 41 | library(ggplot2) 42 | library(lattice) 43 | library(plotmo) 44 | library(randomForest) 45 | library(rpart) 46 | library(ROCR) 47 | library(SuperLearner) 48 | library(survival) 49 | library(xgboost) 50 | ``` 51 | 52 | # Data setup - `Mroz` 53 | Load and explore Mroz dataset. 54 | ```{r} 55 | library(car) 56 | data(Mroz) 57 | ?Mroz 58 | str(Mroz) 59 | head(Mroz) 60 | ``` 61 | 62 | ### `lfp` 63 | Let's examine frequencies of the `lfp` variable (labor force participation), since it is the one we want to predict. 64 | ```{r, eval=FALSE} 65 | Mroz$lfp 66 | ``` 67 | ```{r} 68 | library(lattice) 69 | table(Mroz$lfp) 70 | barchart(table(Mroz$lfp), col="purple", horizontal = F) 71 | ``` 72 | 73 | ### Stratified random split 74 | Now, we will use the `createDataPartition` command from the `caret` package to perform a 70/30 stratified random split of the Mroz data into training and test sets.
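(Quick aside, added for context and not part of the original walkthrough.) The overall class balance of `lfp` is the "no information" baseline that any classifier should beat, so it is worth noting before splitting:

```{r}
# Baseline accuracy of always guessing the majority class of lfp.
max(prop.table(table(Mroz$lfp)))
```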
75 | ```{r} 76 | library(caret) 77 | set.seed(1) 78 | split <- createDataPartition(Mroz$lfp, p=0.70, list=FALSE) 79 | training_set <- Mroz[split,] 80 | test_set <- Mroz[-split,] 81 | 82 | dim(Mroz) 83 | dim(training_set) 84 | dim(test_set) 85 | nrow(training_set) + nrow(test_set) == nrow(Mroz) # double check 86 | ``` 87 | 88 | ##### 1.1 `rf1` - fit the model and evaluate `training_set` accuracy 89 | Using the `randomForest` package, let`s fit a random forest model to predict the number of women who participated or did not participate in the labor force in 1975. 90 | ```{r} 91 | library(randomForest) 92 | ?randomForest 93 | set.seed(1) 94 | rf1 <- randomForest(lfp ~ ., 95 | data=training_set, 96 | ntree=500, 97 | mtry=round(sqrt(ncol(Mroz)), digits = 0), 98 | importance=TRUE) 99 | #NOTE: notice that our response vector `lfp` is a factor - this will assume classification models, otherwise regression will be assumed. If it is omitted entirely, randomForest becomes unsupervised! 100 | rf1 101 | 102 | # check accuracy on training set 103 | (170+235) / nrow(training_set) # training_set = 77% accuracy 104 | 105 | rf1$importance 106 | barchart(rf1$importance, main="rf variable importance - barchart", col="blue", border="black") 107 | ``` 108 | 109 | ##### 1.2 `randomForest` `test_set` accuracy 110 | Now, let`s see how our model performs on the test data. 111 | ```{r} 112 | set.seed(1) 113 | pred <- predict(rf1, newdata=test_set) 114 | table(pred, test_set$lfp) 115 | ``` 116 | 117 | Of the 225 test_set observations, We have 68 true negatives (correct `no` predictions) and 99 true positives (correct `yes` predictions). 118 | 119 | Now, we can quickly check the accuracy of the model using the holdout dataset. 120 | 121 | ```{r} 122 | (68 + 99) / nrow(test_set) #test_set = 74% accuracy 123 | ``` 124 | 125 | ##### 1.3 `plotmo` on `rf1` 126 | Plot `rf1`! 127 | ```{r} 128 | library(plotmo) 129 | ?plotmo 130 | plotmo(rf1, all1 = T) # all1 = T will plot all predictors 131 | plotmo(rf1, all2 = T) # all2 = T will plot all pairs of predictors 132 | plotmo(rf1, all2 = T, pt.col = "green", smooth.col = "purple", grid.col = "gray80") 133 | 134 | set.seed(1) 135 | plotmo(rf1, all1 = T, pmethod = "apartdep") 136 | 137 | set.seed(1) 138 | plotmo(rf1, all1 = T, pmethod = "apartdep", degree1 = 0, degree2 = 3, 139 | caption = "title goes here", 140 | persp.col="orange") 141 | 142 | # image plots 143 | plotmo(rf1, degree1 = F, type="prob", nresponse="yes", # right graph 144 | type2="image", pt.col=ifelse(Mroz$lfp == "yes", "purple", "green3")) 145 | ``` 146 | 147 | # Compare multiple models using the `SuperLearner` R package 148 | `SuperLearner` is an R package that allows you to easily compare multiple machine learning algorithms at once and/or the same algorithm with different settings. 149 | 150 | It then creates an optimal weighted average of those models, aka an "ensemble", using the test data performance. This approach has been proven to be asymptotically as accurate as the best possible prediction algorithm that is tested. 151 | 152 | ### Coerce `lfp` to integer type 153 | For binary classification, SuperLearner prefers that your categorical outcome is numeric/integer, rather than factor data type. 154 | 155 | Let's coerce `lfp` from factor to integer type, but first make a copy of `training_set` and `test_set`. 
156 | ```{r} 157 | training_set2 = training_set 158 | test_set2 = test_set 159 | 160 | class(training_set2$lfp) 161 | class(test_set2$lfp) 162 | 163 | ?ifelse 164 | training_set2$lfp <- ifelse(training_set2$lfp=="yes", 1L, 0L) 165 | test_set2$lfp <- ifelse(test_set2$lfp=="yes", 1L, 0L) 166 | 167 | class(training_set2$lfp) 168 | class(test_set2$lfp) 169 | ``` 170 | ```{r, eval=FALSE} 171 | training_set2$lfp 172 | test_set2$lfp 173 | ``` 174 | 175 | ### Assign Y variables 176 | Now, we should assign binary outcome variables for the training and test sets for the `SuperLearner` computations. 177 | ```{r} 178 | Y <- training_set2$lfp 179 | Y_test <- test_set2$lfp 180 | table(Y) 181 | table(Y_test) 182 | ``` 183 | 184 | However, because we specify our outcome and predictor variables in SuperLearner, we must remove the outcome variable from our training and test sets because we do not want to include them as a predictor: 185 | 186 | ```{r} 187 | training_set2 <- training_set2[,c(2:8)] 188 | test_set2 <- test_set2[,c(2:8)] 189 | dim(training_set2) 190 | dim(test_set2) 191 | ``` 192 | 193 | ##### 2.1 `rf2` fit the second random forest model inside SuperLearner 194 | ```{r} 195 | library(SuperLearner) 196 | listWrappers() # we want "SL.randomForest" 197 | 198 | rf2 <- SuperLearner(Y = Y, X = training_set2, family = binomial(), SL.library = "SL.randomForest") 199 | 200 | rf2 201 | ``` 202 | In the output, Risk is an estimate of model accuracy/performance as estimated by cross-validation of risk on future data. By default it uses 10 folds. 203 | 204 | Coef is how much weight SuperLearner puts on that model in the ensemble weighted-average. If Coef = 0 it means that model is not used at all. 205 | 206 | # Compare multiple models simultaneously 207 | Now, let's compare our random forest model to two other tree-based models: `SL.rpart` and `SL.xgboost`. 208 | 209 | We also include the mean of Y (`SL.mean`) as a benchmark algorithm - if it is the discrete winner, then we can assume that our model fits the data poorly. 210 | 211 | Based on model performance (risk), SuperLearner will also tell us which model is the best (Discrete winner) and also create a weighted average of the multiple models (SuperLearnerer). 212 | 213 | ##### 3.1 `rf3` fit the SuperLearner randomForest model in an ensemble 214 | ```{r} 215 | rf3 <- SuperLearner(Y = Y, X = training_set2, family = binomial(), SL.library = c("SL.mean", "SL.rpart", "SL.randomForest", "SL.xgboost")) 216 | 217 | rf3 218 | ``` 219 | 220 | ##### 3.2 Assess model performance on `test_set2` 221 | Then, we want to assess the model performance on test_set and illustrate with a simple barplot. 222 | ```{r} 223 | pred2 <- predict(rf3, test_set2, onlySL = T) 224 | 225 | summary(pred2$library.predict) 226 | 227 | ggplot(as.data.frame(pred2), aes(x = pred)) + 228 | geom_histogram(fill = "blue", color = "black") + 229 | xlab("Predicted values") + 230 | theme_minimal() 231 | ``` 232 | 233 | ##### 3.3 AUC on `test_set2` 234 | We can then check the area under the receiver operator characteristic (ROC) curve to see an alternative performance metric of `rf3` on `test_set2`: 235 | ```{r} 236 | library(ROCR) 237 | pred_rocr <- prediction(pred2$pred, Y_test) 238 | auc <- performance(pred_rocr, measure = "auc", x.measure = "cutoff")@y.values[[1]] 239 | auc # AUC = 0.82 240 | ``` 241 | 242 | ##### 4.1 `rf4` fit the SuperLearner randomForest model in an ensemble with external cross-validation 243 | Default cross-validation is set to 10-fold in SuperLearner. 
However, we can use (external) cross-validation via the `CV.SuperLearner` function. We can also use all the data since we are using this external layer of cross-validation. 244 | 245 | ```{r} 246 | SL_Y = ifelse(Mroz$lfp == "yes", 1, 0) 247 | SL_X = Mroz[,-1] 248 | 249 | set.seed(1) 250 | 251 | rf4 <- CV.SuperLearner(Y = SL_Y, X = SL_X, family = binomial(), V = 10, SL.library = c("SL.mean", "SL.rpart", "SL.randomForest", "SL.xgboost")) 252 | 253 | rf4 254 | names(rf4) 255 | 256 | summary(rf4) 257 | 258 | table(simplify2array(rf4$whichDiscreteSL)) 259 | plot(rf4) + theme_linedraw() 260 | ``` 261 | 262 | See these guides for more: 263 | [SuperLearner Guide](https://github.com/ck37/superlearner-guide) 264 | 265 | To learn more about plotting decision boundaries in R, check out the mlr package examples [Quick start](http://mlr-org.github.io/mlr-tutorial/release/html/) and [Visualizations of predictions](https://mlr-org.github.io/Visualisation-of-predictions/) 266 | 267 | [James G, Witten D, Hastie T, Tibshirani R. 2013. An Introduction to Statistical Learning - with Applications in R. New York: Springer](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) 268 | 269 | [Package "SuperLearner"](https://cran.r-project.org/web/packages/SuperLearner/SuperLearner.pdf) -------------------------------------------------------------------------------- /Spring2018/decision-trees-feb14/decision-trees-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision trees for machine learning" 3 | #output: html_notebook 4 | output: html_document 5 | #editor_options: 6 | #chunk_output_type: inline 7 | --- 8 | 9 | Topics 10 | 11 | * rpart 12 | * Caret 13 | * SuperLearner 14 | * h2o.ai 15 | * mlr 16 | * book 17 | 18 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Use the latest RStudio preview release to run within RStudio. 19 | 20 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 21 | 22 | ```{r} 23 | # Load iris dataset. 24 | data(iris) 25 | 26 | # Review data structure. 27 | str(iris) 28 | 29 | # Review species distribution. 30 | table(iris$Species, useNA = "ifany") 31 | 32 | # Review all variables. 33 | summary(iris) 34 | ``` 35 | 36 | ```{r} 37 | # install rpart first if you don't already have it. 38 | # install.packages("rpart") 39 | # rpart = recursive partitioning and regression trees (aka decision trees) 40 | library(rpart) 41 | 42 | # Review package help and vignette if available. 43 | # HINT: vignette covers all of this in much better detail. 44 | help(package = "rpart") 45 | 46 | # To be reproducible we need to set a seed due to randomness in the cross-validation. 47 | set.seed(1) 48 | 49 | # Fit a classification decision tree to predict Species using all other variables. 50 | # We don't need to specify method="class" because Species is a factor variable. 51 | # For regression we'd do method = "anova" (default if outcome variable is not a factor) 52 | tree_model = rpart(Species ~ ., data = iris) 53 | 54 | # Display the decision tree in text form. 55 | tree_model 56 | 57 | # Plot tree graphically. 58 | plot(tree_model, compress = T) 59 | # We have to add the plot text manually for some reason. 60 | # NOTE: you may need to select the plot() and text() lines and run them simultaneously 61 | # depending on your RStudio settings, e.g. 
if you get a "plot.new has not been called yet" error. 62 | text(tree_model, use.n = T) 63 | ``` 64 | 65 | Wow, this is one of the worst plots I've ever seen! Hard to get much worse than that. 66 | 67 | The help pages will give more details on the function arguments as well as handy examples. 68 | 69 | ```{r} 70 | # Review main decision tree function. 71 | ?rpart 72 | 73 | # Review the configuration options for trees. 74 | ?rpart.control 75 | 76 | # Same thing as above but with explicitly setting key options. 77 | # We specify 10 cross-validation folds to determine the best complexity. 78 | # Minbucket is the minimum number of observations in a node. 79 | # Tip: I put parentheses around the whole line so that the result is printed. 80 | (tree_model = rpart(Species ~ ., data = iris, 81 | control = rpart.control(xval = 10, minbucket = 5, cp = 0.01))) 82 | 83 | ``` 84 | 85 | 86 | Let's get a better decision tree plotting package. 87 | 88 | ```{r} 89 | # Install from CRAN if you don't already have this: 90 | # install.packages("rpart.plot") 91 | library(rpart.plot) 92 | 93 | rpart.plot(tree_model) 94 | 95 | # What other settings can we modify? 96 | ?rpart.plot 97 | 98 | # Review the vignette if interested. 99 | help(package = "rpart.plot") 100 | 101 | # Another way to plot it. 102 | library(partykit) 103 | plot(as.party(tree_model)) 104 | 105 | # fancyRpartPlot() in the rattle package is also good. 106 | 107 | ``` 108 | 109 | We can dig into the details of the tree a bit more. 110 | 111 | ```{r} 112 | # Review accuracy for different complexity parameters. 113 | # When nsplits = 0 we have 0 nodes and are merely guessing the most common class. 114 | # When nsplits is large we have 1 + # splits nodes and each node is its own prediction. 115 | printcp(tree_model) 116 | 117 | # Save the complexity parameter table, and also print. 118 | cp_table = printcp(tree_model) 119 | 120 | # Review structure of the cp table. 121 | str(cp_table) 122 | 123 | # Which row has minimum cross-validation error? 124 | # Alternatively we could choose the tree within 1 SD of the minimum. 125 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 126 | best_row 127 | best_row["CP"] 128 | 129 | # Get all the details on the tree. 130 | summary(tree_model, cp = best_row["CP"]) 131 | 132 | # Prune to the optimal complexity parameter (no change in this case). 133 | tree_model = prune(tree_model, cp = best_row["CP"]) 134 | 135 | tree_model 136 | ``` 137 | 138 | We did not create a separate holdout or test set, so let's predict back on the original data. 139 | 140 | ```{r} 141 | predictions = predict(tree_model, iris) 142 | summary(predictions) 143 | 144 | # How do the predictions look compared to the outcome data? 145 | data.frame(iris$Species, predictions) 146 | 147 | # This is an optimistic view because the model was built on this same data. 148 | # With a random holdout set we would get a more realistic view of accuracy. 149 | 150 | ``` 151 | 152 | ## Regression 153 | 154 | Quick regression example. 155 | ```{r} 156 | # This data is in the rpart package. 157 | data(car90) 158 | 159 | # Review structure of dataset. 160 | str(car90) 161 | 162 | # Set seed due to cross-validation randomness. 163 | set.seed(1) 164 | 165 | # Predict price using most other fields. 166 | # Remove a few fields that are too predictive (rim) or too many categories. 167 | reg_tree = rpart(Price ~ ., data = car90[, !names(car90) %in% c("Rim", "Tires", "Model2")]) 168 | 169 | # How'd it go? 
170 | reg_tree 171 | 172 | # Review complexity parameter options. 173 | printcp(reg_tree) 174 | 175 | # Visualize results across complexity parameter. 176 | rsq.rpart(reg_tree) 177 | 178 | # Save the complexit parameter table. 179 | cp_table = printcp(reg_tree) 180 | 181 | # Which row has minimum cross-validation error? 182 | (best_row = cp_table[which.min(cp_table[, "xerror"]), ]) 183 | best_row["CP"] 184 | 185 | # Review summary with the best complexity parameter. 186 | summary(reg_tree, cp = best_row["CP"]) 187 | 188 | # Prune our tree back to the best complexity parameter. 189 | # Note that in this case no real pruning is needed, because 190 | # the full tree is the best. 191 | reg_tree = prune(reg_tree, cp = best_row["CP"]) 192 | 193 | # Visualize our final tree. 194 | rpart.plot(reg_tree) 195 | 196 | ``` 197 | 198 | # Caret 199 | 200 | ```{r} 201 | library(caret) 202 | 203 | # Nice and simple - using default settings for everything. 204 | # caret tries 3 complexity parameters by default, but tuneLength customizes that. 205 | model = train(Species ~ ., data = iris, method = "rpart", tuneLength = 5) 206 | 207 | # We see again that cp= 0 gives us the best accuracy. 208 | model 209 | 210 | # Use the handy built-in caret plotting. 211 | plot(model) 212 | 213 | # Look at the final model object (rpart). 214 | model$finalModel 215 | ``` 216 | 217 | # SuperLearner 218 | 219 | SuperLearner unfortunately cannot do multiple-class classification (yet) so let's convert to a binary classification problem. 220 | 221 | ```{r} 222 | 223 | # Review 224 | table(iris$Species) 225 | 226 | # Copy into a new dataframe. 227 | data = iris 228 | 229 | # Convert Species to a binary indicator for setosa. 230 | data$Species = as.integer(data$Species == "versicolor") 231 | 232 | # Confirm distribution of modified outcome variable. 233 | table(data$Species, iris$Species, useNA = "ifany") 234 | 235 | library(SuperLearner) 236 | 237 | set.seed(1) 238 | 239 | # family = binomial() is used for classification; family = gaussian() for regression. 240 | sl = SuperLearner(X = data[, -5], Y = data$Species, family = binomial(), 241 | SL.library = c("SL.mean", "SL.rpart")) 242 | sl 243 | 244 | # Review the raw rpart object. 245 | sl$fitLibrary$SL.rpart_All$object 246 | 247 | # Use our nice plotting library. 248 | rpart.plot::rpart.plot(sl$fitLibrary$SL.rpart_All$object) 249 | 250 | ``` 251 | 252 | # h2o.ai 253 | 254 | We can get close to a single decision tree by using randomForest in h2o. We set RF to fit a single decision tree and to search all variables at each split. It will not be exactly the same due to boostrap sampling but will be similar. 255 | 256 | ```{r} 257 | # install.packages("h2o") # version 3.16 258 | # Or version 3.18: 259 | # install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/1/R") 260 | # Or nightly release (3.19): 261 | # install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/master/4203/R") 262 | library(h2o) 263 | 264 | # Start h2o backend. 265 | h2o.init(nthreads = -1) 266 | 267 | # Load iris data into h2o. 268 | iris_h2o = h2o.uploadFile(path = system.file("extdata", "iris_wheader.csv", 269 | package = "h2o"), 270 | destination_frame = "iris_h2o") 271 | 272 | # Confirm it loaded correctly. 273 | summary(iris_h2o) 274 | 275 | # Specify x and y by the column indices. 276 | # Set ntree to 1, and mtries to # of covariates. 277 | # Seed only reproducible when running single-threaded. 
278 | iris_tree = h2o.randomForest(y = 5, x = 1:4, training_frame = iris_h2o, 279 | ntrees = 1, mtries = 4, seed = 1) 280 | 281 | # Review results. 282 | iris_tree 283 | 284 | summary(iris_tree) 285 | 286 | # Review variable importance. 287 | h2o.varimp(iris_tree) 288 | 289 | # Plot variable importance - nice. 290 | h2o.varimp_plot(iris_tree) 291 | 292 | # Shutdown h2o backend. 293 | h2o.shutdown(prompt = F) 294 | ``` 295 | 296 | h2o debugging notes: 297 | 298 | * If you get a "connection refused" error it may mean that your version of Java is too new. 299 | * Java must be JDK 8; h2o does not yet support JDK 9. 300 | * More info here: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/faq/java.html 301 | * Info on how to install JDK8 with homebrew here: http://www.lonecpluspluscoder.com/2017/10/08/installing-other-versions-of-the-java-jdk-via-homebrew/ 302 | 303 | # mlr 304 | 305 | ```{r} 306 | library(mlr) 307 | 308 | # Generate the task for multiple classification (also works for binary). 309 | task = makeClassifTask(data = iris, target = "Species") 310 | 311 | # Get the number of observations 312 | n = getTaskSize(task) 313 | 314 | # Generate the learners. 315 | learners = list(makeLearner("classif.rpart", id = "rpart", predict.type = "prob")) 316 | 317 | # 5-fold cross-validation, stratifying on Y to ensure balance across folds. 318 | # could use stratify.cols to stratify on certain important covariates. 319 | rdesc = makeResampleDesc("CV", iters = 5L, stratify = TRUE) 320 | 321 | # Fit model across cross-validation folds and calculate the performance. 322 | result = benchmark(learners, task, rdesc, measures = list(acc, mmce)) 323 | 324 | # MMCE = mean misclassification error (i.e. 1 - accuracy) 325 | result 326 | 327 | # Plot the results. Generally we would plot multiple models here. 328 | plotBMRBoxplots(result, measure = acc) 329 | ``` 330 | 331 | 332 | # Decision tree references 333 | 334 | Awesome new data camp course: [Machine Learning with Tree-based Models in R](https://www.datacamp.com/courses/machine-learning-with-tree-based-models-in-r) 335 | 336 | * By Berkeley's own Erin LeDell, now machine learning scientist at h2o.ai 337 | 338 | This book has nearly everything you would want to know about the theory of decision trees: 339 | 340 | Breiman, L., Friedman, J., Stone, C. J., & Olshen, R. A. (1984). Classification and regression trees. CRC press. 341 | 342 | The book has 32,000 citations according to Google Scholar. Not too shabby! Breiman and Stone were both Berkeley professors, and Breiman invented Random Forest, bagging, and some of the theory for SuperLearner & gradient boosted machines. Friedman is at Stanford and invented many other machine learning algorithms, particularly gradient boosted machines GBM) and multivariate adaptive regression splines (MARS). Olshen is also at Stanford. 
343 | -------------------------------------------------------------------------------- /binder/apt.txt: -------------------------------------------------------------------------------- 1 | libnlopt-dev 2 | default-jdk -------------------------------------------------------------------------------- /binder/binder.md: -------------------------------------------------------------------------------- 1 | RStudio: [![Binder](http://mybinder.org/badge.svg)](http://beta.mybinder.org/v2/gh/dlab-berkeley/MachineLearningWG/master?urlpath=rstudio) -------------------------------------------------------------------------------- /binder/install.R: -------------------------------------------------------------------------------- 1 | install.packages("devtools") 2 | devtools::install_github(c("ecpolley/SuperLearner", "ck37/ck37r")) 3 | cran_packages = 4 | c("rpart", "rpart.plot", "partykit", "mlr", "car", "caret", 5 | "ggplot2", "lattice", "plotmo", "randomForest", "ROCR", 6 | "survival", "xgboost", "h2o", "glmnet") 7 | ck37r::load_packages(cran_packages, auto_install = TRUE) 8 | -------------------------------------------------------------------------------- /binder/runtime.txt: -------------------------------------------------------------------------------- 1 | r-2018-02-15 -------------------------------------------------------------------------------- /intro.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/intro.pptx --------------------------------------------------------------------------------