├── .gitignore ├── Fall2016 ├── README.md ├── install_R_Python │ └── installation_help.txt ├── intro.pptx ├── nov18_GBM │ ├── AutoML.ipynb │ ├── GBM.ipynb │ ├── R GBM walkthrough.Rmd │ └── R_GBM_walkthrough.html ├── nov4_LASSO │ ├── Python-RidgeLassoEN.ipynb │ ├── nov4_examples.rtf │ ├── penalized-regression.Rmd │ └── penalized-regression.html ├── oct13_decisionTrees │ ├── Horning 2016.pdf │ ├── Lewicki 2007.pdf │ ├── Python-DecisionTreesRF.ipynb │ └── r-decision-trees.Rmd └── oct21_randomForests │ ├── Python-RF.ipynb │ ├── R-random forests.Rmd │ └── tree.dot ├── Fall2017 ├── Fall2017info ├── Sep22-images-cnn │ ├── Intro_to_CNNs_in_Python.ipynb │ ├── images-cnn-R.Rmd │ ├── images-cnn-R.html │ ├── images-cnn-R │ │ ├── image_001.jpg │ │ ├── image_002.jpg │ │ ├── image_003.jpg │ │ ├── image_004.jpg │ │ ├── image_005.jpg │ │ ├── image_006.jpg │ │ ├── image_007.jpg │ │ ├── image_008.jpg │ │ ├── image_009.jpg │ │ ├── image_010.jpg │ │ ├── image_011.jpg │ │ ├── image_012.jpg │ │ ├── image_013.jpg │ │ ├── image_014.jpg │ │ ├── image_015.jpg │ │ ├── image_016.jpg │ │ ├── image_017.jpg │ │ ├── image_018.jpg │ │ ├── image_019.jpg │ │ └── image_020.jpg │ ├── imgs │ │ ├── alexnet.jpeg │ │ ├── cnn.jpeg │ │ ├── conv_box.gif.png │ │ ├── conv_gif copy.gif.png │ │ ├── conv_gif.gif │ │ ├── depthcol.jpeg │ │ ├── maxpool.jpeg │ │ ├── neural_net2.jpeg │ │ └── pool1.jpeg │ └── utils │ │ └── util.py └── Sep8-neural-nets │ ├── Neural Networks.ipynb │ ├── nn-from-scratch-3-layer-network.png │ └── r-neural-nets.Rmd ├── Fall2018 ├── 1-sep5-PCA │ ├── PCA-R.Rmd │ ├── PCA-R.html │ ├── PCA-python.ipynb │ └── iris.csv ├── 2-sep19-k-means │ ├── k-means-ucr.Rmd │ ├── k-means-ucr.html │ └── readme.md ├── 3-oct3-hier_agg_clust │ ├── Oct3-hier_agg_clust.Rmd │ └── Oct3-hier_agg_clust.html ├── 4-medoids │ ├── medoid-clustering.Rmd │ └── readme.md ├── 5-Oct30-tSNE │ ├── r-tSNE.Rmd │ └── r-tSNE.html └── 6-nov14-umap │ ├── UMAP- ML Working Group.ipynb │ ├── umap-r.Rmd │ └── umap-r.html ├── LICENSE ├── MachineLearningWG.Rproj ├── Math4ML_2017 ├── Math4ML notes July 19.docx ├── Math4ML notes July 5th .docx ├── Math4MLJune7.docx ├── Math4MLMay24.docx └── README.md ├── R and Python installation help.txt ├── README.md ├── Spring2017 ├── Apr14-svm │ ├── SVM basics.ipynb │ ├── proj.png │ └── r-svm.Rmd ├── Apr28-neural-nets │ ├── Neural Networks.ipynb │ ├── nn-from-scratch-3-layer-network.png │ └── r-neural-nets.Rmd ├── Feb17-stepwise │ ├── r-stepwise-selection.Rmd │ ├── r-stepwise-selection.html │ └── stepwise-regression.ipynb ├── Feb3-knn │ ├── Feb3kNN-R.Rmd │ └── KNN-python.ipynb ├── Mar17-gam and mars │ ├── Mar3-gamearth-R.Rmd │ ├── Mar3-gamearth-R.html │ ├── Splines_Take_Two.ipynb │ └── macro.csv ├── Mar3-reg and splines │ ├── Mar3-regsplines-R.Rmd │ ├── Mar3-regsplines-R_files │ │ └── figure-html │ │ │ └── unnamed-chunk-2-1.png │ ├── Mar3-regsplines-py.ipynb │ └── macro.csv ├── May12-lightning │ └── CNNs with Keras.ipynb ├── data │ ├── Boston.csv │ ├── BreastCancer.csv │ └── sleep_VIM.csv └── spring 2017 schedule.rtf ├── Spring2018 ├── Apr11-BoostingTrees │ ├── GBM.ipynb │ └── boosting-R.Rmd ├── Apr25 - Elastic Net │ ├── Elastic Net.ipynb │ └── elastic-net.Rmd ├── Feb28-randomForest │ ├── Random Forest Python.ipynb │ ├── Random Forest R.Rmd │ └── Random_Forest_R.html ├── Jan31-knn │ ├── Jan31knn-R.Rmd │ ├── Jan31knn-R.html │ └── kNN.ipynb └── decision-trees-feb14 │ ├── Python-DecisionTreesRF.ipynb │ └── decision-trees-r.Rmd ├── binder ├── apt.txt ├── binder.md ├── install.R └── runtime.txt └── intro.pptx /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rhistory 3 | .Rproj.user 4 | .Rhistory 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | # C extensions 10 | *.so 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .hypothesis/ 46 | # Translations 47 | *.mo 48 | *.pot 49 | # Django stuff: 50 | *.log 51 | local_settings.py 52 | # Flask stuff: 53 | instance/ 54 | .webassets-cache 55 | # Scrapy stuff: 56 | .scrapy 57 | # Sphinx documentation 58 | docs/_build/ 59 | # PyBuilder 60 | target/ 61 | # IPython Notebook 62 | .ipynb_checkpoints 63 | # pyenv 64 | .python-version 65 | # celery beat schedule file 66 | celerybeat-schedule 67 | # dotenv 68 | .env 69 | # virtualenv 70 | venv/ 71 | ENV/ 72 | # Spyder project settings 73 | .spyderproject 74 | # Rope project settings 75 | .ropeproject 76 | mars2 plotmo.pdf 77 | -------------------------------------------------------------------------------- /Fall2016/README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Working Group 2 | 3 | Fridays, 12-1pm in 356 Barrows Hall 4 | 5 | Fall 2016 Schedule 6 | 7 | * September 23 - Introductory meeting 8 | * October 7 - [Decision trees](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/oct13_decisionTrees) 9 | * October 21 - [Random forests](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/oct21_randomForests) 10 | * November 4 - [Penalized regression](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/nov4_LASSO) - lasso, ridge, elastic net 11 | * November 18 - [Evan's skull dataset and GBM](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/MLWG_Fall2016/nov18_GBM) 12 | * December 2 13 | 14 | Spring 2017 Schedule - to be determined, topics welcome! 15 | 16 | More information on the [D-Lab website](http://dlab.berkeley.edu/working-groups/machine-learning-working-group) 17 | 18 | ## Resources 19 | 20 | Books: 21 | 22 | * Intro to Statistical Learning [(free pdf)](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) [(Amazon page)](https://smile.amazon.com/Introduction-Statistical-Learning-Applications-Statistics-ebook/dp/B01IBM7790/) by Gareth James et al. 23 | * [Applied Predictive Modeling](https://smile.amazon.com/Applied-Predictive-Modeling-Max-Kuhn-ebook/dp/B00K15TZU0/) by Max Kuhn 24 | * Elements of Statistical Learning 25 | * Many others (any recommendations?) 
26 | 27 | Courses at Berkeley: 28 | 29 | * Stat 154 - Statistical Learning 30 | * CS 189 / CS 289A - Machine Learning 31 | * PH 252D - Causal Inference 32 | * PH 295 - Big Data 33 | * PH 295 - Targeted Learning for Biomedical Big Data 34 | * INFO - TBD 35 | 36 | Coursera and other online classes 37 | 38 | * To add 39 | 40 | D-Lab Machine Learning Trainings 41 | 42 | * D-Lab - Intro to Machine Learning 43 | * Erin LeDell - h2o.ai 44 | * Rochelle Terman - scikit-learn 45 | 46 | [Specifics on the D-Lab calendar](http://dlab.berkeley.edu/calendar-node-field-date) 47 | 48 | Other Campus Groups 49 | 50 | * [Machine Learning @ Berkeley](https://ml.berkeley.edu/) 51 | * D-Lab's Cloud Computing Working Group 52 | * D-Lab's Computational Text Analysis Working Group 53 | * [The Hacker Within](http://www.thehackerwithin.org/berkeley/) / Berkeley Institute for Data Science 54 | -------------------------------------------------------------------------------- /Fall2016/install_R_Python/installation_help.txt: -------------------------------------------------------------------------------- 1 | Before class, please download and install R Studio: 2 | https://www.rstudio.com/products/rstudio/download3/ 3 | 4 | If your installation does not work and says you need to install the binary files, please do so here: 5 | https://cloud.r-project.org/ 6 | 7 | Also download and install Python by following these instructions: 8 | https://github.com/dlab-berkeley/python-intensive/blob/master/Install.md 9 | (you can also just pip install scikit-learn if you have Python but not Anaconda). 10 | -------------------------------------------------------------------------------- /Fall2016/intro.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/intro.pptx -------------------------------------------------------------------------------- /Fall2016/nov18_GBM/AutoML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Auto ML Regression:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "np.random.seed(1)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## auto-sklearn" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "from sklearn.datasets import load_boston\n", 38 | "from sklearn.cross_validation import train_test_split\n", 39 | "from sklearn import preprocessing\n", 40 | "\n", 41 | "boston = load_boston()\n", 42 | "\n", 43 | "X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,\n", 44 | " train_size=0.8, test_size=0.2)\n", 45 | "\n", 46 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 47 | "X_train = scaler.transform(X_train)\n", 48 | "X_test = scaler.transform(X_test)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "from autosklearn.regression import AutoSklearnRegressor\n", 60 | "import sklearn.cross_validation\n", 61 | "import sklearn.metrics\n", 62 | "\n", 63 | "automl_r = 
AutoSklearnRegressor(time_left_for_this_task=100)\n", 64 | " #include_estimators={\"gradient_boosting\": ()}) # time_left_for_this_task=100\n", 65 | "automl_r.fit(X_train, y_train)\n", 66 | "y_hat = automl_r.predict(X_test)\n", 67 | "print(\"R2 score\", sklearn.metrics.r2_score(y_test, y_hat))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "### Get final ensemble:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "print(automl_r.show_models())" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Get iteration scores:\n", 93 | "From docs: `(list of named tuples) Contains scores for all parameter combinations in param_grid. Each entry corresponds to one parameter setting. Each named tuple has the attributes: * parameters, a dict of parameter settings * mean_validation_score, the mean score over the cross-validation folds * cv_validation_scores, the list of scores for each fold`" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "automl_r.grid_scores_" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Cross-validation results:\n", 112 | "\n", 113 | "From docs: `(dict of numpy (masked) ndarrays) A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame. This attribute is a backward port to already support the advanced output of scikit-learn 0.18. Not all keys returned by scikit-learn are supported yet.`" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": { 120 | "collapsed": false 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "automl_r.cv_results_" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## TPOT" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "from tpot import TPOTRegressor\n", 143 | "\n", 144 | "tpot = TPOTRegressor(generations=5, population_size=20, verbosity=2) # generations for optimization, , pop size is models\n", 145 | "tpot.fit(X_train, y_train)\n", 146 | "print(tpot.score(X_test, y_test))\n", 147 | "tpot.export('tpot_boston_pipeline.py')" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "# AutoML Classification" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "collapsed": true 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "from sklearn.datasets import load_iris\n", 166 | "from sklearn.cross_validation import train_test_split\n", 167 | "\n", 168 | "iris = load_iris()\n", 169 | "X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,\n", 170 | " train_size=0.75, test_size=0.25)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## auto-sklearn" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "from autosklearn.classification import AutoSklearnClassifier\n", 189 | "import 
sklearn.cross_validation\n", 190 | "import sklearn.metrics\n", 191 | "\n", 192 | "automl_cl = AutoSklearnClassifier() # time_left_for_this_task=100\n", 193 | "automl_cl.fit(X_train, y_train)\n", 194 | "y_hat = automl_cl.predict(X_test)\n", 195 | "print(\"Accuracy score\", sklearn.metrics.accuracy_score(y_test, y_hat))" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Get final ensemble:" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "print(automl_cl.show_models())" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "### Get iteration scores:\n", 221 | "From docs: `(list of named tuples) Contains scores for all parameter combinations in param_grid. Each entry corresponds to one parameter setting. Each named tuple has the attributes: * parameters, a dict of parameter settings * mean_validation_score, the mean score over the cross-validation folds * cv_validation_scores, the list of scores for each fold`" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": { 228 | "collapsed": false 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "automl_cl.grid_scores_" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### Cross-validation results:\n", 240 | "\n", 241 | "From docs: `(dict of numpy (masked) ndarrays) A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame. This attribute is a backward port to already support the advanced output of scikit-learn 0.18. Not all keys returned by scikit-learn are supported yet.`" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": false 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "automl_cl.cv_results_" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "## TPOT" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [], 269 | "source": [ 270 | "from tpot import TPOTClassifier\n", 271 | "from sklearn.datasets import load_digits\n", 272 | "from sklearn.cross_validation import train_test_split\n", 273 | "\n", 274 | "tpot = TPOTClassifier(generations=5, population_size=20, verbosity=2)\n", 275 | "tpot.fit(X_train, y_train)\n", 276 | "print(tpot.score(X_test, y_test))\n", 277 | "tpot.export('tpot_iris_pipeline.py')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [] 288 | } 289 | ], 290 | "metadata": { 291 | "anaconda-cloud": {}, 292 | "kernelspec": { 293 | "display_name": "Python [conda root]", 294 | "language": "python", 295 | "name": "conda-root-py" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.5.2" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 1 312 | } 313 | -------------------------------------------------------------------------------- /Fall2016/nov18_GBM/R 
GBM walkthrough.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R GBM walkthrough" 3 | author: "Evan Muzzall" 4 | date: "November 18, 2016" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | # 0. Freund and Schapire 1999 - background 12 | From [Freund and Schapire 1999](https://cseweb.ucsd.edu/~yfreund/papers/IntroToBoosting.pdf). 13 | "Boosting is a general method for improving the accuracy of any given learning algorithm" and originated with AdaBoost and PAC learning (p. 1-2). Gradient boosted machines are ensemble decision tree methods that combine "weak" trees, each only slightly more accurate than random guessing, which are then "boosted" into "strong" learners. That is, the models don't have to be accurate over the entire feature space. 14 | 15 | The model first tries to predict each value in the dataset - the cases that can be predicted easily are _downweighted_ so that the algorithm does not have to try as hard to predict them. 16 | 17 | However, the cases that the model has difficulty predicting are _upweighted_ so that the model tries more assertively to predict them. This continues for multiple "boosting" iterations. A resample-based performance measure is produced at each iteration. Error is measured on the weak learners so that even performing slightly better than random guessing improves accuracy fast (p.2). This method can drive down generalization error, thus preventing overfitting (p. 5). While it is susceptible to noise, it can be used to identify outliers. 18 | 19 | # 1. install packages 20 | ```{r, eval=FALSE} 21 | install.packages("car", dependencies=TRUE) 22 | install.packages("caret", dependencies=TRUE) 23 | install.packages("pROC", dependencies=TRUE) 24 | ``` 25 | ```{r, eval=FALSE} 26 | library(car) 27 | library(caret) 28 | library(pROC) 29 | ``` 30 | 31 | # 2. load the Mroz dataset 32 | ```{r} 33 | library(car) 34 | data(Mroz) 35 | str(Mroz) 36 | ``` 37 | ### 2.1 See variable definitions with `?Mroz` 38 | 39 | # 3. use createDataPartition() to create a 75/25 stratified random split 40 | ```{r} 41 | library(caret) 42 | split <- createDataPartition(Mroz$lfp, p=0.75, list=FALSE) 43 | training.set <- Mroz[split,] 44 | test.set <- Mroz[-split,] 45 | nrow(training.set) + nrow(test.set) == nrow(Mroz) # sanity check 46 | ``` 47 | 48 | `createDataPartition` = creates a stratified random split 49 | training.set = train the model here 50 | test.set = does the trained model maintain its performance here? 51 | 52 | # 4. train() a GBM model 53 | ```{r} 54 | set.seed(1) 55 | gbm.fit1 <- train(lfp ~ ., data=training.set, method="gbm", verbose=FALSE) 56 | ``` 57 | 58 | `train()` holds the tuning parameters; it fits each candidate model and then calculates a resampling-based performance metric 59 | 60 | In the model formula, " . " stands for "all other variables in the data". 61 | 62 | ### 4.1 model summary 63 | View a model summary table by calling the object. caret shows us the optimal model based on its attributes. 64 | ```{r} 65 | gbm.fit1 66 | ``` 67 | 68 | interaction.depth = tree depth/complexity 69 | n.trees = number of boosting iterations 70 | Accuracy = overall agreement rate averaged over the cross-validated boosting iterations 71 | Kappa = Cohen's unweighted kappa averaged across resampling results (1 = perfect agreement) 72 | 73 | ### 4.2 Plot bargraph of variable relative influence with summary() 74 | ```{r} 75 | summary(gbm.fit1, las=2, main="GBM relative influence") 76 | gbm.fit1$times 77 | ``` 78 | 
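Not part of the original walkthrough, but a small optional sketch: caret's `train` object stores the selected tuning parameters and the full resampling table, so you can pull them out programmatically instead of reading the printed summary. `bestTune`, `results`, and `getTrainPerf()` are standard caret accessors; `gbm.fit1` is the object fit above.

```{r, eval=FALSE}
# Tuning parameter combination that caret selected
gbm.fit1$bestTune

# Resampling table: one row per tuning combination, with Accuracy and Kappa
head(gbm.fit1$results)

# Mean resampled performance of the selected model
getTrainPerf(gbm.fit1)
```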
79 | # 5. trainControl() and expand.grid() 80 | ### 5.1 define the parameters of the control mechanism with `trainControl()` 81 | ```{r} 82 | control <- trainControl(method="repeatedcv", 83 | repeats=5, 84 | classProbs=TRUE, 85 | summaryFunction=twoClassSummary) 86 | ``` 87 | 88 | method = "repeatedcv": repeated k-fold cross-validation to measure the predictive performance of the model 89 | repeats = number of times to repeat the cross-validation 90 | classProbs = this will calculate predicted class probabilities (ROC) within the resampling process (Kuhn, 2015:4) 91 | summaryFunction = uses observed versus predicted values to estimate performance (AUC, sensitivity, specificity) (Kuhn, 2015:4) 92 | 93 | ### 5.2 compare multiple models at once with `expand.grid()` 94 | ```{r} 95 | grid <- expand.grid(n.trees=seq(100,2100, by=100), 96 | interaction.depth=c(1,3,5), 97 | shrinkage=c(0.01,0.05, 0.1), 98 | n.minobsinnode=10) 99 | ``` 100 | 101 | n.trees = number of boosting iterations 102 | interaction.depth = tree depth/complexity 103 | shrinkage = learning rate of the algorithm; how quickly the model adapts to the data at each iteration 104 | n.minobsinnode = minimum number of observations needed to commence splitting 105 | 106 | 107 | 108 | 109 | ```{r} 110 | set.seed(1) 111 | gbm.fit2 <- train(lfp ~ ., data=training.set, 112 | method="gbm", 113 | metric="ROC", 114 | trControl=control, 115 | tuneGrid=grid, 116 | verbose=FALSE) 117 | gbm.fit2$times 118 | ``` 119 | 120 | verbose = whether to print lengthy output (`TRUE` or `FALSE`) 121 | 122 | ### 5.3 model summary table 123 | ```{r} 124 | gbm.fit2 125 | ``` 126 | 127 | ### 5.4 bargraph of variable relative influence 128 | ```{r} 129 | summary(gbm.fit2, las=2) 130 | ``` 131 | 132 | # 6. ggplot line graph of the tuned models 133 | ```{r} 134 | ggplot(gbm.fit2) + theme_grey() + ggtitle("Model comparisons") 135 | ``` 136 | 137 | Want to learn more about ggplot2 themes? :) See [the ggplot2 themes help page](http://docs.ggplot2.org/dev/vignettes/themes.html) 138 | 139 | # 7. generate GBM predicted values and probabilities with `predict()` 140 | ```{r} 141 | set.seed(1) 142 | gbm.pred <- predict(gbm.fit2, test.set) 143 | gbm.prob <- predict(gbm.fit2, test.set, type="prob") 144 | ``` 145 | 146 | `predict()` = generates predictions from fitted model objects 147 | 148 | ### 7.1 view the GBM confusion matrix 149 | ```{r} 150 | gbm.cm <- confusionMatrix(gbm.pred, test.set$lfp) 151 | gbm.cm 152 | ``` 153 | 154 | A confusion/error matrix is a cross-tabulation of observed versus predicted classes. 155 | 156 | # 8. plot GBM ROC curve 157 | ```{r} 158 | library(pROC) 159 | rocCurve <- roc(response=test.set$lfp, 160 | predictor = gbm.prob[, "yes"], 161 | levels = rev(levels(test.set$lfp)), 162 | auc=TRUE, ci=TRUE) 163 | ``` 164 | 165 | ```{r} 166 | plot(rocCurve, main="GBM", col="blue", col.main="blue", col.lab="blue") 167 | ``` 168 | 169 | # Help 170 | * The [caret help page](https://topepo.github.io/caret/) 171 | 172 | * [Package 'caret'](https://cran.r-project.org/web/packages/caret/caret.pdf) 173 | 174 | * Kuhn M. 2008. [Building predictive models in R using the caret package](https://www.jstatsoft.org/article/view/v028i05/v28i05.pdf). J Stat Softw 28:1-26. 175 | 176 | * Kuhn M. 2013. 
[Predictive modeling with R and the caret package](https://www.r-project.org/nosvn/conferences/useR-2013/Tutorials/kuhn/user_caret_2up.pdf). useR! The R User Conference, July 10-12, University of Castilla-La Mancha, Albacete, Spain 177 | 178 | * Kuhn M. 2015. [A Short Introduction to the caret Package](https://cran.r-project.org/web/packages/caret/vignettes/caret.pdf). 179 | -------------------------------------------------------------------------------- /Fall2016/nov4_LASSO/nov4_examples.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1404\cocoasubrtf470 2 | {\fonttbl\f0\fnil\fcharset0 Calibri;} 3 | {\colortbl;\red255\green255\blue255;\red0\green0\blue233;} 4 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 5 | \deftab720 6 | \pard\pardeftab720\sl280\partightenfactor0 7 | 8 | \f0\fs32 \cf0 \expnd0\expndtw0\kerning0 9 | Python example: \ 10 | \pard\pardeftab720\sl280\partightenfactor0 11 | {\field{\*\fldinst{HYPERLINK "https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/"}}{\fldrslt \cf2 \ul \ulc2 https://www.analyticsvidhya.com/blog/2016/01/complete-tutorial-ridge-lasso-regression-python/}}\ 12 | \ 13 | R example:\ 14 | \pard\pardeftab720\sl280\partightenfactor0 15 | {\field{\*\fldinst{HYPERLINK "http://machinelearningmastery.com/penalized-regression-in-r/"}}{\fldrslt \cf2 \ul \ulc2 http://machinelearningmastery.com/penalized-regression-in-r/}}} -------------------------------------------------------------------------------- /Fall2016/nov4_LASSO/penalized-regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Penalized regression in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | # Data prep 8 | 9 | ```{r} 10 | library(MASS) 11 | data(Boston) 12 | help(Boston) 13 | str(Boston) 14 | summary(Boston) 15 | 16 | # Our outcome is median home value. 17 | outcome = "medv" 18 | 19 | # Divide into 80% training, 20% test split. 20 | library(caret) 21 | set.seed(1) 22 | train_index = caret::createDataPartition(Boston[, outcome], p = .8, 23 | list = F, 24 | times = 1) 25 | 26 | # Glmnet wants the data to be matrices, not data frames. 27 | X_train = as.matrix(Boston[train_index, !names(Boston) == outcome]) 28 | X_test = as.matrix(Boston[-train_index, !names(Boston) == outcome]) 29 | 30 | Y_train = Boston[train_index, outcome] 31 | Y_test = Boston[-train_index, outcome] 32 | 33 | dim(X_train) 34 | length(Y_train) 35 | 36 | dim(X_test) 37 | length(Y_test) 38 | ``` 39 | 40 | 41 | # Lasso 42 | 43 | Lasso penalizes coefficients and imposes sparsity, so some coefficients may be shrunk to 0 if they do not appear to be related to the outcome. 44 | 45 | ```{r} 46 | library(glmnet) 47 | # Fit the lasso to continuous Y 48 | reg = cv.glmnet(X_train, Y_train, family = "gaussian", alpha = 1) 49 | 50 | # Look at distribution of penalty term lambda. 51 | plot(reg) 52 | 53 | # Plot the underlying glmnet object, showing 54 | # coefficients for differnt lambda values. 55 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 56 | 57 | # Lambda with minimum mean-squared error. 58 | reg$lambda.min 59 | 60 | # Higher lambda within 1SE of performance of the minimum. 61 | # (the "one standard error" rule from Leo Breiman.) 62 | reg$lambda.1se 63 | 64 | # Review coeffients 65 | coef(reg, s = "lambda.1se") 66 | 67 | # What about for lambda.min? 68 | coef(reg, s = "lambda.min") 69 | 70 | # Predict on test set. 
71 | pred = predict(reg, s = reg$lambda.1se, newx = X_test) 72 | 73 | # Calculate mean-squared error. 74 | mean((pred - Y_test)^2) 75 | ``` 76 | 77 | # Ridge 78 | 79 | Ridge penalizes the coefficients but does not impose sparsity, so no coefficient will ever be 0. 80 | 81 | ```{r} 82 | 83 | # Fit the ridge to continuous Y 84 | # We just change alpha to 0 to get ridge regression. 85 | reg = cv.glmnet(X_train, Y_train, family = "gaussian", alpha = 0) 86 | 87 | # Look at distribution of penalty term lambda. 88 | plot(reg) 89 | 90 | # Plot the underlying glmnet object, showing 91 | # coefficients for differnt lambda values. 92 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 93 | 94 | # Predict on test set. 95 | pred = predict(reg, s = reg$lambda.1se, newx = X_test) 96 | 97 | # Calculate mean-squared error. 98 | mean((pred - Y_test)^2) 99 | ``` 100 | 101 | As expected, we do a little worse with ridge compared to lasso. 102 | 103 | # Elastic net 104 | 105 | ```{r} 106 | train_control = trainControl(method = "repeatedCV", 107 | number = 10, 108 | repeats = 3) 109 | 110 | set.seed(1) 111 | 112 | # Create a custom tuning grid. 113 | enet_grid = expand.grid(alpha = seq(0, 1, length.out = 5), 114 | lambda = 2^seq(-1, -7, length = 5)) 115 | 116 | # Review the grid. 117 | enet_grid 118 | 119 | # To be simpler we could just say e.g. tuneLength = 5. 120 | 121 | enet = train(X_train, Y_train, method = "glmnet", 122 | #tuneLength = 5, 123 | tuneGrid = enet_grid, 124 | trControl = train_control) 125 | 126 | print(enet) 127 | 128 | plot(enet) 129 | 130 | enet$bestTune 131 | 132 | # Predict on test. 133 | pred = predict(enet, X_test) 134 | 135 | # Review performance 136 | mean((pred - Y_test)^2) 137 | ``` 138 | 139 | # References 140 | 141 | Intro to Statistical Learning, Chapter 6 142 | 143 | [Glmnet vignette by Hastie and Qian](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) - lots of great code examples -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/Horning 2016.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/oct13_decisionTrees/Horning 2016.pdf -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/Lewicki 2007.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2016/oct13_decisionTrees/Lewicki 2007.pdf -------------------------------------------------------------------------------- /Fall2016/oct13_decisionTrees/r-decision-trees.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision trees for machine learning" 3 | output: html_notebook 4 | --- 5 | 6 | Topics 7 | 8 | * rpart 9 | * Caret 10 | * SuperLearner 11 | * h2o.ai 12 | * mlr 13 | * book 14 | 15 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Use the latest RStudio preview release to run within RStudio. 16 | 17 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 18 | 19 | ```{r} 20 | # Load iris dataset. 21 | data(iris) 22 | 23 | # Review data structure. 
24 | str(iris) 25 | 26 | # Review species distribution. 27 | table(iris$Species, useNA = "ifany") 28 | ``` 29 | 30 | ```{r} 31 | # install rpart first if you don't already have it. 32 | # rpart = recursive partitioning and regression trees (aka decision trees) 33 | library(rpart) 34 | 35 | # Review package help and vignette if available. 36 | # HINT: vignette covers all of this in much better detail. 37 | help(package = "rpart") 38 | 39 | # Review main decision tree function. 40 | ?rpart 41 | 42 | # Review the configuration options for trees. 43 | ?rpart.control 44 | 45 | # We need to set a seed due to randomness in the cross-validation. 46 | set.seed(1) 47 | 48 | # Fit a classification decision tree to predict Species using all other variables. 49 | # We don't need to specify method="class" because Species is a factor variable. 50 | # We specify 10 cross-validation folds to determine the best complexity. 51 | # Minbucket is the minimum number of observations in a node. 52 | tree_model = rpart(Species ~ ., data = iris, 53 | control = rpart.control(xval = 10, minbucket = 5, cp = 0)) 54 | 55 | # Display the decision tree in text form. 56 | tree_model 57 | 58 | # Plot tree graphically. 59 | plot(tree_model, compress = T) 60 | # We have to add the plot text manually for some reason. 61 | text(tree_model, use.n = T) 62 | ``` 63 | 64 | Wow, this is one of the worst plots I've ever seen! Hard to get much worse than that. 65 | 66 | Let's tree a better decision tree plotting package. 67 | 68 | ```{r} 69 | # Install from CRAN if you don't already have this: 70 | library(rpart.plot) 71 | 72 | rpart.plot(tree_model) 73 | 74 | # What other settings can we modify? 75 | ?rpart.plot 76 | 77 | # Review the vignette if interested. 78 | help(package = "rpart.plot") 79 | 80 | # Another way to plot it. 81 | library(partykit) 82 | plot(as.party(tree_model)) 83 | 84 | # fancyRpartPlot() in the rattle package is also good. 85 | 86 | ``` 87 | 88 | We can dig into the details of the tree a bit more. 89 | 90 | ```{r} 91 | # Review accuracy for different complexity parameters. 92 | # When nsplits = 0 we have 0 nodes and are just guessing the most common class. 93 | # When nsplits is large we have 1 + # splits nodes and each node is its own prediction. 94 | printcp(tree_model) 95 | 96 | # Save the complexit parameter table. 97 | cp_table = printcp(tree_model) 98 | 99 | # Review structure of the cp table. 100 | str(cp_table) 101 | 102 | # Which row has minimum cross-validation error? 103 | # Alternatively we could choose the tree within 1 SD of the minimum. 104 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 105 | best_row 106 | best_row["CP"] 107 | 108 | # Get all the details on the tree. 109 | summary(tree_model, cp = best_row["CP"]) 110 | 111 | # Prune to the optimal complexity parameter (no change in this case). 112 | tree_model = prune(tree_model, cp = best_row["CP"]) 113 | 114 | tree_model 115 | ``` 116 | 117 | We did not create a separate holdout or test set, so let's predict back on the original data. 118 | 119 | ```{r} 120 | predictions = predict(tree_model, iris) 121 | summary(predictions) 122 | 123 | # How do the predictions look compared to the outcome data? 124 | data.frame(iris$Species, predictions) 125 | 126 | # This is an optimistic view because the model was built on this same data. 127 | # With a random holdout set we would get a more realistic view of accuracy. 128 | 129 | ``` 130 | 131 | ## Regression 132 | 133 | Quick regression example. 
134 | ```{r} 135 | # This data is in the rpart package. 136 | data(car90) 137 | 138 | # Review structure of dataset. 139 | str(car90) 140 | 141 | # Set seed due to cross-validation randomness. 142 | set.seed(1) 143 | 144 | # Predict price using most other fields. 145 | # Remove a few fields that are too predictive (rim) or too many categories. 146 | reg_tree = rpart(Price ~ ., data = car90[, !names(car90) %in% c("Rim", "Tires", "Model2")]) 147 | 148 | # How'd it go? 149 | reg_tree 150 | 151 | # Review complexity parameter options. 152 | printcp(reg_tree) 153 | 154 | # Visualize results across complexity parameter. 155 | rsq.rpart(reg_tree) 156 | 157 | # Save the complexit parameter table. 158 | cp_table = printcp(reg_tree) 159 | 160 | # Which row has minimum cross-validation error? 161 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 162 | best_row 163 | best_row["CP"] 164 | 165 | # Review summary with the best complexity parameter. 166 | summary(reg_tree, cp = best_row["CP"]) 167 | 168 | # Prune our tree back to the best complexity parameter. 169 | # Note that in this case no real pruning is needed, because 170 | # the full tree is the best. 171 | reg_tree = prune(reg_tree, cp = best_row["CP"]) 172 | 173 | # Visualize our final tree. 174 | rpart.plot(reg_tree) 175 | 176 | ``` 177 | 178 | # Caret 179 | 180 | ```{r} 181 | library(caret) 182 | 183 | # Nice and simple - using default settings for everything. 184 | # caret tries 3 complexity parameters by default, but tuneLength customizes that. 185 | model = train(Species ~ ., data = iris, method = "rpart", tuneLength = 5) 186 | 187 | # We see again that cp= 0 gives us the best accuracy. 188 | model 189 | 190 | # Use the handle built-in caret plotting. 191 | plot(model) 192 | 193 | # Look at the final model object (rpart). 194 | model$finalModel 195 | ``` 196 | 197 | # SuperLearner 198 | 199 | SuperLearner unfortunately cannot do multiple-class classification (yet) so let's convert to a binary classification problem. 200 | 201 | ```{r} 202 | 203 | # Review 204 | table(iris$Species) 205 | 206 | # Copy into a new dataframe. 207 | data = iris 208 | 209 | # Convert Species to a binary indicator for setosa. 210 | data$Species = 1*(data$Species == "versicolor") 211 | 212 | # Confirm distribution of modified outcome variable. 213 | table(data$Species, iris$Species, useNA = "ifany") 214 | 215 | library(SuperLearner) 216 | 217 | set.seed(1) 218 | 219 | sl = SuperLearner(X = data[, -5], Y = data$Species, family = binomial(), 220 | SL.library = c("SL.mean", "SL.rpart")) 221 | sl 222 | 223 | # Review the raw rpart object. 224 | sl$fitLibrary$SL.rpart_All$object 225 | 226 | # Use our nice plotting library. 227 | rpart.plot::rpart.plot(sl$fitLibrary$SL.rpart_All$object) 228 | 229 | ``` 230 | 231 | # h2o.ai 232 | 233 | We can get close to a single decision tree by using randomForest in h2o. We set RF to fit a single decision tree and to search all variables at each split. It will not be exactly the same due to boostrap sampling but will be similar. 234 | 235 | ```{r} 236 | library(h2o) 237 | 238 | # Start h2o backend. 239 | h2o.init() 240 | 241 | # Load iris data into h2o. 242 | iris_h2o = h2o.uploadFile(path = system.file("extdata", "iris_wheader.csv", package="h2o"), 243 | destination_frame = "iris_h2o") 244 | 245 | # Confirm it loaded correctly. 246 | summary(iris_h2o) 247 | 248 | # Specify x and y by the column indices. 249 | # Set ntree to 1, and mtries to # of covariates. 250 | # Seed only reproducible when running single-threaded. 
251 | iris_tree = h2o.randomForest(y = 5, x = 1:4, training_frame = iris_h2o, 252 | ntrees = 1, mtries = 4, seed = 1) 253 | 254 | # Review results. 255 | iris_tree 256 | 257 | summary(iris_tree) 258 | 259 | # Review variable importance. 260 | h2o.varimp(iris_tree) 261 | 262 | # Plot variable importance - nice. 263 | h2o.varimp_plot(iris_tree) 264 | 265 | # Shutdown h2o backend. 266 | h2o.shutdown(prompt = F) 267 | ``` 268 | 269 | # mlr 270 | 271 | ```{r} 272 | library(mlr) 273 | 274 | # Generate the task for multiple classification (also works for binary). 275 | task = makeClassifTask(data = iris, target = "Species") 276 | 277 | # Get the number of observations 278 | n = getTaskSize(task) 279 | 280 | # Generate the learners. 281 | learners = list(makeLearner("classif.rpart", id = "rpart", predict.type = "prob")) 282 | 283 | # 5-fold cross-validation, stratifying on Y to ensure balance across folds. 284 | # could use stratify.cols to stratify on certain important covariates. 285 | rdesc = makeResampleDesc("CV", iters = 5, stratify = T) 286 | 287 | # Fit model across cross-validation folds and calculate the performance. 288 | result = benchmark(learners, task, rdesc, measures = list(acc, mmce)) 289 | 290 | # MMCE = mean misclassification error (i.e. 1 - accuracy) 291 | result 292 | 293 | # Plot the results. Generally we would plot multiple models here. 294 | plotBMRBoxplots(result, measure = acc) 295 | ``` 296 | 297 | 298 | # Decision tree references 299 | 300 | This book has nearly everything you would want to know about the theory of decision trees: 301 | 302 | Breiman, L., Friedman, J., Stone, C. J., & Olshen, R. A. (1984). Classification and regression trees. CRC press. 303 | 304 | The book has 32,000 citations according to Google Scholar. Not too shabby! Breiman and Stone were both Berkeley professors, and Breiman invented Random Forest, bagging, and some of the SuperLearner theory. Friedman is at Stanford and invented many other machine learning algorithms, particularly gradient boosted machines GBM) and multivariate adaptive regression splines (MARS). Olshen is also at Stanford. 305 | -------------------------------------------------------------------------------- /Fall2016/oct21_randomForests/R-random forests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "UC Berkeley D-Lab MLWG random forests in R" 3 | author: "Evan Muzzall" 4 | date: "October 21, 2016" 5 | output: 6 | html_document: 7 | toc: yes 8 | pdf_document: 9 | toc: yes 10 | word_document: 11 | toc: yes 12 | --- 13 | ```{r} 14 | rm(list=ls()) 15 | #options(scipen = 999) 16 | ``` 17 | 18 | ```{r setup, include=FALSE} 19 | knitr::opts_chunk$set(echo = TRUE) 20 | ``` 21 | 22 | # 1. What are random forests? 23 | Random forests are ensemble classifier methods that use multiple decision tree models for classification and regression. 24 | 25 | Unlike decision trees/bagged trees, by default results generally do not require pruning and include accuracy and variable importance information. Furthermore, at each random forest tree split, only a small portion of the predictors are used (rather than the full suite). 26 | 27 | We will fit four different random forest models: 28 | 1. rf1: 'randomForest' package model 29 | 30 | 2. rf2: 'SuperLearner' package model 31 | 32 | 3. rf3: 'SuperLearnerL' package model compared to 'rpart' decision tree model and SL mean 33 | 34 | 4. 
rf4: 'SuperLearner' package model with external cross-validation for multi-model comparison and visualization of model differences 35 | 36 | ## 1.1 install packages 37 | Install and `library()` necessary packages. 38 | ```{r, eval=FALSE} 39 | library(car) 40 | library(caret) 41 | library(gbm) 42 | library(ggplot2) 43 | library(lattice) 44 | library(randomForest) 45 | library(rpart) 46 | library(ROCR) 47 | library(SuperLearner) 48 | library(survival) 49 | ``` 50 | 51 | ## 1.2 `data(Mroz)` 52 | Load and explore Mroz dataset. 53 | ```{r} 54 | library(car) 55 | data(Mroz) 56 | ?Mroz 57 | str(Mroz) 58 | head(Mroz) 59 | ``` 60 | 61 | ## 1.3 `lfp` 62 | Let's examine frequencies of the `lfp` variable (labor force participation), since it is the one we want to predict. 63 | ```{r, eval=FALSE} 64 | Mroz$lfp 65 | ``` 66 | ```{r} 67 | library(lattice) 68 | table(Mroz$lfp) 69 | barchart(table(Mroz$lfp), col="orange") 70 | ``` 71 | 72 | ## 1.4 stratified random split 73 | Now, we will use the `createDataPartition` command from the 'caret' package to perform a 75/25 stratified random split of the Mroz data into training and test sets. 74 | ```{r} 75 | library(caret) 76 | set.seed(1) 77 | split <- createDataPartition(Mroz$lfp, p=0.75, list=FALSE) 78 | training.set <- Mroz[split,] 79 | test.set <- Mroz[-split,] 80 | 81 | nrow(training.set) + nrow(test.set) == nrow(Mroz) # sanity check 82 | ``` 83 | 84 | ## 1.5 `randomForest()` model on 'training.set' 85 | Using the 'randomForest' package, let's fit a random forest model to predict the number of women who participated or did not participate in the labor force in 1975. 86 | ```{r} 87 | library(randomForest) 88 | ?randomForest 89 | set.seed(1) 90 | rf1 <- randomForest(lfp ~ ., 91 | data=training.set, 92 | ntree=500, 93 | mtry=2, 94 | importance=TRUE) 95 | #NOTE: notice that our response vector 'lfp' is a factor - this will assume classification models, otherwise regression will be assumed. If it is omitted entirely, randomForest becomes unsupervised. 96 | rf1 97 | 98 | # check accuracy on training set 99 | (189+247) / nrow(training.set) # training.set = 77% accuracy 100 | 101 | rf1$importance 102 | barchart(rf1$importance, main="rf barchart", col="blue", border="black") 103 | dotplot(rf1$importance, main="rf dotplot", col=c(1,4)) 104 | ``` 105 | 106 | ## 1.6 model performance on 'test.set' 107 | Now, let's see how our model performs on the test data. 108 | ```{r} 109 | set.seed(1) 110 | pred <- predict(rf1, newdata=test.set) 111 | table(pred, test.set$lfp) 112 | ``` 113 | 114 | ## 1.7 check model accuracy 115 | Of the 188 test.set observations, We have 56 true negatives (correct 'no' predictions), and 81 true positives (correct 'yes' predictions). 116 | 117 | Now, we can quickly check the accuracy of the model using the holdout dataset. 118 | ```{r} 119 | (56 + 81) / nrow(test.set) #test.set = 73% accuracy 120 | ``` 121 | 122 | # 2 Compare multiple models using the 'SuperLearner' R package 123 | 'SuperLearner' is an R package that allows you to easily compare multiple machine learning algorithms at once and/or the same algorithm with different settings. 124 | 125 | It then creates an optimal weighted average of those models, aka an "ensemble", using the test data performance. This approach has been proven to be asymptotically as accurate as the best possible prediction algorithm that is tested. 
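As a purely illustrative sketch (not from the original walkthrough) of what that weighted average looks like: each candidate algorithm receives a cross-validated risk and a non-negative weight, and the ensemble prediction is, approximately and depending on the combination method, the weighted combination of the candidates' predictions. The object name `sl_fit` and the data `new_X` are hypothetical placeholders; the walkthrough's real fits appear below as `rf2` and `rf3`.

```{r, eval=FALSE}
# Hypothetical fitted SuperLearner object `sl_fit` and hypothetical new data `new_X`
sl_fit$cvRisk   # cross-validated risk for each candidate algorithm
sl_fit$coef     # ensemble weights (non-negative; typically sum to 1)

# The ensemble prediction combines the candidates' predictions using those weights
pred <- predict(sl_fit, newdata = new_X)
manual_ensemble <- as.vector(pred$library.predict %*% sl_fit$coef)
head(cbind(superlearner = pred$pred, manual = manual_ensemble))
```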
126 | 127 | ## 2.1 Coerce `lfp` to integer type 128 | For binary classification, SuperLearner prefers that your categorical outcome is numeric/integer, rather than factor data type. 129 | 130 | Let's first coerce `lfp` from factor to integer type. 131 | ```{r} 132 | class(training.set$lfp) 133 | class(test.set$lfp) 134 | 135 | ?ifelse 136 | training.set$lfp <- ifelse(training.set$lfp=="yes", 1L, 0L) 137 | test.set$lfp <- ifelse(test.set$lfp=="yes", 1L, 0L) 138 | 139 | class(training.set$lfp) 140 | class(test.set$lfp) 141 | ``` 142 | ```{r, eval=FALSE} 143 | training.set$lfp 144 | test.set$lfp 145 | ``` 146 | 147 | ## 2.2 Assign Y variables 148 | Now, we should assign binary outcome variables for the training and test sets for the 'SuperLearner' computations. 149 | ```{r} 150 | Y <- training.set$lfp 151 | Y_test <- test.set$lfp 152 | table(Y) 153 | table(Y_test) 154 | ``` 155 | 156 | However, because we specify our outcome and predictor variables in SuperLearner, we must remove the outcome variable from our training and test sets because we do not want to include them as a predictor: 157 | ```{r} 158 | training.set2 <- training.set[,c(2:8)] 159 | test.set2 <- test.set[,c(2:8)] 160 | dim(training.set2) 161 | dim(test.set2) 162 | ``` 163 | 164 | ## 2.3 View code for randomForest and fit the second random forest model 165 | ```{r} 166 | library(SuperLearner) 167 | listWrappers() 168 | SL.randomForest 169 | ?SL.randomForest 170 | 171 | rf2 <- SuperLearner(Y = Y, X = training.set2, family = binomial(), SL.library = "SL.randomForest") 172 | 173 | rf2 174 | ``` 175 | In the output, Risk is an estimate of model accuracy/performance as estimated by cross-validation of risk on future data. By default it uses 10 folds. 176 | 177 | Coef is how much weight SuperLearner puts on that model in the weighted-average. If Coef = 0 it means that model is not used at all. 178 | 179 | ## 2.4 Compare multiple models simultaneously 180 | Now, let's compare our random forest model to a decision tree model from R package 'rpart' as well as the weighted mean of the models. 181 | 182 | Based on model performance (risk), SuperLearner will then tell us which model is the best (Discrete winner) and also create a weighted average of multiple models. 183 | 184 | We include the mean of Y ("SL.mean") as a benchmark algorithm - if it is the discrete winner, then we can assume that our model fits the data poorly. 185 | 186 | Fit the third random forest model along with the SL.mean and rpart decision tree models as well: 187 | ```{r} 188 | rf3 <- SuperLearner(Y = Y, X = training.set2, family = binomial(), SL.library = c("SL.mean", "SL.rpart", "SL.randomForest")) 189 | 190 | rf3 191 | ``` 192 | 193 | ## 2.5 Assess model performance on test.set 194 | Then, we want to assess the model performance on test.set and illustrate with a simple barplot. 195 | ```{r} 196 | pred2 <- predict(rf3, test.set2, onlySL=TRUE) 197 | 198 | summary(pred2$library.predict) 199 | qplot(pred2$pred) + theme_linedraw() + xlab("predicted values") 200 | ``` 201 | 202 | We can then check the area under the receiver operator characteristic (ROC) curve to see how accurate the model fits to test.set 203 | ```{r} 204 | library(ROCR) 205 | pred_rocr <- prediction(pred2$pred, Y_test) 206 | auc <- performance(pred_rocr, measure = "auc", x.measure = "cutoff")@y.values[[1]] 207 | auc # AUC = 0.79 - this is approximately consistent with our other accuracies! 208 | ``` 209 | 210 | # 3. 
Cross validation of random forest in SuperLearner and visualization 211 | External cross-validation with `CV.SuperLearner` defaults to 10 folds. If we want to change it to 5 folds, we can set `V = 5`. This makes plotting easy based on V-fold cross-validated risk estimation. 212 | ```{r} 213 | set.seed(1) 214 | rf4 <- CV.SuperLearner(Y = Y, X = training.set2, family = binomial(), V = 5, SL.library = c("SL.mean", "SL.rpart", "SL.randomForest")) 215 | 216 | summary(rf4) 217 | 218 | table(simplify2array(rf4$whichDiscreteSL)) 219 | plot(rf4) + theme_linedraw() 220 | ``` 221 | 222 | Acknowledgements: 223 | Chris Kennedy 224 | 225 | [James G, Witten D, Hastie T, Tibshirani R. 2013. An Introduction to Statistical Learning - with Applications in R. New York: Springer](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) 226 | [Package "SuperLearner"](https://cran.r-project.org/web/packages/SuperLearner/SuperLearner.pdf) -------------------------------------------------------------------------------- /Fall2016/oct21_randomForests/tree.dot: -------------------------------------------------------------------------------- 1 | digraph Tree { 2 | node [shape=box] ; 3 | -------------------------------------------------------------------------------- /Fall2017/Fall2017info: -------------------------------------------------------------------------------- 1 | Machine Learning Working Group Fall 2017 2 | 3 | This semester's topic is neural networks for image processing! 4 | 5 | [Click to find out where neural networks might fit in the data science universe, by Swami Chandrasekaran](http://nirvacana.com/thoughts/becoming-a-data-scientist/) 6 | 7 | [Check out the Neural Network Zoo, by Fjodor Van Veen](http://www.asimovinstitute.org/neural-network-zoo/) 8 | 9 | [View this post for a classic StackExchange response to the question "What does the hidden layer in a neural network compute?"](https://stats.stackexchange.com/questions/63152/what-does-the-hidden-layer-in-a-neural-network-compute) 10 | 11 | [Practice with quick tutorials thanks to fast.ai here](http://course.fast.ai/) and [here](https://github.com/fastai/courses/tree/master/deeplearning1/nbs) 12 | 13 | [View what a ten-week course on convolutional neural networks would look like here](http://cs231n.stanford.edu/syllabus.html) 14 | 15 | ### Dataset 16 | We will be using [The Nature Conservancy Fisheries Monitoring dataset](https://www.kaggle.com/c/the-nature-conservancy-fisheries-monitoring) for the walkthroughs this semester. 17 | 18 | ### Schedule 19 | Alternating Fridays 9/8 to 12/15, starting at 12:30pm, in the D-Lab Convening Room (356B Barrows). 20 | 21 | - September 8: BRC Savio lightning talk, introduction, resources, dataset; feed forward and deep feed forward neural networks 22 | 23 | - September 22: Benten lightning talk(?); convolutional neural networks 24 | 25 | - October 6: lightning talks!
26 | -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_001.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_002.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_002.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_003.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_003.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_004.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_005.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_005.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_006.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_006.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_007.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_008.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_008.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_009.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_009.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_010.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_010.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_011.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_011.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_012.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_012.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_013.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_013.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_014.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_014.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_015.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_015.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_016.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_016.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_017.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_017.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_018.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_018.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_019.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_019.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/images-cnn-R/image_020.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/images-cnn-R/image_020.jpg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/alexnet.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/alexnet.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/cnn.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/cnn.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/conv_gif copy.gif.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/conv_gif copy.gif.png -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/conv_gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/conv_gif.gif -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/depthcol.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/depthcol.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/maxpool.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/maxpool.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/neural_net2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/neural_net2.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/imgs/pool1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep22-images-cnn/imgs/pool1.jpeg -------------------------------------------------------------------------------- /Fall2017/Sep22-images-cnn/utils/util.py: 
-------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import time 4 | 5 | # A Whole Bunch of Convenience Functions for Cleaning Up Plots 6 | def removeAxes(ax): 7 | ax.get_xaxis().set_visible(False) 8 | ax.get_yaxis().set_visible(False) 9 | 10 | def removeFrames(ax,sides=['top','right']): 11 | for side in sides: 12 | ax.spines[side].set_visible(False) 13 | 14 | def removeTicks(ax,axes): 15 | if 'x' in axes: 16 | ax.tick_params(axis='x', 17 | which='both', 18 | top='off', 19 | labeltop='off', 20 | bottom='off', 21 | labelbottom='off') 22 | if 'y' in axes: 23 | ax.tick_params(axis='y', 24 | which='both', 25 | left='off', 26 | labelleft='off', 27 | right='off', 28 | labelright='off') 29 | 30 | def addAxis(ax,axis='horizontal'): 31 | if axis == 'horizontal': 32 | xmin,xmax = ax.get_xlim() 33 | ax.hlines(0,xmin,xmax) 34 | elif axis == 'vertical': 35 | ymin,ymax = ax.get_ylim() 36 | ax.vlines(0,ymin,ymax) 37 | 38 | def cleanPlot(ax): 39 | removeFrames(plt.gca(),['top','right','bottom']); 40 | removeTicks(plt.gca(),['x','y']); 41 | 42 | def setLims(ax,xBounds,yBounds): 43 | ax.set_xlim(xBounds); ax.set_ylim(yBounds); 44 | 45 | def plot_across(imgs,cmap='Greys_r'): 46 | plt.figure(figsize=(12,3)) 47 | for i in range(len(imgs)): 48 | img = imgs[i] 49 | plt.subplot(1,len(imgs),i+1) 50 | plt.imshow(img,cmap=cmap) 51 | plt.grid(b=False) 52 | -------------------------------------------------------------------------------- /Fall2017/Sep8-neural-nets/nn-from-scratch-3-layer-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2017/Sep8-neural-nets/nn-from-scratch-3-layer-network.png -------------------------------------------------------------------------------- /Fall2017/Sep8-neural-nets/r-neural-nets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural networks in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Single-layer networks 12 | * Multi-layer networks 13 | * Possibly more 14 | 15 | 16 | Before we dig in, we will install the R packages we'll be using. 17 | 18 | **R packages** 19 | ```{r} 20 | # List of packages we will use. 21 | packages = c("MASS", "nnet", "h2o", "devtools", "NeuralNetTools") 22 | 23 | github_packages = c( 24 | # Chris's tools package for plotting the SuperLearner. 25 | "ck37r" = "ck37/ck37r", 26 | # Use more up-to-date SuperLearner from github. 27 | "SuperLearner" = "ecpolley/SuperLearner") 28 | 29 | devtools::install_github(github_packages) 30 | 31 | # Load those github packages. 32 | ck37r::load_packages(names(github_packages)) 33 | 34 | # Load required non-github packages and install from CRAN if necessary. 35 | ck37r::load_packages(packages, auto_install = T, verbose = T) 36 | 37 | # Also install mxnet for potential usage. 38 | # This unfortunately is Mac/Windows only; probably will not work for Linux. 39 | # Actually not working for Mac either. 40 | if (F) { 41 | # Skip this for now. 42 | install.packages("drat", repos="https://cran.rstudio.com") 43 | drat:::addRepo("dmlc") 44 | install.packages("mxnet") 45 | } 46 | 47 | # Could install Keras, but this can get complicated. 
48 | if (F) { 49 | devtools::install_github("rstudio/keras") 50 | # One version: 51 | install_keras() 52 | # Or: 53 | install_keras(method = "conda") 54 | } 55 | 56 | # Clean up variables. 57 | rm(packages, github_packages) 58 | ``` 59 | 60 | # Background 61 | 62 | Please see Deb's python code for more details on neural network theory. 63 | 64 | # Software packages 65 | 66 | We'll be using `nnet` for simple neural networks and `h2o` for deep neural networks. 67 | 68 | # Data preparation 69 | 70 | ```{r} 71 | data(Boston, package = "MASS") 72 | 73 | # Remove our outcome variable from the covariate list. 74 | X_df = Boston[, -14] 75 | 76 | # Convert X from a dataframe to a matrix. 77 | X_mat = model.matrix(~ ., data = X_df) 78 | 79 | # Notice the extra intercept column added by model.matrix. 80 | colnames(X_mat) 81 | 82 | # Remove extra intercept term. 83 | X_mat = X_mat[, -1] 84 | 85 | # Regression (continuous) version of our outcome variable. 86 | Y_reg = Boston$medv 87 | 88 | # Review outcome distribution. 89 | summary(Y_reg) 90 | 91 | # Classification (binary) version of our outcome variable. 92 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 93 | 94 | # Review outcome distribution. 95 | table(Y_class) 96 | prop.table(table(Y_class)) 97 | 98 | ``` 99 | 100 | # Single-layer neural network 101 | 102 | 103 | Quick classification example 104 | 105 | ```{r} 106 | library(nnet) 107 | 108 | # Classification 109 | 110 | # Set seed because weights are initialized randomly. 111 | set.seed(1) 112 | 113 | # X can be a dataframe or matrix. 114 | # If Y is a factor we need to use this formula notation. 115 | fit = nnet(Y_class ~ X_mat, size = 2, decay = 5e-4, maxit = 200) 116 | 117 | # Review our neural network fit. 118 | fit 119 | 120 | # Plot our neural network. 121 | library(NeuralNetTools) 122 | plotnet(fit) 123 | 124 | # Predict back to our original data. 125 | pred = predict(fit, X_mat) 126 | 127 | # Review predictions. 128 | summary(pred) 129 | 130 | # 131 | ``` 132 | 133 | Quick regression example 134 | 135 | ```{r} 136 | library(nnet) 137 | 138 | # Set seed because weights are initialized randomly. 139 | set.seed(1) 140 | 141 | # Again, X can be a dataframe or matrix. 142 | fit = nnet(Y_reg ~ X_mat, size = 2, decay = 5e-4, maxit = 200, 143 | # Enable linear output to support regression. 144 | linout = T) 145 | 146 | # Challenge: try with linout = F (the default) and see what happens. 147 | 148 | # Review our neural network fit. 149 | fit 150 | 151 | # Visualize neural network. 152 | plotnet(fit) 153 | 154 | # Predict back to our original data. 155 | pred = predict(fit, X_mat) 156 | 157 | # Review predictions. 158 | summary(pred) 159 | 160 | # Calculate mean-squared error (MSE). 161 | mean((pred - Y_reg)^2) 162 | 163 | # And root mean squared error (RMSE), which is on the original scale 164 | # of the outcome variable (easier to interpret). 165 | sqrt(mean((pred - Y_reg)^2)) 166 | 167 | ``` 168 | 169 | # SuperLearner optimization 170 | 171 | These challenges can be done in pairs/groups to make it easier. 172 | 173 | Challenge 1: use SL.nnet wrapper to estimate performance of the neural network. 174 | 175 | Challenge 2: use create.Learner() to test 2, 3, 4, or 5 hidden units and create a weighted average ensemble. 176 | 177 | # Multi-layer neural network 178 | 179 | Challenge: use h2o to design this. 180 | 181 | ```{r} 182 | library(h2o) 183 | # Startup and connect to our existing h2o cluster. 184 | # Use all available threads. 185 | # Could increase ram with option (e.g.) 
max_mem_size = "8g" 186 | h2o.init(nthreads = -1) 187 | 188 | # Clean slate - just in case the cluster was already running. 189 | h2o.removeAll() 190 | 191 | # Load x data into h2o. 192 | data = as.h2o(cbind(X_df, `_outcome` = Y_reg)) 193 | dim(data) 194 | 195 | outcome = "_outcome" 196 | x = colnames(X_df) 197 | 198 | # Fit the deep learning model here. 199 | # key optional arguments: 200 | # hidden = c(200, 200) 201 | # epochs = 10 202 | # seed = -1 203 | # rate_decay = 1 204 | # reproducible = FALSE 205 | # See ?h2.deeplearning for more - huge variety of configurations 206 | model = h2o.deeplearning(x = x, y = outcome, 207 | training_frame = data, 208 | nfolds = 10) 209 | 210 | # Review model, in particular the cross-validation section. 211 | model 212 | 213 | # Estimate model performance on another data set. 214 | # Could be a test set but here it's just the resubstitution performance. 215 | # So this is more biased than the cross-validated results reported above. 216 | h2o.performance(model, data) 217 | 218 | # Shutdown server when we're done. 219 | # This will also happen automatically if we close RStudio, provided 220 | # the server was started within R. 221 | h2o.shutdown(prompt = F) 222 | ``` 223 | 224 | See also Erin LeDell's [excellent tutorial on deep learning](https://github.com/ledell/useR-machine-learning-tutorial/blob/master/deep-neural-networks.Rmd). 225 | 226 | ## To add: Keras and mxnet versions. 227 | -------------------------------------------------------------------------------- /Fall2018/1-sep5-PCA/PCA-python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MachineLearning Working Group\n", 8 | "\n", 9 | "### Python PCA - September 5, 2018\n", 10 | "\n", 11 | "As with the [R walkthrough](https://github.com/dlab-berkeley/MachineLearningWG/blob/master/Fall2018/sep5-PCA/PCA-R.Rmd), let's begin by replicating [another great example](https://towardsdatascience.com/pca-using-python-scikit-learn-e653f8989e60) for conducting PCA in Python and then see a machine learning application. 
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import matplotlib as mpl\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "%matplotlib inline\n", 24 | "\n", 25 | "from sklearn.preprocessing import StandardScaler\n", 26 | "from sklearn.decomposition import PCA\n", 27 | "from sklearn.datasets import fetch_mldata\n", 28 | "from sklearn.model_selection import train_test_split\n", 29 | "from sklearn.linear_model import LogisticRegression" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "# Load the iris dataset" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "iris = pd.read_csv('./iris.csv')\n", 46 | "print(type(iris))\n", 47 | "iris.head()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "# Define the nuemric features" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "Features = [\"Sepal.Length\", \"Sepal.Width\", \"Petal.Length\", \"Petal.Width\"]\n", 64 | "x = iris.loc[:, Features].values\n", 65 | "x" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "# Standardize the numeric features" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "x = StandardScaler().fit_transform(x)\n", 82 | "x" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "# Extract the target variable" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "y = iris.loc[:,[\"Species\"]].values\n", 99 | "y" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Define the 2D PCA feature space" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "pca = PCA(n_components=2)\n", 116 | "principalComponents = pca.fit_transform(x)\n", 117 | "pca_df = pd.DataFrame(data = principalComponents\n", 118 | " , columns = ['principal component 1', 'principal component 2'])\n", 119 | "pca_df" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "# Concatenate the Species vector the principal component arrays" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": { 133 | "scrolled": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "iris_pca = pd.concat([iris[[\"Species\"]], pca_df], axis = 1)\n", 138 | "iris_pca.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# Construct the scatterplot" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fig = plt.figure(figsize = (8,8))\n", 155 | "ax = fig.add_subplot(1,1,1) \n", 156 | "ax.set_xlabel(\"Principal Component 1\", fontsize = 15)\n", 157 | "ax.set_ylabel(\"Principal Component 2\", fontsize = 15)\n", 158 | "ax.set_title(\"PCA iris scatterplot\", fontsize = 20)\n", 159 | "targets = [\"setosa\", \"versicolor\", \"virginica\"]\n", 
160 | "colors = [\"r\", \"g\", \"b\"]\n", 161 | "for target, color in zip(targets,colors):\n", 162 | " indicesToKeep = iris_pca[\"Species\"] == target\n", 163 | " ax.scatter(iris_pca.loc[indicesToKeep, \"principal component 1\"]\n", 164 | " , iris_pca.loc[indicesToKeep, \"principal component 2\"]\n", 165 | " , c = color\n", 166 | " , s = 50)\n", 167 | "ax.legend(targets)\n", 168 | "ax.grid()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "pca.explained_variance_ratio_" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "# Proportions of variance are similar to R!\n", 185 | "The proportions of variance are virtually identical to those we obtained in R: \n", 186 | "\n", 187 | "- PC 1 = 0.7296245 \n", 188 | "- PC 2 = 0.2285076" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# Machine Learning example\n", 203 | "\n", 204 | "Now, let's use PCA to optimize a logistic regression model. " 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# Load the mnist dataset\n", 214 | "mnist = fetch_mldata('MNIST original')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "# Split the data with a 70/30 split\n", 224 | "# Define our training and test images and our training and test labels\n", 225 | "# random_state is like setting the seed in R and ensures reproducible results\n", 226 | "train_img, test_img, train_lbl, test_lbl = train_test_split(mnist.data, mnist.target, test_size=1/7.0, random_state=0)\n", 227 | "\n", 228 | "# Initialize the scaler to standardize the data (remember that PCA is grossly affected by scale!)\n", 229 | "scaler = StandardScaler()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "# Fit model to training set" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "scaler.fit(train_img)\n", 246 | "\n", 247 | "train_img = scaler.transform(train_img)\n", 248 | "\n", 249 | "test_img = scaler.transform(test_img)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Initialize the PCA model" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# change the value in the parentheses to tell the model how much variation should be retained. 
\n", 266 | "# We want 95% of it so we enter 0.95\n", 267 | "mnist_pca = PCA(0.95)\n", 268 | "mnist_pca.fit(train_img)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "# Do the transform on the training and test sets" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "train_img = mnist_pca.transform(train_img)\n", 285 | "test_img = mnist_pca.transform(test_img)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": {}, 291 | "source": [ 292 | "# Initialize logistic regression\n", 293 | "... with default settings" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# all parameters not specified are set to their defaults\n", 303 | "# default solver is incredibly slow which is why it was changed to 'lbfgs'\n", 304 | "logisticRegr = LogisticRegression(solver = 'lbfgs')\n", 305 | "logisticRegr.fit(train_img, train_lbl)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "# Predict for One Observation (image)\n", 315 | "logisticRegr.predict(test_img[0].reshape(1,-1))" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "# Predict for One Observation (image)\n", 325 | "logisticRegr.predict(test_img[0:10])" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "logisticRegr.score(test_img, test_lbl)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "View [this webpage](https://plot.ly/ipython-notebooks/principal-component-analysis/) for another great iris example. 
" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [] 350 | } 351 | ], 352 | "metadata": { 353 | "kernelspec": { 354 | "display_name": "Python 3", 355 | "language": "python", 356 | "name": "python3" 357 | }, 358 | "language_info": { 359 | "codemirror_mode": { 360 | "name": "ipython", 361 | "version": 3 362 | }, 363 | "file_extension": ".py", 364 | "mimetype": "text/x-python", 365 | "name": "python", 366 | "nbconvert_exporter": "python", 367 | "pygments_lexer": "ipython3", 368 | "version": "3.6.5" 369 | } 370 | }, 371 | "nbformat": 4, 372 | "nbformat_minor": 2 373 | } 374 | -------------------------------------------------------------------------------- /Fall2018/1-sep5-PCA/iris.csv: -------------------------------------------------------------------------------- 1 | Species,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width 2 | setosa,5.1,3.5,1.4,0.2 3 | setosa,4.9,3,1.4,0.2 4 | setosa,4.7,3.2,1.3,0.2 5 | setosa,4.6,3.1,1.5,0.2 6 | setosa,5,3.6,1.4,0.2 7 | setosa,5.4,3.9,1.7,0.4 8 | setosa,4.6,3.4,1.4,0.3 9 | setosa,5,3.4,1.5,0.2 10 | setosa,4.4,2.9,1.4,0.2 11 | setosa,4.9,3.1,1.5,0.1 12 | setosa,5.4,3.7,1.5,0.2 13 | setosa,4.8,3.4,1.6,0.2 14 | setosa,4.8,3,1.4,0.1 15 | setosa,4.3,3,1.1,0.1 16 | setosa,5.8,4,1.2,0.2 17 | setosa,5.7,4.4,1.5,0.4 18 | setosa,5.4,3.9,1.3,0.4 19 | setosa,5.1,3.5,1.4,0.3 20 | setosa,5.7,3.8,1.7,0.3 21 | setosa,5.1,3.8,1.5,0.3 22 | setosa,5.4,3.4,1.7,0.2 23 | setosa,5.1,3.7,1.5,0.4 24 | setosa,4.6,3.6,1,0.2 25 | setosa,5.1,3.3,1.7,0.5 26 | setosa,4.8,3.4,1.9,0.2 27 | setosa,5,3,1.6,0.2 28 | setosa,5,3.4,1.6,0.4 29 | setosa,5.2,3.5,1.5,0.2 30 | setosa,5.2,3.4,1.4,0.2 31 | setosa,4.7,3.2,1.6,0.2 32 | setosa,4.8,3.1,1.6,0.2 33 | setosa,5.4,3.4,1.5,0.4 34 | setosa,5.2,4.1,1.5,0.1 35 | setosa,5.5,4.2,1.4,0.2 36 | setosa,4.9,3.1,1.5,0.2 37 | setosa,5,3.2,1.2,0.2 38 | setosa,5.5,3.5,1.3,0.2 39 | setosa,4.9,3.6,1.4,0.1 40 | setosa,4.4,3,1.3,0.2 41 | setosa,5.1,3.4,1.5,0.2 42 | setosa,5,3.5,1.3,0.3 43 | setosa,4.5,2.3,1.3,0.3 44 | setosa,4.4,3.2,1.3,0.2 45 | setosa,5,3.5,1.6,0.6 46 | setosa,5.1,3.8,1.9,0.4 47 | setosa,4.8,3,1.4,0.3 48 | setosa,5.1,3.8,1.6,0.2 49 | setosa,4.6,3.2,1.4,0.2 50 | setosa,5.3,3.7,1.5,0.2 51 | setosa,5,3.3,1.4,0.2 52 | versicolor,7,3.2,4.7,1.4 53 | versicolor,6.4,3.2,4.5,1.5 54 | versicolor,6.9,3.1,4.9,1.5 55 | versicolor,5.5,2.3,4,1.3 56 | versicolor,6.5,2.8,4.6,1.5 57 | versicolor,5.7,2.8,4.5,1.3 58 | versicolor,6.3,3.3,4.7,1.6 59 | versicolor,4.9,2.4,3.3,1 60 | versicolor,6.6,2.9,4.6,1.3 61 | versicolor,5.2,2.7,3.9,1.4 62 | versicolor,5,2,3.5,1 63 | versicolor,5.9,3,4.2,1.5 64 | versicolor,6,2.2,4,1 65 | versicolor,6.1,2.9,4.7,1.4 66 | versicolor,5.6,2.9,3.6,1.3 67 | versicolor,6.7,3.1,4.4,1.4 68 | versicolor,5.6,3,4.5,1.5 69 | versicolor,5.8,2.7,4.1,1 70 | versicolor,6.2,2.2,4.5,1.5 71 | versicolor,5.6,2.5,3.9,1.1 72 | versicolor,5.9,3.2,4.8,1.8 73 | versicolor,6.1,2.8,4,1.3 74 | versicolor,6.3,2.5,4.9,1.5 75 | versicolor,6.1,2.8,4.7,1.2 76 | versicolor,6.4,2.9,4.3,1.3 77 | versicolor,6.6,3,4.4,1.4 78 | versicolor,6.8,2.8,4.8,1.4 79 | versicolor,6.7,3,5,1.7 80 | versicolor,6,2.9,4.5,1.5 81 | versicolor,5.7,2.6,3.5,1 82 | versicolor,5.5,2.4,3.8,1.1 83 | versicolor,5.5,2.4,3.7,1 84 | versicolor,5.8,2.7,3.9,1.2 85 | versicolor,6,2.7,5.1,1.6 86 | versicolor,5.4,3,4.5,1.5 87 | versicolor,6,3.4,4.5,1.6 88 | versicolor,6.7,3.1,4.7,1.5 89 | versicolor,6.3,2.3,4.4,1.3 90 | versicolor,5.6,3,4.1,1.3 91 | versicolor,5.5,2.5,4,1.3 92 | versicolor,5.5,2.6,4.4,1.2 93 
| versicolor,6.1,3,4.6,1.4 94 | versicolor,5.8,2.6,4,1.2 95 | versicolor,5,2.3,3.3,1 96 | versicolor,5.6,2.7,4.2,1.3 97 | versicolor,5.7,3,4.2,1.2 98 | versicolor,5.7,2.9,4.2,1.3 99 | versicolor,6.2,2.9,4.3,1.3 100 | versicolor,5.1,2.5,3,1.1 101 | versicolor,5.7,2.8,4.1,1.3 102 | virginica,6.3,3.3,6,2.5 103 | virginica,5.8,2.7,5.1,1.9 104 | virginica,7.1,3,5.9,2.1 105 | virginica,6.3,2.9,5.6,1.8 106 | virginica,6.5,3,5.8,2.2 107 | virginica,7.6,3,6.6,2.1 108 | virginica,4.9,2.5,4.5,1.7 109 | virginica,7.3,2.9,6.3,1.8 110 | virginica,6.7,2.5,5.8,1.8 111 | virginica,7.2,3.6,6.1,2.5 112 | virginica,6.5,3.2,5.1,2 113 | virginica,6.4,2.7,5.3,1.9 114 | virginica,6.8,3,5.5,2.1 115 | virginica,5.7,2.5,5,2 116 | virginica,5.8,2.8,5.1,2.4 117 | virginica,6.4,3.2,5.3,2.3 118 | virginica,6.5,3,5.5,1.8 119 | virginica,7.7,3.8,6.7,2.2 120 | virginica,7.7,2.6,6.9,2.3 121 | virginica,6,2.2,5,1.5 122 | virginica,6.9,3.2,5.7,2.3 123 | virginica,5.6,2.8,4.9,2 124 | virginica,7.7,2.8,6.7,2 125 | virginica,6.3,2.7,4.9,1.8 126 | virginica,6.7,3.3,5.7,2.1 127 | virginica,7.2,3.2,6,1.8 128 | virginica,6.2,2.8,4.8,1.8 129 | virginica,6.1,3,4.9,1.8 130 | virginica,6.4,2.8,5.6,2.1 131 | virginica,7.2,3,5.8,1.6 132 | virginica,7.4,2.8,6.1,1.9 133 | virginica,7.9,3.8,6.4,2 134 | virginica,6.4,2.8,5.6,2.2 135 | virginica,6.3,2.8,5.1,1.5 136 | virginica,6.1,2.6,5.6,1.4 137 | virginica,7.7,3,6.1,2.3 138 | virginica,6.3,3.4,5.6,2.4 139 | virginica,6.4,3.1,5.5,1.8 140 | virginica,6,3,4.8,1.8 141 | virginica,6.9,3.1,5.4,2.1 142 | virginica,6.7,3.1,5.6,2.4 143 | virginica,6.9,3.1,5.1,2.3 144 | virginica,5.8,2.7,5.1,1.9 145 | virginica,6.8,3.2,5.9,2.3 146 | virginica,6.7,3.3,5.7,2.5 147 | virginica,6.7,3,5.2,2.3 148 | virginica,6.3,2.5,5,1.9 149 | virginica,6.5,3,5.2,2 150 | virginica,6.2,3.4,5.4,2.3 151 | virginica,5.9,3,5.1,1.8 -------------------------------------------------------------------------------- /Fall2018/2-sep19-k-means/readme.md: -------------------------------------------------------------------------------- 1 | # K-means clustering 2 | 3 | * [R Markdown File](k-means-ucr.Rmd) 4 | * [Python Jupyter File](https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html) 5 | * (Can open in Google Collab for a nice interactive experience) -------------------------------------------------------------------------------- /Fall2018/3-oct3-hier_agg_clust/Oct3-hier_agg_clust.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Hierarchical clustering" 3 | author: "Evan" 4 | date: "10/2/2018" 5 | output: 6 | html_document: 7 | number_sections: yes 8 | toc: yes 9 | toc_float: yes 10 | --- 11 | 12 | ```{r set-options, echo = FALSE, cache = FALSE} 13 | options(width = 140) 14 | ``` 15 | 16 | # Hierarchical agglomerative clustering 17 | Hierarchical agglomerative clustering is a "bottom-up" method of clustering. Each observation begins as its own cluster and forms clusters with like items as it moves up the hierarchy. That is, all leaves are their own clusters to begin with and form clusters as we move up the trunk and various branches are formed. 18 | 19 | Distance and cluster method information are usually displayed at the bottom of the graph, while the vertical axis displays the height, which refers to the distance between two clusters. We are not concerned as much with distances along the horizontal axis. We can also "cut" the dendrogram to specify a number of clusters, which is similar to defining _k_ in k-means clustering (which is also equally problematic). 
20 | 21 | In a real-life research situation, you will likely want to scale the data. However, raw data are used in this example. 22 | # Package installation 23 | ```{r} 24 | if (FALSE) { 25 | # Run this line manually (once) to install the necessary packages. 26 | # Install packages from CRAN: 27 | install.packages(c("ape", "pvclust", "mclust")) 28 | } 29 | 30 | # fancy dendrogram options 31 | library(ape) 32 | # dendrograms with p-values 33 | library(pvclust) 34 | # model-based clustering 35 | library(mclust) 36 | ``` 37 | 38 | # Load data 39 | ```{r} 40 | data(mtcars) 41 | ?mtcars 42 | ``` 43 | 44 | Start by using the `hclust` built-in function from [{stats}](https://www.rdocumentation.org/packages/stats/versions/3.5.1). `hclust` prefers a dissimilarity matrix via the `dist` function, thus it plots rows as opposed to columns like the methods further below. 45 | 46 | # The `hclust` built-in function 47 | ```{r} 48 | # See the help files 49 | ?hclust 50 | 51 | # Create distance matrix 52 | mtcars_dist = dist(mtcars, method = "euclidean") 53 | 54 | # Fit hclust_model 55 | system.time({ 56 | hclust_model = hclust(mtcars_dist, method = "complete") 57 | }) 58 | 59 | # Plot hclust_model dendrogram 60 | plot(hclust_model, hang = -1) 61 | ``` 62 | 63 | Data are visualized in dendrograms, or branching tree-like structures similar to decision trees, albeit with less information displayed at each node. The most similar items are found lower in the dendrogram and fuse into $n-1$ clusters as we move up the tree; the next two items to fuse into a cluster produce $n-2$ clusters and so on as we move up the tree until there is just one overarching cluster. Thus, clusters become more inclusive as we move up the hierarchy. 64 | 65 | Dissimilarity is applied not just to single observations, but to groups as well (linkage). Thus the "Cadillac Fleetwood / Lincoln Continental" cluster fuses with "Chrysler Imperial" instead of "Maserati Bora" or something else. 66 | 67 | You can also cut the tree to see how the tree varies: 68 | ```{r} 69 | # If we want only 5 clusters, for example (must be a number between 1-32, since mtcars has only 32 observations): 70 | cutree(hclust_model, 5) 71 | ``` 72 | 73 | # The `ape` package 74 | 75 | The [`ape` package](https://cran.r-project.org/web/packages/ape/index.html) provides some great functionality for constructing and plotting clusters: 76 | ```{r} 77 | library(ape) 78 | # various plots 79 | plot(as.phylo(hclust_model)) 80 | plot(as.phylo(hclust_model), type = "cladogram") 81 | plot(as.phylo(hclust_model), type = "unrooted") 82 | 83 | # radial plot 84 | colors = c("red", "orange", "blue", "green", "purple") 85 | clus5 = cutree(hclust_model, 5) 86 | plot(as.phylo(hclust_model), type = "fan", tip.color = colors[clus5], lwd = 2, cex = 1) 87 | ``` 88 | 89 | > NOTE: the color settings for the radial plot apply to the other ape plots as well. 90 | 91 | # The `pvclust` package 92 | The [pvclust](http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/) package offers a straightforward way to perform hierarchical agglomerative clustering of columns with two types of p-values at each split: approximately unbiased **(AU)** and bootstrap probability **(BP)**.
93 | ```{r} 94 | library(pvclust) 95 | # Cluster features 96 | 97 | # Ward's method: minimum variance between clusters 98 | system.time({ 99 | pvclust_model_ward = pvclust(mtcars, 100 | method.hclust = "ward.D", 101 | method.dist = "euclidean", 102 | nboot = 1000, parallel = T) 103 | }) 104 | 105 | plot(pvclust_model_ward) 106 | 107 | # pvrect will draw rectangles around clusters with high or low p-values 108 | pvrect(pvclust_model_ward, alpha = 0.95) 109 | ``` 110 | 111 | ### Compare different dissimilarity measures 112 | ```{r} 113 | # Complete linkage: largest intercluster difference 114 | system.time({ 115 | pvclust_model_complete = pvclust(mtcars, 116 | method.hclust = "complete", 117 | method.dist = "euclidean", 118 | nboot = 1000, parallel = T) 119 | }) 120 | 121 | # Single linkage: smallest intercluster difference 122 | system.time({ 123 | pvclust_model_single = pvclust(mtcars, 124 | method.hclust = "single", 125 | method.dist = "euclidean", 126 | nboot = 1000, parallel = T) 127 | }) 128 | 129 | # Average linkage: mean intercluster difference 130 | system.time({ 131 | pvclust_model_average = pvclust(mtcars, 132 | method.hclust = "average", 133 | method.dist = "euclidean", 134 | nboot = 1000, parallel = T) 135 | }) 136 | 137 | # View summaries 138 | pvclust_model_ward 139 | pvclust_model_complete 140 | pvclust_model_single 141 | pvclust_model_average 142 | 143 | # Plot Euclidean distance linkages 144 | par(mfrow = c(2,2)) 145 | plot(pvclust_model_ward, main = "Ward", xlab = "", sub = "") 146 | pvrect(pvclust_model_ward) 147 | plot(pvclust_model_complete, main = "Complete", xlab = "", sub = "") 148 | pvrect(pvclust_model_complete) 149 | plot(pvclust_model_single, main = "Single", xlab = "", sub = "") 150 | pvrect(pvclust_model_single) 151 | plot(pvclust_model_average, main = "Average", xlab = "", sub = "") 152 | pvrect(pvclust_model_average) 153 | par(mfrow = c(1,1)) 154 | ``` 155 | 156 | ### View standard error plots: 157 | ```{r} 158 | par(mfrow=c(2,2)) 159 | seplot(pvclust_model_ward, main = "Ward") 160 | seplot(pvclust_model_complete, main = "Complete") 161 | seplot(pvclust_model_single, main = "Single") 162 | seplot(pvclust_model_average, main = "Average") 163 | par(mfrow=c(1,1)) 164 | ``` 165 | 166 | # Going further - the `mclust` package 167 | The [`mclust`](https://cran.r-project.org/web/packages/mclust/index.html) package provides "Gaussian finite mixture models fitted via EM algorithm for model-based clustering, classification, and density estimation, including Bayesian regularization, dimension reduction for visualisation, and resampling-based inference." 
168 | ```{r} 169 | library(mclust) 170 | 171 | # Fit model 172 | mclust_model = Mclust(mtcars) 173 | 174 | # View various plots 175 | plot(mclust_model, what = "BIC") 176 | plot(mclust_model, what = "classification") 177 | plot(mclust_model, what = "uncertainty") 178 | plot(mclust_model, what = "density") 179 | ``` 180 | 181 | ### Return best performing model 182 | ```{r} 183 | summary(mclust_model) 184 | ``` 185 | 186 | ### Cross-validated mclust 187 | ```{r} 188 | # sort mpg in decreasing order 189 | mtcars = mtcars[order(-mtcars$mpg),] 190 | mtcars 191 | 192 | # create a binary factor variable from mpg: "less than 20mpg" and "greater than 20mpg" 193 | mtcars$class = cut(mtcars$mpg, 194 | breaks = c(0, 20, 40), 195 | levels = c(1, 2), 196 | labels = c("less than 20mpg", "greater than 20mpg")) 197 | mtcars 198 | 199 | # define our predictors (X) and class labels (class) 200 | X = mtcars[ , -12] 201 | class = mtcars$class 202 | 203 | # fit the model (EEE covariance structure, basically the same as linear discriminant analysis) 204 | mclust_model2 = MclustDA(X, class = class, modelType = "EDDA", modelNames = "EEE") 205 | 206 | # cross-validate! 207 | set.seed(1) 208 | cv_mclust = cvMclustDA(mclust_model2, nfold = 20) 209 | 210 | # View cross-validation error and standard error of the cv error 211 | cv_mclust[c("error", "se")] 212 | ``` 213 | 214 | References and resources: 215 | - [Quick-R: Cluster Analysis](https://www.statmethods.net/advstats/cluster.html) 216 | - [James et al. Introduction to Statistical Learning, pp. 390-401](https://www-bcf.usc.edu/~gareth/ISL/) 217 | - [pvclust](http://stat.sys.i.kyoto-u.ac.jp/prog/pvclust/) 218 | - [STHDA: Beautiful dendrogram visualizations](http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning) 219 | - [Gaston Sanchez: Visualizing Dendrograms in R](https://rpubs.com/gaston/dendrograms) 220 | - [Analysis of Phylogenetics and Evolution](http://ape-package.ird.fr/) 221 | - [A Quick Tour of mclust](https://cran.r-project.org/web/packages/mclust/vignettes/mclust.html) 222 | - [mclust vignette (from 2012, but more detailed)](https://www.stat.washington.edu/sites/default/files/files/reports/2012/tr597.pdf) 223 | - A very [useful walkthrough](https://quantdev.ssri.psu.edu/sites/qdev/files/Unsupervised_Machine_Learning_The_mclust_Package_and_others.html) by Christian Lopez 224 | - [MoEClust:](https://cran.r-project.org/web/packages/MoEClust/vignettes/MoEClust.html) Gaussian Parsimonious Clustering Models with Gating and Expert Network Covariates 225 | - See the [cluster](https://cran.r-project.org/web/packages/cluster/cluster.pdf) R package to learn more about agnes, clara, daisy, diana, fanny, flower, mona, and pam cluster methods! 226 | 227 | 228 | -------------------------------------------------------------------------------- /Fall2018/4-medoids/medoid-clustering.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Medoid clustering" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | Implementations: 11 | 12 | 1. [cluster::pam](https://stat.ethz.ch/R-manual/R-devel/library/cluster/html/pam.html) 13 | 2. [kmed](https://cran.r-project.org/web/packages/kmed/vignettes/kmedoid.html) 14 | 3. 
[hopach](https://www.bioconductor.org/packages/release/bioc/html/hopach.html) 15 | 16 | ## Installation 17 | 18 | Run this section once manually - it will not be run when knitting the markdown file. 19 | 20 | ```{r eval=FALSE} 21 | # cluster is a built-in package. 22 | 23 | # kmed 24 | install.packages("kmed") 25 | 26 | # hopach 27 | ## try http:// if https:// URLs are not supported 28 | source("https://bioconductor.org/biocLite.R") 29 | biocLite("hopach") 30 | 31 | ``` 32 | 33 | ### Load libraries 34 | 35 | ```{r} 36 | library(cluster) 37 | library(kmed) 38 | library(hopach) 39 | ``` 40 | 41 | ## Data prep 42 | 43 | We're trying out a birth weight dataset. 44 | 45 | ```{r} 46 | data = MASS::birthwt 47 | summary(data) 48 | ?MASS::birthwt 49 | data$race = factor(data$race, labels = c("white", "black", "other")) 50 | str(data) 51 | 52 | # Create a list to hold different variables. 53 | vars = list( 54 | # Birth weight or low are generally our outcomes for supervised analyses. 55 | outcomes = c("bwt", "low"), 56 | 57 | # Variables we want to exclude from our analysis - none currently. 58 | exclude = NULL 59 | ) 60 | 61 | vars$covariates = setdiff(names(data), vars$outcomes) 62 | 63 | # Review our data structure. 64 | vars 65 | ``` 66 | 67 | 68 | ## K-med package 69 | 70 | ```{r kmed} 71 | 72 | # Review covariate structure 73 | str(data[, vars$covariates]) 74 | 75 | # Create distance matrix. 76 | # NOTE: perhaps we should center & scale data beforehand. 77 | dist_mat = 78 | # This function is for "mixed" variable data - numeric, binary, and/or categorical. 79 | distmix(data[, vars$covariates], 80 | # There are 6 options for method here. 81 | method = "gower", 82 | # method = "huang", 83 | # Harikumar seems to require all integer data. 84 | # method = "harikumar", 85 | # method = "wishart", 86 | # Provide column numbers for the numeric variables. 87 | idnum = which(vars$covariates %in% c("age", "lwt", "ptl", "ftv")), 88 | # Binary variables. 89 | idbin = which(vars$covariates %in% c("smoke", "ht", "ui")), 90 | # Categorical variables. 91 | idcat = which(vars$covariates %in% c("race"))) 92 | 93 | # 189 x 189. 94 | dim(dist_mat) 95 | # Same as the number of observations. 96 | nrow(data) 97 | 98 | # Conduct the medoids analysis with 3 clusters. 99 | # Other function options: rankkmed, stepkmed. 100 | result = fastkmed(dist_mat, ncluster = 3, iterate = 50) 101 | 102 | # Examine distribution of low birth weight across clusters. 103 | table("cluster" = result$cluster, "low wgt" = data$low) 104 | prop.table(table("cluster" = result$cluster, "low wgt" = data$low), margin = 1) 105 | ``` 106 | 107 | Does cluster help us predict birth weight? 108 | 109 | ```{r kmed_ols} 110 | # OLS 1: don't include cluster. 111 | reg1 = lm(bwt ~ ., 112 | data = data[, c(vars$covariates, vars$outcomes[1])]) 113 | summary(reg1) 114 | 115 | # OLS 2: with cluster included. 116 | reg2 = lm(bwt ~ ., 117 | data = cbind(data[, c(vars$covariates, vars$outcomes[1])], 118 | cluster = factor(result$cluster))) 119 | summary(reg2) 120 | ``` 121 | 122 | We have a reasonable increase in adjusted R-squared. How else could we examine the possible benefit of the cluster variable on our predictive accuracy? Are other methods preferable to gower? 
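One way to probe the first question, sketched below, is to compare cross-validated prediction error for the two OLS specifications instead of relying on in-sample adjusted R-squared. This is only an illustrative sketch in base R using the objects already created above (`data`, `vars`, and `result$cluster`); the fold count and seed are arbitrary choices, not recommendations.

```{r cv_sketch}
# Hedged sketch: k-fold cross-validation comparing OLS with and without the
# k-medoids cluster label. Assumes `data`, `vars`, and `result` from the
# chunks above are in memory.
set.seed(1)
k_folds = 5
folds = sample(rep(1:k_folds, length.out = nrow(data)))

cv_rmse = function(df) {
  rmse = numeric(k_folds)
  for (fold in 1:k_folds) {
    # Fit on all folds except the held-out one, then predict the held-out fold.
    fit = lm(bwt ~ ., data = df[folds != fold, ])
    pred = predict(fit, newdata = df[folds == fold, ])
    rmse[fold] = sqrt(mean((df$bwt[folds == fold] - pred)^2))
  }
  mean(rmse)
}

# Covariates plus the continuous outcome, without and with the cluster label.
df_base = data[, c(vars$covariates, "bwt")]
df_clust = cbind(df_base, cluster = factor(result$cluster))

# A lower cross-validated RMSE would suggest the cluster label helps prediction.
c(no_cluster = cv_rmse(df_base), with_cluster = cv_rmse(df_clust))
```

Repeating the comparison across different `ncluster` values, or across the other `distmix` methods ("huang", "wishart", etc.), would be one way to tackle the second question as well.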
123 | 124 | ```{r} 125 | # a simple and fast k-medoids function for bootstrap evaluation 126 | boot_kmed = function(distmat, nclust) { 127 | result = fastkmed(distmat, nclust, iterate = 50) 128 | return(result$cluster) 129 | } 130 | 131 | # k-means function for bootstrap evaluation 132 | boot_kmeans = function(x, nclust) { 133 | result = kmeans(x, nclust) 134 | return(result$cluster) 135 | } 136 | 137 | k = 3 138 | num_boots = 50 139 | fastkmedboot = clustboot(dist_mat, nclust = k, boot_kmed, nboot = num_boots) 140 | # For k-means we need to create a numeric matrix (i.e. convert factor to indicators) 141 | data_mat = model.matrix(~ ., data = data[, vars$covariates])[, -1] 142 | kmeansboot = clustboot(data_mat, nclust = k, boot_kmeans, 143 | nboot = num_boots, diss = FALSE) 144 | 145 | # Consensus matrix creation. 146 | 147 | wardorder <- function(x, nclust) { 148 | res <- hclust(x, method = "ward.D2") 149 | member <- cutree(res, nclust) 150 | return(member) 151 | } 152 | consensusfastkmed <- consensusmatrix(fastkmedboot, nclust = k, wardorder) 153 | 154 | clustheatmap(consensusfastkmed, "Clustering via Fast K-medoids") 155 | 156 | consensuskmeans <- consensusmatrix(kmeansboot, nclust = k, wardorder) 157 | clustheatmap(consensuskmeans, "Clustering via K-means") 158 | ``` 159 | 160 | 161 | ## Cluster: partitioning around medoids 162 | 163 | ```{r cluster_pam} 164 | # Maybe we can figure out during MLWG? 165 | 166 | result_pam = 167 | cluster::pam(data[, vars$covariates], k = 3, 168 | metric = "euclidean") 169 | #metric = "manhattan") 170 | 171 | # Output is a bit too verbose. 172 | summary(result_pam) 173 | 174 | # We get a PCA plot with ellipsoids, 175 | # Then a silhouette plot. 176 | plot(result_pam) 177 | ``` 178 | 179 | [sklearn info on silhouette plots](http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html) 180 | 181 | ## HOPACH package 182 | 183 | ```{r hopach} 184 | # We use the numeric data matrix here, which has converted factors to indicators. 185 | dist = distancematrix(data_mat, 186 | d = "cosangle", 187 | # d = "cor", 188 | na.rm = TRUE) 189 | dim(dist) 190 | 191 | hobj = hopach(data_mat, dmat = dist) 192 | 193 | # Number of clusters identified. 194 | hobj$clust$k 195 | 196 | # Review sizes of each cluster. 197 | hobj$clust$sizes 198 | 199 | # This plot is recommended but does not seem that useful. 200 | dplot(dist, hobj, ord="final", main="Distance matrix", showclusters = FALSE) 201 | 202 | # Bootstrap analysis 203 | # TODO: identify how to set seed. 204 | bobj = boothopach(data_mat, hobj, B = 100) 205 | 206 | # Bootstrap plot of the resampling results. This call is adapted 207 | # from the hopach package vignette 208 | # (code chunk "bootplot", eval = FALSE there). 209 | bootplot(bobj, hobj, ord = "bootp", 210 | main = "Bootstrap plot", showclusters = FALSE) 211 | 212 | ``` 213 | 214 | ## Resources 215 | 216 | Please see the package references - several great articles in there, especially kmed. Kaufman and Rousseeuw (1990) is one of the classic textbooks.
217 | -------------------------------------------------------------------------------- /Fall2018/4-medoids/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Fall2018/4-medoids/readme.md -------------------------------------------------------------------------------- /Fall2018/5-Oct30-tSNE/r-tSNE.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "tSNE" 3 | author: "Evan" 4 | date: "10/24/2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | ```{r set-options, echo = FALSE, cache = FALSE} 12 | options(width = 140) 13 | ``` 14 | 15 | # tSNE! 16 | t-distributed stochastic neighbor embedding (tSNE) is a nonlinear, nonparametric, and unsupervised dimension reduction machine learning algorithm. It is used to find patterns in high-dimensional data. 17 | 18 | Recall that dimension reduction techniques such as [PCA](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/1-sep5-PCA) help us reduce high-dimensional linear data into a reduced feature space, such as 2 or 3 main axes of "distilled" variation that can be efficiently visualized. 19 | 20 | These visualizations often look a little nicer than those for PCA because instead of plotting distances between observations, tSNE plots the _probabilities_! This is based on [Kullback-Leibler divergences](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) (the loss function). It becomes difficult to say what PCA data separation looks like in higher-dimensional space because it can be dubious to extrapolate lower-dimensional representations into higher ones. 21 | 22 | ### Some key hyperparameters include: 23 | * dims - the number of dimensions to be returned. 24 | * [Perplexity](https://en.wikipedia.org/wiki/Perplexity) - essentially the number of nearest neighbors, but in the curved/surface-like [manifold](https://stats.stackexchange.com/questions/289467/what-is-a-manifold) setting instead of straight-line distances. Should be less than the number of observations, but it is not that simple... 25 | * theta - the Barnes-Hut tradeoff, ranging from 0 to 1. This is the speed/accuracy tradeoff: lower values give slower but more accurate optimizations. 0.0 returns the exact tSNE value (defaults to 0.5). 26 | * eta - learning rate. 27 | * check_duplicates - should duplicate observations be removed? 28 | 29 | # Package installation 30 | Run these lines manually if you need to install or update the following packages: 31 | ```{r} 32 | if (FALSE) { 33 | install.packages(c( 34 | # train/test data splitting 35 | "caret", 36 | # Our sole ML algorithm this time around 37 | "randomForest", 38 | # tSNE algorithms 39 | "Rtsne", "tsne" 40 | )) 41 | } 42 | ``` 43 | 44 | Load the required packages: 45 | ```{r} 46 | library(caret) 47 | library(randomForest) 48 | library(Rtsne) 49 | library(tsne) 50 | ``` 51 | 52 | # Load the `iris` dataset 53 | ```{r} 54 | data(iris) 55 | 56 | # Learn about the data 57 | ?iris 58 | 59 | # View its structure 60 | str(iris) 61 | 62 | # How many of each species? 63 | table(iris$Species) 64 | ``` 65 | 66 | # Goals 67 | We will fit one model using the tsne package and one using the Rtsne package. Then, we will use the Rtsne model to add coordinates to our dataset and to train and evaluate a random forest algorithm on these new data.
68 | 69 | # `tsne` package 70 | Here, the help files outline a concise way to fit the tSNE algorithm via a brief plotting function: 71 | ```{r} 72 | # Define colors for plotting 73 | colors = rainbow(length(unique(iris$Species))) 74 | 75 | # Assign one color to each species 76 | names(colors) = unique(iris$Species) 77 | colors 78 | 79 | # Define the function 80 | ecb = function(x,y){ 81 | plot(x,t = 'n') 82 | text(x,labels = iris$Species, col = colors[iris$Species]) 83 | } 84 | 85 | # Fit 86 | set.seed(1) 87 | system.time({ 88 | tsne_iris = tsne::tsne(iris[, -5], epoch_callback = ecb, perplexity = 50) 89 | }) 90 | ``` 91 | 92 | ### `Rtsne` example 93 | Rtsne provides clearer hyperparameters, better help, and more flexibility compared to the tsne model. 94 | ```{r} 95 | # You might want to remove duplicate observations (even if they are stochastic)... (so that you are not computing distances between two identical points?) 96 | 97 | set.seed(1) 98 | Rtsne_iris <- Rtsne::Rtsne(as.matrix(iris[, -5]), 99 | # Return just the first two dimensions 100 | dims = 2, 101 | # Let's set perplexity to 5% of the number of rows 102 | # Try setting it to a larger value as well, like 25% 103 | perplexity = nrow(iris) * 0.05, 104 | # try changing theta to 0.0 to see what happens 105 | theta = 0.5, 106 | # change eta to 0 and see what happens! 107 | eta = 1, 108 | # Tell the algorithm it is okay to have duplicate rows 109 | check_duplicates = F) 110 | # Unpack! 111 | names(Rtsne_iris) 112 | 113 | # Plot first two dimensions 114 | plot(Rtsne_iris$Y[, 1:2],col = iris$Species) 115 | ``` 116 | 117 | # Visual comparison to PCA 118 | ```{r} 119 | pca_iris = princomp(iris[,1:4])$scores[,1:2] 120 | plot(pca_iris, t = 'n') 121 | text(pca_iris, labels = iris$Species, col = colors[iris$Species]) 122 | ``` 123 | 124 | # A machine learning example 125 | Let's recapitulate [Mark Borg's walkthrough here](https://mark-borg.github.io/blog/2016/tsne-ml/). Let's keep working with our `Rtsne_iris` model from above. cbind the tSNE coordinates into our dataset in order to fit a random forest on this new dataset! 126 | ```{r} 127 | # Add tSNE coordinates via cbind 128 | data = cbind(iris, Rtsne_iris$Y) 129 | 130 | # Rename the new columns 131 | colnames(data)[6] = "tSNE_Dim1" 132 | colnames(data)[7] = "tSNE_Dim2" 133 | 134 | # Check out the dataset 135 | head(data) 136 | 137 | # Split the data 138 | set.seed(1) 139 | split = caret::createDataPartition(data$Species, p = 0.75, list = FALSE) 140 | training_set = data[split,] 141 | test_set = data[-split,] 142 | 143 | # Identify species "target" variable and predictors for train and test sets 144 | X_train = training_set[, -5] 145 | Y_train = training_set$Species 146 | 147 | X_test = test_set[, -5] 148 | Y_test = test_set$Species 149 | ``` 150 | 151 | Fit the random forest: 152 | ```{r, echo = T, results = "hide"} 153 | set.seed(1) 154 | RF = randomForest(X_train, Y_train, X_test, Y_test, 155 | ntree = 500, 156 | proximity = T, 157 | importance = T, 158 | keep.forest = T, 159 | do.trace = T) 160 | ``` 161 | ```{r} 162 | predicted = predict(RF, X_test) 163 | table(predicted, Y_test) 164 | mean(predicted == Y_test) 165 | varImpPlot(RF) 166 | ``` 167 | 168 | # Resources 169 | [tSNE FAQ](https://lvdmaaten.github.io/tsne/). Laurens van der Maaten blog. 170 | 171 | Cao, Y and L Wang. 2017. [Automatic selection of t-SNE perplexity.](https://arxiv.org/pdf/1708.03229.pdf) Journal of Machine Learning Research: Workshop and Conference Proceedings 1:1-7. 172 | 173 | Linderman, GC and S. 
Stenerberger. 2017. [Clustering with t-SNE, provably.](https://arxiv.org/pdf/1706.02582.pdf) arXiv:1706.02582 [cs.LG]. 174 | 175 | Pezzotti et al. 2017. [Approximated and user steerable tSNE for progressive visual analytics.](https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7473883&tag=1) IEEE Transactions on Visualization and Computer Graphics 23:1739-1752. 176 | 177 | Schubert E. and M. Gertz. 2017. [Intrinsic t-stochastic neighbor embedding for visualization and outlier detection: A remedy against the curse of dimensionality?](https://pdfs.semanticscholar.org/97a0/d8798aec210c68a8532d907e4e7c193754a6.pdf) In: Beecks C., Borutta F., Kröger P., Seidl T. (eds) Similarity Search and Applications (SISAP). Lecture Notes in Computer Science, Springer, 10609:188-203. 178 | 179 | Wattenberg et al. 2016. [How to use t-SNE effectively](https://distill.pub/2016/misread-tsne/) 180 | 181 | colah's blog. 2015. [Visualizing representations: Deep learning and human beings.](https://colah.github.io/posts/2015-01-Visualizing-Representations/) 182 | 183 | Wang W et al. 2015. [On deep multi-view representation learning.](http://proceedings.mlr.press/v37/wangb15.pdf) Journal of Machine Learning Research: Workshop and Conference Proceedings 37. 184 | 185 | van der Maaten, LJP. 2014. [Accelerating t-SNE using Tree-Based Algorithms.](http://jmlr.org/papers/volume15/vandermaaten14a/vandermaaten14a.pdf) Journal of Machine Learning Research, 15:3221-3245. 186 | 187 | Hamel, P and D. Eck. 2010. [Learning features from music audio with deep belief networks.](http://www.mirlab.org/conference_papers/international_conference/ISMIR%202010/ISMIR_2010_papers/ismir2010-58.pdf) 11th International Society for Music Information Retrieval Conference 339-344. 188 | 189 | Jamieson AR et al. 2010. [Exploring nonlinear feature space dimension reduction and data representation in breast CADx with Laplacian eigenmaps and t-SNE.](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2807447/) Medical Physics 37:339-351. 190 | 191 | van der Maaten, LJP. 2009. [Learning a Parametric Embedding by Preserving Local Structure.](https://lvdmaaten.github.io/publications/papers/AISTATS_2009.pdf) In Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AISTATS), Journal of Machine Learning Research Workshop and Conference Proceedings 5:384-391. 192 | 193 | van der Maaten LJP and GE Hinton. 2008. [Visualizing Data Using t-SNE.](http://www.jmlr.org/papers/volume9/vandermaaten08a/vandermaaten08a.pdf) Journal of Machine Learning Research 9:2579-2605. 194 | 195 | Also check out [umapr](https://ropensci.org/blog/2018/08/01/umapr/) and [uwot](https://github.com/jlmelville/uwot). -------------------------------------------------------------------------------- /Fall2018/6-nov14-umap/umap-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "UMAP: Uniform Manifold Approximation and Projection" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## Background 11 | 12 | * Arxiv paper: (https://arxiv.org/abs/1802.03426) 13 | * PyData 2018 talk (PCA, tSNE, and UMAP): (https://www.youtube.com/watch?v=YPJQydzTLwQ) 14 | * PyCon 2018 talk: (https://www.youtube.com/watch?v=nq6iPZVUxZU) 15 | 16 | ## Data prep 17 | 18 | We're trying out a birth weight dataset. 
19 | 20 | ```{r} 21 | data = MASS::birthwt 22 | summary(data) 23 | ?MASS::birthwt 24 | data$race = factor(data$race, labels = c("white", "black", "other")) 25 | str(data) 26 | 27 | # Create a list to hold different variables. 28 | vars = list( 29 | # Birth weight or low are generally our outcomes for supervised analyses. 30 | outcomes = c("bwt", "low"), 31 | 32 | # Variables we want to exclude from our analysis - none currently. 33 | exclude = NULL 34 | ) 35 | 36 | vars$covariates = setdiff(names(data), vars$outcomes) 37 | 38 | # Review our data structure. 39 | vars 40 | 41 | dplyr::glimpse(data[vars$covariates]) 42 | sapply(data[vars$covariates], class) 43 | ``` 44 | 45 | ```{r} 46 | library(umap) 47 | class(data[vars$covariates]) 48 | # Convert factor to indicators and remove intercept column. 49 | data_mat = model.matrix(~ ., data[vars$covariates])[, -1] 50 | summary(data_mat) 51 | 52 | # Conduct UMAP analysis of our matrix data, setting a random seed. 53 | result = umap(data_mat, random_state = 1) 54 | ``` 55 | 56 | ## Plot UMAP 57 | 58 | ```{r umap_plot} 59 | dim(result) 60 | class(result) 61 | 62 | library(ggplot2) 63 | 64 | # Compile results into a dataframe. 65 | plot_data = data.frame(x = result$layout[, 1], 66 | y = result$layout[, 2], 67 | data[, vars$outcomes]) 68 | 69 | # Create an initial plot object. 70 | p = ggplot(data = plot_data, aes(x = x, y = y, color = low)) + 71 | theme_minimal() 72 | 73 | # Plot binary outcome 74 | p + geom_point() + ggtitle("Low birth weight = 1") 75 | 76 | # Compare to continuous outcome. 77 | p + geom_point(aes(color = bwt)) + ggtitle("Continuous birth weight") 78 | ``` 79 | 80 | ## Hyperparameters 81 | 82 | ```{r} 83 | # Review default settings. 84 | umap.defaults 85 | 86 | config = umap.defaults 87 | 88 | # Set a seed. 89 | config$random_state = 1 90 | config$n_neighbors = 30 91 | 92 | result2 = umap(data_mat, config) 93 | 94 | p + geom_point(aes(x = result2$layout[, 1], 95 | y = result2$layout[, 2])) 96 | 97 | # Try even more neighbors. 98 | config$n_neighbors = 60 99 | 100 | result3 = umap(data_mat, config) 101 | 102 | p + geom_point(aes(x = result3$layout[, 1], 103 | y = result3$layout[, 2])) 104 | ``` 105 | 106 | More info on hyperparameters on the [umap-learn python documentation page](https://umap-learn.readthedocs.io/en/latest/parameters.html). 107 | 108 | ```{r} 109 | ?umap 110 | ``` 111 | 112 | ## Challenge 113 | 114 | * Compare to tSNE using code from our last meeting. 115 | * Other datasets to try: MNIST, iris, your own dataset. 
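For the first challenge bullet, here is one possible starting point: a minimal sketch (not a worked solution) that runs tSNE on the same `data_mat` used for UMAP above and reuses the ggplot2 styling from earlier in this file. It assumes the Rtsne package from our tSNE meeting is installed; the perplexity and theta values below are arbitrary starting points, not recommendations.

```{r tsne_sketch}
# Hedged sketch: tSNE on the same matrix, for a side-by-side comparison with
# the UMAP layouts above. Assumes `data_mat`, `data`, and `vars` are in memory.
library(Rtsne)

set.seed(1)
tsne_result = Rtsne(data_mat, dims = 2,
                    perplexity = 15,   # arbitrary; try several values
                    theta = 0.5,
                    check_duplicates = FALSE)

# Compile the tSNE layout plus the outcome columns into a dataframe.
tsne_plot_data = data.frame(x = tsne_result$Y[, 1],
                            y = tsne_result$Y[, 2],
                            data[, vars$outcomes])

ggplot(tsne_plot_data, aes(x = x, y = y, color = low)) +
  geom_point() +
  theme_minimal() +
  ggtitle("tSNE layout on the same birthwt matrix (compare to UMAP above)")
```

The axes themselves are not comparable across methods; what is worth comparing is whether the low birth weight observations group together similarly in the tSNE and UMAP layouts.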
116 | 117 | 118 | 119 | -------------------------------------------------------------------------------- /MachineLearningWG.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /Math4ML_2017/Math4ML notes July 19.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4ML notes July 19.docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4ML notes July 5th .docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4ML notes July 5th .docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4MLJune7.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4MLJune7.docx -------------------------------------------------------------------------------- /Math4ML_2017/Math4MLMay24.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Math4ML_2017/Math4MLMay24.docx -------------------------------------------------------------------------------- /Math4ML_2017/README.md: -------------------------------------------------------------------------------- 1 | # Math for machine learning 2 | This is the summer 2017 reading group for the D-Lab Machine Learning Working Group. 3 | 4 | ### Dates: 5 | 6 | ### Starter resources: 7 | [Floridi L. 2017. Robots, jobs, taxes, and responsibilities. Philosophy & Technology 30:1-4](https://link.springer.com/article/10.1007/s13347-017-0257-3) 8 | 9 | [IBM Data Science Experience. The mathematics of machine learning](http://datascience.ibm.com/blog/the-mathematics-of-machine-learning/) 10 | 11 | [Jordan MI. 1986/1987. An introduction to linear algebra in parallel distributed processing. Parallel distributed processing 1: 365-422.](https://www.cs.cmu.edu/afs/cs/academic/class/15883-f15/readings/jordan-1986-ch9.pdf) 12 | 13 | [Li H. 2017. Which machine learning algorithm should I use? SAS blog (Subconscious Musings)](http://blogs.sas.com/content/subconsciousmusings/2017/04/12/machine-learning-algorithm-use/) 14 | 15 | [MIT 18.657 Mathematics of Machine Learning - course syllabus and resources](https://ocw.mit.edu/courses/mathematics/18-657-mathematics-of-machine-learning-fall-2015/syllabus/) 16 | 17 | [Rahm E, Do HH. 2000. Data cleaning: Problems and current approaches. IEEE Data Eng. Bull. 23.4:3-13](https://dbs.uni-leipzig.de/en/publication/title/data_cleaning_problems_and_current_approaches) 18 | 19 | [Valiant LG. 1984. A theory of the learnable. 
Communications of the ACM 27:1134-1142](https://people.mpi-inf.mpg.de/~mehlhorn/SeminarEvolvability/ValiantLearnable.pdf) -------------------------------------------------------------------------------- /R and Python installation help.txt: -------------------------------------------------------------------------------- 1 | Before class, please download and install R Studio: 2 | https://www.rstudio.com/products/rstudio/download3/ 3 | 4 | If your installation does not work and says you need to install the binary files, please do so here: 5 | https://cloud.r-project.org/ 6 | 7 | Also download and install Python by following these instructions: 8 | https://github.com/dlab-berkeley/python-intensive/blob/master/Install.md 9 | (you can also just pip install scikit-learn if you have Python but not Anaconda). 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning Working Group, Fall 2018 2 | 3 | We meet on alternating Wednesdays from 3-5pm at D-Lab (Barrows 356). We have no expectation of prior machine learning experience, and simply go through one algorithm a meeting, with about 30 minutes each in R & Python. We also incorporate lightning talks and other guest presentations throughout our meetings. 4 | 5 | **Fall 2018 - unsupervised methods** 6 | - Sept. 5: [Principal component analysis (PCA)](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/1-sep5-PCA) 7 | - Sept. 19: [K-means clustering](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/2-sep19-k-means) 8 | - Oct. 3: [Hierarchical clustering](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/3-oct3-hier_agg_clust) 9 | - Oct. 17: [Medoid partitioning](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/4-medoids) 10 | - Oct. 31: tSNE 11 | - Nov. 14: [UMAP](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2018/6-nov14-umap) 12 | - Dec. 12: Lightning talks 13 | 14 | We are always looking for student/staff/faculty presenters. Please contact us if you are interested! 15 | 16 | More information on the [D-Lab MLWG website](http://dlab.berkeley.edu/working-groups/machine-learning-working-group-0) 17 | 18 | ## Previous Semesters 19 | 20 | * [Spring 2018](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Spring2018) 21 | - k-nearest neighbors 22 | - decision tree 23 | - random forest 24 | - gradient boosting 25 | - elastic net 26 | * [Fall 2017](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2017) 27 | - basics of neural networks for image processing 28 | * [Spring 2017](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Spring2017) 29 | - k-nearest neighbors 30 | - stepwise regression 31 | - linear and polynomial regression, smoothing splines 32 | - multivariate adaptive regression splines and generalized additive models 33 | - support vector machines 34 | - neural networks. 35 | * [Fall 2016](https://github.com/dlab-berkeley/MachineLearningWG/tree/master/Fall2016) 36 | - decision trees, random forests, penalized regression, and boosting 37 | 38 | ## Resources 39 | 40 | Books: 41 | 42 | 1. **Intro to Statistical Learning** by James et al. [(free pdf)](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) [(Amazon)](https://smile.amazon.com/Introduction-Statistical-Learning-Applications-Statistics-ebook/dp/B01IBM7790/) 43 | 2. 
**Applied Predictive Modeling** by Max Kuhn [(Amazon)](https://smile.amazon.com/Applied-Predictive-Modeling-Max-Kuhn-ebook/dp/B00K15TZU0/) 44 | 3. **Python Data Science Handbook** by Jake VanderPlas [(online version)](https://jakevdp.github.io/PythonDataScienceHandbook/) 45 | 4. **Elements of Statistical Learning** by Hastie et al. [(free pdf)](http://statweb.stanford.edu/~tibs/ElemStatLearn/download.html) [(Amazon)](https://smile.amazon.com/Elements-Statistical-Learning-Prediction-Statistics-ebook/dp/B00475AS2E/) 46 | 5. **Modern Multivariate Statistical Techniques** by Alan Izenman [(Amazon)](https://smile.amazon.com/Modern-Multivariate-Statistical-Techniques-Classification-ebook/dp/B00HWUR9CS/) 47 | 6. **Differential Equations and Linear Algebra** by Stephen Goode and Scott Annin [(Amazon)](https://www.amazon.com/Differential-Equations-Linear-Algebra-Stephen-ebook/dp/B00HR7MR3W/ref=mt_kindle?_encoding=UTF8&me=) 48 | 7. **Statistical Learning with Sparsity: The Lasso and Generalizations** by Trevor Hastie, Robert Tibshirani, and Martin Wainwright [(free pdf)](https://web.stanford.edu/~hastie/StatLearnSparsity/) [(Amazon)](https://www.amazon.com/Statistical-Learning-Sparsity-Generalizations-Probability/dp/1498712169/ref=sr_1_fkmrnull_1?crid=2OQXF1KIQYDUX&keywords=statistical+learning+with+sparsity+the+lasso+and+generalizations&qid=1552196190&s=gateway&sprefix=Statistical+Learning+with+Sparsity%3A+the+Lasso+and+gener%2Caps%2C178&sr=8-1-fkmrnull) and 49 | 50 | Help: 51 | 52 | * [Getting Help with R](https://www.r-project.org/help.html) 53 | * [Stack Overflow](https://stackoverflow.com/questions/tagged/r) 54 | * [Quick-R](https://www.statmethods.net/) 55 | * [R-Bloggers](https://www.r-bloggers.com/) 56 | 57 | Courses at Berkeley: 58 | 59 | * Stat 154 - Statistical Learning 60 | * CS 189 / CS 289A - Machine Learning 61 | * COMPSCI x460 - [Practical Machine Learning with R](https://extension.berkeley.edu/search/publicCourseSearchDetails.do?method=load&courseId=17483923&selectedProgramAreaId=15499&selectedProgramStreamId=25856348) [UC Berkeley Extension] 62 | * PH 252D - Causal Inference 63 | * PH 295 - Big Data 64 | * PH 295 - Targeted Learning for Biomedical Big Data 65 | 66 | Online classes: 67 | 68 | * [Tibshirani and Hastie's Statistical Learning Free Course](https://lagunita.stanford.edu/courses/HumanitiesSciences/StatLearning/Winter2016/about) 69 | * [Coursera Data Science Specialization](https://www.coursera.org/specializations/jhu-data-science) 70 | * [edX - Principles of Machine Learning](https://www.edx.org/course/principles-machine-learning-microsoft-dat203-2x-2) 71 | * [edX - Applied Machine Learning](https://www.edx.org/course/applied-machine-learning-microsoft-dat203-3x-0) 72 | * [Coursera - Machine Learning](https://www.coursera.org/learn/machine-learning) 73 | 74 | Other Campus Groups: 75 | 76 | * [D-Lab's Cloud Computing Working Group](http://dlab.berkeley.edu/working-groups/cloud-working-group-0) 77 | * [D-Lab's Computational Text Analysis Working Group](http://dlabctawg.github.io/) 78 | * [The Hacker Within](http://www.thehackerwithin.org/berkeley/) / [Berkeley Institute for Data Science](https://bids.berkeley.edu/) 79 | * [Machine Learning @ Berkeley](https://ml.berkeley.edu/) 80 | * [Berkeley Statistics and Machine Learning Discussion Group](https://bids.berkeley.edu/news/bids-launches-new-berkeley-statistics-and-machine-learning-discussion-group) 81 | -------------------------------------------------------------------------------- /Spring2017/Apr14-svm/proj.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Apr14-svm/proj.png -------------------------------------------------------------------------------- /Spring2017/Apr14-svm/r-svm.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Support vector machines in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Polynomial kernels 12 | * Radial basis function (RBF) kernel 13 | * Hyperparameter optimization 14 | 15 | Before we dig in, we will install the R packages we'll be using. 16 | 17 | **R packages** 18 | ```{r} 19 | # List of packages we will use. 20 | packages = c("MASS", "kernlab", "devtools") 21 | 22 | # Try to load each package and save the result. 23 | success = sapply(packages, require, character.only = T, quietly = T) 24 | 25 | # Check if any packages still need to be installed. 26 | if (sum(!success) > 0) { 27 | # Install any needed packages. 28 | install.packages(packages[!success]) 29 | 30 | # Load the newly installed packages. 31 | sapply(packages[!success], require, character.only = T, quietly = T) 32 | } 33 | 34 | github_packages = c( 35 | # Chris's tools package for plotting the SuperLearner. 36 | "ck37/ck37r", 37 | # Use more up-to-date SuperLearner from github. 38 | "ecpolley/SuperLearner") 39 | 40 | devtools::install_github(github_packages) 41 | 42 | # Clean up variables. 43 | rm(packages, success, github_packages) 44 | ``` 45 | 46 | # Background 47 | 48 | Deb has done a great job covering the background & theory for SVM in the python materials. See also Intro to Statistical Learning Chapter 9, Applied Predictive Modeling sections 7.3, 13.4, 13.7.4, or pretty much any other machine learning textbook. 49 | 50 | Motivational quote: "SVMs not only have a more solid foundation than artificial neural nets, but are able to serve as a replacement for neural nets." - Learning with Kernels, 2002. 51 | 52 | # Software packages 53 | 54 | There are many R packages that implement support vector machines. `e1071` is one of the oldest, but we will use `kernlab` because it implements more kernels and has a broader set of features. `klaR` and `svmPath` are other options; `svmPath` is particularly designed to be fast. 55 | 56 | # Data preparation 57 | 58 | ```{r} 59 | data(Boston, package = "MASS") 60 | 61 | # Remove our outcome variable from the covariate list. 62 | X_df = Boston[, -14] 63 | 64 | # Convert X from a dataframe to a matrix. 65 | X_mat = model.matrix(~ ., data = X_df) 66 | 67 | # Notice the extra intercept column added by model.matrix. 68 | colnames(X_mat) 69 | 70 | # Remove extra intercept term. 71 | X_mat = X_mat[, -1] 72 | 73 | # Regression (continuous) version of our outcome variable. 74 | Y_reg = Boston$medv 75 | 76 | # Review outcome distribution. 77 | summary(Y_reg) 78 | 79 | # Classification (binary) version of our outcome variable. 80 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 81 | 82 | # Review outcome distribution. 83 | table(Y_class) 84 | prop.table(table(Y_class)) 85 | 86 | ``` 87 | 88 | Also note that the SVM algorithms will internally center (mean 0) and scale all variables so that they can be compared to each other, as we did with K-nearest neighbors (lasso also does this). 
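To see what that internal standardization amounts to, here is a small illustrative sketch (not part of the original walkthrough) using base R's `scale()`; in `kernlab`, `ksvm()` exposes this behavior through its `scaled` argument, so you normally do not need to do it yourself.

```{r}
# Sketch: manual centering/scaling, just to see what the SVM does internally.
X_scaled = scale(X_mat, center = TRUE, scale = TRUE)

# After scaling, each column has mean ~0 and standard deviation 1.
round(colMeans(X_scaled), 3)
round(apply(X_scaled, 2, sd), 3)
```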
89 | 90 | # Polynomial kernels 91 | 92 | Polynomial kernels are the simplest kernel you would want to use with SVM. You can do a linear kernel (i.e. no polynomial expansion) but that's essentially equivalent to OLS and is only done if the data is so big that a better kernel is computationlly infeasible. 93 | 94 | ```{r errors=T} 95 | library(kernlab) 96 | 97 | # Regression version. 98 | # X should be a matrix, not a dataframe. 99 | fit = ksvm(x = X_mat, y = Y_reg, kernel = "polydot", kpar = list(degree = 3)) 100 | fit 101 | 102 | # Compare to using a dataframe rather than a matrix. 103 | # The error message is particularly unhelpful. 104 | fit = ksvm(x = X, y = Y_reg, kernel = "polydot") 105 | 106 | # Classification version. 107 | # Y should be a factor so that classification is run automatically. 108 | fit = ksvm(x = X_mat, y = Y_class, kernel = "polydot", kpar = list(degree = 3)) 109 | fit 110 | 111 | ``` 112 | 113 | # Radial basis function kernels 114 | 115 | RBF is the best kernel if you have to pick one. 116 | 117 | ```{r} 118 | 119 | # Regression version. 120 | fit = ksvm(x = X_mat, y = Y_reg, kernel = "rbfdot") 121 | fit 122 | 123 | # Classification 124 | fit = ksvm(x = X_mat, y = Y_class, kernel = "rbfdot") 125 | fit 126 | 127 | ``` 128 | 129 | # Hyperparameter optimization 130 | 131 | There are two hyperparameters that can potentially have major effects on the performance of SVM: the amount of regularization (called "C" often), and the bandwidth (scale parameter often called gamma or sigma). 132 | 133 | The regularization parameter C is the non-negative error budget for the number of misclassifications allowed, critical to establish bias-variance trade-off. When C is small we want low bias but high variance, and the reverse when C is large. Thorough C grid points: $C \in \{2^{−5},2^{−3},...,2^{15}\}$ 134 | 135 | The scale parameter Gamma ($\gamma$) aka Sigma ($\sigma$) is effectively the inverse bandwidth of the SVM kernel. So a large gamma/sigma corresponds to a wide bandwidth used to calculate proximity, meaning that a wider range of points are incorporated. When gamma/sigma is small only very nearby observations are used. Thorough grid points: $\gamma or \sigma \in \{2^{-15}, 2^{-13}, ..., 2^{3}\}$. Notably, a good initial guess is generated by kernlab's `sigest` function, which may allow one to skip optimizing this hyperparameter. 136 | 137 | ```{r} 138 | 139 | # Make sure you have the latest version from github. 140 | library(SuperLearner) 141 | 142 | tuning_list = list( 143 | # Try different kernels. 144 | # Unfortunately we cannot currently customize the degree for the polynomial kernel when 145 | # using create.Learner() - this will be fixed. We could make the functions manually though. 146 | kernel = c("polydot", "rbfdot", "laplacedot"), 147 | # Regularization parameter, could be 2^-5 to 2^15. 148 | C = 2^c(-4, -2, 0, 2, 4, 6, 8) 149 | ) 150 | 151 | # Review the C values we are testing. 152 | tuning_list$C 153 | 154 | svm_learners = create.Learner("SL.ksvm", detailed_names = T, 155 | tune = tuning_list) 156 | 157 | # Review the learners that were created. 158 | svm_learners$names 159 | 160 | sl_lib = c("SL.mean", "SL.glm", "SL.ksvm", svm_learners$names) 161 | 162 | set.seed(1) 163 | 164 | Y_num = as.numeric(Y_class) - 1 165 | table(Y_num, Y_class) 166 | 167 | # Currently displays some extra output unfortunately. 
168 | result = SuperLearner(Y = Y_num, X = Boston[, -14], family = "binomial", 169 | SL.library = sl_lib) 170 | result 171 | 172 | # Use plot.SuperLearner() from here. 173 | library(ck37r) 174 | 175 | # Plot the results. Use CV.SuperLearner to also plot the SL performance. 176 | # We need to pass in the outcome variable for this plot. 177 | plot(result, Y_num) 178 | ``` 179 | 180 | # Other notes 181 | 182 | SVM has to compute the kernel function for every pair of observations, so it is not ideal when you have many observations. It's best with a reasonable number of observations but possibly a large number of covariates. 183 | 184 | # Further reading on SVMs 185 | 186 | Scholkopf, B., & Smola, A. J. (2002). Learning with kernels: support vector machines, regularization, optimization, and beyond. MIT press. -------------------------------------------------------------------------------- /Spring2017/Apr28-neural-nets/nn-from-scratch-3-layer-network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Apr28-neural-nets/nn-from-scratch-3-layer-network.png -------------------------------------------------------------------------------- /Spring2017/Apr28-neural-nets/r-neural-nets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Neural networks in R" 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | Topics to cover: 9 | 10 | * Background 11 | * Single-layer networks 12 | * Multi-layer networks 13 | * Possibly more 14 | 15 | 16 | Before we dig in, we will install the R packages we'll be using. 17 | 18 | **R packages** 19 | ```{r} 20 | # List of packages we will use. 21 | packages = c("MASS", "nnet", "h2o", "devtools", "NeuralNetTools") 22 | 23 | github_packages = c( 24 | # Chris's tools package for plotting the SuperLearner. 25 | "ck37r" = "ck37/ck37r", 26 | # Use more up-to-date SuperLearner from github. 27 | "SuperLearner" = "ecpolley/SuperLearner") 28 | 29 | devtools::install_github(github_packages) 30 | 31 | # Load those github packages. 32 | ck37r::load_packages(names(github_packages)) 33 | 34 | # Load required non-github packages and install from CRAN if necessary. 35 | ck37r::load_packages(packages, install = T) 36 | 37 | # Also install mxnet for potential usage. 38 | # This unfortunately is Mac/Windows only; probably will not work for Linux. 39 | # Actually not working for Mac either. 40 | if (F) { 41 | # Skip this for now. 42 | install.packages("drat", repos="https://cran.rstudio.com") 43 | drat:::addRepo("dmlc") 44 | install.packages("mxnet") 45 | } 46 | 47 | # Clean up variables. 48 | rm(packages, success, github_packages) 49 | ``` 50 | 51 | # Background 52 | 53 | Please see Deb's python code for more details on neural network theory. 54 | 55 | # Software packages 56 | 57 | We'll be using `nnet` for simple neural networks and `h2o` for deep neural networks. 58 | 59 | # Data preparation 60 | 61 | ```{r} 62 | data(Boston, package = "MASS") 63 | 64 | # Remove our outcome variable from the covariate list. 65 | X_df = Boston[, -14] 66 | 67 | # Convert X from a dataframe to a matrix. 68 | X_mat = model.matrix(~ ., data = X_df) 69 | 70 | # Notice the extra intercept column added by model.matrix. 71 | colnames(X_mat) 72 | 73 | # Remove extra intercept term. 74 | X_mat = X_mat[, -1] 75 | 76 | # Regression (continuous) version of our outcome variable. 
77 | Y_reg = Boston$medv 78 | 79 | # Review outcome distribution. 80 | summary(Y_reg) 81 | 82 | # Classification (binary) version of our outcome variable. 83 | Y_class = as.factor(as.numeric(Boston$medv > 23)) 84 | 85 | # Review outcome distribution. 86 | table(Y_class) 87 | prop.table(table(Y_class)) 88 | 89 | ``` 90 | 91 | # Single-layer neural network 92 | 93 | 94 | Quick classification example 95 | 96 | ```{r} 97 | library(nnet) 98 | 99 | # Classification 100 | 101 | # Set seed because weights are initialized randomly. 102 | set.seed(1) 103 | 104 | # X can be a dataframe or matrix. 105 | # If Y is a factor we need to use this formula notation. 106 | fit = nnet(Y_class ~ X_mat, size = 2, decay = 5e-4, maxit = 200) 107 | 108 | # Review our neural network fit. 109 | fit 110 | 111 | # Plot our neural network. 112 | library(NeuralNetTools) 113 | plotnet(fit) 114 | 115 | # Predict back to our original data. 116 | pred = predict(fit, X_mat) 117 | 118 | # Review predictions. 119 | summary(pred) 120 | 121 | # 122 | ``` 123 | 124 | Quick regression example 125 | 126 | ```{r} 127 | library(nnet) 128 | 129 | # Set seed because weights are initialized randomly. 130 | set.seed(1) 131 | 132 | # Again, X can be a dataframe or matrix. 133 | fit = nnet(Y_reg ~ X_mat, size = 2, decay = 5e-4, maxit = 200, 134 | # Enable linear output to support regression. 135 | linout = T) 136 | 137 | # Challenge: try with linout = F (the default) and see what happens. 138 | 139 | # Review our neural network fit. 140 | fit 141 | 142 | # Visualize neural network. 143 | plotnet(fit) 144 | 145 | # Predict back to our original data. 146 | pred = predict(fit, X_mat) 147 | 148 | # Review predictions. 149 | summary(pred) 150 | 151 | # Calculate mean-squared error (MSE). 152 | mean((pred - Y_reg)^2) 153 | 154 | # And root mean squared error (RMSE). 155 | sqrt(mean((pred - Y_reg)^2)) 156 | 157 | ``` 158 | 159 | # SuperLearner optimization 160 | 161 | These challenges can be done in pairs/groups to make it easier. 162 | 163 | Challenge 1: use SL.nnet wrapper to estimate performance of the neural network. 164 | 165 | Challenge 2: use create.Learner() to test 2, 3, 4, or 5 hidden units and create a weighted average ensemble. 166 | 167 | # Multi-layer neural network 168 | 169 | Challenge: use h2o to design this. 170 | 171 | ```{r} 172 | library(h2o) 173 | # Startup and connect to our existing h2o cluster. 174 | # Use all available threads. 175 | h2o.init(nthreads = -1) 176 | 177 | # Clean slate - just in case the cluster was already running. 178 | h2o.removeAll() 179 | 180 | # Load x data into h2o. 181 | data = as.h2o(cbind(X_df, `_outcome` = Y_reg)) 182 | dim(data) 183 | 184 | outcome = "_outcome" 185 | x = colnames(X_df) 186 | 187 | # Fit the deep learning model here. 188 | 189 | # Shutdown server when we're done. 190 | h2o.shutdown(prompt = F) 191 | ``` 192 | 193 | See also Erin LeDell's [excellent tutorial on deep learning](https://github.com/ledell/useR-machine-learning-tutorial/blob/master/deep-neural-networks.Rmd). 
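The multi-layer challenge above leaves the fitting step blank. Below is a hedged sketch of one way to fill in the `# Fit the deep learning model here.` line; it assumes the h2o cluster started in that chunk is still running (i.e., it belongs before `h2o.shutdown()`), and the hidden-layer sizes, epoch count, and seed are arbitrary illustrative choices rather than tuned values.

```{r eval=FALSE}
# Sketch: a small two-hidden-layer network on the Boston data loaded into h2o.
dl_fit = h2o.deeplearning(x = x, y = outcome,
                          training_frame = data,
                          hidden = c(32, 32), # two hidden layers of 32 units
                          epochs = 20,
                          seed = 1)

# Review training performance (MSE/RMSE, since the outcome is continuous).
h2o.performance(dl_fit)
```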
194 | -------------------------------------------------------------------------------- /Spring2017/Feb17-stepwise/r-stepwise-selection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '' 3 | output: 4 | html_document: default 5 | html_notebook: default 6 | --- 7 | 8 | # Stepwise selection 9 | 10 | Topics to cover: 11 | 12 | * Best subset selection 13 | * Forward selection 14 | * Backward selection 15 | * Cross-validation 16 | 17 | Before we dig in, we will install the R packages we'll be using. 18 | 19 | **R packages** 20 | ```{r} 21 | # List of packages we will use. 22 | packages = c("MASS", "mlbench", "SuperLearner", "devtools") 23 | 24 | # Try to load each package and save the result. 25 | success = sapply(packages, require, character.only = T, quietly = T) 26 | 27 | # Check if any packages still need to be installed. 28 | if (sum(!success) > 0) { 29 | # Install any needed packages. 30 | install.packages(packages[!success]) 31 | 32 | # Load the newly installed packages. 33 | sapply(packages[!success], require, character.only = T, quietly = T) 34 | } 35 | 36 | # Install Chris K.'s tools package, which we'll use for imputing missing values. 37 | devtools::install_github("ck37/ck37r") 38 | 39 | # Clean up variables. 40 | rm(packages, success) 41 | ``` 42 | 43 | ## Background 44 | 45 | Stepwise selection, or [stepwise regression](https://en.wikipedia.org/wiki/Stepwise_regression), is a commonly used technique to include a subset of covariates in a regression model. The goal is to increase accuracy compared to including all covariates in the model, because we can often improve model performance by removing some covariates (as we did with lasso / elastic net). Stepwise is a simple form of **feature selection** - choosing a subset of variables for incorporation into a machine learning algorithm. 46 | 47 | ### Best subset selection 48 | 49 | Ideally we would test every possible combination of covariates and use the combination with the best performance. This is **best subset selection**. 50 | 51 | Consider the case of three covariates: X1, X2, and X3. We would estimate the accuracy of the following models: 52 | 53 | * All variables: X1, X2, X3 - our default regression 54 | * X1 and X2 (exclude X3) 55 | * X2 and X3 (exclude X1) 56 | * X1 and X3 (exclude X2) 57 | * X1-only 58 | * X2-only 59 | * X3-only 60 | * No variables (intercept only) 61 | 62 | The one with the best performance (e.g. cross-validated mean-squared error) is the one we would use. Stepwise algorithms are commonly used without cross-validation, and as a result they are usually overfitting the data - capturing random error in addition to true relationships in the data, resulting in worse performance on new data. 63 | 64 | To generalize to any model size, if we have p covariates we would have to check $2^p$ different combinations: each covariate is either included or not (2 possibilities), so combining that for all covariates we have the product of p twos: $2 * 2 * 2...$ which simplifies to $2^p$. With 10 covariates that is 1,024 models to check, with 20 covariates it's a million, etc. 65 | 66 | ### Stepwise selection 67 | 68 | Stepwise selection is a simplification of best subset selection to make it computationally feasible for any number of covariates. It comes in three forms: forward, backward, and combined forward & backward. Confusingly, sometimes "stepwise" is meant to refer specifically to the "both" approach. 
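As a small illustration of the $2^p$ best-subset count described above (not part of the original materials), the sketch below enumerates the include/exclude choices for three hypothetical covariates; the names `X1`, `X2`, `X3` are placeholders, not columns of the dataset used later.

```{r}
# Sketch: each of 3 covariates is either included (TRUE) or excluded (FALSE).
combos = expand.grid(X1 = c(FALSE, TRUE),
                     X2 = c(FALSE, TRUE),
                     X3 = c(FALSE, TRUE))
combos

# 2^3 = 8 candidate models, including the intercept-only (all FALSE) row.
nrow(combos)
```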
69 | 70 | **Forward selection** starts with just the intercept and considers which single variable to incorporate next. It loops over every variable, runs a regression with that variable plus the intercept, and chooses the variable with the best performance on a certain metric: adjusted $R^2$, [f-statistic](https://en.wikipedia.org/wiki/F-test#Regression_problems), [Aikake Information Criterion](https://en.wikipedia.org/wiki/Akaike_information_criterion), or other preferred performance estimate. It then adds that variable to the model and considers the next variable to add, continuing to repeat until no remaining variable improves performance. 71 | 72 | ## Clean dataset 73 | 74 | Let's try out some code. First we prep a demo dataset. 75 | ```{r} 76 | # Load a test dataset. 77 | data(PimaIndiansDiabetes2, package = "mlbench") 78 | 79 | data = PimaIndiansDiabetes2 80 | 81 | # Review data structure. 82 | str(data) 83 | 84 | # Do we have missing values? Yes. 85 | sum(is.na(data)) 86 | 87 | library(ck37r) 88 | 89 | outcome = "diabetes" 90 | 91 | # Impute missing data and add missingness indicators. 92 | # Don't impute the outcome though. 93 | result = ck37r::impute_missing_values(data, skip_vars = outcome) 94 | # Use the imputed dataframe. 95 | data = result$data 96 | 97 | str(data) 98 | 99 | # Now do we have missing values? 100 | sum(is.na(data)) 101 | 102 | # Create a vector just for the outcome variable. 103 | # Convert to numeric for glm(). 104 | Y = as.numeric(data[, outcome] == "pos") 105 | 106 | # Confirm our outcome vector is correct. 107 | table(data[, outcome], Y) 108 | 109 | # Remove the outcome variable from our covariate list. 110 | X = data[, !names(data) == outcome] 111 | 112 | # Confirm our covariates and dimensions are right. 113 | colnames(X) 114 | dim(X) 115 | length(Y) 116 | ``` 117 | 118 | ## Stepwise selection code 119 | 120 | Now let's look at stepwise selection. 121 | 122 | ```{r} 123 | # Fit the intercept-only model. Specify data because we will use later. 124 | initial_reg = glm(Y ~ 1, data = X, family = "binomial") 125 | summary(initial_reg) 126 | 127 | # Define the largest possible model specification. 128 | largest_model = glm(Y ~ ., data = X, family = "binomial") 129 | summary(largest_model) 130 | 131 | # Review step() 132 | ?step 133 | 134 | # Run stepwise forward selection. 135 | step_reg = step(initial_reg, formula(largest_model), 136 | direction = "forward", trace = 0) 137 | step_reg 138 | ``` 139 | 140 | **Backward selection** does the same thing but it starts with all variables in the model and considers which variable to first remove from the model. It checks the performance for each variable when it is removed and removes the variable that is least useful to the regression performance. It continues this until no variable yields an increase in performance upon removal. 141 | 142 | **Challenges** 143 | 144 | 1. How similar are stepwise results compared to the significant covariates from the standard OLS we ran first? Hint: compare `step_reg` with `summary(largest_model)`. 145 | 2. Try running with `trace = 1` to see more details in the stepwise process. 146 | 3. Try running with `direction = "backward"` and then `direction = "both"` - do you get the same variables selected? Hint: with backward you will need to change the first argument to use the full model rather than the intercept-only model. 
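For challenge 3 above, here is a hedged sketch of one possible starting point, reusing the `initial_reg`, `largest_model`, and `step_reg` objects defined earlier (try the challenge on your own first):

```{r}
# Sketch: backward selection starts from the full model and drops variables.
step_back = step(largest_model, direction = "backward", trace = 0)

# "both" can add or drop a variable at each step.
step_both = step(initial_reg, formula(largest_model),
                 direction = "both", trace = 0)

# Compare the variables selected by each approach.
formula(step_reg)
formula(step_back)
formula(step_both)
```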
147 | 148 | ## Cross-validated comparison 149 | 150 | As mentioned earlier, it is critical that we use cross-validation to estimate the accuracy of the stepwise procedure. If we don't we will definitely get an overly optimistic estimate of model performance. 151 | 152 | ```{r} 153 | 154 | sl_lib = c("SL.mean", "SL.glm", "SL.glmnet", "SL.step.forward", "SL.stepAIC") 155 | 156 | set.seed(1) 157 | sl = SuperLearner(Y, X, family = binomial(), SL.library = sl_lib) 158 | sl 159 | ``` 160 | 161 | **Challenges** 162 | 163 | 1. Add in one or two other algorithms we've used. How do they compare to stepwise? 164 | 2. Look at the code for `SL.step.forward` and `SL.stepAIC` - any questions on how they work? 165 | 166 | ## Further reading 167 | 168 | * Intro to Statistical Learning, section 6.1.2 169 | * Applied Predictive Modeling, chapter 19 "Feature Selection". 170 | * ["What are some of the problems with stepwise regression?"](http://www.stata.com/support/faqs/statistics/stepwise-regression-problems/) [CK: note that they are assuming no cross-validation.] 171 | * [Regression Modeling Strategies](https://smile.amazon.com/Regression-Modeling-Strategies-Applications-Statistics-ebook/dp/B0140XQAXI), section 4.3. 172 | * [Statistical Learning from a Regression Perspective](https://smile.amazon.com/Statistical-Learning-Regression-Perspective-Statistics-ebook/dp/B01M333153) section 1.4.6. -------------------------------------------------------------------------------- /Spring2017/Feb3-knn/Feb3kNN-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "k-Nearest Neighbor classification and regression" 3 | author: "Evan Muzzall" 4 | date: "2/3/2017" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | knitr::opts_chunk$set(echo = TRUE, fig.width=9, fig.height=6, width=160) 13 | ``` 14 | 15 | #0. Package installation 16 | Today we will use the following packages. Although we won't use it today, we recommend installing "SuperLearner" as well. 17 | ```{r, eval=FALSE} 18 | install.packages(c("caret", "chemometrics", "class", "FNN", "gmodels", "ggplot2", "MASS", "SuperLearner"), dependencies=TRUE) 19 | library(caret) 20 | library(chemometrics) 21 | library(class) 22 | library(FNN) 23 | library(gmodels) 24 | library(ggplot2) 25 | library(MASS) 26 | library(SuperLearner) 27 | ``` 28 | 29 | #1. What is k-Nearest Neighbors? (kNN) 30 | kNN is a form of "lazy" learning in which data are categorized based on similarities with their "nearest" neighbors. kNN can be thought of as non-parametric instance-based learning. Compared to other algorithms, KNN is simple and makes no assumptions about the underlying data structure. 31 | 32 | The data are treated as coordinates in a multidimensional feature space to organize the desired groups that are identified. kNN is distance-based and distills variation contained within multiple variables into a reduced number of principal axes. 33 | 34 | Euclidean (straight-line Cartesian) distance is one standard for KNN and the distance we will use today. However, many consider Mahalanobis distance a more appropriate multivariate distance for kNN and other statistical tests. See for example: 35 | 36 | [Weinberger et al. 2009. Distance Metric Learning for Large Margin 37 | Nearest Neighbor Classification. Journal of Machine Learning Reseach 10: 207-244](http://jmlr.csail.mit.edu/papers/volume10/weinberger09a/weinberger09a.pdf) 38 | 39 | #2. 
The data 40 | Load the "Boston" housing dataset from the "MASS" R package and check it out: 41 | ```{r} 42 | library(MASS) 43 | data(Boston) 44 | ``` 45 | ```{r, eval=FALSE} 46 | head(Boston) 47 | ?Boston 48 | str(Boston) 49 | ``` 50 | 51 | Today we will walk through classification and then regression using kNN. For classification, we want the response variable to be a factor. For regression, we want the response variable to remain numeric. Thus, we will first make a copy of the "Boston" dataset for the regression, before we arbitrarily convert it to a factor for classification: 52 | ```{r} 53 | B_reg <- Boston 54 | ``` 55 | 56 | Time to get creative! Let's do a little data wrangling and coerce the "dis" variable (weighted mean of distances to five Boston employment centers) into a factor category. The distances will now be categorized as "short", "medium", or "long". 57 | ```{r} 58 | summary(Boston$dis) 59 | Boston$dis <- cut(Boston$dis, 60 | breaks=c(0, 3, 6, 13), 61 | levels=c(1,2,3), 62 | labels=c("short", "medium", "long")) 63 | table(Boston$dis) 64 | ``` 65 | ```{r, eval=FALSE} 66 | str(Boston) 67 | head(Boston, 10) 68 | levels(Boston$dis) 69 | ``` 70 | 71 | #3. Choosing a proper k 72 | The "k" in kNN represents the number of other "neighboring" data points used to classify the point in question. Consider the bias-variance tradeoff when choosing a proper "k". 73 | 74 | [Click here for Jason Brownlee's excellent introduction to the bias-variance tradeoff](http://machinelearningmastery.com/gentle-introduction-to-the-bias-variance-trade-off-in-machine-learning/) 75 | 76 | For example, **if we choose a large "k",** it is easy for the majority class to win because it will always get the most votes and the "nearest neighbors" would not exert their proper influence. Or, **if we choose a tiny "k",** noise and outliers could unduly influence the classification of the point being classified, again disregarding the influence of the other "nearest neighbors". 77 | 78 | For our example, we will set "k" to the square root of the number of training observations (see below). However, this might not result in the best "k". Thus, we will perform cross-validation on 1:50 "k's" to see how misclassification error varies across the different k-values. 79 | #4. Split the data 80 | Now, use caret's handy `createDataPartition` funciton to conduct a stratified random split and divide the Boston data into train and test sets. We choose to put 70% of the data into the training set, and the remaining 30% into the test set. Also create label vectors to be used as identifiers in the classification process: 81 | ```{r} 82 | library(caret) 83 | set.seed(1) 84 | split <- createDataPartition(Boston$dis, p=0.70, list=FALSE) 85 | train <- Boston[split, ] 86 | test <- Boston[-split, ] 87 | 88 | train_labels <- train[,8] 89 | test_labels <- test[,8] 90 | ``` 91 | 92 | #5. Train the model 93 | Time to classify! Fit the model to the training data using the `knn` function from the "class" package. This outputs a vector of the predicted classifications. However, let's first choose a "k" using the square root method: 94 | ```{r, eval=FALSE} 95 | ?knn # (click the option from the "class" library) 96 | ``` 97 | ```{r} 98 | round(sqrt(nrow(train)),2) # 18.84 99 | 100 | library(class) 101 | set.seed(1) 102 | Boston_p <- knn(train=train[,-8], test=test[,-8], cl=train_labels, k=19, prob=TRUE) 103 | ``` 104 | 105 | ###5.1. Evaluate its performance 106 | How did it do? 
Check out its performance on the test set using the `CrossTable` function from the "gmodels" package: 107 | ```{r} 108 | library(gmodels) 109 | CrossTable(x=test_labels, y=Boston_p, 110 | prop.chisq=FALSE, 111 | prop.r=FALSE, 112 | prop.c=FALSE, 113 | prop.t=FALSE) 114 | ``` 115 | How did it do? 116 | 117 | > NOTE: remember that the breaks specified in the cut function above were arbitrary for this toy example. You will probably want to make more informed decisions for your thesis, dissertation, and other professional work! 118 | 119 | ###5.2. Improve model performance 120 | #####1. Normalize the data 121 | We don't want larger values to indiscriminately affect results. Let's standardize the data to a normal range so that their contributions to the decision-making process become roughly equal. We can do this with `scale`: 122 | 123 | Let's name this scaled dataframe "B": 124 | ```{r} 125 | B <- Boston 126 | B[,-8] <- scale(Boston[,-8], center=TRUE, scale=TRUE) 127 | ``` 128 | ```{r, eval=FALSE} 129 | head(B, 10) 130 | ``` 131 | 132 | Re-split the data using this transformed "B" dataframe: 133 | ```{r} 134 | set.seed(1) 135 | split_scale <- createDataPartition(B$dis, p=0.70, list=FALSE) 136 | train_scale <- B[split_scale, ] 137 | test_scale <- B[-split_scale, ] 138 | 139 | train_labels_scale <- train_scale[,8] 140 | test_labels_scale <- test_scale[,8] 141 | ``` 142 | 143 | Fit the model again: 144 | ```{r} 145 | set.seed(1) 146 | B_p <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=19, prob=TRUE) 147 | 148 | CrossTable(x=test_labels_scale, y=B_p, 149 | prop.chisq=FALSE, 150 | prop.r=FALSE, 151 | prop.c=FALSE, 152 | prop.t=FALSE) 153 | ``` 154 | How did it do? 155 | 156 | #####5.3. Change "k" 157 | We can also change "k" to evaluate the performance of several models. Ideally, you would use the ["SuperLearner" R package](https://cran.r-project.org/web/packages/SuperLearner/index.html to examine a handful of kNN algorithms with different k-values simultaneously against other algorithms. Below we will examine a range of cross-validated k-values. For now, let's just try a few extremes: 158 | ```{r} 159 | B_p_k1 <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=1, prob=TRUE) # k=1 160 | B_p_k50 <- knn(train=train_scale[,-8], test=test_scale[,-8], cl=train_labels_scale, k=50, prob=TRUE) # k=50 161 | 162 | CrossTable(x=test_labels_scale, y=B_p_k1, # k=1 163 | prop.chisq=FALSE, 164 | prop.r=FALSE, 165 | prop.c=FALSE, 166 | prop.t=FALSE) 167 | 168 | CrossTable(x=test_labels_scale, y=B_p_k50, # k=50 169 | prop.chisq=FALSE, 170 | prop.r=FALSE, 171 | prop.c=FALSE, 172 | prop.t=FALSE) 173 | 174 | ``` 175 | What happened? 176 | 177 | #6. Another method with a CV error plotting function 178 | Tidy the transformed "B" data a little: 179 | ```{r} 180 | grp <- B$dis 181 | X <- scale(B[-8], center=TRUE, scale=TRUE) 182 | k <- length(unique(grp)) 183 | dat <- data.frame(grp, X) 184 | n <- nrow(X) 185 | n_train <- round(n*2/3) 186 | 187 | set.seed(123) 188 | train_plot <- sample(1:n,n_train) 189 | ``` 190 | ###6.1 Plot the cross-validated errors 191 | ```{r} 192 | library(chemometrics) 193 | #pdf("kNN classification.pdf", 9, 6) 194 | knn_k <- knnEval(X, grp, train_plot, 195 | knnvec=seq(1,50, by=1), 196 | legpo="bottomright", las=2) 197 | title("kNN classification") 198 | #dev.off() 199 | ``` 200 | 201 | #7. 
Regression example 202 | For the regression example, let's return to the B_reg copy of the Boston dataset that we made at the beginning because it preserved the numeric class of the "dis" variable. 203 | 204 | First, scale the data: 205 | ```{r} 206 | B_reg <- as.data.frame(scale(B_reg, center=TRUE, scale=TRUE)) 207 | ``` 208 | 209 | Second, split the data again: 210 | ```{r} 211 | library(caret) 212 | set.seed(1) 213 | split_reg <- createDataPartition(B_reg$dis, p=0.70, list=FALSE) # split 214 | train_reg <- B_reg[split,] 215 | test_reg <- B_reg[-split,] 216 | ``` 217 | 218 | Third, fit the model and plot it! 219 | ```{r} 220 | library(FNN) 221 | set.seed(1) 222 | knn_reg <- knn.reg(train_reg[,-8], test=NULL, y=train_reg[,8], k=3) 223 | plot(train_reg[,8], knn_reg$pred, xlab="y", ylab=expression(hat(y))) 224 | ``` 225 | 226 | Or, use ggplot2 :) 227 | ```{r} 228 | library(ggplot2) 229 | gg_df <- data.frame(train_reg[,8], knn_reg$pred) 230 | colnames(gg_df) <- c("distance", "predictions") 231 | str(gg_df) 232 | 233 | gg <- ggplot(gg_df, aes(distance, predictions, color=distance)) + 234 | geom_point() + 235 | theme_bw() + 236 | xlab("y") + 237 | ylab(expression(hat("y"))) + 238 | ggtitle("kNN regression") + 239 | scale_color_continuous(low="yellow", high="red") 240 | 241 | c <- coef(lm(predictions ~ distance, data=gg_df)) # compute intercept and slope to plot ab line 242 | c 243 | class(c) 244 | 245 | #pdf("kNN regression.pdf", 9, 6) 246 | gg + geom_abline(intercept=c[1], slope=c[2], col="green3") 247 | #dev.off() 248 | ``` 249 | 250 | See the below links for information on plotting decision boundaries: 251 | [Stack Overflow - Variation on “How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?”](http://stackoverflow.com/questions/31234621/variation-on-how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-f) 252 | [Stack Overflow - How to plot decision boundary of a k-nearest neighbor classifier from Elements of Statistical Learning?](http://stats.stackexchange.com/questions/21572/how-to-plot-decision-boundary-of-a-k-nearest-neighbor-classifier-from-elements-o/21602#21602) 253 | 254 | Materials compiled from: 255 | [-Lantz, Brett. 2013. Machine Learning with R. Birmingham, UK: Packt Publishing, Ltd.](https://www.amazon.com/Machine-Learning-Second-Brett-Lantz/dp/1784393908) 256 | [-James G, Witten D, Hastie T, Tibshirani R. 2015. An Introduction to Statistical Learning - with applications in R, 6th ed. Springer: New York](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) 257 | [-knnEval help page](https://artax.karlin.mff.cuni.cz/r-help/library/chemometrics/html/knnEval.html) -------------------------------------------------------------------------------- /Spring2017/Mar17-gam and mars/Mar3-gamearth-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Generalized additive models (GAMs) and Multivariate adaptive regression splines (MARS/EARTH) - rough draft" 3 | author: "Evan Muzzall" 4 | date: "3/17/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ##1. 
Package installation 13 | We will use the following packages for this example: 14 | ```{r} 15 | if (F) { 16 | install.packages(c("akima", "caret", "devtools", "earth", "gam", "ggplot2", "mgcv", "mlbench", "plotmo")) # run lines 16 and 17 manually if needed 17 | devtools::install_github("ck37/ck37r") 18 | } 19 | 20 | library(akima) 21 | library(caret) 22 | library(ck37r) 23 | library(devtools) 24 | library(gam) 25 | library(ggplot2) 26 | library(mgcv) 27 | library(mlbench) 28 | library(plotmo) 29 | library(earth) 30 | ``` 31 | 32 | ##2. Goals 33 | Use the "PimaIndiansDiabetes2" dataset to construct a generalized additive model (GAM) and multivariate additive regression model (MARS, aka EARTH). blood pressure will be the response variable. Missing data will be median-imputed and indicators will be created to document their missingness. 34 | 35 | ##3. Preprocess the data 36 | ```{r} 37 | # load the dataset 38 | data(PimaIndiansDiabetes2) 39 | ?PimaIndiansDiabetes2 40 | data <- PimaIndiansDiabetes2 # give the data a simpler name 41 | str(data) 42 | ``` 43 | 44 | Check for missing data: 45 | ```{r} 46 | # check for missing cases 47 | sum(is.na(data)) 48 | 49 | # how much of the data is missing? 50 | sum(is.na(data)) / (nrow(data)*ncol(data)) # about 9% 51 | ``` 52 | 53 | Recode the "diabetes" vector to numeric type: 54 | ```{r} 55 | data$diabetes <- ifelse(data$diabetes=="pos", 1, 0) 56 | ``` 57 | 58 | Use Chris K's handy median impute function to impute missing values: 59 | ```{r} 60 | # impute and add missingness indicators 61 | result = ck37r::impute_missing_values(data) 62 | 63 | # overwrite "data" with new imputed data frame 64 | data <- result$data 65 | ``` 66 | 67 | Double check that missing values have been imputed: 68 | ```{r} 69 | # no more NA values 70 | sum(is.na(data)) 71 | 72 | # check that missingness indicators have been added 73 | str(data) 74 | ``` 75 | 76 | ##4. Generalized additive models (GAMs) 77 | This semester, MLWG has explored linear, polynomial, and spline regression models using single predictors (March 3) as well as stepwise selection using multiple predictors (Feb 17). Deb also offered an informative take on splines earlier today (Mar 17). Last semester, we talked about improving linear regression models via penalized regression (LASSO and ridge) using multiple predictors (Nov 4). 78 | 79 | When considering multilple predictor variables, another extension of multiple linear regression can be used - generalized additive models. 80 | 81 | Generalized additive models (GAMs) are another extension of multiple linear regression. They are not bound by linear relationships between predictor and response variable and can instead incorporate smoothed, nonlinear relationships. Each relationships is computed and summed (thus making it "additive"). Smoothed splines are not the only constructs used to build GAMs, as they can be built using natural splines, local regression, polynomial regression, etc. 82 | 83 | "Backfitting", or updating the model as each predictor is approximated using penalized likelihood maximization, comprises the smoothed spline. 84 | 85 | See [Wood's book](https://www.crcpress.com/Generalized-Additive-Models-An-Introduction-with-R/Wood/p/book/9781584884743) for thorough walkthroughs of GAMs in R. 86 | 87 | As always, we also encourage [Introduction to Statistical Learning - Chapter 7](http://www-bcf.usc.edu/~gareth/ISL/) for a nice introductory overview and exercises. 
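To make the "additive" structure concrete, the model fit just below (`pressure ~ s(glucose) + s(insulin) + s(age) + diabetes`) corresponds, as a sketch with Gaussian errors, to

$$\text{pressure} = \beta_0 + f_1(\text{glucose}) + f_2(\text{insulin}) + f_3(\text{age}) + \beta_1 \cdot \text{diabetes} + \varepsilon,$$

where each $f_j$ is a smooth function estimated from the data (here via penalized smoothing splines through `s()`) rather than a single linear slope, and the smooth contributions are summed.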
88 | See [Faraway 2002](https://cran.r-project.org/doc/contrib/Faraway-PRA.pdf) for a great intro to regression and ANOVA 89 | 90 | Fit the GAM: 91 | ```{r} 92 | gam1 <- gam(pressure ~ s(glucose) + s(insulin) + s(age) + diabetes, 93 | family="gaussian", 94 | method="GCV.Cp", 95 | data=data) 96 | 97 | gam1 98 | # view summary output 99 | gam.check(gam1) 100 | 101 | names(gam1) 102 | gam1$aic 103 | gam1$sig2 104 | ``` 105 | 106 | Play with some basic plotting features 107 | ```{r} 108 | plot(gam1, se=T, 109 | shade=T, col="black", shade.col="gray80", 110 | residuals=F, 111 | pages=1) 112 | title("gam1") 113 | ``` 114 | 115 | ##5. Compare the GAM to other similar GAMs! 116 | Our plots suggest that "glucose" is fairly linear. What if we compare `gam1` to two other GAMs - one that _excludes_ the predictor glucose, and another that _assumes a linear relationship_ of glucose? 117 | ```{r} 118 | # model that excludes glucose 119 | gam2 <- gam(pressure ~ s(insulin) + s(age) + diabetes, 120 | family="gaussian", 121 | method="GCV.Cp", 122 | data=data) 123 | 124 | plot(gam2, pages=1) 125 | 126 | # model that assumes linear glucose 127 | gam3 <- gam(pressure ~ glucose + s(insulin) + s(age) + diabetes, 128 | family="gaussian", 129 | method="GCV.Cp", 130 | data=data) 131 | 132 | plot(gam3, pages=1) 133 | 134 | anova(gam1, gam2, gam3, test="F") # small p-value suggests that a non-linear function for glucose is preferable? 135 | 136 | AIC(gam1, gam2, gam3) # is this a multiple comparison problem? 137 | BIC(gam1, gam2, gam3) 138 | ``` 139 | 140 | What if we want to identify unhelpful predictors and remove them for better results? 141 | ```{r} 142 | table(data$diabetes, I(data$pregnant>14)) 143 | 144 | gam4 <- gam(pressure ~ s(glucose) + s(insulin) + s(age) + diabetes, 145 | family="gaussian", 146 | data=data, 147 | subset=(diabetes !=0)) 148 | 149 | plot(gam4, se=TRUE, seWithMean=TRUE, 150 | shade=TRUE, col="blue", shade.col="lightgreen", 151 | residuals=FALSE, 152 | pages=1) 153 | title("GAM - adjusted predictors") 154 | 155 | AIC(gam1, gam2, gam3, gam4) 156 | ``` 157 | 158 | ##6. plotmo 159 | The "plotmo" R package offers a great way to visualize regression splines in three dimensions: 160 | ```{r} 161 | plotmo(gam1, all2=TRUE) # show simplfied seWithMean plots AND three dimensional splines for all variable relationships 162 | 163 | # non-additive shapes have correlated effects in 3D plane surfaces. 164 | 165 | # plot partial dependencies (takes a few minutes) 166 | # plotmo(gam1, all2=TRUE, pmethod = "partdep") 167 | 168 | # faster version of pmethod="partdep" 169 | plotmo(gam1, all2=TRUE, pmethod = "apartdep", 170 | caption = "What have I gotten myself in to...") 171 | 172 | # let's play around with a few more parameters! 173 | plotmo(gam1, all2=TRUE, pt.col = "green3") 174 | plotmo(gam1, all2=TRUE, pt.col = "green3", smooth.col = "red") 175 | plotmo(gam1, all2=TRUE, 176 | pt.col = "green3", 177 | smooth.col = "red", 178 | grid.col="gray80") 179 | 180 | # return just some of the plots! 181 | plotmo(gam1, all2=TRUE, degree1 = c(1,2), degree2=0, col="tomato") # show just the first two predictor plots 182 | 183 | plotmo(gam1, all2=TRUE, degree1 = 0, degree2 = 1, # return just glucose v. pregnant perspective plot 184 | caption = "this is called a 'perspective plot'", 185 | persp.col="orange") 186 | ``` 187 | 188 | See [Wood S. 2006. Generalized additive models: An introduction with R](https://www.amazon.com/Generalized-Additive-Models-Introduction-Statistical/dp/1584884746) for expert explanations. 
189 | 190 | ["gam" R package](https://cran.r-project.org/web/packages/gam/index.html) 191 | 192 | ["mgcv" R package](https://cran.r-project.org/web/packages/mgcv/mgcv.pdf) 193 | 194 | Also check out [Stephen Milborrow's excellent instructions on the "plotmo" R package](http://www.milbo.org/doc/plotmo-notes.pdf) 195 | 196 | ##7. Multivariate adaptive regression splines (MARS) and (earth) 197 | Multivariate adaptive regression splines (MARS) are a technique developed by Jerome H. Friedman in 1991 and copyrighted by Salford Systems. Open source implementations are thusly referred to as "earth", but may not be identical to MARS. Also see the ["mda" R package](https://cran.r-project.org/web/packages/mda/index.html) and Friedman papers for specifics. 198 | 199 | earth = Enhanced Adaptive Regression Through Hinges 200 | 201 | These approaches use "surrogate features" (or, models of the models), usually versions of one or two predictors at a time. Each predictor is divided into two groups and each group models the outcome variable for each group. This creates a "piecewise linear model" where each new feature is some proportion of the data. 202 | 203 | Group definitions are provided via linear regression models! Those with the smallest error are used. See [Kuhn and Johnson, 2016:145 ](http://appliedpredictivemodeling.com/) for more information. 204 | 205 | Fit the earth model 206 | ```{r} 207 | # fit the model 208 | set.seed(1) 209 | earth1 <- earth(pressure ~ ., data=data, 210 | degree=1, nk=5, 211 | keepxy=TRUE, nprune=20, nfold=10, ncross=2, 212 | pmethod="cv", trace=4) 213 | 214 | # view summary output 215 | summary(earth1, details=TRUE) 216 | 217 | # view predictor importance 218 | evimp(earth1) 219 | 220 | # compute predicted values 221 | earth_pred <- predict(earth1) 222 | 223 | # print accuracy 224 | (mse <- mean((data$pressure - earth_pred)^2)) 225 | ``` 226 | 227 | Earth plots 228 | ```{r} 229 | # plot 230 | # png("earth1.png") 231 | plot(earth1) 232 | # dev.off() 233 | plot(earth1, info=T, type="response", trace=1) 234 | plotmo(earth1, info=T, type="response", trace=1)#, level=.9) 235 | 236 | # 3d MARS plots! 237 | # same syntactical rules apply here as well 238 | plotmo(earth1) 239 | 240 | plotmo(earth1, all2=TRUE, persp.col="azure") 241 | ``` 242 | 243 | We can also see the ideal number of terms 244 | ```{r} 245 | control <- trainControl(method = "repeatedcv", 246 | repeats = 1, number = 1) 247 | 248 | grid <- expand.grid(.degree = 1, .nprune = 2:25) 249 | 250 | earth_best_terms <- train(pressure ~ ., data = data, method = "earth", 251 | tuneGrid= grid) 252 | 253 | earth_best_terms 254 | plot(earth_best_terms) 255 | ``` 256 | 257 | TODO: 258 | - determine best value for nfold 259 | - explore the ncross argument 260 | - plot cross validation results 261 | - collect $R^2$ in different ways 262 | - use cross-validation to select the number of terms 263 | - better discuss partial dependence plots 264 | - include confidence intervals versus prediction intervals 265 | - investigate assumptions of prediction intervals 266 | - include text about interpretaiton of 3D plotmo regression surfaces 267 | - comprehensively discuss limitations 268 | 269 | [See Stephen Milborrow's excellent notes on earth here](http://www.milbo.org/doc/earth-notes.pdf) for lots of handy tips and tricks. 270 | 271 | [... 
and view his notes on variance models in earth here](http://www.milbo.org/doc/earth-varmod.pdf) 272 | 273 | ["earth" R package](https://cran.r-project.org/web/packages/earth/earth.pdf) 274 | 275 | [Friedman 1991 - MARS](https://projecteuclid.org/download/pdf_1/euclid.aos/1176347963) 276 | 277 | [Friedman 1993- Fast MARS](https://statistics.stanford.edu/sites/default/files/LCS%20110.pdf) 278 | 279 | -------------------------------------------------------------------------------- /Spring2017/Mar3-reg and splines/Mar3-regsplines-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "regression and splines (working version)" 3 | author: "Evan Muzzall" 4 | date: "3/3/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ##1. Package installations 13 | ```{r} 14 | if (FALSE) { 15 | install.packages("Zelig") 16 | devtools::install_github("ck37/ck37r") 17 | } 18 | 19 | library(splines) # call the base "splines" 20 | library(Zelig) # this contains the "macro" dataset we will use 21 | ``` 22 | 23 | ##2. Goals 24 | First, we will walk through linear regression, polynomial regression, polynomial splines, and smoothing splines using an incredibly simple example. 25 | 26 | Then, we will attempt to apply what we learned and do the same thing using the "macro" dataset from the "Zelig" package to see a more real life example. 27 | 28 | ##3. Simple linear regression 29 | Simple linear regression uses a single predictor/input/independent variable (X) to predict one target/outcome/response/dependent variably (Y). Ideally, we want to find the best estimates for B0 (intercept) and B1 (slope) that minimize the error terms when using X to predict Y. 30 | ```{r} 31 | ## generate toy predictors and responses 32 | X <- c(2, 4, 8, 12, 18, 20) 33 | Y <- c(1, 3, 5, 9, 19, 21) 34 | 35 | ## calculate means 36 | mean(X) 37 | mean(Y) 38 | 39 | ## calculate error for each observation 40 | X-mean(X) 41 | Y-mean(Y) 42 | 43 | ## plot the data 44 | plot(x=X, y=Y, main="example") 45 | 46 | ## estimate B1 coefficient (slope) 47 | B1 <- sum((X-mean(X)) * (Y-mean(Y))) / sum((X-mean(X))^2) 48 | B1 49 | 50 | ## now estimate B0 coefficient (intercept) 51 | B0 <- mean(Y) - (B1 * mean(X)) 52 | B0 53 | 54 | ## plot the abline 55 | abline(B0, B1, col="black", lwd=2) 56 | legend("topleft", inset=.0, c("linear"), lty=1, lwd=2, col="black", cex=.75) 57 | 58 | ## generate predicted values by plugging in our X values to the equation: 59 | Y_hat <- B0 + B1 * X 60 | Y_hat 61 | 62 | ## calculate root mean sqaure error (RMSE) for our predictions. First, calculatte the error for each observation by subracting it from the predicted value: 63 | Y_err <- Y_hat - Y 64 | Y_err 65 | 66 | ## then, calculate the square of each of these errors: 67 | Y_err_sq <- Y_err^2 68 | Y_err_sq 69 | 70 | ## sum these values 71 | sum_squared_err <- sum(Y_err_sq) 72 | sum_squared_err 73 | 74 | ## divide by n and take square root to produce the RMSE: 75 | RMSE <- sqrt(sum_squared_err / length(Y)) 76 | RMSE 77 | 78 | ## sanity check 79 | RMSE == sqrt(sum((Y_hat - Y)^2) / length(Y)) 80 | 81 | ## double sanity check 82 | ## fit the model using the lm function in R: 83 | lm_toy <- lm(Y ~ X) 84 | lm_toy 85 | B1 86 | B0 87 | summary(lm_toy) 88 | 89 | ## is our B1 the same as the slope generated by "lm" in R? 90 | round(B1, digits=5) == round(lm_toy$coefficients[2], digits=5) 91 | 92 | ## is our B0 the same as the intercept generated by "lm" in R? 
93 | round(B0, digits=5) == round(lm_toy$coefficients[1], digits=5) 94 | ``` 95 | 96 | ##4. Polynomial regression 97 | However, it is not always advisable to assume linear relationships within data. Although linear models are flexible, they might not best express the relationships between your predictor and response variable. Thus, your resulting p-values might not accurately reflect the null hypothesis that the variables are not associated. 98 | 99 | Polynomial regression raises the original predictor variable to the n^th^ degree. These scalars act as a means to increase the fit of the model by assuming the point distributions are more parabolic shaped than linear. 100 | 101 | We will fit a 3^rd^ degree (cubic) polynomial so that our series of equations looks like this: 102 | Y ~ X 103 | Y ~ X^2 104 | Y ~ X^3 105 | (plus the error term) 106 | 107 | The major drawback of polynomial regression however, is that the function is fit to the global feature space. That is, a single polynomial function is fit in an attempt to represent all data points. However, since this is essentially a linear regression model, coefficients can still be estimated using least squares. 108 | ```{r} 109 | poly1 <- lm(Y ~ X + I(X^2) + I(X^3)) 110 | poly1 111 | summary(poly1) 112 | ## so, what is really happening here? 113 | 114 | ## imagine we take our X variable and create a new column in a data frame that would look like this: 115 | X2 <- X^2 116 | X3 <- X^3 117 | 118 | toy_df <- data.frame(Y, X, X2, X3) 119 | toy_df 120 | 121 | ## the "poly" function produces the same results (when raw=TRUE) 122 | poly2 <- lm(Y ~ poly(X, 3, raw=TRUE)) 123 | poly2 124 | summary(poly2) 125 | lines(X ~ fitted(poly2), lty=2, lwd=2, col="red") 126 | legend("topleft", inset=.0, c("linear", "poly 3"), lty=c(1,2), lwd=2, col=c("black","red"), cex=.75) 127 | 128 | ## sanity check 129 | poly1$coefficients == poly2$coefficients 130 | ``` 131 | 132 | ##5. Polynomial splines 133 | Polynomial splines are [piecewise polynomial functions](https://www.khanacademy.org/math/algebra/algebra-functions/piecewise-functions/v/piecewise-function-example) that form smoothed curved shapes at their junctions (called "knots"). The X predictor is divided into K regions, and a polynomial function is fit to the data within each region. This allows for greater flexibility than linear or polynomial fits. This is a k^th^ order spline where coefficients can be estimated by least squares. Derivatives are fit between each knot, and each imposed constraint releases a degree of freedom, thus smoothing the polynomial fits. As such, polynomial splines are generally more flexible fits than polynomial regression models. 134 | 135 | "poly spline" (basis-spline) is the function that allows for continuous joins at the spline knots. It is the matrix that contains the information of the piecewise polynomial functions used to fit the spline. 136 | ```{r} 137 | ## create xy data frame using our X and Y variables 138 | xy <- data.frame(X, Y) 139 | xy 140 | 141 | bs(xy$X, df=3) 142 | summary(ps1 <- lm(Y ~ bs(X, df=3, knots=8), data = xy)) 143 | 144 | ## example of "safe" prediction (see Chambers JM, Hastie TJ (editors). 1992. Statistical Models in S. Pacific Grove, CA: Wadsworth and Brooks/Cole. 
p 288-289 for "smart" versus "safe" prediction) 145 | summary(xy$X) 146 | X_pred <- seq(min(xy$X), max(xy$X), len = 200) 147 | lines(X_pred, predict(ps1, data.frame(X=X_pred)), lty=3, lwd=2, col="green") 148 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline"), lty=c(1,2,3), lwd=2, col=c("black","red", "green"), cex=.75) 149 | ``` 150 | 151 | Consider using "SuperLearner" to find optimal number of knots! 152 | 153 | ##6. Smoothing splines 154 | Smoothing splines are similar to the poly splines above, except they produce knots at each data point and coefficients of the estimated function are shrunk via regularization, thus helping prevent overfitting. 155 | 156 | The goal is to minimize loss+penalty in addition to a small residual sum of squares - this forms the smoothing spline. However, by treating each X observation as a data point we might be concerned with getting an overfit model. Thus it is useful to talk about degrees of freedom in terms of their _effective degrees of freedom_, or a general representation of the flexibility of the smoothing spline of shrunken degrees of freedom. 157 | Also, intervals are allowed to overlap. 158 | ```{r} 159 | smooth1 <- smooth.spline(y=Y, x=X, cv=FALSE, keep.data=TRUE, spar=NULL, penalty=1) 160 | smooth1 161 | names(smooth1) 162 | smooth1$data 163 | 164 | lines(smooth1, col="blue", lty=4) 165 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline", "smooth1"), lty=c(1,2,3,4), lwd=2, col=c("black","red", "green", "blue"), cex=.75) 166 | ``` 167 | 168 | 169 | ##7. Repeat with the "macro" data from the "Zelig" package 170 | ```{r} 171 | data(macro, package = "Zelig") 172 | macro_lm <- lm(gdp ~ unem, data=macro) 173 | macro_lm 174 | summary(macro_lm) 175 | 176 | ## plot residuals 177 | hist(macro_lm$residuals) 178 | 179 | ## plot it 180 | plot(macro$unem, macro$gdp, col="gray80", 181 | main="'macro' gdp ~ unem", 182 | xlab="unem deficit", 183 | ylab=) 184 | 185 | abline(macro_lm$coefficients[1], macro_lm$coefficients[2], 186 | lwd=2, col="black") 187 | legend("topleft", inset=.0, c("linear"), lty=1, lwd=2, col="black", cex=.75) 188 | 189 | ## generate predicted values by plugging in our X values. 190 | macro_pred <- predict(macro_lm, macro) 191 | 192 | ## or, this is the same as our formula way from the toy example: 193 | macro_pred <- macro_lm$coefficients[1] + macro_lm$coefficients[2] * macro$unem 194 | 195 | ## check MSE on the predicted values 196 | MSE <- mean((macro$gdp - macro_pred)^2) 197 | MSE 198 | ``` 199 | 200 | ##9. Cubic polynomial regression 201 | ```{r} 202 | poly_macro <- lm(gdp ~ poly(unem, 3, raw=TRUE), data=macro) 203 | poly_macro 204 | summary(poly_macro) 205 | 206 | unem_lims <- range(macro$unem) 207 | unem_grid <- seq(from=unem_lims[1], to=unem_lims[2]) 208 | poly_preds <- predict(poly_macro, newdata=list(unem=unem_grid), se=TRUE) 209 | 210 | lines(unem_grid, poly_preds$fit, lty=2, lwd=2, col="red") 211 | legend("topleft", inset=.0, c("linear", "ploy 4"), lty=c(1,2), lwd=2, col=c("black", "red"), cex=.75) 212 | ``` 213 | 214 | ##10. poly spline 215 | ```{r} 216 | ps2 <- lm(gdp ~ bs(unem, df=55), data = macro) 217 | summary(ps2) 218 | 219 | ## example of "safe" prediction 220 | summary(macro$unem) 221 | X_pred2 <- seq(min(macro$unem), max(macro$unem), len = 200) 222 | lines(X_pred2, predict(ps2, data.frame(unem=X_pred2)), lty=3, lwd=2, col="green") 223 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline"), lty=c(1,2,3), lwd=2, col=c("black","red", "green"), cex=.75) 224 | ``` 225 | 226 | ##11. 
Smoothing spline 227 | ```{r} 228 | smooth2 <- smooth.spline(y=macro$gdp, x=macro$unem, cv=FALSE) 229 | smooth2 230 | 231 | lines(smooth2, col="blue", lty=4) 232 | legend("topleft", inset=.0, c("linear", "poly 3", "poly spline", "smooth1"), lty=c(1,2,3,4), lwd=2, col=c("black","red", "green", "blue"), cex=.75) 233 | ``` 234 | 235 | ## Acknowledgements 236 | James G, Witten D, Hastie T, Tibshirani R. 2015. An Introduction to Statistical Learning: With Applications in R (6th printing). New York: Springer. 237 | -------------------------------------------------------------------------------- /Spring2017/Mar3-reg and splines/Mar3-regsplines-R_files/figure-html/unnamed-chunk-2-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/Spring2017/Mar3-reg and splines/Mar3-regsplines-R_files/figure-html/unnamed-chunk-2-1.png -------------------------------------------------------------------------------- /Spring2017/data/sleep_VIM.csv: -------------------------------------------------------------------------------- 1 | "BodyWgt","BrainWgt","NonD","Dream","Sleep","Span","Gest","Pred","Exp","Danger" 2 | 6654,5712,NA,NA,3.3,38.6,645,3,5,3 3 | 1,6.6,6.3,2,8.3,4.5,42,3,1,3 4 | 3.385,44.5,NA,NA,12.5,14,60,1,1,1 5 | 0.92,5.7,NA,NA,16.5,NA,25,5,2,3 6 | 2547,4603,2.1,1.8,3.9,69,624,3,5,4 7 | 10.55,179.5,9.1,0.7,9.8,27,180,4,4,4 8 | 0.023,0.3,15.8,3.9,19.7,19,35,1,1,1 9 | 160,169,5.2,1,6.2,30.4,392,4,5,4 10 | 3.3,25.6,10.9,3.6,14.5,28,63,1,2,1 11 | 52.16,440,8.3,1.4,9.7,50,230,1,1,1 12 | 0.425,6.4,11,1.5,12.5,7,112,5,4,4 13 | 465,423,3.2,0.7,3.9,30,281,5,5,5 14 | 0.55,2.4,7.6,2.7,10.3,NA,NA,2,1,2 15 | 187.1,419,NA,NA,3.1,40,365,5,5,5 16 | 0.075,1.2,6.3,2.1,8.4,3.5,42,1,1,1 17 | 3,25,8.6,0,8.6,50,28,2,2,2 18 | 0.785,3.5,6.6,4.1,10.7,6,42,2,2,2 19 | 0.2,5,9.5,1.2,10.7,10.4,120,2,2,2 20 | 1.41,17.5,4.8,1.3,6.1,34,NA,1,2,1 21 | 60,81,12,6.1,18.1,7,NA,1,1,1 22 | 529,680,NA,0.3,NA,28,400,5,5,5 23 | 27.66,115,3.3,0.5,3.8,20,148,5,5,5 24 | 0.12,1,11,3.4,14.4,3.9,16,3,1,2 25 | 207,406,NA,NA,12,39.3,252,1,4,1 26 | 85,325,4.7,1.5,6.2,41,310,1,3,1 27 | 36.33,119.5,NA,NA,13,16.2,63,1,1,1 28 | 0.101,4,10.4,3.4,13.8,9,28,5,1,3 29 | 1.04,5.5,7.4,0.8,8.2,7.6,68,5,3,4 30 | 521,655,2.1,0.8,2.9,46,336,5,5,5 31 | 100,157,NA,NA,10.8,22.4,100,1,1,1 32 | 35,56,NA,NA,NA,16.3,33,3,5,4 33 | 0.005,0.14,7.7,1.4,9.1,2.6,21.5,5,2,4 34 | 0.01,0.25,17.9,2,19.9,24,50,1,1,1 35 | 62,1320,6.1,1.9,8,100,267,1,1,1 36 | 0.122,3,8.2,2.4,10.6,NA,30,2,1,1 37 | 1.35,8.1,8.4,2.8,11.2,NA,45,3,1,3 38 | 0.023,0.4,11.9,1.3,13.2,3.2,19,4,1,3 39 | 0.048,0.33,10.8,2,12.8,2,30,4,1,3 40 | 1.7,6.3,13.8,5.6,19.4,5,12,2,1,1 41 | 3.5,10.8,14.3,3.1,17.4,6.5,120,2,1,1 42 | 250,490,NA,1,NA,23.6,440,5,5,5 43 | 0.48,15.5,15.2,1.8,17,12,140,2,2,2 44 | 10,115,10,0.9,10.9,20.2,170,4,4,4 45 | 1.62,11.4,11.9,1.8,13.7,13,17,2,1,2 46 | 192,180,6.5,1.9,8.4,27,115,4,4,4 47 | 2.5,12.1,7.5,0.9,8.4,18,31,5,5,5 48 | 4.288,39.2,NA,NA,12.5,13.7,63,2,2,2 49 | 0.28,1.9,10.6,2.6,13.2,4.7,21,3,1,3 50 | 4.235,50.4,7.4,2.4,9.8,9.8,52,1,1,1 51 | 6.8,179,8.4,1.2,9.6,29,164,2,3,2 52 | 0.75,12.3,5.7,0.9,6.6,7,225,2,2,2 53 | 3.6,21,4.9,0.5,5.4,6,225,3,2,3 54 | 14.83,98.2,NA,NA,2.6,17,150,5,5,5 55 | 55.5,175,3.2,0.6,3.8,20,151,5,5,5 56 | 1.4,12.5,NA,NA,11,12.7,90,2,2,2 57 | 0.06,1,8.1,2.2,10.3,3.5,NA,3,1,2 58 | 0.9,2.6,11,2.3,13.3,4.5,60,2,1,2 59 | 2,12.3,4.9,0.5,5.4,7.5,200,3,1,3 60 | 0.104,2.5,13.2,2.6,15.8,2.3,46,3,2,2 61 | 4.19,58,9.7,0.6,10.3,24,210,4,3,4 
62 | 3.5,3.9,12.8,6.6,19.4,3,14,2,1,1 63 | 4.05,17,NA,NA,NA,13,38,3,1,1 64 | -------------------------------------------------------------------------------- /Spring2017/spring 2017 schedule.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf830 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;} 3 | {\colortbl;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;} 5 | \margl1440\margr1440\vieww10800\viewh8400\viewkind0 6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 7 | 8 | \f0\fs24 \cf0 Spring 2017 schedule\ 9 | \ 10 | * February 3 - Introductory meeting and k-nearest neighbor classification and regression \ 11 | [click for KNN example in R](https://www.datacamp.com/community/tutorials/machine-learning-in-r#gs.GpuyCu0) \ 12 | [click for KNN example in Python](http://scikit-learn.org/stable/modules/neighbors.html)\ 13 | \ 14 | * February 17 - Stepwise regression \ 15 | [see Chapter 6 from An Introduction to Statistical Learning for an overview of stepwise regression](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) \ 16 | \ 17 | [click for an overview of regerssion analysis in Python](http://www.turingfinance.com/regression-analysis-using-python-statsmodels-and-quandl/) \ 18 | [click for an overview of generalized linear models in Python](http://scikit-learn.org/stable/modules/linear_model.html) \ 19 | \ 20 | * March 3 - Linear regression, polynomial regression, polynomial splines, smoothing splines \ 21 | [see Chapters 3 and 7 from an Introduction to Statistical Learning for overviews of regression and splines](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20Sixth%20Printing.pdf) \ 22 | \ 23 | [click here for a linear regression example in Python](http://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html) \ 24 | \ 25 | [click here for a spline walkthrough in Python](https://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html) \ 26 | \ 27 | * March 17 - Multivariate adaptive regression splines, generalized additive models \ 28 | \ 29 | * April 14 - Support vector machines \ 30 | \ 31 | [click here for an explanation of why and when to use SVMs](http://blog.yhat.com/posts/why-support-vector-machine.html)\ 32 | \ 33 | * April 28 - Neural networks } -------------------------------------------------------------------------------- /Spring2018/Apr11-BoostingTrees/GBM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gradient Boosting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Basic single model:\n", 15 | "Adapted from [here](http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html)." 
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "from sklearn.datasets import load_boston\n", 25 | "import numpy as np\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "from sklearn import ensemble, preprocessing\n", 28 | "from sklearn.utils import shuffle\n", 29 | "from sklearn.metrics import mean_squared_error\n", 30 | "from sklearn.model_selection import train_test_split\n", 31 | "from sklearn.ensemble import GradientBoostingRegressor" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "np.random.seed(1)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "boston = load_boston()\n", 50 | "\n", 51 | "X, y = shuffle(boston.data, boston.target, random_state=1)\n", 52 | "X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)\n", 53 | "\n", 54 | "scaler = preprocessing.StandardScaler().fit(X_train)\n", 55 | "X_train = scaler.transform(X_train)\n", 56 | "X_test = scaler.transform(X_test)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "scrolled": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,\n", 68 | " 'learning_rate': 0.01, 'loss': 'ls'}\n", 69 | "\n", 70 | "gb_r = ensemble.GradientBoostingRegressor(**params)\n", 71 | "\n", 72 | "gb_r.fit(X_train, y_train)\n", 73 | "mse = mean_squared_error(y_test, gb_r.predict(X_test))\n", 74 | "print(\"MSE: %.4f\" % mse)\n", 75 | "print(gb_r.score(X_train, y_train))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "% matplotlib inline\n", 85 | "\n", 86 | "# compute test set deviance\n", 87 | "test_score = np.zeros((params['n_estimators'],), dtype=np.float64)\n", 88 | "\n", 89 | "for i, y_pred in enumerate(gb_r.staged_predict(X_test)):\n", 90 | " test_score[i] = gb_r.loss_(y_test, y_pred)\n", 91 | "\n", 92 | "plt.figure(figsize=(12, 6))\n", 93 | "plt.subplot(1, 2, 1)\n", 94 | "plt.title('Deviance')\n", 95 | "plt.plot(np.arange(params['n_estimators']) + 1, gb_r.train_score_, 'b-',\n", 96 | " label='Training Set Deviance')\n", 97 | "plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',\n", 98 | " label='Test Set Deviance')\n", 99 | "plt.legend(loc='upper right')\n", 100 | "plt.xlabel('Boosting Iterations')\n", 101 | "plt.ylabel('Deviance')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "feature_importance = gb_r.feature_importances_\n", 111 | "# make importances relative to max importance\n", 112 | "feature_importance = 100.0 * (feature_importance / feature_importance.max())\n", 113 | "sorted_idx = np.argsort(feature_importance)\n", 114 | "pos = np.arange(sorted_idx.shape[0]) + .5\n", 115 | "plt.subplot(1, 2, 2)\n", 116 | "plt.barh(pos, feature_importance[sorted_idx], align='center')\n", 117 | "plt.yticks(pos, boston.feature_names[sorted_idx])\n", 118 | "plt.xlabel('Relative Importance')\n", 119 | "plt.title('Variable Importance')\n", 120 | "plt.show()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Grid search:" 128 | ] 129 | }, 130 | { 131 | "cell_type": 
"code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "from sklearn.model_selection import GridSearchCV\n", 137 | "\n", 138 | "param_grid = {'n_estimators': range(450, 551, 50),\n", 139 | " 'max_depth': range(1, 12, 5),\n", 140 | " 'min_samples_split': [2],\n", 141 | " 'learning_rate': np.arange(0.01, .22, .1),\n", 142 | " 'loss': ['ls']}\n", 143 | "\n", 144 | "gb_r = GridSearchCV(GradientBoostingRegressor(), param_grid)\n", 145 | "gb_r.fit(X_train, y_train)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "sorted(gb_r.cv_results_.keys())" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "gb_r.cv_results_[\"mean_test_score\"]" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "best_index = np.argmax(gb_r.cv_results_[\"mean_test_score\"])" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "gb_r.cv_results_[\"params\"]" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "print(gb_r.cv_results_[\"params\"][best_index])\n", 191 | "print()\n", 192 | "print(max(gb_r.cv_results_[\"mean_test_score\"]))" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "gb_r.score(X_test, \n", 202 | " y_test)" 203 | ] 204 | } 205 | ], 206 | "metadata": { 207 | "anaconda-cloud": {}, 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.6.4" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 1 228 | } 229 | -------------------------------------------------------------------------------- /Spring2018/Apr11-BoostingTrees/boosting-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R boosted trees walkthrough" 3 | author: "Evan Muzzall" 4 | date: "April 11, 2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | 11 | # Objectives 12 | #####1 introduction 13 | *tree based methods - quick review 14 | *install packages, load and split data 15 | #####2 gbm 16 | *train gbm_fit1 (no tuning) 17 | *train gbm_fit2 (tune with trainControl and expand.grid) 18 | *visualize gbm_fit2 models, generate predicted values, compute and plot AUC 19 | *trainControl and expand.grid functions; train the gbm again 20 | *generate predicted values and compute AUC 21 | #####3 xgboost 22 | *xgboost example 23 | #####4 SuperLearner 24 | *SuperLearner example 25 | 26 | # 1 introduction: Review tree based methods 27 | _(summarized from Chapter 8 or Introduction to Statistical Learning, 7th ed. by James et al. 2013)_ 28 | Recall that **decision trees** divide the predictor space (the set of possible predicted values) into simpler regions. 
Through recursive binary splitting, each tree splits based on minimizing RSS for regression trees or classification error for classification trees (% of training observations that do not belong to the most common class), using the mean and mode, respectively. Decision trees have high variance and single decision trees are likely to overfit. When a large, overly complex tree is grown, **pruning** can be used to prune it back to a subtree with the lowest test error. 29 | 30 | To improve predictive performance, **bagging**, or "bootstrap aggregating", will estimate each tree on a new dataset [sampled with replacement](https://en.wikipedia.org/wiki/Simple_random_sample) from the original dataset. Each bootstrap sample will include about two-thirds of observations, some included multiple times. We then average the predictions across each of these trees. The out-of-bag (**OOB**) error rate is often used to estimate test error of the bagged model and is computed on the remaining observations (roughly 1/3). Variable importance gets lost in the shuffle! 31 | 32 | Unlike bagging, **random forests** decorrelate the trees. They build bootstrap training samples but consider only a small random subset of the predictors at each split, then average the predictions of these trees. The out-of-bag prediction for each training observation uses only the trees that did not include that observation in their bootstrap sample. 33 | 34 | **Boosting** takes this even further - fit decision trees to residuals, add each tree's contribution to the fitted function, update residuals, and improve $\hat{f}$. 35 | 36 | From [Freund and Schapire 1999](https://cseweb.ucsd.edu/~yfreund/papers/IntroToBoosting.pdf): 37 | "Boosting is a general method for improving the accuracy of any given learning algorithm" and originated with AdaBoost and PAC learning (p. 1-2). Gradient boosted machines are ensemble decision tree methods built from "weak" trees - trees only slightly more accurate than random guessing - which are then "boosted" into a "strong" learner. That is, the models don't have to be accurate over the entire feature space. 38 | 39 | The model first tries to predict each value in the dataset - the cases that can be predicted easily are _downweighted_ so that the algorithm does not have to try as hard to predict them. 40 | 41 | However, the cases that the model has difficulty predicting are _upweighted_ so that the model tries more assertively to predict them. This continues for multiple "boosting" iterations. A resample-based performance measure is produced at each iteration. Error is measured on the weak learners so that even performing slightly better than random guessing improves accuracy quickly (p. 2). This method can drive down generalization error, thus helping prevent overfitting (p. 5). While it is susceptible to noise, it can be useful for detecting outliers. 42 | 43 | Boosted trees utilize three main hyperparameters: 44 | 1. B: number of trees to grow 45 | 2. $\lambda$: shrinkage (learning rate) 46 | 3. d: tree depth (number of splits) - see the short hand-rolled sketch below the research question for these three settings in action 47 | 48 | # research question 49 | How well can we predict "low" versus "high" median home values using the other variables from the "BostonHousing" dataset?
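(Added illustration, not part of the original walkthrough.) Before fitting the packaged implementations below, here is a minimal "boosting by hand" sketch for squared-error loss on a made-up toy dataset: shallow `rpart` trees are fit to the current residuals and added to the running prediction with a shrinkage factor. The toy data and variable names are invented for illustration only; the three settings correspond to B, $\lambda$, and d above.

```{r}
# Minimal hand-rolled boosting sketch (illustration only; toy data made up here).
library(rpart)

set.seed(1)
n <- 200
x <- runif(n, 0, 10)
y <- sin(x) + rnorm(n, sd = 0.3)
toy <- data.frame(x = x, y = y)

B <- 100        # B: number of trees to grow
lambda <- 0.1   # lambda: shrinkage (learning rate)
d <- 2          # d: maximum tree depth (a stand-in for the number of splits)

f_hat <- rep(0, n)   # running ensemble prediction
res <- y             # current residuals

for (b in 1:B) {
  # fit a small tree to the current residuals
  fit_b <- rpart(res ~ x, data = data.frame(x = x, res = res),
                 control = rpart.control(maxdepth = d, cp = 0, xval = 0))
  # add a shrunken copy of this tree's fit to the ensemble
  f_hat <- f_hat + lambda * predict(fit_b, newdata = toy)
  # update the residuals for the next round
  res <- y - f_hat
}

# training MSE: hand-rolled boosted ensemble vs. a single default rpart tree
mean((y - f_hat)^2)
single_tree <- rpart(y ~ x, data = toy)
mean((y - predict(single_tree, toy))^2)
```

Each boosting round only has to explain what the previous rounds missed, which is why many small, shrunken trees can outperform one large tree.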
50 | 51 | # Install packages 52 | ```{r, eval=FALSE} 53 | install.packages(c("car", "caret", "mlbench", "pROC", "randomForest", 54 | "ranger", "rpart", "SuperLearner", "xgboost")) 55 | 56 | ``` 57 | ```{r, eval=TRUE} 58 | library(car) 59 | library(caret) 60 | library(mlbench) 61 | library(pROC) 62 | library(randomForest) 63 | library(rpart) 64 | library(SuperLearner) 65 | library(xgboost) 66 | ``` 67 | 68 | load data 69 | ```{r} 70 | library(mlbench) 71 | data(BostonHousing) 72 | ?BostonHousing 73 | dat = BostonHousing 74 | str(dat) 75 | 76 | # convert medv to factor: less than or equal to 21.20 = "low", greater than 21.20 = "high" 77 | dat$medv = cut(dat$medv, 78 | breaks = c(0, 21.20, 50), 79 | levels = c(1,2), 80 | labels = c("low", "high")) 81 | ``` 82 | 83 | # split data 84 | ```{r} 85 | library(caret) 86 | set.seed(1) 87 | split <- createDataPartition(dat$medv, p = 0.70, list = FALSE) 88 | training_set <- dat[split,] # for gbm; response variable is included 89 | test_set <- dat[-split,] # for gbm; responses variable is included 90 | 91 | X_train = subset(training_set, select = -medv) # for xgboost; response variable is Y_train 92 | X_test = subset(test_set, select = -medv) # for xgboost; response variable is Y_test 93 | 94 | Y_train = dat$medv[split] # xgboost train response, but need to convert to numeric 95 | Y_test = dat$medv[-split] # xgboost test response, but need to convert to numeric 96 | 97 | Y_train = as.integer(Y_train == "low") # xgboost only allows numeric input; train response 98 | Y_test = as.integer(Y_test == "low") # xgboost only allows numeric input; test response 99 | 100 | X = subset(dat, select = -medv) # for SuperLearner, we can use all the data (minus our response medv) 101 | Y = subset(dat, select = medv) # include only medv response for SuperLearner! 
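# --- Added sanity check (not in the original walkthrough) ---
# Confirm that createDataPartition() kept the "low"/"high" balance of medv
# roughly equal in the training and test sets before we start training.
prop.table(table(training_set$medv))
prop.table(table(test_set$medv))
# Note: the SuperLearner section below passes Y as a numeric vector
# (as.integer(dat$medv == "low")) rather than this one-column data frame.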
102 | 103 | ``` 104 | 105 | # 2 gbm 106 | # train gbm_fit1 107 | ```{r} 108 | set.seed(1) 109 | gbm_fit1 <- train(medv ~ ., 110 | data = training_set, 111 | method="gbm", 112 | verbose = FALSE) 113 | gbm_fit1$times 114 | 115 | gbm_fit1 116 | 117 | summary(gbm_fit1, las=1, main="GBM relative influence") 118 | ``` 119 | 120 | # trainControl and expand.grid 121 | Define hyperparameters of the control mechanism via `trainControl` 122 | ```{r} 123 | control <- trainControl(method="repeatedcv", 124 | repeats=5, 125 | classProbs=TRUE, 126 | summaryFunction=twoClassSummary) 127 | ``` 128 | 129 | Compare multiple models at once with `expand.grid` 130 | ```{r} 131 | grid <- expand.grid(n.trees = seq(500, 2500, by = 500), 132 | interaction.depth = c(1, 3, 5), 133 | shrinkage = c(0.001, 0.01, 0.1), 134 | n.minobsinnode = 10) 135 | nrow(grid) 136 | ``` 137 | 138 | Train the gbm again with the control and grid in place: 139 | ```{r} 140 | set.seed(1) 141 | gbm_fit2 <- train(medv ~ ., data = training_set, 142 | method = "gbm", 143 | metric = "ROC", 144 | trControl = control, 145 | tuneGrid = grid, 146 | verbose = FALSE) 147 | gbm_fit2$times 148 | 149 | gbm_fit2 150 | 151 | summary(gbm_fit2, las = 2) 152 | ``` 153 | 154 | # ggplot line graph of the tuned models 155 | ```{r} 156 | library(ggplot2) 157 | ggplot(gbm_fit2) + theme_bw() + ggtitle("Model comparisons") + ylab("AUC") + theme(legend.position = "top") 158 | ``` 159 | 160 | # generate predicted values and probabilities 161 | ```{r} 162 | set.seed(1) 163 | gbm_predicted <- predict(gbm_fit2, test_set) 164 | gbm_prob <- predict(gbm_fit2, test_set, type="prob") 165 | ``` 166 | 167 | view final model 168 | ```{r} 169 | gbm_cm <- confusionMatrix(gbm_predicted, test_set$medv) 170 | gbm_cm 171 | ``` 172 | 173 | A confusion/error matrix is a cross-tabulation of observed versus predicted classes 174 | 175 | # plot AUC 176 | ```{r} 177 | library(pROC) 178 | rocCurve <- roc(response=test_set$medv, 179 | predictor = gbm_prob[, "low"], 180 | levels = rev(levels(test_set$medv)), 181 | auc=TRUE, ci=TRUE) 182 | ``` 183 | 184 | ```{r} 185 | plot(rocCurve, main="GBM", col="blue", col.main="blue", col.lab="blue") 186 | rocCurve$auc 187 | ``` 188 | 189 | # 3 xgboost 190 | # xgboost example 191 | ```{r} 192 | library(xgboost) 193 | bstSparse <- xgboost(data = data.matrix(X_train), label = Y_train, max.depth = 2, eta = 1, nthread = 2, nround = 20, objective = "binary:logistic") 194 | 195 | prediction_values <- predict(bstSparse, xgb.DMatrix(data.matrix(X_test))) 196 | options(scipen=999) 197 | prediction_values 198 | 199 | prediction_class = as.numeric(prediction_values > 0.5) 200 | prediction_class 201 | 202 | err <- mean(as.numeric(prediction_values >= 0.5) != Y_test) 203 | print(paste("test-error =", err)) 204 | ``` 205 | 206 | # 4 SuperLearner 207 | # Superlearner example 208 | 209 | ```{r superlearner} 210 | library(SuperLearner) 211 | cv_sl = CV.SuperLearner(X = dat[, -14], 212 | Y = as.integer(dat[, 14] == "low"), 213 | family = binomial(), 214 | SL.library = c("SL.xgboost","SL.rpart","SL.ranger","SL.mean"), 215 | V = 5) 216 | 217 | cv_sl 218 | 219 | summary(cv_sl) 220 | 221 | table(simplify2array(cv_sl$whichDiscreteSL)) # view best 222 | 223 | plot(cv_sl) + theme_bw() 224 | ``` 225 | 226 | # Help 227 | 228 | Examples were drawn from these helpful pages - check them out below! 
229 | *[caret help page](https://topepo.github.io/caret/) 230 | 231 | *[XGBoost R Tutorial](http://xgboost.readthedocs.io/en/latest/R-package/xgboostPresentation.html) 232 | 233 | *[SuperLearner example](https://github.com/ecpolley/SuperLearner) 234 | -------------------------------------------------------------------------------- /Spring2018/Apr25 - Elastic Net/elastic-net.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Penalized regression in R" 3 | date: "April 25, 2018" 4 | output: 5 | html_document: 6 | toc: yes 7 | toc_float: yes 8 | --- 9 | 10 | ```{r install, eval = F} 11 | # Run if needed. 12 | install.packages(c("caret", "glmnet", "ranger", "SuperLearner")) 13 | ``` 14 | 15 | # Background 16 | 17 | Penalized regression is a modification of ordinary least squares (OLS) or 18 | generalized linear models (notably logistic regression) to shrink the 19 | estimated coefficients closer to zero. This is because the default estimated 20 | coefficients from linear regression inherently contain some overfitting - they 21 | incorporate some random noise in the data that will not be the same for 22 | new, unseen data. 23 | 24 | Penalization addresses this inherent overfitting by changing the objective 25 | function used to choose the optimal estimated beta coefficients. It says: "I 26 | want to choose beta coefficients that minimize my loss function (often 27 | mean-squared error) but I also don't want the total magnitude of the coefficients (their L1 or L2 norm) to be too large." 28 | 29 | There are two types of penalization: L1 (sum of absolute values) or L2 (sum of squared values). L2 penalization was the first type of penalized regression, and is called **ridge regression**. It was first published by Hoerl & Kennard in 1970 and allows regression to be used when there are more covariates than observations (p > n). L1 penalization is called **LASSO** ([least absolute shrinkage and selection operator](https://en.wikipedia.org/wiki/Lasso_(statistics))) and was first published by Tibshirani in 1996. Lasso has the special property of **sparsity** - it assumes that only a subset of variables are related to the outcome and tends to zero out the coefficients on the least related covariates. 30 | 31 | Ridge and lasso can be combined into a single regression called **elastic net**, which takes a weighted average of the L1 and L2 penalties. Elastic net was first published in 2005 by Zou and Hastie; the weighting between L1 and L2 penalties is controlled by the $\alpha$ hyperparameter which ranges between 0 (ridge) and 1 (lasso). 32 | 33 | ## Data prep 34 | 35 | ```{r} 36 | library(MASS) 37 | data(Boston) 38 | help(Boston) 39 | str(Boston) 40 | summary(Boston) 41 | 42 | # Our outcome is median home value. 43 | outcome = "medv" 44 | 45 | # Divide into 80% training, 20% test split. 46 | # NOTE: this is a shortcut; we prefer to use cross-validation for real projects. 47 | library(caret) 48 | set.seed(1) 49 | train_index = caret::createDataPartition(Boston[, outcome], p = .8, 50 | list = F, 51 | times = 1) 52 | 53 | # Glmnet wants the data to be matrices, not data frames.
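# (Added reminder, not in the original walkthrough.)
# For family = "gaussian", glmnet minimizes
#   (1/(2n)) * sum((y - X %*% beta)^2) +
#     lambda * ((1 - alpha)/2 * sum(beta^2) + alpha * sum(abs(beta)))
# so alpha = 1 gives the lasso, alpha = 0 gives ridge, and values in between
# give the elastic net. All Boston covariates are numeric; with factor
# covariates we would build the design matrix with model.matrix() first.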
54 | x_train = as.matrix(Boston[train_index, !names(Boston) == outcome]) 55 | x_test = as.matrix(Boston[-train_index, !names(Boston) == outcome]) 56 | 57 | y_train = Boston[train_index, outcome] 58 | y_test = Boston[-train_index, outcome] 59 | 60 | dim(x_train) 61 | length(y_train) 62 | 63 | dim(x_test) 64 | length(y_test) 65 | ``` 66 | 67 | 68 | ## Lasso 69 | 70 | Lasso penalizes coefficients and imposes sparsity, so some coefficients may be shrunk to 0 if they do not appear to be related to the outcome. 71 | 72 | ```{r} 73 | library(glmnet) 74 | # Fit the lasso to continuous Y 75 | reg = cv.glmnet(x_train, y_train, family = "gaussian", alpha = 1) 76 | 77 | # Look at distribution of penalty term lambda. 78 | plot(reg) 79 | 80 | # Plot the underlying glmnet object, showing 81 | # coefficients for differnt lambda values. 82 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 83 | 84 | # Lambda with minimum mean-squared error. 85 | reg$lambda.min 86 | 87 | # Higher lambda within 1SE of performance of the minimum. 88 | # (the "one standard error" rule from Leo Breiman.) 89 | reg$lambda.1se 90 | 91 | # Review coeffients 92 | coef(reg, s = "lambda.1se") 93 | 94 | # What about for lambda.min? 95 | coef(reg, s = "lambda.min") 96 | 97 | # Predict on test set. 98 | pred = predict(reg, s = reg$lambda.1se, newx = x_test) 99 | 100 | # Calculate mean-squared error. 101 | mean((pred - y_test)^2) 102 | ``` 103 | 104 | ## Ridge 105 | 106 | Ridge penalizes the coefficients but does not impose sparsity, so no coefficient will ever be 0. 107 | 108 | ```{r} 109 | 110 | # Fit the ridge to continuous Y 111 | # We just change alpha to 0 to get ridge regression. 112 | reg = cv.glmnet(x_train, y_train, family = "gaussian", alpha = 0) 113 | 114 | # Look at distribution of penalty term lambda. 115 | plot(reg) 116 | 117 | # Plot the underlying glmnet object, showing 118 | # coefficients for differnt lambda values. 119 | plot(reg$glmnet.fit, xvar = "lambda", label = T) 120 | 121 | # Predict on test set. 122 | pred = predict(reg, s = reg$lambda.1se, newx = x_test) 123 | 124 | # Calculate mean-squared error. 125 | mean((pred - y_test)^2) 126 | ``` 127 | 128 | As expected, we do a little worse with ridge compared to lasso. 129 | 130 | ## Elastic net 131 | 132 | ```{r} 133 | set.seed(1) 134 | train_control = trainControl(method = "repeatedcv", 135 | number = 10L, 136 | repeats = 3L) 137 | 138 | 139 | # Create a custom tuning grid. 140 | enet_grid = expand.grid(alpha = seq(0, 1, length.out = 5), 141 | lambda = 2^seq(-1, -7, length = 5)) 142 | 143 | # Review the grid. 144 | enet_grid 145 | 146 | # To be simpler we could just say e.g. tuneLength = 5. 147 | 148 | enet = train(x_train, y_train, method = "glmnet", 149 | #tuneLength = 5, 150 | tuneGrid = enet_grid, 151 | trControl = train_control) 152 | 153 | print(enet) 154 | 155 | plot(enet) 156 | 157 | enet$bestTune 158 | 159 | # Predict on test. 160 | pred = predict(enet, x_test) 161 | 162 | # Review performance 163 | mean((pred - y_test)^2) 164 | ``` 165 | 166 | ## SuperLearner 167 | 168 | ```{r} 169 | library(SuperLearner) 170 | 171 | enet = create.Learner("SL.glmnet", 172 | tune = list(alpha = c(0, 0.1, 0.5, 0.9, 1.0)), 173 | detailed_names = TRUE) 174 | 175 | sl_lib = c("SL.mean", "SL.lm", "SL.stepAIC", enet$names, "SL.ranger") 176 | 177 | set.seed(1, "L'Ecuyer-CMRG") 178 | 179 | # This will take a few minutes to execute - take a look at the .html file to see the output! 
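# (Added comments, not in the original walkthrough.)
# CV.SuperLearner() wraps the whole ensemble in an outer layer of V = 10-fold
# cross-validation, so the risks it reports are estimates for the ensemble
# itself, not just for the individual learners.
# Optional: if runtime is an issue, CV.SuperLearner() also has a parallel
# argument (e.g. parallel = "multicore" on Mac/Linux).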
cv_sl = CV.SuperLearner(Y = y_train, X = data.frame(x_train), verbose = TRUE, 181 | SL.library = sl_lib, family = gaussian(), 182 | cvControl = list(V = 10L)) 183 | 184 | summary(cv_sl) 185 | 186 | plot(cv_sl) + theme_bw() 187 | 188 | # devtools::install_github("ck37/ck37r") 189 | # library(ck37r) 190 | ``` 191 | 192 | # References 193 | 194 | Intro to Statistical Learning, Chapter 6 195 | 196 | [Glmnet vignette by Hastie and Qian](https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html) - lots of great code examples -------------------------------------------------------------------------------- /Spring2018/Feb28-randomForest/Random Forest R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "random forest" 3 | author: "Evan Muzzall" 4 | date: "February 28, 2018" 5 | output: 6 | html_document: 7 | toc: yes 8 | toc_float: yes 9 | --- 10 | ```{r} 11 | # clear environment 12 | rm(list=ls()) 13 | ``` 14 | 15 | ```{r setup, include=FALSE} 16 | knitr::opts_chunk$set(echo = TRUE) 17 | ``` 18 | 19 | # Quick review: decision trees and "bagging" (bootstrap aggregating) 20 | 21 | # What are random forests? 22 | Random forests are ensemble methods that combine multiple decision tree models for classification and regression. 23 | 24 | Unlike single decision trees/bagged trees, the default results generally do not require pruning and include accuracy and variable importance information. Furthermore, at each random forest tree split, only a small portion of the predictors are used (rather than the full suite). 25 | 26 | We will fit four different random forest models: 27 | 1. `rf1`: `randomForest` package model 28 | 29 | 2. `rf2`: fit this same model within the `SuperLearner` package via `SL.randomForest` 30 | 31 | 3. `rf3`: `SuperLearner` package model compared to `SL.rpart`, `SL.xgboost`, and `SL.mean` models 32 | 33 | 4. `rf4`: `SuperLearner` package model with external cross-validation for multi-model comparison and visualization of model differences compared to `SL.rpart`, `SL.xgboost`, and `SL.mean` 34 | 35 | # Install packages 36 | Install and `library()` necessary packages. 37 | ```{r, eval=FALSE} 38 | # install.packages(c("car", "caret", "ggplot2", "lattice", "plotmo", "randomForest", "rpart", "ROCR", "SuperLearner", "survival", "xgboost"), dependencies = F) 39 | library(car) 40 | library(caret) 41 | library(ggplot2) 42 | library(lattice) 43 | library(plotmo) 44 | library(randomForest) 45 | library(rpart) 46 | library(ROCR) 47 | library(SuperLearner) 48 | library(survival) 49 | library(xgboost) 50 | ``` 51 | 52 | # Data setup - `Mroz` 53 | Load and explore Mroz dataset. 54 | ```{r} 55 | library(car) 56 | data(Mroz) 57 | ?Mroz 58 | str(Mroz) 59 | head(Mroz) 60 | ``` 61 | 62 | ### `lfp` 63 | Let's examine frequencies of the `lfp` variable (labor force participation), since it is the one we want to predict. 64 | ```{r, eval=FALSE} 65 | Mroz$lfp 66 | ``` 67 | ```{r} 68 | library(lattice) 69 | table(Mroz$lfp) 70 | barchart(table(Mroz$lfp), col="purple", horizontal = F) 71 | ``` 72 | 73 | ### Stratified random split 74 | Now, we will use the `createDataPartition` command from the `caret` package to perform a 70/30 stratified random split of the Mroz data into training and test sets.
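(Quick aside, added for context and not part of the original walkthrough.) The overall class balance of `lfp` is the "no information" baseline that any classifier should beat, so it is worth noting before splitting:

```{r}
# Baseline accuracy of always guessing the majority class of lfp.
max(prop.table(table(Mroz$lfp)))
```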
75 | ```{r} 76 | library(caret) 77 | set.seed(1) 78 | split <- createDataPartition(Mroz$lfp, p=0.70, list=FALSE) 79 | training_set <- Mroz[split,] 80 | test_set <- Mroz[-split,] 81 | 82 | dim(Mroz) 83 | dim(training_set) 84 | dim(test_set) 85 | nrow(training_set) + nrow(test_set) == nrow(Mroz) # double check 86 | ``` 87 | 88 | ##### 1.1 `rf1` - fit the model and evaluate `training_set` accuracy 89 | Using the `randomForest` package, let`s fit a random forest model to predict the number of women who participated or did not participate in the labor force in 1975. 90 | ```{r} 91 | library(randomForest) 92 | ?randomForest 93 | set.seed(1) 94 | rf1 <- randomForest(lfp ~ ., 95 | data=training_set, 96 | ntree=500, 97 | mtry=round(sqrt(ncol(Mroz)), digits = 0), 98 | importance=TRUE) 99 | #NOTE: notice that our response vector `lfp` is a factor - this will assume classification models, otherwise regression will be assumed. If it is omitted entirely, randomForest becomes unsupervised! 100 | rf1 101 | 102 | # check accuracy on training set 103 | (170+235) / nrow(training_set) # training_set = 77% accuracy 104 | 105 | rf1$importance 106 | barchart(rf1$importance, main="rf variable importance - barchart", col="blue", border="black") 107 | ``` 108 | 109 | ##### 1.2 `randomForest` `test_set` accuracy 110 | Now, let`s see how our model performs on the test data. 111 | ```{r} 112 | set.seed(1) 113 | pred <- predict(rf1, newdata=test_set) 114 | table(pred, test_set$lfp) 115 | ``` 116 | 117 | Of the 225 test_set observations, We have 68 true negatives (correct `no` predictions) and 99 true positives (correct `yes` predictions). 118 | 119 | Now, we can quickly check the accuracy of the model using the holdout dataset. 120 | 121 | ```{r} 122 | (68 + 99) / nrow(test_set) #test_set = 74% accuracy 123 | ``` 124 | 125 | ##### 1.3 `plotmo` on `rf1` 126 | Plot `rf1`! 127 | ```{r} 128 | library(plotmo) 129 | ?plotmo 130 | plotmo(rf1, all1 = T) # all1 = T will plot all predictors 131 | plotmo(rf1, all2 = T) # all2 = T will plot all pairs of predictors 132 | plotmo(rf1, all2 = T, pt.col = "green", smooth.col = "purple", grid.col = "gray80") 133 | 134 | set.seed(1) 135 | plotmo(rf1, all1 = T, pmethod = "apartdep") 136 | 137 | set.seed(1) 138 | plotmo(rf1, all1 = T, pmethod = "apartdep", degree1 = 0, degree2 = 3, 139 | caption = "title goes here", 140 | persp.col="orange") 141 | 142 | # image plots 143 | plotmo(rf1, degree1 = F, type="prob", nresponse="yes", # right graph 144 | type2="image", pt.col=ifelse(Mroz$lfp == "yes", "purple", "green3")) 145 | ``` 146 | 147 | # Compare multiple models using the `SuperLearner` R package 148 | `SuperLearner` is an R package that allows you to easily compare multiple machine learning algorithms at once and/or the same algorithm with different settings. 149 | 150 | It then creates an optimal weighted average of those models, aka an "ensemble", using the test data performance. This approach has been proven to be asymptotically as accurate as the best possible prediction algorithm that is tested. 151 | 152 | ### Coerce `lfp` to integer type 153 | For binary classification, SuperLearner prefers that your categorical outcome is numeric/integer, rather than factor data type. 154 | 155 | Let's coerce `lfp` from factor to integer type, but first make a copy of `training_set` and `test_set`. 
156 | ```{r} 157 | training_set2 = training_set 158 | test_set2 = test_set 159 | 160 | class(training_set2$lfp) 161 | class(test_set2$lfp) 162 | 163 | ?ifelse 164 | training_set2$lfp <- ifelse(training_set2$lfp=="yes", 1L, 0L) 165 | test_set2$lfp <- ifelse(test_set2$lfp=="yes", 1L, 0L) 166 | 167 | class(training_set2$lfp) 168 | class(test_set2$lfp) 169 | ``` 170 | ```{r, eval=FALSE} 171 | training_set2$lfp 172 | test_set2$lfp 173 | ``` 174 | 175 | ### Assign Y variables 176 | Now, we should assign binary outcome variables for the training and test sets for the `SuperLearner` computations. 177 | ```{r} 178 | Y <- training_set2$lfp 179 | Y_test <- test_set2$lfp 180 | table(Y) 181 | table(Y_test) 182 | ``` 183 | 184 | However, because we specify our outcome and predictor variables in SuperLearner, we must remove the outcome variable from our training and test sets because we do not want to include them as a predictor: 185 | 186 | ```{r} 187 | training_set2 <- training_set2[,c(2:8)] 188 | test_set2 <- test_set2[,c(2:8)] 189 | dim(training_set2) 190 | dim(test_set2) 191 | ``` 192 | 193 | ##### 2.1 `rf2` fit the second random forest model inside SuperLearner 194 | ```{r} 195 | library(SuperLearner) 196 | listWrappers() # we want "SL.randomForest" 197 | 198 | rf2 <- SuperLearner(Y = Y, X = training_set2, family = binomial(), SL.library = "SL.randomForest") 199 | 200 | rf2 201 | ``` 202 | In the output, Risk is an estimate of model accuracy/performance as estimated by cross-validation of risk on future data. By default it uses 10 folds. 203 | 204 | Coef is how much weight SuperLearner puts on that model in the ensemble weighted-average. If Coef = 0 it means that model is not used at all. 205 | 206 | # Compare multiple models simultaneously 207 | Now, let's compare our random forest model to two other tree-based models: `SL.rpart` and `SL.xgboost`. 208 | 209 | We also include the mean of Y (`SL.mean`) as a benchmark algorithm - if it is the discrete winner, then we can assume that our model fits the data poorly. 210 | 211 | Based on model performance (risk), SuperLearner will also tell us which model is the best (Discrete winner) and also create a weighted average of the multiple models (SuperLearnerer). 212 | 213 | ##### 3.1 `rf3` fit the SuperLearner randomForest model in an ensemble 214 | ```{r} 215 | rf3 <- SuperLearner(Y = Y, X = training_set2, family = binomial(), SL.library = c("SL.mean", "SL.rpart", "SL.randomForest", "SL.xgboost")) 216 | 217 | rf3 218 | ``` 219 | 220 | ##### 3.2 Assess model performance on `test_set2` 221 | Then, we want to assess the model performance on test_set and illustrate with a simple barplot. 222 | ```{r} 223 | pred2 <- predict(rf3, test_set2, onlySL = T) 224 | 225 | summary(pred2$library.predict) 226 | 227 | ggplot(as.data.frame(pred2), aes(x = pred)) + 228 | geom_histogram(fill = "blue", color = "black") + 229 | xlab("Predicted values") + 230 | theme_minimal() 231 | ``` 232 | 233 | ##### 3.3 AUC on `test_set2` 234 | We can then check the area under the receiver operator characteristic (ROC) curve to see an alternative performance metric of `rf3` on `test_set2`: 235 | ```{r} 236 | library(ROCR) 237 | pred_rocr <- prediction(pred2$pred, Y_test) 238 | auc <- performance(pred_rocr, measure = "auc", x.measure = "cutoff")@y.values[[1]] 239 | auc # AUC = 0.82 240 | ``` 241 | 242 | ##### 4.1 `rf4` fit the SuperLearner randomForest model in an ensemble with external cross-validation 243 | Default cross-validation is set to 10-fold in SuperLearner. 
However, we can use (external) cross-validation via the `CV.SuperLearner` function. We can also use all the data since we are using this external layer of cross-validation. 244 | 245 | ```{r} 246 | SL_Y = ifelse(Mroz$lfp == "yes", 1, 0) 247 | SL_X = Mroz[,-1] 248 | 249 | set.seed(1) 250 | 251 | rf4 <- CV.SuperLearner(Y = SL_Y, X = SL_X, family = binomial(), V = 10, SL.library = c("SL.mean", "SL.rpart", "SL.randomForest", "SL.xgboost")) 252 | 253 | rf4 254 | names(rf4) 255 | 256 | summary(rf4) 257 | 258 | table(simplify2array(rf4$whichDiscreteSL)) 259 | plot(rf4) + theme_linedraw() 260 | ``` 261 | 262 | See these guides for more: 263 | [SuperLearner Guide](https://github.com/ck37/superlearner-guide) 264 | 265 | To learn more about plotting decision boundaries in R, check out the mlr package examples [Quick start](http://mlr-org.github.io/mlr-tutorial/release/html/) and [Visualizations of predictions](https://mlr-org.github.io/Visualisation-of-predictions/) 266 | 267 | [James G, Witten D, Hastie T, Tibshirani R. 2013. An Introduction to Statistical Learning - with Applications in R. New York: Springer](http://www-bcf.usc.edu/~gareth/ISL/ISLR%20First%20Printing.pdf) 268 | 269 | [Package "SuperLearner"](https://cran.r-project.org/web/packages/SuperLearner/SuperLearner.pdf) -------------------------------------------------------------------------------- /Spring2018/decision-trees-feb14/decision-trees-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Decision trees for machine learning" 3 | #output: html_notebook 4 | output: html_document 5 | #editor_options: 6 | #chunk_output_type: inline 7 | --- 8 | 9 | Topics 10 | 11 | * rpart 12 | * Caret 13 | * SuperLearner 14 | * h2o.ai 15 | * mlr 16 | * book 17 | 18 | This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Use the latest RStudio preview release to run within RStudio. 19 | 20 | Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 21 | 22 | ```{r} 23 | # Load iris dataset. 24 | data(iris) 25 | 26 | # Review data structure. 27 | str(iris) 28 | 29 | # Review species distribution. 30 | table(iris$Species, useNA = "ifany") 31 | 32 | # Review all variables. 33 | summary(iris) 34 | ``` 35 | 36 | ```{r} 37 | # install rpart first if you don't already have it. 38 | # install.packages("rpart") 39 | # rpart = recursive partitioning and regression trees (aka decision trees) 40 | library(rpart) 41 | 42 | # Review package help and vignette if available. 43 | # HINT: vignette covers all of this in much better detail. 44 | help(package = "rpart") 45 | 46 | # To be reproducible we need to set a seed due to randomness in the cross-validation. 47 | set.seed(1) 48 | 49 | # Fit a classification decision tree to predict Species using all other variables. 50 | # We don't need to specify method="class" because Species is a factor variable. 51 | # For regression we'd do method = "anova" (default if outcome variable is not a factor) 52 | tree_model = rpart(Species ~ ., data = iris) 53 | 54 | # Display the decision tree in text form. 55 | tree_model 56 | 57 | # Plot tree graphically. 58 | plot(tree_model, compress = T) 59 | # We have to add the plot text manually for some reason. 60 | # NOTE: you may need to select the plot() and text() lines and run them simultaneously 61 | # depending on your RStudio settings, e.g. 
if you get a "plot.new has not been called yet" error. 62 | text(tree_model, use.n = T) 63 | ``` 64 | 65 | Wow, this is one of the worst plots I've ever seen! Hard to get much worse than that. 66 | 67 | The help pages will give more details on the function arguments as well as handy examples. 68 | 69 | ```{r} 70 | # Review main decision tree function. 71 | ?rpart 72 | 73 | # Review the configuration options for trees. 74 | ?rpart.control 75 | 76 | # Same thing as above but with explicitly setting key options. 77 | # We specify 10 cross-validation folds to determine the best complexity. 78 | # Minbucket is the minimum number of observations in a node. 79 | # Tip: I put parentheses around the whole line so that the result is printed. 80 | (tree_model = rpart(Species ~ ., data = iris, 81 | control = rpart.control(xval = 10, minbucket = 5, cp = 0.01))) 82 | 83 | ``` 84 | 85 | 86 | Let's get a better decision tree plotting package. 87 | 88 | ```{r} 89 | # Install from CRAN if you don't already have this: 90 | # install.packages("rpart.plot") 91 | library(rpart.plot) 92 | 93 | rpart.plot(tree_model) 94 | 95 | # What other settings can we modify? 96 | ?rpart.plot 97 | 98 | # Review the vignette if interested. 99 | help(package = "rpart.plot") 100 | 101 | # Another way to plot it. 102 | library(partykit) 103 | plot(as.party(tree_model)) 104 | 105 | # fancyRpartPlot() in the rattle package is also good. 106 | 107 | ``` 108 | 109 | We can dig into the details of the tree a bit more. 110 | 111 | ```{r} 112 | # Review accuracy for different complexity parameters. 113 | # When nsplits = 0 we have 0 nodes and are merely guessing the most common class. 114 | # When nsplits is large we have 1 + # splits nodes and each node is its own prediction. 115 | printcp(tree_model) 116 | 117 | # Save the complexity parameter table, and also print. 118 | cp_table = printcp(tree_model) 119 | 120 | # Review structure of the cp table. 121 | str(cp_table) 122 | 123 | # Which row has minimum cross-validation error? 124 | # Alternatively we could choose the tree within 1 SD of the minimum. 125 | best_row = cp_table[which.min(cp_table[, "xerror"]), ] 126 | best_row 127 | best_row["CP"] 128 | 129 | # Get all the details on the tree. 130 | summary(tree_model, cp = best_row["CP"]) 131 | 132 | # Prune to the optimal complexity parameter (no change in this case). 133 | tree_model = prune(tree_model, cp = best_row["CP"]) 134 | 135 | tree_model 136 | ``` 137 | 138 | We did not create a separate holdout or test set, so let's predict back on the original data. 139 | 140 | ```{r} 141 | predictions = predict(tree_model, iris) 142 | summary(predictions) 143 | 144 | # How do the predictions look compared to the outcome data? 145 | data.frame(iris$Species, predictions) 146 | 147 | # This is an optimistic view because the model was built on this same data. 148 | # With a random holdout set we would get a more realistic view of accuracy. 149 | 150 | ``` 151 | 152 | ## Regression 153 | 154 | Quick regression example. 155 | ```{r} 156 | # This data is in the rpart package. 157 | data(car90) 158 | 159 | # Review structure of dataset. 160 | str(car90) 161 | 162 | # Set seed due to cross-validation randomness. 163 | set.seed(1) 164 | 165 | # Predict price using most other fields. 166 | # Remove a few fields that are too predictive (rim) or too many categories. 167 | reg_tree = rpart(Price ~ ., data = car90[, !names(car90) %in% c("Rim", "Tires", "Model2")]) 168 | 169 | # How'd it go? 
170 | reg_tree 171 | 172 | # Review complexity parameter options. 173 | printcp(reg_tree) 174 | 175 | # Visualize results across complexity parameter. 176 | rsq.rpart(reg_tree) 177 | 178 | # Save the complexit parameter table. 179 | cp_table = printcp(reg_tree) 180 | 181 | # Which row has minimum cross-validation error? 182 | (best_row = cp_table[which.min(cp_table[, "xerror"]), ]) 183 | best_row["CP"] 184 | 185 | # Review summary with the best complexity parameter. 186 | summary(reg_tree, cp = best_row["CP"]) 187 | 188 | # Prune our tree back to the best complexity parameter. 189 | # Note that in this case no real pruning is needed, because 190 | # the full tree is the best. 191 | reg_tree = prune(reg_tree, cp = best_row["CP"]) 192 | 193 | # Visualize our final tree. 194 | rpart.plot(reg_tree) 195 | 196 | ``` 197 | 198 | # Caret 199 | 200 | ```{r} 201 | library(caret) 202 | 203 | # Nice and simple - using default settings for everything. 204 | # caret tries 3 complexity parameters by default, but tuneLength customizes that. 205 | model = train(Species ~ ., data = iris, method = "rpart", tuneLength = 5) 206 | 207 | # We see again that cp= 0 gives us the best accuracy. 208 | model 209 | 210 | # Use the handy built-in caret plotting. 211 | plot(model) 212 | 213 | # Look at the final model object (rpart). 214 | model$finalModel 215 | ``` 216 | 217 | # SuperLearner 218 | 219 | SuperLearner unfortunately cannot do multiple-class classification (yet) so let's convert to a binary classification problem. 220 | 221 | ```{r} 222 | 223 | # Review 224 | table(iris$Species) 225 | 226 | # Copy into a new dataframe. 227 | data = iris 228 | 229 | # Convert Species to a binary indicator for setosa. 230 | data$Species = as.integer(data$Species == "versicolor") 231 | 232 | # Confirm distribution of modified outcome variable. 233 | table(data$Species, iris$Species, useNA = "ifany") 234 | 235 | library(SuperLearner) 236 | 237 | set.seed(1) 238 | 239 | # family = binomial() is used for classification; family = gaussian() for regression. 240 | sl = SuperLearner(X = data[, -5], Y = data$Species, family = binomial(), 241 | SL.library = c("SL.mean", "SL.rpart")) 242 | sl 243 | 244 | # Review the raw rpart object. 245 | sl$fitLibrary$SL.rpart_All$object 246 | 247 | # Use our nice plotting library. 248 | rpart.plot::rpart.plot(sl$fitLibrary$SL.rpart_All$object) 249 | 250 | ``` 251 | 252 | # h2o.ai 253 | 254 | We can get close to a single decision tree by using randomForest in h2o. We set RF to fit a single decision tree and to search all variables at each split. It will not be exactly the same due to boostrap sampling but will be similar. 255 | 256 | ```{r} 257 | # install.packages("h2o") # version 3.16 258 | # Or version 3.18: 259 | # install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/rel-wolpert/1/R") 260 | # Or nightly release (3.19): 261 | # install.packages("h2o", type="source", repos="http://h2o-release.s3.amazonaws.com/h2o/master/4203/R") 262 | library(h2o) 263 | 264 | # Start h2o backend. 265 | h2o.init(nthreads = -1) 266 | 267 | # Load iris data into h2o. 268 | iris_h2o = h2o.uploadFile(path = system.file("extdata", "iris_wheader.csv", 269 | package = "h2o"), 270 | destination_frame = "iris_h2o") 271 | 272 | # Confirm it loaded correctly. 273 | summary(iris_h2o) 274 | 275 | # Specify x and y by the column indices. 276 | # Set ntree to 1, and mtries to # of covariates. 277 | # Seed only reproducible when running single-threaded. 
278 | iris_tree = h2o.randomForest(y = 5, x = 1:4, training_frame = iris_h2o, 279 | ntrees = 1, mtries = 4, seed = 1) 280 | 281 | # Review results. 282 | iris_tree 283 | 284 | summary(iris_tree) 285 | 286 | # Review variable importance. 287 | h2o.varimp(iris_tree) 288 | 289 | # Plot variable importance - nice. 290 | h2o.varimp_plot(iris_tree) 291 | 292 | # Shutdown h2o backend. 293 | h2o.shutdown(prompt = F) 294 | ``` 295 | 296 | h2o debugging notes: 297 | 298 | * If you get a "connection refused" error it may mean that your version of Java is too new. 299 | * Java must be JDK 8; h2o does not yet support JDK 9. 300 | * More info here: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/faq/java.html 301 | * Info on how to install JDK8 with homebrew here: http://www.lonecpluspluscoder.com/2017/10/08/installing-other-versions-of-the-java-jdk-via-homebrew/ 302 | 303 | # mlr 304 | 305 | ```{r} 306 | library(mlr) 307 | 308 | # Generate the task for multiple classification (also works for binary). 309 | task = makeClassifTask(data = iris, target = "Species") 310 | 311 | # Get the number of observations 312 | n = getTaskSize(task) 313 | 314 | # Generate the learners. 315 | learners = list(makeLearner("classif.rpart", id = "rpart", predict.type = "prob")) 316 | 317 | # 5-fold cross-validation, stratifying on Y to ensure balance across folds. 318 | # could use stratify.cols to stratify on certain important covariates. 319 | rdesc = makeResampleDesc("CV", iters = 5L, stratify = TRUE) 320 | 321 | # Fit model across cross-validation folds and calculate the performance. 322 | result = benchmark(learners, task, rdesc, measures = list(acc, mmce)) 323 | 324 | # MMCE = mean misclassification error (i.e. 1 - accuracy) 325 | result 326 | 327 | # Plot the results. Generally we would plot multiple models here. 328 | plotBMRBoxplots(result, measure = acc) 329 | ``` 330 | 331 | 332 | # Decision tree references 333 | 334 | Awesome new data camp course: [Machine Learning with Tree-based Models in R](https://www.datacamp.com/courses/machine-learning-with-tree-based-models-in-r) 335 | 336 | * By Berkeley's own Erin LeDell, now machine learning scientist at h2o.ai 337 | 338 | This book has nearly everything you would want to know about the theory of decision trees: 339 | 340 | Breiman, L., Friedman, J., Stone, C. J., & Olshen, R. A. (1984). Classification and regression trees. CRC press. 341 | 342 | The book has 32,000 citations according to Google Scholar. Not too shabby! Breiman and Stone were both Berkeley professors, and Breiman invented Random Forest, bagging, and some of the theory for SuperLearner & gradient boosted machines. Friedman is at Stanford and invented many other machine learning algorithms, particularly gradient boosted machines GBM) and multivariate adaptive regression splines (MARS). Olshen is also at Stanford. 
343 | -------------------------------------------------------------------------------- /binder/apt.txt: -------------------------------------------------------------------------------- 1 | libnlopt-dev 2 | default-jdk -------------------------------------------------------------------------------- /binder/binder.md: -------------------------------------------------------------------------------- 1 | RStudio: [![Binder](http://mybinder.org/badge.svg)](http://beta.mybinder.org/v2/gh/dlab-berkeley/MachineLearningWG/master?urlpath=rstudio) -------------------------------------------------------------------------------- /binder/install.R: -------------------------------------------------------------------------------- 1 | install.packages("devtools") 2 | devtools::install_github(c("ecpolley/SuperLearner", "ck37/ck37r")) 3 | cran_packages = 4 | c("rpart", "rpart.plot", "partykit", "mlr", "car", "caret", 5 | "ggplot2", "lattice", "plotmo", "randomForest", "ROCR", 6 | "survival", "xgboost", "h2o", "glmnet") 7 | ck37r::load_packages(cran_packages, auto_install = TRUE) 8 | -------------------------------------------------------------------------------- /binder/runtime.txt: -------------------------------------------------------------------------------- 1 | r-2018-02-15 -------------------------------------------------------------------------------- /intro.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/MachineLearningWG/0d098fb4a4d00d8f73cfded1c287c9fb465b3bb9/intro.pptx --------------------------------------------------------------------------------