├── .Rbuildignore
├── .gitignore
├── DESCRIPTION
├── IntroductiontoMachineLearning.Rproj
├── LICENSE
├── LICENSE.md
├── NAMESPACE
├── NEWS.md
├── README.md
├── Your_forest_explained_files
│   └── figure-html
│       ├── unnamed-chunk-3-1.png
│       ├── unnamed-chunk-5-1.png
│       ├── unnamed-chunk-6-1.png
│       ├── unnamed-chunk-7-1.png
│       └── unnamed-chunk-8-1.png
├── data
│   ├── x.rda
│   └── y.rda
├── docs
│   ├── Douglass_IntroductionToMachineLearning_2018_FinalExame.nb.html
│   ├── Douglass_IntroductionToMachineLearning_2018_FinalExame.rmd
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day1.nb.html
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day1.rmd
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2.nb.html
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2.rmd
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2b.nb.html
│   ├── Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2b.rmd
│   └── min_depth_frame.rda
├── revdep
│   ├── .gitignore
│   └── email.yml
├── src
│   └── .gitignore
└── tests
    ├── testthat.R
    └── testthat
        └── test-my-test.R
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^LICENSE\.md$
2 | ^revdep$
3 | ^.*\.Rproj$
4 | ^\.Rproj\.user$
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 |
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: IntroductiontoMachineLearning
2 | Version: 0.0.0.9000
3 | Title: What the Package Does (One Line, Title Case)
4 | Description: What the package does (one paragraph).
5 | Authors@R: person("First", "Last", , "first.last@example.com", c("aut", "cre"))
6 | License: MIT + file LICENSE
7 | Encoding: UTF-8
8 | LazyData: true
9 | Suggests:
10 | MASS,
11 | testthat
12 | Roxygen: list(markdown = TRUE)
13 | RoxygenNote: 6.0.1
14 | LinkingTo:
15 | Rcpp
16 | Imports:
17 | Rcpp
18 |
--------------------------------------------------------------------------------
/IntroductiontoMachineLearning.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | BuildType: Package
16 | PackageUseDevtools: Yes
17 | PackageInstallArgs: --no-multiarch --with-keep.source
18 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2018
2 | COPYRIGHT HOLDER: My Name
3 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | Copyright (c) 2018 My Name
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: fake comment so roxygen2 overwrites silently.
2 | exportPattern("^[^\\.]")
3 |
--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
1 | # IntroductiontoMachineLearning 0.0.0.9000
2 |
3 | * Added a `NEWS.md` file to track changes to the package.
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction to Machine Learning
2 |
3 | [Course Syllabus and Code Day 1](https://CenterForPeaceAndSecurityStudies.github.io/IntroductiontoMachineLearning/Douglass_IntroductionToMachineLearning_2018_Syllabus_Day1.nb.html)
4 |
5 | [Course Syllabus and Code Day 2](https://CenterForPeaceAndSecurityStudies.github.io/IntroductiontoMachineLearning/Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2.nb.html)
6 |
7 | [Course Slides (Day 1 Part A)](https://docs.google.com/presentation/d/19i2om_jwK8m3a-jNvgtM-WMT1l1HAGaGuWeb4bgLsTM/edit?usp=sharing)
8 |
9 | [Course Slides (Day 1 Part B)](https://docs.google.com/presentation/d/1Z857fFS692ijppXZzrPVjsVxsllwDKzoFBWOchRjDfU/edit?usp=sharing)
10 |
11 | [Course Slides (Day 2 Part A)](https://docs.google.com/presentation/d/1HRzRTjz31vt_HwOkKE_jNUYhg1a2LrOxS6LM3RI9dE4/edit?usp=sharing)
12 |
13 | [Course Slides (Day 2 Part B)](https://docs.google.com/presentation/d/1GSKQeoYWTVlIfWQIV9pXyoZW3DVdL2ylmaUU5dN9sGA/edit?usp=sharing)
14 |
15 |
16 | Rex W. Douglass
17 | Director Machine Learning for Social Science Lab (MSSL)
18 | Center for Peace and Security Studies (cPASS)
19 | Department of Political Science
20 | University of California San Diego
21 | rexdouglass@gmail.com
22 | www.rexdouglass.com
23 | @rexdouglass
24 |
25 |
26 |
--------------------------------------------------------------------------------
/Your_forest_explained_files/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/Your_forest_explained_files/figure-html/unnamed-chunk-3-1.png
--------------------------------------------------------------------------------
/Your_forest_explained_files/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/Your_forest_explained_files/figure-html/unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/Your_forest_explained_files/figure-html/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/Your_forest_explained_files/figure-html/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/Your_forest_explained_files/figure-html/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/Your_forest_explained_files/figure-html/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/Your_forest_explained_files/figure-html/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/Your_forest_explained_files/figure-html/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/data/x.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/data/x.rda
--------------------------------------------------------------------------------
/data/y.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/data/y.rda
--------------------------------------------------------------------------------
/docs/Douglass_IntroductionToMachineLearning_2018_FinalExame.rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "R Notebook"
3 | output: html_notebook
4 | editor_options:
5 | chunk_output_type: inline
6 | ---
7 |
8 | 0) (Yes/No) Can I touch the test data now?
9 |
10 | 1) (T/F) Machine learning solves the fundamental problem of causal inference.
11 |
12 | 2) (T/F) Machine learning makes statistics unnecessary.
13 |
14 | 3) (T/F) Machine learning is only necessary for 'big' data, while statistics is appropriate for 'small' data.
15 |
16 | 4) What are the parts of the Shannon-Weaver model of communication?
17 |
18 | 5) (T/F) Poor accuracy of a prediction is always due to measurement error/noise in the message.
19 |
20 | 6) (T/F) Information sources can be constants that never vary.
21 |
22 | 7) (T/F) Messages can carry information even if the source itself does not vary.
23 |
24 | 8) (T/F) The amount of information in a message increases with the length of the message.
25 |
26 | 9) (T/F) A fair coin flip contains more information than an unfair one.
27 |
28 | 10) (T/F) A prediction $\hat{Y}$ is a statement about the true state of the information source $Y$.
29 |
30 | 11) (T/F) Predictions only come from quantitative models.
31 |
32 | 11) Name 5 methods for scoring binary predictions against binary information sources.
33 |
34 | 12) (T/F) All kinds of prediction errors are equally important/equally bad independent of the application.
35 |
36 | 13) (T/F) Accuracy weights false positives and false negatives equally.
37 |
38 | 13) (T/F) Precision penalizes false positives more.
39 |
40 | 14) Why might you want to penalize false positives more?
41 |
42 | 15) (T/F) Recall penalizes false negatives more.
43 |
44 | 16) Why might you want to penalize false negatives more?
45 |
46 | 17) F1 combines both precision and recall. How does it weight the two?
47 |
48 | 18) Why doesn't combining precision and recall just turn back into accuracy?
49 |
50 | 19) A prediction that scores highly on one rule should score highly on others.
51 |
52 | 20) Random chance predictions should score poorly on any rule.
53 |
54 | 21) If a prediction scores well, the method/person making the prediction must understand the information source/transmission function.
55 |
56 | 22) If a soft method provides a fractional prediction, e.g. in [0.0, 1.0], then it is necessarily a probability.
57 |
58 | 23) Name three methods for evaluating fractional predictions.
59 |
60 | 24) What is on the Y axis of an ROC plot?
61 |
62 | 25) What is on the X axis of an ROC plot?
63 |
64 | 26) What does an ROC curve lying on the 45-degree diagonal represent?
65 |
66 | 27) What does an ROC curve in the top left corner represent?
67 |
68 | 28) What does an ROC curve strictly further out toward the top left than another curve represent?
69 |
70 | 29) What do two ROC curves crossing each other represent?
71 |
72 | 30) What is on the Y axis of a Precision-Recall Curve plot?
73 |
74 | 31) Why might you want to use a PRC plot instead of an ROC plot?
75 |
76 | 32) What is class imbalance (in a binary outcome setting), e.g. what is skew?
77 |
78 | 33) How can changing class imbalance affect the score of a prediction, holding everything else the same?
79 |
80 | 34) What are two scoring rules that are robust to changes in skew?
81 |
82 | 35) What are three scoring rules for real valued outcomes?
83 |
84 | 36) Substantively, what kinds of errors does mean squared error care more about, in contrast to absolute error?
85 |
86 | 37) What problem is Huber loss designed to solve?
87 |
88 | 38) What is a message?
89 |
90 | 39) (T/F) A message must contain as many symbols as there are states of the information source.
91 |
92 | 40) (T/F) A message must map to the true state of the information source.
93 |
94 | 41) (T/F) The longer the message, the more information it contains about the information source.
95 |
96 | 42) (T/F) Two identical messages can be constructed with completely different symbols.
97 |
98 | 43) What is information?
99 |
100 | 44) (T/F) All messages are informative.
101 |
102 | 45) What is the maximum number of bits transmitted about a 0-bit source by a 0-bit message?
103 |
104 | 46) What is the maximum number of bits transmitted about a 0-bit source by a 1-bit message?
105 |
106 | 47) What is the maximum number of bits transmitted about a 0-bit source by an N-bit message?
107 |
108 | 48) What is the maximum number of bits transmitted about a 1-bit source by a 0-bit message?
109 |
110 | 49) What is the maximum number of bits transmitted about a 1-bit source by a 1-bit message?
111 |
112 | 50) What is the maximum number of bits transmitted about a 1-bit source by an N-bit message?
113 |
114 | 51) What is the maximum number of bits transmitted about a 2-bit source by a 0-bit message?
115 |
116 | 52) What is the maximum number of bits transmitted about a 2-bit source by a 1-bit message?
117 |
118 | 53) What is the maximum number of bits transmitted about a 2-bit source by a 2-bit message?
119 |
120 | 54) What is the maximum number of bits transmitted about a 2-bit source by an N-bit message?
125 |
126 | 55) (T/F) Holding bandwidth of the channel constant, the medium through which you send the message doesn't matter.
127 |
128 | 56) (T/F) A transmitter must necessarily condition on the state of the information source.
129 |
130 | 57) (T/F) Even if we don't design it, we always get to observe the transmitter.
131 |
132 | 58) (T/F) The transmitter necessarily encodes all of the information from the information source.
133 |
134 | 59) (T/F) Some transmitters intentionally obscure the state of the information source.
135 |
136 | 60) (T/F) Some transmitters are intentionally lossy, partially encoding the state of the information source.
137 |
138 | 61) (T/F) A microphone, a temperature gauge, and a public opinion poll are all transmitters.
139 |
140 | 62) (T/F) Every transmitter necessarily has an equivalent receiver that perfectly inverts the encoding.
141 |
142 | 63) (T/F) Every transmitter necessarily has one and only one receiver that perfectly inverts the encoding.
143 |
144 | 64) (T/F) Receivers can be degenerate, producing predictions not mapped to true states of the information source.
145 |
146 | 65) (T/F) Receivers can ignore the message entirely.
147 |
148 | 66) (T/F) A receiver that successfully decodes a message must necessarily invert the operations performed by the transmitter.
149 |
150 | 67) (T/F) In unsupervised learning there is no true $Y$.
151 |
152 | 68) (T/F) Supervised learning is when you get to observe the true state $Y$ for some set of examples, and unsupervised learning is when you don't get to observe the true state for any examples.
153 |
154 | 69) (T/F) There is no receiver that is equally good for every transmitter.
155 |
156 | 70) (T/F) If a receiver produces predictions with high accuracy, it is inverting the transmission function.
157 |
158 | 71) (T/F) If a receiver produces poor predictions, that must be due to noise in the message.
159 |
160 | 72) (T/F) Accuracy of predictions developed from in-sample data is a good estimate of accuracy on new, unseen observations.
161 |
162 | 73) (T/F) Accuracy of predictions developed from out-of-sample data is a good estimate of accuracy on new, unseen observations.
163 |
164 | 74) What is the cost of holding back some data as a test set?
165 |
166 | 75) Before seeing any messages, can you correctly guess which receiver will be the most accurate?
167 |
168 | 76) What's the difference between best case, worst case, and average accuracy of a receiver across potential transmitters?
169 |
170 | 77) What does the no free lunch theorem state?
171 |
172 | 78) Why is feature selection necessary?
173 |
174 | 79) Why is feature engineering necessary?
175 |
176 | 80) What does it mean for a model to be underdetermined?
177 |
178 | 81) What is the bias-variance tradeoff?
179 |
180 | 82) What is overfitting?
181 |
182 | 83) What is the curse of dimensionality?
183 |
184 | 96) What are your three favorite observational quantitative social science papers?
185 |
186 | 97) Did they employ training/test split or cross-validation? How?
187 |
188 | 98) Did they evaluate the accuracy of their models' predictions? How?
189 |
190 | 99) Did they compare that accuracy to a reasonable null baseline?
191 |
192 | 100) (Yes/No) Can I touch the test data now?
193 |
194 |
--------------------------------------------------------------------------------
/docs/Douglass_IntroductionToMachineLearning_2018_Syllabus_Day1.rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Machine Learning (Syllabus/Code for Day 1): Information Theory and Problems in Learning"
3 | output:
4 | html_notebook:
5 | toc: true # table of content true
6 | toc_depth: 3 # upto three depths of headings (specified by #, ## and ###)
7 | number_sections: true ## if you want number sections at each table header
8 | highlight: tango # specifies the syntax highlighting style
9 | toc_float: true
10 | ---
11 |
12 |
13 | ```{css}
14 |
15 | pre code, pre, code {
16 | white-space: pre !important;
17 | overflow-x: scroll !important;
18 | word-break: keep-all !important;
19 | word-wrap: initial !important;
20 | }
21 |
22 | code.r{
23 | overflow-x: scroll !important;
24 | }
25 |
26 | ```
27 |
28 | Rex W. Douglass
29 | Director Machine Learning for Social Science Lab (MSSL)
30 | Center for Peace and Security Studies (cPASS)
31 | Department of Political Science
32 | University of California San Diego
33 | rexdouglass@gmail.com
34 | www.rexdouglass.com
35 | @rexdouglass
36 |
37 | # Course Overview
38 |
39 | Please bring a two-sided coin (or several) and scratch paper to class for passing notes during the demonstrations.
40 |
41 | This is a 6 hour introduction to machine learning spread across two three-hour lectures. The goal of this very short course is narrow: to give you enough of an overview, vocabulary, and intuition, so that you can identify machine learning problems in the wild and begin your own research into relevant literatures and possible approaches. The goal is not to train you to execute a particular machine learning solution. There are far too many approaches available; they may not cover whatever problem you find; and the state of the art will be different in a year or two anyway. Instead, we will learn how to think about and classify problems into broad types, how to define and measure the efficacy of different solutions to that problem, how to avoid some common and subtle mistakes, and how to think about a full machine learning pipeline from start to finish.
42 |
43 |
44 | ## Course Slides
45 | * [Course Slides (1A)](https://docs.google.com/presentation/d/19i2om_jwK8m3a-jNvgtM-WMT1l1HAGaGuWeb4bgLsTM/edit?usp=sharing)
46 | * [Course Slides (1B)](https://docs.google.com/presentation/d/1Z857fFS692ijppXZzrPVjsVxsllwDKzoFBWOchRjDfU/edit?usp=sharing)
47 | * [Course Slides (2A)](https://docs.google.com/presentation/d/1HRzRTjz31vt_HwOkKE_jNUYhg1a2LrOxS6LM3RI9dE4/edit?usp=sharing)
48 | * [Course Slides (2B)](https://docs.google.com/presentation/d/1GSKQeoYWTVlIfWQIV9pXyoZW3DVdL2ylmaUU5dN9sGA/edit?usp=sharing)
49 |
50 | ## Readings Policy
51 | Math and programming are not something you learn, they're something you get used to. The readings of this course are, with a few exceptions, voluntary and intended for self study. They are to help point you in the right direction when you realize you need to start brushing up on a particular set of tools in order to tackle a particular problem.
52 |
53 | ## Textbooks
54 |
55 | Do not purchase any books - each of these should be available for free online at the link given. Any individual one would provide a decent background to the field of machine learning. For this course, I've picked select chapters when I thought they did a good job reviewing a specific subtopic.
56 |
57 | * (CIML) [A course in machine learning](http://ciml.info/), Hal Daume III
58 | * (ESL) [Elements of Statistical Learning](https://web.stanford.edu/~hastie/ElemStatLearn/), Trevor Hastie and Robert Tibshirani
59 | * (ISL) [An introduction to statistical learning: with application in R, Gareth James](https://www-bcf.usc.edu/~gareth/ISL/ISLR%20Seventh%20Printing.pdf), Daniela Witten, Trevor Hastie, and Robert Tibshirani
60 | * (IML) [Introduction to Machine Learning](http://alex.smola.org/drafts/thebook.pdf), Alex Smola and S.V.N. Vishwanathan
61 | * (IntroMachineLearningWithR) ["An Introduction to Machine Learning with R"](https://lgatto.github.io/IntroMachineLearningWithR/index.html), Laurent Gatto, 2017-10-18
62 | * (ML) [Machine Learning: The art and science of algorithms that make sense of data](http://dsd.future-lab.cn/members/2015nlp/Peter_Flach_Machine_Learning._The_Art_and_Scienc(BookZZ.org).pdf), Flach
63 | * (MLPP) [Machine Learning: A Probabilistic Perspective](https://www.cs.ubc.ca/~murphyk/MLbook/), Kevin Murphy
64 | * (PRML) [Pattern recognition and machine learning](http://users.isr.ist.utl.pt/~wurmd/Livros/school/Bishop%20-%20Pattern%20Recognition%20And%20Machine%20Learning%20-%20Springer%20%202006.pdf), Christopher M. Bishop
65 | * (WMLW) ["WHY MACHINE LEARNING WORKS"](http://www.cs.cmu.edu/~gmontane/montanez_dissertation.pdf), George D. Montanez, May 2017, Dissertation
66 |
67 | ## General Resources
68 |
69 | Related Classes
70 | * [COMS W4721 Machine Learning for Data Science](http://www.columbia.edu/~jwp2128/Teaching/W4721/Spring2017/W4721Spring2017.html)
71 |
72 | There are a number of places online for constant updates on machine learning:
73 | * [Reddit Machine Learning Subreddit](https://www.reddit.com/r/MachineLearning/)
74 | * [arxiv](https://twitter.com/arxiv_org)
75 | * [Arxiv Sanity Preserver](http://www.arxiv-sanity.com/)
76 | * [My twitter feed](https://twitter.com/RexDouglass)
77 | * [Political Analysis](https://www.cambridge.org/core/journals/political-analysis)
78 | * [openreview](https://openreview.net/)
79 | * [Distill](https://distill.pub/)
80 |
81 | Conferences
82 | * [Top Conferences for Machine Learning & Arti. Intelligence](http://www.guide2research.com/topconf/machine-learning)
83 | * [Neural Information Processing Systems (NIPS)](https://nips.cc/)
84 | * [International Conference on Machine Learning](https://icml.cc/)
85 |
86 | ## Data Archives
87 | * [UC Irvine Machine Learning Repository](http://archive.ics.uci.edu/ml/index.php)
88 | * [Kaggle Datasets](https://www.kaggle.com/datasets?sortBy=votes&group=all)
89 | * [List_of_datasets_for_machine_learning_research](https://en.wikipedia.org/wiki/List_of_datasets_for_machine_learning_research)
90 |
91 | ## Software and Programming
92 |
93 | Students are not expected to know any particular language or set of software. We will be demonstrating best practices as used in the Machine Learning for Social Science Lab at the Center for Peace and Security Studies, UCSD. In that lab, our software stack consists of Python and R for data preparation and analysis, Spark for database management, Keras/Tensorflow for deep learning, Github for revision control, and Ubuntu for our operating system and command-line tools.
94 |
95 | * [MACS 305001 - Computing for the Social Sciences](https://cfss.uchicago.edu/index.html), Benjamin Soltoff
96 | * ["R for Data Science"](http://r4ds.had.co.nz/), Garrett Grolemund
97 | * ["Spark and sparklyr,"](https://cfss.uchicago.edu/distrib003_spark.html)
98 | * ["GitHub and RStudio,"](https://resources.github.com/articles/github-and-rstudio/)
99 | * ["Data Science at the Command Line,"](https://www.datascienceatthecommandline.com/), Jeroen Janssens, February 8, 2018
100 | * ["Data Visualization: A practical introduction"](http://socviz.co/index.html?utm_content=buffer09710&utm_medium=social&utm_source=twitter.com&utm_campaign=buffer),Kieran Healy
101 | * ["Introduction to Validate," https://cran.r-project.org/web/packages/validate/vignettes/introduction.html)
102 | * ["An introduction to regular expressions"](https://www.oreilly.com/ideas/an-introduction-to-regular-expressions), Thomas Nield, December 13, 2017 ,
103 | * [RegExplain]("https://github.com/gadenbuie/regexplain/#readme")
104 | * ["The Plain Person’s Guide to Plain Text Social Science"](http://plain-text.co/), Kieran Healy, 2018-04-28
105 | * "Statistical Data Cleaning with Applications in R"
106 | * [scikit-learn](http://scikit-learn.org/stable/)
107 | * [Guide to SuperLearner](ftp://cran.r-project.org/pub/R/web/packages/SuperLearner/vignettes/Guide-to-SuperLearner.html), Chris Kennedy, March 16, 2017
108 |
109 | ## Applications
110 | * ["ViEWS: a political Violence Early-Warning System,"](http://pcr.uu.se/research/views/)
111 | * ["Safe Disposal of Unexploded WWII Bombs,"](http://staff.math.su.se/hoehle/blog/2018/05/25/uxb.html) Michael Höhle, May 25, 2018,
112 | * ["Predicting Race and Ethnicity From the Sequence of Characters in a Name,"](https://arxiv.org/pdf/1805.02109.pdf),Gaurav Sood and Suriyan Laohaprapanon, May 8, 2018,
113 | * ["ethnicolr: Predict Race and Ethnicity From Name,"](https://github.com/appeler/ethnicolr)
114 | * Weidmann, Nils B. and Sebastian Schutte. "Using Night Lights for the Prediction of Local Wealth." Journal of Peace Research 54(2).
116 |
117 |
118 | ## Notes on this Guide
119 |
120 | This guide is written as an [R notebook](https://bookdown.org/yihui/rmarkdown/notebook.html) using [R-Studio](https://www.rstudio.com/). It renders output as static HTML that you can view in a regular web browser.
121 |
122 | ```{r}
123 | #install.packages("pacman")
124 | library(pacman)
125 | p_load(infotheo)
126 | p_load(tidyverse)
127 | p_load(ggplot2)
128 | p_load(cowplot)
129 | p_load(mlbench)
130 | p_load(Metrics)
131 |
132 | set.seed(123)
133 |
134 | ```
135 |
136 | # Introduction
137 |
138 | ## What is machine learning?
139 | * (CIML) [Chapter 1](http://ciml.info/dl/v0_99/ciml-v0_99-ch01.pdf)
140 | * (WMLW 1.0) "Introduction"
141 | * (ISL) "Chapter 2 Statistical Learning"
142 |
143 | ## What isn't machine learning?
144 |
145 | ### Statistics
146 | * ["Statistical Modeling: The Two Cultures (with comments and a rejoinder by the author),"](https://projecteuclid.org/download/pdf_1/euclid.ss/1009213726) Leo Breiman, Statistical Science, 2001, Vol. 16, No. 3, 199–231
147 |
148 | ### Causal Inference
149 | * Joshua D. Angrist & Jörn-Steffen Pischke, "Mostly Harmless Econometrics An Empiricist's Companion," 2009
150 | * ["Basic Concepts of Statistical Inference for Causal Effects in Experiments and Observational Studies,"](http://www.stat.columbia.edu/~cook/qr33.pdf) Donald B. Rubin, 2004
151 | * ["When and How Should One Use Deep Learning for Causal Effect Inference"](https://technionmail-my.sharepoint.com/personal/urishalit_technion_ac_il/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Furishalit_technion_ac_il%2FDocuments%2FPresentations%2FIAS2018%2FIAS2018_2_for_public%2Epdf&parent=%2Fpersonal%2Furishalit_technion_ac_il%2FDocuments%2FPresentations%2FIAS2018&slrid=aaa2759e-909e-5000-fd16-3e33cabf926f)
152 | * ["Comparing Covariate Prioritization via Matching to Machine Learning Methods for Causal Inference using Five Empirical Applications,"](https://arxiv.org/pdf/1805.03743.pdf),Luke Keele, Dylan Small, May 11, 2018,
153 |
154 | # Information Theory
155 |
156 | * [Information_theory](https://en.wikipedia.org/wiki/Information_theory)
157 | * ["Visual Information Theory,"](http://colah.github.io/posts/2015-09-Visual-Information/) Christopher Olah, , October 14, 2015,
158 | * [Shannon Weaver Model](https://en.wikipedia.org/wiki/Shannon%E2%80%93Weaver_model)
159 | * PRML 1.6 "Information Theory"
160 | * ["A Mathematical Theory of Communication,"](http://math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf), C. E. SHANNON, October 1948, The Bell System * Technical Journal, Vol. 27, pp. 379–423, 623–656, July,
161 |
162 | # Information Sources $Y$
163 |
164 | ## Random Variables and Distributions
165 | * [Random_variable](https://en.wikipedia.org/wiki/Random_variable)
166 | * [Binomial_distribution](https://en.wikipedia.org/wiki/Binomial_distribution)
167 | * ["Review of Probability Theory"](http://cs229.stanford.edu/section/cs229-prob.pdf), Arian Maleki and Tom Do
168 | * PRML 2.0
169 |
170 | ## Entropy
171 | * [Entropy (information_theory)](https://en.wikipedia.org/wiki/Entropy_(information_theory))
172 |
173 | * [Package ‘entropy,’ "Estimation of Entropy, Mutual Information and Related Quantities,"](http://strimmerlab.org/software/entropy/), February 19, 2015
174 | * [infotheo](https://cran.r-project.org/web/packages/infotheo/index.html), Patrick E. Meyer, 2014-07-26, R Package
175 | * [Philentropy: Information Theory and Distance Quantification with R.](https://doi.org/10.21105/joss.00765), Drost, (2018), Journal of Open Source Software, 3(26), 765
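
These packages estimate entropy from samples; for intuition it also helps to compute it straight from the definition $H = -\sum_i p_i \log_2 p_i$. A minimal base-R sketch (my illustration, not part of the original course code):

```{r}
# Entropy of a Bernoulli(p) source in bits, from the definition
# H(p) = -p*log2(p) - (1-p)*log2(1-p); 0*log(0) is taken to be 0.
bernoulli_entropy <- function(p) {
  stopifnot(p >= 0, p <= 1)
  terms <- c(p, 1 - p)
  terms <- terms[terms > 0]
  -sum(terms * log2(terms))
}
bernoulli_entropy(0.5) # fair coin: 1 bit
bernoulli_entropy(0.8) # unfair coin: less than 1 bit
bernoulli_entropy(1.0) # two-headed coin: 0 bits
```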
176 |
177 | ## Zero-Bit source
178 |
179 | * Zero-Bit information sources are constants that don't vary.
180 | * Zero-Bit source, Zero-Bit message: No variation and no measurement.
181 | * Zero-Bit information source, One-Bit message: No variation, but a single measurement with variation between two states.
182 | * Zero-Bit information source, N-Bit message: No variation, and an arbitrary number of measurements with an arbitrary number of states.
183 |
184 | * ["Design, Inference, and the Strategic Logic of Suicide Terrorism"](https://pdfs.semanticscholar.org/f192/cf69908c84d92d269ef52c337fa487d6b65e.pdf), SCOTT ASHWORTH, JOSHUA D. CLINTON, ADAM MEIROWITZ, and KRISTOPHER W. RAMSAY, 2008, APSR
185 |
186 | ## Binary Source ($\leq 1$ bit)
187 |
188 | One-Bit information sources are variables that can take on two different states, e.g. a coin flip. Call $Y$ the true state at the source, and $\hat{Y}$ the mental model of the state at the destination.
189 |
190 | * (IntroMachineLearningWithR) "Chapter 3 Example datasets"
191 | * [Binary_classification](https://en.wikipedia.org/wiki/Binary_classification)
192 |
193 |
194 | ```{r}
195 | library(infotheo)
196 | N <- 699 #Flip a coin N times (Matched to Breast Cancer Dataset Below)
197 | sample_space <- c(1,0) #Heads and Tails
198 | ```
199 |
200 | ### Fair Coin
201 |
202 | A fair coin has equal likelihood of both heads and tails. Estimated entropy is close to the true value of 1 bit.
203 |
204 | ```{r}
205 | p <- 0.5 #Fair Coin
206 | Y_coin_fair <- sample(sample_space, size = N, replace = TRUE, prob = c(p, 1 - p))
207 | print(table(Y_coin_fair))
208 | print(natstobits(entropy(Y_coin_fair, method="emp")))
209 | ```
210 |
211 | ### Unfair Coin (p=0.8)
212 | An unfair coin is weighted to be more likely to land heads or tails. Estimated entropy is less than a full bit. There is less surprise than from a fair coin flip.
213 |
214 | ```{r}
215 | p <- 0.8 #Unfair Coin
216 | Y_coin_unfair <- sample(sample_space, size = N, replace = TRUE, prob = c(p, 1 - p))
217 | print(table(Y_coin_unfair))
218 | print(natstobits(entropy(Y_coin_unfair, method="emp")))
219 | ```
220 |
221 | ### Two-Headed Coin
222 | A two-headed coin will only ever land one way.
223 | ```{r}
224 | p <- 1 #Two headed coin
225 | Y_coin_twoheaded <- sample(sample_space, size = N, replace = TRUE, prob = c(p, 1 - p))
226 | print(table(Y_coin_twoheaded))
227 | print(natstobits(entropy(Y_coin_twoheaded, method="emp")))
228 | ```
229 |
230 | ### [UCI Breast Cancer Dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic))
231 | ["Breast Cancer"](https://rpubs.com/raviolli77/352956), Raul Eulogio, January 26, 2018
232 | ```{r}
233 | data(BreastCancer)
234 | glimpse(BreastCancer)
235 | summary(BreastCancer$Class)
236 | ```
237 |
238 | ```{r}
239 | print(natstobits(entropy(BreastCancer$Class, method="emp")))
240 | ```
241 |
242 | ## Multi-class Sources
243 |
244 | ### [Iris Dataset](https://archive.ics.uci.edu/ml/datasets/iris)
245 |
246 | ```{r}
247 | data(iris)
248 | glimpse(iris)
249 | summary(iris$Species)
250 | print(natstobits(entropy(iris$Species, method="emp")))
251 | ```
252 |
253 | ## Real Valued Sources
254 |
255 | ### [Boston Housing Data](https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html)
256 |
257 | ```{r, echo=F}
258 | #crim per capita crime rate by town
259 | #zn proportion of residential land zoned for lots over 25,000 sq.ft
260 | #indus proportion of non-retail business acres per town
261 | #chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
262 | #nox nitric oxides concentration (parts per 10 million)
263 | #rm average number of rooms per dwelling
264 | #age proportion of owner-occupied units built prior to 1940
265 | #dis weighted distances to five Boston employment centres
266 | #rad index of accessibility to radial highways
267 | #tax full-value property-tax rate per USD 10,000
268 | #ptratio pupil-teacher ratio by town
269 | #b 1000(B − 0.63)2 where B is the proportion of blacks by town
270 | #lstat percentage of lower status of the population
271 | #medv median value of owner-occupied homes in USD 1000’s
272 | ```
273 |
274 | ```{r}
275 | data(BostonHousing)
276 | glimpse(BostonHousing)
277 | summary(BostonHousing$medv)
278 | print(natstobits(entropy(discretize(BostonHousing$medv), method="emp")))
279 | ```
280 |
281 | # Comparing $Y$ and $\hat{Y}$
282 |
283 | (IntroMachineLearningWithR) 5.4 Classification performance
284 |
285 | ## Binary
286 |
287 | How should we compare the true reality $Y$ to our mental model of it $\hat{Y}$?
288 |
289 | * [Scoring Rules](https://en.wikipedia.org/wiki/Scoring_rule)
290 | * [Evaluation_of_binary_classifiers](https://en.wikipedia.org/wiki/Evaluation_of_binary_classifiers)
291 | * [Confusion_matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
292 |
293 | ### Confusion Matrix
294 |
295 | ```{r}
296 | table(BreastCancer$Class, BreastCancer$Class)
297 | table(BreastCancer$Class, Y_coin_fair)
298 | table(BreastCancer$Class, Y_coin_unfair)
299 | table(BreastCancer$Class, Y_coin_twoheaded)
300 | ```
301 | ### Accuracy
302 | ```{r}
303 | p_load(Metrics)
304 | BreastCancer$Class_binary <- as.numeric(BreastCancer$Class=="malignant")
305 | accuracy(BreastCancer$Class_binary, BreastCancer$Class_binary)
306 | accuracy(BreastCancer$Class_binary, Y_coin_fair)
307 | accuracy(BreastCancer$Class_binary, Y_coin_unfair)
308 | accuracy(BreastCancer$Class_binary, Y_coin_twoheaded)
309 | ```
310 |
311 | ### Precision
312 |
313 | * [Precision_and_recall](https://en.wikipedia.org/wiki/Precision_and_recall)
314 | ```{r}
315 | p_load(Metrics)
316 | Metrics::precision(BreastCancer$Class_binary,
317 | BreastCancer$Class_binary)
318 | Metrics::precision(BreastCancer$Class_binary, Y_coin_fair)
319 | Metrics::precision(BreastCancer$Class_binary, Y_coin_unfair)
320 | Metrics::precision(BreastCancer$Class_binary, Y_coin_twoheaded)
321 | ```
322 |
323 | ### Recall
324 |
325 | ```{r}
326 | p_load(Metrics)
327 | Metrics::recall(BreastCancer$Class_binary,
328 | BreastCancer$Class_binary)
329 | Metrics::recall(BreastCancer$Class_binary, Y_coin_fair)
330 | Metrics::recall(BreastCancer$Class_binary, Y_coin_unfair)
331 | Metrics::recall(BreastCancer$Class_binary, Y_coin_twoheaded)
332 | ```
333 |
334 | ### F1
335 | * [F1_score](https://en.wikipedia.org/wiki/F1_score)
336 | ```{r}
337 | #Note Metrics::f1 doesn't give the correct values
338 | p_load(MLmetrics)
339 | F1_Score(BreastCancer$Class_binary,
340 | BreastCancer$Class_binary)
341 | F1_Score(BreastCancer$Class_binary, Y_coin_fair)
342 | F1_Score(BreastCancer$Class_binary, Y_coin_unfair)
343 | #F1_Score(BreastCancer$Class_binary, Y_coin_twoheaded)
344 | #F1_Score errors out when the prediction is a single constant class, so compute it by hand
345 | print( (0.3447783*1)/(0.3447783+1)*2 ) #harmonic mean 2*P*R/(P+R), with precision = prevalence = 0.3447783 and recall = 1 for the all-ones prediction
346 | ```
347 |
348 |
349 | * [Sensitivity_and_specificity](https://en.wikipedia.org/wiki/Sensitivity_and_specificity)
350 | * ["Finding Similar Items"](http://infolab.stanford.edu/~ullman/mmds/ch3.pdf), Jure Leskovec, Anand Rajaraman, Jeff Ullman, Chapter 3, [Mining of Massive Datasets](http://www.mmds.org/), 2014
351 |
352 | ## Probabilistic Predictions
353 | * [Proper Scoring Rules](https://en.wikipedia.org/wiki/Scoring_rule#Proper_scoring_rules)
354 |
355 | ### Log Loss
356 |
357 | [Log Loss](http://wiki.fast.ai/index.php/Log_Loss)
358 |
359 | ```{r}
360 | MLmetrics::LogLoss(BreastCancer$Class_binary,
361 | BreastCancer$Class_binary)
362 | MLmetrics::LogLoss(BreastCancer$Class_binary, Y_coin_fair)
363 | MLmetrics::LogLoss(BreastCancer$Class_binary, Y_coin_unfair)
364 | MLmetrics::LogLoss(BreastCancer$Class_binary, Y_coin_twoheaded)
365 | ```
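
The same quantity can be computed from its definition; a minimal sketch (my own, with the kind of clipping that libraries such as MLmetrics apply so that hard 0/1 predictions don't produce `log(0)`):

```{r}
# Log loss from the definition: the negative mean log-probability
# assigned to the realized outcome; lower is better. Confident wrong
# guesses (clipped hard 0/1 predictions) are punished severely.
log_loss <- function(y, p, eps = 1e-15) {
  p <- pmax(pmin(p, 1 - eps), eps) # clip so log() stays finite
  -mean(y * log(p) + (1 - y) * log(1 - p))
}
log_loss(BreastCancer$Class_binary, Y_coin_fair)
```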
366 |
367 | ### Area Under the Curve
368 | * [Receiver operating characteristic](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
369 | * [An introduction to ROC analysis](http://people.inf.elte.hu/kiss/13dwhdm/roc.pdf), Tom Fawcett, 2006, Pattern Recognition Letters
370 | * [what-does-auc-stand-for-and-what-is-it](https://stats.stackexchange.com/questions/132777/what-does-auc-stand-for-and-what-is-it)
371 | * [roc-and-precision-recall-with-imbalanced-datasets](https://classeval.wordpress.com/simulation-analysis/roc-and-precision-recall-with-imbalanced-datasets/)
372 |
373 | * [Measuring classifier performance: a coherent alternative to the area under the ROC curve](http://web.cs.iastate.edu/~cs573x/Notes/hand-article.pdf), David J. Hand, Mach Learn (2009) 77: 103–123
374 |
375 | * [Generate ROC Curve Charts for Print and Interactive Use](https://cran.r-project.org/web/packages/plotROC/vignettes/examples.html), Michael C Sachs, 2018-06-23
376 | * [Illustrated Guide to ROC and AUC](https://www.r-bloggers.com/illustrated-guide-to-roc-and-auc/), Raffael Vogler, June 23, 2015
377 |
378 | ```{r}
379 | #Simulate a probabilistic prediction by adding uniform noise to the hard 0/1 predictions
380 | noised_prediction <- function(prediction){ noised <- runif(N, 0, 0.5); noised[prediction==1] <- noised[prediction==1] + 0.5; return(noised) }
381 | AUC(noised_prediction(BreastCancer$Class_binary), BreastCancer$Class_binary)
382 | AUC(noised_prediction(Y_coin_fair), BreastCancer$Class_binary)
383 | AUC(noised_prediction(Y_coin_unfair), BreastCancer$Class_binary)
384 | AUC(noised_prediction(Y_coin_twoheaded), BreastCancer$Class_binary)
385 | ```
386 |
387 | ```{r}
388 | p_load(plotROC)
389 | set.seed(2529)
390 | D.ex <- rbinom(200, size = 1, prob = .5)
391 | M1 <- rnorm(200, mean = D.ex, sd = .65)
392 | M2 <- rnorm(200, mean = D.ex, sd = 1.5)
393 |
394 | test <- data.frame(D = D.ex, D.str = c("Healthy", "Ill")[D.ex + 1],
395 | M1 = M1, M2 = M2, stringsAsFactors = FALSE)
396 | basicplot <- ggplot(test, aes(d = D, m = M1)) + geom_roc(labels = FALSE)
397 | basicplot
398 |
399 |
400 | D.ex <- rbinom(50, 1, .5)
401 | rocdata <- data.frame(D = c(D.ex, D.ex),
402 | M = c(rnorm(50, mean = D.ex, sd = .4), rnorm(50, mean = D.ex, sd = 1)),
403 | Z = c(rep("A", 50), rep("B", 50)))
404 |
405 | ggplot(rocdata, aes(m = M, d = D)) + geom_roc()
406 |
407 | #devtools::install_github("sachsmc/plotROC") #install once if needed; plotROC is already loaded above
408 |
409 | ggplot(rocdata, aes(m = M, d = D, color = Z)) + geom_roc()
410 |
411 |
412 | ```
413 |
414 | ### Precision-Recall Curves
415 |
416 | ["Introduction to the precision-recall plot"](https://classeval.wordpress.com/introduction/introduction-to-the-precision-recall-plot/),
417 |
418 | ### Others
419 | * [Brier_score](https://en.wikipedia.org/wiki/Brier_score)
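
The Brier score is simply the mean squared difference between a forecast probability and the binary outcome. A quick sketch (my illustration, reusing `Y_coin_fair` and the `noised_prediction()` helper defined above):

```{r}
# Brier score for probabilistic binary forecasts; lower is better.
brier <- function(y, p) mean((p - y)^2)
brier(BreastCancer$Class_binary, noised_prediction(BreastCancer$Class_binary))
brier(BreastCancer$Class_binary, noised_prediction(Y_coin_fair))
```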
420 |
421 | ### Imbalanced Data
422 |
423 | * ["Facing Imbalanced Data Recommendations for the Use of Performance Metrics"](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4285355/), László A. Jeni,1 Jeffrey F. Cohn,1,2 and Fernando De La Torre, Int Conf Affect Comput Intell Interact Workshops. 2013; 2013: 245–251.
424 |
425 | ## Multiclass
426 |
427 | * [Multiclass_classification](https://en.wikipedia.org/wiki/Multiclass_classification)
428 | * [Cross Entropy](https://en.wikipedia.org/wiki/Cross_entropy)
429 | * Tom Fawcett (2006) “An introduction to ROC analysis”. Pattern Recognition Letters 27, 861–874. DOI: 10.1016/j.patrec.2005.10.010.
430 | * David J. Hand and Robert J. Till (2001). A Simple Generalisation of the Area Under the ROC Curve for Multiple Class Classification Problems. Machine Learning 45(2), p. 171–186. DOI: 10.1023/A:1010920819831.
431 | * ["pROC: Display and Analyze ROC Curves"](https://web.expasy.org/pROC/), 2018-05-06
432 | * [pROC: an open-source package for R and S+ to analyze and compare ROC curves](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-12-77), Xavier Robin, Natacha Turck, Alexandre Hainard, Natalia Tiberti, Frédérique Lisacek, Jean-Charles Sanchez and Markus Müller (2011). BMC Bioinformatics, 12, p. 77
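
For intuition, multiclass cross-entropy can be computed directly against one-hot truth. A sketch of mine (not from the original course code) using the `iris` data loaded earlier; a uniform forecast over three balanced classes costs $\log_2 3 \approx 1.585$ bits per example:

```{r}
# Multiclass cross-entropy of a probability matrix against true labels.
p_uniform <- matrix(1/3, nrow = nrow(iris), ncol = 3,
                    dimnames = list(NULL, levels(iris$Species)))
truth <- model.matrix(~ Species - 1, data = iris) # one-hot encoded truth
-mean(rowSums(truth * log2(p_uniform))) # = log2(3) bits for the uniform forecast
```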
433 |
434 |
435 | ## Real Valued
436 |
437 | * [Mean_squared_error](https://en.wikipedia.org/wiki/Mean_squared_error)
438 | * [Huber Loss](https://en.wikipedia.org/wiki/Huber_loss)
439 |
440 | ```{r}
441 | y_hat=mean(BostonHousing$medv)
442 | MAE(BostonHousing$medv, y_hat)
443 | MSE(BostonHousing$medv, y_hat)
444 | ```
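
Huber loss is referenced above but not computed; a hand-rolled sketch (my illustration; the default `delta`, the assumed switch point between the quadratic and linear regimes, is my choice):

```{r}
# Huber loss: quadratic for small residuals, linear for large ones,
# so single large outliers dominate less than under squared error.
huber_loss <- function(y, y_hat, delta = 1) {
  r <- abs(y - y_hat)
  mean(ifelse(r <= delta, 0.5 * r^2, delta * (r - 0.5 * delta)))
}
huber_loss(BostonHousing$medv, y_hat)
```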
445 |
446 | # Transmitters and Receivers
447 | * [Function_(mathematics)](https://en.wikipedia.org/wiki/Function_(mathematics))
448 | * [Inverse_function](https://en.wikipedia.org/wiki/Inverse_function)
449 |
450 | ```{r, include=F}
451 | n=100
452 | Y_unif=runif(n=n)
453 | beta_1=.5
454 | X=(Y_unif)/beta_1 #epsilon_gaussian
455 | p1 <- data.frame(X,Y_unif) %>% ggplot(aes(x=Y_unif,y=X)) + geom_point()
456 |
457 | epsilon_gaussian=rnorm(n=n, mean = 0, sd = 1)
458 | beta_1=.5
459 | X=(Y_unif)/beta_1 + (Y_unif^2)/beta_1 #epsilon_gaussian
460 | p2 <- data.frame(X,Y_unif) %>% ggplot(aes(x=Y_unif,y=X)) + geom_point()
461 |
462 | X= Y_unif > .5
463 | p3 <- data.frame(X,Y_unif) %>% ggplot(aes(x=Y_unif,y=X)) + geom_point()
464 |
465 | X= cos(Y_unif)
466 | p4 <- data.frame(X,Y_unif) %>% ggplot(aes(x=Y_unif,y=X)) + geom_point()
467 |
468 | plot_grid(p1,p2,p3,p4, labels = c("A", "B", "C", "D"), ncol = 2)
469 |
470 | ```
471 |
472 |
473 | ## What makes a good receiver?
474 |
475 | ### Risk Analysis
476 | * PRML 1.5 "Decision Theory"
477 | * ESL 2.4 Statistical Decision Theory
478 |
479 | ### Bias-Variance Tradeoff
480 | * ["Understanding the Bias-Variance Tradeoff,"](http://scott.fortmann-roe.com/docs/BiasVariance.html), Scott Fortmann-Roe, 2012,
481 |
482 | ### Overfitting
483 | * [Overfitting](https://en.wikipedia.org/wiki/Overfitting)
484 | * [Training,_test,_and_validation_sets](https://en.wikipedia.org/wiki/Training,_test,_and_validation_sets)
485 |
486 | ### No Free Lunch
487 |
488 | * WMLW 2.0 "Related Work"
489 | * ["No Free Lunch Theorems for Optimization,"](https://ti.arc.nasa.gov/m/profile/dhw/papers/78.pdf) David H. Wolpert and William G. Macready, 1997,
490 |
491 | ## Omitted Variable Bias
492 | * [Simpsons Paradox](https://en.wikipedia.org/wiki/Simpson%27s_paradox)
493 |
494 | ## Feature Selection / Included Variable Bias
495 | * [Let's Put Garbage-Can Regressions and Garbage-Can Probits Where They Belong](http://www.columbia.edu/~gjw10/achen04.pdf), Christopher H. Achen, Conflict Management and Peace Science, 2005
496 | * ["The Phantom Menace: Omitted Variable Bias in Econometric Research"](http://www.saramitchell.org/clarke05.pdf), Kevin A. Clarke, 2005, Conflict Management and Peace Science
497 | * ["Achieving Statistical Significance with Covariates and without Transparency"](https://drive.google.com/file/d/0B13GrSdju4CpcjBKMGU5ZmRjN2c/view), Gabriel Lenz, Alexander Sahn, November 27, 2017
498 |
499 | * [SCANNING DEAD SALMON IN FMRI MACHINE HIGHLIGHTS RISK OF RED HERRINGS](https://www.wired.com/2009/09/fmrisalmon/)
500 | * [“Do We Really Know the WTO Cures Cancer?”](http://www.stephenchaudoin.com/CHH_Cancer_2014_09_18.pdf) Stephen Chaudoin, Jude Hays and Raymond Hicks, British Journal of Political Science.
501 |
502 | * [Feature_selection](https://en.wikipedia.org/wiki/Feature_selection)
503 | * [Confounding](https://en.wikipedia.org/wiki/Confounding)
504 | * ["Statistical learning and selective inference"](http://www.pnas.org/content/pnas/112/25/7629.full.pdf) Jonathan Taylora and Robert J. Tibshirani, June 23, 2015
505 | * ["The garden of forking paths: Why multiple comparisons can be a problem, when there is no “fishing expedition” or “p-hacking” and the research was posited ahead of time"](http://www.stat.columbia.edu/~gelman/research/unpublished/p_hacking.pdf), Andrew Gelman and Eric Loken, 14 Nov 2013, Unpublished Manuscript
506 | * [P-Hacking](https://projects.fivethirtyeight.com/p-hacking/)
507 |
508 | ## Feature Engineering
509 | * [Feature_engineering](https://en.wikipedia.org/wiki/Feature_engineering)
510 |
511 | ## Bandwidth / Underdetermination
512 |
513 | * [Underdetermined system](https://en.wikipedia.org/wiki/Underdetermined_system)
514 |
515 | * [Overdetermined_system](https://en.wikipedia.org/wiki/Overdetermined_system)
516 |
517 | ## Curse of Dimensionality
518 |
519 | * [Curse_of_dimensionality](https://en.wikipedia.org/wiki/Curse_of_dimensionality)
520 | * [The Curse of Dimensionality in classification](http://www.visiondummy.com/2014/04/curse-dimensionality-affect-classification/), Vincent Spruyt
521 | * (PRML) "1.4 The Curse of Dimensionality"
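
A quick simulation of distance concentration (my sketch): as dimension grows, the nearest and farthest neighbors of a point become nearly equidistant, which undermines distance-based methods.

```{r}
# Ratio of (farthest - nearest) to nearest neighbor distance from one
# point, for 100 uniform random points in increasing dimension d.
set.seed(123)
for (d in c(2, 10, 100, 1000)) {
  X <- matrix(runif(100 * d), ncol = d)
  dists <- as.matrix(dist(X))[1, -1] # distances from the first point
  cat(sprintf("d = %4d: (max - min)/min = %.2f\n",
              d, (max(dists) - min(dists)) / min(dists)))
}
```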
522 |
523 |
--------------------------------------------------------------------------------
/docs/Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2.rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Machine Learning (Syllabus/Code for Day 2): Solutions for Learning in Supervised and Unsupervised Settings"
3 | output:
4 | html_notebook:
5 | toc: true # table of content true
6 | toc_depth: 3 # upto three depths of headings (specified by #, ## and ###)
7 | number_sections: true ## if you want number sections at each table header
8 | highlight: tango # specifies the syntax highlighting style
9 | toc_float: true
10 | ---
11 |
12 |
13 | ```{css}
14 |
15 | pre code, pre, code {
16 | white-space: pre !important;
17 | overflow-x: scroll !important;
18 | word-break: keep-all !important;
19 | word-wrap: initial !important;
20 | }
21 |
22 | code.r{
23 | overflow-x: scroll !important;
24 | }
25 |
26 | ```
27 |
28 | ```{r, eval=F, include=F}
29 | #I had some trouble installing caret all in one go, so I went dependency by dependency
30 | install.packages('robustbase')
31 | install.packages('sfsmisc')
32 | install.packages('geometry')
33 | install.packages('profileModel')
34 | install.packages('labelled')
35 | install.packages('dimRed')
36 | install.packages('timeDate')
37 | install.packages('ddalpha')
38 | install.packages('gower')
39 | install.packages('RcppRoll')
40 | install.packages('brglm')
41 | install.packages('qvcalc')
42 | install.packages('plotmo')
43 | install.packages('TeachingDemos')
44 | install.packages('combinat')
45 | install.packages('questionr')
46 | install.packages('ISwR')
47 | install.packages('corpcor')
48 | install.packages('ModelMetrics')
49 | install.packages('recipes')
50 | install.packages('BradleyTerry2')
51 | install.packages('earth')
52 | install.packages('fastICA')
53 | install.packages('gam')
54 | install.packages('ipred')
55 | install.packages('klaR')
56 | install.packages('ellipse')
57 | install.packages('mda')
58 | install.packages('pls')
59 | install.packages('pROC')
60 | install.packages('proxy')
61 | install.packages('spls')
62 |
63 | ```
64 |
65 |
66 | ```{r}
67 | #install.packages("pacman")
68 | library(pacman)
69 | p_load(infotheo)
70 | p_load(tidyverse)
71 | p_load(ggplot2)
72 | p_load(cowplot)
73 | p_load(mlbench)
74 | p_load(Metrics)
75 | #remove.packages("rlang")
76 | #install.packages("rlang", repos = "https://cloud.r-project.org")
77 |
78 | set.seed(123)
79 |
80 | ```
81 |
82 |
83 | # Wisconsin Breast Cancer Dataset
84 |
85 | BreastCancer Dataset
86 | A data frame with 699 observations on 11 variables, one being a character variable, 9 being ordered or nominal, and 1 target class.
87 |
88 | 1. Sample code number: id number
89 | 2. Clump Thickness: 1 - 10
90 | 3. Uniformity of Cell Size: 1 - 10
91 | 4. Uniformity of Cell Shape: 1 - 10
92 | 5. Marginal Adhesion: 1 - 10
93 | 6. Single Epithelial Cell Size: 1 - 10
94 | 7. Bare Nuclei: 1 - 10
95 | 8. Bland Chromatin: 1 - 10
96 | 9. Normal Nucleoli: 1 - 10
97 | 10. Mitoses: 1 - 10
98 | 11. Class: (benign, malignant)
99 |
100 |
101 | [Breast Cancer Wisconsin (Original) Data Set](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original))
102 |
103 | ["Multisurface method of pattern separation for medical diagnosis applied to breast cytology."](http://www.pnas.org/content/pnas/87/23/9193.full.pdf), Wolberg,W.H., Mangasarian,O.L. (1990). In Proceedings of the National Academy of Sciences, 87, 9193-9196.
104 |
105 | Zhang,J. (1992). Selecting typical instances in instance-based learning. In Proceedings of the Ninth International Machine Learning Conference (pp. 470-479). Aberdeen, Scotland: Morgan Kaufmann.
106 |
107 |
108 | ## Cleaning and documentation
109 |
110 | ```{r}
111 | data(BreastCancer)
112 | glimpse(BreastCancer)
113 | summary(BreastCancer$Class)
114 |
115 | BreastCancer$y <- as.factor(as.numeric(BreastCancer$Class=="malignant"))
116 | BreastCancer$Class <- NULL
117 | BreastCancer$Id <- NULL
118 |
119 | BreastCancer[,1:5] <- lapply(BreastCancer[,1:5] , as.numeric)
120 | summary(BreastCancer)
121 | ```
122 |
123 | ```{r, fig.width=25, fig.height=15, cache=T, message=FALSE}
124 | p_load(GGally)
125 | ggpairs(BreastCancer, title = "Breast Cancer Dataset")
126 | ```
127 |
128 | ```{r, fig.width=15, fig.height=10, cache=T }
129 | p_load(corrplot)
130 | p_load(infotheo)
131 | BreastCancer_mi <- mutinformation(BreastCancer, method="emp") %>% natstobits()
132 | #BreastCancer_mi <- BreastCancer_mi/max(BreastCancer_mi)
133 | mi_max <- max( BreastCancer_mi[lower.tri(BreastCancer_mi, diag = FALSE)])
134 | diag(BreastCancer_mi) <-0
135 |
136 | corrplot.mixed(BreastCancer_mi,
137 | cl.lim = c(0,mi_max),
138 | title = "Normalised Mutual Information Breast Cancer Dataset",
139 | mar=c(0,0,1,0),
140 | lower = "ellipse",
141 | upper="number",
142 | is.corr = FALSE,
143 | order = "hclust"
144 | )
145 |
146 |
147 | ```
148 |
149 | ```{r}
150 | p_load(infotheo)
151 | BreastCancer_mi <- mutinformation(BreastCancer, method="emp") %>% natstobits()
152 |
153 | BreastCancer_mi_d <- as.dist(max(BreastCancer_mi)-BreastCancer_mi)
154 | hc <- hclust(BreastCancer_mi_d, method="ward.D2")
155 | plot(hc)
156 |
157 | ```
158 |
159 | There are 16 unexplained missing values on one of the features. We're going to impute those values, being careful not to use the outcome as one of the predictors. This will allow us to make comparisons across methods that do not handle missing values well, and it will also protect us when predicting on new test data, which might have unexplained missingness of its own.
160 |
161 | [MissForest—non-parametric missing value imputation for mixed-type data](https://academic.oup.com/bioinformatics/article/28/1/112/219101), Daniel J. Stekhoven Peter Bühlmann, Bioinformatics, Volume 28, Issue 1, 1 January 2012, Pages 112–118,
162 |
163 | ```{r}
164 | #There are 16 missing values in Bare.nuclei; they're continuous
165 | p_load("missForest")
166 | BreastCancer_imputed <- BreastCancer
167 | BreastCancer_imputed <- missForest(BreastCancer %>% select(-y), verbose = TRUE)$ximp
168 | BreastCancer_imputed$y <- BreastCancer$y
169 |
170 | ```
171 |
172 | Convert categorical variables to 'one-hot' dummy variables
173 |
174 | [Making dummy variables with dummy_cols()](https://cran.r-project.org/web/packages/fastDummies/vignettes/making-dummy-variables.html), Jacob Kaplan, 2018-06-21
175 |
176 | ```{r}
177 | #install.packages('data.table')
178 | p_load(fastDummies)
179 | BreastCancer_onehot <- fastDummies::dummy_cols(BreastCancer_imputed,
180 | select_columns=c("Bare.nuclei",
181 | "Bl.cromatin",
182 | "Normal.nucleoli",
183 | "Mitoses"))
184 | BreastCancer_onehot[,c('Bare.nuclei','Bl.cromatin','Normal.nucleoli','Mitoses')] <- NULL
185 | ```
186 |
187 | # Hold out a Test Set
188 |
189 | The very first thing we're going to do is pull 20% of the Breast Cancer dataset out as a test set, and we're never going to touch it for any reason other than final model evaluation.
190 |
191 | Immediately split off a test set that we will not touch until the very final evaluation.
192 |
193 | ```{r}
194 | N=nrow(BreastCancer)
195 | condition_train <- runif(N)<.8; table(condition_train)
196 |
197 | BreastCancer_train <- BreastCancer_imputed[condition_train,]
198 | BreastCancer_test <- BreastCancer_imputed[!condition_train,]
199 |
200 | BreastCancer_onehot_train <- BreastCancer_onehot[condition_train,]
201 | BreastCancer_onehot_test <- BreastCancer_onehot[!condition_train,]
202 |
203 | ```
204 |
205 | # Supervised Learning
206 | * IMLR ["Chapter 5 Supervised Learning"](https://lgatto.github.io/IntroMachineLearningWithR/supervised-learning.html)
207 |
208 | ```{r}
209 |
210 | formula= y ~ Cl.thickness +
211 | Cell.size +
212 | Cell.shape +
213 | Marg.adhesion +
214 | Epith.c.size +
215 | Bare.nuclei +
216 | Bl.cromatin +
217 | Normal.nucleoli +
218 | Mitoses
219 |
220 | #One Hot formula dummies
221 | formula_onehot = y ~
222 |
223 | Cl.thickness +
224 | Cell.size +
225 | Cell.shape +
226 | Marg.adhesion +
227 | Epith.c.size +
228 |
229 | Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 + Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 +
230 | Bare.nuclei_5 + Bare.nuclei_8 + Bare.nuclei_6 + Bl.cromatin_3 +
231 |
232 | Bl.cromatin_9 + Bl.cromatin_1+Bl.cromatin_2+Bl.cromatin_4+Bl.cromatin_5+Bl.cromatin_7 +
233 | Bl.cromatin_8+Bl.cromatin_6+Bl.cromatin_10+
234 |
235 | Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
236 | Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
237 | Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
238 | Normal.nucleoli_8 +
239 |
240 | Mitoses_1+ Mitoses_5 + Mitoses_4 + Mitoses_2+Mitoses_3 + Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6
241 |
242 | ```
243 |
244 | Register a single back end for cross-validation
245 |
246 | ```{r}
247 |
248 | p_load(caret)
249 | set.seed(123)
250 | cctrl1 <- trainControl(method="cv",
251 | number=10,
252 | returnResamp="all",
253 | classProbs=TRUE,
254 | summaryFunction=twoClassSummary,
255 | savePredictions=TRUE
256 | )
257 |
258 | ```
259 |
260 | # Linear Models
261 | * (ISLR) "Chapter 3 Linear Regression"
262 | * [Ordinary_least_squares](https://en.wikipedia.org/wiki/Ordinary_least_squares)
263 | * (ISLR) "Chapter 4.3 Logistic Regression"
264 | * [Logistic_regression](https://en.wikipedia.org/wiki/Logistic_regression)
265 |
266 | * [Glmnet Vignette](https://cran.r-project.org/web/packages/glmnet/vignettes/glmnet_beta.pdf)
267 |
268 | ```{r}
269 | p_load(glmnet)
270 | set.seed(123)
271 | glm1 <- glm(formula_onehot ,
272 | data=BreastCancer_onehot_train ,
273 | family=binomial(link='probit')
274 | )
275 |
276 | library(broom)
277 | tidy(glm1) #There are 44 features, counting dummified categorical variables
278 |
279 | ```
280 |
281 | Out of sample accuracy?
282 |
283 | ```{r, echo=FALSE, cache=TRUE, warning=FALSE, message=FALSE}
284 |
285 | set.seed(123)
286 | glm_cv <- train(x=BreastCancer_train[,-c(10)],
287 | y=as.factor(paste0('Outcome',BreastCancer_train$y)),
288 | method = "glm",
289 | trControl = cctrl1,
290 | metric = "ROC"#,
291 | #tuneGrid = expand.grid(alpha = 1,lambda = seq(0.001,0.1,by = 0.001) )
292 | )
293 |
294 | print(glm_cv$results$ROC) #Very decent area under the ROC for just a linear model
295 |
296 | #devtools::install_github("hadley/ggplot2")
297 | #devtools::install_github("sachsmc/plotROC")
298 | p_load(plotROC)
299 |
300 | out_of_sample_predictions <- data.frame(y_hat=glm_cv$pred$Outcome1,
301 | y=BreastCancer_train$y[glm_cv$pred$rowIndex],
302 | model="GLM")
303 |
304 | basicplot <- ggplot(out_of_sample_predictions,
305 | aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
306 | style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
307 | basicplot
308 |
309 | ```
310 |
311 |
312 |
313 |
314 | # Variable Selection
315 | * (ESL) "3 Linear Methods for Regression, 3.3 Subset Methods"
316 | * [Stepwise_regression](https://en.wikipedia.org/wiki/Stepwise_regression)
317 |
318 |
319 | ## Feature Importance and P Values
320 | * ["A Machine Learning Alternative to P-values"](https://arxiv.org/pdf/1701.04944.pdf), Min Lu and Hemant Ishwaran, February 22, 2017
321 | * [ELI5](https://github.com/TeamHG-Memex/eli5), Python package
322 | * ["'Why Should I Trust You?': Explaining the Predictions of Any Classifier"](https://arxiv.org/abs/1602.04938), Marco Tulio Ribeiro, Sameer Singh, and Carlos Guestrin
323 | * [lime](https://github.com/marcotcr/lime), Python package
324 | * ["Feature Selection with the R Package MXM: Statistically-Equivalent Feature Subsets"](https://arxiv.org/pdf/1611.03227.pdf)
325 | * [bounceR](https://github.com/STATWORX/bounceR), R package
326 |
327 | * ["I Just Ran Two Million Regressions"](http://www.ecostat.unical.it/Aiello/Didattica/economia_Crescita/CRESCITA/CRESCITA_Sala-i-Martin-AER-1997.pdf), Xavier Sala-i-Martin, 1997, American Economic Review
328 | * [Extreme_bounds_analysis](https://en.wikipedia.org/wiki/Extreme_bounds_analysis)
329 | * ["ExtremeBounds: Extreme Bounds Analysis in R"](https://cran.r-project.org/web/packages/ExtremeBounds/vignettes/ExtremeBounds.pdf)
330 |
331 | * [Introduction to vimp](https://cran.r-project.org/web/packages/vimp/vignettes/introduction_to_vimp.html), Brian D. Williamson, 2018-06-19
332 |
333 |
334 |
335 | ## Regularization, e.g. Lasso/Ridge Regression
336 | * https://en.wikipedia.org/wiki/Lasso_(statistics)
337 | * ["Regression shrinkage and selection via the lasso"](http://statweb.stanford.edu/~tibs/lasso/lasso.pdf), Tibshirani, R., 1996, J. Royal. Statist. Soc B., Vol. 58, No. 1, pages 267-288)
338 | * ["Glmnet Vignette"](https://cran.r-project.org/web/packages/glmnet/vignettes/glmnet_beta.pdf), Trevor Hastie and Junyang Qian, September 13, 2016
339 | * (ISLR) "6 Linear Model Selection and Regularization"
340 | * (ESL) "3 Linear Methods for Regression, 3.4 Shrinkage Methods"
341 |
342 |
343 |
344 | ```{r}
345 | set.seed(123)
346 | glmnet1 <- glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% as.matrix(),
347 | y=as.factor(BreastCancer_onehot_train$y),
348 | family="binomial"
349 | )
350 | plot(glmnet1)
351 |
352 | glmnet1_cv <- cv.glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
353 | y=as.factor(BreastCancer_onehot_train$y),
354 | family="binomial",
355 | nfolds=5)
356 |
357 | glmnet1_cv$lambda.1se # largest lambda whose CV error is within one standard error of the minimum, i.e. the most parsimonious acceptable model
358 |
359 | plot(glmnet1_cv)
360 |
361 | glmnet_lambda.1se_betas <- coef(glmnet1_cv, s="lambda.1se") %>% as.matrix() %>% as.data.frame() %>%
362 |   rename(beta='1') %>%
363 |   rownames_to_column() %>% arrange(desc(beta))
364 |
365 | # Of the 44 features, only 14 keep nonzero coefficients,
366 | # and the surviving coefficients are relatively small
368 |
369 | #Design a single model around that optimal lambda
370 | glmnet_lambda.1se <- glmnet(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
371 | y=BreastCancer_onehot_train$y,
372 | family="binomial",
373 | lambda=glmnet1_cv$lambda.1se
374 | )
375 |
376 |
377 | # cross-validate the model at that lambda to estimate out-of-sample accuracy (the held-out test set stays untouched)
378 | glmnet_lambda.1se_cv <- train(x=BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
379 | y=as.factor(paste0('Outcome',BreastCancer_train$y)),
380 | method = "glmnet",
381 | trControl = cctrl1,
382 | metric = "ROC",
383 | tuneGrid = expand.grid(alpha = 1,lambda = glmnet1_cv$lambda.1se))
384 |
385 | # Area under the curve is almost perfect now, despite using only 14 of the 44 features
386 | print(glmnet_lambda.1se_cv$results$ROC) #0.99
387 |
388 | p_load(plotROC)
389 | out_of_sample_predictions2 <- data.frame(y_hat=glmnet_lambda.1se_cv$pred$Outcome1,
390 | y=BreastCancer_train$y[glmnet_lambda.1se_cv$pred$rowIndex],
391 | model="Lasso")
392 | basicplot <- ggplot(bind_rows(out_of_sample_predictions,
393 | out_of_sample_predictions2),
394 | aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
395 | style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
396 | basicplot
397 |
398 | ```
399 |
400 |
401 | ## Linear Expansions and Interaction Terms
402 |
403 | We can put the same feature into a linear model multiple times, as polynomial terms, to capture nonlinear relationships.
404 |
405 | ```{r}
406 | set.seed(123)
407 | library(dplyr)
408 | df <- data.frame(x=seq(0,100)) %>%
409 | mutate(y=0+x+x^2+x^3) %>%
410 | mutate(pred_lm = lm(y~x )$fitted.values) %>%
411 | mutate(pred_lm_quad = lm(y~x+I(x^2))$fitted.values)
412 |
413 | library(ggplot2)
414 | ggplot(df, aes(x,y)) +
415 | geom_point( aes(x,y)) +
416 | geom_line(aes(x=x,y=pred_lm), col='red') +
417 | geom_line(aes(x=x,y=pred_lm_quad), col='blue')
418 |
419 | ```
420 |
421 | ## Interaction Terms
422 | * [Interaction_(statistics)](https://en.wikipedia.org/wiki/Interaction_(statistics))
423 | * ["How Much Should We Trust Estimates from Multiplicative Interaction Models? Simple Tools to Improve Empirical Practice,"](http://yiqingxu.org/papers/english/2018_HMX_interaction/main.pdf), Jens Hainmueller Jonathan Mummolo Yiqing Xu,, April 20, 2018, Political Analysis
424 | * ["Exploring interactions with continuous predictors in regression models"](https://cran.r-project.org/web/packages/jtools/vignettes/interactions.html), Jacob Long, 2018-05-07
425 |
426 | Nonlinear Models
427 | * (ISLR) "Chapter 7 Moving Beyond Linearity"
428 | * [Linear_separability](https://en.wikipedia.org/wiki/Linear_separability)
429 |
430 |
431 | ```{r}
432 | set.seed(123)
433 | form <- ~ .^2 # every feature plus every pairwise interaction
434 |
435 | # Build the two-way design matrices, leaving the outcome y out of the expansion
436 | BreastCancer_onehot_train_twoway <- model.matrix(form, data = BreastCancer_onehot_train %>% select(-y))
437 | BreastCancer_onehot_test_twoway <- model.matrix(form, data = BreastCancer_onehot_test %>% select(-y))
438 |
439 | dim(BreastCancer_onehot_train_twoway) # 991 terms
440 |
441 | glmnet_twoway <- glmnet(x=BreastCancer_onehot_train_twoway,
442 |                 y=as.factor(BreastCancer_onehot_train$y),
443 |                 family="binomial"
444 |                 )
447 | plot(glmnet_twoway)
448 |
449 |
450 |
451 | glmnet_twoway_cv <- cv.glmnet(x=BreastCancer_onehot_train_twoway,
452 | y=as.factor(BreastCancer_onehot_train$y),
453 | family="binomial",
454 | nfolds=5)
455 |
456 | glmnet_twoway_cv$lambda.1se # largest lambda whose CV error is within one standard error of the minimum
457 |
458 | plot(glmnet_twoway_cv)
459 |
460 | glmnet_twoway_lambda.1se_betas <- coef(glmnet_twoway_cv, s="lambda.1se") %>% as.matrix() %>% as.data.frame() %>%
461 |   rename(beta='1') %>%
462 |   rownames_to_column() %>% arrange(desc(beta))
463 |
464 |
465 |
466 | # Fit a single model on the two-way design matrix at that optimal lambda
467 | glmnet_twoway_lambda.1se <- glmnet(x=BreastCancer_onehot_train_twoway,
468 |                 y=BreastCancer_onehot_train$y,
469 |                 family="binomial",
470 |                 lambda=glmnet_twoway_cv$lambda.1se
471 |                 )
472 |
473 |
474 | # cross-validate the model at that lambda to estimate out-of-sample accuracy
475 | glmnet_twoway_lambda.1se_cv <- train(x=BreastCancer_onehot_train_twoway,
476 |                    y=as.factor(paste0('Outcome',BreastCancer_train$y)),
477 |                    method = "glmnet",
478 |                    trControl = cctrl1,
479 |                    metric = "ROC",
480 |                    tuneGrid = expand.grid(alpha = 1,lambda = glmnet_twoway_cv$lambda.1se))
481 |
482 | # Area under the curve is again almost perfect, this time using only a handful of the 991 candidate terms
483 | print(glmnet_twoway_lambda.1se_cv$results$ROC) #0.991
484 |
485 | p_load(plotROC)
486 | out_of_sample_predictions3 <- data.frame(y_hat=glmnet_twoway_lambda.1se_cv$pred$Outcome1,
487 | y=BreastCancer_train$y[glmnet_twoway_lambda.1se_cv$pred$rowIndex],
488 | model="Lasso Interactions")
489 | basicplot <- ggplot(bind_rows(out_of_sample_predictions,
490 | out_of_sample_predictions2,
491 | out_of_sample_predictions3),
492 | aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
493 | style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
494 | basicplot
495 |
496 | ```
497 |
498 | Interpreting the model: some measures unambiguously look bad for cancer outcomes, while certain interactions are good news:
499 |
500 | * Bare.nuclei_8:Normal.nucleoli_2
501 | * Bare.nuclei_1:Mitoses_1
502 | * Bare.nuclei_7:Normal.nucleoli_8
503 | * Normal.nucleoli_1:Mitoses_1
504 | * Bare.nuclei_1:Normal.nucleoli_1
505 |
506 | Bare.nuclei_1 by itself looks like good news, but in combination with certain other features it is especially reassuring.
508 |
509 | ```{r}
510 | # There are 991 candidate terms
511 | # By a miracle, again only 14 are chosen
512 | # Some of the coefficients are relatively small
513 |
514 | glmnet_twoway_cv_betas <- coef(glmnet_twoway_cv,s="lambda.1se") %>%
515 | as.matrix() %>% as.data.frame() %>%
516 | rename(beta='1') %>%
517 | rownames_to_column() %>% arrange(desc(beta) )
518 | glmnet_twoway_cv_betas %>% filter(beta!=0)
519 |
520 |
521 | ```
522 |
523 |
524 | # Decision Trees
525 | * https://en.wikipedia.org/wiki/Decision_tree
526 | * ["Tree-Based Models"](https://www.statmethods.net/advstats/cart.html)
527 | * (ISLR) "8 Tree-Based Methods"
528 | * (IntroMachineLearningWithR) "5.5 Random forest"
529 | * [“Induction of Decision Trees.”](https://link.springer.com/content/pdf/10.1007/BF00116251.pdf), Quinlan, Ross. 1986., Machine Learning 1(1):81–106.
530 |
531 | ```{r, fig.width=12, fig.height=8}
532 | set.seed(123)
533 | p_load(party)
534 | single_decision_tree <- ctree(formula, data = BreastCancer_train)
535 | plot(single_decision_tree)
536 |
537 | ```
538 |
539 | Out of sample the single tree does slightly worse, but it is arguably an easier model to interpret.
542 |
543 | ```{r}
544 | set.seed(123)
545 | single_decision_tree_cv_model <- train(x=BreastCancer_train[,-c(10)],
546 | y=as.factor(paste0('Outcome',BreastCancer_train$y)),
547 | method = "ctree",
548 | trControl = cctrl1,
549 | metric = "ROC",
550 | tuneGrid = expand.grid(mincriterion = 0.99)
551 | )
552 |
553 | print(single_decision_tree_cv_model$results$ROC) #0.9668608
554 |
555 |
556 |
557 | p_load(plotROC)
558 | out_of_sample_predictions4 <- data.frame(y_hat=single_decision_tree_cv_model$pred$Outcome1,
559 | y=BreastCancer_train$y[single_decision_tree_cv_model$pred$rowIndex],
560 | model="Tree")
561 | basicplot <- ggplot(bind_rows(out_of_sample_predictions,
562 | out_of_sample_predictions2,
563 | out_of_sample_predictions3,
564 | out_of_sample_predictions4),
565 | aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
566 | style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
567 | basicplot
568 |
569 |
570 |
571 | ```
572 |
573 |
574 | # Overfitting
575 |
576 | ## Bootstrapping Observations
577 |
578 | * [Bootstrap_aggregating](https://en.wikipedia.org/wiki/Bootstrap_aggregating)
579 | * [Cross-validation_(statistics)](https://en.wikipedia.org/wiki/Cross-validation_(statistics))
580 | * ["Linear Model Selection by Cross-Validation,"](http://www.libpls.net/publication/MCCV_Shao_1993.pdf), Jun Shao, 1993
581 | * ["Cross-validation failure: small sample sizes lead to large error bars,"](https://hal.inria.fr/hal-01545002/), Gaël Varoquaux, 2017
582 | * (ESL) "7 Model Assessment and Selection"
583 | * (ISLR) "Chapter 5 Resampling Methods"
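
A minimal by-hand sketch of bootstrap aggregating, reusing objects defined above: resample rows with replacement, refit, and average the predictions:

```{r, eval=F}
set.seed(123)
B <- 25 # number of bootstrap replicates
boot_preds <- replicate(B, {
  idx <- sample(nrow(BreastCancer_onehot_train), replace = TRUE) # resample rows
  fit <- glm(formula_onehot, data = BreastCancer_onehot_train[idx, ],
             family = binomial)
  predict(fit, newdata = BreastCancer_onehot_test, type = "response")
})
y_hat_bagged <- rowMeans(boot_preds) # bagged probability per test observation
```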
584 |
585 | ## Model Complexity/Parsimony
586 | * AIC (Akaike 1973)
587 | * [Akaike information criterion (AIC)](https://en.wikipedia.org/wiki/Akaike_information_criterion)
588 | * BIC (Schwarz 1978)
589 | * [Bayesian information criterion (BIC)](https://en.wikipedia.org/wiki/Bayesian_information_criterion)
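
Both criteria are easy to read off any fitted `glm`; a quick check on the probit model from above (each rewards fit and penalizes the number of parameters, BIC more harshly at this sample size):

```{r}
# AIC = -2*logLik + 2*k; BIC = -2*logLik + log(n)*k
AIC(glm1)
BIC(glm1)
```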
590 |
591 |
592 | # Curse of dimensionality
593 |
594 | ## Feature Bagging/Subspace Methods
595 | * [Random subspace method](https://en.wikipedia.org/wiki/Random_subspace_method)
596 | * [“Bagging Predictors.”](https://link.springer.com/content/pdf/10.1007/BF00058655.pdf),Breiman, Leo. 1996. , Machine Learning 24:123–140.
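
A minimal sketch of the random-subspace idea using objects defined above: each tree in a forest sees only a random subset of roughly sqrt(p) features (randomForest exposes this as `mtry`):

```{r}
# draw one random feature subset of size ~sqrt(p), as a single tree would
p <- ncol(BreastCancer_onehot_train) - 1 # predictors, excluding y
sample(setdiff(colnames(BreastCancer_onehot_train), "y"), size = floor(sqrt(p)))
```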
597 |
598 | # Random Forests
599 | * https://en.wikipedia.org/wiki/Random_forest
600 | * ["RANDOM FORESTS"](https://www.stat.berkeley.edu/~breiman/randomforest2001.pdf) Leo Breiman, January 2001
601 | * ["Exploratory Data Analysis using Random Forests"](http://zmjones.com/static/papers/rfss_manuscript.pdf)
602 |
603 |
604 | ```{r}
605 | set.seed(123)
606 | #install.packages('randomForest', dependencies=T)
607 |
608 | p_load(randomForest)
609 |
610 | forest <- randomForest(formula,
611 | data = BreastCancer_train,
612 | localImp = TRUE,
613 | na.action=na.omit)
614 | print(forest)
615 | ```
616 |
617 | ```{r}
618 |
619 | set.seed(123)
620 | p_load(randomForest)
621 |
622 | forest_cv_model <- train(x=BreastCancer_train[,-c(10)],
623 | y=as.factor(paste0('Outcome',BreastCancer_train$y)),
624 | method = "rf",
625 | trControl = cctrl1,
626 | metric = "ROC"
627 | #tuneGrid = expand.grid(alpha = 1,lambda = glmnet1_cv$lambda.1se)
628 | )
629 |
630 | print(forest_cv_model$results)
631 |
632 |
633 | p_load(plotROC)
634 | condition <- forest_cv_model$pred$mtry==5 # keep one mtry setting so each observation contributes one prediction
635 | out_of_sample_predictions5 <- data.frame(y_hat=forest_cv_model$pred$Outcome1[condition] ,
636 | y=BreastCancer_train$y[forest_cv_model$pred$rowIndex[condition]] ,
637 | model="Forest")
638 | basicplot <- ggplot(bind_rows(out_of_sample_predictions,
639 | out_of_sample_predictions2,
640 | out_of_sample_predictions3,
641 | out_of_sample_predictions4,
642 | out_of_sample_predictions5),
643 | aes(d = as.numeric(y), m = y_hat, color=model)) + geom_roc(n.cuts=0) +
644 | style_roc(theme = theme_grey, xlab = "1 - Specificity") + ggtitle("AUC")
645 | basicplot
646 |
647 |
648 | ```
649 |
650 | ## Depth
651 |
652 | [Understanding random forests with randomForestExplainer](https://cran.rstudio.com/web/packages/randomForestExplainer/vignettes/randomForestExplainer.html), Aleksandra Paluszyńska
653 |
654 |
655 | ```{r}
656 | set.seed(123)
657 | #devtools::install_github("MI2DataLab/randomForestExplainer")
658 | p_load(randomForestExplainer)
659 | #install.packages('rlang')
660 |
661 | min_depth_frame <- min_depth_distribution(forest)
662 | save(min_depth_frame, file = "min_depth_frame.rda")
663 | load("min_depth_frame.rda")
664 | head(min_depth_frame, n = 10)
665 |
666 | # plot_min_depth_distribution(forest) # gives the same result as below but takes longer
667 | plot_min_depth_distribution(min_depth_frame)
668 |
669 | ```
670 |
671 | ## Variable Importance
672 | Pay particular attention to "accuracy_decrease": the drop in the classifier's accuracy when that variable is randomly permuted, destroying its information.
673 |
674 | ```{r}
675 | importance_frame <- measure_importance(forest)
676 | importance_frame
677 | ```
678 |
679 | ```{r}
680 | plot_multi_way_importance(importance_frame, size_measure = "no_of_nodes")
681 | ```
682 |
683 | ```{r}
684 | (vars <- important_variables(importance_frame, k = 5, measures = c("mean_min_depth", "no_of_trees")))
685 | interactions_frame <- min_depth_interactions(forest, vars)
686 | head(interactions_frame[order(interactions_frame$occurrences, decreasing = TRUE), ])
687 | ```
688 |
689 | ```{r}
690 | plot_min_depth_interactions(interactions_frame)
691 | ```
692 |
693 | ```{r, eval=F}
694 | plot_predict_interaction(forest, BreastCancer_train[,-c(10)], "Cell.size", "Cl.thickness")
695 | ```
696 |
697 | We can even generate an automated report:
698 | ```{r, eval=F}
699 | explain_forest(forest, interactions = TRUE, data = BreastCancer_train)
700 | ```
701 |
702 | # Compare out of Sample Accuracy
703 |
704 | ```{r}
705 | set.seed(123)
706 | df_predictions <- data.frame(y_true=BreastCancer_test$y,
707 | y_hat_glm=stats::predict.glm(glm1, newdata=BreastCancer_onehot_test, type = "response" ),
708 | y_hat_lasso = predict(glmnet1_cv, newx=BreastCancer_onehot_test %>%
709 | select(-y) %>% data.matrix(), s=c("lambda.1se") ,
710 | type = "response")[,1],
711 |                             y_hat_lasso_twoway = predict(glmnet_twoway_cv,
712 | newx=BreastCancer_onehot_test_twoway %>%
713 | data.matrix(),
714 | s=c("lambda.1se") , type = "response")[,1],
715 | y_hat_single_tree = predict(single_decision_tree, newdata=BreastCancer_test,
716 | type = "prob") %>% sapply(rbind) %>% t() %>%
717 | data.frame() %>% pull(X2),
718 | y_hat_forest = predict(forest, newdata=BreastCancer_test, type = "prob")[,'1']#,
719 | #y_hat_nn = predict(NN, newdata=BreastCancer_test, type = "prob")
720 | )
721 | ```
722 | ```{r}
723 | p_load(MLmetrics)
724 | AUC(df_predictions$y_hat_glm,df_predictions$y_true) %>% round(3)
725 | AUC(df_predictions$y_hat_lasso,df_predictions$y_true) %>% round(3)
726 | AUC(df_predictions$y_hat_lasso_twoway,df_predictions$y_true) %>% round(3)
727 | AUC(df_predictions$y_hat_single_tree,df_predictions$y_true) %>% round(3)
728 | AUC(df_predictions$y_hat_forest,df_predictions$y_true) %>% round(3)
729 |
730 | table(df_predictions$y_hat_lasso>.5,
731 | df_predictions$y_true)
732 |
733 | ```
734 |
735 |
736 |
737 | # Neural Networks
738 | * ["Neural Networks, Manifolds, and Topology"](http://colah.github.io/posts/2014-03-NN-Manifolds-Topology/), Christopher Olah
739 | * (DL) Deep Learning, Ian Goodfellow and Yoshua Bengio and Aaron Courville, 2016, http://www.deeplearningbook.org/
740 | * (PRML) "Chapter 5 Neural Networks"
741 | * [Tensorflow Playground](http://playground.tensorflow.org/#activation=tanh&batchSize=10&dataset=circle®Dataset=reg-plane&learningRate=0.03®ularizationRate=0&noise=0&networkShape=4,2&seed=0.47077&showTestData=false&discretize=false&percTrainData=50&x=true&y=true&xTimesY=false&xSquared=false&ySquared=false&cosX=false&sinX=false&cosY=false&sinY=false&collectStats=false&problem=classification&initZero=false&hideText=false)
742 | * [ConvNetJS Deep Learning in your browser](https://cs.stanford.edu/people/karpathy/convnetjs/)
743 | * [KerasJS](https://transcranial.github.io/keras-js/#/)
744 | * ["Understanding LSTM Networks,"](http://colah.github.io/posts/2015-08-Understanding-LSTMs/), Christopher Olah, August 27, 2015,
745 | * [The Building Blocks of Interpretability](https://distill.pub/2018/building-blocks/), Chris Olah, Arvind Satyanarayan, Ian Johnson, Shan Carter, Ludwig Schubert, Katherine Ye, Alexander Mordvintsev, 2018, Distill
746 | * [Feature Visualization How neural networks build up their understanding of images](https://distill.pub/2017/feature-visualization/), Chris Olah, Alexander Mordvintsev, Ludwig Schubert, Nov. 7, 2017, Distill
747 |
748 | ```{r, fig.width=12, fig.height=8}
749 | p_load("neuralnet")
750 |
751 | formula_onehot_2 = y + y_not ~ Cl.thickness + Cell.size + Cell.shape + Marg.adhesion + Epith.c.size +
752 | Bare.nuclei_1 + Bare.nuclei_10 + Bare.nuclei_2 + Bare.nuclei_4 +
753 | Bare.nuclei_3 + Bare.nuclei_9 + Bare.nuclei_7 + Bare.nuclei_5 +
754 | Bare.nuclei_8 + Bare.nuclei_6 + Bl.cromatin_3 + Bl.cromatin_9 +
755 | Bl.cromatin_1 + Bl.cromatin_2 + Bl.cromatin_4 + Bl.cromatin_5 +
756 | Bl.cromatin_7 + Bl.cromatin_8 + Bl.cromatin_6 + Bl.cromatin_10 +
757 | Normal.nucleoli_1 + Normal.nucleoli_2 + Normal.nucleoli_7 +
758 | Normal.nucleoli_4 + Normal.nucleoli_5 + Normal.nucleoli_3 +
759 | Normal.nucleoli_10 + Normal.nucleoli_6 + Normal.nucleoli_9 +
760 | Normal.nucleoli_8 + Mitoses_1 + Mitoses_5 + Mitoses_4 + Mitoses_2 +
761 | Mitoses_3 + Mitoses_7 + Mitoses_10 + Mitoses_8 + Mitoses_6
762 |
763 | BreastCancer_onehot_train_2 = BreastCancer_onehot_train # neuralnet wants one output column per class
764 | BreastCancer_onehot_train_2$y_not = as.numeric(!as.logical(as.numeric(BreastCancer_onehot_train_2$y)-1)) # y_not = 1 - y (factor codes 1/2 map to 0/1)
765 | BreastCancer_onehot_test_2 = BreastCancer_onehot_test
766 | BreastCancer_onehot_test_2$y_not = as.numeric(!as.logical(as.numeric(BreastCancer_onehot_test_2$y)-1))
767 | table(BreastCancer_onehot_test_2$y_not, BreastCancer_onehot_test_2$y)
768 |
769 | NN = neuralnet(formula_onehot_2,
770 | data= BreastCancer_onehot_train_2 %>% data.matrix(),
771 | hidden = 10 ,
772 | linear.output = F
773 | )
774 |
775 | # plot neural network
776 | plot(NN)
777 |
778 | ```
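
The commented-out `y_hat_nn` line in the comparison chunk above can be filled in along these lines; a hedged sketch using `neuralnet::compute()`, assuming the covariate columns are supplied in the same order the network was trained on:

```{r, eval=F}
# columns must match the training covariates exactly (drop the two outcomes)
covariates_test <- BreastCancer_onehot_test_2 %>%
  select(-y, -y_not) %>%
  data.matrix()
nn_out <- neuralnet::compute(NN, covariates_test)
y_hat_nn <- nn_out$net.result[, 1] # first output unit corresponds to y
```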
779 |
780 |
781 |
782 |
783 |
784 |
785 |
786 | # Unsupervised Learning
787 |
788 | * (ISLR) "Chapter 10 Unsupervised Learning"
789 | * IMLR ["Chapter 4 Unsupervised Learning"](https://lgatto.github.io/IntroMachineLearningWithR/unsupervised-learning.html)
790 |
791 | ## Dimensionality Reduction
792 | [Principal_component_analysis](https://en.wikipedia.org/wiki/Principal_component_analysis)
793 | [Multiple correspondence analysis](https://en.wikipedia.org/wiki/Multiple_correspondence_analysis)
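
A quick PCA sketch on the five numeric cell measurements (assuming, as in the cleaning chunk, that they occupy the first five columns of the training frame); scaling first puts the features on comparable footing:

```{r}
pca <- prcomp(BreastCancer_train[, 1:5], scale. = TRUE)
summary(pca) # proportion of variance explained by each component
plot(pca$x[, 1:2], col = BreastCancer_train$y,
     main = "Training observations on the first two principal components")
```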
794 |
795 | ## Clustering
796 | * [Cluster analysis](https://en.wikipedia.org/wiki/Cluster_analysis)
797 | * [K-means_clustering](https://en.wikipedia.org/wiki/K-means_clustering)
798 | * ["Unsupervised Machine Learning: The hclust, pvclust, cluster, mclust, and more,"](https://quantdev.ssri.psu.edu/sites/qdev/files/Unsupervised_Machine_Learning_The_mclust_Package_and_others.html)
799 |
800 | # Special Topics
801 |
802 | ## Time
803 | * ["Investigating Sequences in Ordinal Data: A New Approach With Adapted Evolutionary Models,"](https://www.cambridge.org/core/journals/political-science-research-and-methods/article/investigating-sequences-in-ordinal-data-a-new-approach-with-adapted-evolutionary-models/F3747D8A1908902BA7F26C5EE28AFAEF),Patrik Lindenfors, Fredrik Jansson, Yi-ting Wang and Staffan I. Lindberg, Christian Lopez, 05 March 2018,
804 |
805 | ## Text
806 | * ["Text Mining with R: A Tidy Approach,"](https://www.tidytextmining.com/), Julia Silge and David Robinson, 2018-04-02,
807 | * ["Introducing Monte Carlo Methods with R,"](https://www.slideshare.net/xianblog/introducing-monte-carlo-methods-with-r)
808 | * ["Text as Data,"](http://web.stanford.edu/~gentzkow/research/text-as-data.pdf), Matthew Gentzkow, Bryan T. Kelly, Matt Taddy
809 |
810 | ## Images
811 | * https://keras.rstudio.com/articles/examples/cifar10_cnn.html
812 |
813 | # Examples
814 | * ["Examining Explanations for Nuclear Proliferation"](https://doi.org/10.1093/isq/sqv007), Mark S. Bell, International Studies Quarterly, Volume 60, Issue 3, 1 September 2016, Pages 520–529
815 |
816 | # Extras
817 |
818 | ## Gradient Boosting
819 | * ["How to explain gradient boosting"](http://explained.ai/gradient-boosting/index.html), Terence Parr and Jeremy Howard
820 | * [XGBoost eXtreme Gradient Boosting](https://github.com/dmlc/xgboost)
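
A hedged gradient-boosting sketch with xgboost on the one-hot matrices from above; `nrounds` and `eta` here are illustrative defaults, not tuned values:

```{r, eval=F}
p_load(xgboost)
xgb <- xgboost(data = BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
               label = as.numeric(as.character(BreastCancer_onehot_train$y)), # 0/1 labels
               nrounds = 100, eta = 0.1,
               objective = "binary:logistic", verbose = 0)
y_hat_xgb <- predict(xgb, BreastCancer_onehot_test %>% select(-y) %>% data.matrix())
```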
821 |
822 | ## SVM
823 | * [Support_vector_machine](https://en.wikipedia.org/wiki/Support_vector_machine)
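
A hedged SVM sketch with e1071, reusing the formula and training split from above; the radial kernel and default cost are illustrative, not tuned:

```{r, eval=F}
p_load(e1071)
svm_fit <- svm(formula, data = BreastCancer_train, probability = TRUE)
svm_pred <- predict(svm_fit, newdata = BreastCancer_test) # predicted classes
table(predicted = svm_pred, truth = BreastCancer_test$y)
```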
824 |
825 |
826 | ## Nearest Neighbor
827 | * [K-nearest_neighbors_algorithm](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm)
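
A hedged k-nearest-neighbor sketch with the class package on the one-hot matrices; k = 5 is an arbitrary illustrative choice:

```{r, eval=F}
p_load(class)
knn_pred <- knn(train = BreastCancer_onehot_train %>% select(-y) %>% data.matrix(),
                test  = BreastCancer_onehot_test %>% select(-y) %>% data.matrix(),
                cl    = BreastCancer_onehot_train$y,
                k     = 5)
table(predicted = knn_pred, truth = BreastCancer_onehot_test$y)
```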
828 |
829 |
830 | ## How the Sausage is Made
831 | * ["Troubling Trends in Machine Learning Scholarship"](https://www.dropbox.com/s/ao7c090p8bg1hk3/Lipton%20and%20Steinhardt%20-%20Troubling%20Trends%20in%20Machine%20Learning%20Scholarship.pdf?dl=0), Zachary C. Lipton∗& Jacob Steinhardt, July 9, 2018
832 |
833 |
834 |
--------------------------------------------------------------------------------
/docs/Douglass_IntroductionToMachineLearning_2018_Syllabus_Day2b.rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Introduction to Machine Learning (Syllabus/Code for Day 2b): Solutions for Learning in Unsupervised Settings"
3 | output:
4 | html_notebook:
5 | toc: true # table of content true
6 | toc_depth: 3 # upto three depths of headings (specified by #, ## and ###)
7 | number_sections: true ## if you want number sections at each table header
8 | highlight: tango # specifies the syntax highlighting style
9 | toc_float: true
10 | ---
11 |
12 |
13 | ```{css}
14 |
15 | pre code, pre, code {
16 | white-space: pre !important;
17 | overflow-x: !scroll !important;
18 | word-break: keep-all !important;
19 | word-wrap: initial !important;
20 | }
21 |
22 | code.r{
23 | overflow-x: !scroll !important;
24 | }
25 |
26 | ```
27 |
28 |
29 | # Wisconsin Breast Cancer Dataset
30 |
31 | BreastCancer Dataset
32 | A data frame with 699 observations on 11 variables, one being a character variable, 9 being ordered or nominal, and 1 target class.
33 |
34 | 1. Sample code number: id number
35 | 2. Clump Thickness: 1 - 10
36 | 3. Uniformity of Cell Size: 1 - 10
37 | 4. Uniformity of Cell Shape: 1 - 10
38 | 5. Marginal Adhesion: 1 - 10
39 | 6. Single Epithelial Cell Size: 1 - 10
40 | 7. Bare Nuclei: 1 - 10
41 | 8. Bland Chromatin: 1 - 10
42 | 9. Normal Nucleoli: 1 - 10
43 | 10. Mitoses: 1 - 10
44 | 11. Class: (benign, malignant)
45 |
46 |
47 | [Breast Cancer Wisconsin (Original) Data Set ](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original))
48 |
49 | ["Multisurface method of pattern separation for medical diagnosis applied to breast cytology."](http://www.pnas.org/content/pnas/87/23/9193.full.pdf), Wolberg,W.H., Mangasarian,O.L. (1990). In Proceedings of the National Academy of Sciences, 87, 9193-9196.
50 |
51 | Zhang,J. (1992). Selecting typical instances in instance-based learning. In Proceedings of the Ninth International Machine Learning Conference (pp. 470-479). Aberdeen, Scotland: Morgan Kaufmann.
52 |
53 |
54 | ## Cleaning and documentation
55 |
56 | ```{r}
57 | p_load(mlbench, dplyr); data(BreastCancer) # the BreastCancer data frame ships with mlbench; glimpse() is from dplyr
58 | glimpse(BreastCancer)
59 | summary(BreastCancer$Class)
60 |
61 | BreastCancer$y <- as.factor(as.numeric(BreastCancer$Class=="malignant"))
62 | BreastCancer$Class <- NULL
63 | BreastCancer$Id <- NULL
64 |
65 | BreastCancer[,1:5] <- lapply(BreastCancer[,1:5] , as.numeric)
66 | summary(BreastCancer)
67 | ```
68 |
69 |
70 | #
71 |
72 | [Understanding Civil War Violence through Military Intelligence: Mining Civilian Targeting Records from the Vietnam War](https://arxiv.org/pdf/1506.05413.pdf), Rex W. Douglass, 2016, Chapter in C.A. Anderton and J. Brauer, eds., Economic Aspects of Genocides, Mass Atrocities, and Their Prevention. New York: Oxford University Press, 2016
73 |
74 | ["How “Free” is Free Riding in civil wars?: Violence, insurgency, and the collective action problem."](http://repositorio-digital.cide.edu/bitstream/handle/11651/1431/153367.pdf?sequence=1;How), Kalyvas, Stathis N., and Matthew Adam Kocher. World politics 59, no. 2 (2007): 177-216]
75 |
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/docs/min_depth_frame.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CenterForPeaceAndSecurityStudies/IntroductiontoMachineLearning/ea94951f58823d4a940417e9b669d3f03ec0f7a7/docs/min_depth_frame.rda
--------------------------------------------------------------------------------
/revdep/.gitignore:
--------------------------------------------------------------------------------
1 | checks
2 | library
3 | checks.noindex
4 | library.noindex
5 | data.sqlite
6 | *.html
7 |
--------------------------------------------------------------------------------
/revdep/email.yml:
--------------------------------------------------------------------------------
1 | release_date: ???
2 | rel_release_date: ???
3 | my_news_url: ???
4 | release_version: ???
5 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.so
3 | *.dll
4 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(IntroductiontoMachineLearning)
3 |
4 | test_check("IntroductiontoMachineLearning")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-my-test.R:
--------------------------------------------------------------------------------
1 | context("test-my-test")
2 |
3 | test_that("multiplication works", {
4 | expect_equal(2 * 2, 4)
5 | })
6 |
--------------------------------------------------------------------------------