├── .gitignore
├── Intro2R.Rmd
├── Intro2R.Rproj
├── LICENSE
├── README.md
├── all_code.Rmd
├── ensembles.Rmd
├── heights.txt
├── make_samples.R
├── massive_data.Rmd
├── notes
│   ├── .gitignore
│   ├── Intro2MachineLearning.bib
│   ├── Intro2R.txss
│   ├── appendices.tex
│   ├── art
│   │   ├── avoid-overfitting.png
│   │   ├── bias_variance.png
│   │   ├── censored.pdf
│   │   ├── imputing.pdf
│   │   ├── irrelevant-features-hurt-knn-clustering.png
│   │   ├── irrelevant-features.png
│   │   ├── non-linear-basis-functions.png
│   │   ├── som_simulation.png
│   │   ├── support-vector-machine-15-728.jpg
│   │   ├── uncensored.pdf
│   │   └── why-complex-models-can-turn-out-to-be-less-probable.png
│   ├── collaborative.tex
│   ├── commands.tex
│   ├── estimation.tex
│   ├── graphics.Rmd
│   ├── introduction.tex
│   ├── notes.loa
│   ├── notes.loe
│   ├── notes.pdf
│   ├── notes.tex
│   ├── statistical_decision.tex
│   ├── supervised.tex
│   └── unsupervised.tex
├── project.Rmd
├── sample_questions.Rmd
├── sample_questions.pdf
├── self_practice.Rmd
├── supervised.Rmd
└── unsupervised.Rmd
/.gitignore:
--------------------------------------------------------------------------------
1 | # History files
2 | .Rhistory
3 | # Example code in package build process
4 | *-Ex.R
5 | # R data files from past sessions
6 | .Rdata
7 | # RStudio files
8 | .Rproj.user/
9 | .Rproj.user
10 | notes.Rmd
11 |
12 | # LaTeX files
13 | *.aux
14 | *.glo
15 | *.idx
16 | *.log
17 | *.toc
18 | *.ist
19 | *.acn
20 | *.acr
21 | *.alg
22 | *.bbl
23 | *.blg
24 | *.dvi
25 | *.glg
26 | *.gls
27 | *.ilg
28 | *.ind
29 | *.lof
30 | *.lot
31 | *.maf
32 | *.mtc
33 | *.mtc1
34 | *.out
35 | *.synctex.gz
36 |
37 | # Questions
38 | questions.*
39 | AmitClass
40 |
41 | # Other
42 | test_*
43 | 2010*
44 | Questions
45 | sample_questions_cache/
46 | sample_questions_files/
47 | Exam
48 |
--------------------------------------------------------------------------------
/Intro2R.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Intro2R"
3 | author: "Jonathan Rosenblatt"
4 | date: "March 18, 2015"
5 | output: html_document
6 |
7 | ---
8 | # R Basics
9 |
10 | Tips for this introduction:
11 | - If you are working alone, consider starting with "An Introduction to R" here:
12 | http://cran.r-project.org/manuals.html
13 | - Make sure you use RStudio.
14 | - Ctrl+return to run lines from editor.
15 | - alt+shift+k for RStudio keyboard shortcuts.
16 | - Ctrl+alt+j to navigate between sections
17 | - tab for auto-completion
18 | - Ctrl+1 to skip to editor.
19 | - Ctrl+2 to skip to console.
20 | - Ctrl+8 to skip to the environment list.
21 | - Folding:
22 | - alt+l collapse chunk.
23 | - alt+shift+l unfold chunk.
24 | - alt+o collapse all.
25 | - alt+shift+o unfold all.
26 |
27 |
28 |
29 | ## Simple calculator
30 | ```{r example}
31 | 10+5
32 | 70*81
33 | 2**4
34 | 2^4
35 | log(10)
36 | log(16, 2)
37 | log(1000, 10)
38 | ```
39 |
40 |
41 | ## Probability calculator
42 | Wish you had known this back in your Intro to Probability class?
43 | ```{r}
44 | dbinom(x=3, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns P(X=3)
45 | dbinom(3, 10, 0.5)
46 |
47 | pbinom(q=3, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns P(X<=3)
48 | dbinom(x=0, size=10, prob=0.5)+dbinom(x=1, size=10, prob=0.5)+dbinom(x=2, size=10, prob=0.5)+dbinom(x=3, size=10, prob=0.5) # Same as previous
49 |
50 | qbinom(p=0.1718, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns the smallest k such that P(X<=k) >= 0.1718
51 |
52 | rbinom(n=1, size=10, prob=0.5)
53 | rbinom(n=10, size=10, prob=0.5)
54 | rbinom(n=100, size=10, prob=0.5)
55 | ```
56 |
57 |
58 | ## Getting help
59 | Get help for a particular function.
60 | ```{r, eval=FALSE}
61 | ?dbinom
62 | help(dbinom)
63 | ```
64 |
65 | Search local help files for a particular string.
66 | ```{r, eval=FALSE}
67 | ??binomial
68 | help.search('dbinom')
69 | ```
70 |
71 | Load a menu with several important manuals:
72 | ```{r, eval=FALSE}
73 | help.start()
74 | ```
75 |
76 |
77 | ## Variable assignment
78 | Assignments into a variable named "x":
79 | ```{r}
80 | x = rbinom(n=1000, size=10, prob=0.5) # Works. Bad style.
81 | x <- rbinom(n=1000, size=10, prob=0.5) # Assignment into a variable named "x". Preferred style.
82 | ```
83 | More on style: http://adv-r.had.co.nz/Style.html
84 |
85 |
86 | Print contents:
87 | ```{r}
88 | x
89 | print(x)
90 | (x <- rbinom(n=1000, size=10, prob=0.5)) # Assign and print.
91 | ```
92 |
93 |
94 | Operate on the object
95 | ```{r}
96 | mean(x)
97 | var(x)
98 | hist(x)
99 | rm(x) # remove variable
100 | ```
101 |
102 |
103 | For more information on distributions see http://cran.r-project.org/web/views/Distributions.html
104 |
105 |
106 | ## Piping for better style and readability
107 | ```{r}
108 | # install.packages('magrittr')
109 | library(magrittr)
110 | ```
111 |
112 | ```{r}
113 | x <- rbinom(n=1000, size=10, prob=0.5)
114 |
115 | x %>% var() # Instead of var(x)
116 | x %>% hist() # Instead of hist(x)
117 | x %>% mean() %>% round(2) %>% add(10)
118 | ```
119 |
120 | This example clearly demonstrates the benefits (from http://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html)
121 | ```{r}
122 | # Functional (onion) style
123 | car_data <-
124 | transform(aggregate(. ~ cyl,
125 | data = subset(mtcars, hp > 100),
126 | FUN = function(x) round(mean(x, 2))),
127 | kpl = mpg*0.4251)
128 |
129 |
130 | # magrittr style
131 | car_data <-
132 | mtcars %>%
133 | subset(hp > 100) %>%
134 | aggregate(. ~ cyl, data = ., FUN = . %>% mean %>% round(2)) %>%
135 | transform(kpl = mpg %>% multiply_by(0.4251)) %>%
136 | print
137 | ```
138 |
139 |
140 | ## Vector creation and manipulation
141 | ```{r}
142 | c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)
143 | 10:21
144 | seq(from=10, to=21, by=1)
145 | x <- seq(from=10, to=21, by=2)
146 | x <- c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21)
147 | x
148 | ```
149 |
150 |
151 |
152 | You can assign AFTER the computation is finished:
153 | ```{r}
154 | c(1,2,3)
155 | y<- .Last.value
156 | y
157 | ```
158 |
159 |
160 | Operations usually work element-wise:
161 | ```{r}
162 | x+2
163 | x*2
164 | x^2
165 | sqrt(x)
166 | log(x)
167 | ```
168 |
169 |
170 | ## Simple plotting
171 | ```{r}
172 | x<- 1:100; y<- 3+sin(x) # Create arbitrary data
173 | plot(x = x, y = y) # x,y syntax
174 | plot(y ~ x) # y~x syntax (I like better)
175 | ```
176 |
177 | Control plot appearance:
178 | ```{r}
179 | plot(y~x, type='l', main='Plotting a connected line')
180 | plot(y~x, type='h', main='Sticks plot', xlab='Insert x axis label', ylab='Insert y axis label')
181 | plot(y~x, pch=5)
182 | plot(y~x, pch=10, type='p', col='blue', cex=4)
183 | abline(3, 0.002)
184 | ```
185 |
186 | Available plotting options
187 | ```{r, eval=FALSE}
188 | example(plot)
189 | example(points)
190 | ?plot
191 | help(package='graphics')
192 | ```
193 |
194 | When your plotting gets serious, move to `ggplot2` and `ggvis` as soon as possible.
195 |
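A one-line taste of what that looks like (a minimal sketch; `ggplot2` is demonstrated more fully in the Fancy Plotting section below):
```{r, eval=FALSE}
# install.packages('ggplot2')
library(ggplot2)
qplot(x, y) # same scatter plot as above, ggplot2-style
```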
196 |
197 |
198 |
199 | ___
200 |
201 |
202 | ## Data frame Manipulation
203 | `data.frame`s are similar to matrices, but allow the binding of vectors of several classes (all of the same length).
204 | ```{r}
205 | x<- 1:100; y<- 3 + sin(x)
206 | class(x) # R (high) level representation of an object.
207 |
208 | # mode(x)
209 | # typeof(x)
210 | ```
211 |
212 |
213 | Create and check out your first data frame:
214 | ```{r}
215 | frame1 <- data.frame(x=x, sin=y)
216 | frame1
217 | head(frame1)
218 | frame1 %>% head() # just print the beginning
219 | frame1 %>% View() # Excel-like view (never edit!)
220 |
221 | class(frame1) # the object is of type data.frame
222 | dim(frame1)
223 | dim(x)
224 | length(frame1)
225 | length(x)
226 |
227 | str(frame1) # the inner structure of an object
228 | attributes(frame1) # get the object's meta data
229 | ```
230 |
231 | ### Extraction
232 | Single element:
233 | ```{r}
234 | frame1[1, 2]
235 | frame1[2, 1]
236 | ```
237 |
238 | Extract _column_ by index:
239 | ```{r}
240 | frame1[, 1]
241 | frame1[,1] %>% t
242 | frame1[,1] %>% t %>% dim
243 | ```
244 |
245 | Extract column by name:
246 | ```{r}
247 | names(frame1)
248 | frame1[, 'sin']
249 | dim(frame1[, 'sin']) # extract as a vector. no dim attribute.
250 | frame1['sin']
251 | dim(frame1['sin']) # extract as a data.frame. has dim attribute.
252 | frame1[,1:2] %>% class
253 | frame1[2] %>% class
254 | frame1[2, ] # extract a row
255 |
256 | frame1$sin %>% class
257 | ```
258 |
259 | `subset()` does the same
260 | ```{r}
261 | subset(frame1, select=sin)
262 | subset(frame1, select=2)
263 | subset(frame1, select= c(2,0))
264 | ```
265 |
266 |
267 | Sanity-conservation notice!
268 | Always think about whether you want to extract a vector or a data frame:
269 | - Note the difference between `[]` and `[[]]` extraction!
270 | - Note the difference between `frame[,1]` and `frame[1]`.
271 | ```{r}
272 | a <- frame1[1]
273 | b <- frame1[[1]]
274 | a==b # Seems identical. But not really:
275 | class(a)
276 | class(b)
277 | # Causes different behaviour:
278 | a[1]
279 | b[1]
280 | ```
281 |
282 | More about extraction: http://adv-r.had.co.nz/Subsetting.html
283 |
284 | ### dplyr package
285 | `dplyr` makes the manipulation of data.frames a breeze.
286 | It is very fast, and straightforward to use.
287 |
288 | Install the package:
289 | ```{r}
290 | # install.packages('dplyr')
291 | ```
292 |
293 | The following examples are taken from:
294 | https://github.com/justmarkham/dplyr-tutorial/blob/master/dplyr-tutorial.Rmd
295 | ```{r}
296 | # install.packages('nycflights13')
297 | library(nycflights13)
298 | dim(flights)
299 | View(flights)
300 | names(flights)
301 | class(flights) # a tbl_df is an extension of the data.frame class
302 | library(dplyr) # calling dplyr
303 |
304 | filter(flights, month == 1, day == 1) #dplyr style
305 | flights[flights$month == 1 & flights$day == 1, ] # old style
306 | flights %>% filter(month == 1, day == 1) # dplyr with magrittr style (yes!)
307 |
308 | filter(flights, month == 1 | month == 2)
309 | slice(flights, 1:10) # selects rows
310 |
311 | arrange(flights, year, month, day) # sort
312 | arrange(flights, desc(arr_delay)) # sort descending
313 |
314 | select(flights, year, month, day) # select columns
315 | select(flights, year:day) # select column range
316 | select(flights, -(year:day)) # drop columns
317 | rename(flights, tail_num = tailnum) # rename variables
318 | # add a new computed column
319 | mutate(flights,
320 | gain = arr_delay - dep_delay,
321 | speed = distance / air_time * 60)
322 | # you can refer to columns just created!
323 | mutate(flights,
324 | gain = arr_delay - dep_delay,
325 | gain_per_hour = gain / (air_time / 60)
326 | )
327 | # keep only new variables
328 | transmute(flights,
329 | gain = arr_delay - dep_delay,
330 | gain_per_hour = gain / (air_time / 60)
331 | )
332 | # simple statistics
333 | summarise(flights,
334 | delay = mean(dep_delay, na.rm = TRUE)
335 | )
336 |
337 | sample_n(flights, 10) # random subsample
338 | sample_frac(flights, 0.01) # random subsample
339 | ```
340 |
341 | Subgroup operations
342 | ```{r}
343 | by_tailnum <- group_by(flights, tailnum)
344 | by_tailnum %>% class # a grouping object
345 | delay <- summarise(by_tailnum,
346 | count = n(),
347 | avg.dist = mean(distance, na.rm = TRUE),
348 | avg.delay = mean(arr_delay, na.rm = TRUE))
349 | delay <- filter(delay, count > 20, avg.dist < 2000)
350 | View(delay)
351 |
352 | destinations <- group_by(flights, dest)
353 | summarise(destinations,
354 | planes = n_distinct(tailnum),
355 | flights = n()
356 | )
357 |
358 | # Grouping works in a hierarchy. summarise() peels off the outermost layer.
359 | daily <- group_by(flights, year, month, day)
360 | (per_day <- summarise(daily, flights = n()))
361 | (per_month <- summarise(per_day, flights = sum(flights)))
362 | (per_year <- summarise(per_month, flights = sum(flights)))
363 | ```
364 |
365 |
366 |
367 |
368 |
369 | Two table operations
370 | ```{r}
371 | airlines %>% View
372 | flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)
373 |
374 | flights2 %>% left_join(airlines) # join on left table with automatic matching.
375 |
376 | flights2 %>% left_join(weather)
377 |
378 | flights2 %>% left_join(planes, by = "tailnum") # with named matching
379 |
380 | flights2 %>% left_join(airports, c("dest" = "faa"))
381 |
382 | flights2 %>% left_join(airports, c("origin" = "faa"))
383 | ```
384 |
385 | Types of join
386 | ```{r}
387 | (df1 <- data_frame(x = c(1, 2), y = 2:1))
388 | (df2 <- data_frame(x = c(1, 3), a = 10, b = "a"))
389 |
390 | df1 %>% inner_join(df2) # SELECT * FROM x JOIN y ON x.a = y.a
391 |
392 | df1 %>% left_join(df2) # SELECT * FROM x LEFT JOIN y ON x.a = y.a
393 |
394 | df1 %>% right_join(df2) # SELECT * FROM x RIGHT JOIN y ON x.a = y.a
395 | df2 %>% left_join(df1)
396 |
397 | df1 %>% full_join(df2) # SELECT * FROM x FULL JOIN y ON x.a = y.a
398 |
399 | # return only unmatched cases
400 | flights %>%
401 | anti_join(planes, by = "tailnum") %>%
402 | count(tailnum, sort = TRUE)
403 | # SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a)
404 |
405 | df1 %>% semi_join(df2, by = "x") # SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a)
406 | ```
407 |
408 | Set operations
409 | ```{r}
410 | (df1 <- data_frame(x = 1:2, y = c(1L, 1L)))
411 | (df2 <- data_frame(x = 1:2, y = 1:2))
412 |
413 | intersect(df1, df2) # SELECT * FROM x INTERSECT SELECT * FROM y
414 |
415 | union(df1, df2) # SELECT * FROM x UNION SELECT * FROM y
416 |
417 | setdiff(df1, df2) # SELECT * FROM x EXCEPT SELECT * FROM y
418 |
419 | setdiff(df2, df1)
420 | ```
421 |
422 | Leaving dplyr for now...
423 |
424 |
425 | ## Data Import and export
426 |
427 | __Note__: The [readr](https://github.com/hadley/readr) package facilitates and accelerates data importing. This section should be updated to use it.
428 |
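A minimal sketch of what a `readr`-based import could look like (assuming the `readr` package is installed; its `write_csv()`/`read_csv()` functions replace the base ones below):
```{r, eval=FALSE}
# install.packages('readr')
library(readr)
tmp <- tempfile(fileext = '.csv')
write_csv(airquality, tmp)   # fast csv export
my.data <- read_csv(tmp)     # fast csv import; column types are guessed and reported
file.remove(tmp)
```
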
429 | For a complete review see:
430 | http://cran.r-project.org/doc/manuals/R-data.html
431 | also in help.start() -> "Import and Export Manual"
432 |
433 |
434 | ### Import from the web
435 | `read.table()` is the main importing workhorse.
436 | ```{r}
437 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data'
438 | tirgul1 <- read.table(URL)
439 | ```
440 |
441 | Always look at the imported result!
442 | ```{r}
443 | View(tirgul1)
444 | # hmmm... header interpreted as data. Fix with header=TRUE:
445 | tirgul1 <- read.table(URL, header = TRUE)
446 | View(tirgul1)
447 | ```
448 |
449 | ### Import .csv files
450 | Let's write a simple file so that we have something to import:
451 | ```{r}
452 | View(airquality) # examine the data to export
453 | (temp.file.name <- tempfile()) # get an arbitrary file name
454 | write.csv(x = airquality, file = temp.file.name) #export
455 | ```
456 |
457 | Now let's import:
458 | ```{r}
459 | # my.data<- read.csv(file='/home/jonathan/Projects/...')
460 | my.data<- read.csv(file=temp.file.name)
461 | View(my.data)
462 | ```
463 |
464 | __Note__: Under MS Windows(R) you might need '\\' instead of '/' in file paths.
465 |
466 | ### Import .txt files
467 | `read.table()` tries to guess the separator:
468 | ```{r, eval=FALSE}
469 | my.data<- read.table(file='C:\\Documents and Settings\\Jonathan\\My Documents\\...') #
470 | ```
471 | `read.delim()` specifies the separator explicitly:
472 | ```{r, eval=FALSE}
473 | my.data<- read.delim(file='C:\\Documents and Settings\\Jonathan\\My Documents\\...')
474 | ```
475 | If you care about your sanity, see ?read.table before starting imports.
476 |
477 | ### Writing Data to files
478 |
479 | Get and set the current directory:
480 | ```{r, eval=FALSE}
481 | getwd() #What is the working directory?
482 | setwd('/tmp') # Set the working directory (example path; use one that exists on your system)
483 | ```
484 |
485 | ```{r}
486 | write.csv(x=tirgul1, file='/tmp/tirgul1.csv') #
487 | ```
488 |
489 | See ?write.table for details.
490 |
491 | ### .XLS(X) files
492 | It is strongly recommended to convert .xls(x) files to .csv.
493 | If you still insist, see:
494 | http://cran.r-project.org/doc/manuals/R-data.html#Reading-Excel-spreadsheets
495 |
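If you nevertheless need to read a spreadsheet directly, one option (an assumption on my part, not covered in the manual above; the file name below is hypothetical) is the `readxl` package:
```{r, eval=FALSE}
# install.packages('readxl')
library(readxl)
my.data <- read_excel('my_spreadsheet.xlsx', sheet = 1)
```
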
496 | ### Massive files
497 | Massive data is better stored in matrices than in data.frames.
498 | `scan()` is faster than `read.table()` but less convenient:
499 |
500 | Create the example data:
501 | ```{r}
502 | cols<- 1e3
503 | # Note: On Windows you might need to change /tmp/A.txt to /temp/A.txt
504 | rnorm(cols^2) %>%
505 | matrix(ncol=cols) %>%
506 | write.table(file='/tmp/A.txt', col.names= F, row.names= F)
507 | # Measure speed of import:
508 | system.time(A<- read.table('/tmp/A.txt', header=F))
509 | system.time(A <- scan(file='/tmp/A.txt', n = cols^2) %>%
510 | matrix(ncol=cols, byrow = TRUE))
511 |
512 | file.remove('/tmp/A.txt')
513 | ```
514 |
515 | This matter will be revisited in the last class.
516 |
517 | ### Databases
518 | Start [here](https://rforanalytics.wordpress.com/useful-links-for-r/odbc-databases-for-r/)
519 |
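A minimal sketch of the database workflow, using `DBI` with an in-memory SQLite database (assuming the `DBI` and `RSQLite` packages are installed):
```{r, eval=FALSE}
# install.packages(c('DBI', 'RSQLite'))
library(DBI)
con <- dbConnect(RSQLite::SQLite(), ':memory:') # open a connection
dbWriteTable(con, 'airquality', airquality)     # copy a data.frame into the database
dbGetQuery(con, 'SELECT Ozone, Temp FROM airquality WHERE Temp > 90')
dbDisconnect(con)
```
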
520 | ### Hands-on example (from the web)
521 | ```{r}
522 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data'
523 | tirgul1 <- read.table(URL, header = TRUE)
524 |
525 | names(tirgul1)
526 | tirgul1 %>% head
527 | tirgul1 %>% tail
528 | View(tirgul1)
529 | dim(tirgul1)
530 | length(tirgul1)
531 | ```
532 |
533 | R can be object oriented (read about S3 and S4 if interested).
534 | See how `summary()` behaves differently on different object classes:
535 | ```{r}
536 | class(tirgul1[, 1]); class(tirgul1[, 2]); class(tirgul1[, 3]); class(tirgul1[, 4])
537 | summary(tirgul1)
538 | ```
539 |
540 |
541 |
542 | Matrices are more efficient than data frames, but can store only a single class of vector.
543 | ```{r}
544 | tirgul.matrix <- as.matrix(tirgul1)
545 | tirgul.matrix
546 | class(tirgul.matrix)
547 | # notice everything has been cast to the most general class.
548 | class(tirgul.matrix[, 1]); class(tirgul.matrix[, 2]); class(tirgul.matrix[, 3]); class(tirgul.matrix[, 4])
549 | summary(tirgul.matrix)
550 | ```
551 |
552 | Note: if copy-pasting an expression bothers you (as it should!), here are some solutions:
553 | ```{r}
554 | # The apply family of functions:
555 | apply(tirgul.matrix, 2, class) # class of each column
556 |
557 | # looping
558 | for(j in 1:ncol(tirgul.matrix)) print(class(tirgul.matrix[,j]))
559 | ```
560 |
561 | Make sure you read `?sapply`.
562 | LISP fans might also like to read `?Map`.
563 |
564 |
565 |
566 | Operations _within_ data objects:
567 | ```{r}
568 | plot(tirgul1$gender)
569 | tirgul1$gender %>% plot() #
570 | with(tirgul1, plot(gender) ) # Same operation. Different syntax.
571 |
572 | mean(tirgul1$age)
573 | tirgul1$age %>% mean() #
574 | with(tirgul1, mean(age) ) # Same operation. Different syntax.
575 | ```
576 |
577 |
578 | ```{r}
579 | tirgul1$age <- tirgul1$age * 365
580 | tirgul1<- transform(tirgul1, age=age*365 ) #Age in days
581 | with(tirgul1, mean(age) )
582 | tirgul1<- transform(tirgul1, age=age/365 ) #Does this revert back to years?
583 | with(tirgul1, mean(age) )
584 | ```
585 |
586 | Then again, many of these functions are replaced by friendlier functions in the `dplyr` package (see the dplyr section above).
587 |
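For instance, a minimal dplyr-style sketch of the same age transformation (using functions already shown in the dplyr section):
```{r, eval=FALSE}
library(dplyr)
tirgul1 %>%
  mutate(age = age * 365) %>%  # age in days
  summarise(mean(age))
```
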
588 |
589 | ## Sorting
590 | ```{r}
591 | (x<- c(20, 11, 13, 23, 7, 4))
592 | (y<- sort(x))
593 | (ord<- order(x))
594 | x[ord] # Extracting along the order is the same as sorting.
595 | ranks<- rank(x)
596 | identical(y[ranks] , x) # Compares two objects
597 |
598 | (z<- c('b','a','c','d','e','z'))
599 | xz<- data.frame(x,z)
600 | # sort(xz) # Errors: sort() is not defined for data.frames
601 | xz[ord,] # Sorting a data frame using one column
602 | ```
603 |
604 |
605 | ## Looping
606 | For a crash course in R programming (not only data analysis) try:
607 | http://adv-r.had.co.nz/
608 | The usual for(), while(), repeat()
609 | ```{r}
610 | for (i in 1:100){
611 | print(i)
612 | }
613 | ```
614 |
615 |
616 | ```{r}
617 | for (helloeveryone in seq(10, 100, by=2) ){
618 | print(helloeveryone)
619 | }
620 | ```
621 |
622 |
623 | ## Recursion
624 | Typically very slow in R, due to function-call overhead and memory management issues.
625 |
626 | ```{r}
627 | fib<-function(n) {
628 | if (n < 2) fn<-1
629 | else fn<-Recall(n - 1) + Recall(n - 2)
630 | return(fn)
631 | }
632 | fib(30)
633 | ```
634 |
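To see just how slow, compare against a simple loop (a minimal sketch; `fib.loop` is a name made up for this example):
```{r, eval=FALSE}
fib.loop <- function(n){
  if (n < 2) return(1)
  f <- c(1, 1)                               # f[i] holds fib(i-1)
  for (i in 3:(n+1)) f[i] <- f[i-1] + f[i-2]
  f[n+1]
}
system.time(fib(30))      # recursive: noticeably slow
system.time(fib.loop(30)) # loop: essentially instantaneous
```
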
635 |
636 | ## Finding your objects
637 | ```{r}
638 | ls() #Lists all available objects
639 | ls(pattern='x')
640 |
641 | ls(pattern='[0-9]') # Search using regular expressions
642 | ls(pattern='[A-Z]')
643 | ```
644 |
645 | Ctrl+8 in RStudio.
646 |
647 |
648 |
649 |
650 | # Univariate Exploratory Statistics
651 |
652 |
653 | ## Exploring Categorical Variables
654 | ```{r}
655 | gender <- c(rep('Boy', 10), rep('Girl', 12))
656 | drink <- c(rep('Coke', 5), rep('Sprite', 3), rep('Coffee', 6), rep('Tea', 7), rep('Water', 1))
657 | class(gender);class(drink)
658 |
659 | cbind(gender, drink)
660 | table1 <- table(gender, drink)
661 | table1
662 | ```
663 |
664 |
665 |
666 |
667 | ## Exploring Continuous Variables
668 |
669 | Generating and exploring data
670 | ```{r}
671 | sample1 <- rnorm(100)
672 | table(sample1)
673 | hist(sample1, freq=T, main='Counts')
674 | hist(sample1, freq=F, main='Frequencies')
675 | lines(density(sample1))
676 | rug(sample1)
677 | ```
678 |
679 |
680 | ## The Boxplot
681 | ```{r}
682 | boxplot(sample1)
683 | ```
684 |
685 |
686 |
687 | Several different visualizations:
688 | ```{r}
689 | sample2<-rnorm(1000)
690 | stem(sample2)
691 | hist(sample2)
692 | plot(density(sample2))
693 | rug(sample2)
694 | ```
695 |
696 |
697 |
698 | True data
699 | ```{r}
700 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data'
701 | bone <- read.table(URL, header = TRUE)
702 | names(bone)
703 | summary(bone)
704 | stripchart(bone['age'])
705 | stem(bone[, 'age'])
706 | hist(bone[, 'age'], prob=T)
707 | lines(density(bone[, 'age']))
708 | with(bone, rug(age))
709 |
710 | ind<-bone[, 'gender']=='male'
711 |
712 | boxplot(bone$age~bone$gender)
713 | ```
714 |
715 |
716 | ## Graphical parameters
717 | ```{r}
718 | attach(bone)
719 | stripchart(age)
720 | stripchart(age~gender)
721 | stripchart(age~gender, v=T)
722 |
723 | boxplot(age~gender)
724 | boxplot(age~gender, horizontal=T, col=c('pink','lightblue') )
725 | title(main='Amazing Boxplots!')
726 | title(sub="Well actually.. I've seen better Boxplots")
727 |
728 | plot(density(age), main='')
729 | plot(density(age), main='', type='h')
730 | plot(density(age), main='', type='o')
731 | plot(density(age), main='', type='p')
732 | plot(density(age), main='', type='l')
733 |
734 | ?plot.default
735 |
736 | plot(density(age),main='')
737 | rug(age)
738 | boxplot(age, add=T, horizontal=T, at=0.02, boxwex=0.05, col='grey')
739 | title(expression(alpha==f[i] (beta)))
740 | example(plotmath)
741 |
742 | par(mfrow=c(2,1))
743 | (males<- gender=='male')
744 | plot(density(age[males]), main='Male') ; rug(age[males])
745 | plot(density(age[!males]), main='Female') ; rug(age[!males])
746 |
747 | range(age)
748 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males])
749 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males])
750 | par(mfrow=c(1,2))
751 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males])
752 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males])
753 |
754 | par(mfrow=c(1,1),ask=T)
755 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males])
756 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males])
757 | ```
758 |
759 |
760 | ## Integer data
761 | Integer data will almost certainly produce overlaps if plotted. Either add jitter, or treat it as discrete.
762 | ```{r}
763 | r.age<-round(age)
764 | plot(density(r.age))
765 | rug(r.age)
766 | plot(density(r.age, from=9))
767 | rug(jitter(r.age))
768 | hist(r.age)
769 | rug(jitter(r.age))
770 | ```
771 |
772 |
773 | ## Plotting
774 |
775 | ### Preparing data for plotting
776 | 2D data can be in either _wide_ or _long_ format.
777 | Most R functions are designed for long formats.
778 | Let's start by trying to plot in the wide format.
779 | Notice each dosage is plotted separately (yes, I could have looped).
780 | ```{r}
781 | wide.data<-data.frame(id=1:4, age=c(40,50,60,50), dose1=c(1,2,1,2),dose2=c(2,1,2,1), dose4=c(3,3,3,3))
782 | wide.data
783 |
784 | plot(dose1~age, data=wide.data, ylim=range(c(dose1,dose2,dose4)), ylab='')
785 | points(dose2~age, data=wide.data, pch=2)
786 | points(dose4~age, data=wide.data, pch=3)
787 | ```
788 |
789 |
790 | Plotting in long format is much easier.
791 | I will first convert the data manually.
792 | ```{r}
793 | (dose.type<-c(
794 | rep('dose1', length(wide.data$dose1)),
795 | rep('dose2', length(wide.data$dose2)),
796 | rep('dose4', length(wide.data$dose4))))
797 | (dose<- c(wide.data$dose1,wide.data$dose2,wide.data$dose4))
798 | (long.id<- rep(wide.data$id,3))
799 | (long.age<- rep(wide.data$age,3))
800 |
801 | long.data <- data.frame(long.id, long.age, dose.type, dose)
802 | View(long.data)
803 |
804 | plot(dose~long.age, data=long.data, pch=as.numeric(dose.type))
805 | ```
806 | I will now try to avoid this manual reshaping.
807 |
808 | ### Reshaping data with the `tidyr` package
809 |
810 | This is the package I recommend if you cannot reshape manually.
811 | Example from [here](http://blog.rstudio.org/2014/07/22/introducing-tidyr/)
812 | ```{r}
813 | library(tidyr)
814 | library(dplyr)
815 |
816 | # Data in wide format:
817 | messy <- data.frame(
818 | name = c("Wilbur", "Petunia", "Gregory"),
819 | a = c(67, 80, 64),
820 | b = c(56, 90, 50)
821 | )
822 | messy
823 |
824 | # Convert to long format:
825 | messy %>% gather(drug, heartrate, a:b)
826 | ```
827 |
828 | ```{r}
829 | # Another example- from wide to long:
830 | set.seed(10)
831 | messy <- data.frame(
832 | id = 1:4,
833 | trt = sample(rep(c('control', 'treatment'), each = 2)),
834 | work.T1 = runif(4),
835 | home.T1 = runif(4),
836 | work.T2 = runif(4),
837 | home.T2 = runif(4)
838 | )
839 | messy %>% head
840 | tidier <- messy %>% gather(key, time, -id, -trt)
841 | tidier %>% head(8)
842 |
843 | # Split the compound key column into two columns
844 | tidy <- tidier %>%
845 | separate(key, into = c("location", "time"), sep = "\\.")
846 | tidy %>% head(8)
847 | ```
848 |
849 | ### Fancy Plotting
850 | ```{r}
851 | library(ggplot2)
852 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data'
853 | bone <- read.table(URL, header = TRUE)
854 | qplot(spnbmd, data=bone)
855 | qplot(x=gender, y=spnbmd, data=bone, geom='boxplot')
856 | qplot(spnbmd, data=bone, geom='histogram')+ facet_wrap(~gender)
857 | qplot(spnbmd, data=bone, geom='density')+ facet_wrap(~gender)
858 | qplot(spnbmd, data=bone)+ geom_density(col='red', size=1)+ facet_wrap(~gender)
859 | qplot(spnbmd, data=bone, fill=gender, geom='density', alpha=1)
860 | ```
861 |
862 | Diamonds example (Taken from Wickham's web site: http://had.co.nz/stat405/)
863 | ```{r}
864 | ?diamonds
865 | dim(diamonds)
866 | head(diamonds)
867 | ```
868 |
869 | ```{r}
870 | qplot(carat, data = diamonds)
871 | qplot(carat, data = diamonds, binwidth = 1)
872 | qplot(carat, data = diamonds, binwidth = 0.1)
873 | qplot(carat, data = diamonds, binwidth = 0.01)
874 | resolution(diamonds$carat)
875 | last_plot() + xlim(0, 3)
876 |
877 | qplot(depth, data = diamonds, binwidth = 0.2)
878 | qplot(depth, data = diamonds, binwidth = 0.2,fill = cut) + xlim(55, 70)
879 | qplot(depth, data = diamonds, binwidth = 0.562) +xlim(55, 70) + facet_wrap(~ cut)
880 |
881 | qplot(table, price, data = diamonds)
882 | qplot(table, price, data = diamonds, geom = "boxplot")
883 | qplot(table, price, data = diamonds, geom="boxplot",group = round(table))
884 |
885 | qplot(carat, price, data = diamonds)
886 | qplot(carat, price, data = diamonds, alpha = I(1/10))
887 |
888 | qplot(carat, price, data = diamonds, geom = "bin2d", main='Count Heatmap')
889 | qplot(carat, price, data = diamonds, geom = "hex")
890 | qplot(carat, price, data = diamonds) + geom_smooth()
891 | ```
892 |
893 |
894 | For more information on ggplot2 see http://had.co.nz/ggplot2
895 |
896 |
897 | ## The QQ plot
898 | A simple and efficient tool to compare between distributions.
899 | ```{r}
900 | mystery.2<-function(y) {
901 | n<-length(y)
902 | y<-sort(y)
903 | i<-1:n
904 | q<-(i-0.5)/n
905 | x<-qnorm(q, mean(y), sqrt(var(y)))
906 | plot(y~x, xlab='Theoretical Quantiles', ylab='Empirical Quantiles')
907 | }
908 |
909 | normals.1<-rnorm(100, 0, 1); hist(normals.1)
910 | mystery.2(normals.1); abline(0, 1)
911 |
912 | normals.2<-rnorm(100, 0, 10); hist(normals.2)
913 | mystery.2(normals.2); abline(0, 1)
914 |
915 | ## No need to write the function every time...
916 | qqnorm(normals.1)
917 | qqnorm(normals.2)
918 |
919 | ## How would non-normal observations look? ##
920 | non.normals.1<-runif(100); hist(non.normals.1)
921 | mystery.2(non.normals.1); abline(0, 1)
922 |
923 | non.normals.2<-rexp(100, 1); hist(non.normals.2)
924 | mystery.2(non.normals.2); abline(0, 1)
925 |
926 | non.normals.3<-rgeom(100, 0.5); hist(non.normals.3)
927 | mystery.2(non.normals.3); abline(0, 1)
928 |
929 | ## Adapting for a non-normal distribution: ##
930 | qq.uniform<-function(y) {
931 | n<-length(y); y<-sort(y); i<-1:n; q<-(i-0.5)/n
932 | x<-qunif(q, min=min(y), max=max(y)) # each distribution will require its own parameters!
933 | plot(y~x, xlab='Theoretical Quantiles', ylab='Empirical Quantiles')
934 | }
935 | qq.uniform(non.normals.1);abline(0, 1)
936 | qq.uniform(non.normals.2);abline(0, 1)
937 | qq.uniform(normals.2);abline(0, 1)
938 | ```
939 |
940 |
941 |
942 |
943 |
944 | # Multiple data vectors
945 | We now leave the single-vector world and move to the analysis of dependencies between several vectors.
946 |
947 | ## Scatter plots
948 | ```{r}
949 | # Sine function
950 | x<-seq(-pi, pi, 0.01)
951 | y<-sin(x)
952 | plot(y~x)
953 |
954 | #Exponent function
955 | x<-seq(-pi, pi, 0.01)
956 | y<-exp(x)
957 | plot(y~x)
958 |
959 | # Sinc function
960 | x<-seq(-10*pi, 10*pi, 0.01)
961 | y<-sin(x)/x
962 | plot(y~x)
963 |
964 | # Fancy function
965 | x<-seq(-pi, pi, 0.01)
966 | y<-sin(exp(x))+cos(2*x)
967 | plot(y~x)
968 | plot(y~x, type='l')
969 | plot(y~x, type='o')
970 |
971 | ## Some real life data
972 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/ozone.data'
973 | ozone <- read.table(URL, header=T)
974 | names(ozone)
975 | plot(ozone)
976 | ```
977 |
978 |
979 | ## 3D plotting
980 | ```{r}
981 | # install.packages('rgl')
982 | library(rgl)
983 | plot3d(ozone[, 1:3])
984 | ```
985 |
986 |
987 | ## Plotting a surface
988 | ```{r}
989 | x <- seq(0, 1, 0.01)
990 | y <- seq(0, 1, 0.01)
991 | xy.grid <- expand.grid(x, y)
992 | func1 <- function(mesh) exp(mesh[, 1]+mesh[, 2])
993 | z <- func1(xy.grid)
994 | xyz <- data.frame(xy.grid, z)
995 | plot3d(xyz, xlab='x', ylab='y')
996 | ```
997 |
998 |
999 | ## Fitting linear lines and surfaces
1000 | We will now try and fit linear surfaces to our data.
1001 |
1002 | ### Well behaved data
1003 | ```{r}
1004 | x <- 1:100
1005 | a <- 2
1006 | b <- 3.5
1007 | sigma <- 10
1008 | y <- a+b*x+rnorm(100, 0, sigma)
1009 | plot(y~x)
1010 | ```
1011 |
1012 | ### Ordinary Least Squares
1013 | ```{r}
1014 | ols.line<-function(x, y){
1015 | sxy<-sum( (x-mean(x) ) * (y-mean(y) ) )
1016 | sxx<-sum( (x-mean(x)) ^ 2 )
1017 | b1<-sxy / sxx
1018 | a1<-mean(y) - b1 * mean(x)
1019 | return(list(slope=b1, intercept=a1))
1020 | }
1021 |
1022 | ols<-ols.line(x, y) ; ols
1023 | abline(ols$intercept, ols$slope, lty=2, lwd=3)
1024 | predictions <- ols$intercept + ols$slope * x
1025 | residuals<- y - predictions
1026 | plot(residuals) ; abline(h=0)
1027 | ```
1028 |
1029 | ### Dangers of Extrapolation
1030 | ```{r}
1031 | x<-runif(1000)*5
1032 | y<-exp(x)+rnorm(1000)
1033 | plot(y~x, main='Whole relation')
1034 |
1035 | rect(xleft=0, ybottom=-5, xright=2, ytop=10)
1036 |
1037 | plot(y~x, main='Local relation', cex=0.5, xlim=c(0, 2), ylim=c(-5, 10));abline(v=2, lty=3)
1038 |
1039 | ind<-x<=2;ind
1040 | ols.interpolating<-ols.line(x[ind], y[ind]);ols.interpolating
1041 | abline(ols.interpolating$intercept , ols.interpolating$slope, col='red')
1042 | text(x=0.5, y=6, labels='Interpolates Nicely', cex=2)
1043 |
1044 | plot(y~x, main='Whole relation')
1045 | abline(ols.interpolating$intercept , ols.interpolating$slope, col='red')
1046 | abline(v=2, lty=3)
1047 | text(x=2, y=121, labels='Extrapolates Terribly!', cex=2)
1048 |
1049 | # Non-linearity might be fixed with a transformation:
1050 | # Which of the following looks better (more linear)?
1051 | plot(y~exp(x))
1052 | plot(log(y)~x)
1053 | plot(log(y)~log(x))
1054 | ```
1055 |
1056 | ### Multivariate linear regression
1057 | ```{r}
1058 | # install.packages('rgl')
1059 | library(rgl)
1060 |
1061 | xy.grid <- data.frame(x1=runif(10000), x2=runif(10000))
1062 |
1063 | func1<-function(mesh, a0, a1, a2, sigma) {
1064 | n<-nrow(mesh)
1065 | a0 + a1 * mesh[, 1] + a2 * mesh[, 2] + rnorm(n, 0, sigma)
1066 | }
1067 |
1068 | # More noise hides the structure in the data:
1069 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .0); z; xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2')
1070 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .4); xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2')
1071 | z<-func1(xy.grid, a0=5, a1=1, a2=3, 11); xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2')
1072 |
1073 | ```
1074 |
1075 | `lm()` is the main workhorse for OLS; it computes $(X'X)^{-1} X'y$ via the QR decomposition.
1076 | ```{r}
1077 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .4)
1078 | xyz=data.frame(xy.grid, z)
1079 | plot3d(xyz, xlab='x1', ylab='x2')
1080 | lm(z~., xyz) # Did we recover the correct coefficients?
1081 | ```
1082 |
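We can also check `lm()` against the closed-form formula directly (a minimal sketch, reusing the objects created in the chunk above):
```{r, eval=FALSE}
X <- cbind(1, as.matrix(xy.grid))   # design matrix with an intercept column
solve(t(X) %*% X, t(X) %*% z)       # (X'X)^{-1} X'y computed directly
coef(lm(z ~ ., xyz))                # should agree up to numerical error
```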
1083 |
1084 |
1085 | # Date handling
1086 | See the `lubridate` package and manual [here](http://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html).
1087 |
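A minimal sketch of `lubridate` basics (assuming the package is installed):
```{r, eval=FALSE}
# install.packages('lubridate')
library(lubridate)
d <- ymd('2015-03-18')    # parse a date from a string
d + days(30)              # date arithmetic
month(d)                  # extract the month
wday(d, label = TRUE)     # extract the day of the week
```
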
1088 |
1089 | # String handling
1090 | ```{r}
1091 | print("Hello\n") # Wrong!
1092 | show("Hello\n") # Wrong!
1093 | cat("Hello\n") # Right!
1094 |
1095 | # Windows directories need double escapes:
1096 | print("C:\\Program Files\\")
1097 | cat("C:\\Program Files\\", sep="\n")
1098 |
1099 | # String concatenation:
1100 | paste("Hello", "World", "!")
1101 | paste("Hello", "World", "!", sep="")
1102 | paste("Hello", " World", "!", sep="")
1103 |
1104 | x <- 5
1105 | paste("x=", x)
1106 | paste("x=", x, sep="")
1107 |
1108 | cat("x=", x, "\n") #Too many spaces :-(
1109 | cat("x=", x, "\n", sep="")
1110 |
1111 | # Collapsing strings:
1112 | s <- c("Hello", " ", "World", "!")
1113 | paste(s)
1114 | paste(s, sep="")
1115 | paste(s, collapse="")
1116 | paste(s, collapse=" 1")
1117 |
1118 |
1119 | s <- c("Hello", "World!")
1120 | paste(1:3, "Hello World!")
1121 | paste(1:3, "Hello World!", sep=":")
1122 | paste(1:3, "Hello World!", sep=":", collapse="\n")
1123 | cat(paste(1:3, "Hello World!", sep=":", collapse="\n"), "\n") # cat() does not collapse :-(
1124 |
1125 |
1126 | # Substrings:
1127 | s <- "Hello World"
1128 | substring(s, start=4, stop=6)
1129 |
1130 | # Splits:
1131 | s <- "foo, bar, baz"
1132 | strsplit(s, ", ")
1133 |
1134 | s <- "foo-->bar-->baz"
1135 | strsplit(s, "-->")
1136 |
1137 | # Using regular expressions (see ?regexp):
1138 | s <- "foo, bar, baz"
1139 | strsplit(s, ", *")
1140 | strsplit(s, "")
1141 |
1142 | # Looking in *vectors* of strings:
1143 | (s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse=""))
1144 |
1145 | grep("O", s) # Returns location
1146 | grep("O", s, value=T) # Returns value
1147 |
1148 |
1149 | regexpr(pattern="o", text="Hello")
1150 | regexpr(pattern="o", text=c("Hello", "World!"))
1151 |
1152 | s <- c("Hello", "World!")
1153 | regexpr("o", s)
1154 | s <- c("Helll ooo", "Wrld!")
1155 | regexpr("o", s)
1156 |
1157 | # Fuzzy (approximate) matches:
1158 | grep ("abc", c("abbc", "jdfja", "cba")) # No match :-(
1159 | agrep ("abc", c("abbc", "jdfja", "cba")) # Match! :-)
1160 |
1161 | ## Note: agrep() is the function used in help.search()
1162 | s <- "foo bar baz"
1163 | gsub(pattern=" ", replacement="", s) # Remove all the spaces
1164 | s <- "foo bar baz"
1165 | gsub(" ", " ", s)
1166 | gsub(" +", "", s) # Using regular expression
1167 | gsub(" +", " ", s) # Remove multiple spaces and replace them by single spaces
1168 |
1169 | s <- "foo bar baz"
1170 | sub(pattern=" ", replacement="", s) # sub() only replaces the first occurrence.
1171 | gsub(" ", " ", s)
1172 | ```
1173 |
1174 |
1175 | If you use strings often, try the `stringr` package.
1176 |
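A minimal sketch of some `stringr` equivalents (assuming the package is installed):
```{r, eval=FALSE}
# install.packages('stringr')
library(stringr)
s <- c("Hello", "World!")
str_detect(s, "o")                           # like grepl()
str_replace_all("foo  bar  baz", " +", " ")  # like gsub()
str_split("foo, bar, baz", ", ")             # like strsplit()
str_sub("Hello World", 4, 6)                 # like substring()
```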
1177 |
1178 |
1179 |
1180 |
1181 |
--------------------------------------------------------------------------------
/Intro2R.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: ASCII
11 |
12 | RnwWeave: knitr
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 2, June 1991
3 |
4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | Preamble
10 |
11 | The licenses for most software are designed to take away your
12 | freedom to share and change it. By contrast, the GNU General Public
13 | License is intended to guarantee your freedom to share and change free
14 | software--to make sure the software is free for all its users. This
15 | General Public License applies to most of the Free Software
16 | Foundation's software and to any other program whose authors commit to
17 | using it. (Some other Free Software Foundation software is covered by
18 | the GNU Lesser General Public License instead.) You can apply it to
19 | your programs, too.
20 |
21 | When we speak of free software, we are referring to freedom, not
22 | price. Our General Public Licenses are designed to make sure that you
23 | have the freedom to distribute copies of free software (and charge for
24 | this service if you wish), that you receive source code or can get it
25 | if you want it, that you can change the software or use pieces of it
26 | in new free programs; and that you know you can do these things.
27 |
28 | To protect your rights, we need to make restrictions that forbid
29 | anyone to deny you these rights or to ask you to surrender the rights.
30 | These restrictions translate to certain responsibilities for you if you
31 | distribute copies of the software, or if you modify it.
32 |
33 | For example, if you distribute copies of such a program, whether
34 | gratis or for a fee, you must give the recipients all the rights that
35 | you have. You must make sure that they, too, receive or can get the
36 | source code. And you must show them these terms so they know their
37 | rights.
38 |
39 | We protect your rights with two steps: (1) copyright the software, and
40 | (2) offer you this license which gives you legal permission to copy,
41 | distribute and/or modify the software.
42 |
43 | Also, for each author's protection and ours, we want to make certain
44 | that everyone understands that there is no warranty for this free
45 | software. If the software is modified by someone else and passed on, we
46 | want its recipients to know that what they have is not the original, so
47 | that any problems introduced by others will not reflect on the original
48 | authors' reputations.
49 |
50 | Finally, any free program is threatened constantly by software
51 | patents. We wish to avoid the danger that redistributors of a free
52 | program will individually obtain patent licenses, in effect making the
53 | program proprietary. To prevent this, we have made it clear that any
54 | patent must be licensed for everyone's free use or not licensed at all.
55 |
56 | The precise terms and conditions for copying, distribution and
57 | modification follow.
58 |
59 | GNU GENERAL PUBLIC LICENSE
60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
61 |
62 | 0. This License applies to any program or other work which contains
63 | a notice placed by the copyright holder saying it may be distributed
64 | under the terms of this General Public License. The "Program", below,
65 | refers to any such program or work, and a "work based on the Program"
66 | means either the Program or any derivative work under copyright law:
67 | that is to say, a work containing the Program or a portion of it,
68 | either verbatim or with modifications and/or translated into another
69 | language. (Hereinafter, translation is included without limitation in
70 | the term "modification".) Each licensee is addressed as "you".
71 |
72 | Activities other than copying, distribution and modification are not
73 | covered by this License; they are outside its scope. The act of
74 | running the Program is not restricted, and the output from the Program
75 | is covered only if its contents constitute a work based on the
76 | Program (independent of having been made by running the Program).
77 | Whether that is true depends on what the Program does.
78 |
79 | 1. You may copy and distribute verbatim copies of the Program's
80 | source code as you receive it, in any medium, provided that you
81 | conspicuously and appropriately publish on each copy an appropriate
82 | copyright notice and disclaimer of warranty; keep intact all the
83 | notices that refer to this License and to the absence of any warranty;
84 | and give any other recipients of the Program a copy of this License
85 | along with the Program.
86 |
87 | You may charge a fee for the physical act of transferring a copy, and
88 | you may at your option offer warranty protection in exchange for a fee.
89 |
90 | 2. You may modify your copy or copies of the Program or any portion
91 | of it, thus forming a work based on the Program, and copy and
92 | distribute such modifications or work under the terms of Section 1
93 | above, provided that you also meet all of these conditions:
94 |
95 | a) You must cause the modified files to carry prominent notices
96 | stating that you changed the files and the date of any change.
97 |
98 | b) You must cause any work that you distribute or publish, that in
99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 |
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License. (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 |
114 | These requirements apply to the modified work as a whole. If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works. But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 |
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 |
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 |
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 |
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 |
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 |
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code. (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 |
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it. For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable. However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 |
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 |
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License. Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 |
180 | 5. You are not required to accept this License, since you have not
181 | signed it. However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works. These actions are
183 | prohibited by law if you do not accept this License. Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 |
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions. You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 |
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License. If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all. For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 |
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 |
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices. Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 |
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 |
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded. In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 |
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time. Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 |
242 | Each version is given a distinguishing version number. If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 |
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 |
258 | NO WARRANTY
259 |
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 |
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 |
280 | END OF TERMS AND CONDITIONS
281 |
282 | How to Apply These Terms to Your New Programs
283 |
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 |
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 |
293 | {description}
294 | Copyright (C) {year} {fullname}
295 |
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 |
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 |
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 |
310 | Also add information on how to contact you by electronic and paper mail.
311 |
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 |
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 |
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 |
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 |
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 |
332 | {signature of Ty Coon}, 1 April 1989
333 | Ty Coon, President of Vice
334 |
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 |
341 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Intro 2 Data Mining and Machine Learning
2 | Some notes and code accompanying the Machine Learning course at BGU IE (2015B).
3 |
4 | - Introductory code to R can be found in [Intro2R.Rmd](https://github.com/johnros/Intro2R/blob/master/Intro2R.Rmd).
5 | - Class notes can be found in [notes/notes.pdf](https://github.com/johnros/Intro2R/blob/master/notes/notes.pdf)
6 | - Supervised learning with R can be found in [supervised.Rmd](https://github.com/johnros/Intro2R/blob/master/supervised.Rmd).
7 | - Unsupervised learning with R can be found in [unsupervised.Rmd](https://github.com/johnros/Intro2R/blob/master/unsupervised.Rmd).
8 | - Memory efficient examples of learning with R can be found in [massive_data.Rmd](https://github.com/johnros/Intro2R/blob/master/massive_data.Rmd).
9 |
10 |
11 |
--------------------------------------------------------------------------------
/ensembles.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Ensembles"
3 | author: "Jonathan Rosenblatt"
4 | date: "April 14, 2015"
5 | output: html_document
6 | ---
7 |
8 | This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
9 |
10 | When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
11 |
12 | ```{r}
13 | summary(cars)
14 | ```
15 |
16 | You can also embed plots, for example:
17 |
18 | ```{r, echo=FALSE}
19 | plot(cars)
20 | ```
21 |
22 | Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
23 |
--------------------------------------------------------------------------------
/heights.txt:
--------------------------------------------------------------------------------
1 | "x"
2 | "1" 154.642231925250
3 | "2" 181.515415046854
4 | "3" 183.669450064676
5 | "4" 164.477480388096
6 | "5" 169.507524099543
7 | "6" 189.307146945314
8 | "7" 193.226804929068
9 | "8" 193.624383404466
10 | "9" 157.893148072242
11 | "10" 195.93135138899
12 | "11" 179.295385447133
13 | "12" 157.810599782384
14 | "13" 173.036294036202
15 | "14" 164.475677873598
16 | "15" 188.450490611646
17 | "16" 181.360921601115
18 | "17" 186.16343914743
19 | "18" 202.102560455985
20 | "19" 171.886308678447
21 | "20" 190.852218115841
22 | "21" 188.724663842758
23 | "22" 173.779216793308
24 | "23" 158.261222744983
25 | "24" 148.437784228267
26 | "25" 164.705238349013
27 | "26" 161.828633949433
28 | "27" 165.970280600691
29 | "28" 204.305316731687
30 | "29" 177.005233142843
31 | "30" 177.897164795354
32 | "31" 181.737172315478
33 | "32" 201.350957558292
34 | "33" 182.702695946265
35 | "34" 192.607675062521
36 | "35" 192.360470203139
37 | "36" 182.51500372402
38 | "37" 165.138482104219
39 | "38" 152.853469735449
40 | "39" 172.983712248375
41 | "40" 178.958805102191
42 | "41" 149.170125932993
43 | "42" 161.111975309129
44 | "43" 172.825260236284
45 | "44" 161.069621369286
46 | "45" 163.269021621464
47 | "46" 175.603795887767
48 | "47" 178.51097212948
49 | "48" 182.172158659397
50 | "49" 196.447207903926
51 | "50" 179.892990650191
52 | "51" 168.686430392467
53 | "52" 191.517024422907
54 | "53" 162.379700159572
55 | "54" 145.825462006366
56 | "55" 178.189038640029
57 | "56" 179.306007330531
58 | "57" 195.107291012887
59 | "58" 169.007854077717
60 | "59" 177.181215676435
61 | "60" 177.747229786197
62 | "61" 161.659336117893
63 | "62" 182.430623725975
64 | "63" 218.730275495048
65 | "64" 169.926181297895
66 | "65" 176.959469749844
67 | "66" 171.375459547603
68 | "67" 176.571705720853
69 | "68" 189.076098092261
70 | "69" 177.544191311252
71 | "70" 181.532436109357
72 | "71" 176.780880816625
73 | "72" 170.996911012109
74 | "73" 179.270023529253
75 | "74" 170.487924086484
76 | "75" 155.259733169198
77 | "76" 184.160624725216
78 | "77" 161.592427130666
79 | "78" 180.925187006070
80 | "79" 155.723890803099
81 | "80" 157.896310501347
82 | "81" 159.916096248596
83 | "82" 187.555838702435
84 | "83" 181.616771940129
85 | "84" 149.155308713604
86 | "85" 171.259895469393
87 | "86" 185.845811531962
88 | "87" 181.152073442719
89 | "88" 176.662954488154
90 | "89" 182.314011770510
91 | "90" 196.537826803738
92 | "91" 164.408233463266
93 | "92" 168.009564401806
94 | "93" 180.671764709218
95 | "94" 153.779862401297
96 | "95" 170.184617805034
97 | "96" 161.331411417559
98 | "97" 191.196346904921
99 | "98" 197.625866973540
100 | "99" 177.810957191829
101 | "100" 168.021944571873
102 | "101" 171.084646937414
103 | "102" 184.663810253697
104 | "103" 177.294679064144
105 | "104" 205.735547141656
106 | "105" 198.377420855761
107 | "106" 159.024803260539
108 | "107" 178.250153726594
109 | "108" 153.710786658130
110 | "109" 187.694850049530
111 | "110" 170.643629368827
112 | "111" 178.716334879969
113 | "112" 144.636422096204
114 | "113" 153.249288032108
115 | "114" 176.081100503641
116 | "115" 157.427804319223
117 | "116" 177.033902823186
118 | "117" 170.178904993776
119 | "118" 190.474248991012
120 | "119" 183.085023424552
121 | "120" 193.856511469621
122 | "121" 185.499811658772
123 | "122" 169.931326461033
124 | "123" 150.233076351352
125 | "124" 161.168186250448
126 | "125" 182.481635567953
127 | "126" 160.257377592658
128 | "127" 191.81113058419
129 | "128" 171.986403914315
130 | "129" 179.367258611334
131 | "130" 164.898806840904
132 | "131" 182.899347114643
133 | "132" 149.177190245355
134 | "133" 152.258124570863
135 | "134" 178.14150953484
136 | "135" 193.038062896634
137 | "136" 163.409690529186
138 | "137" 184.504908083032
139 | "138" 171.479092390151
140 | "139" 179.260048968711
141 | "140" 168.972514134737
142 | "141" 176.856469061484
143 | "142" 165.440897778976
144 | "143" 158.137172556529
145 | "144" 145.085703028274
146 | "145" 158.171404459938
147 | "146" 184.095842848338
148 | "147" 152.288387245497
149 | "148" 186.840380367486
150 | "149" 157.009104714750
151 | "150" 186.077553698985
152 | "151" 170.550995250485
153 | "152" 162.294059162132
154 | "153" 172.750487675374
155 | "154" 196.123600968753
156 | "155" 172.267601753096
157 | "156" 187.672987438595
158 | "157" 180.110886810124
159 | "158" 189.111794244441
160 | "159" 152.265176197954
161 | "160" 192.270450417775
162 | "161" 140.299655523388
163 | "162" 184.597044560618
164 | "163" 146.558622196391
165 | "164" 162.417570943497
166 | "165" 180.644457172588
167 | "166" 165.036710002312
168 | "167" 175.956551314607
169 | "168" 192.954916343350
170 | "169" 197.412868130900
171 | "170" 172.920420601755
172 | "171" 202.502550832053
173 | "172" 183.243333328404
174 | "173" 175.972050514168
175 | "174" 188.801922656066
176 | "175" 187.782581309347
177 | "176" 140.948126678169
178 | "177" 181.095616175404
179 | "178" 157.676954004513
180 | "179" 153.607025029908
181 | "180" 172.937550152389
182 | "181" 193.921470548975
183 | "182" 187.442536829346
184 | "183" 143.305587956638
185 | "184" 162.774894681295
186 | "185" 171.440845276852
187 | "186" 179.076926287560
188 | "187" 167.961077937356
189 | "188" 190.250038457275
190 | "189" 201.963484919159
191 | "190" 141.042162054683
192 | "191" 184.265069122516
193 | "192" 182.673553176274
194 | "193" 187.485732272253
195 | "194" 191.180068240384
196 | "195" 154.399840166867
197 | "196" 179.320659414185
198 | "197" 163.311945378322
199 | "198" 185.560706289541
200 | "199" 177.454870554195
201 | "200" 164.435553245849
202 | "201" 159.427517684274
203 | "202" 180.224446836658
204 | "203" 174.972792623367
205 | "204" 192.080560659665
206 | "205" 162.279950860632
207 | "206" 164.48146276803
208 | "207" 200.380128457540
209 | "208" 182.522412007671
210 | "209" 170.517535152204
211 | "210" 172.694744220837
212 | "211" 182.062711990664
213 | "212" 176.956174180894
214 | "213" 174.763608434611
215 | "214" 186.915073375985
216 | "215" 187.598758105258
217 | "216" 198.696129728959
218 | "217" 142.716214897597
219 | "218" 177.641213787158
220 | "219" 190.327947804155
221 | "220" 144.671734742830
222 | "221" 180.782760918096
223 | "222" 198.844300403074
224 | "223" 182.605006536142
225 | "224" 219.326098979731
226 | "225" 156.158453000064
227 | "226" 172.252620703346
228 | "227" 186.044661966251
229 | "228" 178.867663085854
230 | "229" 198.650162198613
231 | "230" 167.979915629824
232 | "231" 199.829570620415
233 | "232" 165.794645158794
234 | "233" 180.465456074427
235 | "234" 160.236206469431
236 | "235" 158.272746330946
237 | "236" 158.126130368779
238 | "237" 183.688668152816
239 | "238" 181.616152538631
240 | "239" 157.015107584028
241 | "240" 187.300616297201
242 | "241" 185.853278049716
243 | "242" 168.151057052129
244 | "243" 163.388790469436
245 | "244" 202.069698991767
246 | "245" 155.691779950907
247 | "246" 191.373405402332
248 | "247" 172.781335803882
249 | "248" 165.429010426692
250 | "249" 175.279711075057
251 | "250" 178.358900897957
252 | "251" 177.963343223211
253 | "252" 187.293122130642
254 | "253" 179.218874053826
255 | "254" 188.559765592407
256 | "255" 162.839838117308
257 | "256" 203.303304780748
258 | "257" 172.174062379368
259 | "258" 211.814999759702
260 | "259" 161.310082911835
261 | "260" 184.679045548268
262 | "261" 193.229871635395
263 | "262" 168.315858935628
264 | "263" 186.510678598511
265 | "264" 153.652416659175
266 | "265" 171.322276317139
267 | "266" 193.055700886032
268 | "267" 166.171376322696
269 | "268" 182.442937932258
270 | "269" 166.224171003048
271 | "270" 175.591636365803
272 | "271" 182.720750420831
273 | "272" 150.985495000653
274 | "273" 160.718851035696
275 | "274" 182.048045733700
276 | "275" 160.756319841451
277 | "276" 182.866206153823
278 | "277" 185.955916196744
279 | "278" 181.016896691375
280 | "279" 165.186654433254
281 | "280" 164.8483675701
282 | "281" 160.486375377322
283 | "282" 182.811311492578
284 | "283" 184.576189010998
285 | "284" 163.482449976648
286 | "285" 158.916523195602
287 | "286" 176.607539724103
288 | "287" 176.763030850391
289 | "288" 168.939521562625
290 | "289" 182.719602455499
291 | "290" 166.320287381078
292 | "291" 182.738929711734
293 | "292" 194.861048807758
294 | "293" 172.807759693463
295 | "294" 184.049657455787
296 | "295" 179.872957039333
297 | "296" 161.952408671362
298 | "297" 174.767475290942
299 | "298" 175.295065249966
300 | "299" 181.289243701683
301 | "300" 197.129730106169
302 | "301" 164.198900174296
303 | "302" 185.627781284498
304 | "303" 194.036881596675
305 | "304" 171.319949519604
306 | "305" 174.532277679364
307 | "306" 153.553184277542
308 | "307" 172.315835580037
309 | "308" 173.498678687561
310 | "309" 198.599707763334
311 | "310" 178.301845140724
312 | "311" 174.617660283316
313 | "312" 176.705767338396
314 | "313" 188.964358341085
315 | "314" 183.221334563170
316 | "315" 196.421488995261
317 | "316" 181.510883802772
318 | "317" 166.044681836748
319 | "318" 176.629184860139
320 | "319" 175.731902626491
321 | "320" 173.210208272426
322 | "321" 145.265036663022
323 | "322" 190.305195235554
324 | "323" 148.265087802415
325 | "324" 204.066415444731
326 | "325" 173.946706217892
327 | "326" 178.803185423479
328 | "327" 160.498622623532
329 | "328" 161.151523827894
330 | "329" 176.023313790272
331 | "330" 183.854957555561
332 | "331" 168.800034584258
333 | "332" 178.599765787053
334 | "333" 187.772067736776
335 | "334" 170.116452180722
336 | "335" 162.114133823019
337 | "336" 177.674849349452
338 | "337" 158.340689417691
339 | "338" 176.940108207029
340 | "339" 184.428186163844
341 | "340" 177.023840734806
342 | "341" 171.684048722634
343 | "342" 163.491527740157
344 | "343" 156.490691723403
345 | "344" 162.703292679845
346 | "345" 187.668512154051
347 | "346" 180.687665814883
348 | "347" 168.616157229943
349 | "348" 162.396089748833
350 | "349" 166.301296030222
351 | "350" 181.744456128940
352 | "351" 178.063325752123
353 | "352" 173.358737016462
354 | "353" 178.988670013496
355 | "354" 184.838605963780
356 | "355" 183.578257343549
357 | "356" 153.778167609730
358 | "357" 158.804087342316
359 | "358" 183.765465951731
360 | "359" 176.707010322159
361 | "360" 188.954998616583
362 | "361" 164.793222925059
363 | "362" 179.307368830717
364 | "363" 159.100707473351
365 | "364" 177.151347649305
366 | "365" 168.350324575276
367 | "366" 160.842323131067
368 | "367" 191.048389544374
369 | "368" 155.054513198013
370 | "369" 188.577579723576
371 | "370" 176.74738506733
372 | "371" 160.961060221237
373 | "372" 142.732111673602
374 | "373" 163.838976786749
375 | "374" 172.30085890264
376 | "375" 199.540731347306
377 | "376" 176.897711058568
378 | "377" 195.716455633224
379 | "378" 168.287980179958
380 | "379" 196.434322310898
381 | "380" 171.407564874279
382 | "381" 203.939169204876
383 | "382" 209.530228125875
384 | "383" 167.906266127632
385 | "384" 174.787449215420
386 | "385" 151.638661217451
387 | "386" 166.047516130843
388 | "387" 172.620046084553
389 | "388" 183.843707890951
390 | "389" 173.609280363838
391 | "390" 154.520648015293
392 | "391" 174.736374866009
393 | "392" 162.707998552559
394 | "393" 187.40582697193
395 | "394" 166.174083139717
396 | "395" 176.809811037152
397 | "396" 177.957216306957
398 | "397" 189.510751295879
399 | "398" 168.346173280156
400 | "399" 157.457097341197
401 | "400" 184.44868299107
402 | "401" 177.269519184665
403 | "402" 183.165864567249
404 | "403" 180.186177501262
405 | "404" 164.571732945580
406 | "405" 164.447851571180
407 | "406" 152.055618509910
408 | "407" 196.429360038572
409 | "408" 152.384298770638
410 | "409" 195.001939287470
411 | "410" 158.386947120237
412 | "411" 195.376551337271
413 | "412" 193.668994004159
414 | "413" 194.458375099331
415 | "414" 184.905317590638
416 | "415" 201.80903479507
417 | "416" 206.392737694348
418 | "417" 181.917008990256
419 | "418" 178.298008200284
420 | "419" 178.856768769408
421 | "420" 157.028903266883
422 | "421" 188.689512968359
423 | "422" 150.653752276514
424 | "423" 178.593098910637
425 | "424" 181.715775796939
426 | "425" 182.097283170211
427 | "426" 166.937476316204
428 | "427" 173.090440923646
429 | "428" 166.012795827896
430 | "429" 186.370617168312
431 | "430" 183.196712136737
432 | "431" 186.971886663421
433 | "432" 202.051049646338
434 | "433" 170.517405166906
435 | "434" 166.729016033653
436 | "435" 160.729667221702
437 | "436" 189.403162000057
438 | "437" 174.855701218518
439 | "438" 173.264991115568
440 | "439" 161.457213219188
441 | "440" 165.747790539936
442 | "441" 141.647420542422
443 | "442" 172.050626305387
444 | "443" 166.278123797179
445 | "444" 174.425684168269
446 | "445" 170.310161681767
447 | "446" 191.011686530823
448 | "447" 181.511058382853
449 | "448" 176.390937575651
450 | "449" 160.683023343649
451 | "450" 172.962048247563
452 | "451" 170.075108146494
453 | "452" 154.268572556346
454 | "453" 189.145807753854
455 | "454" 179.443783799274
456 | "455" 198.944443680990
457 | "456" 184.250273631561
458 | "457" 197.998016718115
459 | "458" 132.265514767070
460 | "459" 174.083352963022
461 | "460" 165.735214256863
462 | "461" 182.622677687991
463 | "462" 199.925347763529
464 | "463" 178.115390549151
465 | "464" 196.105489795473
466 | "465" 178.061311774555
467 | "466" 159.049179015494
468 | "467" 170.822143984335
469 | "468" 186.966542660391
470 | "469" 180.221528641344
471 | "470" 185.079955928013
472 | "471" 153.589935109175
473 | "472" 152.497471203635
474 | "473" 191.785903588667
475 | "474" 157.825893080683
476 | "475" 182.388147027416
477 | "476" 166.016656869199
478 | "477" 185.222519651954
479 | "478" 165.929035974702
480 | "479" 198.217448402947
481 | "480" 169.335083278694
482 | "481" 165.243005111587
483 | "482" 178.659255296227
484 | "483" 182.788741780399
485 | "484" 178.178098439817
486 | "485" 186.966512831434
487 | "486" 152.084666463771
488 | "487" 162.950577999665
489 | "488" 179.013082169688
490 | "489" 195.129031851928
491 | "490" 210.924034778640
492 | "491" 174.428311074424
493 | "492" 177.77051975902
494 | "493" 174.822800338396
495 | "494" 138.774022461865
496 | "495" 165.336530996786
497 | "496" 173.30873598773
498 | "497" 163.956121125432
499 | "498" 160.467298732182
500 | "499" 175.832120967381
501 | "500" 199.913146162215
502 | "501" 155.964878841759
503 | "502" 166.796382826151
504 | "503" 174.973053891717
505 | "504" 163.039267015936
506 | "505" 179.787580622214
507 | "506" 175.701883854837
508 | "507" 199.933140593470
509 | "508" 156.968838583641
510 | "509" 184.843888411655
511 | "510" 161.80247847189
512 | "511" 201.257564828762
513 | "512" 179.085048480902
514 | "513" 163.077045364664
515 | "514" 157.520449391653
516 | "515" 194.984431216109
517 | "516" 183.608222302032
518 | "517" 191.092321742153
519 | "518" 183.757981172173
520 | "519" 181.379263911818
521 | "520" 159.233064203777
522 | "521" 203.590571375374
523 | "522" 166.866274037342
524 | "523" 171.307699687387
525 | "524" 163.08188847367
526 | "525" 187.112382361810
527 | "526" 150.167721435222
528 | "527" 164.895153026682
529 | "528" 153.804013234172
530 | "529" 154.652526592447
531 | "530" 165.742961681111
532 | "531" 191.6872752319
533 | "532" 175.983531879868
534 | "533" 185.905480300651
535 | "534" 185.093928110123
536 | "535" 166.083474075138
537 | "536" 165.766560460107
538 | "537" 170.440220534698
539 | "538" 185.938414284405
540 | "539" 199.217434832544
541 | "540" 180.87302856521
542 | "541" 170.816010211100
543 | "542" 168.939975605928
544 | "543" 184.736483536528
545 | "544" 179.287219879986
546 | "545" 187.203050536959
547 | "546" 157.572722788549
548 | "547" 182.800768256700
549 | "548" 186.374601314682
550 | "549" 164.809101803407
551 | "550" 178.491121635725
552 | "551" 173.482799639154
553 | "552" 163.200798693547
554 | "553" 175.161439344675
555 | "554" 189.336322399167
556 | "555" 176.529890249700
557 | "556" 178.925164253393
558 | "557" 160.761332844183
559 | "558" 192.562729714031
560 | "559" 187.393621841816
561 | "560" 176.068015277767
562 | "561" 172.381080213471
563 | "562" 167.72762249227
564 | "563" 167.263771906048
565 | "564" 183.109496801787
566 | "565" 164.777836297268
567 | "566" 171.679105667423
568 | "567" 172.191445630999
569 | "568" 178.915706193654
570 | "569" 198.573578910996
571 | "570" 166.213596337005
572 | "571" 149.066566144206
573 | "572" 170.853481637291
574 | "573" 161.139135803052
575 | "574" 174.150659444601
576 | "575" 171.996709839293
577 | "576" 170.234324639672
578 | "577" 197.587417529768
579 | "578" 162.793244578473
580 | "579" 186.859461234664
581 | "580" 186.083799923945
582 | "581" 195.717174499425
583 | "582" 175.614371368475
584 | "583" 185.446880388690
585 | "584" 166.917296428806
586 | "585" 186.902630363617
587 | "586" 172.414819999352
588 | "587" 178.191465949287
589 | "588" 147.840521132375
590 | "589" 196.302627884954
591 | "590" 171.909679246555
592 | "591" 177.636886346887
593 | "592" 189.121338930815
594 | "593" 201.573257006643
595 | "594" 184.457930447635
596 | "595" 166.773096995064
597 | "596" 188.384970652984
598 | "597" 142.414178497676
599 | "598" 179.217573300661
600 | "599" 166.976139352456
601 | "600" 180.710422421230
602 | "601" 190.990969102576
603 | "602" 167.070851807613
604 | "603" 193.690588312798
605 | "604" 170.276843219783
606 | "605" 138.163334897795
607 | "606" 168.498364925024
608 | "607" 175.895535811671
609 | "608" 164.679731434988
610 | "609" 148.356122363571
611 | "610" 176.058657518635
612 | "611" 197.831476059284
613 | "612" 183.607359396439
614 | "613" 175.561425572291
615 | "614" 168.647571229575
616 | "615" 176.044819204732
617 | "616" 194.857926288700
618 | "617" 154.495425583718
619 | "618" 193.712316559755
620 | "619" 151.757863441684
621 | "620" 188.785733351911
622 | "621" 175.931220440509
623 | "622" 193.441564462227
624 | "623" 170.621086909187
625 | "624" 176.181932728350
626 | "625" 176.679228632157
627 | "626" 186.689305906061
628 | "627" 182.426770197615
629 | "628" 168.824324409370
630 | "629" 181.842481343291
631 | "630" 155.722910948057
632 | "631" 173.994102722085
633 | "632" 174.887885426625
634 | "633" 173.288105227204
635 | "634" 195.00806176919
636 | "635" 180.744081183266
637 | "636" 166.650856343618
638 | "637" 160.375471741992
639 | "638" 176.380189932622
640 | "639" 161.389260998669
641 | "640" 144.626448351016
642 | "641" 193.556498250189
643 | "642" 179.018143096707
644 | "643" 196.875671660888
645 | "644" 191.379490856748
646 | "645" 163.323565682411
647 | "646" 164.614060459367
648 | "647" 159.055123031765
649 | "648" 199.245884125381
650 | "649" 161.833445898662
651 | "650" 147.423923918488
652 | "651" 184.087773925858
653 | "652" 180.994201037299
654 | "653" 183.147874840789
655 | "654" 181.737854946865
656 | "655" 165.997895799993
657 | "656" 173.808142018463
658 | "657" 174.431727477913
659 | "658" 189.186367872759
660 | "659" 148.725937849016
661 | "660" 196.044690283686
662 | "661" 175.402244457156
663 | "662" 135.453057583091
664 | "663" 163.488810653472
665 | "664" 170.145424222187
666 | "665" 159.176036768623
667 | "666" 180.230059851796
668 | "667" 168.483798366892
669 | "668" 179.917571696136
670 | "669" 153.410694204960
671 | "670" 184.968870647362
672 | "671" 173.119351780117
673 | "672" 177.682142750716
674 | "673" 170.937870506549
675 | "674" 174.280913757452
676 | "675" 166.524197403539
677 | "676" 181.552570035715
678 | "677" 145.120176960029
679 | "678" 168.837310366893
680 | "679" 165.336439848885
681 | "680" 174.400241424235
682 | "681" 183.724895739295
683 | "682" 174.751552845865
684 | "683" 158.747384278538
685 | "684" 164.451344192133
686 | "685" 184.626382080994
687 | "686" 156.220013690916
688 | "687" 202.896805309913
689 | "688" 189.638023666131
690 | "689" 220.222314568603
691 | "690" 166.120790744227
692 | "691" 172.058065310796
693 | "692" 160.975088761266
694 | "693" 166.842037229817
695 | "694" 187.061734750517
696 | "695" 177.692975234530
697 | "696" 188.325188757857
698 | "697" 182.860510364993
699 | "698" 192.918694254393
700 | "699" 175.405976841235
701 | "700" 174.012904060021
702 | "701" 156.836548471586
703 | "702" 145.011508040850
704 | "703" 189.017161622121
705 | "704" 199.726708071824
706 | "705" 170.760939805503
707 | "706" 149.086217767916
708 | "707" 157.085088329393
709 | "708" 180.070403347209
710 | "709" 154.865596235964
711 | "710" 169.10531322038
712 | "711" 167.503225841124
713 | "712" 179.208202477236
714 | "713" 188.861474882465
715 | "714" 166.013350789149
716 | "715" 179.039536104201
717 | "716" 186.605360220183
718 | "717" 179.327878746743
719 | "718" 174.552119509597
720 | "719" 148.067581820210
721 | "720" 180.281822679742
722 | "721" 189.573687736597
723 | "722" 168.901887620761
724 | "723" 183.281966733900
725 | "724" 181.033053569180
726 | "725" 158.056434869983
727 | "726" 149.828182680041
728 | "727" 177.984344422450
729 | "728" 152.735240544595
730 | "729" 170.625332097780
731 | "730" 157.160910020433
732 | "731" 164.226331260782
733 | "732" 184.092578085197
734 | "733" 150.804039273175
735 | "734" 163.425479542623
736 | "735" 157.898685785876
737 | "736" 163.732990510667
738 | "737" 151.101623921934
739 | "738" 160.210900486758
740 | "739" 156.387546833666
741 | "740" 190.313807005817
742 | "741" 181.388112584315
743 | "742" 179.497550491261
744 | "743" 169.311249071892
745 | "744" 175.667286891803
746 | "745" 198.232477608869
747 | "746" 185.457350947889
748 | "747" 182.109408275760
749 | "748" 175.379852483831
750 | "749" 178.007246724946
751 | "750" 179.226918455877
752 | "751" 185.136947684203
753 | "752" 181.473472595963
754 | "753" 162.920548244272
755 | "754" 172.526549133301
756 | "755" 165.049283214815
757 | "756" 156.937053434863
758 | "757" 177.416676715852
759 | "758" 159.054160957737
760 | "759" 170.595112481695
761 | "760" 168.038698710784
762 | "761" 181.698822738156
763 | "762" 128.546002612522
764 | "763" 122.209459066065
765 | "764" 172.190892156975
766 | "765" 186.750139751262
767 | "766" 202.955016362004
768 | "767" 185.626890968403
769 | "768" 154.008505981866
770 | "769" 185.687440807349
771 | "770" 196.847873751050
772 | "771" 171.953316185312
773 | "772" 149.505067941148
774 | "773" 207.902619631389
775 | "774" 179.074689892608
776 | "775" 180.646843489277
777 | "776" 165.107780580013
778 | "777" 169.425841578293
779 | "778" 191.422816729689
780 | "779" 176.401413685863
781 | "780" 166.478634258473
782 | "781" 195.507105336363
783 | "782" 170.622970647857
784 | "783" 188.311635321259
785 | "784" 187.448611678417
786 | "785" 135.822911259681
787 | "786" 151.968788731697
788 | "787" 172.081867164090
789 | "788" 172.227388222652
790 | "789" 170.692672325331
791 | "790" 178.242399162286
792 | "791" 168.646236011802
793 | "792" 166.627359788482
794 | "793" 185.536074378311
795 | "794" 178.518112086116
796 | "795" 159.603288661557
797 | "796" 187.613688381090
798 | "797" 146.783595329556
799 | "798" 173.461719218919
800 | "799" 149.415407348639
801 | "800" 171.968806125003
802 | "801" 161.469982062398
803 | "802" 170.115584949635
804 | "803" 163.815093696623
805 | "804" 198.350954416808
806 | "805" 190.522240900673
807 | "806" 164.874201225063
808 | "807" 194.36523036219
809 | "808" 166.241292689966
810 | "809" 194.640280892311
811 | "810" 151.538471262220
812 | "811" 186.954805564224
813 | "812" 210.014304319917
814 | "813" 195.305369283628
815 | "814" 193.182803191660
816 | "815" 169.462698179736
817 | "816" 165.635325509888
818 | "817" 175.388159254454
819 | "818" 201.955661876965
820 | "819" 178.666469289603
821 | "820" 195.122611888756
822 | "821" 191.291782738848
823 |
--------------------------------------------------------------------------------
/make_samples.R:
--------------------------------------------------------------------------------
1 | # install.packages('ElemStatLearn')
2 | rm(list=ls())
3 |
4 |
5 | library(ElemStatLearn) # for data
6 | data("prostate")
7 | data("spam")
8 |
9 | library(magrittr) # for piping
10 | library(dplyr) # for handling data frames
11 |
12 |
13 |
14 | # Continuous outcome:
15 | prostate.train <- prostate %>%
16 | filter(train) %>%
17 | select(-train)
18 | prostate.test <- prostate %>%
19 | filter(!train) %>%
20 | select(-train)
21 | y.train <- prostate.train$lcavol
22 | X.train <- prostate.train %>% select(-lcavol) %>% as.matrix
23 | y.test <- prostate.test$lcavol
24 | X.test <- prostate.test %>% select(-lcavol) %>% as.matrix
25 |
26 |
27 |
28 | # Categorical outcome:
29 | n <- nrow(spam)
30 |
31 | train.prop <- 0.66
32 | train.ind <- c(TRUE,FALSE) %>%
33 | sample(size = n, prob = c(train.prop,1-train.prop), replace=TRUE)
34 | spam.train <- spam[train.ind,]
35 | spam.test <- spam[!train.ind,]
36 |
37 | y.train.spam <- spam.train$spam
38 | X.train.spam <- spam.train %>% select(-spam) %>% as.matrix
39 | y.test.spam <- spam.test$spam
40 | X.test.spam <- spam.test %>% select(-spam) %>% as.matrix
41 |
42 | spam.dummy <- spam %>% mutate(spam=as.numeric(spam=='spam'))
43 | spam.train.dummy <- spam.dummy[train.ind,]
44 | spam.test.dummy <- spam.dummy[!train.ind,]
45 |
46 |
47 |
--------------------------------------------------------------------------------
/massive_data.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Analyzing Massive Data Sets"
3 | author: "Jonathan Rosenblatt"
4 | date: "23/04/2015"
5 | output:
6 | html_document:
7 | toc: true
8 | ---
9 |
10 |
11 | ```{r setup, include=FALSE}
12 | library(knitr)
13 | opts_chunk$set(cache=TRUE)
14 | ```
15 |
16 | # Introduction
17 | When analyzing data, you may encounter several resource constraints:
18 |
19 | - Hard Disk Space: your data might not fit on your HD. This matter is not discussed in this text.
20 | - RAM constraint: Your data fits on the HD, but the implementation of your favorite method needs more RAM than you have. This is the main topic of this text, in which we demonstrate out-of-memory implementations of many popular algorithms.
21 | - CPU constraint: Your algorithm has all the memory it needs; it simply runs too slowly. Parallelizing the computation over more cores in your machine, or over more machines, is then in order.
22 |
23 | ## Diagnostics
24 | In order to diagnose the resource limit you are encountering, make sure you always work with your task-manager (Windows) or top (Linux) open. The cases where you get error messages from your software are easy to diagnose. In other cases, where computations never end but no errors are thrown, check which resource is running low in your task-manager.
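You can also get a rough picture from within R itself. The following is a minimal sketch of my own (it assumes only base R and the `pryr` package, which is also used later in this text):
```{r memory_diagnostics}
# Rough in-session diagnostics, complementing the task-manager/top:
gc()              # force a garbage collection; reports the memory R currently uses
pryr::mem_used()  # total memory used by R objects (install.packages('pryr') if needed)

# Which objects in the workspace are the largest?
sort(sapply(ls(), function(x) object.size(get(x))), decreasing = TRUE)
```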
25 |
26 |
27 | ## Terminology
28 |
29 | - In-memory: processing loads the required data into RAM.
30 | - Out-of-memory: processing is not done from RAM but rather from HD.
31 | - Batch algorithm: loads all the data when processing.
32 | - Streaming algorithm: the algorithm progresses by processing a single observation at a time.
33 | - Mini-batch algorithm: mid-way between batch and streaming.
34 | - Swap file: a file on the HD which mimics RAM.
35 |
36 | ## Tips and Tricks
37 |
38 | 1. For *batch* algorithms, memory usage should not exceed about 30% of your RAM (batch algorithms typically make several copies of the data while running).
39 | 2. Swap files:
40 |     - NEVER rely on the swap file; swapping to disk is orders of magnitude slower than RAM.
41 | 3. R releases memory only when needed, not when possible ("lazy" release).
42 | 4. Don't count on R returning RAM to the operating system (at least on Linux). Restart R if even your browser (Facebook...) starts slowing down.
43 | 5. When you want to go pro, read [Hadley's memory usage guide](http://adv-r.had.co.nz/memory.html).
44 |
45 |
46 |
47 |
48 | ## Bla bla... Let's see some code!
49 |
50 | Inspiration from [here](http://www.r-bloggers.com/bigglm-on-your-big-data-set-in-open-source-r-it-just-works-similar-as-in-sas/).
51 |
52 |
53 | Download a fat data file:
54 | ```{r download_data}
55 | # download.file("http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/BSAPUFS/Downloads/2010_Carrier_PUF.zip", "2010_Carrier_PUF.zip")
56 | # unzip(zipfile="2010_Carrier_PUF.zip")
57 | ```
58 |
59 | The `data.table` package is much more efficient than the `read.table` family of functions.
60 | You should also consider the `readr` [package](https://github.com/hadley/readr), which we do not document here (yet).
61 | ```{r import_data}
62 | # install.packages('data.table')
63 | library(data.table)
64 |
65 | data <- data.table::fread(input = "2010_BSA_Carrier_PUF.csv",
66 | sep = ',',
67 | header=TRUE)
68 |
69 |
70 | read.csv("2010_BSA_Carrier_PUF.csv") # for comparison: base R's import is much slower than fread
71 |
72 |
73 |
74 | library(magrittr) # for piping syntax
75 | .names <- c("sex", "age", "diagnose", "healthcare.procedure", "typeofservice", "service.count", "provider.type", "servicesprocessed", "place.served", "payment", "carrierline.count")
76 | data %>% setnames(.names)
77 | ```
78 |
79 | Now verify the size of your data in memory:
80 | ```{r}
81 | object.size(data)
82 | # But I prefer pryr:
83 | pryr::object_size(data)
84 | ```
85 |
86 | When does R create a copy of an object? Use `tracemem`
87 | ```{r tracemem}
88 | tracemem(data)
89 | .test <- glm(payment ~ sex + age + place.served, data = data[1:1e2,], family=poisson)
90 | ```
91 |
92 |
93 | Profile each line of code for time and memory usage using [lineprof](https://github.com/hadley/lineprof)
94 | ```{r lineprof}
95 | # devtools::install_github("hadley/lineprof")
96 | prof <- lineprof::lineprof(
97 | glm(payment ~ sex + age + place.served, data = data)
98 | )
99 | lineprof::shine(prof)
100 | ```
101 |
102 |
103 |
104 |
105 |
106 | But actually, I just like to have my Task-Manager constantly open:
107 | ```{r inspect_RAM}
108 | # Run and inspect RAM/CPU
109 | glm(payment ~ sex + age + place.served, data = data, family=poisson)
110 | ```
111 |
112 |
113 |
114 |
115 | Now let's artificially scale the problem.
116 | Note: `copies` is small so that fitting can be done in real-time.
117 | To demonstrate the problem, I would have set `copies <- 10`.
118 | ```{r artificial_scale}
119 | copies <- 2
120 | data.2 <- do.call(rbind, lapply(1:copies, function(x) data) )
121 | system.time(data.2 %>% dim)
122 | pryr::object_size(data)
123 | pryr::object_size(data.2)
124 | ```
125 |
126 |
127 |
128 | When you run the following code at home, it will *not* throw an out-of-memory error, but it will take a long time to run, and a long time to release memory when stopped.
129 | It is thus a *memory* constraint, and not a CPU one.
130 | ```{r}
131 | ## Don't run:
132 | ## glm.2 <-glm(payment ~ sex + age + place.served, data = data.2, family=poisson)
133 | ```
134 | Since the data itself easily fits in RAM, the problem can be fixed simply by a *streaming* algorithm.
135 |
136 |
137 | The following object can't even be stored in RAM.
138 | Streaming *from RAM* will not solve the problem.
139 | We will get back to this...
140 | ```{r}
141 | ## Don't run:
142 | ## copies <- 1e2
143 | ## data.3 <- do.call(rbind, lapply(1:copies, function(x) data) )
144 | ```
145 |
146 |
147 |
148 |
149 | # Streaming Regression
150 |
151 | We now cover several R implementations of streaming algorithms, which overcome RAM constraints at a moderate CPU cost.
152 |
153 | ## biglm
154 | ```{r biglm}
155 | # install.packages('biglm')
156 | library(biglm)
157 | mymodel <- biglm::bigglm(payment ~ sex + age + place.served,
158 | data = data.2,
159 | family = poisson(),
160 | maxit=1e3)
161 |
162 | # Too long! Quit the job and time the release.
163 |
164 | # For demonstration: OLS example with original data.
165 | mymodel <- bigglm(payment ~ sex + age + place.served, data =data )
166 | mymodel <- data %>% bigglm(payment ~ sex + age + place.served, data =. )
167 | ```
168 | Remarks:
169 | - R is immediately(!) available after quitting the job.
170 | - `bigglm` objects behave (almost) like `glm` objects w.r.t. `coef`, `summary`,...
171 | - `bigglm` is aimed at *memory* constraints. Not speed.
172 |
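For completeness, here is a minimal sketch of my own (not run in class) of the same streaming idea done "by hand": initialize `biglm` on a first chunk and then feed it the remaining chunks via `update()`, so that only one chunk needs to be examined at a time. The chunk size is an arbitrary choice for illustration.
```{r biglm_chunks}
n <- nrow(data)
chunk.size <- 1e4
chunks <- split(1:n, ceiling((1:n) / chunk.size))

# Fit on the first chunk, then update the sufficient statistics chunk by chunk.
# (If the model contained factors, all their levels would have to appear in the first chunk.)
fit <- biglm(payment ~ sex + age + place.served, data = data[chunks[[1]], ])
for (ind in chunks[-1]) {
  fit <- update(fit, data[ind, ])
}
summary(fit)
```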
173 |
174 | ## Exploit sparsity in your data
175 | Very relevant to factors with many levels.
176 | ```{r}
177 | reps <- 1e6
178 | y<-rnorm(reps)
179 | x<- letters %>%
180 | sample(reps, replace=TRUE) %>%
181 | factor
182 |
183 | X.1 <- model.matrix(~x-1) # Make dummy variable matrix
184 |
185 | library(MatrixModels)
186 | X.2<-as(x,"sparseMatrix") %>% t # Makes sparse dummy matrix
187 |
188 | dim(X.1)
189 | dim(X.2)
190 |
191 | pryr::object_size(X.1)
192 | pryr::object_size(X.2)
193 | ```
194 |
195 |
196 | ```{r}
197 | system.time(lm.1 <- lm(y ~ X.1))
198 | system.time(lm.1 <- lm.fit(y=y, x=X.1))
199 | system.time(lm.2 <- MatrixModels:::lm.fit.sparse(X.2,y))
200 |
201 | all.equal(lm.2, unname(lm.1$coefficients), tolerance = 1e-12)
202 | ```
203 |
204 |
205 |
206 | # Streaming classification
207 | [LiblineaR](http://cran.r-project.org/web/packages/LiblineaR/index.html) and [RSofia](http://cran.r-project.org/web/packages/RSofia/index.html) will stream your data from RAM for classification problems;
208 | mainly SVMs.
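As a small illustration of my own (not from the course), here is how a linear SVM might be fit with `LiblineaR`; it assumes the spam train/test objects (`X.train.spam`, `y.train.spam`, `X.test.spam`, `y.test.spam`) created in `make_samples.R`:
```{r liblinear_example}
# install.packages('LiblineaR')
library(LiblineaR)

# type = 2: L2-regularized L2-loss support vector classification
fit.svm <- LiblineaR(data = X.train.spam, target = y.train.spam, type = 2, cost = 1)

# Test-set accuracy:
pred <- predict(fit.svm, X.test.spam)$predictions
mean(pred == y.test.spam)
```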
209 |
210 |
211 |
212 |
213 |
214 | # Out of memory Regression
215 |
216 | What if it is not the __algorithm__ that causes the problem, but merely __importing__ my objects?
217 |
218 |
219 | ## ff
220 | The `ff` package replaces R's in-RAM storage mechanism with (efficient) on-disk storage.
221 | First open a connection to the file, without actually importing it.
222 | ```{r}
223 | # install.packages('LaF')
224 | library(LaF)
225 |
226 | .dat <- laf_open_csv(filename = "2010_BSA_Carrier_PUF.csv",
227 | column_types = c("integer", "integer", "categorical", "categorical", "categorical", "integer", "integer", "categorical", "integer", "integer", "integer"),
228 | column_names = c("sex", "age", "diagnose", "healthcare.procedure", "typeofservice", "service.count", "provider.type", "servicesprocessed", "place.served", "payment", "carrierline.count"),
229 | skip = 1)
230 | ```
231 | Now write the data to HD as an ff object:
232 | ```{r}
233 | # install.packages('ffbase')
234 | library(ffbase)
235 | data.ffdf <- laf_to_ffdf(laf = .dat)
236 | ```
237 | Notice the minimal RAM allocation:
238 | ```{r}
239 | pryr::object_size(data)
240 | pryr::object_size(data.ffdf)
241 | ```
242 |
243 |
244 |
245 |
246 | Caution: `base` functions are unaware of `ff`.
247 | Adapted algorithms are required...
248 | ```{r}
249 | data$age %>% table
250 | ffbase:::table.ff(data.ffdf$age)
251 | ```
252 |
253 |
254 | Luckily, `bigglm` has its `ff` version:
255 | ```{r biglm_regression}
256 | mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served,
257 | data = data.ffdf,
258 | family = poisson(),
259 | maxit=1e3)
260 |
261 | # Again, too slow. Stop and run:
262 | mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served,
263 | data = data.ffdf)
264 | ```
265 | The previous approach can scale to any file I can store on disk (but it might take a while).
266 |
267 |
268 |
269 |
270 | I will now inflate the data to a size that would not fit in RAM.
271 | ```{r}
272 | copies <- 2e1
273 | data.2.ffdf <- do.call(rbind, lapply(1:copies, function(x) data.ffdf) )
274 |
275 | # Actual size:
276 | cat('Size in GB ',sum(.rambytes[vmode(data.2.ffdf)]) * (nrow(data.2.ffdf) * 9.31322575 * 10^(-10)))
277 |
278 | # In memory:
279 | pryr::object_size(data.2.ffdf)
280 | ```
281 |
282 |
283 |
284 | And now I can run this MASSIVE regression:
285 | ```{r biglm_ffdf_regression}
286 | ## Do not run:
287 |
288 | # mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served,
289 | # data = data.2.ffdf,
290 | # family = poisson(),
291 | # maxit=1e3)
292 | ```
293 | Notes:
294 |
295 | - Notice again the quick release of memory when aborting the process.
296 | - Solving RAM constraints does not guarantee speed. This particular problem is actually worth parallelizing.
297 | - SAS, SPSS, Revolution R,... all rely on similar ideas.
298 | - Clearly, with so few variables I would be better off *subsampling* (a sketch follows this list).
299 | - The [SOAR](http://cran.r-project.org/web/packages/SOAR/index.html) package also allows similar out-of-memory processing.
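A rough sketch of my own of the subsampling alternative (the sample size is an arbitrary choice for illustration): a random subsample that fits comfortably in RAM will often give essentially the same coefficients, at a fraction of the cost.
```{r subsample_sketch}
sub.ind <- sample(nrow(data), size = 1e5)   # random subset of rows of the original data
glm.sub <- glm(payment ~ sex + age + place.served,
               data = data[sub.ind, ], family = poisson)
summary(glm.sub)
```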
300 |
301 | # Out of memory Classification
302 | I do not know if there are `ff` versions of `LiblineaR` or `RSofia`.
303 | If you find out, let me know.
304 |
305 |
306 |
307 |
308 |
309 | # Parallelization
310 |
311 | ## Parallelized learning
312 | [TODO]
313 |
314 | ## Parallelized simulation
315 | [TODO]
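In the meantime, here is a minimal sketch of my own of the idea, using the base `parallel` package: a Monte-Carlo simulation is embarrassingly parallel, so each replication can be sent to a different core.
```{r parallel_simulation}
library(parallel)

# One Monte-Carlo replication: the mean of 1e3 standard Gaussian draws.
one.rep <- function(i) mean(rnorm(1e3))

# Run 1e3 replications over several cores.
# (mclapply forks, and thus does not parallelize on Windows; use parLapply there instead.)
reps <- mclapply(1:1e3, one.rep, mc.cores = max(1, detectCores() - 1))
hist(unlist(reps))
```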
316 |
317 | ## Distributed Graph algorithms
318 | [TODO]
319 |
--------------------------------------------------------------------------------
/notes/.gitignore:
--------------------------------------------------------------------------------
1 | notes.loa
2 |
--------------------------------------------------------------------------------
/notes/Intro2R.txss:
--------------------------------------------------------------------------------
1 | [Session]
2 | FileVersion=1
3 | File0\FileName=notes.tex
4 | File0\Line=71
5 | File0\Col=17
6 | File0\FirstLine=0
7 | File0\FoldedLines=
8 | File1\FileName=unsupervised.tex
9 | File1\Line=312
10 | File1\Col=12
11 | File1\FirstLine=0
12 | File1\FoldedLines="67,125"
13 | File2\FileName=introduction.tex
14 | File2\Line=61
15 | File2\Col=0
16 | File2\FirstLine=0
17 | File2\FoldedLines=
18 | MasterFile=
19 | CurrentFile=statistical_decision.tex
20 | Bookmarks=@Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x31\0\x31\0\x35\0\0\0\x2\0\x31\0\0\0<\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x45\0s\0t\0i\0m\0\x61\0t\0i\0o\0n\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x33\0\x34\0\x34\0\0\0\x2\0\x32\0\0\0t\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x46\0r\0o\0m\0 \0\x45\0s\0t\0i\0m\0\x61\0t\0i\0o\0n\0 \0t\0o\0 \0S\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x37\0\x32\0\x32\0\0\0\x2\0\x33\0\0\0^\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0N\0o\0n\0 \0\x45\0R\0M\0 \0S\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x38\0\x34\0\x34\0\0\0\x2\0\x34\0\0\0^\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0S\0t\0\x61\0t\0i\0s\0t\0i\0\x63\0\x61\0l\0 \0\x44\0\x65\0\x63\0i\0s\0i\0o\0n\0 \0T\0h\0\x65\0o\0r\0y\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x31\0\x35\0\x38\0\0\0\x2\0\x35\0\0\0R\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0U\0n\0s\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x32\0\x38\0\x37\0\0\0\x2\0\x36\0\0\0J\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0G\0\x65\0n\0\x65\0r\0\x61\0t\0i\0v\0\x65\0 \0M\0o\0\x64\0\x65\0l\0s\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x33\0\x33\0\x33\0\0\0\x2\0\x37\0\0\0X\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x44\0i\0m\0\x65\0n\0s\0i\0o\0n\0\x61\0l\0i\0t\0y\0 \0R\0\x65\0\x64\0u\0\x63\0t\0i\0o\0n\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x34\0\x39\0\x31\0\0\0\x2\0\x38\0\0\0N\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0L\0\x61\0t\0\x65\0n\0t\0 \0S\0p\0\x61\0\x63\0\x65\0 \0M\0o\0\x64\0\x65\0l\0s\0})
21 | File3\FileName=appendices.tex
22 | File3\Line=143
23 | File3\Col=0
24 | File3\FirstLine=0
25 | File3\FoldedLines=
26 | File4\FileName=supervised.tex
27 | File4\Line=429
28 | File4\Col=95
29 | File4\FirstLine=0
30 | File4\FoldedLines=
31 | File5\FileName=commands.tex
32 | File5\Line=85
33 | File5\Col=16
34 | File5\FirstLine=0
35 | File5\FoldedLines=
36 | File6\FileName=statistical_decision.tex
37 | File6\Line=0
38 | File6\Col=0
39 | File6\FirstLine=0
40 | File6\FoldedLines=
41 |
42 | [InternalPDFViewer]
43 | File=
44 | Embedded=false
45 |
--------------------------------------------------------------------------------
/notes/appendices.tex:
--------------------------------------------------------------------------------
1 | \chapter{The Relation Between Supervised and Unsupervised Learning}
2 | \label{sec:relation_supervised_unsupervised}
3 |
4 |
5 | It may be surprising that collaborative filtering can be seen as both an unsupervised and a supervised learning problem.
6 | But these are not mutually exclusive problems.
7 | In fact, the relation has already been implied in the introduction to the unsupervised learning section (\S\ref{sec:unsupervised}), and we now make it explicit.
8 |
9 | In unsupervised learning we try to learn the joint distribution of $x$, i.e., try to learn the relationship between any variable in $x$ to the rest, we may see it as several supervised learning problems. In each, a different variable in $x$ plays the role of $y$.
10 |
11 | Many unsupervised learning methods can be seen in this light. We, however, will not be exploring this avenue right now.
12 |
13 | [TODO: autoencoders].
14 |
15 |
16 |
17 |
18 | % % % % % % RKHS % % % % %
19 |
20 | \chapter{The Kernel Trick and Reproducing Kernel Hilbert Spaces (RKHS)}
21 | \label{apx:rkhs}
22 |
23 | In the context of supervised learning, the \emph{kernel trick} is a mathematical device that allows one to learn very complicated predictors ($\hyp$) in a computationally efficient manner.
24 | More generally, in the context of unsupervised learning, the kernel trick allows one to learn complicated non-linear mappings of the original features (and not only predictor functions).
25 |
26 | Not all predictors and not all problems admit this trick, but many do.
27 | Methods for which it applies include:
28 | SVM's (\S\ref{sec:svm}), principal components analysis (\S\ref{sec:pca}), canonical correlation analysis (\S\ref{sec:cca}), ridge regression (\S\ref{sec:ridge}), spectral clustering (\S\ref{sec:spectral_clustering}), Gaussian processes\footnote{See the Bayesian interpretation below to see why they apply to Gaussian Processes.}, and more\footnote{This partial list is taken from Wikipedia: \url{http://en.wikipedia.org/wiki/Kernel_method}}.
29 |
30 | We now give an exposition of the method in the context of supervised learning.
31 |
32 |
33 | Think of smoothing splines (\S\ref{sec:smoothing_splines}):
34 | it was quite magical that without constraining the hypothesis class $\hypclass$, the ERM problem in Eq.(\ref{eq:smoothing_spline}) has a finite dimensional closed form solution.
35 | The property of an infinite dimensional problem having a solution in a finite dimensional space is known as the \emph{kernel property}.\marginnote{Kernel Property}
36 | We wish to generalize this observation and ask: which problems have the kernel property?
37 | Stating the general optimization problem:
38 | \begin{align}
39 | \label{eq:rkhs}
40 | \argmin{\hyp}{\frac{1}{n} \sum_i \loss(y_i,\hyp(x_i)) + \lambda J(\hyp) }
41 | \end{align}
42 | The question is then: what type of penalties $J(\hyp)$ will return simple solutions to Eq.(\ref{eq:rkhs})?
43 | The answer is: functions that belong to \emph{Reproducing Kernel Hilbert Spaces} (RKHS).
44 | RKHS's are denoted by $\rkhs$.
45 | They include many functions, but they are a rather ``small'' subset of the space of all possible functions.
46 | These spaces, and the functions therein, are defined by another function called a \emph{Kernel} denoted by $\kernel$.
47 | Choosing a particular kernel defines the space and the functions therein.
48 | Choosing a particular kernel also defines the form of $J$ in Eq.(\ref{eq:rkhs}).
49 | Put differently: for any choice of a kernel $\kernel$, there is a particular $J(\hyp)$ for which the solution of Eq.(\ref{eq:rkhs}) will be a function in $\rkhs$ and will be easily computable.
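
As a concrete example (given here only for illustration), a popular choice is the Gaussian (radial basis) kernel
\begin{align}
\kernel(x,y) = \exp\left( -\frac{\normII{x-y}^2}{2\sigma^2} \right),
\end{align}
whose RKHS consists of very smooth functions; the bandwidth $\sigma$ controls how fast $\kernel(x,y)$ decays with the distance between $x$ and $y$, and thus how wiggly the functions in $\rkhs$ are allowed to be.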
50 |
51 |
52 | \section{Mathematics of RKHS}
53 | We now show how choosing a kernel $\kernel$ defines a space $\rkhs$, and a penalty $J(\hyp)$.
54 |
55 | A kernel is a non-negative symmetric function of two arguments: $\kernel(x,y): \reals^p \times \reals^p \mapsto \reals_+$.
56 | By fixing $y$, $\kernel(x,y)$ is a function with a single argument $x \mapsto \kernel(x,y)$.
57 | $\rkhs$ is merely the space of functions of $x$, spanned at given $y$'s:
58 | \begin{align}
59 | \label{eq:rkhs_span}
60 | \hyp(x):\sum_m \al_m \kernel(x,y_m)
61 | \end{align}
62 |
63 | From linear algebra, you may know that positive definite matrices can be diagonalized.
64 | This analogy carries to $\kernel$, which admits an eigen-expansion:
65 | \begin{align}
66 | \label{eq:rkhs_eigen}
67 | \kernel(x,y)=\sum_{i=1}^\infty \gamma_i \phi_i(x) \phi_i(y)
68 | \end{align}
69 | Using Eqs.(\ref{eq:rkhs_eigen}) and (\ref{eq:rkhs_span}) we can thus expand elements $\hyp$ of $\rkhs$:
70 | \begin{align}
71 | \hyp(x)=\sum_{i=1}^\infty c_i \phi_i(x)
72 | \end{align}
73 | where $c_i=\gamma_i \sum_m \al_m \phi_i(y_m)$.
74 | We also define a norm $\normrkhs{\hyp}^2$ in this space, which is induced by $\kernel$:
75 | \begin{align}
76 | \label{eq:rkhs_norm}
77 | \normrkhs{\hyp}^2 := \sum_{i=1}^\infty \frac{c_i^2}{\gamma_i}
78 | \end{align}
79 |
80 | The penalty $J(\hyp)$ in Eq.(\ref{eq:rkhs}) is simply $\normrkhs{\hyp}^2$.
81 | The $\hyp$'s that solve Eq.(\ref{eq:rkhs}) are guaranteed to have a simple form. They reside in an $n$ dimensional linear function space \citep{wahba_spline_1990}:
82 | \begin{align}
83 | \hyp(x)=\sum_{i=1}^n \al_i \kernel(x,x_i)
84 | \end{align}
85 |
86 | The functions $\kernel(x,x_i)$ can be seen as a basis of the solution space.
87 | The good news continues: being only $n$ dimensional, the norms of these $\hyp$'s do not require integration, but only a finite summation:
88 | \begin{align}
89 | \normrkhs{\hyp}^2=\sum_{i=1}^n \sum_{j=1}^n \kernel(x_i,x_j) \al_i \al_{j} := \al' K \al.
90 | \end{align}
91 |
92 | Combining the above results, we can restate Eq.(\ref{eq:rkhs}) and say that when fixing $\kernel$ and using the appropriate $J$, we only need to solve:
93 | \begin{align}
94 | \label{eq:rkhs_simple}
95 | \argmin{\al}{\frac{1}{n} \sum_i \loss(y_i, K_i \al) + \lambda \al' K \al }
96 | \end{align}
97 | where $K_i$ denotes the $i$'th row of the kernel matrix $K$. This is an optimization problem over an $n$ dimensional space (a quadratic program when $\loss$ is the squared loss), easily solvable with numeric routines.
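
As a worked special case (not part of the original derivation), take the squared loss $\loss(y,\hyp(x))=(y-\hyp(x))^2$. Eq.(\ref{eq:rkhs_simple}) then becomes
\begin{align}
\argmin{\al}{\frac{1}{n} \normII{y - K \al}^2 + \lambda \al' K \al },
\end{align}
and setting the gradient with respect to $\al$ to zero yields the closed form solution
\begin{align}
\hat{\al} = (K + n \lambda I)^{-1} y,
\end{align}
assuming $K$ is invertible. This is exactly the kernel ridge regression estimator.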
98 |
99 |
100 | \section{The Bayesian View of RKHS}
101 | Just as the ridge regression (\S\ref{sec:ridge}) has a Bayesian interpretation, so does the kernel trick.
102 | Informally, the functions solving Eq.(\ref{eq:rkhs}) can be seen as the posterior mode if our prior beliefs postulate that the function we are trying to recover is a Gaussian zero-mean process with covariance given by $\kernel$.
103 | This view suggests the intuition that the regularization introduced by $J(\hyp)$ shrinks the estimated $\hyp$ towards a smoother function. At the extreme, where $\lambda\to\infty$, we will recover a constant function, since the mode of our Gaussian process prior is at the origin of $\rkhs$.
104 |
105 |
106 | \section{Kernel Generalization of Other Methods}
107 | [TODO: Sec 18.5.2]
108 |
109 |
110 |
111 |
112 |
113 |
114 | % % % % % % % % % The Spectral Trick % % % % % % %
115 |
116 | \chapter{The Spectral Trick}
117 | \label{apx:spectral}
118 | [TODO]
119 |
120 |
121 | % % % % % % % % Generative models % % % % % % % %
122 |
123 | \chapter{Generative Models}
124 | \label{apx:generative_concept}
125 |
126 | By \emph{generative model} we mean that we specify the whole data distribution. This is particularly relevant to supervised learning, where many methods only assume the distribution $\dist(y|x)$ without stating the distribution $\dist(x)$.
127 | Assuming only $\dist(y|x)$ is known as a \emph{discriminative model}, or \emph{discriminative analysis}.\marginnote{Discriminative Model}
128 | In a generative model, in contrast, we assume the whole $\dist(y,x)$.
129 |
130 | For the mere purpose of making a prediction, we do not need to learn $\dist(y,x)$.
131 | Knowing this distribution, however, does permit making predictions, via Bayes' Theorem:
132 | $\dist(y|x)=\frac{\dist(y,x)}{\int\dist(y,x)dy}$.
133 | Generative models make use of this relation to make predictions.
134 |
135 | To gain some intuition, consider a supervised learning problem where the data has an equal number of samples per class.
136 | Learning the distribution of $x$ within each class allows a simple classification of a given $x$ to the class with the highest probability, as formalized below. LDA (\S\ref{sec:lda}), QDA (\S\ref{sec:lda}), and \Naive Bayes (\S\ref{sec:naive_bayes}) follow this exact same rationale.
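
As a minimal formalization of this intuition (mine, but it follows directly from Bayes' Theorem): with $K$ equally probable classes, so that $\dist(y=k)=1/K$,
\begin{align}
\dist(y=k|x) = \frac{\dist(x|y=k)\dist(y=k)}{\sum_{l=1}^K \dist(x|y=l)\dist(y=l)} \propto \dist(x|y=k),
\end{align}
so that classifying $x$ to the class with the highest class-conditional density, $\hat{y}(x)=\arg\max_k \dist(x|y=k)$, is precisely classifying to the most probable class.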
137 |
138 |
139 |
140 |
141 |
142 |
143 | % % % % % % % % % Dimensionality Reduction % % % % % % %
144 |
145 | \chapter{Dimensionality Reduction}
146 | \label{apx:dim_reduce}
147 |
148 | Dimensionality reduction is a useful concept for both supervised and unsupervised learning.
149 | It allows one to represent high dimensional data in a lower dimension.
150 | This permits visualizing the data in a human-tractable dimension, applying low-dimensional algorithms, and reducing the computational burden when using the data for supervised learning.
151 |
152 | The fundamental idea behind dimensionality reduction is that while $\featureS$ may be high dimensional, and thus $\dist(x)$ hard to learn, there is hope that $\x$ does not really vary over the whole space.
153 | If the mass of $\dist(x)$ is concentrated around some low dimensional manifold $\manifold$, then the original problem might be approximated to learning the distribution of the projection $\dist(X \project \manifold)$ on $\manifold$.
154 | If $\manifold$ is fairly low dimensional, we may hope to visualize and understand $\dist(X \project \manifold)$ with fairly simple tools.
155 | Dimensionality reduction also reduces the memory required to represent the data. It is thus intimately related to \emph{lossy compression} in information theory.\marginnote{Lossy Compression}
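
As a small formalization (not in the original text), consider the simplest case of a linear $\manifold$ spanned by the orthonormal columns of a matrix $V \in \reals^{p \times q}$ with $q \ll p$. The projection is then $x \project \manifold = V V' x$, and a natural way to choose $V$ is to minimize the empirical reconstruction error of the centered data,
\begin{align}
\min_{V:\, V'V=I} \; \frac{1}{n} \sum_{i=1}^n \normII{x_i - V V' x_i}^2,
\end{align}
which is precisely what PCA (\S\ref{sec:pca}) does.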
156 |
157 | A similar reasoning justifies dimensionality reduction in supervised learning.
158 | While $\dist(x)$ might vary over the whole $\featureS$, there might be only a few directions that carry information on $y$. Learning $\dist(y|x)$ can thus be well approximated by $\dist(y|x \project \manifold)$.
159 |
160 | As was first observed in the context of PCA (\S\ref{sec:pca}), for many types of embeddings, i.e., for many target manifolds and reconstruction errors, we do not really need the original data $X$, but rather only a graph of similarities between data points ($\similaritys$).
161 | This allows dimensionality reduction theory to borrow from the \emph{graph embedding} and \emph{graph drawing} literature.
162 |
163 |
164 | We can stratify dimensionality reduction methods along the following lines:
166 | \begin{description}
167 | \item[Linear-Space vs. Non-Linear-Space Embeddings]
168 | When reducing the dimension of $X$, it can be mapped (embedded) into a linear subspace, $\manifold \subset \featureS$, or a non-linear $\manifold$.
169 |
170 | \item[Linear vs. Non-Linear Embedding Mappings]
171 | Not to be confused with the previous item.
172 | The dimensionality reducing mapping, $X \project \manifold$, can be a linear operation on the data or a non-linear one.
173 |
174 | \item[Learning an Embedding vs. Learning an Embedding Function]
175 | When learning a mapping to a lower dimensional space, we can map the original data points (an embedding), or learn a mapping of the whole data space (an embedding function).
176 | \end{description}
177 |
178 |
179 |
180 | \section{Dimensionality Reduction in Supervised Learning}
181 | Dimensionality reduction is often performed before supervised learning to keep computational complexity low.
182 | It is sometimes performed on $X$ while ignoring $y$ (e.g. PCA Regression in \S\ref{sec:pca_regression}), and sometimes as part of the supervised learning (e.g. PLS in \S\ref{sec:pls}).
183 |
184 | From a statistical view-point, it is preferable to solve the supervised learning and dimensionality reduction problems simultaneously. This is because the subspace $\manifold$ which best approximates $\dist(x)$ may differ from the one that best approximates $\dist(y|x)$.
185 | From a computational view-point, however, it may be preferable to decouple the stages.
186 |
187 |
188 |
189 |
190 | \section{Graph Drawing}
191 | [TODO]
192 |
193 |
194 |
195 |
196 |
197 | % % % % % % % % % Latent Variables % % % % % % %
198 |
199 | \chapter{Latent Variables}
200 | \label{apx:latent}
201 | [TODO]
202 |
203 |
204 |
205 |
206 | % % % % % % % % % Information Theory % % % % % % %
207 |
208 | \chapter{Information Theory}
209 | \label{apx:information_theory}
210 |
211 |
212 | \begin{definition}[Entropy]
213 | \label{def:entropy}
214 | [TODO]
215 | \end{definition}
216 |
217 |
218 |
219 | \begin{definition}[Mutual Information]
220 | \label{def:mutual_information}
221 | [TODO]
222 | \end{definition}
223 |
224 |
225 |
226 | \begin{definition}[Kullback–Leibler Divergence]
227 | \label{def:kl_divergence}
228 | [TODO]
229 | \end{definition}
230 |
231 |
232 |
233 |
234 |
235 | % % % % % % Notation % % % % %
236 |
237 |
238 | \chapter{Notation}
239 | \label{apx:notation}
240 |
241 | In this text we use the following notation conventions:
242 | \begin{description}
243 | \item[$x$] A vector (or scalar). It is typically a column vector, but this should be clear from the context.
244 | \item[$\ones$] A vector of $1$'s.
245 | \item[$\x$] A vector (or scalar) valued random variable.
246 | \item[$X$] A matrix.
247 | \item[$\X$] A matrix valued random variable (a random matrix).
248 | \item[$X'$] The matrix transpose of $X$.
249 | \item[$\normII{x}$] The $l_2$ norm of $x$: $\sqrt{\sum_j x_j^2}$.
250 | \item[$\normI{x}$] The $l_1$ norm of $x$: $\sum_j |x_j|$
251 | \item[$\normF{X}$] The Frobenius matrix norm of $X$: $\normF{X}^2=\sum_{ij} x_{ij}^2$.
252 | \item[$\ortho$] The space of orthogonal matrices.
253 | \item[$\scalar x y$] The scalar product of two vectors $x$ and $y$.
254 | \item[$\sample$] A data sample.
255 | \item[$\expect{\x}$] The expectation of $\x$.
256 | \item[$\expectn{x}$] The empirical expectation (average) of the vector $x$.
257 | \item[$\cov{\x}$] The covariance matrix of $\x$: $\expect{(\x-\expect{\x})(\x-\expect{\x})'}$.
258 | \item[$\covn{x}$] The empirical covariance matrix of x: $\expectn{(x-\expectn{x})(x-\expectn{x})'}$.
259 | \item[$\rho(\x,\y)$] The correlation coefficient.
260 | \item[$\cdf{x}{t}$] The CDF of $\x$ at $t$.
261 | \item[$\icdf{x}{\al}$] The inverse CDF at $\al$ (the quantile function).
262 | \item[$\cdfn{x}{t}$] The empirical CDF of data vector $x$.
263 | \item[$\icdfn{x}{\al}$] The empirical $\al$ quantile of the data vector $x$.
264 | \item[$\x \sim \dist$] The random variable $\x$ is $\dist$ distributed.
265 | \item[$\pdf(x)$] The density function of $\dist$ at $x$.
266 | \item[$\gauss{\mu,\sigma^2}$] The univariate Gaussian distribution with mean $\mu$ and variance $\sigma^2$.
267 | \item[$\gauss{\mu,\Sigma}$] The multivariate Gaussian distribution with mean vector $\mu$ and covariance matrix $\Sigma$.
268 | \item[$\lik(\theta)$] The likelihood function at $\theta$.
269 | \item[$\loglik(\theta)$] The log likelihood function at $\theta$.
270 | \item[$\loss(x,\theta)$] The loss function of $\theta$ at $x$.
271 | \item[$\risk(\theta)$] The risk at $\theta$.
272 | \item[$\riskn(\theta)$] The empirical risk at $\theta$.
273 | \item[$\hyp(x)$] A prediction (hypothesis) at $x$.
274 | \item[$\hypclass$] The class of all hypotheses $\hyp$.
275 | \item[$\plane$] A hyperplane.
276 | \item[$\categories$] A set of categories.
277 | \item[$\positive{t}$] The positive part of $t$: $\max\{0,t \}$.
278 | \item[$\kernel(x,y)$] A kernel function evaluated at $(x,y)$.
279 | \item[$\indicator{A}$] The indicator function of the set $A$.
280 | \item[$\manifold$] A manifold.
281 | \item[$\project$] A projection operator.
282 |
283 | \item[$\similarity_{ij}$] A similarity measure between observations $i$ and $j$.
284 | \item[$\dissimilarity_{ij}$] A dissimilarity (i.e., distance) measure between observations $i$ and $j$.
285 | \item[$\similaritys$] A weighted graph (i.e. network, or matrix) of similarities between observations.
286 | \item[$\dissimilaritys$] A weighted graph (i.e. network, or matrix) of dissimilarities between observations.
287 |
288 | \item[$\kl{\x}{\y}$] The Kullback–Leibler divergence between the random variables $\x$ and $\y$.
289 | \item[$\entropy(\x)$] The entropy of random variable $\x$.
290 | \item[$\mutual{\x}{\y}$] The mutual information between $\x$ and $\y$.
291 |
292 |
293 | \end{description}
294 |
295 |
296 |
--------------------------------------------------------------------------------
/notes/art/avoid-overfitting.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/avoid-overfitting.png
--------------------------------------------------------------------------------
/notes/art/bias_variance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/bias_variance.png
--------------------------------------------------------------------------------
/notes/art/censored.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/censored.pdf
--------------------------------------------------------------------------------
/notes/art/imputing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/imputing.pdf
--------------------------------------------------------------------------------
/notes/art/irrelevant-features-hurt-knn-clustering.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/irrelevant-features-hurt-knn-clustering.png
--------------------------------------------------------------------------------
/notes/art/irrelevant-features.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/irrelevant-features.png
--------------------------------------------------------------------------------
/notes/art/non-linear-basis-functions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/non-linear-basis-functions.png
--------------------------------------------------------------------------------
/notes/art/som_simulation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/som_simulation.png
--------------------------------------------------------------------------------
/notes/art/support-vector-machine-15-728.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/support-vector-machine-15-728.jpg
--------------------------------------------------------------------------------
/notes/art/uncensored.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/uncensored.pdf
--------------------------------------------------------------------------------
/notes/art/why-complex-models-can-turn-out-to-be-less-probable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/why-complex-models-can-turn-out-to-be-less-probable.png
--------------------------------------------------------------------------------
/notes/collaborative.tex:
--------------------------------------------------------------------------------
1 | \chapter{Recommender Systems}
2 | \label{sec:recomender_systems}
3 |
4 |
5 | % % % % recommender systems % % % % % % %
6 |
7 | A recommender system is, as the name suggests, software that gives recommendations to the user.
8 | Notable examples include book recommendations by Amazon, and film recommendations by Netflix.
9 | The two main approaches to recommender systems include \emph{content filtering} and \emph{collaborative filtering}.
10 |
11 | Two nice introductions to recommender systems can be found in \citet{koren_matrix_2009} and \citet{su_survey_2009}.
12 |
13 |
14 | % % % % Content filtering % % % % % %
15 | \section{Content Filtering}
16 | \label{sec:content_filtering}
17 |
18 | In content filtering, the system is assumed to have some background information on the user (say, because he logged in), and uses
19 | this information to give him recommendations.
20 | The recommendation, in this case, is approached as a supervised learning problem:
21 | the system learns to predict a product's rating based on the user's features.
22 | It then computes the rating for many candidate products and recommends a set with high predicted ratings.
23 |
24 |
25 |
26 | % % % % collaborative filtering % % % % %
27 | \section{Collaborative Filtering}
28 | \label{sec:collaborative_filtering}
29 |
30 | Unlike content filtering, in \emph{collaborative filtering}, there is no external information on the user or the products, besides the ratings of other users.
31 | The term \emph{collaborative filtering} was coined by the authors of the first such system, Tapestry \citep{goldberg_using_1992}.
32 |
33 | Collaborative filtering can be approached as a supervised learning problem, or as an unsupervised learning problem, precisely because it is neither.
34 | It is essentially a \emph{missing data} problem.\marginnote{Missing Data}
35 | To see this, consider a matrix of rankings, $\rankings$, where the $i,j$'th entry, $\ranking_{i,j}$, is the ranking of movie $j$ by user $i$.
36 | Predicting $\ranking_{i,j'}$, i.e., the ranking of a currently unseen movie, is essentially an imputation of a missing value.
37 | It is exceptionally challenging however, as in typical applications there is much more missing data than observed data.
38 |
39 |
40 | The two main approaches to collaborative filtering include \emph{neighbourhood methods}, and \emph{latent factor models} \cite{koren_matrix_2009}.
41 |
42 | \subsubsection{Neighbourhood Methods}
43 | The neighbourhood methods to collaborative filtering rest on the assumption that similar individuals have similar tastes.
44 | If someone similar to individual $i$ has seen movie $j'$, then $i$ should have a similar opinion.
45 |
46 | The notion of using the neighbourhood of a data point is not a new one. We have seen it being used for supervised learning in kernel regression (\S\ref{sec:kernel}) and KNN (\S\ref{sec:knn}).
47 |
48 | Neighbourhood methods for collaborative filtering, or missing data imputation in general, can thus be seen as a non-parametric approach to supervised learning problems, and solved in the same way.
49 |
50 |
51 | \begin{remark}[Collaborative Filtering and Other Supervised Learning Methods]
52 | If you are wondering why only neighbourhood methods for supervised learning are mentioned here, you are right to wonder.
53 | Any supervised learning method can be applied to impute entries in $\rankings$. Neighbourhood
54 | methods are merely the most popular.
55 | \end{remark}
56 |
57 |
58 | \subsubsection{Latent Factor Models}
59 | The latent factor approach to collaborative filtering rests on the assumption that the rankings are a function of some latent user attributes and latent movie attributes.
60 | This idea is not a new one, as we have seen it in the context of unsupervised learning in factor analysis (FA) (\S\ref{sec:factor_analysis}), independent component analysis (ICA) (\S\ref{sec:ica}), and other latent space generative models.
61 | We thus see that collaborative filtering, and missing data imputation in general, can be approached as an unsupervised learning problem.
62 |
63 | As we will soon see, just like the FA problem (\S\ref{sec:factor_analysis}), the latent factor model implies that the data arises as a multiplication of matrices. This is why this approach is more commonly known as the \emph{matrix factorization} approach to collaborative filtering.\marginnote{Matrix Factorization}
64 | We will present several matrix factorization problems in the ERM framework.
65 | Note, however, that while stating the optimization problems requires only basic math and imagination, actually solving them is far from trivial. In fact, if you arbitrarily replace the loss function and generative model in the basic ERM problems below with your favourites, you will probably find the resulting problem to be computationally unsolvable.
66 |
67 | Having movie ratings in mind, the simplest collaborative filtering ERM problem is
68 | \begin{align}
69 | \label{eq:matrix_factorization}
70 | \argmin{\latentn,\loadings}{\sum_{i,j \in \kappa} (\rankings_{i,j} - \latentn_j' \loadings_i)^2 + \lambda (\normII{\latentn_j}^2+ \normII{\loadings_i}^2)},
71 | \end{align}
72 | where $\latentn_j$ holds the latent properties of movie $j$,
73 | $\loadings_i$ holds the importance of a movie's properties to viewer $i$,
74 | and summation is performed over $\kappa$, the set of (user, movie) pairs for which a rating is actually observed.
75 | As usual, the regularization $\lambda$ can be chosen with a cross-validation approach (\S\ref{sec:cv}), or the other unbiased risk estimation methods in Chapter~\ref{sec:desicion_theory}.
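To make Eq.(\ref{eq:matrix_factorization}) concrete, here is a minimal R sketch of the popular alternating least squares heuristic for this problem, in which each update is a small ridge regression (up to how the penalty multiplicities over $\kappa$ are counted). The data sizes, the rank, and $\lambda$ are arbitrary illustration choices; this is a sketch of the idea, not the implementation of any particular library.

```r
set.seed(1)
n.users <- 50; n.movies <- 40; q <- 2; lambda <- 0.1
A <- matrix(rnorm(n.users * q), n.users, q)      # true user loadings
S <- matrix(rnorm(n.movies * q), n.movies, q)    # true movie factors
R <- A %*% t(S) + rnorm(n.users * n.movies, sd = 0.1)
R[sample(length(R), 0.6 * length(R))] <- NA      # most ratings are missing

A.hat <- matrix(rnorm(n.users * q), n.users, q)
S.hat <- matrix(rnorm(n.movies * q), n.movies, q)
for (iter in 1:50) {
  for (i in 1:n.users) {                         # ridge regression per user
    j.obs <- which(!is.na(R[i, ]))
    S.i <- S.hat[j.obs, , drop = FALSE]
    A.hat[i, ] <- solve(t(S.i) %*% S.i + lambda * diag(q), t(S.i) %*% R[i, j.obs])
  }
  for (j in 1:n.movies) {                        # ridge regression per movie
    i.obs <- which(!is.na(R[, j]))
    A.j <- A.hat[i.obs, , drop = FALSE]
    S.hat[j, ] <- solve(t(A.j) %*% A.j + lambda * diag(q), t(A.j) %*% R[i.obs, j])
  }
}
R.imputed <- A.hat %*% t(S.hat)                  # predictions for all (i,j), observed or not
```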
76 |
77 | It may seem quite miraculous that by assuming a lower dimensional generative model, one may impute missing values.
78 | The following figures try to suggest an intuition.
79 | [TODO: add figures]
80 |
81 |
82 |
83 | \begin{remark}[Matrix Norm Notation]
84 | We could write Eq.(\ref{eq:matrix_factorization}) using matrix norms, but we would then need to define multiplications with missing values. This is not hard to do, but I would rather avoid it right now.
85 | \end{remark}
86 |
87 | \begin{remark}[Matrix Factorization and Factor Analysis]
88 | On the face of it, the matrix factorization problem in Eq.(\ref{eq:matrix_factorization}) seems very similar to the FA problem in Eq.(\ref{eq:factor}) with squared error loss.
89 | We do not encounter the rotation invariance property in the solution of Eq.(\ref{eq:matrix_factorization}) because of the $l_2$ regularization term.
90 | \end{remark}
91 |
92 | We can now complicate the matrix factorization problem a little further.
93 | We account for personal effects, movie effects, and time-varying preferences.
94 | The implied ERM problem is
95 | \begin{align}
96 | \label{eq:matrix_factorization_comlicated}
97 | \argmin{\latentn,\loadings, b_i, b_j}{
98 | \sum_{i,j,t \in \kappa} (\rankings_{i,j}(t) - b_i(t) - b_j- \latentn_j' \loadings_i(t))^2 + \lambda (\normII{\latentn_j}^2+ \normII{\loadings_i(t)}^2 + b_i(t)^2 + b_j^2 )
99 | },
100 | \end{align}
101 | where $\loadings_i(t)$ is the importance of a movie's properties to viewer $i$ at period $t$,
102 | $b_j$ is an average appreciation of movie $j$~\footnote{A marketing effect?},
103 | $b_i(t)$ is the average appreciation level of viewer $i$ at period $t$~\footnote{A mood effect?},
104 | and summation is performed over $\kappa$, the set of (user, movie, time) triplets for which a rating is actually observed.
105 |
106 | \begin{remark}[Temporal Dynamics and Tensor Factorization]
107 | When introducing a temporal dimension, the ratings can no longer be presented as a matrix.
108 | Eq.(\ref{eq:matrix_factorization_comlicated}) can thus no longer be seen as a \emph{matrix} factorization problem.
109 | Indeed, this is a \emph{tensor} factorization problem.
110 | Tensor factorization theory is currently much less advanced than matrix factorization theory. Moreover, the numerical libraries that implement tensor factorization are much less developed than existing matrix algebra libraries \citep{lorica_lets_????}.
111 | This is why, IMHO, authors prefer to deal with tensors via stacking and Kronecker products, rather than treating them as the tensors they are.
112 | \end{remark}
113 |
114 |
115 |
116 |
117 | % % % % Hybrid methods % % % %
118 | \section{Hybrid Filtering}
119 | After introducing the ideas of content filtering (\S\ref{sec:content_filtering}) and collaborative filtering (\S\ref{sec:collaborative_filtering}), why not marry the two?
120 | \emph{Hybrid filtering} is the idea of imputing the missing data, thus making recommendations, using both a viewer's attributes, and other viewers' preferences.
121 |
122 | A simple version of the implied ERM problem is
123 | \begin{align}
124 | \label{eq:hybrid_filtering}
125 | \argmin{\latentn,\loadings, \hyp}{\sum_{i,j \in \kappa} (\rankings_{i,j} - \latentn_j' \loadings_i - \hyp(x_i) )^2 + \lambda (\normII{\latentn_j}^2 + \normII{\loadings_i}^2+ J(\hyp)) },
126 | \end{align}
127 | where $\hyp(x_i)$ is the effect of the attributes of viewer $i$ on his preferences, and $J(\hyp)$ is some regularization of the predictor's complexity.
128 |
129 |
130 | \section{Recommender Systems Terminology}
131 |
132 | Since the recommender systems literature did not stem from the statistical learning literature, it typically uses different terminology for very similar, if not identical, concepts.
133 | Here is a partial list of some common terms:
134 |
135 | \begin{itemize}
136 | \item \textbf{Content Based Filtering}: A supervised learning approach to recommendations.
137 | \item \textbf{Collaborative Filtering}: A missing data imputation approach to recommendations.
138 | \item \textbf{Memory Based Filtering}: A non parametric (neighbourhood) approach (\S\ref{sec:non_erm}) to collaborative filtering.
139 | \item \textbf{Model Based Filtering}: A latent space generative model approach (\S\ref{sec:latent_space}) to collaborative filtering.
140 | \end{itemize}
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/notes/commands.tex:
--------------------------------------------------------------------------------
1 | % Custom environments
2 |
3 | \theoremstyle{plain}
4 | \newtheorem{thm}{Theorem}[section]
5 | \newtheorem{lemma}{Lemma}[section]
6 | \newtheorem{prop}{Proposition}[section]
7 |
8 | \theoremstyle{definition}
9 | \newtheorem{definition}{Definition}[chapter]
10 | \newtheorem{remark}{Remark}[section]
11 | \newtheorem{example}{Example}[section]
12 |
13 |
14 |
15 |
16 |
17 | % Custom commands
18 |
19 | \newcommand{\naive}{na\"{\i}ve }
20 | \newcommand{\Naive}{Na\"{\i}ve }
21 | \newcommand{\andor}{and\textbackslash or }
22 | \newcommand{\erdos}{Erd\H{o}s }
23 | \newcommand{\renyi}{R\`enyi }
24 |
25 |
26 | \newcommand{\al}{\alpha}
27 | \newcommand{\be}{\beta}
28 |
29 | \newcommand{\set}[1]{\left\{ #1 \right\}} % A set
30 | \newcommand{\rv}[1]{\mathbf{#1}} % A random variable
31 | \newcommand{\x}{\rv x} % The random variable x
32 | \newcommand{\y}{\rv y} % The random variable y
33 | \newcommand{\X}{\rv X} % The random matrix X
34 | \newcommand{\Y}{\rv Y} % The random variable y
35 | \newcommand{\expect}[1]{\mathbf{E}\left[ #1 \right]} % The expectation operator
36 | \newcommand{\expectg}[2]{\mathbf{E}_{\rv{#1}}\left[ \rv{#2} \right]} % An expectation w.r.t. a particular random variable.
37 | \newcommand{\expectn}[1]{\mathbb{E}\left[#1\right]} % The empirical expectation
38 | \newcommand{\cov}[1]{\mathbf{C}ov \left[ #1 \right]} % The covariance operator
39 | \newcommand{\covn}[1]{\mathbb{C}ov \left[ #1 \right]} % The empirical covariance operator
40 | \newcommand{\gauss}[1]{\mathcal{N}\left(#1\right)} % The gaussian distribution
41 | \newcommand{\cdf}[2]{F_\rv{#1} (#2)} % The CDF function
42 | \newcommand{\cdfn}[2]{\mathbb{F}_{#1}(#2)} % The empirical CDF function
43 | \newcommand{\icdf}[2]{F_\rv{#1}^{-1} (#2)} % The inverse CDF function
44 | \newcommand{\icdfn}[2]{\mathbb{F}^{-1}_{#1}(#2)} % The inverse empirical CDF function
45 | \newcommand{\pdf}{p} % The probability density function
46 | \newcommand{\prob}[1]{P\left( #1 \right)} % the probability of an event
47 | \newcommand{\dist}{P} % The probability distribution
48 | \newcommand{\entropy}{H} % entropy
49 | \newcommand{\mutual}[2]{I\left(#1;#2\right)} % mutual information
50 |
51 | \newcommand{\estim}[1]{\widehat{#1}} % An estimator
52 |
53 | \newcommand{\norm}[1]{\Vert #1 \Vert} % The norm operator
54 | \newcommand{\normII}[1]{\norm{#1}_2} % The l2 norm
55 | \newcommand{\normI}[1]{\norm{#1}_1} % The l1 norm
56 | \newcommand{\normF}[1]{\norm{#1}_{Frob}} % The Frobenius matrix norm
57 | \newcommand{\ones}{\textbf{1}} % Vector of ones.
58 | \newcommand{\lik}{\mathcal{L}} % The likelihood function
59 | \newcommand{\loglik}{L} % The log likelihood function
60 | \newcommand{\loss}{l} % A loss function
61 | \newcommand{\risk}{R} % The risk function
62 | \newcommand{\riskn}{\mathbb{R}} % The empirical risk
63 | \newcommand{\deriv}[2]{\frac{\partial #1}{\partial #2}} % A derivative
64 | \newcommand{\argmin}[2]{\mathop{argmin} _{#1}\set{#2}} % The argmin operator
65 | \newcommand{\argmax}[2]{\mathop{argmax}_{#1}\set{#2}} % The argmax operator
66 | \newcommand{\hyp}{f} % A hypothesis
67 | \newcommand{\hypclass}{\mathcal{F}} % A hypothesis class
68 | \newcommand{\hilbert}{\mathcal{H}}
69 | \newcommand{\rkhs}{\hilbert_\kernel} % The RKHS induced by the kernel
70 | \newcommand{\normrkhs}[1]{\norm{#1}_{\rkhs}} % the RKHS function norm
71 |
72 |
73 | \newcommand{\plane}{\mathbb{L}} % A hyperplane
74 | \newcommand{\categories}{\mathcal{G}} % The categories set.
75 | \newcommand{\positive}[1]{\left[ #1 \right]_+} % The positive part function
76 | \newcommand{\kernel}{\mathcal{K}} % A kernel function
77 | \newcommand{\featureS}{\mathcal{X}} % The feature space
78 | \newcommand{\indicator}[1]{I_{\set{#1}}} % The indicator function.
79 | \newcommand{\reals}{\mathbb{R}} % the set of real numbers
80 |
81 |
82 |
83 | \newcommand{\latent}{\rv{s}} % latent variables matrix
84 | \newcommand{\latentn}{S} % latent variables matrix
85 | \newcommand{\loadings}{A} % factor loadings matrix
86 | \newcommand{\rotation}{R} % rotation matrix
87 | \newcommand{\similaritys}{\mathfrak{S}} % a similarity graph
88 | \newcommand{\similarity}{s} % A similarity measure.
89 | \newcommand{\dissimilarity}{d} % A dissimilarity measure.
90 | \newcommand{\dissimilaritys}{\mathfrak{D}} % a dissimilarity graph
91 | \newcommand{\scalar}[2]{\left< #1,#2 \right>} % a scalar product
92 |
93 |
94 |
95 | \newcommand{\manifold}{\mathcal{M}} % A manifold.
96 | \newcommand{\project}{\hookrightarrow} % The orthogonal projection operator.
97 | \newcommand{\projectMat}{H} % A projection matrix.
98 | \newcommand{\rank}{q} % A subspace rank.
99 | \newcommand{\dimy}{K} % The dimension of the output.
100 | \newcommand{\encode}{E} % a linear encoding matrix
101 | \newcommand{\decode}{D} % a linear decoding matrix
102 | \DeclareMathOperator{\Tr}{Tr}
103 | \newcommand{\ensembleSize}{M} % Size of a hypothesis ensemble.
104 | \newcommand{\ensembleInd}{m} % Index of a hypothesis in an ensemble.
105 |
106 |
107 | \newcommand{\sample}{\mathcal{S}} % A data sample.
108 | \newcommand{\test}{\risk(\hyp)} % The test error (risk)
109 | \newcommand{\train}{\riskn(\hyp)} % The train error (empirical risk)
110 | \newcommand{\insample}{\bar{\risk}(\hyp)} % The in-sample test error.
111 | \newcommand{\EPE}{\risk(\hat{\hyp}_n)} % The out-of-sample test error.
112 | \newcommand{\folds}{K} % Cross validation folds
113 | \newcommand{\fold}{k} % Index of a fold
114 | \newcommand{\bootstraps}{B} % Bootstrap samples
115 | \newcommand{\bootstrap}{{b^*}} % Index of a bootstrap replication
116 |
117 |
118 | \newcommand{\rankings}{\mathcal{R}} % Rankings, for collaborative filtering.
119 | \newcommand{\ranking}{\mathcal{R}} % Rankings, for collaborative filtering.
120 | \newcommand{\kl}[2]{D_{KL}\left(#1 \Vert #2 \right)}
121 | \newcommand{\ortho}{\mathbb{O}} % space of orthogonal matrices
122 |
123 | \newcommand{\id}[6]{
124 | \begin{tabular}{|p{2cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|}
125 | \hline Task & Type & Input & Output & Concept & Remark \\
126 | \hline
127 | \hline #1 & #2 & #3 & #4 & #5 & #6 \\
128 | \hline
129 | \end{tabular}
130 | \newline
131 | \newline
132 | }
133 |
134 | \newcommand{\union}{\cup}
135 | \newcommand{\intersect}{\cap}
136 | \newcommand{\supp}[1]{\mathop{support}(#1)}
137 | \newcommand{\conf}[2]{\mathop{confidence}(#1 \Rightarrow #2)}
138 | \newcommand{\lift}[2]{\mathop{lift}(#1 \Rightarrow #2)}
139 | \newcommand{\convic}[2]{\mathop{conviction}(#1 \Rightarrow #2)}
140 |
--------------------------------------------------------------------------------
/notes/estimation.tex:
--------------------------------------------------------------------------------
1 |
2 | \chapter{Estimation}
3 | \label{sec:estimation}
4 |
5 | In this section, we present several estimation principles.
6 | Their properties are not discussed, as the section is merely a reminder and a preparation for what follows.
7 | These concepts and examples can be found in many introductory statistics books. I particularly recommend \cite{wasserman_all_2004} or \cite{abramovich_statistical_2013}.
8 |
9 | \section{Moment matching}
10 | \label{sec:moment_matching}
11 |
12 | The fundamental idea: match empirical moments to theoretical. I.e., estimate
13 | $$ \expect{g(X)} $$
14 | by
15 | $$ \expectn{g(X)} $$
16 | where $\expectn{g(x)}:=\frac{1}{n} \sum_i g(x_i)$ is the empirical mean.
17 |
18 | \begin{example}[Exponential Rate]
19 |
20 | Estimate $\lambda$ in $\x_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d.
21 | $\expect{x}=1/\lambda$.
22 | $\Rightarrow \estim{\lambda}=1/\expectn{x}$ .
23 |
24 | \end{example}
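A minimal R sketch of this estimator; the true rate and the sample size are arbitrary illustration choices:

```r
set.seed(1)
lambda <- 2                         # true rate, chosen for illustration
x <- rexp(n = 1e4, rate = lambda)
lambda.hat <- 1 / mean(x)           # moment matching: E[x] = 1/lambda
lambda.hat
```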
25 |
26 |
27 | \begin{example}[Linear Regression]
28 |
29 | Estimate the $p$ dimensional $\be$ in $\y \sim \gauss{X\be,\sigma^2 I}$, where $\y$ is an $n$ dimensional random vector.
30 | $\expect{y}=X\be$ and $\expectn{y}=y$.
31 | Clearly, moment matching won't work because no $\be$ satisfies $X\be=y$.
32 | A technical workaround:
33 | Since $\be$ is $p$ dimensional, I need to find some $g(\y): \mathbb{R}^n \mapsto \mathbb{R}^p$.
34 | Well, $g(y):=X'y$ is such a mapping. I will use it, even though my technical justification is currently unsatisfactory. We thus have:
35 | $\expect{X'y}=X'X\be$ which I match to $\expectn{X'y}=X'y$:
36 | $$
37 | X'X \be = X' y \Rightarrow \estim{\be}=(X'X)^{-1} X'y.
38 | $$
39 |
40 | \end{example}
41 |
42 |
43 | \section{Quantile matching}
44 | \label{sec:quantiles}
45 |
46 | The fundamental idea: match empirical quantiles to theoretical.
47 | Denoting by $\cdf{x}{t}$ the CDF of $\x$, then $\icdf x \al$ is the $\al$ quantile of $\x$.
48 | Also denoting by $\cdfn x t$ the Empirical CDF of $x_1,\dots, x_n$, then $\icdfn x \al$ is the $\al$ quantile of $x_1,\dots, x_n$.
49 | The quantile matching method thus implies estimating
50 | $$ \icdf x \al $$
51 | by
52 | $$ \icdfn x \al . $$
53 |
54 | \begin{example}[Exponential rate]
55 | Estimate $\lambda$ in $\x_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d.
56 | \begin{align*}
57 | & \cdf x t = 1-\exp(-\lambda t) = \al \Rightarrow \\
58 | & \icdf x \al = \frac{-\log(1-\al)}{\lambda} \Rightarrow \\
59 | & \icdf{x}{0.5} = \frac{-\log(0.5)}{\lambda} \Rightarrow \\
60 | & \estim{\lambda} = \frac{-\log(0.5)}{\icdfn{x}{0.5}}.
61 | \end{align*}
62 |
63 | \end{example}
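The same toy simulation, now with the quantile matching estimator; the median is the empirical $0.5$ quantile:

```r
set.seed(1)
lambda <- 2
x <- rexp(n = 1e4, rate = lambda)
lambda.hat <- -log(0.5) / median(x)   # match the theoretical and empirical medians
lambda.hat
```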
64 |
65 |
66 | \section{Maximum Likelihood}
67 | \label{sec:ml}
68 |
69 | The fundamental idea is that if the data generating process (i.e., the \emph{sampling distribution}) can be assumed, then the observations are probably some high probability instance of this process, and not a low probability event:
70 | Let $\x_1,\dots,\x_n \sim P_\theta$, with density (or probability) $p_\theta(x_1,\dots,x_n)$.
71 | Denote the likelihood, as a function of $\theta$, by $\lik(\theta):= p_\theta(x_1,\dots,x_n)$.
72 | Then $$\estim{\theta}_{ML}:= argmax_{\theta}\set{ \lik(\theta) }.$$
73 |
74 | Using a monotone mapping such as the log, does not change the $argmax$.
75 | Denote $$\loglik(\theta):=\log(\lik(\theta)).$$
76 |
77 |
78 | \begin{example}[Exponential rate]
79 |
80 | Estimate $\lambda$ in $X_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d.
81 | Using the exponential PDF and the i.i.d. assumption
82 | $$ \lik(\lambda) = \lambda^n \exp(-\lambda \sum_i X_i), $$
83 | and
84 | $$ \loglik(\lambda) = n \log(\lambda) -\lambda \sum_i X_i. $$
85 |
86 | By differentiating and equating $0$, we get $\estim{\lambda}_{ML}=1/\expectn{X}$.
87 |
88 | \end{example}
89 |
90 | \begin{example}[Discrete time Markov Chain]
91 |
92 | Estimate the transition probabilities, $p_1$ and $p_2$ in a two state, $\set{0,1}$, discrete time, Markov chain where:
93 | $P(\x_{t+1}=1|\x_{t}=0)=p_1$ and $P(\x_{t+1}=1|\x_{t}=1)=p_2$.
94 | The likelihood:
95 | $$
96 | \lik(p_1,p_2)=
97 | P(X_2,\dots,X_T;X_1,p_1,p_2)=
98 | \prod_{t=1}^{T-1} P(\x_{t+1}=x_{t+1}|\x_{t}=x_t).
99 | $$
100 | We denote $n_{ij}$ the total number of observed transitions from $i$ to $j$ and get that $\estim{p}_1=\frac{n_{01}}{n_{01}+n_{00}}$, and that $\estim{p}_2=\frac{n_{11}}{n_{11}+n_{10}}$.
101 |
102 | \begin{remark}[Confession]
103 | Well, this is a rather artificial example since, because of the Markov property and the stationarity of the process, we only need to look at transition events, which are themselves Bernoulli distributed.
104 | This example does show, however, the power of the ML method to deal with non i.i.d. samples. As does the next example.
105 | \end{remark}
106 | \end{example}
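A minimal R sketch of these counting estimators on a simulated chain; the true transition probabilities and the chain length are arbitrary illustration choices:

```r
set.seed(1)
p1 <- 0.3; p2 <- 0.7; T.len <- 1e4
x <- numeric(T.len)                              # x[1] = 0 is the (arbitrary) initial state
for (t in 2:T.len) {
  p <- if (x[t - 1] == 0) p1 else p2
  x[t] <- rbinom(1, size = 1, prob = p)
}
trans <- table(from = x[-T.len], to = x[-1])     # the transition counts n_ij
p1.hat <- trans["0", "1"] / sum(trans["0", ])
p2.hat <- trans["1", "1"] / sum(trans["1", ])
c(p1.hat, p2.hat)
```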
107 |
108 |
109 |
110 |
111 | \begin{example}[Autoregression of order 1 (AR(1))]
112 | Estimate the drift parameter $a$, in a discrete time Gaussian process where:
113 | $\x_{t+1}=a \x_t+ \varepsilon; \varepsilon \sim \gauss{0,\sigma^2} \Rightarrow \x_{t+1}|\x_t \sim \gauss{a x_t,\sigma^2}$.
114 |
115 | We start with the conditional density at time $t+1$:
116 | $$
117 | p_{\x_{t+1}|\x_t=x_t}(x_{t+1}) =
118 | (2 \pi \sigma^2)^{-1/2} \exp \left(
119 | -\frac{1}{2 \sigma^2}(x_{t+1}-a x_t)^2
120 | \right).
121 | $$
122 | Moving to the likelihood:
123 | $$
124 | \lik(a) =
125 | (2 \pi \sigma^2)^{-(T-1)/2} \exp \left(
126 | -\frac{1}{2 \sigma^2}\sum_{t=1}^{T-1} (x_{t+1}-a x_t)^2
127 | \right).
128 | $$
129 | Taking the log and differentiating with respect to $a$ and equating $0$ we get $\estim{a}_{ML}=\frac{\sum x_{t+1}x_{t}}{\sum x_t^2}$.
130 |
131 | We again see the power of the ML device.
132 | Could we have arrived at this estimator by intuition alone? Hmmm... maybe.
133 | See that $Cov[X_{t+1},X_t] = a \; Var[X_t] \Rightarrow a=\frac{Cov[X_{t+1},X_t]}{Var[X_t]}$.
134 | So $a$ can also be derived using the moment matching method which is probably more intuitive.
135 |
136 | \end{example}
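A minimal R sketch of this estimator on a simulated AR(1) path; the drift, noise level, and series length are arbitrary illustration choices:

```r
set.seed(1)
a <- 0.8; sigma <- 1; T.len <- 1e4
x <- numeric(T.len)
for (t in 2:T.len) x[t] <- a * x[t - 1] + rnorm(1, sd = sigma)
a.hat <- sum(x[-1] * x[-T.len]) / sum(x[-T.len]^2)   # the ML (least squares) estimate
a.hat
```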
137 |
138 |
139 |
140 |
141 | \begin{example}[Linear Regression]
142 |
143 | Estimate the $p$ dimensional $\be$ in $\y \sim \gauss{X\be,\sigma^2 I}$, where $\y$ is an $n$ dimensional random vector.
144 | Recalling the multivariate Gaussian PDF:
145 | $$
146 | p_{\mu,\Sigma}(y) =
147 | (2 \pi)^{-n/2} |\Sigma|^{-1/2} \exp\left(
148 | -\frac{1}{2} (y-\mu)' \Sigma^{-1} (y-\mu)
149 | \right)
150 | $$
151 | So in the regression setup:
152 | $$
153 | \lik(\be)=
154 | p_{\be,\sigma^2}(y) =
155 | (2 \pi)^{-n/2} |\sigma^2 I|^{-1/2} \exp\left(
156 | -\frac{1}{2 \sigma^2} \normII{y-X\be}^2
157 | \right)
158 | $$
159 | and $\estim{\be}_{ML}$ equals
160 | \begin{align}
161 | \estim{\be}_{ML}=(X'X)^{-1} X'y.
162 | \end{align}
163 |
164 |
165 | \end{example}
166 |
167 |
168 | \section{M-Estimation and Empirical Risk Minimization}
169 | \label{sec:m_estimation}
170 |
171 | M-Estimation, known as Empirical Risk Minimization (ERM) in the machine learning literature, is a very wide framework which stems from statistical decision theory.
172 | The underlying idea is that each realization of $\x$ incurs some loss, and we seek to find a ``policy'', in this case a parameter $\theta^*$, that minimizes the average loss.
173 | In the econometric literature, we do not incur a loss but rather gain a utility; we thus seek a policy that maximizes the average utility.
174 |
175 | \begin{definition}[Loss Function]
176 | The penalty for predicting $\theta$ when observing $x$:
177 | \begin{align}
178 | \loss(x;\theta).
179 | \end{align}
180 |
181 | \end{definition}
182 | \begin{definition}[Risk Function]
183 | The expected loss:
184 | \begin{align}
185 | \risk(\theta):=\expect{\loss(\x;\theta)}.
186 | \end{align}
187 |
188 | \end{definition}
189 | Then the best prediction, $\theta^*$, is the minimizer of the risk:
190 | \begin{align}
191 | \label{eq:risk_min}
192 | \theta^*:= \argmin{\theta}{\risk(\theta)}.
193 | \end{align}
194 |
195 | As we do not know the distribution of $\x$, we cannot solve Eq.(\ref{eq:risk_min}), so we minimize the \emph{empirical} risk.
196 | \begin{definition}[Empirical Risk]
197 | The average loss in the sample:
198 | \begin{align}
199 | \riskn(\theta):=\expectn{\loss(x;\theta)}=\frac{1}{n}\sum_i \loss(x_i,\theta).
200 | \end{align}
201 | \end{definition}
202 |
203 | A prediction that can actually be computed from the data is thus the empirical risk minimizer $\estim{\theta}$:
204 | \begin{align}
205 | \label{eq:empirical_risk_min}
206 | \estim{\theta}:= \argmin{\theta}{\riskn(\theta)}.
207 | \end{align}
208 |
209 |
210 |
211 | \begin{example}[Squared Loss]
212 | \label{eg:squared_loss}
213 |
214 | Let $\loss(x;\theta)=(x-\theta)^2$. Then
215 | $
216 | \risk(\theta) =
217 | \expect{(\x-\theta)^2} =
218 | (\expect{\x}-\theta)^2 + Var[\x].
219 | $
220 | Clearly $Var[\x]$ does not depend on $\theta$ so that $\risk(\theta)$ is minimized by $\theta^*=\expect{\x}$.
221 | \textbf{We thus say that the expectation of a random variable is the minimizer of the squared loss.}
222 |
223 | How do we estimate the population expectation? Well, a natural estimator is the empirical mean, which is also the minimizer of the empirical risk $\riskn(\theta)$. The proof is immediate by differentiating.
224 | \end{example}
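A minimal R sketch verifying that the empirical mean minimizes the empirical squared-error risk; the simulated data are an arbitrary illustration choice:

```r
set.seed(1)
x <- rnorm(100, mean = 3)
riskn <- function(theta) mean((x - theta)^2)      # the empirical risk
optimize(riskn, interval = range(x))$minimum      # numerical minimizer...
mean(x)                                           # ...agrees with the empirical mean
```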
225 |
226 |
227 | \begin{example}[Ordinary Least Squares (OLS)]
228 | \label{eg:OLS}
229 | Define the loss $\loss(y,x;\be):=\frac{1}{2}(y-x\be)^2$.
230 | Computing the risk, $\expect{\frac{1}{2} \normII{\y-\x\be}^2}$ will require dealing with the joint distribution of $(\x,\y)$.
231 | We don't really care about that right now.
232 | We merely want to see that the empirical risk minimization problem is actually the classical OLS problem.
233 | And well, it is (by definition actually, dropping the immaterial $1/n$ factor):
234 | \begin{align*}
235 | \riskn(\be)=\sum_{i=1}^n \frac{1}{2}(y_i-x_i\be)^2 = \frac{1}{2}\normII{y-X\be}^2.
236 | \end{align*}
237 | Minimization is easiest with vector derivatives, but I will stick to regular derivatives:
238 | \begin{align*}
239 | \deriv{\riskn(\be)}{{\be_j}} = \sum_i \left[ (y_i-\sum_{k=1}^p x_{ik}\be_k)(-x_{ij}) \right]
240 | \end{align*}
241 | Equating to $0$ for all $j$ simultaneously yields the normal equations $X'y=X'X\be$.
242 | Putting the solution in matrix notation we get
243 | \begin{align}
244 | \estim{\be}_{OLS}=(X'X)^{-1} X'y.
245 | \end{align}
246 |
247 | \end{example}
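A minimal R sketch of the OLS solution; the simulated design, coefficients, and sample size are arbitrary illustration choices:

```r
set.seed(1)
n <- 100; p <- 3
X <- cbind(1, matrix(rnorm(n * (p - 1)), n, p - 1))   # design matrix with an intercept column
beta <- c(1, 2, -1)
y <- as.vector(X %*% beta + rnorm(n))
beta.hat <- solve(t(X) %*% X, t(X) %*% y)             # (X'X)^{-1} X'y
cbind(beta.hat, coef(lm(y ~ X - 1)))                  # agrees with lm() (no extra intercept)
```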
248 |
249 |
250 | \section{Notes}
251 |
252 | \subsection{Maximum Likelihood}
253 | Maximum likelihood estimators are a particular instance of M-estimators, if we set the loss function to be the negative log likelihood of the (true) sampling distribution.
254 |
255 |
256 | \subsection{Choosing the Loss Function}
257 | While the squared error loss is by far the most popular, we should not revert to it automatically.
258 | There are several considerations when choosing the loss function.
259 | Most ERM learning methods can be applied with different loss functions.
260 |
261 | The first consideration is computational complexity: if you choose a loss function that leads to a non-convex empirical risk, you are in trouble. There are no guarantees you will be able to find the risk minimizer in finite computing time.
262 |
263 | The second consideration is the nature of the outcome $y$. Some loss functions are more appropriate to continuous $y$'s and some for categorical $y$'s. Typical loss functions for continuous $y$'s are the squared loss, absolute loss, and hinge loss.
264 | Typical loss functions for categorical $y$'s are the Binomial likelihood loss (also known as the Cross Entropy, or Deviance), and the hinge loss.
265 |
266 | A third consideration, which is rarely given the importance it should get, is ``What is the meaning of $\theta^*$''? Or, ``What are we actually estimating''?
267 | As we have seen in Example~\ref{eg:squared_loss}, the squared loss implies we are aiming to estimate the population mean.
268 | What are we estimating when we use the hinge loss? The binomial loss?
269 | We will not discuss these matters, as we are discussing methods where these choices have already been made for us.
270 | The day you start devising your own learning algorithms, you will need to give some thought to this question.
271 |
--------------------------------------------------------------------------------
/notes/graphics.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Graphics"
3 | author: "Jonathan Rosenblatt"
4 | date: "June 7, 2015"
5 | output: html_document
6 | ---
7 |
8 | The scripts producing the graphics in the class notes.
9 |
10 |
11 | # Imputing missing data
12 | Create complete data
13 | ```{r create}
14 | library(magrittr); n <- 20 # magrittr provides the %>% pipe used below
15 | noise <- rnorm(n*2) %>% matrix(ncol=2)
16 | signal <- outer(rexp(n,1/2),c(1,1)) %>% scale
17 | x <- signal + noise
18 | x.range <- 1.1* range(x[,1]); y.range <- 1.1* range(x[,2])
19 | plot(signal, xlab='Movie 1', ylab='Movie 2', xlim = x.range, ylim = y.range)
20 | points(x, pch=19)
21 | arrows(x0=signal[,1], y0=signal[,2], x1=x[,1], y1=x[,2], col='darkgrey')
22 | ```
23 |
24 | Then censor some points
25 | ```{r censor}
26 | censoring.ind <- cbind(1:n, sample(c(NA,1,2), n, replace=TRUE, prob = c(2,1,1)))
27 | x.censored <- x
28 | x.censored[censoring.ind] <- NA
29 | points(x.censored, col='red', pch=19) # Observed points
30 | #So this is what we know
31 | x.censored.1.ind <- (censoring.ind[,2]==1) %>% sapply(isTRUE)
32 | x.censored.2.ind <- (censoring.ind[,2]==2) %>% sapply(isTRUE)
33 | # plot(x.censored)
34 | abline(h=x.censored[x.censored.1.ind,2], lty=2)
35 | abline(v=x.censored[x.censored.2.ind,1], lty=2)
36 | ```
37 |
38 |
39 | Let's try to impute using a 1D linear space embedding and reconstruction:
40 | ```{r}
41 | x.censored.clean <- x.censored %>% na.omit
42 | svd.1 <- x.censored.clean %>% svd
43 | d.2 <- diag(svd.1$d)
44 | d.2[2,2] <- 0
45 | x.censored.reduced <- svd.1$u %*% d.2 %*% t(svd.1$v)
46 | points(x.censored.reduced, col='green', pch=19)
47 | lm.1 <- lm(x.censored.reduced[,2]~x.censored.reduced[,1])
48 | abline(lm.1, col='darkgreen')
49 | ```
50 |
51 |
52 |
53 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/notes/introduction.tex:
--------------------------------------------------------------------------------
1 | \chapter{Introduction}
2 | \label{sec:introduction}
3 |
4 | This text is intended to collect many machine learning methods and algorithms, present them, and organize them.
5 | The treatment of the different concepts attempts to be as intuitive as possible, and mathematics is presented only when unavoidable, or when adding special insight.
6 |
7 | Extra effort has been put into organizing the methods and algorithms along some fundamental building blocks, which are now briefly presented.
8 |
9 | \begin{description}
10 |
11 | \item[The learning problem]
12 | The first distinction between the different methods is along the tasks they perform, which closely corresponds to the type of data at hand.
13 | This includes Supervised Learning (\S\ref{sec:supervised}) and Unsupervised Learning (\S\ref{sec:unsupervised}).
14 | Within each, we find several sub-tasks:
15 | \begin{description}
16 |
17 | \item[Supervised Learning] Includes classification tasks, and regression tasks. The first, predicting a categorical outcome, and the latter, a continuous outcome.
18 |
19 | \item[Unsupervised Learning] Includes the learning of the data generating distribution, or related tasks such as detecting high density regions and clustering.
20 | \end{description}
21 |
22 | As we will see, not all learning tasks fall into these categories. Collaborative Filtering (\S\ref{sec:collaborative_filtering}) is an example of a learning task that is neither. It is a missing data imputation task.
23 |
24 | \item[An optimization problem vs. an algorithm]
25 | Both supervised and unsupervised learning methods can be classified as either an explicit algorithm, or as an optimization problem, agnostic to the optimization algorithm used to solve it.
26 |
27 | \item[Dimension Reduction]
28 | Both supervised and unsupervised learning methods can include a dimensionality reduction stage.
29 | This can be motivated by the need to reduce computational complexity, to apply low-dimensional algorithms down the road, to allow a visualization of the data, or simply to allow some human tractable interpretation of the data.
30 | This is discussed in Appendix~\ref{apx:dim_reduce}.
31 |
32 | We can further stratify dimensionality reduction methods along these lines:
33 | \begin{description}
34 | \item[Linear-Space vs. Non-Linear-Space Embeddings]
35 | When reducing the dimension of the data, it can be mapped (embedded) into a linear subspace of the original space, or a non linear one.
36 |
37 | \item[Linear vs. Non-Linear Space Embeddings]
38 | Not to be confused with the previous item.
39 | The dimensionality reduction can be a linear operation on the data or a non-linear one.
40 |
41 | \item[Learning an Embedding vs. Learning an Embedding Function]
42 | When learning a mapping to a lower dimensional space, we can map the original data points (an embedding), or learn a mapping of the whole data space (an embedding function).
43 | \end{description}
44 |
45 | \item[Kernel Trick]
46 | Both supervised and unsupervised learning methods can include a ``kernel trick''.
47 | This will happen when we wish to learn complex functions of the data, but keep computations quick.
48 | The kernel trick is applicable to methods that do not need the whole data, but rather, only some measure of similarity between the points.
49 | The idea is that many complicated functions, are merely linear combinations of the distances to other points.
50 | This is further elaborated in Appendix~\ref{apx:rkhs}.
51 |
52 | \item[Generative vs. Discriminative Models]
53 | Both supervised and unsupervised learning methods can benefit from an assumption on the data generating process, i.e., the sampling distribution.
54 | Generative models are those where we assume this process.
55 | In discriminative models, which appear in supervised learning, we do not assume the data generating process, but merely the nature of the relation between features and outcome.
56 |
57 | \item[Feature based vs. Graph Based]
58 | Unsupervised learning tasks can be classified into those that require the full features of the data, and those that require only some measure of similarity between data points. The latter can be seen as graph based methods, where the similarities are represented as a graph.
59 |
60 | \item[Fully Observed vs. Latent Space Models]
61 | Both supervised and unsupervised learning methods can include unobservable, i.e. latent, variables.
62 |
63 | \end{description}
64 |
65 |
66 |
67 | \paragraph{Fully Automated Processes?}
68 | The machine learning literature draws heavily from the statistical literature.
69 | You should bear in mind that the ultimate goal of machine learning is to replace a ``hard-coded'' algorithm, which externalizes the programmer's knowledge, with a self-teaching algorithm.
70 | It may thus seem that problems like visualization do not belong in the realm of machine learning, as they are not completely automated.
71 | This is not completely accurate because, while we want the \emph{application} stage of an algorithm to be automated, we can sometimes allow for a human to be involved in the \emph{learning} stage.
72 |
73 |
74 | \paragraph{Notation}
75 | The notation conventions may seem non standard as they borrow from several lines of literature.
76 | These conventions were chosen as we find them to be clear and concise.
77 | They are collected in Appendix \ref{apx:notation}.
78 |
79 | \paragraph{Sources}
80 | This text draws mostly from \cite{hastie_elements_2003} and \cite{shalev-shwartz_understanding_2014}.
81 | The former is freely available online.
82 | For a softer introduction, with more hands-on examples, see \cite{james_introduction_2013}, also freely available online.
83 | All books are very well written and strongly recommended.
84 | More references can be found in the Bibliography (Appendix \ref{sec:bibliography}).
85 |
86 |
--------------------------------------------------------------------------------
/notes/notes.loa:
--------------------------------------------------------------------------------
1 | \contentsline {algorithm}{\numberline {1}{\ignorespaces Random Forest}}{27}{algorithm.1}
2 | \contentsline {algorithm}{\numberline {2}{\ignorespaces Rotation Forest}}{27}{algorithm.2}
3 | \contentsline {algorithm}{\numberline {3}{\ignorespaces Forward Search}}{32}{algorithm.3}
4 | \contentsline {algorithm}{\numberline {4}{\ignorespaces PCA Regression}}{33}{algorithm.4}
5 | \contentsline {algorithm}{\numberline {5}{\ignorespaces Commitee Methods}}{36}{algorithm.5}
6 | \contentsline {algorithm}{\numberline {6}{\ignorespaces Model Averaging}}{37}{algorithm.6}
7 | \contentsline {algorithm}{\numberline {7}{\ignorespaces Stacking}}{37}{algorithm.7}
8 | \contentsline {algorithm}{\numberline {8}{\ignorespaces Bagging}}{38}{algorithm.8}
9 | \contentsline {algorithm}{\numberline {9}{\ignorespaces Jackknife}}{43}{algorithm.9}
10 | \contentsline {algorithm}{\numberline {10}{\ignorespaces Cross Validation}}{43}{algorithm.10}
11 | \contentsline {algorithm}{\numberline {11}{\ignorespaces Bootstrap}}{44}{algorithm.11}
12 | \contentsline {algorithm}{\numberline {12}{\ignorespaces K-Means}}{72}{algorithm.12}
13 | \contentsline {algorithm}{\numberline {13}{\ignorespaces K-Medoids}}{73}{algorithm.13}
14 | \contentsline {algorithm}{\numberline {14}{\ignorespaces Spectral Clustering}}{76}{algorithm.14}
15 |
--------------------------------------------------------------------------------
/notes/notes.loe:
--------------------------------------------------------------------------------
1 | \addvspace {10\p@ }
2 | \contentsline {example}{\numberline {2.1.1}Example\thmtformatoptarg {Exponential Rate}}{11}{example.2.1.1}
3 | \contentsline {example}{\numberline {2.1.2}Example\thmtformatoptarg {Linear Regression}}{11}{example.2.1.2}
4 | \contentsline {example}{\numberline {2.2.1}Example\thmtformatoptarg {Exponential rate}}{12}{example.2.2.1}
5 | \contentsline {example}{\numberline {2.3.1}Example\thmtformatoptarg {Exponential rate}}{12}{example.2.3.1}
6 | \contentsline {example}{\numberline {2.3.2}Example\thmtformatoptarg {Discrete time Markov Chain}}{13}{example.2.3.2}
7 | \contentsline {remark}{\numberline {2.3.1}Remark\thmtformatoptarg {Confession}}{13}{remark.2.3.1}
8 | \contentsline {example}{\numberline {2.3.3}Example\thmtformatoptarg {Autoregression of order 1 (AR(1))}}{13}{example.2.3.3}
9 | \contentsline {example}{\numberline {2.3.4}Example\thmtformatoptarg {Linear Regression}}{14}{example.2.3.4}
10 | \contentsline {definition}{\numberline {2.1}Definition\thmtformatoptarg {Loss Function}}{14}{definition.2.1}
11 | \contentsline {definition}{\numberline {2.2}Definition\thmtformatoptarg {Risk Function}}{14}{definition.2.2}
12 | \contentsline {definition}{\numberline {2.3}Definition\thmtformatoptarg {Empirical Risk}}{15}{definition.2.3}
13 | \contentsline {example}{\numberline {2.4.1}Example\thmtformatoptarg {Squared Loss}}{15}{example.2.4.1}
14 | \contentsline {example}{\numberline {2.4.2}Example\thmtformatoptarg {Ordinary Least Squares (OLS)}}{15}{example.2.4.2}
15 | \addvspace {10\p@ }
16 | \contentsline {remark}{\numberline {3.1.1}Remark\thmtformatoptarg {No Sampling Distribution}}{18}{remark.3.1.1}
17 | \contentsline {remark}{\numberline {3.1.2}Remark\thmtformatoptarg {OLS Extensions}}{19}{remark.3.1.2}
18 | \contentsline {remark}{\numberline {3.1.3}Remark\thmtformatoptarg {Generalized Linear Models (GLM)}}{21}{remark.3.1.3}
19 | \contentsline {remark}{\numberline {3.1.4}Remark}{21}{remark.3.1.4}
20 | \contentsline {remark}{\numberline {3.1.5}Remark\thmtformatoptarg {Name Origins}}{22}{remark.3.1.5}
21 | \contentsline {remark}{\numberline {3.1.6}Remark\thmtformatoptarg {Solve the right problem}}{23}{remark.3.1.6}
22 | \contentsline {remark}{\numberline {3.1.7}Remark\thmtformatoptarg {Not a pure ERM}}{23}{remark.3.1.7}
23 | \contentsline {remark}{\numberline {3.1.8}Remark\thmtformatoptarg {Not a pure ERM}}{24}{remark.3.1.8}
24 | \contentsline {remark}{\numberline {3.1.9}Remark\thmtformatoptarg {Universal Approximator}}{24}{remark.3.1.9}
25 | \contentsline {remark}{\numberline {3.3.1}Remark\thmtformatoptarg {Hypothesis Testing Driven Variable Selection}}{32}{remark.3.3.1}
26 | \contentsline {remark}{\numberline {3.3.2}Remark\thmtformatoptarg {PCAR and Ridge Regression}}{33}{remark.3.3.2}
27 | \contentsline {remark}{\numberline {3.4.1}Remark\thmtformatoptarg {LDA and OLS classification}}{35}{remark.3.4.1}
28 | \contentsline {remark}{\numberline {3.5.1}Remark}{36}{remark.3.5.1}
29 | \addvspace {10\p@ }
30 | \contentsline {remark}{\numberline {4.2.1}Remark}{42}{remark.4.2.1}
31 | \addvspace {10\p@ }
32 | \contentsline {remark}{\numberline {5.1.1}Remark\thmtformatoptarg {Unsupervised Learning in the ERM framework}}{48}{remark.5.1.1}
33 | \contentsline {remark}{\numberline {5.2.1}Remark}{48}{remark.5.2.1}
34 | \contentsline {example}{\numberline {5.2.1}Example\thmtformatoptarg {First Order Univariate Markov Process}}{49}{example.5.2.1}
35 | \contentsline {remark}{\numberline {5.2.2}Remark\thmtformatoptarg {Restricted Bolzmann Machine}}{49}{remark.5.2.2}
36 | \contentsline {remark}{\numberline {5.4.1}Remark\thmtformatoptarg {Interpreting ``Linear''}}{51}{remark.5.4.1}
37 | \contentsline {remark}{\numberline {5.4.2}Remark\thmtformatoptarg {Interpreting the Low Dimensional Representation}}{51}{remark.5.4.2}
38 | \contentsline {definition}{\numberline {5.1}Definition\thmtformatoptarg {SVD}}{56}{definition.5.1}
39 | \contentsline {remark}{\numberline {5.4.3}Remark\thmtformatoptarg {Classical and Least Squares MDS}}{59}{remark.5.4.3}
40 | \contentsline {remark}{\numberline {5.4.4}Remark\thmtformatoptarg {The Non-Linearity of Local MDS}}{59}{remark.5.4.4}
41 | \contentsline {remark}{\numberline {5.4.5}Remark\thmtformatoptarg {The Non-Linearity of Isomap}}{60}{remark.5.4.5}
42 | \contentsline {remark}{\numberline {5.5.1}Remark\thmtformatoptarg {Non Linear Dimensionality Reduction}}{60}{remark.5.5.1}
43 | \contentsline {remark}{\numberline {5.5.2}Remark\thmtformatoptarg {Information Bottleneck and ICA}}{62}{remark.5.5.2}
44 | \contentsline {example}{\numberline {5.6.1}Example\thmtformatoptarg {Intelligence Measure (g-factor)}}{63}{example.5.6.1}
45 | \contentsline {example}{\numberline {5.6.2}Example\thmtformatoptarg {Face Rotations}}{63}{example.5.6.2}
46 | \contentsline {remark}{\numberline {5.6.1}Remark\thmtformatoptarg {Identifiability in PCA}}{63}{remark.5.6.1}
47 | \contentsline {remark}{\numberline {5.6.2}Remark\thmtformatoptarg {Non Linear FA}}{64}{remark.5.6.2}
48 | \contentsline {remark}{\numberline {5.6.3}Remark\thmtformatoptarg {ICA and FA}}{65}{remark.5.6.3}
49 | \contentsline {example}{\numberline {5.6.3}Example\thmtformatoptarg {Intelligence Factor Continued}}{66}{example.5.6.3}
50 | \contentsline {remark}{\numberline {5.6.4}Remark\thmtformatoptarg {Projection Pursuit and ICA}}{67}{remark.5.6.4}
51 | \contentsline {remark}{\numberline {5.6.5}Remark\thmtformatoptarg {Finite Mixture Distributions}}{68}{remark.5.6.5}
52 | \contentsline {remark}{\numberline {5.6.6}Remark\thmtformatoptarg {Mixtures And the Expectation Maximization Algorithm (EM)}}{68}{remark.5.6.6}
53 | \contentsline {remark}{\numberline {5.6.7}Remark\thmtformatoptarg {Mixtures For Clustering}}{68}{remark.5.6.7}
54 | \contentsline {remark}{\numberline {5.6.8}Remark\thmtformatoptarg {Mixture in Supervise Learning}}{69}{remark.5.6.8}
55 | \contentsline {remark}{\numberline {5.7.1}Remark\thmtformatoptarg {Relation to Spectral Clustering}}{71}{remark.5.7.1}
56 | \contentsline {remark}{\numberline {5.8.1}Remark\thmtformatoptarg {The population equivalent of K-means}}{72}{remark.5.8.1}
57 | \addvspace {10\p@ }
58 | \contentsline {remark}{\numberline {6.2.1}Remark\thmtformatoptarg {Collaborative Filtering and Other Supervised Learning Methods}}{79}{remark.6.2.1}
59 | \contentsline {remark}{\numberline {6.2.2}Remark\thmtformatoptarg {Matrix Norm Notation}}{80}{remark.6.2.2}
60 | \contentsline {remark}{\numberline {6.2.3}Remark\thmtformatoptarg {Matrix Factorization and Factor Analysis}}{80}{remark.6.2.3}
61 | \contentsline {remark}{\numberline {6.2.4}Remark\thmtformatoptarg {Temporal Dynamics and Tensor Factorization}}{80}{remark.6.2.4}
62 | \addvspace {10\p@ }
63 | \contentsline {definition}{\numberline {G.1}Definition\thmtformatoptarg {Entropy}}{91}{definition.G.1}
64 | \contentsline {definition}{\numberline {G.2}Definition\thmtformatoptarg {Mutual Information}}{91}{definition.G.2}
65 | \contentsline {definition}{\numberline {G.3}Definition\thmtformatoptarg {Kullback\IeC {\textendash }Leibler Divergence}}{91}{definition.G.3}
66 |
--------------------------------------------------------------------------------
/notes/notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/notes.pdf
--------------------------------------------------------------------------------
/notes/notes.tex:
--------------------------------------------------------------------------------
1 | \documentclass[12pt,a4paper]{report}
2 |
3 |
4 | \usepackage[utf8]{inputenc}
5 | \usepackage{amsmath}
6 | \usepackage{amsfonts}
7 | \usepackage{amssymb}
8 | \usepackage{graphicx}
9 | \usepackage{amsthm}
10 | \usepackage{natbib}
11 | \usepackage{algorithm}
12 | \usepackage{algpseudocode}
13 | \usepackage{framed}
14 |
15 | \usepackage{hyperref}
16 | \AtBeginDocument{\let\textlabel\label}
17 | \hypersetup{
18 | colorlinks=true,
19 | linkcolor=black,
20 | citecolor=black,
21 | filecolor=black,
22 | urlcolor=black,
23 | }
24 |
25 |
26 | \usepackage{marginnote}
27 | \renewcommand*{\marginfont}{\scriptsize }
28 |
29 | \usepackage{thmtools} % for lists of theorems
30 |
31 |
32 | \input{commands}
33 |
34 |
35 | \author{Jonathan Rosenblatt}
36 | \title{Class Notes (experimental)}
37 |
38 |
39 | \begin{document}
40 |
41 | \maketitle
42 |
43 | \tableofcontents
44 |
45 |
46 |
47 |
48 |
49 | %%%%%%%%% Algorithms %%%%%%%%%%%
50 | \newpage
51 | \listofalgorithms
52 | \addcontentsline{toc}{chapter}{List of Algorithms}
53 |
54 | \renewcommand{\listtheoremname}{List of Definitions}
55 | \listoftheorems[ignoreall,show={definition}]
56 |
57 |
58 | \renewcommand{\listtheoremname}{List of Examples}
59 | \listoftheorems[ignoreall,show={example}]
60 |
61 |
62 |
63 | % % % Introduction % % % %
64 | \input{introduction}
65 |
66 |
67 |
68 | % % % % % Estimation % % % % %
69 | \input{estimation}
70 |
71 |
72 | % % % % % % Supervised Learning % % % % % %
73 | \input{supervised}
74 |
75 |
76 | % % % % % % Statistical Decision Theory % % % % %
77 | \input{statistical_decision}
78 |
79 |
80 | % % % % % % Unsupervised % % % % %
81 | \input{unsupervised}
82 |
83 |
84 | % % % % % % Collaborative Filtering % % % % %
85 | \input{collaborative}
86 |
87 |
88 |
89 |
90 | % % % % % % Appendices % % % % % %
91 | \newpage
92 |
93 | \appendix
94 |
95 | \input{appendices}
96 |
97 |
98 |
99 |
100 | %%%%%%%%% Bibliography %%%%%%%%%%%
101 | \newpage
102 | \addcontentsline{toc}{chapter}{Bibliography}
103 | \bibliographystyle{abbrvnat}
104 | \bibliography{Intro2MachineLearning}
105 | \label{sec:bibliography}
106 |
107 |
108 | \end{document}
--------------------------------------------------------------------------------
/notes/statistical_decision.tex:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | \chapter{Statistical Decision Theory}
5 | \label{sec:desicion_theory}
6 |
7 | This section follows the spirit of Section~7 in \cite{hastie_elements_2003}, up to some changes in notation.
8 |
9 | In Section~\ref{sec:learning}, we gave an intuitive argument for why, without some inductive bias, learning will return models with poor performance on new data.
10 | In this section we learn how to quantify the performance of a model, in particular on new data. This allows us to select among competing candidate models. It will also allow us to choose the value of the regularization parameter of each method.
11 |
12 | Figure~\ref{fig:bias_variance} demonstrates the prediction error (red curve) of some model as the model complexity increases. As can be seen, the prediction error initially decreases as the model becomes more complex, but at some point it stops improving and starts to rise.
13 | This is because, for very complex models, the reduction in bias is smaller than the increase in variance.
14 | This is the celebrated bias-variance tradeoff.\marginnote{Bias Variance Tradeoff}
15 |
16 | Once we are able to estimate the prediction error from our data, we will seek a model which minimizes this error.
17 |
18 | \begin{figure}[h]
19 | \centering
20 | \includegraphics[width=1\textwidth]{art/support-vector-machine-15-728}
21 | \caption{Overfitting:
22 | Prediction error on new data (red curve) versus the empirical prediction error (light blue).
23 | The empirical prediction error will always decrease as more complicated models are fit (moving right).
24 | The prediction error on new data, however, will not always decrease, and will typically show a local minimum.
25 | \label{fig:bias_variance}}
26 | \end{figure}
27 |
28 | Before we proceed, we now need to distinguish between several types of prediction errors.
29 | The population \emph{risk} of a model parametrized by $\theta$, was previously defined as the average loss over all possible data instances, and denoted by $\risk(\theta)$ (\S \ref{sec:m_estimation}).
30 | The empirical risk was defined as the average loss over the observed data points, and denoted by $\riskn(\theta)$.
31 | We now update these definitions to deal with the $\hyp(x)$ notation of the previous section.
32 | \begin{align}
33 | \test :=& \expectg{Y,X}{\loss(Y,\hyp(X))}, \label{eq:test_error} \\
34 | \train :=& \expectn{\loss(y,\hyp(x))} = \frac{1}{n} \sum_i \loss(y_i,\hyp(x_i)), \label{eq:training_error} \\
35 | \insample :=& \frac{1}{n} \sum_i \expectg{Y}{\loss(Y,\hyp(x_i))}, \label{eq:in_sample} \\
36 | \EPE :=& \expectg{\estim{\hyp}_n}{
37 | \expectg{Y,X}{\loss(Y,\estim{\hyp}_n(X))|\estim{\hyp}_n}
38 | }.\label{eq:epe}
39 | \end{align}
40 |
41 | Eq.(\ref{eq:test_error}) is merely a reformulation of $\risk(\theta)$ from Section~\ref{sec:m_estimation}.
42 | It captures the expected loss that a given predictor, $\hyp(X)$, will incur on average over new $X$'s and $Y$'s.
43 | This is the quantity that tells us which models perform well and which do not.
44 | It is known as the \emph{test error}, or also as the \emph{prediction error}.\marginnote{Test Error}
45 |
46 | Eq.(\ref{eq:training_error}) is the reformulation of empirical risk, $\riskn(\theta)$, we have been optimizing in Section~\ref{sec:learning}.
47 | We referred to it as the \emph{empirical risk}, but it is also known as the \emph{train error}.
48 | \marginnote{Train Error}
49 |
50 | Eq.(\ref{eq:in_sample}) is the average risk at the observed $x$'s, when given new $Y$'s\footnote{This quantity should not be unfamiliar: e.g., inference in ANOVA is performed conditional on the $x$'s, which typically stem from a designed experiment.}.
51 | This is the \emph{in sample error}.
52 | \marginnote{In Sample Error}
53 |
54 | Eq.(\ref{eq:epe}) is called the \emph{expected prediction error}, i.e., the expected loss when $\hyp$ is also re-learned.
55 | Put differently: how much would we err when (1) we are given $n$ new examples $\sample_1$; (2) we re-learn $\estim{\hyp}_n$ on $\sample_1$; (3) we compute the risk of $\estim{\hyp}_n$ (in the population, not in $\sample_1$)?
56 | We emphasize this by writing $\estim{\hyp}_n$ instead of $\hyp$.
57 | $\EPE$ is thus not a property of a particular predictor $\hyp$, but rather of a whole learning algorithm on random samples of size $n$.
58 | It could have also been written as $\risk(\text{algorithm})$, although I have not seen this notation in use.
59 | \marginnote{Expected Prediction Error}
60 |
61 |
62 | We would like to compare the performance of models based on $\test$, as this will give us an idea on the quality of the prediction on new data.
63 | Alas, computing $\test$ requires the distribution of $y$ and $x$, while we only have access to the $n$ observed samples.
64 | Can the empirical risk $\train$ estimate the unknown risk $\test$?
65 | Figure~\ref{fig:bias_variance} suggests it cannot since $\train$ underestimates $\test$.
66 | Why is this?
67 | At an intuitive level: this is because with ERM we learn the $\hyp$ with smallest error in each sample.
68 | It is thus akin to estimating the expected height in a population using the minimum of each sample: we will clearly underestimate the expectation. Then again, there is hope that we may take this minimum and debias it.
69 | This is the goal in the next sections.
70 |
71 | Before proceeding, we distinguish between two similar tasks:
72 | \begin{description}
73 | \item[Model Selection] This is the task of selecting between several candidate models.
74 | \item[Model Assessment] This is the task of assessing the prediction error (i.e., the expected loss, the risk) of a given model.
75 | \end{description}
76 |
77 |
78 |
79 | \section{Train, Validate, Test}
80 | \label{sec:train_test}
81 | If data is abundant, a trivial, assumption-free way to estimate $\test$\footnote{Think: why is $\test$ being estimated, and not $\EPE$ nor $\insample$?} is to split the data into $3$ sets.
82 | A \emph{training set}, used to learn several competing models.
83 | A \emph{validation set}, used to check the performance of the learned models and choose the best performer using some comparison measure.
84 | A \emph{test set}, used to estimate the risk of the chosen model, as the empirical risk $\train$ computed on it is unbiased for the population risk $\test$.
85 |
86 | If there is not enough data for this scheme, keep reading...
87 |
88 |
89 | \section{Unbiased Estimators of the Risk}
90 | \label{sec:risk_estimation}
91 | Under appropriate assumptions, the bias in $\train$ when estimating $\insample$\footnote{In this case, note that it is $\insample$ being estimated, and not $\test$ nor $\EPE$.} can be computed analytically, and accounted for.
92 | The bias $\insample-\train$ is called the \emph{optimism} of the algorithm.\marginnote{Optimism}
93 | Akaike's Information Criterion (AIC),
94 | the finite sample Corrected AIC (AICc),
95 | Mallow's Cp (Cp),
96 | the Bayesian Information Criterion (BIC, aka SBC, aka SBIC),
97 | the Minimum Description Length (MDL),
98 | Vapnik's Structural Risk Minimization (SRM),
99 | the Deviance Information Criterion (DIC),
100 | and the Hannan-Quinn Information Criterion (HQC),
101 | all try to estimate $\insample$ by correcting for the optimism under different assumptions.\marginnote{Cp, AIC, BIC, MDL, SRM}
102 |
103 | The differences, pros, and cons of each will not be discussed herein. Just remember what they mean when you see them in your favourite software (R!).
104 | They all have in common that you will want the model with the smallest criterion.
105 | But be careful: as they are used for model selection, they are indifferent to scaling, and thus should not be interpreted as the expected prediction error itself.
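
To fix ideas, here is the flavour of such corrections, following Chapter~7 of \cite{hastie_elements_2003} (the exact form depends on the assumptions made): for a linear model with $d$ parameters, fit to $n$ observations, and with noise variance estimate $\estim{\sigma}^2_\varepsilon$,
\begin{align*}
	\text{Cp} := \train + 2 \frac{d}{n} \estim{\sigma}^2_\varepsilon,
	\qquad
	\text{AIC} := -\frac{2}{n} \text{loglik} + 2 \frac{d}{n},
\end{align*}
i.e., the empirical risk (or the negative log-likelihood) plus a penalty which grows with the number of parameters and shrinks with the sample size.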
106 |
107 | \begin{remark}
108 | Not all model selection criteria estimate $\insample$. The Focused Information Criterion (FIC), for example, does not.
109 | \end{remark}
110 |
111 |
112 |
113 |
114 |
115 | \paragraph{Further Reading}
116 | For a brief review of AIC, BIC, MDL and SRM see Chapter 7 in \cite{hastie_elements_2003}.
117 | For a more rigorous derivation, see \cite{claeskens_model_2008}.
118 |
119 |
120 |
121 |
122 |
123 | \section{Jackknifing}
124 | \label{sec:jackknife}
125 |
126 | If concerned with overfitting, here is a simple algorithm to estimate the prediction error:
127 |
128 | \begin{algorithm}[H]
129 | \caption{Jackknife}
130 | \begin{algorithmic}
131 | \For {$i \in 1,\dots,n$}
132 | \State $\estim{\hyp}^{(i)} \gets$ the learned model with all but the $i$'th observation.
133 | \State $\loss^{(i)} \gets$ the loss of $\estim{\hyp}^{(i)}$ on the $i$'th observation.
134 | \EndFor
135 | \State \Return the average loss over $\loss^{(i)}$.
136 | \end{algorithmic}
137 | \end{algorithm}
138 |
139 | This process is called the \emph{Jackknife}, or \emph{Leave-One-Out Cross-Validation}.
140 | This algorithm returns an estimator of $\EPE$.
141 | This might be quite surprising: every split uses an almost identical sample, so why would it not estimate $\test$? See Section 7.12 in \cite{hastie_elements_2003} for details.
142 |
143 | But wait! We might be able to stabilize the variability of the estimated error in every split if, instead of leaving only a single observation aside, we leave out some more. This leads to the \emph{K-Fold Cross Validation} of the next section.
144 |
145 |
146 | \section{Cross Validation}
147 | \label{sec:cv}
148 |
149 | \begin{algorithm}[H]
150 | \caption{Cross Validation}
151 | \begin{algorithmic}
152 | \State Split the data into $\folds$ parts (``folds'').
153 | \For {$\fold \in 1,\dots,\folds$}
154 | \State $\estim{\hyp}^{(\fold)} \gets$ the model learned with all \emph{except} the observations in the $\fold$'th fold.
155 | \State $\loss^{(\fold)} \gets$ the average loss of $\estim{\hyp}^{(\fold)}$ on the observations in the $\fold$'th fold.
156 | \EndFor
157 | \State \Return the average over $\loss^{(\fold)}$ .
158 | \end{algorithmic}
159 | \end{algorithm}
160 |
161 | This simple algorithm estimates $\EPE$ without any assumption on the data generating process, and with less data than would be required for a ``train-validate-test'' scheme.
162 | Well, since it actually serves for model selection, it should be seen as a ``train-validate'' scheme without the ``test'' part. It is thus \emph{not} an unbiased estimate of $\EPE$. See Section 7.12 in \cite{hastie_elements_2003} for details.
163 |
164 | But wait again!
165 | The Cross Validation scheme resamples the data \emph{without replacement} to estimate $\EPE$. Could we have sampled it \emph{with} replacement? Yes. This is the idea underlying the \emph{Bootstrapping} scheme.
166 |
167 |
168 | \section{Bootstrapping}
169 | \label{sec:bootstrap}
170 |
171 | Here is the simplest version of Bootstrap validation:
172 |
173 | \begin{algorithm}[H]
174 | \caption{Bootstrap}
175 | \begin{algorithmic}
176 | \For {$\bootstrap \in 1,\dots,\bootstraps$}
177 | \State $\sample^\bootstrap \gets$ $n$ randomly selected observations, with replacement, from the original data.
178 | \State $\estim{\hyp}^{\bootstrap} \gets$ the model learned with $\sample^\bootstrap$.
179 | \State $\loss^{\bootstrap} \gets$ the average loss of $\estim{\hyp}^{\bootstrap}$ on the observations in the \emph{original} data.
180 | \EndFor
181 | \State \Return the average of $\loss^{\bootstrap}$.
182 | \end{algorithmic}
183 | \end{algorithm}
184 |
185 | This algorithm is not a good estimator of $\EPE$, as observations play a role both in learning and in validation.
186 | Several corrections are available. For details see Section 7.11 in \cite{hastie_elements_2003}.
187 |
188 | The Bootstrap is a very general scheme, which can be used not only for model validation, but also for assessing many other statistical properties of an estimator. It is possibly best known when used for hypothesis testing.
189 | For more on the Bootstrap, see \cite{efron_introduction_1994}.
190 |
191 |
192 | \subsection{.632 Rule}
193 | [TODO]
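
The basic idea, following Section~7.11 of \cite{hastie_elements_2003}, is to blend the (optimistic) train error with the leave-one-out bootstrap estimate $\estim{\text{Err}}^{(1)}$, i.e., the average loss of each observation over the bootstrap samples that did not include it:
\begin{align*}
	\estim{\text{Err}}^{(.632)} := 0.368 \, \train + 0.632 \, \estim{\text{Err}}^{(1)}.
\end{align*}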
--------------------------------------------------------------------------------
/project.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Project"
3 | author: "Jonathan Rosenblatt"
4 | date: "March 24, 2015"
5 | output: html_document
6 | ---
7 |
8 | Here are the guidelines for the course's concluding project.
9 | The project is based on the [Bike Sharing Demand](https://www.kaggle.com/c/bike-sharing-demand) competition at Kaggle.
10 | You are required to submit a prediction to Kaggle, and a report on the process to me.
11 |
12 | # Dates
13 | Teaming up: no later than __27.4.2015__.
14 | Submit prediction to Kaggle: __29.5.2015__.
15 | Submit report to Jonathan: __26.6.2015__.
16 |
17 | Recommended time-table:
18 |
19 | 1. During the Passover vacation, download the data. Make sure you can load it and practice `dplyr` and `lubridate` on it.
20 | 2. After Passover, find your team and notify me.
21 | 3. Keep revisiting the data as we progress and study new techniques. Don't leave everything to submission date.
22 |
23 |
24 | # Guidelines
25 |
26 | 1. Your task is to participate in the [Bike Sharing Demand](https://www.kaggle.com/c/bike-sharing-demand) competition. The competition ends on __29.5.2015__, when you will have to submit your predictions to Kaggle.
27 | 2. You can do so in pairs, or trios.
28 | 3. By the end of the course you will need to submit to me a report documenting the process.
29 | - No longer than 8 pages (not including appendices).
30 | - Submitted by mail which includes:
31 | - A PDF file with the report.
32 | - Author names and IDs.
33 | - Should contain the sections:
34 | - Background: Some background on the competition.
35 | - Scoring: The scoring criterion in the competition. What loss function with what data?
36 | - The data: What data was provided for learning? What files in what formats? Which variables? How did you handle them?
37 | - Algorithms: Which learning algorithms did you try?
38 | - Results: What score did you achieve? What was your ranking in the competition?
39 | - Discussion: Why were you successful/unsuccessful? What other ideas would you have liked to try? What were the major challenges?
40 | - Code should be added in appendices.
41 | 4. Feel free to use the course's forums for questions, especially regarding the use of Kaggle and R. Make sure, however, that you do not share your solutions.
42 | 5. Any non-trivial choice you made in the project needs to be justified: tell me "why", not only "what".
43 |
44 |
--------------------------------------------------------------------------------
/sample_questions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Questions"
3 | author: "Jonathan Rosenblatt"
4 | date: "June 1, 2015"
5 | output:
6 | html_document:
7 | toc: no
8 | pdf_document:
9 | toc: no
10 | ---
11 |
12 | # Sample Questions
13 | ```{r preamble, cache=TRUE, echo=FALSE, results='hide'}
14 | suppressPackageStartupMessages(library(knitr))
15 | suppressPackageStartupMessages(library(magrittr)) # for piping
16 | suppressPackageStartupMessages(library(plyr))
17 | suppressPackageStartupMessages(library(dplyr)) # for handling data frames
18 |
19 | .iris <- iris[,1:4] %>% scale
20 | .iris.y <- iris$Species=='virginica'
21 | .iris.dissimilarity <- dist(.iris)
22 |
23 | suppressPackageStartupMessages(library(arules))
24 | data("Groceries")
25 | rules <- apriori(Groceries, parameter = list(support=0.001, confidence=0.5))
26 |
27 | l2 <- function(x) x^2 %>% sum %>% sqrt
28 | l1 <- function(x) abs(x) %>% sum
29 | MSE <- function(x) x^2 %>% mean
30 | missclassification <- function(tab) sum(tab[c(2,3)])/sum(tab)
31 |
32 |
33 | suppressPackageStartupMessages(library(ElemStatLearn)) # for data
34 | data("prostate")
35 | data("spam")
36 |
37 |
38 |
39 |
40 | # Continous outcome:
41 | prostate.train <- prostate %>%
42 | filter(train) %>%
43 | select(-train)
44 | prostate.test <- prostate %>%
45 | filter(!train) %>%
46 | select(-train)
47 | y.train <- prostate.train$lcavol
48 | X.train <- prostate.train %>% select(-lcavol) %>% as.matrix
49 | y.test <- prostate.test$lcavol
50 | X.test <- prostate.test %>% select(-lcavol) %>% as.matrix
51 |
52 |
53 |
54 | # Categorical outcome:
55 | n <- nrow(spam)
56 |
57 | train.prop <- 0.66
58 | train.ind <- c(TRUE,FALSE) %>%
59 | sample(size = n, prob = c(train.prop,1-train.prop), replace=TRUE)
60 | spam.train <- spam[train.ind,]
61 | spam.test <- spam[!train.ind,]
62 |
63 | y.train.spam <- spam.train$spam
64 | X.train.spam <- spam.train %>% select(-spam) %>% as.matrix
65 | y.test.spam <- spam.test$spam
66 | X.test.spam <- spam.test %>% select(-spam) %>% as.matrix
67 |
68 | spam.dummy <- spam %>% mutate(spam=as.numeric(spam=='spam'))
69 | spam.train.dummy <- spam.dummy[train.ind,]
70 | spam.test.dummy <- spam.dummy[!train.ind,]
71 |
72 | suppressPackageStartupMessages(library(glmnet))
73 | lasso.1 <- glmnet(x=X.train, y=y.train, alpha = 1)
74 |
75 | ```
76 |
77 |
78 |
79 | 1. Based on the following Biplot... \newline
80 | ```{r, echo=FALSE, eval=TRUE, fig.width = 6, fig.height = 4 }
81 | pca <- prcomp(.iris)
82 | ggbiplot::ggbiplot(pca) # better!
83 | ```
84 | a. How many variables were in the original data?
85 | a. What original variables are captured by the first principal component?
86 | a. What original variables are captured by the second principal component?
87 | a. How many groups/clusters do you see in the data?
88 | 1.
89 | ```{r, eval=FALSE}
90 | n <- 100
91 | p <- 10
92 | X <- rnorm(n*p) %>% matrix(ncol = p, nrow=n)
93 | sigma <- 1e1
94 | epsilon <- rnorm(n, mean = 0, sd = sigma)
95 | y <- X %*% beta + epsilon
96 | ```
97 | a. What does the code do?
98 | a. What is the dimension of `beta`?
99 | a. Can I fit a neural network to the data? Explain.
100 | 1. How does the graphical model alleviate the parameter dimensionality problem?
101 | 1. What is the difference between FA and ICA?
102 | 1. What is the cutoff of OLS classification with a -1,3 encoding?
103 | 1. Name three clustering methods. Explain them.
104 | 1. You want to cluster individuals based on their LinkedIn acquaintances: name an algorithm you __cannot__ use.
105 | 1.
106 | ```{r Cross Validation, eval=FALSE}
107 | hmmm <- 10
108 | ahhh <- sample(1:hmmm, nrow(data), replace = TRUE)
109 | that <- NULL
110 |
111 | for (yup in 1:hmmm){
112 | wow <- data[ahhh!=yup,]
113 | arrrg <- data[ahhh==yup,]
114 | ok <- lm(y~. ,data = wow)
115 | nice <- predict(ok, newdata=arrrg)
116 | good <- nice - arrrg$y
117 | that <- c(that, good)
118 | }
119 |
120 | MSE(that)
121 | ```
122 | a. What is the method implemented in the code?
123 | a. What problem does the method solve?
124 | 1.
125 | ```{r, eval=FALSE}
126 | y1 <- prcomp(.iris, scale. = TRUE)
127 | y2 <- y1$x[,1:2]
128 | y3 <- glm(.iris.y~y2)
129 | ```
130 | a. Knowing that `.iris.y` is a two-level categorical variable, what does the code do?
131 | a. What could be a motivation for the proposed method?
132 | 1.
133 | ```{r, eval=FALSE}
134 | y1 <- prcomp(.iris, scale. = TRUE)
135 | y2 <- y1$x[,1:2]
136 | y3 <- kmeans(y2,3)
137 | ```
138 | a. What does the code do?
139 | a. What can be the motivation for the proposed method?
140 | 1. Two scientists claim to have found two unobservable movie attributes, that drive viewers' satisfaction in the Netflix data (movie ratings data). They both used the same data and factor analysis. One claims the factors are the "action factor" and "drama factor". The other claims it is "comedy factor" and the "animation factor". Try to resolve the situation with your knowledge of factor analysis.
141 | 1.
142 | $\arg\min_\beta \{ \frac{1}{n}\sum_i (y_i-x_i\beta)^2 + \frac{\lambda}{2} \Vert\beta\Vert_2^2 \}$
143 | a. What is the name of the problem above?
144 | a. Does the solution enjoy the sparsity property?
145 | a. What is the regularization parameter? Name two methods for choosing it.
146 | 1. For the purpose of interpreting the predictor, would you prefer the CART or the NNET? Explain.
147 | 1. In order to estimate the covariance matrix in a Gaussian graphical model: should I estimate it directly or via its inverse? Explain.
148 | 1. Describe a method for selecting the number of mixing components in a mixture model using train-test samples.
149 | 1. Describe the stages of an algorithm to simulate $n$ samples from a two-state hidden Markov model. Assume you can generate data from Bernoulli and Gaussian distributions.
150 | 1. What assumption in ICA solves the FA rotation problem?
151 | 1. What is the LASSO ERM problem? Write the formula.
152 | 1. What is the OLS ERM problem? Write the formula.
153 | 1. What is the ridge ERM problem? Write the formula.
154 | 1. Name two algorithms for unbiased estimation of the population risk $R(\theta)$.
155 | 1. Name two unbiased estimators of the in-sample--prediction-error:
156 | $\bar{R}(f):=\frac{1}{n} \sum_i E_Y[l(Y,f(x_i))]$.
157 | 1. Suggest an algorithm to choose the number of principal components using cross validation. Write in pseudo-code.
158 | 1. Can the principal components in the PCA problem be estimated using maximum likelihood? Explain.
159 | 1. What can the logistic regression estimate that the SVM cannot?
160 | 1. Can any function be approximated using the LASSO? Put differently- does the LASSO have the Universal Approximator property?
161 | 1. Write the Bernoulli likelihood loss function. To what type of $y$ does it apply? What class of `R` objects holds this data type?
162 | 1. Name two methods for dimensionality reduction in supervised learning. Explain each briefly.
163 | 1. Here is some pseudo-code:
164 | - Set $M$ candidate learning algorithms.
165 | - For $m \in 1,\dots,M$, do
166 | - $\hat{f}^m(x) :=$ the predictor learned with the $m$'th algorithm.
167 | - EndFor
168 | - Set $\bar{f}(x) :=\frac{1}{M} \sum_{m=1}^M \hat{f}^m(x)$.
169 | - Return $\bar{f}(x)$.
170 | a. What is the name of the method above?
171 | a. What is the problem the method is designed to solve?
172 | a. Suggest an improvement to the method.
173 | 1. How many parameters need to be estimated to learn a multivariate Gaussian distribution when $p=15$? How does a graphical model help with this problem?
174 | 1.
175 | ```{r, cache=TRUE, echo=FALSE}
176 | rules %>% sort(by='lift') %>% head(1) %>% inspect()
177 | ```
178 | a. What method will return this output?
179 | a. Interpret the output.
180 | 1. One researcher applied k-means clustering on the first two PCs. Another applied k-medoids on the output of classical MDS with Euclidean distances. Can the clusters differ? Explain.
181 | 1. Suggest a method to visualize a social network. Explain.
182 | 1. A researcher wishes to cluster songs (not the lyrics. the actual audio files). Suggest two methods that will allow this and discuss their possible advantages and disadvantages.
183 | 1. What is the difference between "complete" and "single" linkage in agglomerative clustering?
184 | 1. $(X'X+\lambda I)^{-1}X'y$. This is the solution to what problem?
185 | 1. What will happen if we try to learn an empirical risk minimizer with no inductive bias? What is the name of the phenomenon?
186 | 1. Name two justifications for the regularization term in LASSO regression. How do we know predictions can only improve with a small regularization?
187 | 1. What method learns a hypothesis in the class $f(x)= \sum_{m=1}^M c_m I_{\{x \in R_m \}}$?
188 |     a. What is the name of the hypothesis class?
189 |     a. Name a particularly desirable property of this class (and thus of the method).
190 | 1. If I am using the Deviance likelihood as a loss function-- what type is my predicted variable?
191 | 1. Having learned a mixture distribution $p(x)=\sum_{k=1}^K \pi_k p_k(x)$: how can I use it for clustering?
192 | 1. Why can't we produce a bi-plot for MDS while we can for PCA?
193 | 1. What is the difference between a Streaming Algorithm and a Batch Algorithm?
194 | 1. Why is prediction an easier task than classical statistical inference (from the Estimation course)?
195 | 1. What are the two historical motivations underlying PCA?
196 | 1. We saw that for the PCA problem, it suffices to know only the correlations between variables, $X'X$. Why does this not suffice for OLS?
197 | 1. In what course did you cover methods for unsupervised learning of a parametric generative model? Name two learning methods.
198 |
199 |
200 |
--------------------------------------------------------------------------------
/sample_questions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/sample_questions.pdf
--------------------------------------------------------------------------------
/self_practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Self Practice"
3 | author: "Jonathan Rosenblatt"
4 | date: "March 23, 2015"
5 | output: html_document
6 | ---
7 |
8 | Some exercises to practice your initial R skills.
9 | Make sure you can answer. No need to submit.
10 |
11 | 1. What is the difference between .csv and tab delimited data files? What function imports and exports csv files?
12 | 2. What is the average departure delay of the flights that departed on the Sundays of Oct 2013? (`flights` dataset in the `nycflights13` package).
13 | 3. Plot a histogram and a boxplot of the delays of JetBlue Airways flights, after joining with the `airlines` dataset. Now plot the same plots for each day of the week. Export the plots as pdf files.
14 | 4. Create, then save as a csv, a data.frame named `drinks` with gender and drinks data, so that the output of `table(drinks)` is:
15 |
16 | Gender | Coke | Coffee
17 | -------|-------|-------
18 | Male | 12 | 10
19 | Female | 3 | 20
20 |
--------------------------------------------------------------------------------
/supervised.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Supervised Learning"
3 | author: "Jonathan Rosenblatt"
4 | date: "April 12, 2015"
5 | output:
6 | html_document:
7 | toc: true
8 | ---
9 | In these examples, I will use two data sets from the `ElemStatLearn` package: `spam` for categorical predictions, and `prostate` for continuous predictions.
10 | In `spam` we will try to decide if a mail is spam or not.
11 | In `prostate` we will try to predict the size of a cancerous tumor.
12 |
13 | ```{r}
14 | source('make_samples.R')
15 | ```
16 | You can now call `?prostate` and `?spam` to learn more about these data sets.
17 |
18 | We also load some utility packages and functions that we will require down the road.
19 | ```{r preamble}
20 | library(magrittr) # for piping
21 | library(dplyr) # for handling data frames
22 |
23 | # My own utility functions:
24 | l2 <- function(x) x^2 %>% sum %>% sqrt
25 | l1 <- function(x) abs(x) %>% sum
26 | MSE <- function(x) x^2 %>% mean
27 | missclassification <- function(tab) sum(tab[c(2,3)])/sum(tab)
28 | ```
29 |
30 | We also initialize the random number generator so that we all get the same results (at least upon a first run)
31 | ```{r set seed}
32 | set.seed(2015)
33 | ```
34 |
35 | # OLS
36 |
37 | ## OLS Regression
38 |
39 | Starting with OLS regression, and a split train-test data set:
40 | ```{r OLS Regression}
41 | View(prostate)
42 | # now verify that your data looks as you would expect....
43 |
44 | ols.1 <- lm(lcavol~. ,data = prostate.train)
45 | # Train error:
46 | MSE( predict(ols.1)- prostate.train$lcavol)
47 | # Test error:
48 | MSE( predict(ols.1, newdata = prostate.test)- prostate.test$lcavol)
49 | ```
50 |
51 | Now using cross validation to estimate the prediction error:
52 | ```{r Cross Validation}
53 | folds <- 10
54 | fold.assignment <- sample(1:folds, nrow(prostate), replace = TRUE)
55 | errors <- NULL
56 |
57 | for (k in 1:folds){
58 | prostate.cross.train <- prostate[fold.assignment!=k,]
59 | prostate.cross.test <- prostate[fold.assignment==k,]
60 | .ols <- lm(lcavol~. ,data = prostate.cross.train)
61 | .predictions <- predict(.ols, newdata=prostate.cross.test)
62 | .errors <- .predictions - prostate.cross.test$lcavol
63 | errors <- c(errors, .errors)
64 | }
65 |
66 | # Cross validated prediction error:
67 | MSE(errors)
68 | ```
69 |
70 | Also trying a bootstrap prediction error:
71 | ```{r Bootstrap}
72 | B <- 20
73 | n <- nrow(prostate)
74 | errors <- NULL
75 |
76 | prostate.boot.test <- prostate
77 | for (b in 1:B){
78 | prostate.boot.train <- prostate[sample(1:n, replace = TRUE),]
79 | .ols <- lm(lcavol~. ,data = prostate.boot.train)
80 | .predictions <- predict(.ols, newdata=prostate.boot.test)
81 | .errors <- .predictions - prostate.boot.test$lcavol
82 | errors <- c(errors, .errors)
83 | }
84 |
85 | # Bootstrapped prediction error:
86 | MSE(errors)
87 | ```
88 |
89 |
90 | ### OLS Regression Model Selection
91 |
92 |
93 | Best subset selection: find the best model of each size:
94 | ```{r best subset}
95 | # install.packages('leaps')
96 | library(leaps)
97 |
98 | regfit.full <- prostate.train %>%
99 | regsubsets(lcavol~.,data = ., method = 'exhaustive')
100 | summary(regfit.full)
101 | plot(regfit.full, scale = "Cp")
102 | ```
103 |
104 |
105 |
106 | Train-Validate-Test Model Selection.
107 | Example taken from [here](https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/ch6.html)
108 | ```{r OLS TVT model selection}
109 | model.n <- regfit.full %>% summary %>% extract2('rss') %>% length # number of model sizes considered
110 | X.train.named <- prostate.train %>% model.matrix(lcavol ~ ., data = .)
111 | X.test.named <- prostate.test %>% model.matrix(lcavol ~ ., data = .)
112 | View(X.test.named)
113 |
114 | val.errors <- rep(NA, model.n)
115 | train.errors <- rep(NA, model.n)
116 | for (i in 1:model.n) {
117 | coefi <- coef(regfit.full, id = i)
118 |
119 | pred <- X.train.named[, names(coefi)] %*% coefi
120 | train.errors[i] <- MSE(y.train - pred)
121 |
122 | pred <- X.test.named[, names(coefi)] %*% coefi
123 | val.errors[i] <- MSE(y.test - pred)
124 | }
125 | plot(train.errors, ylab = "MSE", pch = 19, type = "b", col = "black")
126 | points(val.errors, pch = 19, type = "b", col="blue")
127 |
128 | legend("topright",
129 | legend = c("Training", "Validation"),
130 | col = c("black", "blue"),
131 | pch = 19)
132 | ```
133 |
134 |
135 | AIC model selection:
136 | ```{r OLS AIC}
137 | # Forward search:
138 | ols.0 <- lm(lcavol~1 ,data = prostate.train)
139 | model.scope <- list(upper=ols.1, lower=ols.0)
140 | step(ols.0, scope=model.scope, direction='forward', trace = TRUE)
141 |
142 | # Backward search:
143 | step(ols.1, scope=model.scope, direction='backward', trace = TRUE)
144 | ```
145 |
146 |
147 | Cross Validated Model Selection.
148 | ```{r OLS CV}
149 | [TODO]
150 | ```
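
Here is a minimal sketch of one possible approach (my own illustration, not necessarily the intended solution): compare a few hand-picked candidate formulas by their cross-validated MSE. The names `candidate.formulas` and `cv.mse` are introduced purely for illustration.
```{r OLS CV sketch}
# Candidate models to choose from (an arbitrary illustrative set):
candidate.formulas <- list(
  small  = lcavol ~ lpsa,
  medium = lcavol ~ lpsa + lcp + lweight,
  full   = lcavol ~ .
)

# 10-fold cross validated MSE of an lm() fit for a given formula:
cv.mse <- function(form, data, folds = 10){
  fold.assignment <- sample(1:folds, nrow(data), replace = TRUE)
  errors <- NULL
  for (k in 1:folds){
    .cross.train <- data[fold.assignment != k, ]
    .cross.test  <- data[fold.assignment == k, ]
    .ols <- lm(form, data = .cross.train)
    errors <- c(errors, predict(.ols, newdata = .cross.test) - .cross.test$lcavol)
  }
  MSE(errors)
}

# Cross validated error of each candidate; pick the smallest:
sapply(candidate.formulas, cv.mse, data = prostate.train)
```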
151 |
152 |
153 | Bootstrap model selection:
154 | ```{r OLS bootstrap}
155 | [TODO]
156 | ```
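
And a similar sketch for the bootstrap (again my own illustration), reusing `candidate.formulas` from above:
```{r OLS bootstrap sketch}
# Simple bootstrap estimate of the prediction error of an lm() fit:
boot.mse <- function(form, data, B = 20){
  n <- nrow(data)
  errors <- NULL
  for (b in 1:B){
    .boot.train <- data[sample(1:n, replace = TRUE), ]
    .ols <- lm(form, data = .boot.train)
    errors <- c(errors, predict(.ols, newdata = data) - data$lcavol)
  }
  MSE(errors)
}

# Bootstrapped error of each candidate; pick the smallest:
sapply(candidate.formulas, boot.mse, data = prostate.train)
```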
157 |
158 |
159 | Partial least squares and principal components:
160 | ```{r PLS}
161 | pls::plsr()
162 | pls::pcr()
163 | ```
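
The chunk above only points at the relevant functions. A minimal runnable sketch with `pls::plsr()` could look as follows; the number of components, `ncomp = 3`, is an arbitrary choice of mine, and `pls::pcr()` has the same interface:
```{r PLS example}
# install.packages('pls')
library(pls)
pls.1 <- plsr(lcavol ~ ., data = prostate.train, validation = "CV")

# Test error, using 3 components (an arbitrary choice):
MSE( predict(pls.1, newdata = prostate.test, ncomp = 3) - prostate.test$lcavol)
```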
164 |
165 | Canonical correlation analyis:
166 | ```{r CCA}
167 | cancor()
168 |
169 | # Kernel based robust version
170 | kernlab::kcca()
171 | ```
172 |
173 |
174 |
175 | ## OLS Classification
176 | ```{r OLS Classification}
177 | # OLS on the 0/1 dummy encoding of the spam outcome:
178 | ols.2 <- lm(spam~., data = spam.train.dummy)
179 |
180 | # Train confusion matrix:
181 | .predictions.train <- predict(ols.2) > 0.5
182 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train.dummy$spam))
183 | missclassification(confusion.train)
184 |
185 | # Test confusion matrix:
186 | .predictions.test <- predict(ols.2, newdata = spam.test.dummy) > 0.5
187 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test.dummy$spam))
188 | missclassification(confusion.test)
189 | ```
190 |
191 |
192 |
193 | # Ridge Regression
194 | ```{r Ridge I}
195 | # install.packages('ridge')
196 | library(ridge)
197 |
198 | ridge.1 <- linearRidge(lcavol~. ,data = prostate.train)
199 | # Note that if not specified, lambda is chosen automatically by linearRidge.
200 |
201 | # Train error:
202 | MSE( predict(ridge.1)- prostate.train$lcavol)
203 | # Test error:
204 | MSE( predict(ridge.1, newdata = prostate.test)- prostate.test$lcavol)
205 | ```
206 |
207 |
208 | Another implementation, which automatically constructs a whole path of tuning parameters $\lambda$:
209 | ```{r Ridge II}
210 | # install.packages('glmnet')
211 | library(glmnet)
212 | ridge.2 <- glmnet(x=X.train, y=y.train, alpha = 0)
213 |
214 | # Train error:
215 | MSE( predict(ridge.2, newx =X.train)- y.train)
216 |
217 | # Test error:
218 | MSE( predict(ridge.2, newx = X.test)- y.test)
219 | ```
220 |
221 | __Note__: `glmnet` is slightly picky.
222 | I could not have created `y.train` using `select()` because I need a vector and not a `data.frame`. Also, `as.matrix` is there because `glmnet` expects a `matrix` class `x` argument.
223 | These objects are created in the make_samples.R script, which we sourced in the beginning.
224 |
225 |
226 |
227 |
228 | # LASSO Regression
229 | ```{r LASSO}
230 | # install.packages('glmnet')
231 | library(glmnet)
232 | lasso.1 <- glmnet(x=X.train, y=y.train, alpha = 1)
233 |
234 | # Train error:
235 | MSE( predict(lasso.1, newx =X.train)- y.train)
236 |
237 | # Test error:
238 | MSE( predict(lasso.1, newx = X.test)- y.test)
239 | ```
240 |
241 |
242 | # Logistic Regression For Classification
243 | ```{r Logistic Regression}
244 | logistic.1 <- glm(spam~., data = spam.train, family = binomial)
245 | # numerical error. Probably due to too many predictors.
246 | # Maybe regularizing the logistic regression with Ridge or LASSO will make things better?
247 | ```
248 |
249 | In the next chunk, we do $l_2$ and $l_1$ regularized logistic regression.
250 | Some technical remarks are in order:
251 |
252 | - `glmnet` is picky with its inputs. This has already been discussed in the context of the LASSO regression above.
253 | - The `predict` function for `glmnet` objects returns a prediction (see below) for many candidate regularization levels $\lambda$. We thus use `cv.glmnet`, which performs an automatic cross-validated selection of the best regularization level.
254 | ```{r Regularized Logistic Regression}
255 | library(glmnet)
256 | # Ridge Regularization with CV selection of regularization:
257 | logistic.2 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 0)
258 | # LASSO Regularization with CV selection of regularization:
259 | logistic.3 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 1)
260 |
261 |
262 | # Train confusion matrix:
263 | .predictions.train <- predict(logistic.2, newx = X.train.spam, type = 'class')
264 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
265 | missclassification(confusion.train)
266 |
267 | .predictions.train <- predict(logistic.3, newx = X.train.spam, type = 'class')
268 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
269 | missclassification(confusion.train)
270 |
271 | # Test confusion matrix:
272 | .predictions.test <- predict(logistic.2, newx = X.test.spam, type='class')
273 | (confusion.test <- table(prediction=.predictions.test, truth=y.test.spam))
274 | missclassification(confusion.test)
275 |
276 | .predictions.test <- predict(logistic.3, newx = X.test.spam, type='class')
277 | (confusion.test <- table(prediction=.predictions.test, truth=y.test.spam))
278 | missclassification(confusion.test)
279 | ```
280 |
281 |
282 |
283 |
284 | # SVM
285 |
286 | ## Classification
287 | ```{r SVM classification}
288 | library(e1071)
289 | svm.1 <- svm(spam~., data = spam.train)
290 |
291 | # Train confusion matrix:
292 | .predictions.train <- predict(svm.1)
293 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
294 | missclassification(confusion.train)
295 |
296 | # Test confusion matrix:
297 | .predictions.test <- predict(svm.1, newdata = spam.test)
298 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
299 | missclassification(confusion.test)
300 | ```
301 |
302 |
303 | ## Regression
304 | ```{r SVM regression}
305 | svm.2 <- svm(lcavol~., data = prostate.train)
306 |
307 | # Train error:
308 | MSE( predict(svm.2)- prostate.train$lcavol)
309 | # Test error:
310 | MSE( predict(svm.2, newdata = prostate.test)- prostate.test$lcavol)
311 | ```
312 |
313 |
314 |
315 |
316 | # GAM Regression
317 | ```{r GAM}
318 | # install.packages('mgcv')
319 | library(mgcv)
320 | form.1 <- lcavol~ s(lweight)+ s(age)+s(lbph)+s(svi)+s(lcp)+s(gleason)+s(pgg45)+s(lpsa)
321 | gam.1 <- gam(form.1, data = prostate.train) # the model is too rich. let's select a variable subset
322 |
323 | ridge.1 %>% coef %>% abs %>% sort(decreasing = TRUE) # select the most promising coefficients (a very arbitrary practice)
324 | form.2 <- lcavol~ s(lweight)+ s(age)+s(lbph)+s(lcp)+s(pgg45)+s(lpsa) # keep only promising coefficients in model
325 | gam.2 <- gam(form.2, data = prostate.train)
326 |
327 | # Train error:
328 | MSE( predict(gam.2)- prostate.train$lcavol)
329 | # Test error:
330 | MSE( predict(gam.2, newdata = prostate.test)- prostate.test$lcavol)
331 | ```
332 |
333 |
334 |
335 |
336 |
337 | # Neural Net
338 |
339 | ## Regression
340 | ```{r NNET regression}
341 | library(nnet)
342 | nnet.1 <- nnet(lcavol~., size=20, data=prostate.train, rang = 0.1, decay = 5e-4, maxit = 1000, linout = TRUE) # linout=TRUE for linear output units, as lcavol is continuous
343 |
344 | # Train error:
345 | MSE( predict(nnet.1)- prostate.train$lcavol)
346 | # Test error:
347 | MSE( predict(nnet.1, newdata = prostate.test)- prostate.test$lcavol)
348 | ```
349 |
350 |
351 | Let's automate the network size selection:
352 | ```{r NNET validate}
353 | validate.nnet <- function(size){
354 | .nnet <- nnet(lcavol~., size=size, data=prostate.train, rang = 0.1, decay = 5e-4, maxit = 200, linout = TRUE)
355 | .train <- MSE( predict(.nnet)- prostate.train$lcavol)
356 | .test <- MSE( predict(.nnet, newdata = prostate.test)- prostate.test$lcavol)
357 | return(list(train=.train, test=.test))
358 | }
359 |
360 | validate.nnet(3)
361 | validate.nnet(4)
362 | validate.nnet(20)
363 | validate.nnet(50)
364 |
365 | sizes <- seq(2, 30)
366 | validate.sizes <- rep(NA, length(sizes))
367 | for (i in seq_along(sizes)){
368 | validate.sizes[i] <- validate.nnet(sizes[i])$test
369 | }
370 | plot(validate.sizes~sizes, type='l')
371 | ```
372 | What can I say... This plot is not what I would expect. Could be due to the random nature of the fitting algorithm.
373 |
374 |
375 |
376 | ## Classification
377 | ```{r NNET Classification}
378 | nnet.2 <- nnet(spam~., size=5, data=spam.train, rang = 0.1, decay = 5e-4, maxit = 1000)
379 |
380 | # Train confusion matrix:
381 | .predictions.train <- predict(nnet.2, type='class')
382 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
383 | missclassification(confusion.train)
384 |
385 | # Test confusion matrix:
386 | .predictions.test <- predict(nnet.2, newdata = spam.test, type='class')
387 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
388 | missclassification(confusion.test)
389 | ```
390 |
391 |
392 | # CART
393 |
394 |
395 | ## Regression
396 | ```{r Tree regression}
397 | library(rpart)
398 | tree.1 <- rpart(lcavol~., data=prostate.train)
399 |
400 | # Train error:
401 | MSE( predict(tree.1)- prostate.train$lcavol)
402 | # Test error:
403 | MSE( predict(tree.1, newdata = prostate.test)- prostate.test$lcavol)
404 | ```
405 |
406 | At this stage we should prune the tree using `prune()`...
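
A minimal pruning sketch (my own illustration): pick the complexity parameter with the smallest cross-validated error from `rpart`'s built-in CV table, and prune to it.
```{r Tree pruning}
printcp(tree.1) # the cross validation table computed by rpart
.best.cp <- tree.1$cptable[which.min(tree.1$cptable[, "xerror"]), "CP"]
tree.1.pruned <- prune(tree.1, cp = .best.cp)

# Test error of the pruned tree:
MSE( predict(tree.1.pruned, newdata = prostate.test) - prostate.test$lcavol)
```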
407 |
408 | ## Classification
409 | ```{r Tree classification}
410 | tree.2 <- rpart(spam~., data=spam.train)
411 |
412 | # Train confusion matrix:
413 | .predictions.train <- predict(tree.2, type='class')
414 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
415 | missclassification(confusion.train)
416 |
417 | # Test confusion matrix:
418 | .predictions.test <- predict(tree.2, newdata = spam.test, type='class')
419 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
420 | missclassification(confusion.test)
421 | ```
422 |
423 |
424 |
425 |
426 | # Random Forest
427 | TODO
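
Here is a minimal placeholder sketch of mine, using the `randomForest` package (which needs to be installed):
```{r Random Forest classification}
# install.packages('randomForest')
library(randomForest)
rf.1 <- randomForest(spam ~ ., data = spam.train)

# Test confusion matrix:
.predictions.test <- predict(rf.1, newdata = spam.test)
(confusion.test <- table(prediction = .predictions.test, truth = spam.test$spam))
missclassification(confusion.test)
```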
428 |
429 | # Rotation Forest
430 | TODO
431 |
432 |
433 |
434 |
435 |
436 | # Smoothing Splines
437 | I will demonstrate the method with a single predictor, so that we can visualize the smoothing that has been performed:
438 |
439 | ```{r Smoothing Splines}
440 | spline.1 <- smooth.spline(x=X.train[, 'lpsa'], y=y.train) # a single predictor is required; lpsa is an arbitrary choice
441 |
442 | # Visualize the non linear hypothesis we have learned:
443 | plot(y.train~X.train[, 'lpsa'], col='red', type='h')
444 | points(spline.1, type='l')
445 | ```
446 | I am not extracting train and test errors as the output of `smooth.spline` will require some tweaking for that.
447 |
448 |
449 |
450 | # KNN
451 |
452 | ## Classification
453 | ```{r knn classification}
454 | library(class)
455 | knn.1 <- knn(train = X.train.spam, test = X.test.spam, cl =y.train.spam, k = 1)
456 |
457 | # Test confusion matrix:
458 | .predictions.test <- knn.1
459 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
460 | missclassification(confusion.test)
461 | ```
462 |
463 | And now we would try to optimize `k` by trying different values.
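
For example, a minimal sketch of mine that scans a few arbitrary values of `k` and tracks the test misclassification rate (a more careful treatment would use a separate validation set or cross validation):
```{r knn tuning}
ks <- c(1, 3, 5, 11, 21) # an arbitrary grid
knn.test.errors <- sapply(ks, function(k){
  .predictions.test <- knn(train = X.train.spam, test = X.test.spam, cl = y.train.spam, k = k)
  .confusion.test <- table(prediction = .predictions.test, truth = spam.test$spam)
  missclassification(.confusion.test)
})
plot(knn.test.errors ~ ks, type = 'b')
```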
464 |
465 |
466 | # Kernel Regression
467 | Kernel regression includes many particular algorithms.
468 | ```{r kernel}
469 | # install.packages('np')
470 | library(np)
471 | ksmooth.1 <- npreg(txdat =X.train, tydat = y.train)
472 |
473 | # Train error:
474 | MSE( predict(ksmooth.1)- prostate.train$lcavol)
475 | ```
476 |
477 | There is currently no method to make prediction on test data with this function.
478 |
479 |
480 |
481 | # Stacking
482 | As seen in the class notes, there are many ensemble methods.
483 | Stacking, in my view, is by far the most useful and coolest. It is thus the only one I present here.
484 |
485 | The following example is adapted from [James E. Yonamine](http://jayyonamine.com/?p=456).
486 |
487 | ```{r Stacking}
488 | #####step 1: train models ####
489 | #logits
490 | logistic.2 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 0)
491 | logistic.3 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 1)
492 |
493 |
494 | # Learning Vector Quantization (LVQ)
495 | my.codebook<-lvqinit(x=X.train.spam, cl=y.train.spam, size=10, prior=c(0.5,0.5),k = 2)
496 | my.codebook<-lvq1(x=X.train.spam, cl=y.train.spam, codebk=my.codebook, niter = 100 * nrow(my.codebook$x), alpha = 0.03)
497 |
498 | # SVM
499 | library('e1071')
500 | svm.fit <- svm(y=y.train.spam, x=X.train.spam, probability=TRUE)
501 |
502 |
503 |
504 | #####step 2a: build predictions for data.train####
505 | train.predict<- cbind(
506 | predict(logistic.2, newx=X.train.spam, type="response"),
507 | predict(logistic.3, newx=X.train.spam, type="response"),
508 | knn1(train=my.codebook$x, test=X.train.spam, cl=my.codebook$cl),
509 | predict(svm.fit, X.train.spam, probability=TRUE)
510 | )
511 |
512 | ####step 2b: build predictions for data.test####
513 | test.predict <- cbind(
514 | predict(logistic.2, newx=X.test.spam, type="response"),
515 | predict(logistic.3, newx=X.test.spam, type="response"),
516 | knn1(train=my.codebook$x, test=X.test.spam, cl=my.codebook$cl), # keep the same column order as train.predict
517 | predict(svm.fit, newdata = X.test.spam, probability = TRUE)
518 | )
519 |
520 |
521 | ####step 3: train SVM on train.predict####
522 | final <- svm(y=y.train.spam, x=train.predict, probability=TRUE)
523 |
524 | ####step 4: use trained SVM to make predictions with test.predict####
525 | final.predict <- predict(final, test.predict, probability=TRUE)
526 | results<-as.matrix(final.predict)
527 | table(results, y.test.spam)
528 | ```
529 |
530 |
531 |
532 |
533 |
534 | # Fisher's LDA
535 | ```{r LDA}
536 | library(MASS)
537 | lda.1 <- lda(spam~., spam.train)
538 |
539 | # Train confusion matrix:
540 | .predictions.train <- predict(lda.1)$class
541 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
542 | missclassification(confusion.train)
543 |
544 | # Test confusion matrix:
545 | .predictions.test <- predict(lda.1, newdata = spam.test)$class
546 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
547 | missclassification(confusion.test)
548 | ```
549 |
550 | __Caution__:
551 | Both `MASS` and `dplyr` have a function called `select`. I will thus try to avoid having the two packages loaded at once, or call the function by its full name: `MASS::select` or `dplyr::select`.
552 |
553 |
554 |
555 | # Naive Bayes
556 | ```{r Naive Bayes}
557 | library(e1071)
558 | nb.1 <- naiveBayes(spam~., data = spam.train)
559 |
560 | # Train confusion matrix:
561 | .predictions.train <- predict(nb.1, newdata = spam.train)
562 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam))
563 | missclassification(confusion.train)
564 |
565 | # Test confusion matrix:
566 | .predictions.test <- predict(nb.1, newdata = spam.test)
567 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam))
568 | missclassification(confusion.test)
569 | ```
570 |
571 |
--------------------------------------------------------------------------------
/unsupervised.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Unsupervised Learning"
3 | author: "Jonathan Rosenblatt"
4 | date: "April 12, 2015"
5 | output: html_document
6 | ---
7 |
8 | Some utility functions:
9 | ```{r utility}
10 | library(magrittr) # for the %>% pipe and extract(), used throughout this file
10 | l2 <- function(x) x^2 %>% sum %>% sqrt
11 | l1 <- function(x) abs(x) %>% sum
12 | MSE <- function(x) x^2 %>% mean
13 |
14 | # Matrix norms:
15 | frobenius <- function(A) norm(A, type="F")
16 | spectral <- function(A) norm(A, type="2")
17 | ```
18 |
19 |
20 | __Note__: `foo::bar` means that function `bar` is part of the `foo` package.
21 | With this syntax, there is no need to load (`library`) the package.
22 | If a line does not run, you may need to install the package: `install.packages('foo')`.
23 | Packages that are installed from sources other than CRAN (like GitHub or Bioconductor) will include a commented installation line.
24 |
25 | __Note__: RStudio currently does not autocomplete function arguments when using the `::` syntax.
26 |
27 |
28 |
29 |
30 |
31 | # Learning Distributions
32 |
33 | ## Gaussian Density Estimation
34 | ```{r generate data}
35 | # Sample from a multivariate Gaussian:
36 | ## Generate a covariance matrix
37 | p <- 10
38 | Sigma <- bayesm::rwishart(nu = 100, V = diag(p))$W
39 | lattice::levelplot(Sigma)
40 |
41 | # Sample from a multivariate Gaussian:
42 | n <- 1e3
43 | means <- 1:p
44 | X1 <- mvtnorm::rmvnorm(n = n, sigma = Sigma, mean = means)
45 | dim(X1)
46 |
47 | # Estimate parameters and compare to the truth:
48 | estim.means <- colMeans(X1) # recall truth is (1,...,10)
49 | plot(estim.means~means); abline(0,1, lty=2)
50 |
51 | estim.cov <- cov(X1)
52 | plot(estim.cov~Sigma); abline(0,1, lty=2)
53 |
54 | estim.cov.errors <- Sigma - estim.cov
55 | lattice::levelplot(estim.cov.errors)
56 | lattice::levelplot(estim.cov.errors/Sigma) # percentage error
57 |
58 | frobenius(estim.cov.errors)
59 |
60 | # Now try the same while playing with n and p.
61 | ```
62 |
63 |
64 |
65 | Other covariance estimators (robust, fast,...)
66 | ```{r covariances}
67 | # Robust covariance
68 | estim.cov.1 <- MASS::cov.rob(X1)$cov
69 | estim.cov.errors.1 <- Sigma - estim.cov.1
70 | lattice::levelplot(estim.cov.errors.1)
71 | lattice::levelplot(estim.cov.errors.1/Sigma) # percentage error
72 |
73 | frobenius(estim.cov.errors.1)
74 |
75 |
76 | # Nearest neighbour cleaning of outliers
77 | estim.cov.2 <- covRobust::cov.nnve(X1)$cov
78 | estim.cov.errors.2 <- Sigma - estim.cov.2
79 | lattice::levelplot(estim.cov.errors.2)
80 | frobenius(estim.cov.errors.2)
81 |
82 |
83 | # Minimum Covariance Determinant (MCD) robust estimator
84 | estim.cov.3 <- robustbase::covMcd(X1)$cov
85 | estim.cov.errors.3 <- Sigma - estim.cov.3
86 | lattice::levelplot(estim.cov.errors.3)
87 | frobenius(estim.cov.errors.3)
88 |
89 |
90 | # Another robust covariance estimator
91 | estim.cov.4 <- robustbase::covComed(X1)$cov
92 | estim.cov.errors.4 <- Sigma - estim.cov.4
93 | lattice::levelplot(estim.cov.errors.4)
94 | frobenius(estim.cov.errors.4)
95 | ```
96 |
97 | ## Non parametric density estimation
98 | There is nothing that will even try dimensions higher than 6.
99 | See [here](http://vita.had.co.nz/papers/density-estimation.pdf) for a review.
100 |
101 |
102 |
103 | ## Graphical Models
104 | [TODO]
105 | See R's graphical modeling [task view](http://cran.r-project.org/web/views/gR.html).
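
Here is a minimal sketch of mine using the graphical lasso from the `glasso` package (which needs installing; the regularization level `rho = 0.1` is arbitrary), reusing the Gaussian sample `X1` from above:
```{r graphical lasso}
# install.packages('glasso')
ggm.1 <- glasso::glasso(s = cov(X1), rho = 0.1)

# The estimated (sparse) precision matrix encodes the graph structure:
lattice::levelplot(ggm.1$wi)
```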
106 |
107 |
108 |
109 | ## Association rules
110 | Note: Visualization examples are taken from the arulesViz [vignette](http://cran.r-project.org/web/packages/arulesViz/vignettes/arulesViz.pdf)
111 |
112 | ```{r association rules}
113 | library(arules)
114 | data("Groceries")
115 | inspect(Groceries[1:2])
116 | summary(Groceries)
117 |
118 | rules <- arules::apriori(Groceries, parameter = list(support=0.001, confidence=0.5))
119 | summary(rules)
120 | rules %>% sort(by='lift') %>% head(2) %>% inspect
121 |
122 | # For a rule {A => B} we denote:
123 | # support: P(A AND B)
124 | # confidence: P(B|A)
125 | # lift: P(A,B)/[P(B)P(A)]
126 |
127 |
128 | # Select a subset of rules
129 | rule.subset <- subset(rules, subset = rhs %pin% "yogurt")
130 | inspect(rule.subset)
131 |
132 | # Visualize rules:
133 | library(arulesViz)
134 | plot(rules)
135 |
136 | subrules <- rules[quality(rules)$confidence > 0.8]
137 | plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE))
138 | plot(subrules, method="matrix", measure=c("lift", "confidence"), control=list(reorder=TRUE))
139 |
140 | plot(subrules, method="grouped")
141 | plot(rules, method="grouped", control=list(k=50))
142 |
143 | subrules2 <- head(sort(rules, by="lift"), 10)
144 | plot(subrules2, method="graph", control=list(type="items"))
145 | plot(subrules2, method="graph")
146 |
147 | # Export rules graph to use with other software:
148 | # saveAsGraph(head(sort(rules, by="lift"),1000), file="rules.graphml")
149 |
150 | rule.1 <- rules[1]
151 | inspect(rule.1)
152 | plot(rule.1, method="doubledecker", data = Groceries)
153 | ```
154 |
155 | See also the `prim.box` function in the `prim` package for more algorithms to learn association rules
156 |
157 |
158 |
159 | # Dimensionality Reduction
160 |
161 | ## PCA
162 | Note: example is a blend from [Gaston Sanchez](http://gastonsanchez.com/blog/how-to/2012/06/17/PCA-in-R.html) and [Georgia's Geography dept.](http://geog.uoregon.edu/GeogR/topics/pca.html).
163 |
164 |
165 | Get some data
166 | ```{r PCA data}
167 | ?USArrests
168 |
169 | plot(USArrests) # basic plot
170 | corrplot::corrplot(cor(USArrests), method = "ellipse") # slightly fancier
171 |
172 |
173 | # As a correlation graph
174 | cor.1 <- cor(USArrests)
175 | qgraph::qgraph(cor.1)
176 | qgraph::qgraph(cor.1, layout = "spring", posCol = "darkgreen", negCol = "darkmagenta")
177 | ```
178 |
179 |
180 | ```{r prepare data}
181 | USArrests.1 <- USArrests[,-3] %>% scale # note the scaling, which is required by some functions used down the road
182 | ```
183 |
184 |
185 | ```{r PCA}
186 | # Compute the principal components:
187 | pca1 <- prcomp(USArrests.1, scale. = TRUE) # The main workhorse.
188 |
189 | pca1$rotation # loadings
190 |
191 | # Now score the states:
192 | USArrests.1[
193 | pca1$x %>% extract(,1) %>% which.max
194 | ,] # Fewest arrests
195 | USArrests.1[
196 | pca1$x %>% extract(,1) %>% which.min
197 | ,] # Most arrests
198 |
199 | pca1$x %>% extract(,1) %>% sort %>% head
200 | pca1$x %>% extract(,1) %>% sort %>% tail
201 | ```
202 | Interpretation:
203 |
204 | - PC1 seems to capture overall crime rate.
205 | - PC2 seems to distinguish between sexual and non-sexual crimes.
206 | - Judging by the PC1 scores above, North Dakota has the fewest arrests and Florida the most.
207 |
208 |
209 | Projecting on first two PCs:
210 | ```{r visualizing PCA}
211 | library(ggplot2) # for graphing
212 |
213 | pcs <- as.data.frame(pca1$x)
214 | ggplot(data = pcs, aes(x = PC1, y = PC2, label = rownames(pcs))) +
215 | geom_hline(yintercept = 0, colour = "gray65") +
216 | geom_vline(xintercept = 0, colour = "gray65") +
217 | geom_text(colour = "red", alpha = 0.8, size = 6) +
218 | ggtitle("PCA plot of USA States - Crime Rates")
219 | ```
220 |
221 |
222 | The bi-Plot
223 | ```{r biplot}
224 | biplot(pca1) #ugly!
225 |
226 | # library(devtools)
227 | # install_github("vqv/ggbiplot")
228 | ggbiplot::ggbiplot(pca1, labels = rownames(USArrests.1)) # better!
229 | ```
230 |
231 |
232 | The scree-plot
233 | ```{r screeplot}
234 | screeplot(pca1)
235 |
236 | ggbiplot::ggscreeplot(pca1)
237 | ```
238 | So clearly the main differentiation is along the first component, which captures the overall crime level in each state (and not a particular type of crime).
239 |
240 |
241 | Visualize the scoring as a projection of the states' attributes onto the factors.
242 | ```{r visualize contributions to factors}
243 | # get parameters of component lines (after Everitt & Rabe-Hesketh)
244 | load <- pca1$rotation
245 | slope <- load[2, ]/load[1, ]
246 | mn <- apply(USArrests.1, 2, mean)
247 | intcpt <- mn[2] - (slope * mn[1])
248 |
249 | # scatter plot with the two new axes added
250 | par(pty = "s") # square plotting frame
251 | USArrests.2 <- USArrests[,1:2] %>% scale
252 | xlim <- range(USArrests.2) # overall min, max
253 | plot(USArrests.2, xlim = xlim, ylim = xlim, pch = 16, col = "purple") # both axes same length
254 | abline(intcpt[1], slope[1], lwd = 2) # first component solid line
255 | abline(intcpt[2], slope[2], lwd = 2, lty = 2) # second component dashed
256 | legend("right", legend = c("PC 1", "PC 2"), lty = c(1, 2), lwd = 2, cex = 1)
257 |
258 | # projections of points onto PCA 1
259 | y1 <- intcpt[1] + slope[1] * USArrests.2[, 1]
260 | x1 <- (USArrests.1[, 2] - intcpt[1])/slope[1]
261 | y2 <- (y1 + USArrests.1[, 2])/2
262 | x2 <- (x1 + USArrests.1[, 1])/2
263 | segments(USArrests.1[, 1], USArrests.1[, 2], x2, y2, lwd = 2, col = "purple")
264 | ```
265 |
266 |
267 | Visualize the loadings (ok... we are already doing factor analysis without noticing...)
268 | ```{r visualize PCA}
269 | # install.packages('GPArotation')
270 | pca.qgraph <- qgraph::qgraph.pca(USArrests.1, factors = 2, rotation = "varimax")
271 | plot(pca.qgraph)
272 |
273 | qgraph::qgraph(pca.qgraph, posCol = "darkgreen", layout = "spring", negCol = "darkmagenta",
274 | edge.width = 2, arrows = FALSE)
275 | ```
276 |
277 |
278 |
279 |
280 | More implementations of PCA:
281 | ```{r many PCA implementations}
282 | # FAST solutions:
283 | gmodels::fast.prcomp()
284 |
285 | # More detail in output:
286 | FactoMineR::PCA()
287 |
288 | # For flexibility in algorithms and visualization:
289 | ade4::dudi.pca()
290 |
291 | # Another one...
292 | amap::acp()
293 | ```
294 |
295 |
296 |
297 | Principal tensor analysis:
298 | [TODO]
299 | ```{r PTA}
300 | PTAk::PTAk()
301 | ```
302 |
303 |
304 |
305 | ## sPCA
306 | ```{r sPCA}
307 | # Compute similarity graph
308 | state.similarity <- MASS::cov.rob(USArrests.1)$cov
309 |
310 | spca1 <- elasticnet::spca(state.similarity, K=2,type="Gram",sparse="penalty",trace=TRUE, para=c(0.06,0.16))
311 | spca1$loadings
312 | ```
313 |
314 |
315 | ## kPCA
316 | [TODO]
317 | ```{r kPCA}
318 | kernlab::kpca()
319 | ```
320 |
321 |
322 | ## Random Projections
323 | [TODO]
324 | ```{r Random Projections}
325 |
326 | ```
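
Here is a minimal sketch of mine, with no dedicated package: project the (scaled) arrest data onto `k = 2` random Gaussian directions.
```{r Random Projections sketch}
set.seed(1)
p.arrests <- ncol(USArrests.1)
k <- 2
R <- matrix(rnorm(p.arrests * k), nrow = p.arrests, ncol = k) / sqrt(k) # random projection matrix
projected <- USArrests.1 %*% R

plot(projected, pch = 19)
abline(h = 0, v = 0, lty = 2)
text(projected, pos = 4, labels = rownames(USArrests.1))
```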
327 |
328 |
329 | ## MDS
330 | Classical MDS
331 | ```{r MDS}
332 | # We first need a dissimilarity matrix/graph:
333 | state.disimilarity <- dist(USArrests.1)
334 |
335 | mds.1 <- stats::cmdscale(state.disimilarity)
336 |
337 | plot(mds.1, pch = 19)
338 | abline(h=0, v=0, lty=2)
339 | text(mds.1, pos = 4, labels = rownames(USArrests.2), col = 'tomato')
340 |
341 | # Compare with two PCA (first two PCs):
342 | points(pca1$x[,1:2], col='red', pch=19, cex=0.5)
343 | # So classical MDS with Euclidean distance, is the same as PCA on two dimensions!
344 | ```
345 | Note: Also see the `cluster::daisy` for more dissimilarity measures.
346 |
347 |
348 | Let's try other strain functions for MDS.
349 |
350 | Sammon's strain:
351 | ```{r Sammon MDS}
352 | mds.2 <- MASS::sammon(state.disimilarity)
353 | plot(mds.2$points, pch = 19)
354 | abline(h=0, v=0, lty=2)
355 | text(mds.2$points, pos = 4, labels = rownames(USArrests.2))
356 |
357 | # Compare with two PCA (first two PCs):
358 | arrows(x0 = mds.2$points[,1], y0 = mds.2$points[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5)
359 | # So Sammon's MDS with Euclidean distance, is *not* the same as PCA on two dimensions.
360 | ```
361 |
362 |
363 | Kruskal's strain:
364 | ```{r isoMDS}
365 | mds.3 <- MASS::isoMDS(state.disimilarity)
366 | plot(mds.3$points, pch = 19)
367 | abline(h=0, v=0, lty=2)
368 | text(mds.3$points, pos = 4, labels = rownames(USArrests.2))
369 |
370 | # Compare with two PCA (first two PCs):
371 | arrows(x0 = mds.3$points[,1], y0 = mds.3$points[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5)
372 | # So Kruskal's MDS with Euclidean distance, is *not* the same as PCA on two dimensions.
373 | ```
374 |
375 |
376 | ## Isomap
377 | ```{r Isomap}
378 | # Installing the package:
379 | # source("http://bioconductor.org/biocLite.R")
380 | # biocLite("RDRToolbox")
381 | isomap.1 <- RDRToolbox::Isomap(USArrests.1)
382 |
383 | plot(isomap.1$dim2)
384 | abline(h=0, v=0, lty=2)
385 | text(isomap.1$dim2, pos = 4, labels = rownames(USArrests.2))
386 |
387 |
388 | # Compare with two PCA (first two PCs):
389 | arrows(x0 = isomap.1$dim2[,1], y0 = isomap.1$dim2[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5)
390 | ```
391 |
392 |
393 | ## Local Linear Embedding (LLE)
394 | ```{r LLE}
395 | lle.1 <- RDRToolbox::LLE(USArrests.1, k=3)
396 |
397 | plot(lle.1)
398 | abline(h=0, v=0, lty=2)
399 | text(lle.1, pos = 4, labels = rownames(USArrests.2))
400 |
401 |
402 | # Compare with two PCA (first two PCs):
403 | arrows(x0 = lle.1[,1], y0 = lle.1[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5)
404 | ```
405 | Well, LLE (with 3 neighbors) clearly disagrees with PCA. Why is this?
406 |
407 |
408 | ## LocalMDS
409 | The only package I found is `localmds` in [here](https://github.com/hadley/localmds/blob/master/R/localmds.r).
410 | It is currently under active development, so I am still waiting for a stable version.
411 |
412 |
413 | ## Principal Curves & Surfaces
414 | ```{r Principal curves}
415 | princurve.1 <- princurve::principal.curve(USArrests.1, plot=TRUE)
416 | princurve.1$s
417 |
418 | points(princurve.1) # Projections of data on principal curve
419 | whiskers <- function(from, to) segments(from[, 1], from[, 2], to[, 1], to[, 2])
420 | whiskers(USArrests.1, princurve.1$s)
421 | ```
422 |
423 |
424 |
425 |
426 |
427 | # Latent Space Generative Models
428 |
429 | ## Factor Analysis (FA)
430 |
431 | No rotation
432 | ```{r FA}
433 | fa.1 <- psych::principal(USArrests.1, nfactors = 2, rotate = "none")
434 | fa.1
435 | summary(fa.1)
436 | biplot(fa.1, labels = rownames(USArrests.1))
437 |
438 | # Numeric comparison with PCA:
439 | fa.1$loadings
440 | pca1$rotation
441 |
442 | # Graph comparison: loadings encoded in colors
443 | qgraph::qgraph(fa.1)
444 | qgraph::qgraph(pca.qgraph) # for comparison
445 |
446 |
447 | # Geometric coherent graph comparison: loadings encoded in distances and colors
448 | qgraph::qgraph(fa.1)
449 | qgraph::qgraph(pca.qgraph) # for comparison
450 | ```
451 |
452 |
453 | Varimax rotation
454 | ```{r varimax}
455 | fa.2 <- psych::principal(USArrests.1, nfactors = 2, rotate = "varimax")
456 |
457 | fa.2$loadings
458 | fa.1$loadings
459 | pca1$rotation
460 | ```
461 | Notice the rotation has changed the interpretation of the factors.
462 |
463 |
464 | ## Independent Component Analysis (ICA)
465 | ```{r ICA}
466 |
467 | ica.1 <- fastICA::fastICA(USArrests.1, n.comp=2) # Also performs projection pursuit
468 |
469 |
470 | plot(ica.1$S)
471 | abline(h=0, v=0, lty=2)
472 | text(ica.1$S, pos = 4, labels = rownames(USArrests.1))
473 |
474 | # Compare with PCA (first two PCs):
475 | arrows(x0 = ica.1$S[,1], y0 = ica.1$S[,2], x1 = pca1$x[,2], y1 = pca1$x[,1], col='red', pch=19, cex=0.5)
476 | ```
477 |
478 |
479 |
480 | ## Exploratory Projection Pursuit
481 | ```{r exploratory projection pursuit}
482 | epp.1 <- REPPlab::EPPlab(USArrests.1)
483 | plot(epp.1)
484 | ```
485 |
486 | ## Generative Topographic Map (GTM)
487 | [TODO]
488 |
489 |
490 |
491 | ## Finite Mixture
492 | ```{r mixtures}
493 | library(mixtools)
494 |
495 | # Generate data:
496 | # Note that component-wise independence is assumed.
497 | k <- 2
498 | mix.p <- 4
499 | mix.probs <- rep(1/k,k)
500 | mix.means <- seq(1,k*mix.p) %>% matrix(nrow = k, ncol = mix.p)
501 | mix.sigma <- rep(1,k*mix.p) %>% matrix(nrow = k, ncol = mix.p)
502 | x.mix <- mixtools::rmvnormmix(n=n, lambda =mix.probs, mu=mix.means, sigma = mix.sigma)
503 | x.mix %>% dim
504 |
505 | # Nonparametric fit (initializing at the true means)
506 | mix.1 <- mixtools::npEM(x.mix, mu0 = mix.means, verb = TRUE)
507 | plot(mix.1)
508 |
509 | # Fit assuming the Gaussian distribution:
510 | matrix2list <- function(x) split(x, rep(1:ncol(x), each = nrow(x)))
511 | mix.means.list <- matrix2list(t(mix.means))
512 |
513 | mix.2 <- mixtools::mvnormalmixEM(x.mix, k=2, mu=mix.means.list, verb = TRUE, epsilon = 1e-1)
514 | summary(mix.2)
515 | ```
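
It is instructive to compare the estimates with the generating parameters. A minimal sketch, assuming `mvnormalmixEM()` returns the mixing proportions in `$lambda` and the component means in `$mu`:
```{r compare mixture estimates}
mix.2$lambda # estimated mixing proportions
mix.probs    # true mixing proportions

mix.2$mu     # estimated component means (a list, one vector per component)
mix.means    # true component means (one row per component)
```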
516 | Read [this](http://www.stat.cmu.edu/~cshalizi/uADA/12/lectures/ch20.pdf) for more information on Finite mixtures.
517 |
518 |
519 | ## Hidden Markov Model (HMM)
520 | ```{r}
521 | # Note: the HiddenMarkov::foo() syntax will not work with this package's methods, so we attach it.
522 | library(HiddenMarkov)
523 |
524 | # Generate data:
525 | (hmm.transition <- matrix(c(1/2, 1/2, 0, 1/3, 1/3, 1/3, 0, 1/2, 1/2), byrow=TRUE, nrow=3))
526 | hmm.probs <- rep(1,3)/3
527 | hmm.distribution <- 'norm'
528 | hmm.params <- list(mean=c(1, 6, 3), sd=c(0.2, 0.2, 0.2))
529 | x <- dthmm(x = NULL, Pi = hmm.transition, delta = hmm.probs, distn = hmm.distribution, pm = hmm.params)
530 | x <- simulate(x, nsim=n)
531 | plot(x$x)
532 | # Can you guess when the hidden state changed?
533 |
534 | # Let's make this harder:
535 | hmm.params <- list(mean=c(1, 6, 3), sd=rep(2,3))
536 | x <- dthmm(NULL, hmm.transition, hmm.probs, hmm.distribution, hmm.params)
537 | x <- simulate(x, nsim=n)
538 | plot(x$x, type='h')
539 |
540 |
541 | # Estimate parameters:
542 | y <- BaumWelch(x)
543 | summary(y)
544 |
545 | # Compare with truth:
546 | hmm.true.state <- x$y
547 | hmm.predict.state <- Viterbi(y)
548 | table(predict=hmm.predict.state, true=hmm.true.state)
549 | ```
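
The confusion table can be summarized with a single error rate. A minimal sketch (assuming no label switching, which is reasonable here since the estimation was initialized at the true parameters):
```{r hmm error rate}
mean(hmm.predict.state != hmm.true.state) # proportion of time points decoded incorrectly
```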
550 |
551 |
552 |
553 | # Clustering
554 | Some tutorials on clustering with R can be found in:
555 |
556 | - [David Hitchcock](http://people.stat.sc.edu/Hitchcock/chapter6_R_examples.txt).
557 | - [QuickR](http://www.statmethods.net/advstats/cluster.html).
558 | - University of California, Riverside, [Institute of Integrative Genome Biology](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R).
559 | - [Phil Spector's](http://www.stat.berkeley.edu/~s133/Cluster2a.html) class notes from the Berkeley Statistics department.
560 | - Montana State University's [Laboratory for Dynamic Synthetic Vegephenomenology](http://ecology.msu.montana.edu/labdsv/R/labs/lab13/lab13.html).
561 |
562 |
563 |
564 | ## K-Means
565 | The following code is an adaptation from [David Hitchcock](http://people.stat.sc.edu/Hitchcock/chapter6_R_examples.txt).
566 | ```{r kmeans}
567 | k <- 2
568 | kmeans.1 <- stats::kmeans(USArrests.1, centers = k)
569 | kmeans.1$cluster # cluster assignments
570 |
571 | # Visualize using scatter plots of the original features
572 | pairs(USArrests.1, panel=function(x,y) text(x,y,kmeans.1$cluster))
573 |
574 | # Visualize in the PC plane:
575 | plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
576 | text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=kmeans.1$cluster)
577 | ```
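
How many clusters should we have asked for? One common heuristic is to look for an "elbow" in the total within-cluster sum of squares as `k` grows. A minimal sketch (the `nstart` argument restarts each fit from several random initializations):
```{r choosing k}
k.grid <- 1:8
within.ss <- sapply(k.grid, function(k) kmeans(USArrests.1, centers = k, nstart = 20)$tot.withinss)
plot(k.grid, within.ss, type = 'b', xlab = 'Number of clusters (k)', ylab = 'Total within-cluster SS')
```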
578 |
579 |
580 | ## K-Means++
581 | Recall that K-Means++ is a smart initialization for K-Means.
582 | The following code is taken from the [r-help](https://stat.ethz.ch/pipermail/r-help/2012-January/300051.html) mailing list.
583 | ```{r kmeansPP}
584 | kmpp <- function(X, k) {
585 | require('pracma')
586 |
587 | n <- nrow(X)
588 | C <- numeric(k)
589 | C[1] <- sample(1:n, 1)
590 |
591 | for (i in 2:k) {
592 | dm <- distmat(X, X[C, ])
593 | pr <- apply(dm, 1, min); pr[C] <- 0
594 | C[i] <- sample(1:n, 1, prob = pr)
595 | }
596 |
597 | kmeans(X, X[C, ])
598 | }
599 |
600 | # Examine output:
601 | kmeans.2 <- kmpp(USArrests.1, k)
602 | kmeans.2$cluster
603 | ```
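
Since both solutions are now available, we can cross-tabulate the two assignment vectors. With `k=2` on this data they will typically agree, possibly up to a relabeling of the clusters:
```{r kmeans vs kmeanspp}
table(kmeans = kmeans.1$cluster, kmeanspp = kmeans.2$cluster)
```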
604 |
605 |
606 | ## K-Medoids
607 | ```{r kmedoids}
608 | kmed.1 <- cluster::pam(x= state.disimilarity, k=2)
609 | kmed.1$clustering
610 |
611 | plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
612 | text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=kmed.1$clustering)
613 | ```
614 | Many other similarity measures can be found in `proxy::dist()`.
615 | See `cluster::clara()` for a massive-data implementation of PAM.
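
A minimal sketch of both ideas (assuming the 'Manhattan' method is registered in `proxy`, and using `cluster::clara()` with its defaults):
```{r pam variations}
# PAM on a different dissimilarity:
state.manhattan <- proxy::dist(USArrests.1, method = 'Manhattan')
kmed.2 <- cluster::pam(x = state.manhattan, k = 2)
table(euclidean = kmed.1$clustering, manhattan = kmed.2$clustering)

# CLARA clusters subsamples, making it suitable for large datasets:
clara.1 <- cluster::clara(USArrests.1, k = 2)
clara.1$clustering
```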
616 |
617 |
618 |
619 | ## Hierarchical Clustering
620 | ```{r Hierarchical Clustering}
621 | # Single linkage:
622 | hirar.1 <- hclust(state.disimilarity, method='single')
623 | plot(hirar.1, labels=rownames(USArrests.1), ylab="Distance")
624 |
625 | # Complete linkage:
626 | hirar.2 <- hclust(state.disimilarity, method='complete')
627 | plot(hirar.2, labels=rownames(USArrests.1), ylab="Distance")
628 |
629 | # Average linkage:
630 | hirar.3 <- hclust(state.disimilarity, method='average')
631 | plot(hirar.3, labels=rownames(USArrests.1), ylab="Distance")
632 |
633 | # Fixing the number of clusters:
634 | cut.2.2 <- cutree(hirar.2, k=2)
635 | cut.2.2 # printing the "clustering vector"
636 |
637 | # Suppose we preferred a 5-cluster solution:
638 | cut.2.5 <- cutree(hirar.2, k=5)
639 | cut.2.5 # printing the "clustering vector"
640 | ```
641 |
642 | Visualizing clusters:
643 | ```{r visualize clusters}
644 | # Visualize using scatter plots of the original features
645 | pairs(USArrests.1, panel=function(x,y) text(x,y,cut.2.5))
646 |
647 | # Visualize in the PC plane:
648 | plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
649 | text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=cut.2.5)
650 | ```
651 |
652 |
653 |
654 | `cluster::agnes()` offers another implementation of agglomerative clustering; it also reports the agglomerative coefficient:
655 | ```{r agnes}
656 | # install.packages('cluster')
657 | library(cluster)
658 | agnes.1 <- agnes(USArrests.1, method = 'complete'); agnes.1$ac # fit, then report the agglomerative coefficient
659 | ```
660 |
661 |
662 | ## QT Clustering
663 | [TODO]
664 | See [here](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R).
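In the meantime, a minimal sketch using `flexclust::qtclust()` (an assumption on my part that this QT-style implementation fits here; the `radius` value below is arbitrary and must be tuned to the data):
```{r qt clustering sketch, eval=FALSE}
# install.packages('flexclust')
library(flexclust)
qt.1 <- qtclust(USArrests.1, radius = 2) # radius: maximum cluster radius (hypothetical value)
clusters(qt.1) # cluster assignments
```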
665 |
666 |
667 | ## Fuzzy Clustering
668 | [TODO]
669 | See [here](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R).
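In the meantime, a minimal sketch with `cluster::fanny()`, which returns a membership degree of every observation in every cluster rather than a hard assignment:
```{r fuzzy fanny}
fanny.1 <- cluster::fanny(USArrests.1, k = 2)
round(fanny.1$membership, 2) # soft assignments
fanny.1$clustering           # the nearest crisp clustering
```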
670 |
671 |
672 | ## Self Organizing Maps (SOMs)
673 | The following is adapted from [Shane Lynn](http://shanelynn.ie/index.php/self-organising-maps-for-customer-segmentation-using-r/).
674 | More details in [this paper](http://www.jstatsoft.org/v21/i05/paper).
675 | If you want hexagons instead of circles, see [this](http://stackoverflow.com/questions/19858729/r-package-kohonen-how-to-plot-hexagons-instead-of-circles-as-in-matlab-som-too).
676 | ```{r som}
677 | library(kohonen)
678 | som.1 <- kohonen::som(USArrests.1, grid = somgrid(6, 6, "hexagonal"))
679 | ```
680 |
681 | Visualize the results:
682 | It may help to keep [this figure](notes/art/som_simulation.png) in mind when interpreting a SOM:
683 | ```{r som plots}
684 | # Segments plot:
685 | plot(som.1)
686 |
687 | # Counts plot:
688 | plot(som.1, type='counts')
689 |
690 | # Quality plot:
691 | plot(som.1, type='quality')
692 |
693 |
694 | # Neighbours Distance plot:
695 | plot(som.1, type='dist.neighbours')
696 |
697 |
698 |
699 | # Property plots: the value of each original feature across the map
700 | property.plot <- function(k) plot(som.1, type='property', property = som.1$codes[,k], main = colnames(som.1$codes)[k])
701 | property.plot(1)
702 | property.plot(2)
703 | property.plot(3)
704 |
705 |
706 | # Clustering:
707 | pretty_palette <- c('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
708 | som.1.cluster <- cutree(hclust(dist(som.1$codes)), 5)
709 | plot(som.1, type="mapping", bgcol = pretty_palette[som.1.cluster], main = "Clusters")
710 | add.cluster.boundaries(som.1, som.1.cluster)
711 | ```
712 | For fancy visualization of `kohonen` SOMs, see [Seth Spielman's](https://github.com/geoss/som_visualization_r) code.
713 |
714 | Other SOM implementations can be found in `som::som()` and `class::SOM()`, but `kohonen` seems the most complete and best documented.
715 |
716 | __Note__: many functions are called `som`. Be careful when loading packages, and make use of the `::` syntax.
717 |
718 |
719 |
720 | ## Spectral Clustering
721 | ```{r spectral clustering}
722 | # install.packages('kernlab')
723 | library(kernlab)
724 |
725 | specc.1 <- kernlab::specc(as.matrix(USArrests.1), centers = 2); specc.1 # cluster assignments (plus the fitted kernel parameters)
726 | ```
727 |
728 |
729 |
730 |
731 | ## Model-Based (Generative) Clustering
732 | ```{r generative clustering}
733 | library(mclust)
734 | mclust.1 <- Mclust(USArrests.1)
735 | summary(mclust.1)
736 |
737 | # By default, the generative Gaussian distributions considered are:
738 | # "EII": spherical, equal volume
739 | # "VII": spherical, unequal volume
740 | # "EEI": diagonal, equal volume and shape
741 | # "VEI": diagonal, varying volume, equal shape
742 | # "EVI": diagonal, equal volume, varying shape
743 | # "VVI": diagonal, varying volume and shape
744 | # "EEE": ellipsoidal, equal volume, shape, and orientation
745 | # "EEV": ellipsoidal, equal volume and equal shape
746 | # "VEV": ellipsoidal, equal shape
747 | # "VVV": ellipsoidal, varying volume, shape, and orientation
748 |
749 | # Plotting the BIC values (which is possible for generative methods)
750 | plot(mclust.1, data=USArrests, what="BIC")
751 | # The best solution is VEI with 3 clusters.
752 |
753 | # The clustering:
754 | mclust.1$classification
755 |
756 | # This gives the probabilities of belonging to each cluster for every object:
757 | round(mclust.1$z,2)
758 | ```
759 |
760 |
761 | Visualizing the clusters:
762 | ```{r visualize generative clustering}
763 | # Visualize using scatter plots of the original features
764 | pairs(USArrests.1, panel=function(x,y) text(x, y, mclust.1$classification))
765 |
766 | # Visualize in the PC plane:
767 | plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
768 | text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=mclust.1$classification)
769 | ```
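
Since we now have several clusterings of the same states, it is natural to ask how much they agree. A minimal sketch using the adjusted Rand index (1 means identical clusterings; values near 0 mean chance-level agreement):
```{r compare clusterings}
mclust::adjustedRandIndex(mclust.1$classification, kmeans.1$cluster)
mclust::adjustedRandIndex(mclust.1$classification, cut.2.2)
```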
770 |
771 |
--------------------------------------------------------------------------------