├── .gitignore ├── Intro2R.Rmd ├── Intro2R.Rproj ├── LICENSE ├── README.md ├── all_code.Rmd ├── ensembles.Rmd ├── heights.txt ├── make_samples.R ├── massive_data.Rmd ├── notes ├── .gitignore ├── Intro2MachineLearning.bib ├── Intro2R.txss ├── appendices.tex ├── art │ ├── avoid-overfitting.png │ ├── bias_variance.png │ ├── censored.pdf │ ├── imputing.pdf │ ├── irrelevant-features-hurt-knn-clustering.png │ ├── irrelevant-features.png │ ├── non-linear-basis-functions.png │ ├── som_simulation.png │ ├── support-vector-machine-15-728.jpg │ ├── uncensored.pdf │ └── why-complex-models-can-turn-out-to-be-less-probable.png ├── collaborative.tex ├── commands.tex ├── estimation.tex ├── graphics.Rmd ├── introduction.tex ├── notes.loa ├── notes.loe ├── notes.pdf ├── notes.tex ├── statistical_decision.tex ├── supervised.tex └── unsupervised.tex ├── project.Rmd ├── sample_questions.Rmd ├── sample_questions.pdf ├── self_practice.Rmd ├── supervised.Rmd └── unsupervised.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | # Example code in package build process 4 | *-Ex.R 5 | # R data files from past sessions 6 | .Rdata 7 | # RStudio files 8 | .Rproj.user/ 9 | .Rproj.user 10 | notes.Rmd 11 | 12 | # LaTeX files 13 | *.aux 14 | *.glo 15 | *.idx 16 | *.log 17 | *.toc 18 | *.ist 19 | *.acn 20 | *.acr 21 | *.alg 22 | *.bbl 23 | *.blg 24 | *.dvi 25 | *.glg 26 | *.gls 27 | *.ilg 28 | *.ind 29 | *.lof 30 | *.lot 31 | *.maf 32 | *.mtc 33 | *.mtc1 34 | *.out 35 | *.synctex.gz 36 | 37 | # Questions 38 | questions.* 39 | AmitClass 40 | 41 | # Other 42 | test_* 43 | 2010* 44 | Questions 45 | sample_questions_cache/ 46 | sample_questions_files/ 47 | Exam 48 | -------------------------------------------------------------------------------- /Intro2R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Intro2R" 3 | author: "Jonathan Rosenblatt" 4 | date: "March 18, 2015" 5 | output: html_document 6 | 7 | --- 8 | # R Basics 9 | 10 | Tips for this introduction: 11 | - If you are working alone, consider starting with "An Introduction to R" here: 12 | http://cran.r-project.org/manuals.html 13 | - Make sure you use RStudio. 14 | - Ctrl+return to run lines from editor. 15 | - alt+shift+k for RStudio keyboard shortcuts. 16 | - Ctrl+alt+j to navigate between sections 17 | - tab for auto-completion 18 | - Ctrl+1 to skip to editor. 19 | - Ctrl+2 to skip to console. 20 | - Ctrl+8 to skip to the environment list. 21 | - Folding: 22 | - alt+l collapse chunk. 23 | - alt+shift+l unfold chunk. 24 | - alt+o collapse all. 25 | - alt+shift+o unfold all. 26 | 27 | 28 | 29 | ## Simple calculator 30 | ```{r example} 31 | 10+5 32 | 70*81 33 | 2**4 34 | 2^4 35 | log(10) 36 | log(16, 2) 37 | log(1000, 10) 38 | ``` 39 | 40 | 41 | ## Probability calculator 42 | Wish you knew this when you did Intro To Probability class? 
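R names its distribution functions with a d/p/q/r prefix: `d` for the density (or mass) function, `p` for the CDF, `q` for the quantile function, and `r` for random draws. The binomial examples below follow this convention; as a quick sketch, the same prefixes work for the normal distribution:

```{r}
dnorm(x=0, mean=0, sd=1) # density of N(0,1) at 0
pnorm(q=1.96)            # For X~N(0,1) returns P(X<=1.96)
qnorm(p=0.975)           # the 97.5% quantile of N(0,1)
rnorm(n=5)               # five random draws from N(0,1)
```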
43 | ```{r} 44 | dbinom(x=3, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns P(X=3) 45 | dbinom(3, 10, 0.5) 46 | 47 | pbinom(q=3, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns P(X<=3) 48 | dbinom(x=0, size=10, prob=0.5)+dbinom(x=1, size=10, prob=0.5)+dbinom(x=2, size=10, prob=0.5)+dbinom(x=3, size=10, prob=0.5) # Same as previous 49 | 50 | qbinom(p=0.1718, size=10, prob=0.5) # For X~B(n=10, p=0.5) returns k such that P(X<=k)=0.1718 51 | 52 | rbinom(n=1, size=10, prob=0.5) 53 | rbinom(n=10, size=10, prob=0.5) 54 | rbinom(n=100, size=10, prob=0.5) 55 | ``` 56 | 57 | 58 | ## Getting help 59 | Get help for a particular function. 60 | ```{r, eval=FALSE} 61 | ?dbinom 62 | help(dbinom) 63 | ``` 64 | 65 | Search local help files for a particular string. 66 | ```{r, eval=FALSE} 67 | ??binomial 68 | help.search('dbinom') 69 | ``` 70 | 71 | Load a menu with several important manuals: 72 | ```{r, eval=FALSE} 73 | help.start() 74 | ``` 75 | 76 | 77 | ## Variable asignment: 78 | Assignments into a variable named "x": 79 | ```{r} 80 | x = rbinom(n=1000, size=10, prob=0.5) # Works. Bad style. 81 | x <- rbinom(n=1000, size=10, prob=0.5) # Asignments into a variable named "x" 82 | ``` 83 | More on style: http://adv-r.had.co.nz/Style.html 84 | 85 | 86 | Print contents: 87 | ```{r} 88 | x 89 | print(x) 90 | (x <- rbinom(n=1000, size=10, prob=0.5)) # Assign and print. 91 | ``` 92 | 93 | 94 | Operate on the object 95 | ```{r} 96 | mean(x) 97 | var(x) 98 | hist(x) 99 | rm(x) # remove variable 100 | ``` 101 | 102 | 103 | For more information on distributions see http://cran.r-project.org/web/views/Distributions.html 104 | 105 | 106 | ## Piping for better style and readability 107 | ```{r} 108 | # install.packages('magrittr') 109 | library(magrittr) 110 | ``` 111 | 112 | ```{r} 113 | x <- rbinom(n=1000, size=10, prob=0.5) 114 | 115 | x %>% var() # Instead of var(x) 116 | x %>% hist() # Instead of hist(x) 117 | x %>% mean() %>% round(2) %>% add(10) 118 | ``` 119 | 120 | This example clearly demonstrates the benefits (from http://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) 121 | ```{r} 122 | # Functional (onion) style 123 | car_data <- 124 | transform(aggregate(. ~ cyl, 125 | data = subset(mtcars, hp > 100), 126 | FUN = function(x) round(mean(x, 2))), 127 | kpl = mpg*0.4251) 128 | 129 | 130 | # magrittr style 131 | car_data <- 132 | mtcars %>% 133 | subset(hp > 100) %>% 134 | aggregate(. ~ cyl, data = ., FUN = . 
%>% mean %>% round(2)) %>% 135 | transform(kpl = mpg %>% multiply_by(0.4251)) %>% 136 | print 137 | ``` 138 | 139 | 140 | ## Vector creation and manipulation 141 | ```{r} 142 | c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21) 143 | 10:21 144 | seq(from=10, to=21, by=1) 145 | x <- seq(from=10, to=21, by=2) 146 | x <- c(10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21) 147 | x 148 | ``` 149 | 150 | 151 | 152 | You can assign AFTER the computation is finished: 153 | ```{r} 154 | c(1,2,3) 155 | y<- .Last.value 156 | y 157 | ``` 158 | 159 | 160 | Operations usually work element-wise: 161 | ```{r} 162 | x+2 163 | x*2 164 | x^2 165 | sqrt(x) 166 | log(x) 167 | ``` 168 | 169 | 170 | ## Simple plotting 171 | ```{r} 172 | x<- 1:100; y<- 3+sin(x) # Create arbitrary data 173 | plot(x = x, y = y) # x,y syntax 174 | plot(y ~ x) # y~x syntax (I like better) 175 | ``` 176 | 177 | Control plot appearance: 178 | ```{r} 179 | plot(y~x, type='l', main='Plotting a connected line') 180 | plot(y~x, type='h', main='Sticks plot', xlab='Insert x axis label', ylab='Insert y axis label') 181 | plot(y~x, pch=5) 182 | plot(y~x, pch=10, type='p', col='blue', cex=4) 183 | abline(3, 0.002) 184 | ``` 185 | 186 | Available plotting options 187 | ```{r, eval=FALSE} 188 | example(plot) 189 | example(points) 190 | ?plot 191 | help(package='graphics') 192 | ``` 193 | 194 | When your plotting gets serious, move to `ggplot2` and `ggvis` as soon as possible. 195 | 196 | 197 | 198 | 199 | ___ 200 | 201 | 202 | ## Data frame Manipulation 203 | `data.frames` extend the `matrix` class, in that they allow the binding of vectors of several classes (with same length). 204 | ```{r} 205 | x<- 1:100; y<- 3 + sin(x) 206 | class(x) # R (high) level representation of an object. 207 | 208 | # mode(x) 209 | # typeof(x) 210 | ``` 211 | 212 | 213 | Create and checkout your first data frame 214 | ```{r} 215 | frame1 <- data.frame(x=x, sin=y) 216 | frame1 217 | head(frame1) 218 | frame1 %>% head() # just print the beginning 219 | frame1 %>% View() # Excel-like view (never edit!) 220 | 221 | class(frame1) # the object is of type data.frame 222 | dim(frame1) 223 | dim(x) 224 | length(frame1) 225 | length(x) 226 | 227 | str(frame1) # the inner structure of an object 228 | attributes(frame1) # get the object's meta data 229 | ``` 230 | 231 | ### Exctraction 232 | single element: 233 | ```{r} 234 | frame1[1, 2] 235 | frame1[2, 1] 236 | ``` 237 | 238 | Extract _column_ by index: 239 | ```{r} 240 | frame1[1, ] 241 | frame1[,1] %>% t 242 | frame1[,1] %>% t %>% dim 243 | ``` 244 | 245 | Extract column by name: 246 | ```{r} 247 | names(frame1) 248 | frame1[, 'sin'] 249 | dim(frame1[, 'sin']) # extract as a vector. no dim attribute. 250 | frame1['sin'] 251 | dim(frame1['x',]) # extract as a data.frame. has dim attribute. 252 | frame1[,1:2] %>% class 253 | frame1[2] %>% class 254 | frame1[2, ] # extract a row 255 | 256 | frame1$sin %>% class 257 | ``` 258 | 259 | `subset()` does the same 260 | ```{r} 261 | subset(frame1, select=sin) 262 | subset(frame1, select=2) 263 | subset(frame1, select= c(2,0)) 264 | ``` 265 | 266 | 267 | Sanity conservation notice! 268 | Always think if you want to extract a vector or a frame: 269 | - Note the difference between `[]` and `[[]]` extraction! 270 | - Note the difference between `frame[,1]` and `frame[1]`. 271 | ```{r} 272 | a <- frame1[1] 273 | b <- frame1[[1]] 274 | a==b # Seems identical. 
But not really: 275 | class(a) 276 | class(b) 277 | # Causes different behaviour: 278 | a[1] 279 | b[1] 280 | ``` 281 | 282 | More about extraction: http://adv-r.had.co.nz/Subsetting.html 283 | 284 | ### dplyr package 285 | `dplyr` makes the manipulation of data.frames a breeze. 286 | It is very fast, and straightforward to use. 287 | 288 | Install the package: 289 | ```{r} 290 | # install.packages('dplyr') 291 | ``` 292 | 293 | The following examples are taken from: 294 | https://github.com/justmarkham/dplyr-tutorial/blob/master/dplyr-tutorial.Rmd 295 | ```{r} 296 | # install.packages('nycflights13') 297 | library(nycflights13) 298 | dim(flights) 299 | View(flights) 300 | names(flights) 301 | class(flights) # a tbl_df is an extension of the data.frame class 302 | library(dplyr) # calling dplyr 303 | 304 | filter(flights, month == 1, day == 1) #dplyr style 305 | flights[flights$month == 1 & flights$day == 1, ] # old style 306 | flights %>% filter(month == 1, day == 1) # dplyr with magrittr style (yes!) 307 | 308 | filter(flights, month == 1 | month == 2) 309 | slice(flights, 1:10) # selects rows 310 | 311 | arrange(flights, year, month, day) # sort 312 | arrange(flights, desc(arr_delay)) # sort descending 313 | 314 | select(flights, year, month, day) # select columns 315 | select(flights, year:day) # select column range 316 | select(flights, -(year:day)) # drop columns 317 | rename(flights, tail_num = tailnum) # rename variables 318 | # add a new computed colume 319 | mutate(flights, 320 | gain = arr_delay - dep_delay, 321 | speed = distance / air_time * 60) 322 | # you can refer to columns just created! 323 | mutate(flights, 324 | gain = arr_delay - dep_delay, 325 | gain_per_hour = gain / (air_time / 60) 326 | ) 327 | # keep only new variables 328 | transmute(flights, 329 | gain = arr_delay - dep_delay, 330 | gain_per_hour = gain / (air_time / 60) 331 | ) 332 | # simple statistics 333 | summarise(flights, 334 | delay = mean(dep_delay, na.rm = TRUE) 335 | ) 336 | 337 | sample_n(flights, 10) # random subsample 338 | sample_frac(flights, 0.01) # random subsample 339 | ``` 340 | 341 | Subgroup operations 342 | ```{r} 343 | by_tailnum <- group_by(flights, tailnum) 344 | by_tailnum %>% class # a groupping object 345 | delay <- summarise(by_tailnum, 346 | count = n(), 347 | avg.dist = mean(distance, na.rm = TRUE), 348 | avg.delay = mean(arr_delay, na.rm = TRUE)) 349 | delay <- filter(delay, count > 20, avg.dist < 2000) 350 | View(delay) 351 | 352 | destinations <- group_by(flights, dest) 353 | summarise(destinations, 354 | planes = n_distinct(tailnum), 355 | flights = n() 356 | ) 357 | 358 | # Grouping works in a hirarchy. summarise() peels outer layer. 359 | daily <- group_by(flights, year, month, day) 360 | (per_day <- summarise(daily, flights = n())) 361 | (per_month <- summarise(per_day, flights = sum(flights))) 362 | (per_year <- summarise(per_month, flights = sum(flights))) 363 | ``` 364 | 365 | 366 | 367 | 368 | 369 | Two table operations 370 | ```{r} 371 | airlines %>% View 372 | flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier) 373 | 374 | flights2 %>% left_join(airlines) # join on left table with automatic matching. 
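# left_join() keeps every row of the left table (flights2) and matches
# automatically on the shared column (here "carrier"), filling unmatched rows with NA.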
375 | 376 | flights2 %>% left_join(weather) 377 | 378 | flights2 %>% left_join(planes, by = "tailnum") # with named matching 379 | 380 | flights2 %>% left_join(airports, c("dest" = "faa")) 381 | 382 | flights2 %>% left_join(airports, c("origin" = "faa")) 383 | ``` 384 | 385 | Types of join 386 | ```{r} 387 | (df1 <- data_frame(x = c(1, 2), y = 2:1)) 388 | (df2 <- data_frame(x = c(1, 3), a = 10, b = "a")) 389 | 390 | df1 %>% inner_join(df2) # SELECT * FROM x JOIN y ON x.a = y.a 391 | 392 | df1 %>% left_join(df2) # SELECT * FROM x LEFT JOIN y ON x.a = y.a 393 | 394 | df1 %>% right_join(df2) # SELECT * FROM x RIGHT JOIN y ON x.a = y.a 395 | df2 %>% left_join(df1) 396 | 397 | df1 %>% full_join(df2) # SELECT * FROM x FULL JOIN y ON x.a = y.a 398 | 399 | # return only unmatched cases 400 | flights %>% 401 | anti_join(planes, by = "tailnum") %>% 402 | count(tailnum, sort = TRUE) 403 | # SELECT * FROM x WHERE NOT EXISTS (SELECT 1 FROM y WHERE x.a = y.a) 404 | 405 | df1 %>% semi_join(df2, by = "x") # SELECT * FROM x WHERE EXISTS (SELECT 1 FROM y WHERE x.a = y.a) 406 | ``` 407 | 408 | Set operations 409 | ```{r} 410 | (df1 <- data_frame(x = 1:2, y = c(1L, 1L))) 411 | (df2 <- data_frame(x = 1:2, y = 1:2)) 412 | 413 | intersect(df1, df2) # SELECT * FROM x INTERSECT SELECT * FROM y 414 | 415 | union(df1, df2) # SELECT * FROM x UNION SELECT * FROM y 416 | 417 | setdiff(df1, df2) # SELECT * FROM x EXCEPT SELECT * FROM y 418 | 419 | setdiff(df2, df1) 420 | ``` 421 | 422 | Leaving dplyr for now... 423 | 424 | 425 | ## Data Import and export 426 | 427 | __Note__: The [readr](https://github.com/hadley/readr) package facilitates and accelerates data importing. This section should be updated to use it. 428 | 429 | For a complete review see: 430 | http://cran.r-project.org/doc/manuals/R-data.html 431 | also in help.start() -> "Import and Export Manual" 432 | 433 | 434 | ### Import from WEB 435 | `read.table()` is the main importing workhorse. 436 | ```{r} 437 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data' 438 | tirgul1 <- read.table(URL) 439 | ``` 440 | 441 | Always look at the imported result! 442 | ```{r} 443 | View(tirgul1) 444 | # hmmm... header interpreted as data. Fix with header=TRUE: 445 | tirgul1 <- read.table(URL, header = TRUE) 446 | View(tirgul1) 447 | ``` 448 | 449 | ### Import .csv files 450 | Let's write a simple file so that we have something to import: 451 | ```{r} 452 | View(airquality) # examine the data to export 453 | (temp.file.name <- tempfile()) # get an arbitrary file name 454 | write.csv(x = airquality, file = temp.file.name) #export 455 | ``` 456 | 457 | Now let's import: 458 | ```{r} 459 | # my.data<- read.csv(file='/home/jonathan/Projects/...') 460 | my.data<- read.csv(file=temp.file.name) 461 | View(my.data) 462 | ``` 463 | 464 | __Note__: Under MS Windows(R) you might want need '\\\' instead of '/' 465 | 466 | ### Imprt .txt files 467 | Tries to guess the separator 468 | ```{r, eval=FALSE} 469 | my.data<- read.table(file='C:\\Documents and Settings\\Jonathan\\My Documents\\...') # 470 | ``` 471 | Specifies the separator explicitly 472 | ```{r, eval=FALSE} 473 | my.data<- read.delim(file='C:\\Documents and Settings\\Jonathan\\My Documents\\...') 474 | ``` 475 | If you care about your sanity, see ?read.table before starting imports. 476 | 477 | ### Writing Data to files 478 | 479 | Get and set the current directory: 480 | ```{r, eval=FALSE} 481 | getwd() #What is the working directory? 
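# setwd() takes the new directory as a string, e.g. setwd('~/Projects') (a hypothetical path).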
482 | setwd() #Setting the working directory in Linux 483 | ``` 484 | 485 | ```{r} 486 | write.csv(x=tirgul1, file='/tmp/tirgul1.csv') # 487 | ``` 488 | 489 | See ?write.table for details. 490 | 491 | ### .XLS(X) files 492 | Strongly recommended to convert to .csv 493 | If you still insist see: 494 | http://cran.r-project.org/doc/manuals/R-data.html#Reading-Excel-spreadsheets 495 | 496 | ### Massive files 497 | Better store as matrices and not data.frames. 498 | `scan()` is faster than `read.table()` but less convenient: 499 | 500 | Create the example data: 501 | ```{r} 502 | cols<- 1e3 503 | # Note: On Windoes you might neet to change /tmp/A.txt to /temp/A.txt 504 | rnorm(cols^2) %>% 505 | matrix(ncol=cols) %>% 506 | write.table(file='/tmp/A.txt', col.names= F, row.names= F) 507 | # Measure speed of import: 508 | system.time(A<- read.table('/tmp/A.txt', header=F)) 509 | system.time(A <- scan(file='/tmp/A.txt', n = cols^2) %>% 510 | matrix(ncol=cols, byrow = TRUE)) 511 | 512 | file.remove('/tmp/A.txt') 513 | ``` 514 | 515 | This matter will be revisited in the last class. 516 | 517 | ### Databases 518 | Start [here](https://rforanalytics.wordpress.com/useful-links-for-r/odbc-databases-for-r/) 519 | 520 | ### Hands on example (from the WEB) 521 | ```{r} 522 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data' 523 | tirgul1 <- read.table(URL, header = TRUE) 524 | 525 | names(tirgul1) 526 | tirgul1 %>% head 527 | tirgul1 %>% tail 528 | View(tirgul1) 529 | dim(tirgul1) 530 | length(tirgul1) 531 | ``` 532 | 533 | R can be object oriented (read about S3 and S4 if interested). 534 | See how `summary()` behaves differently on different object classes: 535 | ```{r} 536 | class(tirgul1[, 1]); class(tirgul1[, 2]); class(tirgul1[, 3]); class(tirgul1[, 4]) 537 | summary(tirgul1) 538 | ``` 539 | 540 | 541 | 542 | Matrix is more efficient than data frames. But can store only a single class of vectors. 543 | ```{r} 544 | tirgul.matrix <- as.matrix(tirgul1) 545 | tirgul.matrix 546 | class(tirgul.matrix) 547 | # notice everything has been cast to the most general class. 548 | class(tirgul.matrix[, 1]); class(tirgul.matrix[, 2]); class(tirgul.matrix[, 3]); class(tirgul.matrix[, 4]) 549 | summary(tirgul.matrix) 550 | ``` 551 | 552 | Note: if copy-pasting an expression bothers you (as it should!), here are some solutions: 553 | ```{r} 554 | # The apply family of functions: 555 | sapply(tirgul.matrix, class) 556 | 557 | # looping 558 | for(j in 1:ncol(tirgul.matrix)) print(class(tirgul.matrix[,j])) 559 | ``` 560 | 561 | Make sure you read `?sapply`. 562 | LISP fans might also like to read `?MAP`. 563 | 564 | 565 | 566 | Operations _within_ data objects: 567 | ```{r} 568 | plot(tirgul1$gender) 569 | tirgul1$gender %>% plot() # 570 | with(tirgul1, plot(gender) ) # Same opration. Different syntax. 571 | 572 | mean(tirgul1$age) 573 | tirgul1$age %>% mean() # 574 | with(tirgul1, mean(age) ) # Same opration. Different syntax. 575 | ``` 576 | 577 | 578 | ```{r} 579 | tirgul1$age <- tirgul1$age * 365 580 | tirgul1<- transform(tirgul1, age=age*365 ) #Age in days 581 | with(tirgul1, mean(age) ) 582 | tirgul1<- transform(tirgul1, age=age/365 ) #Does this revert back to years? 583 | with(tirgul1, mean(age) ) 584 | ``` 585 | 586 | Then again, many of these functions are replaced by more friendly functions in the `dplyr` package (see below). 
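For example, a minimal sketch of the same age conversion with `dplyr::mutate()`, assuming `tirgul1` (the bone data imported above) is still in the workspace:

```{r}
library(dplyr)
tirgul1 <- tirgul1 %>% mutate(age = age * 365) # age in days
tirgul1 %>% summarise(mean.age = mean(age))
tirgul1 <- tirgul1 %>% mutate(age = age / 365) # back to years
```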
587 | 588 | 589 | ## Sorting 590 | ```{r} 591 | (x<- c(20, 11, 13, 23, 7, 4)) 592 | (y<- sort(x)) 593 | (ord<- order(x)) 594 | x[ord] # Exctracting along the order is the same as sorting. 595 | ranks<- rank(x) 596 | identical(y[ranks] , x) # Compares two objects 597 | 598 | (z<- c('b','a','c','d','e','z')) 599 | xz<- data.frame(x,z) 600 | sort(xz) 601 | xz[ord,] # Sorting a data frame using one column 602 | ``` 603 | 604 | 605 | ## Looping 606 | For a crash course in R programming (not only data analysis) try: 607 | http://adv-r.had.co.nz/ 608 | The usual for(), while(), repeat() 609 | ```{r} 610 | for (i in 1:100){ 611 | print(i) 612 | } 613 | ``` 614 | 615 | 616 | ```{r} 617 | for (helloeveryone in seq(10, 100, by=2) ){ 618 | print(helloeveryone) 619 | } 620 | ``` 621 | 622 | 623 | ## Recursion 624 | Typically very slow due to memory management issues. 625 | 626 | ```{r} 627 | fib<-function(n) { 628 | if (n < 2) fn<-1 629 | else fn<-Recall(n - 1) + Recall(n - 2) 630 | return(fn) 631 | } 632 | fib(30) 633 | ``` 634 | 635 | 636 | ## Finding your objects 637 | ```{r} 638 | ls() #Lists all available objects 639 | ls(pattern='x') 640 | 641 | ls(pattern='[0-9]') # Search using regular expressions 642 | ls(pattern='[A-Z]') 643 | ``` 644 | 645 | Ctrl+8 in RStudio. 646 | 647 | 648 | 649 | 650 | # Univariate Exploratory Statistics 651 | 652 | 653 | ## Exploring Categorical Variables 654 | ```{r} 655 | gender <- c(rep('Boy', 10), rep('Girl', 12)) 656 | drink <- c(rep('Coke', 5), rep('Sprite', 3), rep('Coffee', 6), rep('Tea', 7), rep('Water', 1)) 657 | class(gender);class(drink) 658 | 659 | cbind(gender, drink) 660 | table1 <- table(gender, drink) 661 | table1 662 | ``` 663 | 664 | 665 | 666 | 667 | ## Exploring Continous Variables 668 | 669 | Generating and exploring data 670 | ```{r} 671 | sample1 <- rnorm(100) 672 | table(sample1) 673 | hist(sample1, freq=T, main='Counts') 674 | hist(sample1, freq=F, main='Frequencies') 675 | lines(density(sample1)) 676 | rug(sample1) 677 | ``` 678 | 679 | 680 | ## The Boxplot 681 | ```{r} 682 | boxplot(sample1) 683 | ``` 684 | 685 | 686 | 687 | Several different visualizations: 688 | ```{r} 689 | sample2<-rnorm(1000) 690 | stem(sample2) 691 | hist(sample2) 692 | plot(density(sample2)) 693 | rug(sample2) 694 | ``` 695 | 696 | 697 | 698 | True data 699 | ```{r} 700 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data' 701 | bone <- read.table(URL, header = TRUE) 702 | names(bone) 703 | summary(bone) 704 | stripchart(bone['age']) 705 | stem(bone[, 'age']) 706 | hist(bone[, 'age'], prob=T) 707 | lines(density(bone[, 'age'])) 708 | with(bone, rug(age)) 709 | 710 | ind<-bone[, 'gender']=='male' 711 | 712 | boxplot(bone$age~bone$gender) 713 | ``` 714 | 715 | 716 | ## Graphical parameters 717 | ```{r} 718 | attach(bone) 719 | stripchart(age) 720 | stripchart(age~gender) 721 | stripchart(age~gender, v=T) 722 | 723 | boxplot(age~gender) 724 | boxplot(age~gender, horizontal=T, col=c('pink','lightblue') ) 725 | title(main='Amazing Boxplots!') 726 | title(sub="Well actually.. 
I've seen better Boxplots") 727 | 728 | plot(density(age), main='') 729 | plot(density(age), main='', type='h') 730 | plot(density(age), main='', type='o') 731 | plot(density(age), main='', type='p') 732 | plot(density(age), main='', type='l') 733 | 734 | ?plot.default 735 | 736 | plot(density(age),main='') 737 | rug(age) 738 | boxplot(age, add=T, horizontal=T, at=0.02, boxwex=0.05, col='grey') 739 | title(expression(alpha==f[i] (beta))) 740 | example(plotmath) 741 | 742 | par(mfrow=c(2,1)) 743 | (males<- gender=='male') 744 | plot(density(age[males]), main='Male') ; rug(age[males]) 745 | plot(density(age[!males]), main='Female') ; rug(age[!males]) 746 | 747 | range(age) 748 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males]) 749 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males]) 750 | par(mfrow=c(1,2)) 751 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males]) 752 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males]) 753 | 754 | par(mfrow=c(1,1),ask=T) 755 | plot(density(age[males]), main='Male', xlim=c(9,26)) ; rug(age[males]) 756 | plot(density(age[!males]), main='Female', xlim=c(9,26)) ; rug(age[!males]) 757 | ``` 758 | 759 | 760 | ## Integer data 761 | Integer data will most certainly produce overlaps if plotted. Either add hitter, or treat as discrete. 762 | ```{r} 763 | r.age<-round(age) 764 | plot(density(r.age)) 765 | rug(r.age) 766 | plot(density(r.age, from=9)) 767 | rug(jitter(r.age)) 768 | hist(r.age) 769 | rug(jitter(r.age)) 770 | ``` 771 | 772 | 773 | ## Plotting 774 | 775 | ### Preparing data for plotting 776 | 2D data can be in either _wide_ or _long_ format. 777 | Most R functions are designed for long formats. 778 | Let's start by trying to plot in the wide format. 779 | Notice each dosage is plotted separately (yes, I could have looped). 780 | ```{r} 781 | wide.data<-data.frame(id=1:4, age=c(40,50,60,50), dose1=c(1,2,1,2),dose2=c(2,1,2,1), dose4=c(3,3,3,3)) 782 | wide.data 783 | 784 | plot(dose1~age, data=wide.data, ylim=range(c(dose1,dose2,dose4)), ylab='') 785 | points(dose2~age, data=wide.data, pch=2) 786 | points(dose4~age, data=wide.data, pch=3) 787 | ``` 788 | 789 | 790 | Plotting in long format is much easier. 791 | I will first convert the data manually. 792 | ```{r} 793 | (dose.type<-c( 794 | rep('dose1', length(wide.data$dose1)), 795 | rep('dose2', length(wide.data$dose2)), 796 | rep('dose4', length(wide.data$dose4)))) 797 | (dose<- c(wide.data$dose1,wide.data$dose2,wide.data$dose4)) 798 | (long.id<- rep(wide.data$id,3)) 799 | (long.age<- rep(wide.data$age,3)) 800 | 801 | long.data <- data.frame(long.id, long.age, dose.type, dose) 802 | View(long.data) 803 | 804 | plot(dose~long.age, data=long.data, pch=as.numeric(dose.type)) 805 | ``` 806 | I will now try to avoid this manual reshaping. 807 | 808 | ### Reshaping data with `tidyr` package 809 | 810 | This is the package I recommend if you cannot reshape manually. 
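As a minimal sketch, the manual reshaping above can be reproduced with `gather()`, assuming `wide.data` from the previous section is still in the workspace:

```{r}
library(tidyr)
wide.data %>% gather(dose.type, dose, dose1:dose4) # wide to long
```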
811 | Example from [here](http://blog.rstudio.org/2014/07/22/introducing-tidyr/) 812 | ```{r} 813 | library(tidyr) 814 | library(dplyr) 815 | 816 | # Data in wide format: 817 | messy <- data.frame( 818 | name = c("Wilbur", "Petunia", "Gregory"), 819 | a = c(67, 80, 64), 820 | b = c(56, 90, 50) 821 | ) 822 | messy 823 | 824 | # Convert to long format: 825 | messy %>% gather(drug, heartrate, a:b) 826 | ``` 827 | 828 | ```{r} 829 | # Another example- from wide to long: 830 | set.seed(10) 831 | messy <- data.frame( 832 | id = 1:4, 833 | trt = sample(rep(c('control', 'treatment'), each = 2)), 834 | work.T1 = runif(4), 835 | home.T1 = runif(4), 836 | work.T2 = runif(4), 837 | home.T2 = runif(4) 838 | ) 839 | messy %>% head 840 | tidier <- messy %>% gather(key, time, -id, -trt) 841 | tidier %>% head(8) 842 | 843 | # From long to wide 844 | tidy <- tidier %>% 845 | separate(key, into = c("location", "time"), sep = "\\.") 846 | tidy %>% head(8) 847 | ``` 848 | 849 | ### Fancy Plotting 850 | ```{r} 851 | library(ggplot2) 852 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/bone.data' 853 | bone <- read.table(URL, header = TRUE) 854 | qplot(spnbmd, data=bone) 855 | qplot(x=gender, y=spnbmd, data=bone, geom='boxplot') 856 | qplot(spnbmd, data=bone, geom='histogram')+ facet_wrap(~gender) 857 | qplot(spnbmd, data=bone, geom='density')+ facet_wrap(~gender) 858 | qplot(spnbmd, data=bone)+ geom_density(col='red', size=1)+ facet_wrap(~gender) 859 | qplot(spnbmd, data=bone, fill=gender, geom='density', alpha=1) 860 | ``` 861 | 862 | Diamonds example (Taken from Wickham's web site: http://had.co.nz/stat405/) 863 | ```{r} 864 | ?diamonds 865 | dim(diamonds) 866 | head(diamonds) 867 | ``` 868 | 869 | ```{r} 870 | qplot(carat, data = diamonds) 871 | qplot(carat, data = diamonds, binwidth = 1) 872 | qplot(carat, data = diamonds, binwidth = 0.1) 873 | qplot(carat, data = diamonds, binwidth = 0.01) 874 | resolution(diamonds$carat) 875 | last_plot() + xlim(0, 3) 876 | 877 | qplot(depth, data = diamonds, binwidth = 0.2) 878 | qplot(depth, data = diamonds, binwidth = 0.2,fill = cut) + xlim(55, 70) 879 | qplot(depth, data = diamonds, binwidth = 0.562) +xlim(55, 70) + facet_wrap(~ cut) 880 | 881 | qplot(table, price, data = diamonds) 882 | qplot(table, price, data = diamonds, geom = "boxplot") 883 | qplot(table, price, data = diamonds, geom="boxplot",group = round(table)) 884 | 885 | qplot(carat, price, data = diamonds) 886 | qplot(carat, price, data = diamonds, alpha = I(1/10)) 887 | 888 | qplot(carat, price, data = diamonds, geom = "bin2d", main='Count Heatmap') 889 | qplot(carat, price, data = diamonds, geom = "hex") 890 | qplot(carat, price, data = diamonds) + geom_smooth() 891 | ``` 892 | 893 | 894 | For more information on ggplot2 see http://had.co.nz/ggplot2 895 | 896 | 897 | ## The QQ plot 898 | A simple and efficient tool to compare between distributions. 899 | ```{r} 900 | mystery.2<-function(y) { 901 | n<-length(y) 902 | y<-sort(y) 903 | i<-1:n 904 | q<-(i-0.5)/n 905 | x<-qnorm(q, mean(y), sqrt(var(y))) 906 | plot(y~x, xlab='Theoretical Quantiles', ylab='Empirical Quantiles') 907 | } 908 | 909 | normals.1<-rnorm(100, 0, 1); hist(normals.1) 910 | mystery.2(normals.1); abline(0, 1) 911 | 912 | normals.2<-rnorm(100, 0, 10); hist(normals.2) 913 | mystery.2(normals.2); abline(0, 1) 914 | 915 | ## No need to write the function every time... 916 | qqnorm(normals.1) 917 | qqnorm(normals.2) 918 | 919 | ## How would non-normal observations look? 
## 920 | non.normals.1<-runif(100); hist(non.normals.1) 921 | mystery.2(non.normals.1); abline(0, 1) 922 | 923 | non.normals.2<-rexp(100, 1); hist(non.normals.2) 924 | mystery.2(non.normals.2); abline(0, 1) 925 | 926 | non.normals.3<-rgeom(100, 0.5); hist(non.normals.3) 927 | mystery.2(non.normals.3); abline(0, 1) 928 | 929 | ## Adapting for a non-normal distribution: ## 930 | qq.uniform<-function(y) { 931 | n<-length(y); y<-sort(y); i<-1:n; q<-(i-0.5)/n 932 | x<-qunif(q, min=min(y), max=max(y)) #each disribution will require it's own parameters! 933 | plot(y~x, xlab='Theoretical Quantiles', ylab='Empirical Quantiles') 934 | } 935 | qq.uniform(non.normals.1);abline(0, 1) 936 | qq.uniform(non.normals.2);abline(0, 1) 937 | qq.uniform(normals.2);abline(0, 1) 938 | ``` 939 | 940 | 941 | 942 | 943 | 944 | # Multiple data vectors 945 | We now leave the single-vector world and move to the analysis of dependencies between several vectors. 946 | 947 | ## Scatter plots 948 | ```{r} 949 | # Sine function 950 | x<-seq(-pi, pi, 0.01) 951 | y<-sin(x) 952 | plot(y~x) 953 | 954 | #Exponent function 955 | x<-seq(-pi, pi, 0.01) 956 | y<-exp(x) 957 | plot(y~x) 958 | 959 | # Sinc function 960 | x<-seq(-10*pi, 10*pi, 0.01) 961 | y<-sin(x)/x 962 | plot(y~x) 963 | 964 | # Fancy function 965 | x<-seq(-pi, pi, 0.01) 966 | y<-sin(exp(x))+cos(2*x) 967 | plot(y~x) 968 | plot(y~x, type='l') 969 | plot(y~x, type='o') 970 | 971 | ## Some real life data 972 | URL <- 'http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/ozone.data' 973 | ozone <- read.table(URL, header=T) 974 | names(ozone) 975 | plot(ozone) 976 | ``` 977 | 978 | 979 | ## 3D plotting 980 | ```{r} 981 | # install.packages('rgl') 982 | library(rgl) 983 | plot3d(ozone[, 1:3]) 984 | ``` 985 | 986 | 987 | ## Plotting a surface 988 | ```{r} 989 | x <- seq(0, 1, 0.01) 990 | y <- seq(0, 1, 0.01) 991 | xy.grid <- expand.grid(x, y) 992 | func1 <- function(mesh) exp(mesh[, 1]+mesh[, 2]) 993 | z <- func1(xy.grid) 994 | xyz <- data.frame(xy.grid, z) 995 | plot3d(xyz, xlab='x', ylab='y') 996 | ``` 997 | 998 | 999 | ## Fitting linear lines and surfaces 1000 | We will now try and fit linear surfaces to our data. 
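In the single-predictor case, the least-squares estimates are
$$\hat{b} = \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sum_i (x_i - \bar{x})^2}, \qquad \hat{a} = \bar{y} - \hat{b}\,\bar{x},$$
which is exactly what the `ols.line()` function below computes.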
1001 | 1002 | ### Well behaved data 1003 | ```{r} 1004 | x <- 1:100 1005 | a <- 2 1006 | b <- 3.5 1007 | sigma <- 10 1008 | y <- a+b*x+rnorm(100, 0, sigma) 1009 | plot(y~x) 1010 | ``` 1011 | 1012 | ### Ordinary Least Squares 1013 | ```{r} 1014 | ols.line<-function(x, y){ 1015 | sxy<-sum( (x-mean(x) ) * (y-mean(y) ) ) 1016 | sxx<-sum( (x-mean(x)) ^ 2 ) 1017 | b1<-sxy / sxx 1018 | a1<-mean(y) - b1 * mean(x) 1019 | return(list(slope=b1, intercept=a1)) 1020 | } 1021 | 1022 | ols<-ols.line(x, y) ; ols 1023 | abline(ols$intercept, ols$slope, lty=2, lwd=3) 1024 | predictions <- ols$intercept + ols$slope * x 1025 | residuals<- y - predictions 1026 | plot(residuals) ; abline(h=0) 1027 | ``` 1028 | 1029 | ### Dangers of Extrapolation 1030 | ```{r} 1031 | x<-runif(1000)*5 1032 | y<-exp(x)+rnorm(1000) 1033 | plot(y~x, main='Whole relation') 1034 | 1035 | rect(xleft=0, ybottom=-5, xright=2, ytop=10) 1036 | 1037 | plot(y~x, main='Local relation', cex=0.5, xlim=c(0, 2), ylim=c(-5, 10));abline(v=2, lty=3) 1038 | 1039 | ind<-x<=2;ind 1040 | ols.interpolating<-ols.line(x[ind], y[ind]);ols.interpolating 1041 | abline(ols.interpolating$intercept , ols.interpolating$slope, col='red') 1042 | text(x=0.5, y=6, labels='Interpolates Nicely', cex=2) 1043 | 1044 | plot(y~x, main='Whole relation') 1045 | abline(ols.interpolating$intercept , ols.interpolating$slope, col='red') 1046 | abline(v=2, lty=3) 1047 | text(x=2, y=121, labels='Extrapolates Terribly!', cex=2) 1048 | 1049 | # Non-linearity might be fixed with a transformation: 1050 | # Which of the following looks better (more linear)? 1051 | plot(y~exp(x)) 1052 | plot(log(y)~x) 1053 | plot(log(y)~log(x)) 1054 | ``` 1055 | 1056 | ### Multivariate linear regression 1057 | ```{r} 1058 | # install.packages('rgl') 1059 | library(rgl) 1060 | 1061 | xy.grid <- data.frame(x1=runif(10000), x2=runif(10000)) 1062 | 1063 | func1<-function(mesh, a0, a1, a2, sigma) { 1064 | n<-nrow(mesh) 1065 | a0 + a1 * mesh[, 1] + a2 * mesh[, 2] + rnorm(n, 0, sigma) 1066 | } 1067 | 1068 | # More noise hides the stucture in the data: 1069 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .0); z; xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2') 1070 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .4); xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2') 1071 | z<-func1(xy.grid, a0=5, a1=1, a2=3, 11); xyz=data.frame(xy.grid, z); plot3d(xyz, xlab='x1', ylab='x2') 1072 | 1073 | ``` 1074 | 1075 | `lm()` is the main workhorse for OLS solving $(X'X)^{-1} X'y$ with the QR decomposition. 1076 | ```{r} 1077 | z<-func1(xy.grid, a0=5, a1=1, a2=3, .4) 1078 | xyz=data.frame(xy.grid, z) 1079 | plot3d(xyz, xlab='x1', ylab='x2') 1080 | lm(z~., xyz) # Did we exctract the correct coefficients? 1081 | ``` 1082 | 1083 | 1084 | 1085 | # Date handeling 1086 | See the `lubridate` package and manual [here](http://cran.r-project.org/web/packages/lubridate/vignettes/lubridate.html). 1087 | 1088 | 1089 | # String handelind 1090 | ```{r} 1091 | print("Hello\n") # Wrong! 1092 | show("Hello\n") # Wrong! 1093 | cat("Hello\n") # Right! 
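# print() and show() display the string's representation, so the escape "\n" stays visible;
# cat() writes the characters themselves, so "\n" becomes an actual newline.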
1094 | 1095 | # Windows directories need double escapes: 1096 | print("C:\\Program Files\\") 1097 | cat("C:\\Program Files\\", sep="\n") 1098 | 1099 | # String concatenation: 1100 | paste("Hello", "World", "!") 1101 | paste("Hello", "World", "!", sep="") 1102 | paste("Hello", " World", "!", sep="") 1103 | 1104 | x <- 5 1105 | paste("x=", x) 1106 | paste("x=", x, paste="") 1107 | 1108 | cat("x=", x, "\n") #Too many spaces :-( 1109 | cat("x=", x, "\n", sep="") 1110 | 1111 | # Collapsing strings: 1112 | s <- c("Hello", " ", "World", "!") 1113 | paste(s) 1114 | paste(s, sep="") 1115 | paste(s, collapse="") 1116 | paste(s, collapse=" 1") 1117 | 1118 | 1119 | s <- c("Hello", "World!") 1120 | paste(1:3, "Hello World!") 1121 | paste(1:3, "Hello World!", sep=":") 1122 | paste(1:3, "Hello World!", sep=":", collapse="\n") 1123 | cat(paste(1:3, "Hello World!", sep=":", collapse="\n"), "\n") # cat() does not collapse :-( 1124 | 1125 | 1126 | # Substrings: 1127 | s <- "Hello World" 1128 | substring(s, start=4, stop=6) 1129 | 1130 | # Splits: 1131 | s <- "foo, bar, baz" 1132 | strsplit(s, ", ") 1133 | 1134 | s <- "foo-->bar-->baz" 1135 | strsplit(s, "-->") 1136 | 1137 | # Using regular expressions (see ?regexp): 1138 | s <- "foo, bar, baz" 1139 | strsplit(s, ", *") 1140 | strsplit(s, "") 1141 | 1142 | # Looking in *vectors* of strings: 1143 | (s <- apply(matrix(LETTERS[1:24], nr=4), 2, paste, collapse="")) 1144 | 1145 | grep("O", s) # Returns location 1146 | grep("O", s, value=T) # Returns value 1147 | 1148 | 1149 | regexpr(pattern="o", text="Hello") 1150 | regexpr(pattern="o", text=c("Hello", "World!")) 1151 | 1152 | s <- c("Hello", "World!") 1153 | regexpr("o", s) 1154 | s <- c("Helll ooo", "Wrld!") 1155 | regexpr("o", s) 1156 | 1157 | # Fuzzy (approximate) matches: 1158 | grep ("abc", c("abbc", "jdfja", "cba")) # No match :-( 1159 | agrep ("abc", c("abbc", "jdfja", "cba")) # Match! :-) 1160 | 1161 | ## Note: agrep() is the function used in help.search() 1162 | s <- "foo bar baz" 1163 | gsub(pattern=" ", replacement="", s) # Remove all the spaces 1164 | s <- "foo bar baz" 1165 | gsub(" ", " ", s) 1166 | gsub(" +", "", s) # Using regular expression 1167 | gsub(" +", " ", s) # Remove multiple spaces and replace them by single spaces 1168 | 1169 | s <- "foo bar baz" 1170 | sub(pattern=" ", replacement="", s) # sub() only replaces first occurance. 1171 | gsub(" ", " ", s) 1172 | ``` 1173 | 1174 | 1175 | If you use strings often, try the `stringr` package. 1176 | 1177 | 1178 | 1179 | 1180 | 1181 | -------------------------------------------------------------------------------- /Intro2R.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: ASCII 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 
8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 
71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. 
Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. 
If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | 341 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intro 2 Data Mining and Machine Learning 2 | Some notes and code accompanying the Machine Learning course at BGU IE (2015B). 3 | 4 | - Introductory code to R can be found in [Intro2R.Rmd](https://github.com/johnros/Intro2R/blob/master/Intro2R.Rmd). 5 | - Class notes can be found in [notes/notes.pdf](https://github.com/johnros/Intro2R/blob/master/notes/notes.pdf) 6 | - Supervised learning with R can be found in [supervised.Rmd](https://github.com/johnros/Intro2R/blob/master/supervised.Rmd). 7 | - Unsupervised learning with R can be found in [unsupervised.Rmd](https://github.com/johnros/Intro2R/blob/master/unsupervised.Rmd). 8 | - Memory efficient examples of learning with R can be found in [massive_data.Rmd] (https://github.com/johnros/Intro2R/blob/master/massive_data.Rmd) 9 | 10 | 11 | -------------------------------------------------------------------------------- /ensembles.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ensembles" 3 | author: "Jonathan Rosenblatt" 4 | date: "April 14, 2015" 5 | output: html_document 6 | --- 7 | 8 | This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see . 9 | 10 | When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. 
You can embed an R code chunk like this: 11 | 12 | ```{r} 13 | summary(cars) 14 | ``` 15 | 16 | You can also embed plots, for example: 17 | 18 | ```{r, echo=FALSE} 19 | plot(cars) 20 | ``` 21 | 22 | Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. 23 | -------------------------------------------------------------------------------- /heights.txt: -------------------------------------------------------------------------------- 1 | "x" 2 | "1" 154.642231925250 3 | "2" 181.515415046854 4 | "3" 183.669450064676 5 | "4" 164.477480388096 6 | "5" 169.507524099543 7 | "6" 189.307146945314 8 | "7" 193.226804929068 9 | "8" 193.624383404466 10 | "9" 157.893148072242 11 | "10" 195.93135138899 12 | "11" 179.295385447133 13 | "12" 157.810599782384 14 | "13" 173.036294036202 15 | "14" 164.475677873598 16 | "15" 188.450490611646 17 | "16" 181.360921601115 18 | "17" 186.16343914743 19 | "18" 202.102560455985 20 | "19" 171.886308678447 21 | "20" 190.852218115841 22 | "21" 188.724663842758 23 | "22" 173.779216793308 24 | "23" 158.261222744983 25 | "24" 148.437784228267 26 | "25" 164.705238349013 27 | "26" 161.828633949433 28 | "27" 165.970280600691 29 | "28" 204.305316731687 30 | "29" 177.005233142843 31 | "30" 177.897164795354 32 | "31" 181.737172315478 33 | "32" 201.350957558292 34 | "33" 182.702695946265 35 | "34" 192.607675062521 36 | "35" 192.360470203139 37 | "36" 182.51500372402 38 | "37" 165.138482104219 39 | "38" 152.853469735449 40 | "39" 172.983712248375 41 | "40" 178.958805102191 42 | "41" 149.170125932993 43 | "42" 161.111975309129 44 | "43" 172.825260236284 45 | "44" 161.069621369286 46 | "45" 163.269021621464 47 | "46" 175.603795887767 48 | "47" 178.51097212948 49 | "48" 182.172158659397 50 | "49" 196.447207903926 51 | "50" 179.892990650191 52 | "51" 168.686430392467 53 | "52" 191.517024422907 54 | "53" 162.379700159572 55 | "54" 145.825462006366 56 | "55" 178.189038640029 57 | "56" 179.306007330531 58 | "57" 195.107291012887 59 | "58" 169.007854077717 60 | "59" 177.181215676435 61 | "60" 177.747229786197 62 | "61" 161.659336117893 63 | "62" 182.430623725975 64 | "63" 218.730275495048 65 | "64" 169.926181297895 66 | "65" 176.959469749844 67 | "66" 171.375459547603 68 | "67" 176.571705720853 69 | "68" 189.076098092261 70 | "69" 177.544191311252 71 | "70" 181.532436109357 72 | "71" 176.780880816625 73 | "72" 170.996911012109 74 | "73" 179.270023529253 75 | "74" 170.487924086484 76 | "75" 155.259733169198 77 | "76" 184.160624725216 78 | "77" 161.592427130666 79 | "78" 180.925187006070 80 | "79" 155.723890803099 81 | "80" 157.896310501347 82 | "81" 159.916096248596 83 | "82" 187.555838702435 84 | "83" 181.616771940129 85 | "84" 149.155308713604 86 | "85" 171.259895469393 87 | "86" 185.845811531962 88 | "87" 181.152073442719 89 | "88" 176.662954488154 90 | "89" 182.314011770510 91 | "90" 196.537826803738 92 | "91" 164.408233463266 93 | "92" 168.009564401806 94 | "93" 180.671764709218 95 | "94" 153.779862401297 96 | "95" 170.184617805034 97 | "96" 161.331411417559 98 | "97" 191.196346904921 99 | "98" 197.625866973540 100 | "99" 177.810957191829 101 | "100" 168.021944571873 102 | "101" 171.084646937414 103 | "102" 184.663810253697 104 | "103" 177.294679064144 105 | "104" 205.735547141656 106 | "105" 198.377420855761 107 | "106" 159.024803260539 108 | "107" 178.250153726594 109 | "108" 153.710786658130 110 | "109" 187.694850049530 111 | "110" 170.643629368827 112 | "111" 178.716334879969 113 | "112" 144.636422096204 114 | 
"113" 153.249288032108 115 | "114" 176.081100503641 116 | "115" 157.427804319223 117 | "116" 177.033902823186 118 | "117" 170.178904993776 119 | "118" 190.474248991012 120 | "119" 183.085023424552 121 | "120" 193.856511469621 122 | "121" 185.499811658772 123 | "122" 169.931326461033 124 | "123" 150.233076351352 125 | "124" 161.168186250448 126 | "125" 182.481635567953 127 | "126" 160.257377592658 128 | "127" 191.81113058419 129 | "128" 171.986403914315 130 | "129" 179.367258611334 131 | "130" 164.898806840904 132 | "131" 182.899347114643 133 | "132" 149.177190245355 134 | "133" 152.258124570863 135 | "134" 178.14150953484 136 | "135" 193.038062896634 137 | "136" 163.409690529186 138 | "137" 184.504908083032 139 | "138" 171.479092390151 140 | "139" 179.260048968711 141 | "140" 168.972514134737 142 | "141" 176.856469061484 143 | "142" 165.440897778976 144 | "143" 158.137172556529 145 | "144" 145.085703028274 146 | "145" 158.171404459938 147 | "146" 184.095842848338 148 | "147" 152.288387245497 149 | "148" 186.840380367486 150 | "149" 157.009104714750 151 | "150" 186.077553698985 152 | "151" 170.550995250485 153 | "152" 162.294059162132 154 | "153" 172.750487675374 155 | "154" 196.123600968753 156 | "155" 172.267601753096 157 | "156" 187.672987438595 158 | "157" 180.110886810124 159 | "158" 189.111794244441 160 | "159" 152.265176197954 161 | "160" 192.270450417775 162 | "161" 140.299655523388 163 | "162" 184.597044560618 164 | "163" 146.558622196391 165 | "164" 162.417570943497 166 | "165" 180.644457172588 167 | "166" 165.036710002312 168 | "167" 175.956551314607 169 | "168" 192.954916343350 170 | "169" 197.412868130900 171 | "170" 172.920420601755 172 | "171" 202.502550832053 173 | "172" 183.243333328404 174 | "173" 175.972050514168 175 | "174" 188.801922656066 176 | "175" 187.782581309347 177 | "176" 140.948126678169 178 | "177" 181.095616175404 179 | "178" 157.676954004513 180 | "179" 153.607025029908 181 | "180" 172.937550152389 182 | "181" 193.921470548975 183 | "182" 187.442536829346 184 | "183" 143.305587956638 185 | "184" 162.774894681295 186 | "185" 171.440845276852 187 | "186" 179.076926287560 188 | "187" 167.961077937356 189 | "188" 190.250038457275 190 | "189" 201.963484919159 191 | "190" 141.042162054683 192 | "191" 184.265069122516 193 | "192" 182.673553176274 194 | "193" 187.485732272253 195 | "194" 191.180068240384 196 | "195" 154.399840166867 197 | "196" 179.320659414185 198 | "197" 163.311945378322 199 | "198" 185.560706289541 200 | "199" 177.454870554195 201 | "200" 164.435553245849 202 | "201" 159.427517684274 203 | "202" 180.224446836658 204 | "203" 174.972792623367 205 | "204" 192.080560659665 206 | "205" 162.279950860632 207 | "206" 164.48146276803 208 | "207" 200.380128457540 209 | "208" 182.522412007671 210 | "209" 170.517535152204 211 | "210" 172.694744220837 212 | "211" 182.062711990664 213 | "212" 176.956174180894 214 | "213" 174.763608434611 215 | "214" 186.915073375985 216 | "215" 187.598758105258 217 | "216" 198.696129728959 218 | "217" 142.716214897597 219 | "218" 177.641213787158 220 | "219" 190.327947804155 221 | "220" 144.671734742830 222 | "221" 180.782760918096 223 | "222" 198.844300403074 224 | "223" 182.605006536142 225 | "224" 219.326098979731 226 | "225" 156.158453000064 227 | "226" 172.252620703346 228 | "227" 186.044661966251 229 | "228" 178.867663085854 230 | "229" 198.650162198613 231 | "230" 167.979915629824 232 | "231" 199.829570620415 233 | "232" 165.794645158794 234 | "233" 180.465456074427 235 | "234" 160.236206469431 236 | "235" 
158.272746330946 237 | "236" 158.126130368779 238 | "237" 183.688668152816 239 | "238" 181.616152538631 240 | "239" 157.015107584028 241 | "240" 187.300616297201 242 | "241" 185.853278049716 243 | "242" 168.151057052129 244 | "243" 163.388790469436 245 | "244" 202.069698991767 246 | "245" 155.691779950907 247 | "246" 191.373405402332 248 | "247" 172.781335803882 249 | "248" 165.429010426692 250 | "249" 175.279711075057 251 | "250" 178.358900897957 252 | "251" 177.963343223211 253 | "252" 187.293122130642 254 | "253" 179.218874053826 255 | "254" 188.559765592407 256 | "255" 162.839838117308 257 | "256" 203.303304780748 258 | "257" 172.174062379368 259 | "258" 211.814999759702 260 | "259" 161.310082911835 261 | "260" 184.679045548268 262 | "261" 193.229871635395 263 | "262" 168.315858935628 264 | "263" 186.510678598511 265 | "264" 153.652416659175 266 | "265" 171.322276317139 267 | "266" 193.055700886032 268 | "267" 166.171376322696 269 | "268" 182.442937932258 270 | "269" 166.224171003048 271 | "270" 175.591636365803 272 | "271" 182.720750420831 273 | "272" 150.985495000653 274 | "273" 160.718851035696 275 | "274" 182.048045733700 276 | "275" 160.756319841451 277 | "276" 182.866206153823 278 | "277" 185.955916196744 279 | "278" 181.016896691375 280 | "279" 165.186654433254 281 | "280" 164.8483675701 282 | "281" 160.486375377322 283 | "282" 182.811311492578 284 | "283" 184.576189010998 285 | "284" 163.482449976648 286 | "285" 158.916523195602 287 | "286" 176.607539724103 288 | "287" 176.763030850391 289 | "288" 168.939521562625 290 | "289" 182.719602455499 291 | "290" 166.320287381078 292 | "291" 182.738929711734 293 | "292" 194.861048807758 294 | "293" 172.807759693463 295 | "294" 184.049657455787 296 | "295" 179.872957039333 297 | "296" 161.952408671362 298 | "297" 174.767475290942 299 | "298" 175.295065249966 300 | "299" 181.289243701683 301 | "300" 197.129730106169 302 | "301" 164.198900174296 303 | "302" 185.627781284498 304 | "303" 194.036881596675 305 | "304" 171.319949519604 306 | "305" 174.532277679364 307 | "306" 153.553184277542 308 | "307" 172.315835580037 309 | "308" 173.498678687561 310 | "309" 198.599707763334 311 | "310" 178.301845140724 312 | "311" 174.617660283316 313 | "312" 176.705767338396 314 | "313" 188.964358341085 315 | "314" 183.221334563170 316 | "315" 196.421488995261 317 | "316" 181.510883802772 318 | "317" 166.044681836748 319 | "318" 176.629184860139 320 | "319" 175.731902626491 321 | "320" 173.210208272426 322 | "321" 145.265036663022 323 | "322" 190.305195235554 324 | "323" 148.265087802415 325 | "324" 204.066415444731 326 | "325" 173.946706217892 327 | "326" 178.803185423479 328 | "327" 160.498622623532 329 | "328" 161.151523827894 330 | "329" 176.023313790272 331 | "330" 183.854957555561 332 | "331" 168.800034584258 333 | "332" 178.599765787053 334 | "333" 187.772067736776 335 | "334" 170.116452180722 336 | "335" 162.114133823019 337 | "336" 177.674849349452 338 | "337" 158.340689417691 339 | "338" 176.940108207029 340 | "339" 184.428186163844 341 | "340" 177.023840734806 342 | "341" 171.684048722634 343 | "342" 163.491527740157 344 | "343" 156.490691723403 345 | "344" 162.703292679845 346 | "345" 187.668512154051 347 | "346" 180.687665814883 348 | "347" 168.616157229943 349 | "348" 162.396089748833 350 | "349" 166.301296030222 351 | "350" 181.744456128940 352 | "351" 178.063325752123 353 | "352" 173.358737016462 354 | "353" 178.988670013496 355 | "354" 184.838605963780 356 | "355" 183.578257343549 357 | "356" 153.778167609730 358 | "357" 158.804087342316 
359 | "358" 183.765465951731 360 | "359" 176.707010322159 361 | "360" 188.954998616583 362 | "361" 164.793222925059 363 | "362" 179.307368830717 364 | "363" 159.100707473351 365 | "364" 177.151347649305 366 | "365" 168.350324575276 367 | "366" 160.842323131067 368 | "367" 191.048389544374 369 | "368" 155.054513198013 370 | "369" 188.577579723576 371 | "370" 176.74738506733 372 | "371" 160.961060221237 373 | "372" 142.732111673602 374 | "373" 163.838976786749 375 | "374" 172.30085890264 376 | "375" 199.540731347306 377 | "376" 176.897711058568 378 | "377" 195.716455633224 379 | "378" 168.287980179958 380 | "379" 196.434322310898 381 | "380" 171.407564874279 382 | "381" 203.939169204876 383 | "382" 209.530228125875 384 | "383" 167.906266127632 385 | "384" 174.787449215420 386 | "385" 151.638661217451 387 | "386" 166.047516130843 388 | "387" 172.620046084553 389 | "388" 183.843707890951 390 | "389" 173.609280363838 391 | "390" 154.520648015293 392 | "391" 174.736374866009 393 | "392" 162.707998552559 394 | "393" 187.40582697193 395 | "394" 166.174083139717 396 | "395" 176.809811037152 397 | "396" 177.957216306957 398 | "397" 189.510751295879 399 | "398" 168.346173280156 400 | "399" 157.457097341197 401 | "400" 184.44868299107 402 | "401" 177.269519184665 403 | "402" 183.165864567249 404 | "403" 180.186177501262 405 | "404" 164.571732945580 406 | "405" 164.447851571180 407 | "406" 152.055618509910 408 | "407" 196.429360038572 409 | "408" 152.384298770638 410 | "409" 195.001939287470 411 | "410" 158.386947120237 412 | "411" 195.376551337271 413 | "412" 193.668994004159 414 | "413" 194.458375099331 415 | "414" 184.905317590638 416 | "415" 201.80903479507 417 | "416" 206.392737694348 418 | "417" 181.917008990256 419 | "418" 178.298008200284 420 | "419" 178.856768769408 421 | "420" 157.028903266883 422 | "421" 188.689512968359 423 | "422" 150.653752276514 424 | "423" 178.593098910637 425 | "424" 181.715775796939 426 | "425" 182.097283170211 427 | "426" 166.937476316204 428 | "427" 173.090440923646 429 | "428" 166.012795827896 430 | "429" 186.370617168312 431 | "430" 183.196712136737 432 | "431" 186.971886663421 433 | "432" 202.051049646338 434 | "433" 170.517405166906 435 | "434" 166.729016033653 436 | "435" 160.729667221702 437 | "436" 189.403162000057 438 | "437" 174.855701218518 439 | "438" 173.264991115568 440 | "439" 161.457213219188 441 | "440" 165.747790539936 442 | "441" 141.647420542422 443 | "442" 172.050626305387 444 | "443" 166.278123797179 445 | "444" 174.425684168269 446 | "445" 170.310161681767 447 | "446" 191.011686530823 448 | "447" 181.511058382853 449 | "448" 176.390937575651 450 | "449" 160.683023343649 451 | "450" 172.962048247563 452 | "451" 170.075108146494 453 | "452" 154.268572556346 454 | "453" 189.145807753854 455 | "454" 179.443783799274 456 | "455" 198.944443680990 457 | "456" 184.250273631561 458 | "457" 197.998016718115 459 | "458" 132.265514767070 460 | "459" 174.083352963022 461 | "460" 165.735214256863 462 | "461" 182.622677687991 463 | "462" 199.925347763529 464 | "463" 178.115390549151 465 | "464" 196.105489795473 466 | "465" 178.061311774555 467 | "466" 159.049179015494 468 | "467" 170.822143984335 469 | "468" 186.966542660391 470 | "469" 180.221528641344 471 | "470" 185.079955928013 472 | "471" 153.589935109175 473 | "472" 152.497471203635 474 | "473" 191.785903588667 475 | "474" 157.825893080683 476 | "475" 182.388147027416 477 | "476" 166.016656869199 478 | "477" 185.222519651954 479 | "478" 165.929035974702 480 | "479" 198.217448402947 481 | "480" 
169.335083278694 482 | "481" 165.243005111587 483 | "482" 178.659255296227 484 | "483" 182.788741780399 485 | "484" 178.178098439817 486 | "485" 186.966512831434 487 | "486" 152.084666463771 488 | "487" 162.950577999665 489 | "488" 179.013082169688 490 | "489" 195.129031851928 491 | "490" 210.924034778640 492 | "491" 174.428311074424 493 | "492" 177.77051975902 494 | "493" 174.822800338396 495 | "494" 138.774022461865 496 | "495" 165.336530996786 497 | "496" 173.30873598773 498 | "497" 163.956121125432 499 | "498" 160.467298732182 500 | "499" 175.832120967381 501 | "500" 199.913146162215 502 | "501" 155.964878841759 503 | "502" 166.796382826151 504 | "503" 174.973053891717 505 | "504" 163.039267015936 506 | "505" 179.787580622214 507 | "506" 175.701883854837 508 | "507" 199.933140593470 509 | "508" 156.968838583641 510 | "509" 184.843888411655 511 | "510" 161.80247847189 512 | "511" 201.257564828762 513 | "512" 179.085048480902 514 | "513" 163.077045364664 515 | "514" 157.520449391653 516 | "515" 194.984431216109 517 | "516" 183.608222302032 518 | "517" 191.092321742153 519 | "518" 183.757981172173 520 | "519" 181.379263911818 521 | "520" 159.233064203777 522 | "521" 203.590571375374 523 | "522" 166.866274037342 524 | "523" 171.307699687387 525 | "524" 163.08188847367 526 | "525" 187.112382361810 527 | "526" 150.167721435222 528 | "527" 164.895153026682 529 | "528" 153.804013234172 530 | "529" 154.652526592447 531 | "530" 165.742961681111 532 | "531" 191.6872752319 533 | "532" 175.983531879868 534 | "533" 185.905480300651 535 | "534" 185.093928110123 536 | "535" 166.083474075138 537 | "536" 165.766560460107 538 | "537" 170.440220534698 539 | "538" 185.938414284405 540 | "539" 199.217434832544 541 | "540" 180.87302856521 542 | "541" 170.816010211100 543 | "542" 168.939975605928 544 | "543" 184.736483536528 545 | "544" 179.287219879986 546 | "545" 187.203050536959 547 | "546" 157.572722788549 548 | "547" 182.800768256700 549 | "548" 186.374601314682 550 | "549" 164.809101803407 551 | "550" 178.491121635725 552 | "551" 173.482799639154 553 | "552" 163.200798693547 554 | "553" 175.161439344675 555 | "554" 189.336322399167 556 | "555" 176.529890249700 557 | "556" 178.925164253393 558 | "557" 160.761332844183 559 | "558" 192.562729714031 560 | "559" 187.393621841816 561 | "560" 176.068015277767 562 | "561" 172.381080213471 563 | "562" 167.72762249227 564 | "563" 167.263771906048 565 | "564" 183.109496801787 566 | "565" 164.777836297268 567 | "566" 171.679105667423 568 | "567" 172.191445630999 569 | "568" 178.915706193654 570 | "569" 198.573578910996 571 | "570" 166.213596337005 572 | "571" 149.066566144206 573 | "572" 170.853481637291 574 | "573" 161.139135803052 575 | "574" 174.150659444601 576 | "575" 171.996709839293 577 | "576" 170.234324639672 578 | "577" 197.587417529768 579 | "578" 162.793244578473 580 | "579" 186.859461234664 581 | "580" 186.083799923945 582 | "581" 195.717174499425 583 | "582" 175.614371368475 584 | "583" 185.446880388690 585 | "584" 166.917296428806 586 | "585" 186.902630363617 587 | "586" 172.414819999352 588 | "587" 178.191465949287 589 | "588" 147.840521132375 590 | "589" 196.302627884954 591 | "590" 171.909679246555 592 | "591" 177.636886346887 593 | "592" 189.121338930815 594 | "593" 201.573257006643 595 | "594" 184.457930447635 596 | "595" 166.773096995064 597 | "596" 188.384970652984 598 | "597" 142.414178497676 599 | "598" 179.217573300661 600 | "599" 166.976139352456 601 | "600" 180.710422421230 602 | "601" 190.990969102576 603 | "602" 167.070851807613 604 | 
"603" 193.690588312798 605 | "604" 170.276843219783 606 | "605" 138.163334897795 607 | "606" 168.498364925024 608 | "607" 175.895535811671 609 | "608" 164.679731434988 610 | "609" 148.356122363571 611 | "610" 176.058657518635 612 | "611" 197.831476059284 613 | "612" 183.607359396439 614 | "613" 175.561425572291 615 | "614" 168.647571229575 616 | "615" 176.044819204732 617 | "616" 194.857926288700 618 | "617" 154.495425583718 619 | "618" 193.712316559755 620 | "619" 151.757863441684 621 | "620" 188.785733351911 622 | "621" 175.931220440509 623 | "622" 193.441564462227 624 | "623" 170.621086909187 625 | "624" 176.181932728350 626 | "625" 176.679228632157 627 | "626" 186.689305906061 628 | "627" 182.426770197615 629 | "628" 168.824324409370 630 | "629" 181.842481343291 631 | "630" 155.722910948057 632 | "631" 173.994102722085 633 | "632" 174.887885426625 634 | "633" 173.288105227204 635 | "634" 195.00806176919 636 | "635" 180.744081183266 637 | "636" 166.650856343618 638 | "637" 160.375471741992 639 | "638" 176.380189932622 640 | "639" 161.389260998669 641 | "640" 144.626448351016 642 | "641" 193.556498250189 643 | "642" 179.018143096707 644 | "643" 196.875671660888 645 | "644" 191.379490856748 646 | "645" 163.323565682411 647 | "646" 164.614060459367 648 | "647" 159.055123031765 649 | "648" 199.245884125381 650 | "649" 161.833445898662 651 | "650" 147.423923918488 652 | "651" 184.087773925858 653 | "652" 180.994201037299 654 | "653" 183.147874840789 655 | "654" 181.737854946865 656 | "655" 165.997895799993 657 | "656" 173.808142018463 658 | "657" 174.431727477913 659 | "658" 189.186367872759 660 | "659" 148.725937849016 661 | "660" 196.044690283686 662 | "661" 175.402244457156 663 | "662" 135.453057583091 664 | "663" 163.488810653472 665 | "664" 170.145424222187 666 | "665" 159.176036768623 667 | "666" 180.230059851796 668 | "667" 168.483798366892 669 | "668" 179.917571696136 670 | "669" 153.410694204960 671 | "670" 184.968870647362 672 | "671" 173.119351780117 673 | "672" 177.682142750716 674 | "673" 170.937870506549 675 | "674" 174.280913757452 676 | "675" 166.524197403539 677 | "676" 181.552570035715 678 | "677" 145.120176960029 679 | "678" 168.837310366893 680 | "679" 165.336439848885 681 | "680" 174.400241424235 682 | "681" 183.724895739295 683 | "682" 174.751552845865 684 | "683" 158.747384278538 685 | "684" 164.451344192133 686 | "685" 184.626382080994 687 | "686" 156.220013690916 688 | "687" 202.896805309913 689 | "688" 189.638023666131 690 | "689" 220.222314568603 691 | "690" 166.120790744227 692 | "691" 172.058065310796 693 | "692" 160.975088761266 694 | "693" 166.842037229817 695 | "694" 187.061734750517 696 | "695" 177.692975234530 697 | "696" 188.325188757857 698 | "697" 182.860510364993 699 | "698" 192.918694254393 700 | "699" 175.405976841235 701 | "700" 174.012904060021 702 | "701" 156.836548471586 703 | "702" 145.011508040850 704 | "703" 189.017161622121 705 | "704" 199.726708071824 706 | "705" 170.760939805503 707 | "706" 149.086217767916 708 | "707" 157.085088329393 709 | "708" 180.070403347209 710 | "709" 154.865596235964 711 | "710" 169.10531322038 712 | "711" 167.503225841124 713 | "712" 179.208202477236 714 | "713" 188.861474882465 715 | "714" 166.013350789149 716 | "715" 179.039536104201 717 | "716" 186.605360220183 718 | "717" 179.327878746743 719 | "718" 174.552119509597 720 | "719" 148.067581820210 721 | "720" 180.281822679742 722 | "721" 189.573687736597 723 | "722" 168.901887620761 724 | "723" 183.281966733900 725 | "724" 181.033053569180 726 | "725" 
158.056434869983 727 | "726" 149.828182680041 728 | "727" 177.984344422450 729 | "728" 152.735240544595 730 | "729" 170.625332097780 731 | "730" 157.160910020433 732 | "731" 164.226331260782 733 | "732" 184.092578085197 734 | "733" 150.804039273175 735 | "734" 163.425479542623 736 | "735" 157.898685785876 737 | "736" 163.732990510667 738 | "737" 151.101623921934 739 | "738" 160.210900486758 740 | "739" 156.387546833666 741 | "740" 190.313807005817 742 | "741" 181.388112584315 743 | "742" 179.497550491261 744 | "743" 169.311249071892 745 | "744" 175.667286891803 746 | "745" 198.232477608869 747 | "746" 185.457350947889 748 | "747" 182.109408275760 749 | "748" 175.379852483831 750 | "749" 178.007246724946 751 | "750" 179.226918455877 752 | "751" 185.136947684203 753 | "752" 181.473472595963 754 | "753" 162.920548244272 755 | "754" 172.526549133301 756 | "755" 165.049283214815 757 | "756" 156.937053434863 758 | "757" 177.416676715852 759 | "758" 159.054160957737 760 | "759" 170.595112481695 761 | "760" 168.038698710784 762 | "761" 181.698822738156 763 | "762" 128.546002612522 764 | "763" 122.209459066065 765 | "764" 172.190892156975 766 | "765" 186.750139751262 767 | "766" 202.955016362004 768 | "767" 185.626890968403 769 | "768" 154.008505981866 770 | "769" 185.687440807349 771 | "770" 196.847873751050 772 | "771" 171.953316185312 773 | "772" 149.505067941148 774 | "773" 207.902619631389 775 | "774" 179.074689892608 776 | "775" 180.646843489277 777 | "776" 165.107780580013 778 | "777" 169.425841578293 779 | "778" 191.422816729689 780 | "779" 176.401413685863 781 | "780" 166.478634258473 782 | "781" 195.507105336363 783 | "782" 170.622970647857 784 | "783" 188.311635321259 785 | "784" 187.448611678417 786 | "785" 135.822911259681 787 | "786" 151.968788731697 788 | "787" 172.081867164090 789 | "788" 172.227388222652 790 | "789" 170.692672325331 791 | "790" 178.242399162286 792 | "791" 168.646236011802 793 | "792" 166.627359788482 794 | "793" 185.536074378311 795 | "794" 178.518112086116 796 | "795" 159.603288661557 797 | "796" 187.613688381090 798 | "797" 146.783595329556 799 | "798" 173.461719218919 800 | "799" 149.415407348639 801 | "800" 171.968806125003 802 | "801" 161.469982062398 803 | "802" 170.115584949635 804 | "803" 163.815093696623 805 | "804" 198.350954416808 806 | "805" 190.522240900673 807 | "806" 164.874201225063 808 | "807" 194.36523036219 809 | "808" 166.241292689966 810 | "809" 194.640280892311 811 | "810" 151.538471262220 812 | "811" 186.954805564224 813 | "812" 210.014304319917 814 | "813" 195.305369283628 815 | "814" 193.182803191660 816 | "815" 169.462698179736 817 | "816" 165.635325509888 818 | "817" 175.388159254454 819 | "818" 201.955661876965 820 | "819" 178.666469289603 821 | "820" 195.122611888756 822 | "821" 191.291782738848 823 | -------------------------------------------------------------------------------- /make_samples.R: -------------------------------------------------------------------------------- 1 | # install.packages(ElemStatLearn) 2 | rm(list=ls()) 3 | 4 | 5 | library(ElemStatLearn) # for data 6 | data("prostate") 7 | data("spam") 8 | 9 | library(magrittr) # for piping 10 | library(dplyr) # for handeling data frames 11 | 12 | 13 | 14 | # Continous outcome: 15 | prostate.train <- prostate %>% 16 | filter(train) %>% 17 | select(-train) 18 | prostate.test <- prostate %>% 19 | filter(!train) %>% 20 | select(-train) 21 | y.train <- prostate.train$lcavol 22 | X.train <- prostate.train %>% select(-lcavol) %>% as.matrix 23 | y.test <- prostate.test$lcavol 24 | 
X.test <- prostate.test %>% select(-lcavol) %>% as.matrix 25 | 26 | 27 | 28 | # Categorical outcome: 29 | n <- nrow(spam) 30 | 31 | train.prop <- 0.66 32 | train.ind <- c(TRUE,FALSE) %>% 33 | sample(size = n, prob = c(train.prop,1-train.prop), replace=TRUE) 34 | spam.train <- spam[train.ind,] 35 | spam.test <- spam[!train.ind,] 36 | 37 | y.train.spam <- spam.train$spam 38 | X.train.spam <- spam.train %>% select(-spam) %>% as.matrix 39 | y.test.spam <- spam.test$spam 40 | X.test.spam <- spam.test %>% select(-spam) %>% as.matrix 41 | 42 | spam.dummy <- spam %>% mutate(spam=as.numeric(spam=='spam')) 43 | spam.train.dummy <- spam.dummy[train.ind,] 44 | spam.test.dummy <- spam.dummy[!train.ind,] 45 | 46 | 47 | -------------------------------------------------------------------------------- /massive_data.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analyzing Massive Data Sets" 3 | author: "Jonathan Rosenblatt" 4 | date: "23/04/2015" 5 | output: 6 | html_document: 7 | toc: true 8 | --- 9 | 10 | 11 | ```{r setup, include=FALSE} 12 | library(knitr) 13 | opts_chunk$set(cache=TRUE) 14 | ``` 15 | 16 | # Introduction 17 | When analyzing data, you may encounter several resource constraints: 18 | 19 | - Hard Disk Space: your data might not fit on your HD. This matter is not discussed in this text. 20 | - RAM constraint: Your data fits on the HD, but the implementation you are using of your favorite method needs more RAM than what you have. This is the main topic of this text, in which we demonstrate out-of-memory implementations of many popular algorithms. 21 | - CPU constraint: Your algorithm has all the memory it needs; it simply runs too slowly. Parallelizing the computation over more cores in your machine, or over more machines, is in order. 22 | 23 | ## Diagnostics 24 | In order to diagnose the resource limit you are encountering, make sure you always work with your task manager (Windows) or top (Linux) open. The cases where you get error messages from your software are easy to diagnose. In other cases, where computations never end but no errors are thrown, check which resource is running low in your task manager. 25 | 26 | 27 | ## Terminology 28 | 29 | - In-memory: processing loads the required data into RAM. 30 | - Out-of-memory: processing is not done from RAM but rather from the HD. 31 | - Batch algorithm: loads all the data when processing. 32 | - Streaming algorithm: the algorithm progresses by processing a single observation at a time. 33 | - Mini-batch algorithm: mid-way between batch and streaming. 34 | - Swap file: a file on the HD which mimics RAM. 35 | 36 | ## Tips and Tricks 37 | 38 | 1. For *batch* algorithms, memory usage should not exceed roughly $30\%$ of your RAM. 39 | 2. Swap files: 40 | - NEVER rely on a swap file. 41 | 3. R releases memory only when needed, not when possible ("lazy" release). 42 | 4. Don't count on R returning RAM to the operating system (at least in Linux). Restart R if the rest of your machine slows down. 43 | 5. When you want to go pro, read [Hadley's memory usage guide](http://adv-r.had.co.nz/memory.html). 44 | 45 | 46 | 47 | 48 | ## Bla bla... Let's see some code! 49 | 50 | Inspiration from [here](http://www.r-bloggers.com/bigglm-on-your-big-data-set-in-open-source-r-it-just-works-similar-as-in-sas/).
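Before turning to real data, here is a toy sketch (added here for illustration, not part of the original workflow) of the batch vs. streaming distinction from the terminology above: the same mean is computed once with all observations in RAM, and once chunk by chunk, keeping only a running sum and count between chunks. In a genuinely out-of-memory setting the chunks would be read from disk rather than sliced from a vector already in RAM.

```{r streaming_toy_sketch}
# Toy illustration only: a simulated numeric vector stands in for one large column.
x <- rnorm(1e6)

# Batch: all observations are processed at once.
mean.batch <- mean(x)

# Streaming (mini-batch): process one chunk at a time;
# only the running sum and count survive between chunks.
chunk.size <- 1e4
running.sum <- 0
running.n <- 0
for (start in seq(1, length(x), by = chunk.size)) {
  chunk <- x[start:min(start + chunk.size - 1, length(x))]
  running.sum <- running.sum + sum(chunk)
  running.n <- running.n + length(chunk)
}
mean.stream <- running.sum / running.n

all.equal(mean.batch, mean.stream)
```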
51 | 52 | 53 | Download a fat data file: 54 | ```{r download_data} 55 | # download.file("http://www.cms.gov/Research-Statistics-Data-and-Systems/Statistics-Trends-and-Reports/BSAPUFS/Downloads/2010_Carrier_PUF.zip", "2010_Carrier_PUF.zip") 56 | # unzip(zipfile="2010_Carrier_PUF.zip") 57 | ``` 58 | 59 | The `data.table` package is much more efficient than the `read.table` family of functions. 60 | You should also consider the `readr` [package](https://github.com/hadley/readr), which we did not document here (yet). 61 | ```{r import_data} 62 | # install.packages('data.table') 63 | library(data.table) 64 | 65 | data <- data.table::fread(input = "2010_BSA_Carrier_PUF.csv", 66 | sep = ',', 67 | header=TRUE) 68 | 69 | 70 | read.csv("2010_BSA_Carrier_PUF.csv") # for comparison: the base-R import is much slower 71 | 72 | 73 | 74 | library(magrittr) # for piping syntax 75 | .names <- c("sex", "age", "diagnose", "healthcare.procedure", "typeofservice", "service.count", "provider.type", "servicesprocessed", "place.served", "payment", "carrierline.count") 76 | data %>% setnames(.names) 77 | ``` 78 | 79 | Now verify the size of your data in memory: 80 | ```{r} 81 | object.size(data) 82 | # But I prefer pryr: 83 | pryr::object_size(data) 84 | ``` 85 | 86 | When does R create a copy of an object? Use `tracemem`: 87 | ```{r tracemem} 88 | tracemem(data) 89 | .test <- glm(payment ~ sex + age + place.served, data = data[1:1e2,], family=poisson) 90 | ``` 91 | 92 | 93 | Profile each line of code for time and memory usage using [lineprof](https://github.com/hadley/lineprof): 94 | ```{r lineprof} 95 | # devtools::install_github("hadley/lineprof") 96 | prof <- lineprof::lineprof( 97 | glm(payment ~ sex + age + place.served, data = data) 98 | ) 99 | lineprof::shine(prof) 100 | ``` 101 | 102 | 103 | 104 | 105 | 106 | But actually, I just like to have my Task-Manager constantly open: 107 | ```{r inspect_RAM} 108 | # Run and inspect RAM/CPU 109 | glm(payment ~ sex + age + place.served, data = data, family=poisson) 110 | ``` 111 | 112 | 113 | 114 | 115 | Now let's artificially scale the problem. 116 | Note: `copies` is small so that fitting can be done in real-time. 117 | To demonstrate the problem, I would have set `copies <- 10`. 118 | ```{r artificial_scale} 119 | copies <- 2 120 | data.2 <- do.call(rbind, lapply(1:copies, function(x) data) ) 121 | system.time(data.2 %>% dim) 122 | pryr::object_size(data) 123 | pryr::object_size(data.2) 124 | ``` 125 | 126 | 127 | 128 | When you run the following code at home, it will *not* exhaust your RAM, but it will take a long time to run, and a long time to release the memory when stopped. 129 | It is thus a *memory* constraint. 130 | ```{r} 131 | ## Don't run: 132 | ## glm.2 <- glm(payment ~ sex + age + place.served, data = data.2, family=poisson) 133 | ``` 134 | Since the data easily fits in RAM, it can be fixed simply by a *streaming* algorithm. 135 | 136 | 137 | The following object can't even be stored in RAM. 138 | Streaming *from RAM* will not solve the problem. 139 | We will get back to this... 140 | ```{r} 141 | ## Don't run: 142 | ## copies <- 1e2 143 | ## data.3 <- do.call(rbind, lapply(1:copies, function(x) data) ) 144 | ``` 145 | 146 | 147 | 148 | 149 | # Streaming Regression 150 | 151 | We now explore several R implementations of streaming algorithms, which overcome RAM constraints at a moderate CPU cost.
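The mechanism these packages share is that the fit is built up chunk by chunk, so only the current chunk needs to sit in RAM. As a minimal sketch of the idea (a toy example added here, using a small built-in dataset rather than the Medicare file), a `biglm` fit can be grown with `update()`:

```{r biglm_update_sketch}
# Sketch only: split a small data set into artificial chunks,
# fit on the first chunk, then feed the remaining chunks one at a time.
library(biglm)

chunks <- split(mtcars, rep(1:4, length.out = nrow(mtcars)))

fit <- biglm(mpg ~ hp + wt, data = chunks[[1]])
for (k in 2:length(chunks)) {
  fit <- update(fit, chunks[[k]])  # only the current chunk is needed in RAM
}
coef(fit)
```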
152 | 153 | ## biglm 154 | ```{r biglm} 155 | # install.packages('biglm') 156 | library(biglm) 157 | mymodel <- biglm::bigglm(payment ~ sex + age + place.served, 158 | data = data.2, 159 | family = poisson(), 160 | maxit=1e3) 161 | 162 | # Too long! Quit the job and time the release. 163 | 164 | # For demonstration: OLS example with original data. 165 | mymodel <- bigglm(payment ~ sex + age + place.served, data =data ) 166 | mymodel <- data %>% bigglm(payment ~ sex + age + place.served, data =. ) 167 | ``` 168 | Remarks: 169 | - R is immediately(!) available after quitting the job. 170 | - `bigglm` objects behave (almost) like `glm` objects w.r.t. `coef`, `summary`,... 171 | - `bigglm` is aimed at *memory* constraints. Not speed. 172 | 173 | 174 | ## Exploit sparsity in your data 175 | Very relevant to factors with many levels. 176 | ```{r} 177 | reps <- 1e6 178 | y<-rnorm(reps) 179 | x<- letters %>% 180 | sample(reps, replace=TRUE) %>% 181 | factor 182 | 183 | X.1 <- model.matrix(~x-1) # Make dummy variable matrix 184 | 185 | library(MatrixModels) 186 | X.2<-as(x,"sparseMatrix") %>% t # Makes sparse dummy matrix 187 | 188 | dim(X.1) 189 | dim(X.2) 190 | 191 | pryr::object_size(X.1) 192 | pryr::object_size(X.2) 193 | ``` 194 | 195 | 196 | ```{r} 197 | system.time(lm.1 <- lm(y ~ X.1)) 198 | system.time(lm.1 <- lm.fit(y=y, x=X.1)) 199 | system.time(lm.2 <- MatrixModels:::lm.fit.sparse(X.2,y)) 200 | 201 | all.equal(lm.2, unname(lm.1$coefficients), tolerance = 1e-12) 202 | ``` 203 | 204 | 205 | 206 | # Streaming classification 207 | [LiblineaR](http://cran.r-project.org/web/packages/LiblineaR/index.html) and [RSofia](http://cran.r-project.org/web/packages/RSofia/index.html) will stream your data from RAM for classification problems; 208 | mainly SVMs. 209 | 210 | 211 | 212 | 213 | 214 | # Out of memory Regression 215 | 216 | What if it is not the __algorithm__ that causes the problem, but merely __importing__ my objects? 217 | 218 | 219 | ## ff 220 | The `ff` package replaces R's in-RAM storage mechanism with (efficient) on-disk storage. 221 | First open a connection to the file, without actually importing it. 222 | ```{r} 223 | # install.packages('LaF') 224 | library(LaF) 225 | 226 | .dat <- laf_open_csv(filename = "2010_BSA_Carrier_PUF.csv", 227 | column_types = c("integer", "integer", "categorical", "categorical", "categorical", "integer", "integer", "categorical", "integer", "integer", "integer"), 228 | column_names = c("sex", "age", "diagnose", "healthcare.procedure", "typeofservice", "service.count", "provider.type", "servicesprocessed", "place.served", "payment", "carrierline.count"), 229 | skip = 1) 230 | ``` 231 | Now write the data to the HD as an ff object: 232 | ```{r} 233 | # install.packages('ffbase') 234 | library(ffbase) 235 | data.ffdf <- laf_to_ffdf(laf = .dat) 236 | ``` 237 | Notice the minimal RAM allocation: 238 | ```{r} 239 | pryr::object_size(data) 240 | pryr::object_size(data.ffdf) 241 | ``` 242 | 243 | 244 | 245 | 246 | Caution: `base` functions are unaware of `ff`. 247 | Adapted algorithms are required... 248 | ```{r} 249 | data$age %>% table 250 | ffbase:::table.ff(data.ffdf$age) 251 | ``` 252 | 253 | 254 | Luckily, `bigglm` has its `ff` version: 255 | ```{r biglm_regression} 256 | mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served, 257 | data = data.ffdf, 258 | family = poisson(), 259 | maxit=1e3) 260 | 261 | # Again, too slow.
Stop and run: 262 | mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served, 263 | data = data.ffdf) 264 | ``` 265 | The previous can scale to any file I can store on disk (but might take a while). 266 | 267 | 268 | 269 | 270 | I will now inflate the data to a size that would not fit in RAM. 271 | ```{r} 272 | copies <- 2e1 273 | data.2.ffdf <- do.call(rbind, lapply(1:copies, function(x) data.ffdf) ) 274 | 275 | # Actual size: 276 | cat('Size in GB ',sum(.rambytes[vmode(data.2.ffdf)]) * (nrow(data.2.ffdf) * 9.31322575 * 10^(-10))) 277 | 278 | # In memory: 279 | pryr::object_size(data.2.ffdf) 280 | ``` 281 | 282 | 283 | 284 | And now I can run this MASSIVE regression: 285 | ```{r biglm_ffdf_regression} 286 | ## Do no run: 287 | 288 | # mymodel.ffdf.2 <- bigglm.ffdf(payment ~ sex + age + place.served, 289 | # data = data.2.ffdf, 290 | # family = poisson(), 291 | # maxit=1e3) 292 | ``` 293 | Notes: 294 | 295 | - Notice again the quick release when aborting process. 296 | - Solving RAM constraints does not guarantee speed. This particular problem is actually worth parallelizing. 297 | - SAS, SPSS, Revolutios-R,... all rely on similar ideas. 298 | - Clearly, with so few variables I would be better of *subsampling*. 299 | - The [SOAR](http://cran.r-project.org/web/packages/SOAR/index.html) package also allows similar out-of-memory processing. 300 | 301 | # Out of memory Classification 302 | I do not know if there are `ff` versions of `LiblineaR` or `RSofia`. 303 | If you find out, let me know. 304 | 305 | 306 | 307 | 308 | 309 | # Parallelation 310 | 311 | ## Parallelized learning 312 | [TODO] 313 | 314 | ## Parallelized simulation 315 | [TODO] 316 | 317 | ## Distributed Graph algorithms 318 | [TODO] 319 | -------------------------------------------------------------------------------- /notes/.gitignore: -------------------------------------------------------------------------------- 1 | notes.loa 2 | -------------------------------------------------------------------------------- /notes/Intro2R.txss: -------------------------------------------------------------------------------- 1 | [Session] 2 | FileVersion=1 3 | File0\FileName=notes.tex 4 | File0\Line=71 5 | File0\Col=17 6 | File0\FirstLine=0 7 | File0\FoldedLines= 8 | File1\FileName=unsupervised.tex 9 | File1\Line=312 10 | File1\Col=12 11 | File1\FirstLine=0 12 | File1\FoldedLines="67,125" 13 | File2\FileName=introduction.tex 14 | File2\Line=61 15 | File2\Col=0 16 | File2\FirstLine=0 17 | File2\FoldedLines= 18 | MasterFile= 19 | CurrentFile=statistical_decision.tex 20 | Bookmarks=@Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x31\0\x31\0\x35\0\0\0\x2\0\x31\0\0\0<\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x45\0s\0t\0i\0m\0\x61\0t\0i\0o\0n\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x33\0\x34\0\x34\0\0\0\x2\0\x32\0\0\0t\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x46\0r\0o\0m\0 \0\x45\0s\0t\0i\0m\0\x61\0t\0i\0o\0n\0 \0t\0o\0 \0S\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), 
@Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x37\0\x32\0\x32\0\0\0\x2\0\x33\0\0\0^\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0N\0o\0n\0 \0\x45\0R\0M\0 \0S\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\x6\0\x38\0\x34\0\x34\0\0\0\x2\0\x34\0\0\0^\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0S\0t\0\x61\0t\0i\0s\0t\0i\0\x63\0\x61\0l\0 \0\x44\0\x65\0\x63\0i\0s\0i\0o\0n\0 \0T\0h\0\x65\0o\0r\0y\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x31\0\x35\0\x38\0\0\0\x2\0\x35\0\0\0R\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0U\0n\0s\0u\0p\0\x65\0r\0v\0i\0s\0\x65\0\x64\0 \0L\0\x65\0\x61\0r\0n\0i\0n\0g\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x32\0\x38\0\x37\0\0\0\x2\0\x36\0\0\0J\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0G\0\x65\0n\0\x65\0r\0\x61\0t\0i\0v\0\x65\0 \0M\0o\0\x64\0\x65\0l\0s\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x33\0\x33\0\x33\0\0\0\x2\0\x37\0\0\0X\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0\x44\0i\0m\0\x65\0n\0s\0i\0o\0n\0\x61\0l\0i\0t\0y\0 \0R\0\x65\0\x64\0u\0\x63\0t\0i\0o\0n\0}), @Variant(\0\0\0\v\0\0\0\x4\0\0\0^\0/\0h\0o\0m\0\x65\0/\0j\0o\0h\0n\0r\0o\0s\0/\0w\0o\0r\0k\0s\0p\0\x61\0\x63\0\x65\0/\0I\0n\0t\0r\0o\0\x32\0R\0/\0n\0o\0t\0\x65\0s\0/\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\0\0\b\0\x31\0\x34\0\x39\0\x31\0\0\0\x2\0\x38\0\0\0N\0n\0o\0t\0\x65\0s\0.\0t\0\x65\0x\0\n\0\\\0s\0\x65\0\x63\0t\0i\0o\0n\0{\0L\0\x61\0t\0\x65\0n\0t\0 \0S\0p\0\x61\0\x63\0\x65\0 \0M\0o\0\x64\0\x65\0l\0s\0}) 21 | File3\FileName=appendices.tex 22 | File3\Line=143 23 | File3\Col=0 24 | File3\FirstLine=0 25 | File3\FoldedLines= 26 | File4\FileName=supervised.tex 27 | File4\Line=429 28 | File4\Col=95 29 | File4\FirstLine=0 30 | File4\FoldedLines= 31 | File5\FileName=commands.tex 32 | File5\Line=85 33 | File5\Col=16 34 | File5\FirstLine=0 35 | File5\FoldedLines= 36 | File6\FileName=statistical_decision.tex 37 | File6\Line=0 38 | File6\Col=0 39 | File6\FirstLine=0 40 | File6\FoldedLines= 41 | 42 | [InternalPDFViewer] 43 | File= 44 | Embedded=false 45 | -------------------------------------------------------------------------------- /notes/appendices.tex: -------------------------------------------------------------------------------- 1 | \chapter{The Relation Between Supervised and Unsupervised Learning} 2 | \label{sec:relation_supervised_unsupervised} 3 | 4 | 5 | It may be surprising that collaborative filtering can be seen as both an unsupervised and a supervised learning problem. 6 | But these are not mutually exclusive problems. 
7 | In fact, the relation has already been implied in the introduction to the unsupervised learning section (\S\ref{sec:unsupervised}), and we now make it explicit. 8 | 9 | Since in unsupervised learning we try to learn the joint distribution of $x$, i.e., the relationship between any variable in $x$ and the rest, we may see it as several supervised learning problems. In each, a different variable in $x$ plays the role of $y$. 10 | 11 | Many unsupervised learning methods can be seen in this light. We, however, will not be exploring this avenue right now. 12 | 13 | [TODO: autoencoders]. 14 | 15 | 16 | 17 | 18 | % % % % % % RKHS % % % % % 19 | 20 | \chapter{The Kernel Trick and Reproducing Kernel Hilbert Spaces (RKHS)} 21 | \label{apx:rkhs} 22 | 23 | In the context of supervised learning, the \emph{kernel trick} is a mathematical device that allows us to learn very complicated predictors ($\hyp$) in a computationally efficient manner. 24 | More generally, in the context of unsupervised learning, the kernel trick allows us to learn complicated non-linear mappings of the original features (and not only predictor functions). 25 | 26 | Not all predictors and not all problems admit this trick. Then again, many do. 27 | Methods for which it applies include: 28 | SVM's (\S\ref{sec:svm}), principal components analysis (\S\ref{sec:pca}), canonical correlation analysis (\S\ref{sec:cca}), ridge regression (\S\ref{sec:ridge}), spectral clustering (\S\ref{sec:spectral_clustering}), Gaussian processes\footnote{See the Bayesian interpretation below to see why they apply to Gaussian Processes.}, and more\footnote{This partial list is taken from Wikipedia: \url{http://en.wikipedia.org/wiki/Kernel_method}}. 29 | 30 | We now give an exposition of the method in the context of supervised learning. 31 | 32 | 33 | Think of smoothing splines (\S\ref{sec:smoothing_splines}); 34 | it was quite magical that without constraining the hypothesis class $\hypclass$, the ERM problem in Eq.(\ref{eq:smoothing_spline}) has a finite dimensional closed form solution. 35 | The property of an infinite dimensional problem having a solution in a finite dimensional space is known as the \emph{kernel property}.\marginnote{Kernel Property} 36 | We wish to generalize this observation and ask: which problems have the kernel property? 37 | Stating the general optimization problem: 38 | \begin{align} 39 | \label{eq:rkhs} 40 | \argmin{\hyp}{\frac{1}{n} \sum_i \loss(y_i,\hyp(x_i)) + \lambda J(\hyp) } 41 | \end{align} 42 | The question is then: what types of penalties $J(\hyp)$ will return simple solutions to Eq.(\ref{eq:rkhs})? 43 | The answer is: functions that belong to \emph{Reproducing Kernel Hilbert Space} function spaces. 44 | RKHS's are denoted by $\rkhs$. 45 | They include many functions, but they are a rather ``small'' subset of the space of all possible functions. 46 | These spaces, and the functions therein, are defined by another function called a \emph{Kernel}, denoted by $\kernel$. 47 | Choosing a particular kernel defines the space and the functions therein. 48 | Choosing a particular kernel also defines the form of $J$ in Eq.(\ref{eq:rkhs}). 49 | Put differently: for any choice of a kernel $\kernel$, there is a particular $J(\hyp)$ for which the solution of Eq.(\ref{eq:rkhs}) will be a function in $\rkhs$ and will be easily computable. 50 | 51 | 52 | \section{Mathematics of RKHS} 53 | We now show how choosing a kernel $\kernel$ defines a space $\rkhs$, and a penalty $J(\hyp)$.
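For concreteness (an example added here, not part of the original derivation), two kernels in common use are the polynomial kernel and the Gaussian (radial basis) kernel:
\begin{align*}
\kernel(x,y)=(1+\scalar x y)^d,
\qquad
\kernel(x,y)=\exp\left\{-\frac{\normII{x-y}^2}{2\sigma^2}\right\},
\end{align*}
where the degree $d$ and the bandwidth $\sigma$ are tuning parameters.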
54 | 55 | A kernel is a non-negative symmetric function of two arguments: $\kernel(x,y): \reals^p \times \reals^p \mapsto \reals_+$. 56 | By fixing $y$, $\kernel(x,y)$ is a function with a single argument $x \mapsto \kernel(x,y)$. 57 | $\rkhs$ is merely the space of functions of $x$, spanned at given $y$'s: 58 | \begin{align} 59 | \label{eq:rkhs_span} 60 | \hyp(x)=\sum_m \al_m \kernel(x,y_m) 61 | \end{align} 62 | 63 | From linear algebra, you may know that positive definite matrices can be diagonalized. 64 | This analogy carries to $\kernel$, which admits an eigen-expansion: 65 | \begin{align} 66 | \label{eq:rkhs_eigen} 67 | \kernel(x,y)=\sum_{i=1}^\infty \gamma_i \phi_i(x) \phi_i(y) 68 | \end{align} 69 | Using Eqs.(\ref{eq:rkhs_eigen}) and (\ref{eq:rkhs_span}) we can thus expand elements $\hyp$ of $\rkhs$: 70 | \begin{align} 71 | \hyp(x)=\sum_{i=1}^\infty c_i \phi_i(x) 72 | \end{align} 73 | where $c_i=\gamma_i \sum_m \al_m \phi_i(y_m)$. 74 | We also define a norm $\normrkhs{\hyp}^2$ in this space, which is induced by $\kernel$: 75 | \begin{align} 76 | \label{eq:rkhs_norm} 77 | \normrkhs{\hyp}^2 := \sum_{i=1}^\infty \frac{c_i^2}{\gamma_i} 78 | \end{align} 79 | 80 | The penalty $J(\hyp)$ in Eq.(\ref{eq:rkhs}) is simply $\normrkhs{\hyp}^2$. 81 | The $\hyp$'s that solve Eq.(\ref{eq:rkhs}) are guaranteed to have a simple form. They reside in an $n$ dimensional linear function space \citep{wahba_spline_1990}: 82 | \begin{align} 83 | \hyp(x)=\sum_{i=1}^n \al_i \kernel(x,x_i) 84 | \end{align} 85 | 86 | The functions $\kernel(x,x_i)$ can be seen as a basis for the solution space. 87 | The good news continues! Being only $n$ dimensional, the norms of these $\hyp$'s do not require integration but rather only finite summation: 88 | \begin{align} 89 | \normrkhs{\hyp}^2=\sum_{i=1}^n \sum_{j=1}^n \kernel(x_i,x_j) \al_i \al_{j} := \al' K \al. 90 | \end{align} 91 | 92 | Combining the above results, we can restate Eq.(\ref{eq:rkhs}) and say that when fixing $\kernel$ and using the appropriate $J$, we only need to solve: 93 | \begin{align} 94 | \label{eq:rkhs_simple} 95 | \argmin{\al}{\frac{1}{n} \sum_i \loss(y_i, K_i \al) + \lambda \al' K \al } 96 | \end{align} 97 | where $K_i$ is the $i$'th row of the kernel matrix $K$. This is a quadratic programming problem over an $n$ dimensional linear space, easily solvable with numeric routines. 98 | 99 | 100 | \section{The Bayesian View of RKHS} 101 | Just as the ridge regression (\S\ref{sec:ridge}) has a Bayesian interpretation, so does the kernel trick. 102 | Informally, the functions solving Eq.(\ref{eq:rkhs}) can be seen as the posterior mode if our prior beliefs postulate that the function we are trying to recover is a zero-mean Gaussian process with covariance given by $\kernel$. 103 | This view suggests the intuition that the regularization introduced by $J(\hyp)$ shrinks the estimated $\hyp$ towards a smoother function. At an extreme, where $\lambda\to\infty$, we will recover a constant function, since the mode of our Gaussian process prior is at the origin of $\rkhs$. 104 | 105 | 106 | \section{Kernel Generalization of Other Methods} 107 | [TODO: Sec 18.5.2] 108 | 109 | 110 | 111 | 112 | 113 | 114 | % % % % % % % % % The Spectral Trick % % % % % % % 115 | 116 | \chapter{The Spectral Trick} 117 | \label{apx:spectral} 118 | [TODO] 119 | 120 | 121 | % % % % % % % % Generative models % % % % % % % % 122 | 123 | \chapter{Generative Models} 124 | \label{apx:generative_concept} 125 | 126 | By \emph{generative model} we mean that we specify the whole data distribution.
This is particularly relevant to supervised learning, where many methods only assume the distribution $\dist(y|x)$ without stating the distribution $\dist(x)$. 127 | Assuming only $\dist(y|x)$ is known as a \emph{discriminative model}, or \emph{discriminative analysis}.\marginnote{Discriminative Model} 128 | In a generative model, in contrast, we assume the whole $\dist(y,x)$. 129 | 130 | For the mere purpose of making a prediction, we do not need to learn $\dist(y,x)$. 131 | Knowing this distribution, however, does permit making predictions, via Bayes' Theorem: 132 | $\dist(y|x)=\frac{\dist(y,x)}{\int\dist(y,x)dy}$. 133 | Generative models make use of this relation to make predictions. 134 | 135 | To gain some intuition, consider a supervised learning problem where the data has an equal number of samples per class. 136 | Learning the distribution of $x$ within each class allows a simple classification of a given $x$ to the class with the highest probability. LDA (\S\ref{sec:lda}), QDA (\S\ref{sec:lda}), and \Naive Bayes (\S\ref{sec:naive_bayes}) follow this exact same rationale. 137 | 138 | 139 | 140 | 141 | 142 | 143 | % % % % % % % % % Dimensionality Reduction % % % % % % % 144 | 145 | \chapter{Dimensionality Reduction} 146 | \label{apx:dim_reduce} 147 | 148 | Dimensionality reduction is a useful concept for both supervised and unsupervised learning. 149 | It allows us to represent high dimensional data in a lower dimension. 150 | This allows the visualization of the data in a human-tractable dimension, the application of low-dimensional algorithms, and the reduction of the computational burden when using the data for supervised learning. 151 | 152 | The fundamental idea behind dimensionality reduction is that while $\featureS$ may be high dimensional, and thus $\dist(x)$ hard to learn, there is hope that $\x$ does not really vary in the whole space. 153 | If the mass of $\dist(x)$ is concentrated around some low dimensional manifold $\manifold$, then the original problem might be approximated by learning the distribution of the projection $\dist(X \project \manifold)$ on $\manifold$. 154 | If $\manifold$ is fairly low dimensional, we may hope to visualize and understand $\dist(X \project \manifold)$ with fairly simple tools. 155 | Dimensionality reduction also reduces the memory required to represent the data. It is thus intimately related to \emph{lossy compression} in information theory.\marginnote{Lossy Compression} 156 | 157 | A similar reasoning justifies dimensionality reduction in supervised learning. 158 | While $\dist(x)$ might vary in the whole $\featureS$, there might be only a few directions which carry information on $y$. Learning $\dist(y|x)$ can thus be well approximated by $\dist(y|x \project \manifold)$. 159 | 160 | As was first observed in the context of PCA (\S\ref{sec:pca}), for many types of embeddings, i.e., for many target manifolds and reconstruction errors, we do not really need the original data $X$, but rather only a graph of similarities between data points ($\similaritys$). 161 | This allows the dimensionality reduction theory to borrow from the \emph{graph embedding} and \emph{graph drawing} literature. 162 | 163 | 164 | We can stratify dimensionality reduction methods along the following lines: 165 | 166 | \begin{description} 167 | \item[Linear-Space vs.
Non-Linear-Space Embeddings] 168 | When reducing the dimension of $X$, it can be mapped (embedded) into a linear subspace, $\manifold \subset \featureS$, or a non-linear $\manifold$. 169 | 170 | \item[Linear vs. Non-Linear Space Embeddings] 171 | Not to be confused with the previous item. 172 | The dimensionality reducing mapping, $X \project \manifold$, can be a linear operation on the data or a non-linear one. 173 | 174 | \item[Learning an Embedding vs. Learning an Embedding Function] 175 | When learning a mapping to a lower dimensional space, we can map the original data points (an embedding), or learn a mapping of the whole data space (an embedding function). 176 | \end{description} 177 | 178 | 179 | 180 | \section{Dimensionality Reduction in Supervised Learning} 181 | Dimensionality reduction is often performed before supervised learning to keep computational complexity low. 182 | It is sometimes performed on $X$ while ignoring $y$ (e.g. PCA Regression in \S\ref{sec:pca_regression}), and sometimes as part of the supervised learning (e.g. PLS in \S\ref{sec:pls}). 183 | 184 | From a statistical view-point, it is preferable to solve the supervised learning and dimensionality reduction simultaneously. This is because the subspace $\manifold$, which approximates $\dist(x)$ may differ than the one that approximates $\dist(y|x)$. 185 | From a computational view-point, however, it may be preferable to decouple the stages. 186 | 187 | 188 | 189 | 190 | \section{Graph Drawing} 191 | [TODO] 192 | 193 | 194 | 195 | 196 | 197 | % % % % % % % % % Latent Variables % % % % % % % 198 | 199 | \chapter{Latent Variables} 200 | \label{apx:latent} 201 | [TODO] 202 | 203 | 204 | 205 | 206 | % % % % % % % % % Information Theory % % % % % % % 207 | 208 | \chapter{Information Theory} 209 | \label{apx:information_theory} 210 | 211 | 212 | \begin{definition}[Entropy] 213 | \label{def:entropy} 214 | [TODO] 215 | \end{definition} 216 | 217 | 218 | 219 | \begin{definition}[Mutual Information] 220 | \label{def:mutual_information} 221 | [TODO] 222 | \end{definition} 223 | 224 | 225 | 226 | \begin{definition}[Kullback–Leibler Divergence] 227 | \label{def:kl_divergence} 228 | [TODO] 229 | \end{definition} 230 | 231 | 232 | 233 | 234 | 235 | % % % % % % Notation % % % % % 236 | 237 | 238 | \chapter{Notation} 239 | \label{apx:notation} 240 | 241 | In this text we use the following notation conventions: 242 | \begin{description} 243 | \item[$x$] A vector (or scalar). It is typically a column vector, but this should typically be implied from the text. 244 | \item[$\ones$] A vector of $1$'s. 245 | \item[$\x$] A vector (or scalar) valued random variable. 246 | \item[$X$] A matrix. 247 | \item[$\X$] A matrix valued random variable (a random matrix). 248 | \item[$X'$] The matrix transpose of $X$. 249 | \item[$\normII{x}$] The $l_2$ norm of $x$: $\sqrt{\sum_j x_j^2}$. 250 | \item[$\normI{x}$] The $l_1$ norm of $x$: $\sum_j |x_j|$ 251 | \item[$\normF{X}$] The Frobenius matrix norm of X: $\normF{X}^2=\sum_{ij} x_{ij}^2$ 252 | \item[$\ortho$] The space of orthogonal matrices. 253 | \item[$\scalar x y$] The scalar product of two vectors $x$ and $y$. 254 | \item[$\sample$] A data sample. 255 | \item[$\expect{\x}$] The expectation of $\x$. 256 | \item[$\expectn{x}$] The empirical expectation (average) of the vector $x$. 257 | \item[$\cov{\x}$] The covariance matrix of $\x$: $\expect{(\x-\expect{\x})(\x-\expect{\x})'}$. 258 | \item[$\covn{x}$] The empirical covariance matrix of x: $\expectn{(x-\expectn{x})(x-\expectn{x})'}$. 
259 | \item[$\rho(\x,\y)$] The correlation coefficient. 260 | \item[$\cdf{x}{t}$] The CDF of $\x$ at $t$. 261 | \item[$\icdf{x}{\al}$] The inverse CDF at $\al$ (the quantile function). 262 | \item[$\cdfn{x}{t}$] The empirical CDF of data vector $x$. 263 | \item[$\icdfn{x}{\al}$] The empirical $\al$ quantile of the data vector $x$. 264 | \item[$\x \sim \dist$] The random variable $\x$ is $\dist$ distributed. 265 | \item[$\pdf(x)$] The density function of $\dist$ at $x$. 266 | \item[$\gauss{\mu,\sigma^2}$] The univariate Gaussian distribution with mean $\mu$ and variance $\sigma^2$. 267 | \item[$\gauss{\mu,\Sigma}$] The multivariate Gaussian distribution with mean vector $\mu$ and covariance matrix $\Sigma$. 268 | \item[$\lik(\theta)$] The likelihood function at $\theta$. 269 | \item[$\loglik(\theta)$] The log likelihood function at $\theta$. 270 | \item[$\loss(x,\theta)$] The loss function of $\theta$ at $x$. 271 | \item[$\risk(\theta)$] The risk at $\theta$. 272 | \item[$\riskn(\theta)$] The empirical risk at $\theta$. 273 | \item[$\hyp(x)$] A prediction (hypothesis) at $x$. 274 | \item[$\hypclass$] The class of all hypotheses $\hyp$. 275 | \item[$\plane$] A hyperplane. 276 | \item[$\categories$] A set of categories. 277 | \item[$\positive{t}$] The positive part of $t$: $\max\{0,t \}$. 278 | \item[$\kernel(x,y)$] A kernel function evaluated at $(x,y)$. 279 | \item[$\indicator{A}$] The indicator function of the set $A$. 280 | \item[$\manifold$] A manifold. 281 | \item[$\project$] A projection operator. 282 | 283 | \item[$\similarity_{ij}$] A similarity measure between observations $i$ and $j$. 284 | \item[$\dissimilarity_{ij}$] A dissimilarity (i.e., distance) measure between observations $i$ and $j$. 285 | \item[$\similaritys$] A weighted graph (i.e. network, or matrix) of similarities between observations. 286 | \item[$\dissimilaritys$] A weighted graph (i.e. network, or matrix) of dissimilarities between observations. 287 | 288 | \item[$\kl{\x}{\y}$] Kullbeck-Leibler divergence between random variable $\x$ to $\y$. 289 | \item[$\entropy(\x)$] The entropy of random variable $\x$. 290 | \item[$\mutual{\x}{\y}$] The mutual information between $\x$ and $\y$. 
291 | 292 | 293 | \end{description} 294 | 295 | 296 | -------------------------------------------------------------------------------- /notes/art/avoid-overfitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/avoid-overfitting.png -------------------------------------------------------------------------------- /notes/art/bias_variance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/bias_variance.png -------------------------------------------------------------------------------- /notes/art/censored.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/censored.pdf -------------------------------------------------------------------------------- /notes/art/imputing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/imputing.pdf -------------------------------------------------------------------------------- /notes/art/irrelevant-features-hurt-knn-clustering.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/irrelevant-features-hurt-knn-clustering.png -------------------------------------------------------------------------------- /notes/art/irrelevant-features.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/irrelevant-features.png -------------------------------------------------------------------------------- /notes/art/non-linear-basis-functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/non-linear-basis-functions.png -------------------------------------------------------------------------------- /notes/art/som_simulation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/som_simulation.png -------------------------------------------------------------------------------- /notes/art/support-vector-machine-15-728.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/support-vector-machine-15-728.jpg -------------------------------------------------------------------------------- /notes/art/uncensored.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/uncensored.pdf -------------------------------------------------------------------------------- /notes/art/why-complex-models-can-turn-out-to-be-less-probable.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/art/why-complex-models-can-turn-out-to-be-less-probable.png
--------------------------------------------------------------------------------
/notes/collaborative.tex:
--------------------------------------------------------------------------------
1 | \chapter{Recommender Systems}
2 | \label{sec:recomender_systems}
3 | 
4 | 
5 | % % % % recommender systems % % % % % % %
6 | 
7 | A recommender system is, as the name suggests, software that gives recommendations to the user.
8 | Notable examples include book recommendations by Amazon and film recommendations by Netflix.
9 | The two main approaches to recommender systems are \emph{content filtering} and \emph{collaborative filtering}.
10 | 
11 | Two nice introductions to recommender systems can be found in \citet{koren_matrix_2009} and \citet{su_survey_2009}.
12 | 
13 | 
14 | % % % % Content filtering % % % % % %
15 | \section{Content Filtering}
16 | \label{sec:content_filtering}
17 | 
18 | In content filtering, the system is assumed to have some background information on the user (say, because he logged in), and uses
19 | this information to give him recommendations.
20 | The recommendation, in this case, is approached as a supervised learning problem:
21 | the system learns to predict a product's rating based on the user's features.
22 | It then computes the rating for many candidate products and recommends a set with high predicted ratings.
23 | 
24 | 
25 | 
26 | % % % % collaborative filtering % % % % %
27 | \section{Collaborative Filtering}
28 | \label{sec:collaborative_filtering}
29 | 
30 | Unlike content filtering, in \emph{collaborative filtering}, there is no external information on the user or the products, besides the ratings of other users.
31 | The term collaborative filtering was coined by the authors of the first such system, Tapestry \citep{goldberg_using_1992}.
32 | 
33 | Collaborative filtering can be approached as a supervised learning problem or as an unsupervised learning problem, but it is truly neither.
34 | It is essentially a \emph{missing data} problem.\marginnote{Missing Data}
35 | To see this, consider a matrix of rankings, $\rankings$, where the $i,j$'th entry, $\ranking_{i,j}$, is the ranking of user $i$ for movie $j$.
36 | Predicting $\ranking_{i,j'}$, i.e., the ranking of a currently unseen movie, is essentially an imputation of a missing value.
37 | It is exceptionally challenging, however, as in typical applications there is much more missing data than observed data.
38 | 
39 | 
40 | The two main approaches to collaborative filtering are \emph{neighbourhood methods} and \emph{latent factor models} \cite{koren_matrix_2009}.
41 | 
42 | \subsubsection{Neighbourhood Methods}
43 | The neighbourhood methods for collaborative filtering rest on the assumption that similar individuals have similar tastes.
44 | If someone similar to individual $i$ has seen movie $j'$, then $i$ should have a similar opinion.
45 | 
46 | The notion of using the neighbourhood of a data point is not a new one. We have seen it being used for supervised learning in kernel regression (\S\ref{sec:kernel}) and KNN (\S\ref{sec:knn}).
47 | 
48 | Neighbourhood methods for collaborative filtering, or missing data imputation in general, can thus be seen as a non-parametric approach to supervised learning problems, and solved in the same way. A small numerical sketch of this idea follows.
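For concreteness, here is a minimal R sketch of the neighbourhood idea on a hypothetical toy ratings matrix: the missing ranking of user $i$ for movie $j$ is imputed by a similarity-weighted average over the users who did rank movie $j$. The toy data, the correlation-based similarity, and the positive-weights rule are all illustrative assumptions, not a reference implementation.

```r
# A hypothetical toy ratings matrix: 6 users by 4 movies, NA marks a missing ranking.
R <- rbind(
  c( 5,  4, NA,  1),
  c( 4,  5,  1, NA),
  c(NA,  4,  2,  1),
  c( 1,  2,  5,  4),
  c( 2, NA,  4,  5),
  c( 1,  2,  5, NA))

# User-user similarities: correlations over the movies each pair has in common.
S <- cor(t(R), use = 'pairwise.complete.obs')
diag(S) <- NA  # a user is not its own neighbour

# Impute the ranking of user i for movie j by a similarity-weighted average
# of the (positively similar) users who did rank movie j.
impute.neighbourhood <- function(R, S, i, j) {
  rated <- which(!is.na(R[, j]))                      # users who ranked movie j
  w <- S[i, rated]
  keep <- !is.na(w) & w > 0                           # positive-similarity neighbours
  if (!any(keep)) return(mean(R[, j], na.rm = TRUE))  # fall back on the movie's mean
  sum(w[keep] * R[rated[keep], j]) / sum(w[keep])
}

impute.neighbourhood(R, S, i = 1, j = 3)  # predicted ranking of movie 3 by user 1
```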
49 | 
50 | 
51 | \begin{remark}[Collaborative Filtering and Other Supervised Learning Methods]
52 | If you are wondering whether neighbourhood methods are the only supervised learning methods that apply to collaborative filtering, the answer is that they are not.
53 | Any supervised learning method can be applied to impute entries in $\rankings$. Neighbourhood
54 | methods are merely the most popular.
55 | \end{remark}
56 | 
57 | 
58 | \subsubsection{Latent Factor Models}
59 | The latent factor approach to collaborative filtering rests on the assumption that the rankings are a function of some latent user attributes and latent movie attributes.
60 | This idea is not a new one, as we have seen it in the context of unsupervised learning in factor analysis (FA) (\S\ref{sec:factor_analysis}), independent component analysis (ICA) (\S\ref{sec:ica}), and other latent space generative models.
61 | We thus see that collaborative filtering, and missing data imputation in general, can be approached as an unsupervised learning problem.
62 | 
63 | As we will soon see, just like the FA problem (\S\ref{sec:factor_analysis}), the latent factor model implies that the data arises as a multiplication of matrices. This is why this approach is more commonly known as the \emph{matrix factorization} approach to collaborative filtering.\marginnote{Matrix Factorization}
64 | We will present several matrix factorization problems in the ERM framework.
65 | Note, however, that while stating the optimization problem requires only basic math and imagination, actually solving it is far from trivial. In fact, if you arbitrarily change the basic ERM problems below, plugging in your favourite loss function and generative model, you will probably find the resulting problem to be computationally unsolvable.
66 | 
67 | Having movie ratings in mind, the simplest collaborative filtering ERM problem is
68 | \begin{align}
69 | \label{eq:matrix_factorization}
70 | \argmin{\latentn,\loadings}{\sum_{i,j \in \kappa} (\rankings_{i,j} - \latentn_j' \loadings_i)^2 + \lambda (\normII{\latentn_j}^2+ \normII{\loadings_i}^2)},
71 | \end{align}
72 | where $\latentn_j$ is the vector of latent properties of movie $j$,
73 | $\loadings_i$ is the importance of a movie's properties to viewer $i$,
74 | and summation is performed over $\kappa$, the set of user--movie pairs which actually have a rating.
75 | As usual, the regularization parameter $\lambda$ can be chosen with a cross-validation approach (\S\ref{sec:cv}), or with the other unbiased risk estimation methods in Chapter~\ref{sec:desicion_theory}.
76 | 
77 | It may seem quite miraculous that by assuming a lower dimensional generative model, one may impute missing values.
78 | The following figures try to suggest an intuition.
79 | [TODO: add figures]
80 | 
81 | 
82 | 
83 | \begin{remark}[Matrix Norm Notation]
84 | We could write Eq.(\ref{eq:matrix_factorization}) using matrix norms, but we would then need to define multiplications with missing values. This is not hard to do, but I would rather avoid it right now.
85 | \end{remark}
86 | 
87 | \begin{remark}[Matrix Factorization and Factor Analysis]
88 | On the face of it, the matrix factorization problem in Eq.(\ref{eq:matrix_factorization}) seems very similar to the FA problem in Eq.(\ref{eq:factor}) with squared error loss.
89 | The reason we do not encounter the rotation invariance property in the solution to Eq.(\ref{eq:matrix_factorization}) is the $l_2$ regularization term.
90 | \end{remark}
91 | 
92 | The sketch below illustrates one way of fitting Eq.(\ref{eq:matrix_factorization}) numerically. We can then complicate the matrix factorization problem a little further.
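As a concrete illustration, here is a minimal R sketch of fitting Eq.(\ref{eq:matrix_factorization}) by alternating regularized least squares on simulated data: with one factor matrix held fixed, the penalized problem for the other is a ridge regression. The dimensions, the rank, the value of $\lambda$, the number of iterations, and the choice of solver are all illustrative assumptions, not a reference implementation.

```r
set.seed(1)
# Hypothetical toy setup: 30 users, 20 movies, rank-2 latent factors.
n.users <- 30; n.movies <- 20; q <- 2; lambda <- 0.1
A <- matrix(rnorm(n.users * q), n.users, q)      # true viewer loadings
S <- matrix(rnorm(n.movies * q), n.movies, q)    # true movie properties
R <- A %*% t(S) + matrix(rnorm(n.users * n.movies, sd = 0.1), n.users, n.movies)
R[sample(length(R), 0.5 * length(R))] <- NA      # half the ratings are missing

# Alternating ridge regressions (ALS): update one factor matrix at a time,
# each update being a small ridge regression on the observed entries only.
A.hat <- matrix(rnorm(n.users * q), n.users, q)
S.hat <- matrix(rnorm(n.movies * q), n.movies, q)
for (iter in 1:50) {
  for (i in 1:n.users) {    # update viewer i's loadings
    j.obs <- which(!is.na(R[i, ]))
    X <- S.hat[j.obs, , drop = FALSE]
    A.hat[i, ] <- solve(crossprod(X) + lambda * diag(q), crossprod(X, R[i, j.obs]))
  }
  for (j in 1:n.movies) {   # update movie j's latent properties
    i.obs <- which(!is.na(R[, j]))
    X <- A.hat[i.obs, , drop = FALSE]
    S.hat[j, ] <- solve(crossprod(X) + lambda * diag(q), crossprod(X, R[i.obs, j]))
  }
}

R.imputed <- A.hat %*% t(S.hat)                  # predictions, including missing entries
mean((R.imputed - A %*% t(S))[is.na(R)]^2)       # error on the missing entries, vs. the noiseless truth
```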
93 | We account for personal effects, movie effects, and time-varying preferences. 94 | The implied ERM problem is 95 | \begin{align} 96 | \label{eq:matrix_factorization_comlicated} 97 | \argmin{\latentn,\loadings, b_i, b_j}{ 98 | \sum_{i,j,t \in \kappa} (\rankings_{i,j}(t) - b_i(t) - b_j- \latentn_j' \loadings_i(t))^2 + \lambda (\normII{\latentn_j}^2+ \normII{\loadings_i(t)}^2 + b_i(t)^2 + b_j^2 ) 99 | }, 100 | \end{align} 101 | $\loadings_i(t)$ is the importance of a movies properties to viewer $i$ at period $t$, 102 | $b_j$ is an average appreciation of movie $j$~\footnote{A marketing effect?}, 103 | $b_i(t)$ is the average appreciation of viewer $i$~\footnote{A mood effect?}, 104 | and summation is performed over $\kappa$, which is the set of times, movies, and users, which actually have a rating. 105 | 106 | \begin{remark}[Temporal Dynamics and Tensor Factorization] 107 | When introducing a temporal dimension, the rating can not longer be presented as a matrix. 108 | Eq.(\ref{eq:matrix_factorization_comlicated}) can thus no longer be seen as a \emph{matrix} factorization problem. 109 | Indeed, this is a \emph{tensor} factorization problem. 110 | Tensor factorization is currently much less advanced than matrix factorization theory. Moreover, the numerical libraries the implement tensor factorization are much less developed than existing matrix algebra libraries \citet{lorica_lets_????}. 111 | This is why, IMHO, authors prefer to deal with tensors by stacking and kronecker products, rather then treating them as the tensors they are. 112 | \end{remark} 113 | 114 | 115 | 116 | 117 | % % % % Hybrid methods % % % % 118 | \section{Hybrid Filtering} 119 | After introducing the ideas of content filtering (\S\ref{sec:content_filtering}) and collaborative filtering (\S\ref{sec:collaborative_filtering}), why not marry the two? 120 | \emph{Hybrid filtering} is the idea of imputing the missing data, thus making recommendations, using both a viewer's attributes, and other viewers' preferences. 121 | 122 | A simple version of the implied ERM problem is 123 | \begin{align} 124 | \label{eq:hybrid_filtering} 125 | \argmin{\latentn,\loadings, \hyp}{\sum_{i,j \in \kappa} (\rankings_{i,j} - \latentn_j' \loadings_i - \hyp(x_i) )^2 + \lambda (\normII{\latentn_j}^2 + \normII{\loadings_i}^2+ J(\hyp)) }, 126 | \end{align} 127 | where $\hyp(x_i)$ is the effect of the attributes of viewer $i$ to his preferences, and $J(\hyp)$ is some regularization for the predictor's complexity. 128 | 129 | 130 | \section{Recommender Systems Terminology} 131 | 132 | Since the recommender systems literature did not stem from the statistical learning literature, it typically uses different terminology for very similar, if not identical, concepts. 133 | Here is a partial list of some common terms: 134 | 135 | \begin{itemize} 136 | \item \textbf{Content Based Filtering}: A supervised learning approach to recommendations. 137 | \item \textbf{Collaborative Filtering}: A missing data imputation approach to recommendations. 138 | \item \textbf{Memory Based Filtering}: A non parametric (neighbourhood) approach (\S\ref{sec:non_erm}) to collaborative filtering. 139 | \item \textbf{Model Based Filtering}: A latent space generative model approach (\S\ref{sec:latent_space}) to collaborative filtering. 
140 | \end{itemize} 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /notes/commands.tex: -------------------------------------------------------------------------------- 1 | % Custom environments 2 | 3 | \theoremstyle{plain} 4 | \newtheorem{thm}{Theorem}[section] 5 | \newtheorem{lemma}{Lemma}[section] 6 | \newtheorem{prop}{Proposition}[section] 7 | 8 | \theoremstyle{definition} 9 | \newtheorem{definition}{Definition}[chapter] 10 | \newtheorem{remark}{Remark}[section] 11 | \newtheorem{example}{Example}[section] 12 | 13 | 14 | 15 | 16 | 17 | % Custom commands 18 | 19 | \newcommand{\naive}{na\"{\i}ve } 20 | \newcommand{\Naive}{Na\"{\i}ve } 21 | \newcommand{\andor}{and\textbackslash or } 22 | \newcommand{\erdos}{Erd\H{o}s } 23 | \newcommand{\renyi}{R\`enyi } 24 | 25 | 26 | \newcommand{\al}{\alpha} 27 | \newcommand{\be}{\beta} 28 | 29 | \newcommand{\set}[1]{\left\{ #1 \right\}} % A set 30 | \newcommand{\rv}[1]{\mathbf{#1}} % A random variable 31 | \newcommand{\x}{\rv x} % The random variable x 32 | \newcommand{\y}{\rv y} % The random variable x 33 | \newcommand{\X}{\rv X} % The random variable x 34 | \newcommand{\Y}{\rv Y} % The random variable y 35 | \newcommand{\expect}[1]{\mathbf{E}\left[ #1 \right]} % The expectation operator 36 | \newcommand{\expectg}[2]{\mathbf{E}_{\rv{#1}}\left[ \rv{#2} \right]} % An expectation w.r.t. a particular random variable. 37 | \newcommand{\expectn}[1]{\mathbb{E}\left[#1\right]} % The empirical expectation 38 | \newcommand{\cov}[1]{\mathbf{C}ov \left[ #1 \right]} % The expectation operator 39 | \newcommand{\covn}[1]{\mathbb{C}ov \left[ #1 \right]} % The expectation operator 40 | \newcommand{\gauss}[1]{\mathcal{N}\left(#1\right)} % The gaussian distribution 41 | \newcommand{\cdf}[2]{F_\rv{#1} (#2)} % The CDF function 42 | \newcommand{\cdfn}[2]{\mathbb{F}_{#1}(#2)} % The empirical CDF function 43 | \newcommand{\icdf}[2]{F_\rv{#1}^{-1} (#2)} % The invecrse CDF function 44 | \newcommand{\icdfn}[2]{\mathbb{F}^{-1}_{#1}(#2)} % The inverse empirical CDF function 45 | \newcommand{\pdf}{p} % The probability density function 46 | \newcommand{\prob}[1]{P\left( #1 \right)} % the probability of an event 47 | \newcommand{\dist}{P} % The proabaiblity distribution 48 | \newcommand{\entropy}{H} % entropy 49 | \newcommand{\mutual}[2]{I\left(#1;#2\right)} % mutual information 50 | 51 | \newcommand{\estim}[1]{\widehat{#1}} % An estimator 52 | 53 | \newcommand{\norm}[1]{\Vert #1 \Vert} % The norm operator 54 | \newcommand{\normII}[1]{\norm{#1}_2} % The norm operator 55 | \newcommand{\normI}[1]{\norm{#1}_1} % The norm operator 56 | \newcommand{\normF}[1]{\norm{#1}_{Frob}} % The Frobenius matrix norm 57 | \newcommand{\ones}{\textbf{1}} % Vector of ones. 
58 | \newcommand{\lik}{\mathcal{L}} % The likelihood function 59 | \newcommand{\loglik}{L} % The log likelihood function 60 | \newcommand{\loss}{l} % A loss function 61 | \newcommand{\risk}{R} % The risk function 62 | \newcommand{\riskn}{\mathbb{R}} % The empirical risk 63 | \newcommand{\deriv}[2]{\frac{\partial #1}{\partial #2}} % A derivative 64 | \newcommand{\argmin}[2]{\mathop{argmin} _{#1}\set{#2}} % The argmin operator 65 | \newcommand{\argmax}[2]{\mathop{argmax}_{#1}\set{#2}} % The argmin operator 66 | \newcommand{\hyp}{f} % A hypothesis 67 | \newcommand{\hypclass}{\mathcal{F}} % A hypothesis class 68 | \newcommand{\hilbert}{\mathcal{H}} 69 | \newcommand{\rkhs}{\hilbert_\kernel} % A hypothesis class 70 | \newcommand{\normrkhs}[1]{\norm{#1}_{\rkhs}} % the RKHS function norm 71 | 72 | 73 | \newcommand{\plane}{\mathbb{L}} % A hypoerplane 74 | \newcommand{\categories}{\mathcal{G}} % The categories set. 75 | \newcommand{\positive}[1]{\left[ #1 \right]_+} % The positive part function 76 | \newcommand{\kernel}{\mathcal{K}} % A kernel function 77 | \newcommand{\featureS}{\mathcal{X}} % The feature space 78 | \newcommand{\indicator}[1]{I_{\set{#1}}} % The indicator function. 79 | \newcommand{\reals}{\mathbb{R}} % the set of real numbers 80 | 81 | 82 | 83 | \newcommand{\latent}{\rv{s}} % latent variables matrix 84 | \newcommand{\latentn}{S} % latent variables matrix 85 | \newcommand{\loadings}{A} % factor loadings matrix 86 | \newcommand{\rotation}{R} % rotation matrix 87 | \newcommand{\similaritys}{\mathfrak{S}} % a similarity graph 88 | \newcommand{\similarity}{s} % A similarity measure. 89 | \newcommand{\dissimilarity}{d} % A dissimilarity measure. 90 | \newcommand{\dissimilaritys}{\mathfrak{D}} % a dissimilarity graph 91 | \newcommand{\scalar}[2]{\left< #1,#2 \right>} % a scalar product 92 | 93 | 94 | 95 | \newcommand{\manifold}{\mathcal{M}} % A manifold. 96 | \newcommand{\project}{\hookrightarrow} % The orthogonal projection operator. 97 | \newcommand{\projectMat}{H} % A projection matrix. 98 | \newcommand{\rank}{q} % A subspace rank. 99 | \newcommand{\dimy}{K} % The dimension of the output. 100 | \newcommand{\encode}{E} % a linear encoding matrix 101 | \newcommand{\decode}{D} % a linear decoding matrix 102 | \DeclareMathOperator{\Tr}{Tr} 103 | \newcommand{\ensembleSize}{M} % Size of a hypothesis ensemble. 104 | \newcommand{\ensembleInd}{m} % Index of a hypothesis in an ensemble. 105 | 106 | 107 | \newcommand{\sample}{\mathcal{S}} % A data sample. 108 | \newcommand{\test}{\risk(\hyp)} % The test error (risk) 109 | \newcommand{\train}{\riskn(\hyp)} % The train error (empirical risk) 110 | \newcommand{\insample}{\bar{\risk}(\hyp)} % The in-sample test error. 111 | \newcommand{\EPE}{\risk(\hat{\hyp}_n)} % The out-of-sample test error. 112 | \newcommand{\folds}{K} % Cross validation folds 113 | \newcommand{\fold}{k} % Index of a fold 114 | \newcommand{\bootstraps}{B} % Bootstrap samples 115 | \newcommand{\bootstrap}{{b^*}} % Index of a bootstrap replication 116 | 117 | 118 | \newcommand{\rankings}{\mathcal{R}} % Rankings, for colaborative filtering. 119 | \newcommand{\ranking}{\mathcal{R}} % Rankings, for colaborative filtering. 
120 | \newcommand{\kl}[2]{D_{KL}\left(#1 \Vert #2 \right)} 121 | \newcommand{\ortho}{\mathbb{O}} % space of orthogonal matrices 122 | 123 | \newcommand{\id}[6]{ 124 | \begin{tabular}{|p{2cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|p{2cm}|} 125 | \hline Task & Type & Input & Output & Concept & Remark \\ 126 | \hline 127 | \hline #1 & #2 & #3 & #4 & #5 & #6 \\ 128 | \hline 129 | \end{tabular} 130 | \newline 131 | \newline 132 | } 133 | 134 | \newcommand{\union}{\cup} 135 | \newcommand{\intersect}{\cap} 136 | \newcommand{\supp}[1]{\mathop{support}(#1)} 137 | \newcommand{\conf}[2]{\mathop{confidence}(#1 \Rightarrow #2)} 138 | \newcommand{\lift}[2]{\mathop{lift}(#1 \Rightarrow #2)} 139 | \newcommand{\convic}[2]{\mathop{conviction}(#1 \Rightarrow #2)} 140 | -------------------------------------------------------------------------------- /notes/estimation.tex: -------------------------------------------------------------------------------- 1 | 2 | \chapter{Estimation} 3 | \label{sec:estimation} 4 | 5 | In this section, we present several estimation principles. 6 | Their properties are not discussed, as the section is merely a reminder and a preparation for what follows. 7 | These concepts and examples can be found in many introductory books to statistics. I particularly recommend \cite{wasserman_all_2004} or \cite{abramovich_statistical_2013}. 8 | 9 | \section{Moment matching} 10 | \label{sec:moment_matching} 11 | 12 | The fundamental idea: match empirical moments to theoretical. I.e., estimate 13 | $$ \expect{g(X)} $$ 14 | by 15 | $$ \expectn{g(X)} $$ 16 | where $\expectn{g(x)}:=\frac{1}{n} \sum_i g(x_i)$, is the empirical mean. 17 | 18 | \begin{example}[Exponential Rate] 19 | 20 | Estimate $\lambda$ in $\x_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d. 21 | $\expect{x}=1/\lambda$. 22 | $\Rightarrow \estim{\lambda}=1/\expectn{x}$ . 23 | 24 | \end{example} 25 | 26 | 27 | \begin{example}[Linear Regression] 28 | 29 | Estimate $\be$ in $\y \sim \gauss{X\be,\sigma^2 I}$, a $p$ dimensional random vector. 30 | $\expect{y}=X\be$ and $\expectn{y}=y$. 31 | Clearly, moment matching won't work because no $\be$ satisfies $X\be=y$. 32 | A technical workaround: 33 | Since $\be$ is $p$ dimensional, I need to find some $g(\y): \mathbb{R}^n \mapsto \mathbb{R}^p$. 34 | Well, $g(y):=Xy$ is such a mapping. I will use it, even though my technical justification is currently unsatisfactory. We thus have: 35 | $\expect{X'y}=X'X\be$ which I match to $\expectn{X'y}=X'y$: 36 | $$ 37 | X'X \be = X' y \Rightarrow \estim{\be}=(X'X)^{-1} X'y. 38 | $$ 39 | 40 | \end{example} 41 | 42 | 43 | \section{Quantile matching} 44 | \label{sec:quantiles} 45 | 46 | The fundamental idea: match empirical quantiles to theoretical. 47 | Denoting by $\cdf{x}{t}$ the CDF of $\x$, then $\icdf x \al$ is the $\al$ quantile of $\x$. 48 | Also denoting by $\cdfn x t$ the Empirical CDF of $x_1,\dots, x_n$, then $\icdfn x \al$ is the $\al$ quantile of $x_1,\dots, x_n$. 49 | The quantile matching method thus implies estimating 50 | $$ \icdf x \al $$ 51 | by 52 | $$ \icdfn x \al . $$ 53 | 54 | \begin{example}[Exponential rate] 55 | Estimate $\lambda$ in $\x_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d. 56 | \begin{align*} 57 | & \cdf x t = 1-\exp(-\lambda t) = \al \Rightarrow \\ 58 | & \icdf x \al = \frac{-\log(1-\al)}{\lambda} \Rightarrow \\ 59 | & \icdf{x}{0.5} = \frac{-\log(0.5)}{\lambda} \Rightarrow \\ 60 | & \estim{\lambda} = \frac{-\log(0.5)}{\icdfn{x}{0.5}}. 
61 | \end{align*} 62 | 63 | \end{example} 64 | 65 | 66 | \section{Maximum Likelihood} 67 | \label{sec:ml} 68 | 69 | The fundamental idea is that if the data generating process (i.e., the \emph{sampling distribution}) can be assumed, then the observations are probably some high probability instance of this process, and not a low probability event: 70 | Let $\x_1,\dots,\x_n \sim P_\theta$, with density (or probability) $p_\theta(x_1,\dots,x_n)$. 71 | Denote the likelihood, as a function of $\theta$: $\lik(\theta): p_\theta(x_1,\dots,x_n)$. 72 | Then $$\estim{\theta}_{ML}:= argmax_{\theta}\set{ \lik(\theta) }.$$ 73 | 74 | Using a monotone mapping such as the log, does not change the $argmax$. 75 | Denote $$\loglik(\theta):=\log(\lik(\theta)).$$ 76 | 77 | 78 | \begin{example}[Exponential rate] 79 | 80 | Estimate $\lambda$ in $X_i \sim exp(\lambda)$, $i=1,\dots,n$, i.i.d. 81 | Using the exponential PDF and the i.i.d. assumption 82 | $$ \lik(\lambda) = \lambda^n \exp(-\lambda \sum_i X_i), $$ 83 | and 84 | $$ \loglik(\lambda) = n \log(\lambda) -\lambda \sum_i X_i. $$ 85 | 86 | By differentiating and equating $0$, we get $\estim{\lambda}_{ML}=1/\expectn{X}$. 87 | 88 | \end{example} 89 | 90 | \begin{example}[Discrete time Markov Chain] 91 | 92 | Estimate the transition probabilities, $p_1$ and $p_2$ in a two state, $\set{0,1}$, discrete time, Markov chain where: 93 | $P(\x_{t+1}=1|x_{t}=0)=p_1$ and $P(\x_{t+1}=1|X_{t}=1)=p_2$. 94 | The likelihood: 95 | $$ 96 | \lik(p_1,p_2)= 97 | P(X_2,\dots,X_T;X_1,p_1,p_2)= 98 | \prod_{t=1}^T P(X_{t+1}=x_{t+1}|X_{t}=x_t). 99 | $$ 100 | We denote $n_{ij}$ the total number of observed transitions from $i$ to $j$ and get that $\estim{p}_1=\frac{n_{01}}{n_{01}+n_{00}}$, and that $\estim{p}_2=\frac{n_{11}}{n_{11}+n_{10}}$. 101 | 102 | \begin{remark}[Confession] 103 | Well, this is a rather artificial example, as because of the Markov property, and the stationarity of the process, we only need to look at transition events, themselves Bernoulli distributed. 104 | This example does show, however, the power of the ML method to deal with non i.i.d. samples. As does the next example. 105 | \end{remark} 106 | \end{example} 107 | 108 | 109 | 110 | 111 | \begin{example}[Autoregression of order 1 (AR(1))] 112 | Estimate the drift parameter $a$, in a discrete time Gaussian process where: 113 | $\x_{t+1}=a \x_t+ \varepsilon; \varepsilon \sim \gauss{0,\sigma^2} \Rightarrow \x_{t+1}|\x_t \sim \gauss{a x_t,\sigma^2}$. 114 | 115 | We start with the conditional density at time $t+1$: 116 | $$ 117 | p_{\x_{t+1}|\x_t=x_t}(x_{t+1}) = 118 | (2 \pi \sigma^2)^{-1/2} \exp \left( 119 | -\frac{1}{2 \sigma^2}(x_{t+1}-a x_t)^2 120 | \right). 121 | $$ 122 | Moving to the likelihood: 123 | $$ 124 | \lik(a) = 125 | (2 \pi \sigma^2)^{-T/2} \exp \left( 126 | -\frac{1}{2 \sigma^2}\sum_{t=1}^T (x_{t+1}-a x_t)^2 127 | \right). 128 | $$ 129 | Taking the log and differentiating with respect to $a$ and equating $0$ we get $\estim{a}_{ML}=\frac{\sum x_{t+1}x_{t}}{\sum x_t^2}$. 130 | 131 | We again see the power of the ML device. 132 | Could we have arrived to this estimator by intuiton alone? Hmmmm... maybe. 133 | See that $Cov[X_{t+1},X_t] = a \; Var[X_t] \Rightarrow a=\frac{Cov[X_{t+1},X_t]}{Var[X_t]}$. 134 | So $a$ can also be derived using the moment matching method which is probably more intuitive. 135 | 136 | \end{example} 137 | 138 | 139 | 140 | 141 | \begin{example}[Linear Regression] 142 | 143 | Estimate $\be$ in $Y \sim \gauss{X\be,\sigma^2 I}$, a $p$ dimensional random vector. 
144 | Recalling the multivariate Gaussian PDF:
145 | $$
146 | p_{\mu,\Sigma}(y) =
147 | (2 \pi)^{-n/2} |\Sigma|^{-1/2} \exp\left(
148 | -\frac{1}{2} (y-\mu)' \Sigma^{-1} (y-\mu)
149 | \right)
150 | $$
151 | So in the regression setup:
152 | $$
153 | \lik(\be)=
154 | p_{\be,\sigma^2}(y) =
155 | (2 \pi)^{-n/2} |\sigma^2 I|^{-1/2} \exp\left(
156 | -\frac{1}{2 \sigma^2} \normII{y-X\be}^2
157 | \right)
158 | $$
159 | and $\estim{\be}_{ML}$ equals
160 | \begin{align}
161 | \estim{\be}_{ML}=(X'X)^{-1} X'y.
162 | \end{align}
163 | 
164 | 
165 | \end{example}
166 | 
167 | 
168 | \section{M-Estimation and Empirical Risk Minimization}
169 | \label{sec:m_estimation}
170 | 
171 | M-Estimation, known as Empirical Risk Minimization (ERM) in the machine learning literature, is a very wide framework which stems from statistical decision theory.
172 | The underlying idea is that each realization of $\x$ incurs some loss, and we seek a "policy", in this case a parameter $\theta^*$, that minimizes the average loss.
173 | In the econometric literature, we do not incur a loss but rather gain a utility; we thus seek a policy that maximizes the average utility.
174 | 
175 | \begin{definition}[Loss Function]
176 | The penalty for predicting $\theta$ when observing $x$:
177 | \begin{align}
178 | \loss(x;\theta).
179 | \end{align}
180 | 
181 | \end{definition}
182 | \begin{definition}[Risk Function]
183 | The expected loss:
184 | \begin{align}
185 | \risk(\theta):=\expect{\loss(\x;\theta)}.
186 | \end{align}
187 | 
188 | \end{definition}
189 | The best prediction, $\theta^*$, is then the minimizer of the expected risk:
190 | \begin{align}
191 | \label{eq:risk_min}
192 | \theta^*:= \argmin{\theta}{\risk(\theta)}.
193 | \end{align}
194 | 
195 | As we do not know the distribution of $\x$, we cannot solve Eq.(\ref{eq:risk_min}), so we minimize the \emph{empirical} risk instead.
196 | \begin{definition}[Empirical Risk]
197 | The average loss in the sample:
198 | \begin{align}
199 | \riskn(\theta):=\expectn{\loss(x;\theta)}=\frac{1}{n}\sum_i \loss(x_i;\theta).
200 | \end{align}
201 | \end{definition}
202 | 
203 | A prediction that can actually be computed from the data is thus the empirical risk minimizer $\estim{\theta}$:
204 | \begin{align}
205 | \label{eq:empirical_risk_min}
206 | \estim{\theta}:= \argmin{\theta}{\riskn(\theta)}.
207 | \end{align}
208 | 
209 | 
210 | 
211 | \begin{example}[Squared Loss]
212 | \label{eg:squared_loss}
213 | 
214 | Let $\loss(x;\theta)=(x-\theta)^2$. Then
215 | $
216 | \risk(\theta) =
217 | \expect{(\x-\theta)^2} =
218 | (\expect{\x}-\theta)^2 + Var[\x].
219 | $
220 | Clearly $Var[\x]$ does not depend on $\theta$, so that $\risk(\theta)$ is minimized by $\theta^*=\expect{\x}$.
221 | \textbf{We thus say that the expectation of a random variable is the minimizer of the squared loss.}
222 | 
223 | How do we estimate the population expectation? Well, a natural estimator is the empirical mean, which is also the minimizer of the empirical risk $\riskn(\theta)$. The proof is immediate by differentiating.
224 | \end{example}
225 | 
226 | 
227 | \begin{example}[Ordinary Least Squares (OLS)]
228 | \label{eg:OLS}
229 | Define the loss $\loss(y,x;\be):=\frac{1}{2}(y-x\be)^2$.
230 | Computing the risk, $\expect{\frac{1}{2} \normII{\y-\x\be}^2}$, would require dealing with the joint distribution of $(\x,\y)$.
231 | We don't really care about that right now.
232 | We merely want to see that the empirical risk minimization problem is actually the classical OLS problem.
233 | And well, it is (by definition actually):
234 | \begin{align*}
235 | \riskn(\be)=\sum_{i=1}^n \frac{1}{2}(y_i-x_i\be)^2 = \frac{1}{2}\normII{y-X\be}^2.
236 | \end{align*}
237 | Minimization is easiest with vector derivatives, but I will stick to regular derivatives:
238 | \begin{align*}
239 | \deriv{\riskn(\be)}{\be_j} = \sum_i \left[ \left(y_i-\sum_{k=1}^p x_{ik}\be_k\right)(-x_{ij}) \right].
240 | \end{align*}
241 | Equating to $0$ gives the $j$'th normal equation, $\sum_i x_{ij}(y_i-\sum_{k=1}^p x_{ik}\be_k) = 0$.
242 | Solving the $p$ normal equations jointly and putting them in matrix notation, we get
243 | \begin{align}
244 | \estim{\be}_{OLS}=(X'X)^{-1} X'y.
245 | \end{align}
246 | 
247 | \end{example}
248 | 
249 | 
250 | \section{Notes}
251 | 
252 | \subsection{Maximum Likelihood}
253 | Maximum likelihood estimators are a particular instance of M-estimators, if we set the loss function to be the negative log likelihood of the (true) sampling distribution.
254 | 
255 | 
256 | \subsection{Choosing the Loss Function}
257 | While the squared error loss is by far the most popular, we should not revert to it automatically.
258 | There are several considerations when choosing the loss function.
259 | Most ERM learning methods can be applied with different loss functions.
260 | 
261 | The first consideration is computational complexity: if you choose a loss function that leads to a non-convex empirical risk, you are in trouble. There are no guarantees you will be able to find the risk minimizer in finite computing time.
262 | 
263 | The second consideration is the nature of the outcome $y$. Some loss functions are more appropriate for continuous $y$'s and some for categorical $y$'s. Typical loss functions for continuous $y$'s are the squared loss, the absolute loss, and the Huber loss.
264 | Typical loss functions for categorical $y$'s are the Binomial likelihood loss (also known as the Cross Entropy, or Deviance), and the hinge loss.
265 | 
266 | A third consideration, which is rarely given the importance it should get, is: ``What is the meaning of $\theta^*$?'' Or, ``What are we actually estimating?''
267 | As we have seen in Example~\ref{eg:squared_loss}, the squared loss implies we are aiming to estimate the population mean.
268 | What are we estimating when we use the hinge loss? The binomial loss?
269 | We will not discuss these matters, as we are discussing methods where these choices have already been made for us.
270 | When the day comes and you start devising your own learning algorithms, you will need to give some thought to this question.
271 | 
--------------------------------------------------------------------------------
/notes/graphics.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Graphics"
3 | author: "Jonathan Rosenblatt"
4 | date: "June 7, 2015"
5 | output: html_document
6 | ---
7 | 
8 | The scripts producing the graphics in the class notes.
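The chunks below rely on the `%>%` pipe, so a small setup chunk loading `magrittr` is added here to keep the document self-contained.

```{r setup}
# The chunks in this document use the %>% pipe from magrittr; load it up front.
library(magrittr)
```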
9 | 10 | 11 | # Imputing missing data 12 | Create complete data 13 | ```{r create} 14 | n <- 20 15 | noise <- rnorm(n*2) %>% matrix(ncol=2) 16 | signal <- outer(rexp(n,1/2),c(1,1)) %>% scale 17 | x <- signal + noise 18 | x.range <- 1.1* range(x[,1]); y.range <- 1.1* range(x[,2]) 19 | plot(signal, xlab='Movie 1', ylab='Movie 2', xlim = x.range, ylim = y.range) 20 | points(x, pch=19) 21 | arrows(x0=signal[,1], y0=signal[,2], x1=x[,1], y1=x[,2], col='darkgrey') 22 | ``` 23 | 24 | Then censor some points 25 | ```{r censor} 26 | censoring.ind <- cbind(1:n, sample(c(NA,1,2), n, replace=TRUE, prob = c(2,1,1))) 27 | x.censored <- x 28 | x.censored[censoring.ind] <- NA 29 | points(x.censored, col='red', pch=19) # Observed points 30 | #So this is what we know 31 | x.censored.1.ind <- (censoring.ind[,2]==1) %>% sapply(isTRUE) 32 | x.censored.2.ind <- (censoring.ind[,2]==2) %>% sapply(isTRUE) 33 | # plot(x.censored) 34 | abline(h=x.censored[x.censored.1.ind,2], lty=2) 35 | abline(v=x.censored[x.censored.2.ind,1], lty=2) 36 | ``` 37 | 38 | 39 | Let's try to impute using a 1D linear space embedding and reconstruction: 40 | ```{r} 41 | x.censored.clean <- x.censored %>% na.omit 42 | svd.1 <- x.censored.clean %>% svd 43 | d.2 <- diag(svd.1$d) 44 | d.2[2,2] <- 0 45 | x.censored.reduced <- svd.1$u %*% d.2 %*% t(svd.1$v) 46 | points(x.censored.reduced, col='green', pch=19) 47 | lm.1 <- lm(x.censored.reduced[,2]~x.censored.reduced[,1]) 48 | abline(lm.1, col='darkgreen') 49 | ``` 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /notes/introduction.tex: -------------------------------------------------------------------------------- 1 | \chapter{Introduction} 2 | \label{sec:introduction} 3 | 4 | This text is intended to collect many machine learning methods and algorithms, present then, and organize them. 5 | The treatment of the different concepts attempts to be as intuitive as possible, and mathematics is presented only when unavoidable, or when adding special insight. 6 | 7 | Extra effort has been put into organizing the methods and algorithms along some fundamental building blocks, which are now briefly presented. 8 | 9 | \begin{description} 10 | 11 | \item[The learning problem] 12 | The first distinction between the different methods is along the tasks they perform, which closely corresponds to the type of data at hand. 13 | This includes Supervised Learning (\S\ref{sec:supervised}) and Unsupervised Learning (\S\ref{sec:unsupervised}). 14 | Within each, we find several sub-tasks: 15 | \begin{description} 16 | 17 | \item[Supervised Learning] Includes classification tasks, and regression tasks. The first, predicting a categorical outcome, and the latter, a continuous outcome. 18 | 19 | \item[Unsupervised Learning] Includes the learning of the data generating distribution, or related tasks such as detecting high density regions and clustering. 20 | \end{description} 21 | 22 | As we will see, not all learning tasks fall into these categories. Collaborative Filtering (\S\ref{sec:collaborative_filtering}) is an example of a learning task that is neither. It is a missing data imputation task. 23 | 24 | \item[An optimization problem vs. an algorithm] 25 | Both supervised and unsupervised learning methods can be classified as either an explicit algorithm, or as an optimization problem, agnostic to the optimization algorithm used to solve it. 
26 | 
27 | \item[Dimension Reduction]
28 | Both supervised and unsupervised learning methods can include a dimensionality reduction stage.
29 | This can be motivated by the need to reduce computational complexity, to apply low-dimensional algorithms down the road, to allow a visualization of the data, or simply to allow some human-tractable interpretation of the data.
30 | This is discussed in Appendix~\ref{apx:dim_reduce}.
31 | 
32 | We can further stratify dimensionality reduction methods along these lines:
33 | \begin{description}
34 | \item[Linear-Space vs. Non-Linear-Space Embeddings]
35 | When reducing the dimension of the data, it can be mapped (embedded) into a linear subspace of the original space, or a non-linear one.
36 | 
37 | \item[Linear vs. Non-Linear Space Embeddings]
38 | Not to be confused with the previous item.
39 | The dimensionality reduction can be a linear operation on the data or a non-linear one.
40 | 
41 | \item[Learning an Embedding vs. Learning an Embedding Function]
42 | When learning a mapping to a lower dimensional space, we can map the original data points (an embedding), or learn a mapping of the whole data space (an embedding function).
43 | \end{description}
44 | 
45 | \item[Kernel Trick]
46 | Both supervised and unsupervised learning methods can include a ``kernel trick''.
47 | This will happen when we wish to learn complex functions of the data, but keep computations quick.
48 | The kernel trick is applicable to methods that do not need the whole data, but rather only some measure of similarity between the points.
49 | The idea is that many complicated functions are merely linear combinations of the distances to other points.
50 | This is further elaborated in Appendix~\ref{apx:rkhs}.
51 | 
52 | \item[Generative vs. Discriminative Models]
53 | Both supervised and unsupervised learning methods can benefit from an assumption on the data generating process, i.e., the sampling distribution.
54 | Generative models are those where we assume this process.
55 | In discriminative models, which appear in supervised learning, we do not assume the data generating process, but merely the nature of the relation between the features and the outcome.
56 | 
57 | \item[Feature based vs. Graph Based]
58 | Unsupervised learning tasks can be classified into those that require the full features of the data, and those that require only some measure of similarity between data points. As such, the latter methods can be seen as graph based methods, where the similarities are represented as a graph.
59 | 
60 | \item[Fully Observed vs. Latent Space Models]
61 | Both supervised and unsupervised learning methods can include unobservable, i.e. latent, variables.
62 | 
63 | \end{description}
64 | 
65 | 
66 | 
67 | \paragraph{Fully Automated Processes?}
68 | The machine learning literature draws heavily from the statistical literature.
69 | You should bear in mind that the ultimate goal of machine learning is replacing a ``hard-coded'' algorithm, which externalizes the programmer's knowledge, with a self-teaching algorithm.
70 | It may thus seem that problems like visualization do not belong in the realm of machine learning, as they are not completely automated.
71 | This is not completely accurate because, while we want the \emph{application} stage of an algorithm to be automated, we can sometimes allow for a human to be involved in the \emph{learning} stage.
72 | 
73 | 
74 | \paragraph{Notation}
75 | The notation conventions may seem non-standard, as they borrow from several lines of literature.
76 | These conventions were chosen as we find them to be clear and concise. 77 | They are collected in Appendix \ref{apx:notation}. 78 | 79 | \paragraph{Sources} 80 | This text draws mostly from \cite{hastie_elements_2003} and \cite{shalev-shwartz_understanding_2014}. 81 | The former is freely available online. 82 | For a softer introduction, with more hands-on examples, see \cite{james_introduction_2013}, also freely available online. 83 | All books are very well written and strongly recommended. 84 | More references can be found in the Bibliography (Appendix \ref{sec:bibliography}). 85 | 86 | -------------------------------------------------------------------------------- /notes/notes.loa: -------------------------------------------------------------------------------- 1 | \contentsline {algorithm}{\numberline {1}{\ignorespaces Random Forest}}{27}{algorithm.1} 2 | \contentsline {algorithm}{\numberline {2}{\ignorespaces Rotation Forest}}{27}{algorithm.2} 3 | \contentsline {algorithm}{\numberline {3}{\ignorespaces Forward Search}}{32}{algorithm.3} 4 | \contentsline {algorithm}{\numberline {4}{\ignorespaces PCA Regression}}{33}{algorithm.4} 5 | \contentsline {algorithm}{\numberline {5}{\ignorespaces Commitee Methods}}{36}{algorithm.5} 6 | \contentsline {algorithm}{\numberline {6}{\ignorespaces Model Averaging}}{37}{algorithm.6} 7 | \contentsline {algorithm}{\numberline {7}{\ignorespaces Stacking}}{37}{algorithm.7} 8 | \contentsline {algorithm}{\numberline {8}{\ignorespaces Bagging}}{38}{algorithm.8} 9 | \contentsline {algorithm}{\numberline {9}{\ignorespaces Jackknife}}{43}{algorithm.9} 10 | \contentsline {algorithm}{\numberline {10}{\ignorespaces Cross Validation}}{43}{algorithm.10} 11 | \contentsline {algorithm}{\numberline {11}{\ignorespaces Bootstrap}}{44}{algorithm.11} 12 | \contentsline {algorithm}{\numberline {12}{\ignorespaces K-Means}}{72}{algorithm.12} 13 | \contentsline {algorithm}{\numberline {13}{\ignorespaces K-Medoids}}{73}{algorithm.13} 14 | \contentsline {algorithm}{\numberline {14}{\ignorespaces Spectral Clustering}}{76}{algorithm.14} 15 | -------------------------------------------------------------------------------- /notes/notes.loe: -------------------------------------------------------------------------------- 1 | \addvspace {10\p@ } 2 | \contentsline {example}{\numberline {2.1.1}Example\thmtformatoptarg {Exponential Rate}}{11}{example.2.1.1} 3 | \contentsline {example}{\numberline {2.1.2}Example\thmtformatoptarg {Linear Regression}}{11}{example.2.1.2} 4 | \contentsline {example}{\numberline {2.2.1}Example\thmtformatoptarg {Exponential rate}}{12}{example.2.2.1} 5 | \contentsline {example}{\numberline {2.3.1}Example\thmtformatoptarg {Exponential rate}}{12}{example.2.3.1} 6 | \contentsline {example}{\numberline {2.3.2}Example\thmtformatoptarg {Discrete time Markov Chain}}{13}{example.2.3.2} 7 | \contentsline {remark}{\numberline {2.3.1}Remark\thmtformatoptarg {Confession}}{13}{remark.2.3.1} 8 | \contentsline {example}{\numberline {2.3.3}Example\thmtformatoptarg {Autoregression of order 1 (AR(1))}}{13}{example.2.3.3} 9 | \contentsline {example}{\numberline {2.3.4}Example\thmtformatoptarg {Linear Regression}}{14}{example.2.3.4} 10 | \contentsline {definition}{\numberline {2.1}Definition\thmtformatoptarg {Loss Function}}{14}{definition.2.1} 11 | \contentsline {definition}{\numberline {2.2}Definition\thmtformatoptarg {Risk Function}}{14}{definition.2.2} 12 | \contentsline {definition}{\numberline {2.3}Definition\thmtformatoptarg {Empirical Risk}}{15}{definition.2.3} 13 | 
\contentsline {example}{\numberline {2.4.1}Example\thmtformatoptarg {Squared Loss}}{15}{example.2.4.1} 14 | \contentsline {example}{\numberline {2.4.2}Example\thmtformatoptarg {Ordinary Least Squares (OLS)}}{15}{example.2.4.2} 15 | \addvspace {10\p@ } 16 | \contentsline {remark}{\numberline {3.1.1}Remark\thmtformatoptarg {No Sampling Distribution}}{18}{remark.3.1.1} 17 | \contentsline {remark}{\numberline {3.1.2}Remark\thmtformatoptarg {OLS Extensions}}{19}{remark.3.1.2} 18 | \contentsline {remark}{\numberline {3.1.3}Remark\thmtformatoptarg {Generalized Linear Models (GLM)}}{21}{remark.3.1.3} 19 | \contentsline {remark}{\numberline {3.1.4}Remark}{21}{remark.3.1.4} 20 | \contentsline {remark}{\numberline {3.1.5}Remark\thmtformatoptarg {Name Origins}}{22}{remark.3.1.5} 21 | \contentsline {remark}{\numberline {3.1.6}Remark\thmtformatoptarg {Solve the right problem}}{23}{remark.3.1.6} 22 | \contentsline {remark}{\numberline {3.1.7}Remark\thmtformatoptarg {Not a pure ERM}}{23}{remark.3.1.7} 23 | \contentsline {remark}{\numberline {3.1.8}Remark\thmtformatoptarg {Not a pure ERM}}{24}{remark.3.1.8} 24 | \contentsline {remark}{\numberline {3.1.9}Remark\thmtformatoptarg {Universal Approximator}}{24}{remark.3.1.9} 25 | \contentsline {remark}{\numberline {3.3.1}Remark\thmtformatoptarg {Hypothesis Testing Driven Variable Selection}}{32}{remark.3.3.1} 26 | \contentsline {remark}{\numberline {3.3.2}Remark\thmtformatoptarg {PCAR and Ridge Regression}}{33}{remark.3.3.2} 27 | \contentsline {remark}{\numberline {3.4.1}Remark\thmtformatoptarg {LDA and OLS classification}}{35}{remark.3.4.1} 28 | \contentsline {remark}{\numberline {3.5.1}Remark}{36}{remark.3.5.1} 29 | \addvspace {10\p@ } 30 | \contentsline {remark}{\numberline {4.2.1}Remark}{42}{remark.4.2.1} 31 | \addvspace {10\p@ } 32 | \contentsline {remark}{\numberline {5.1.1}Remark\thmtformatoptarg {Unsupervised Learning in the ERM framework}}{48}{remark.5.1.1} 33 | \contentsline {remark}{\numberline {5.2.1}Remark}{48}{remark.5.2.1} 34 | \contentsline {example}{\numberline {5.2.1}Example\thmtformatoptarg {First Order Univariate Markov Process}}{49}{example.5.2.1} 35 | \contentsline {remark}{\numberline {5.2.2}Remark\thmtformatoptarg {Restricted Bolzmann Machine}}{49}{remark.5.2.2} 36 | \contentsline {remark}{\numberline {5.4.1}Remark\thmtformatoptarg {Interpreting ``Linear''}}{51}{remark.5.4.1} 37 | \contentsline {remark}{\numberline {5.4.2}Remark\thmtformatoptarg {Interpreting the Low Dimensional Representation}}{51}{remark.5.4.2} 38 | \contentsline {definition}{\numberline {5.1}Definition\thmtformatoptarg {SVD}}{56}{definition.5.1} 39 | \contentsline {remark}{\numberline {5.4.3}Remark\thmtformatoptarg {Classical and Least Squares MDS}}{59}{remark.5.4.3} 40 | \contentsline {remark}{\numberline {5.4.4}Remark\thmtformatoptarg {The Non-Linearity of Local MDS}}{59}{remark.5.4.4} 41 | \contentsline {remark}{\numberline {5.4.5}Remark\thmtformatoptarg {The Non-Linearity of Isomap}}{60}{remark.5.4.5} 42 | \contentsline {remark}{\numberline {5.5.1}Remark\thmtformatoptarg {Non Linear Dimensionality Reduction}}{60}{remark.5.5.1} 43 | \contentsline {remark}{\numberline {5.5.2}Remark\thmtformatoptarg {Information Bottleneck and ICA}}{62}{remark.5.5.2} 44 | \contentsline {example}{\numberline {5.6.1}Example\thmtformatoptarg {Intelligence Measure (g-factor)}}{63}{example.5.6.1} 45 | \contentsline {example}{\numberline {5.6.2}Example\thmtformatoptarg {Face Rotations}}{63}{example.5.6.2} 46 | \contentsline {remark}{\numberline {5.6.1}Remark\thmtformatoptarg 
{Identifiability in PCA}}{63}{remark.5.6.1} 47 | \contentsline {remark}{\numberline {5.6.2}Remark\thmtformatoptarg {Non Linear FA}}{64}{remark.5.6.2} 48 | \contentsline {remark}{\numberline {5.6.3}Remark\thmtformatoptarg {ICA and FA}}{65}{remark.5.6.3} 49 | \contentsline {example}{\numberline {5.6.3}Example\thmtformatoptarg {Intelligence Factor Continued}}{66}{example.5.6.3} 50 | \contentsline {remark}{\numberline {5.6.4}Remark\thmtformatoptarg {Projection Pursuit and ICA}}{67}{remark.5.6.4} 51 | \contentsline {remark}{\numberline {5.6.5}Remark\thmtformatoptarg {Finite Mixture Distributions}}{68}{remark.5.6.5} 52 | \contentsline {remark}{\numberline {5.6.6}Remark\thmtformatoptarg {Mixtures And the Expectation Maximization Algorithm (EM)}}{68}{remark.5.6.6} 53 | \contentsline {remark}{\numberline {5.6.7}Remark\thmtformatoptarg {Mixtures For Clustering}}{68}{remark.5.6.7} 54 | \contentsline {remark}{\numberline {5.6.8}Remark\thmtformatoptarg {Mixture in Supervise Learning}}{69}{remark.5.6.8} 55 | \contentsline {remark}{\numberline {5.7.1}Remark\thmtformatoptarg {Relation to Spectral Clustering}}{71}{remark.5.7.1} 56 | \contentsline {remark}{\numberline {5.8.1}Remark\thmtformatoptarg {The population equivalent of K-means}}{72}{remark.5.8.1} 57 | \addvspace {10\p@ } 58 | \contentsline {remark}{\numberline {6.2.1}Remark\thmtformatoptarg {Collaborative Filtering and Other Supervised Learning Methods}}{79}{remark.6.2.1} 59 | \contentsline {remark}{\numberline {6.2.2}Remark\thmtformatoptarg {Matrix Norm Notation}}{80}{remark.6.2.2} 60 | \contentsline {remark}{\numberline {6.2.3}Remark\thmtformatoptarg {Matrix Factorization and Factor Analysis}}{80}{remark.6.2.3} 61 | \contentsline {remark}{\numberline {6.2.4}Remark\thmtformatoptarg {Temporal Dynamics and Tensor Factorization}}{80}{remark.6.2.4} 62 | \addvspace {10\p@ } 63 | \contentsline {definition}{\numberline {G.1}Definition\thmtformatoptarg {Entropy}}{91}{definition.G.1} 64 | \contentsline {definition}{\numberline {G.2}Definition\thmtformatoptarg {Mutual Information}}{91}{definition.G.2} 65 | \contentsline {definition}{\numberline {G.3}Definition\thmtformatoptarg {Kullback\IeC {\textendash }Leibler Divergence}}{91}{definition.G.3} 66 | -------------------------------------------------------------------------------- /notes/notes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/notes/notes.pdf -------------------------------------------------------------------------------- /notes/notes.tex: -------------------------------------------------------------------------------- 1 | \documentclass[12pt,a4paper]{report} 2 | 3 | 4 | \usepackage[utf8]{inputenc} 5 | \usepackage{amsmath} 6 | \usepackage{amsfonts} 7 | \usepackage{amssymb} 8 | \usepackage{graphicx} 9 | \usepackage{amsthm} 10 | \usepackage{natbib} 11 | \usepackage{algorithm} 12 | \usepackage{algpseudocode} 13 | \usepackage{framed} 14 | 15 | \usepackage{hyperref} 16 | \AtBeginDocument{\let\textlabel\label} 17 | \hypersetup{ 18 | colorlinks=true, 19 | linkcolor=black, 20 | citecolor=black, 21 | filecolor=black, 22 | urlcolor=black, 23 | } 24 | 25 | 26 | \usepackage{marginnote} 27 | \renewcommand*{\marginfont}{\scriptsize } 28 | 29 | \usepackage{thmtools} % for lists of theorems 30 | 31 | 32 | \input{commands} 33 | 34 | 35 | \author{Jonathan Rosenblatt} 36 | \title{Class Notes (experimental)} 37 | 38 | 39 | \begin{document} 40 | 41 | \maketitle 42 | 43 | \tableofcontents 
44 | 45 | 46 | 47 | 48 | 49 | %%%%%%%%% Algorithms %%%%%%%%%%% 50 | \newpage 51 | \listofalgorithms 52 | \addcontentsline{toc}{chapter}{List of Algorithms} 53 | 54 | \renewcommand{\listtheoremname}{List of Definitions} 55 | \listoftheorems[ignoreall,show={definition}] 56 | 57 | 58 | \renewcommand{\listtheoremname}{List of Examples} 59 | \listoftheorems[ignoreall,show={example}] 60 | 61 | 62 | 63 | % % % Introduction % % % % 64 | \input{introduction} 65 | 66 | 67 | 68 | % % % % % Estimation % % % % % 69 | \input{estimation} 70 | 71 | 72 | % % % % % % Supervised Learning % % % % % % 73 | \input{supervised} 74 | 75 | 76 | % % % % % % Statistical Descision Theory % % % % % 77 | \input{statistical_decision} 78 | 79 | 80 | % % % % % % Unsupervised % % % % % 81 | \input{unsupervised} 82 | 83 | 84 | % % % % % % Collaborative Filtering % % % % % 85 | \input{collaborative} 86 | 87 | 88 | 89 | 90 | % % % % % % Appendices % % % % % % 91 | \newpage 92 | 93 | \appendix 94 | 95 | \input{appendices} 96 | 97 | 98 | 99 | 100 | %%%%%%%%% Bibliography %%%%%%%%%%% 101 | \newpage 102 | \addcontentsline{toc}{chapter}{Bibliography} 103 | \bibliographystyle{abbrvnat} 104 | \bibliography{Intro2MachineLearning} 105 | \label{sec:bibliography} 106 | 107 | 108 | \end{document} -------------------------------------------------------------------------------- /notes/statistical_decision.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | \chapter{Statistical Decision Theory} 5 | \label{sec:desicion_theory} 6 | 7 | This section follows the spirit of Section~7 in \cite{hastie_elements_2003}, up to some changes in notation. 8 | 9 | In Section~\ref{sec:learning}, we gave an intuitive argument for which without some inductive bias, learning will return models with poor performance on new data. 10 | In this section we learn how to quantify the performance of a model. In particular, when given new data. This allows us to select among competing candidate models. It will also allow us to choose the value of the regularization parameter of each method. 11 | 12 | Figure~\ref{fig:bias_variance} demonstrate the prediction error (red curve) of some model as the model complexity increases. As can be seen, the prediction error decreases as the model becomes more complex, but saturates at some point. 13 | This is because the reduction in the bias is smaller than the increase in variance of learning very complex models. 14 | This is the celebrated bias-variance tradeoff.\marginnote{Bias Variance Tradeoff} 15 | 16 | Once we are able to estimate the prediction error from our data, we will seek for a model which minimizes this error. 17 | 18 | \begin{figure}[h] 19 | \centering 20 | \includegraphics[width=1\textwidth]{art/support-vector-machine-15-728} 21 | \caption{Overfitting: 22 | Prediction error on new data (red curve) versus the empirical prediction error (light blue). 23 | The empirical prediction error will always decrease as more complicated models are fit (moving right). 24 | The prediction error on new data, however, will not always decrease and will typically show a local minima. 25 | \label{fig:bias_variance}} 26 | \end{figure} 27 | 28 | Before we proceed, we now need to distinguish between several types of prediction errors. 29 | The population \emph{risk} of a model parametrized by $\theta$, was previously defined as the average loss over all possible data instances, and denoted by $\risk(\theta)$ (\S \ref{sec:m_estimation}). 
30 | The empirical risk was defined as the average loss over the observed data points, and denoted by $\riskn(\theta)$.
31 | We now update these definitions to deal with the $\hyp(x)$ notation of the previous section.
32 | \begin{align}
33 | \test :=& \expectg{Y,X}{\loss(Y,\hyp(X))}, \label{eq:test_error} \\
34 | \train :=& \expectn{\loss(y,\hyp(x))} = \frac{1}{n} \sum_i \loss(y_i,\hyp(x_i)), \label{eq:training_error} \\
35 | \insample :=& \frac{1}{n} \sum_i \expectg{Y}{\loss(Y,\hyp(x_i))}, \label{eq:in_sample} \\
36 | \EPE :=& \expectg{\estim{\hyp}_n}{
37 | \expectg{Y,X}{\loss(Y,\estim{\hyp}_n(X))|\estim{\hyp}_n}
38 | }.\label{eq:epe}
39 | \end{align}
40 | 
41 | Eq.(\ref{eq:test_error}) is merely a reformulation of $\risk(\theta)$ from Section~\ref{sec:m_estimation}.
42 | It captures the expected loss that a given predictor, $\hyp(X)$, will incur on average when given new $X$'s and $Y$'s.
43 | This is the magnitude that will tell us which models perform well and which do not.
44 | It is known as the \emph{test error}, or as the \emph{prediction error}.\marginnote{Test Error}
45 | 
46 | Eq.(\ref{eq:training_error}) is the reformulation of the empirical risk, $\riskn(\theta)$, we have been optimizing in Section~\ref{sec:learning}.
47 | We referred to it as the \emph{empirical risk}, but it is also known as the \emph{train error}.
48 | \marginnote{Train Error}
49 | 
50 | Eq.(\ref{eq:in_sample}) is the average risk at the observed $x$'s, when given new $Y$'s \footnote{This magnitude should not be unfamiliar: e.g., inference in ANOVA is performed conditional on the $x$'s, which typically stem from a designed experiment.}.
51 | This is the \emph{in sample error}.
52 | \marginnote{In Sample Error}
53 | 
54 | Eq.(\ref{eq:epe}) is called the \emph{expected prediction error}, i.e., the expected loss when $\hyp$ is also re-learned.
55 | Put differently: how much would we err when (1) we are given $n$ new examples $\sample_1$; (2) we re-learn $\estim{\hyp}_n$ on $\sample_1$; and (3) we compute the risk of $\estim{\hyp}_n$ (in the population, not in $\sample_1$)?
56 | We emphasize this by writing $\estim{\hyp}_n$ instead of $\hyp$.
57 | $\EPE$ is thus not a property of a particular predictor $\hyp$, but rather of a whole learning algorithm on random samples of size $n$.
58 | It could have also been written as $\risk(algorithm)$, although I have not seen this notation in use.
59 | \marginnote{Expected Prediction Error}
60 | 
61 | 
62 | We would like to compare the performance of models based on $\test$, as this will give us an idea of the quality of the prediction on new data.
63 | Alas, computing $\test$ requires the distribution of $y$ and $x$, while we only have access to the $n$ observed samples.
64 | Can the empirical risk $\train$ estimate the unknown risk $\test$?
65 | Figure~\ref{fig:bias_variance} suggests it cannot, since $\train$ underestimates $\test$.
66 | Why is this?
67 | At an intuitive level: this is because with ERM we learn the $\hyp$ with the smallest error in each sample.
68 | It is thus the same as estimating the expected height in a population by using the minimum in each sample; we will clearly be underestimating the expectation. Then again, there is the hope that we may take this minimum and debias it.
69 | This is the goal in the next sections.
70 | 
71 | Before proceeding, we distinguish between two similar tasks:
72 | \begin{description}
73 | \item[Model Selection] This is the task of selecting between several candidate models.
74 | \item[Model Assessment] This is the task of assessing the prediction error (i.e., the expected loss, the risk) of a given model. 75 | \end{description} 76 | 77 | 78 | 79 | \section{Train, Validate, Test} 80 | \label{sec:train_test} 81 | If data is abundant, a trivial, assumption-free way to estimate $\test$\footnote{Think: why is it $\test$ that is being estimated, and not $\EPE$ nor $\insample$?} is to split the data into $3$ sets. 82 | A \emph{training set}, used to learn several competing models. 83 | A \emph{validation set}, used to check the performance of the learned models and choose the best performer using some comparison measure. 84 | A \emph{test set}, used to estimate the risk of the chosen model: on data untouched by learning and selection, the empirical risk $\train$ is an unbiased estimator of the population risk $\test$. 85 | 86 | If there is not enough data for this scheme, keep reading... 87 | 88 | 89 | \section{Unbiased Estimators of the Risk} 90 | \label{sec:risk_estimation} 91 | Under appropriate assumptions, the bias in $\train$ when estimating $\insample$\footnote{In this case, note that it is $\insample$ being estimated, and not $\test$ nor $\EPE$.} can be computed analytically, and accounted for. 92 | The bias $\insample-\train$ is called the \emph{optimism} of the algorithm.\marginnote{Optimism} 93 | Akaike's Information Criterion (AIC), 94 | the finite sample Corrected AIC (AICc), 95 | Mallows' Cp (Cp), 96 | the Bayesian Information Criterion (BIC, aka SBC, aka SBIC), 97 | the Minimum Description Length (MDL), 98 | Vapnik's Structural Risk Minimization (SRM), 99 | the Deviance Information Criterion (DIC), 100 | and the Hannan-Quinn Information Criterion (HQC), 101 | all try to estimate $\insample$ by correcting for the optimism under different assumptions.\marginnote{Cp, AIC, BIC, MDL, SRM} 102 | 103 | The differences, pros, and cons of each will not be discussed herein. Just remember what they mean when you see them in your favourite software (R!). 104 | They all have in common that you will want the model with the smallest criterion. 105 | But be careful: as they are intended for model selection, their scale is arbitrary, and they should thus not be interpreted as the expected prediction error itself. 106 | 107 | \begin{remark} 108 | Not all model selection criteria estimate $\insample$. The Focused Information Criterion (FIC), for example, does not. 109 | \end{remark} 110 | 111 | 112 | 113 | 114 | 115 | \paragraph{Further Reading} 116 | For a brief review of AIC, BIC, MDL and SRM see Chapter 7 in \citep{hastie_elements_2003}. 117 | For a more rigorous derivation, see \cite{claeskens_model_2008}. 118 | 119 | 120 | 121 | 122 | 123 | \section{Jackknifing} 124 | \label{sec:jackknife} 125 | 126 | If concerned with overfitting, here is a simple algorithm to estimate the prediction error: 127 | 128 | \begin{algorithm}[H] 129 | \caption{Jackknife} 130 | \begin{algorithmic} 131 | \For {$i \in 1,\dots,n$} 132 | \State $\estim{\hyp}^{(i)} \gets$ the model learned with all but the $i$'th observation. 133 | \State $\loss^{(i)} \gets$ the loss of $\estim{\hyp}^{(i)}$ on the $i$'th observation. 134 | \EndFor 135 | \State \Return the average loss over $\loss^{(i)}$. 136 | \end{algorithmic} 137 | \end{algorithm} 138 | 139 | This process is called the \emph{Jackknife}, or \emph{Leave-One-Out Cross-Validation}. 140 | This algorithm returns an estimator of $\EPE$. 141 | This might be quite surprising: every split uses an almost identical sample, so why would it not estimate $\test$? See Section 7.12 in \cite{hastie_elements_2003} for details. 
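Written out in the notation of this section, the quantity returned by the Jackknife is
\begin{align*}
\frac{1}{n} \sum_{i=1}^n \loss\left(y_i, \estim{\hyp}^{(i)}(x_i)\right),
\end{align*}
i.e., each observation is predicted by a model that has never seen it.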
142 | 143 | But wait! We might be able to stabilize the variability of the estimated error in every split if, instead of leaving only a single observation aside, we leave out some more. This leads the way to \emph{K-Fold Cross Validation} in the next section. 144 | 145 | 146 | \section{Cross Validation} 147 | \label{sec:cv} 148 | 149 | \begin{algorithm}[H] 150 | \caption{Cross Validation} 151 | \begin{algorithmic} 152 | \State Split the data into $\folds$ parts (``folds''). 153 | \For {$\fold \in 1,\dots,\folds$} 154 | \State $\estim{\hyp}^{(\fold)} \gets$ the model learned with all \emph{except} the observations in the $\fold$'th fold. 155 | \State $\loss^{(\fold)} \gets$ the average loss of $\estim{\hyp}^{(\fold)}$ on the observations in the $\fold$'th fold. 156 | \EndFor 157 | \State \Return the average of $\loss^{(\fold)}$. 158 | \end{algorithmic} 159 | \end{algorithm} 160 | 161 | This simple algorithm estimates $\EPE$ without any assumption on the data-generating process, and with less data than would be required for a ``train-validate-test'' scheme. 162 | Well, since it actually serves for model selection, it should be seen as a ``train-validate'' scheme, without the ``test'' part. It is thus \emph{not} an unbiased estimate of $\EPE$. See Section 7.12 in \cite{hastie_elements_2003} for details. 163 | 164 | But wait again! 165 | The Cross Validation scheme resamples the data \emph{without replacement} to estimate $\EPE$. Could we have sampled it \emph{with} replacement? Yes. This is the idea underlying the \emph{Bootstrapping} scheme. 166 | 167 | 168 | \section{Bootstrapping} 169 | \label{sec:bootstrap} 170 | 171 | Here is the simplest version of Bootstrap validation: 172 | 173 | \begin{algorithm}[H] 174 | \caption{Bootstrap} 175 | \begin{algorithmic} 176 | \For {$\bootstrap \in 1,\dots,\bootstraps$} 177 | \State $\sample^\bootstrap \gets$ $n$ randomly selected observations, with replacement, from the original data. 178 | \State $\estim{\hyp}^{\bootstrap} \gets$ the model learned with $\sample^\bootstrap$. 179 | \State $\loss^{\bootstrap} \gets$ the average loss of $\estim{\hyp}^{\bootstrap}$ on the observations in the \emph{original} data. 180 | \EndFor 181 | \State \Return the average of $\loss^{\bootstrap}$. 182 | \end{algorithmic} 183 | \end{algorithm} 184 | 185 | This algorithm is not a good estimator of $\EPE$, since the same observations play a role both in learning and in validation. 186 | Several corrections are available. For details see Section 7.11 in \cite{hastie_elements_2003}. 187 | 188 | The Bootstrap is a very general scheme, which can be used not only for model validation, but also for assessing many statistical properties of an estimator. It is possibly best known for its use in hypothesis testing. 189 | For more on the Bootstrap, see \cite{efron_introduction_1994}. 190 | 191 | 192 | \subsection{.632 Rule} 193 | [TODO] -------------------------------------------------------------------------------- /project.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Project" 3 | author: "Jonathan Rosenblatt" 4 | date: "March 24, 2015" 5 | output: html_document 6 | --- 7 | 8 | Here are the guidelines for the course's concluding project. 9 | The project is based on the [Bike Sharing Demand](https://www.kaggle.com/c/bike-sharing-demand) competition at Kaggle. 10 | You are required to submit a prediction to Kaggle, and a report on the process to me. 11 | 12 | # Dates 13 | Teaming up: no later than __27.4.2015__. 14 | Submit prediction to Kaggle: __29.5.2015__. 
15 | Submit report to Jonathan: __26.6.2015__. 16 | 17 | Recommended time-table: 18 | 19 | 1. During the Passover vacation, download the data. Make sure you can load it and practice `dplyr` and `lubridate` on it. 20 | 2. After Passover, find your team and notify me. 21 | 3. Keep revisiting the data as we progress and study new techniques. Don't leave everything to the submission date. 22 | 23 | 24 | # Guidelines 25 | 26 | 1. Your task is to participate in the [Bike Sharing Demand](https://www.kaggle.com/c/bike-sharing-demand) competition. The competition ends on __29.5.2015__, when you will have to submit your predictions to Kaggle. 27 | 2. You can do so in pairs, or trios. 28 | 3. By the end of the course you will need to submit to me a report documenting the process. 29 | - No longer than 8 pages (not including appendices). 30 | - Submitted by mail, which includes: 31 | - A PDF file with the report. 32 | - Author names and IDs. 33 | - Should contain the sections: 34 | - Background: Some background on the competition. 35 | - Scoring: The scoring criterion in the competition. What loss function with what data? 36 | - The data: What data was provided for learning? What files in what formats? Which variables? How did you handle them? 37 | - Algorithms: Which learning algorithms did you try? 38 | - Results: What score did you achieve? What was your ranking in the competition? 39 | - Discussion: Why were you successful/unsuccessful? What other ideas would you have liked to try? What were the major challenges? 40 | - Code should be added in appendices. 41 | 4. Feel free to use the course's forums for questions, especially regarding the use of Kaggle and R. Make sure, however, that you do not share your solutions. 42 | 5. Any non-trivial choices you made in the project need to be justified: tell me "why", not only "what". 
43 | 44 | -------------------------------------------------------------------------------- /sample_questions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Questions" 3 | author: "Jonathan Rosenblatt" 4 | date: "June 1, 2015" 5 | output: 6 | html_document: 7 | toc: no 8 | pdf_document: 9 | toc: no 10 | --- 11 | 12 | # Sample Questions 13 | ```{r preamble, cache=TRUE, echo=FALSE, results='hide'} 14 | suppressPackageStartupMessages(library(knitr)) 15 | suppressPackageStartupMessages(library(magrittr)) # for piping 16 | suppressPackageStartupMessages(library(plyr)) 17 | suppressPackageStartupMessages(library(dplyr)) # for handeling data frames 18 | 19 | .iris <- iris[,1:4] %>% scale 20 | .iris.y <- iris$Species=='virginica' 21 | .iris.dissimilarity <- dist(.iris) 22 | 23 | suppressPackageStartupMessages(library(arules)) 24 | data("Groceries") 25 | rules <- apriori(Groceries, parameter = list(support=0.001, confidence=0.5)) 26 | 27 | l2 <- function(x) x^2 %>% sum %>% sqrt 28 | l1 <- function(x) abs(x) %>% sum 29 | MSE <- function(x) x^2 %>% mean 30 | missclassification <- function(tab) sum(tab[c(2,3)])/sum(tab) 31 | 32 | 33 | suppressPackageStartupMessages(library(ElemStatLearn)) # for data 34 | data("prostate") 35 | data("spam") 36 | 37 | 38 | 39 | 40 | # Continous outcome: 41 | prostate.train <- prostate %>% 42 | filter(train) %>% 43 | select(-train) 44 | prostate.test <- prostate %>% 45 | filter(!train) %>% 46 | select(-train) 47 | y.train <- prostate.train$lcavol 48 | X.train <- prostate.train %>% select(-lcavol) %>% as.matrix 49 | y.test <- prostate.test$lcavol 50 | X.test <- prostate.test %>% select(-lcavol) %>% as.matrix 51 | 52 | 53 | 54 | # Categorical outcome: 55 | n <- nrow(spam) 56 | 57 | train.prop <- 0.66 58 | train.ind <- c(TRUE,FALSE) %>% 59 | sample(size = n, prob = c(train.prop,1-train.prop), replace=TRUE) 60 | spam.train <- spam[train.ind,] 61 | spam.test <- spam[!train.ind,] 62 | 63 | y.train.spam <- spam.train$spam 64 | X.train.spam <- spam.train %>% select(-spam) %>% as.matrix 65 | y.test.spam <- spam.test$spam 66 | X.test.spam <- spam.test %>% select(-spam) %>% as.matrix 67 | 68 | spam.dummy <- spam %>% mutate(spam=as.numeric(spam=='spam')) 69 | spam.train.dummy <- spam.dummy[train.ind,] 70 | spam.test.dummy <- spam.dummy[!train.ind,] 71 | 72 | suppressPackageStartupMessages(library(glmnet)) 73 | lasso.1 <- glmnet(x=X.train, y=y.train, alpha = 1) 74 | 75 | ``` 76 | 77 | 78 | 79 | 1. Based on the following Biplot... \newline 80 | ```{r, echo=FALSE, eval=TRUE, fig.width = 6, fig.height = 4 } 81 | pca <- prcomp(.iris) 82 | ggbiplot::ggbiplot(pca) # better! 83 | ``` 84 | a. How many variables were in the original data? 85 | a. What original variables are captured by the first principal component? 86 | a. What original variables are captured by the second principal component? 87 | a. How many groups/clusters do you see in the data? 88 | 1. 89 | ```{r, eval=FALSE} 90 | n <- 100 91 | p <- 10 92 | X <- rnorm(n*p) %>% matrix(ncol = p, nrow=n) 93 | sigma <- 1e1 94 | epsilon <- rnorm(n, mean = 0, sd = sigma) 95 | y <- X %*% beta + epsilon 96 | ``` 97 | a. What does the code do? 98 | a. What is the dimension of `beta`? 99 | a. Can I fit a neural network to the data? Explain. 100 | 1. How does the graphical model alleviate the parameter dimensionality problem? 101 | 1. What is the difference between FA and ICA. 102 | 1. What is the cutoff of OLS classification with -1,3 encoding. 103 | 1. Name three clustering methods. 
Explain them. 104 | 1. You want to cluster individuals based on their LinkedIn acquaintances: name an algorithm you __cannot__ use. 105 | 1. 106 | ```{r Cross Validation, eval=FALSE} 107 | hmmm <- 10 108 | ahhh <- sample(1:5, nrow(data), replace = TRUE) 109 | that <- NULL 110 | 111 | for (yup in 1:hmmm){ 112 | wow <- data[ahhh!=yup,] 113 | arrrg <- data[ahhh==yup,] 114 | ok <- lm(y~. ,data = wow) 115 | nice <- predict(ok, newdata=arrrg) 116 | good <- nice - arrrg$y 117 | that <- c(that, good) 118 | } 119 | 120 | MSE(that) 121 | ``` 122 | a. What is the method implemented in the code? 123 | a. What problem does the method solve? 124 | 1. 125 | ```{r, eval=FALSE} 126 | y1 <- prcomp(.iris, scale. = TRUE) 127 | y2 <- y1$x[,1:2] 128 | y3 <- glm(.iris.y~y2) 129 | ``` 130 | a. Knowing that `.iris.y` is a two-level categorical variable, what does the code do? 131 | a. What could be a motivation for the proposed method? 132 | 1. 133 | ```{r, eval=FALSE} 134 | y1 <- prcomp(.iris, scale. = TRUE) 135 | y2 <- y1$x[,1:2] 136 | y3 <- kmeans(y2,3) 137 | ``` 138 | a. What does the code do? 139 | a. What can be the motivation for the proposed method? 140 | 1. Two scientists claim to have found two unobservable movie attributes, that drive viewers' satisfaction in the Netflix data (movie ratings data). They both used the same data and factor analysis. One claims the factors are the "action factor" and "drama factor". The other claims it is "comedy factor" and the "animation factor". Try to resolve the situation with your knowledge of factor analysis. 141 | 1. 142 | $argmin_\beta \{ \frac{1}{n}\sum_i (y_i-x_i\beta)^2 + \lambda/2 \Vert\beta\Vert_2^2 \}$ 143 | a. What is the name of the problem above? 144 | a. Does the solution enjoy the sparsity property? 145 | a. What is the regularization parameter? Name two methods for choosing it. 146 | 1. For the purpose of interpreting the predictor, would you prefer the CART or the NNET? Explain. 147 | 1. In order to estimate the covariance matrix in a Gaussian graphical model: should I estimate it directly or via its inverse? Explain. 148 | 1. Describe a method for selecting the number of mixing components in a mixture model using train-test samples. 149 | 1. Describe the stages of an algorithm to simulate $n$ samples from a two-state hidden Markov model. Assume you can generate data from Bernoulli and Gaussian distributions. 150 | 1. What assumption in ICA solves the FA rotation problem? 151 | 1. What is the LASSO ERM problem? Write the formula. 152 | 1. What is the OLS ERM problem? Write the formula. 153 | 1. What is the ridge ERM problem? Write the formula. 154 | 1. Name two algorithms for unbiased estimation of the population risk $R(\theta)$. 155 | 1. Name two unbiased estimators of the in-sample--prediction-error: 156 | $\bar{R}(f):=\frac{1}{n} \sum_i E_Y[l(Y,f(x_i))]$. 157 | 1. Suggest an algorithm to choose the number of principal components using cross validation. Write in pseudo-code. 158 | 1. Can the principal components in the PCA problem be estimated using maximum likelihood? Explain. 159 | 1. What can the logistic regression estimate that the SVM cannot? 160 | 1. Can any function be approximated using the LASSO? Put differently- does the LASSO have the Universal Approximator property? 161 | 1. Write the Bernoulli likelihood loss function. To what type of $y$ does it apply? What class of `R` objects holds this data type? 162 | 1. Name two methods for dimensionality reduction in supervised learning. Explain each briefly. 163 | 1. 
Here is some pseudo-code: 164 | - Set $M$ candidate learning algorithms. 165 | - For $m \in 1,\dots,M$, do 166 | - $\hat{f}^m(x) :=$ the predictor learned with the $m$'th algorithm. 167 | - EndFor 168 | - Set $\bar{f}(x) :=\frac{1}{M} \sum_{m=1}^M \hat{f}^m(x)$. 169 | - Return $\bar{f}(x)$. 170 | a. What is the name of the method above? 171 | a. What is the problem the method is designed to solve? 172 | a. Suggest an improvement to the method. 173 | 1. How many parameters need to be estimated to learn a multivariate Gaussian distribution where $p=15$. How does a graphical model help with this problem? 174 | 1. 175 | ```{r, cache=TRUE, echo=FALSE} 176 | rules %>% sort(by='lift') %>% head(1) %>% inspect() 177 | ``` 178 | a. What method will return this output? 179 | a. Interpret the output. 180 | 1. One researcher applied k-means clustering on the first two PCs. Another applied k-medoids on the output of classical MDS with Euclidean distances. Can the clusters differ? Explain. 181 | 1. Suggest a method to visualize a social network. Explain. 182 | 1. A researcher wishes to cluster songs (not the lyrics. the actual audio files). Suggest two methods that will allow this and discuss their possible advantages and disadvantages. 183 | 1. What is the difference between "complete" and "single" linkage in agglomerative clustering? 184 | 1. $(X'X+\lambda I)^{-1}X'y$. This is the solution to what problem? 185 | 1. What will happen if we try to learn an empirical risk minimizer with no inductive bias? What is the name of the phenomenon? 186 | 1. Name two justifications for the regularization term in LASSO regression. How do we know predictions can only improve with a small regularization? 187 | 1. What method learns a hypothesis in the class $f(x)= \sum_{m=1}^M c_m I_{\{x \in R_m \}}$. 188 | a. What is the name of the hypothesis class? 189 | a. Name a particularly desirable property of this class (and thus- of the method) 190 | 1. If I am using the Deviance likelihood as a loss function-- what type is my predicted variable? 191 | 1. Having learned a mixture distribution $p(x)=\sum_{k=1}^k \pi_k p_k(x)$; how can I use it for clustering? 192 | 1. Why can't we produce a bi-plot for MDS while we can for PCA? 193 | 1. What is the difference between a Streaming Algorithm, and a Batch-Algorithm. 194 | 1. Why is prediction an easier task than classical statistical inference (from the Estimation course)? 195 | 1. What are the two historical motivations underlying PCA? 196 | 1. We saw that for the PCA problem, it suffice to know only the correlations between variables $X'X$. Why does it not suffice for OLS? 197 | 1. In what course did you cover methods for unsupervised learning of a parametric generative model? Name two learning methods? 198 | 199 | 200 | -------------------------------------------------------------------------------- /sample_questions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnros/Intro2R/61bbbd65582f43e047354f50851519ed5e4845b6/sample_questions.pdf -------------------------------------------------------------------------------- /self_practice.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Self Practice" 3 | author: "Jonathan Rosenblatt" 4 | date: "March 23, 2015" 5 | output: html_document 6 | --- 7 | 8 | Some exercises to practice your initial R skills. 9 | Make sure you can answer. No need to submit. 10 | 11 | 1. 
What is the difference between .csv and tab delimited data files? What function imports and exports csv files? 12 | 2. What is the average departure delay of the flights that departed on the Sundays of Oct 2013? (`flights` dataset in the `nycflights13` package). 13 | 3. Plot a histogram and a boxplot of the delays of JetBlue Airways flights, after joining with the `airlines` dataset. Now plot the same plots for each day of the week. Export the plots as pdf files. 14 | 4. Create, then save as a csv, a data.frame named `drinks` with gender and drinks data, so that the output of `table(drinks)` is: 15 | 16 | Gender | Coke | Coffee 17 | -------|-------|------- 18 | Male | 12 | 10 19 | Female | 3 | 20 20 | -------------------------------------------------------------------------------- /supervised.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Supervised Learning" 3 | author: "Jonathan Rosenblatt" 4 | date: "April 12, 2015" 5 | output: 6 | html_document: 7 | toc: true 8 | --- 9 | In these examples, I will use two data sets from the `ElemStatLearn` package: `spam` for categorical predictions (spam or not spam?), and `prostate` for continuous predictions (size of cancerous tumor). 10 | In `spam` we will try to decide if a mail is spam or not. 11 | In `prostate` we will try to predict the size of a cancerous tumor. 12 | 13 | ```{r} 14 | source('make_samples.R') 15 | ``` 16 | You can now call `?prostate` and `?spam` to learn more about these data sets. 17 | 18 | We also load some utility packages and functions that we will require down the road. 19 | ```{r preamble} 20 | library(magrittr) # for piping 21 | library(dplyr) # for handeling data frames 22 | 23 | # My own utility functions: 24 | l2 <- function(x) x^2 %>% sum %>% sqrt 25 | l1 <- function(x) abs(x) %>% sum 26 | MSE <- function(x) x^2 %>% mean 27 | missclassification <- function(tab) sum(tab[c(2,3)])/sum(tab) 28 | ``` 29 | 30 | We also initialize the random number generator so that we all get the same results (at least upon a first run) 31 | ```{r set seed} 32 | set.seed(2015) 33 | ``` 34 | 35 | # OLS 36 | 37 | ## OLS Regression 38 | 39 | Starting with OLS regression, and a split train-test data set: 40 | ```{r OLS Regression} 41 | View(prostate) 42 | # now verify that your data looks as you would expect.... 43 | 44 | ols.1 <- lm(lcavol~. ,data = prostate.train) 45 | # Train error: 46 | MSE( predict(ols.1)- prostate.train$lcavol) 47 | # Test error: 48 | MSE( predict(ols.1, newdata = prostate.test)- prostate.test$lcavol) 49 | ``` 50 | 51 | Now using cross validation to estimate the prediction error: 52 | ```{r Cross Validation} 53 | folds <- 10 54 | fold.assignment <- sample(1:5, nrow(prostate), replace = TRUE) 55 | errors <- NULL 56 | 57 | for (k in 1:folds){ 58 | prostate.cross.train <- prostate[fold.assignment!=k,] 59 | prostate.cross.test <- prostate[fold.assignment==k,] 60 | .ols <- lm(lcavol~. ,data = prostate.cross.train) 61 | .predictions <- predict(.ols, newdata=prostate.cross.test) 62 | .errors <- .predictions - prostate.cross.test$lcavol 63 | errors <- c(errors, .errors) 64 | } 65 | 66 | # Cross validated prediction error: 67 | MSE(errors) 68 | ``` 69 | 70 | Also trying a bootstrap prediction error: 71 | ```{r Bootstrap} 72 | B <- 20 73 | n <- nrow(prostate) 74 | errors <- NULL 75 | 76 | prostate.boot.test <- prostate 77 | for (b in 1:B){ 78 | prostate.boot.train <- prostate[sample(1:n, replace = TRUE),] 79 | .ols <- lm(lcavol~. 
,data = prostate.boot.train) 80 | .predictions <- predict(.ols, newdata=prostate.boot.test) 81 | .errors <- .predictions - prostate.boot.test$lcavol 82 | errors <- c(errors, .errors) 83 | } 84 | 85 | # Bootstrapped prediction error: 86 | MSE(errors) 87 | ``` 88 | 89 | 90 | ### OLS Regression Model Selection 91 | 92 | 93 | Best subset selection: find the best model of each size: 94 | ```{r best subset} 95 | # install.packages('leaps') 96 | library(leaps) 97 | 98 | regfit.full <- prostate.train %>% 99 | regsubsets(lcavol~.,data = ., method = 'exhaustive') 100 | summary(regfit.full) 101 | plot(regfit.full, scale = "Cp") 102 | ``` 103 | 104 | 105 | 106 | Train-Validate-Test Model Selection. 107 | Example taken from [here](https://lagunita.stanford.edu/c4x/HumanitiesScience/StatLearning/asset/ch6.html) 108 | ```{r OLS TVT model selection} 109 | model.n <- regfit.full %>% summary %>% length 110 | X.train.named <- prostate.train %>% model.matrix(lcavol ~ ., data = .) 111 | X.test.named <- prostate.test %>% model.matrix(lcavol ~ ., data = .) 112 | View(X.test.named) 113 | 114 | val.errors <- rep(NA, model.n) 115 | train.errors <- rep(NA, model.n) 116 | for (i in 1:model.n) { 117 | coefi <- coef(regfit.full, id = i) 118 | 119 | pred <- X.train.named[, names(coefi)] %*% coefi 120 | train.errors[i] <- MSE(y.train - pred) 121 | 122 | pred <- X.test.named[, names(coefi)] %*% coefi 123 | val.errors[i] <- MSE(y.test - pred) 124 | } 125 | plot(train.errors, ylab = "MSE", pch = 19, type = "black") 126 | points(val.errors, pch = 19, type = "b", col="blue") 127 | 128 | legend("topright", 129 | legend = c("Training", "Validation"), 130 | col = c("black", "blue"), 131 | pch = 19) 132 | ``` 133 | 134 | 135 | AIC model selection: 136 | ```{r OLS AIC} 137 | # Forward search: 138 | ols.0 <- lm(lcavol~1 ,data = prostate.train) 139 | model.scope <- list(upper=ols.1, lower=ols.0) 140 | step(ols.0, scope=model.scope, direction='forward', trace = TRUE) 141 | 142 | # Backward search: 143 | step(ols.1, scope=model.scope, direction='backward', trace = TRUE) 144 | ``` 145 | 146 | 147 | Cross Validated Model Selection. 148 | ```{r OLS CV} 149 | [TODO] 150 | ``` 151 | 152 | 153 | Bootstrap model selection: 154 | ```{r OLS bootstrap} 155 | [TODO] 156 | ``` 157 | 158 | 159 | Partial least squares and principal components: 160 | ```{r PLS} 161 | pls::plsr() 162 | pls::pcr() 163 | ``` 164 | 165 | Canonical correlation analyis: 166 | ```{r CCA} 167 | cancor() 168 | 169 | # Kernel based robust version 170 | kernlab::kcca() 171 | ``` 172 | 173 | 174 | 175 | ## OLS Classification 176 | ```{r OLS Classification} 177 | # Making train and test sets: 178 | ols.2 <- lm(spam~., data = spam.train.dummy) 179 | 180 | # Train confusion matrix: 181 | .predictions.train <- predict(ols.2) > 0.5 182 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train.dummy$spam)) 183 | missclassification(confusion.train) 184 | 185 | # Test confusion matrix: 186 | .predictions.test <- predict(ols.2, newdata = spam.test.dummy) > 0.5 187 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test.dummy$spam)) 188 | missclassification(confusion.test) 189 | ``` 190 | 191 | 192 | 193 | # Ridge Regression 194 | ```{r Ridge I} 195 | # install.packages('ridge') 196 | library(ridge) 197 | 198 | ridge.1 <- linearRidge(lcavol~. ,data = prostate.train) 199 | # Note that if not specified, lambda is chosen automatically by linearRidge. 
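# (Aside) If you prefer to fix the regularization level yourself, linearRidge()
# also accepts a numeric lambda; e.g. (the value below is arbitrary, for illustration only):
# ridge.1b <- linearRidge(lcavol~. ,data = prostate.train, lambda = 2)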
200 | 201 | # Train error: 202 | MSE( predict(ridge.1)- prostate.train$lcavol) 203 | # Test error: 204 | MSE( predict(ridge.1, newdata = prostate.test)- prostate.test$lcavol) 205 | ``` 206 | 207 | 208 | Another implementation, which also automatically chooses the tuning parameter $\lambda$: 209 | ```{r Ridge II} 210 | # install.packages('glmnet') 211 | library(glmnet) 212 | ridge.2 <- glmnet(x=X.train, y=y.train, alpha = 0) 213 | 214 | # Train error: 215 | MSE( predict(ridge.2, newx =X.train)- y.train) 216 | 217 | # Test error: 218 | MSE( predict(ridge.2, newx = X.test)- y.test) 219 | ``` 220 | 221 | __Note__: `glmnet` is slightly picky. 222 | I could not have created `y.train` using `select()` because I need a vector and not a `data.frame`. Also, `as.matrix` is there as `glmnet` expects a `matrix` class `x` argument. 223 | Thse objects are created in the make_samples.R script, which we sourced in the beggining. 224 | 225 | 226 | 227 | 228 | # LASSO Regression 229 | ```{r LASSO} 230 | # install.packages('glmnet') 231 | library(glmnet) 232 | lasso.1 <- glmnet(x=X.train, y=y.train, alpha = 1) 233 | 234 | # Train error: 235 | MSE( predict(lasso.1, newx =X.train)- y.train) 236 | 237 | # Test error: 238 | MSE( predict(lasso.1, newx = X.test)- y.test) 239 | ``` 240 | 241 | 242 | # Logistic Regression For Classification 243 | ```{r Logistic Regression} 244 | logistic.1 <- glm(spam~., data = spam.train, family = binomial) 245 | # numerical error. Probably due to too many predictors. 246 | # Maybe regularizing the logistic regressio with Ridge or LASSO will make things better? 247 | ``` 248 | 249 | In the next chunk, we do $l_2$ and $l_1$ regularized logistic regression. 250 | Some technical remarks are in order: 251 | 252 | - `glmnet` is picky with its inputs. This has already been discussed in the context of the LASSO regression above. 253 | - The `predict` function for `glmnet` objects returns a prediction (see below) for many candidate regularization levels $\lambda$. We thus we `cv.glmnet` which does an automatic cross validated selection of the best regularization level. 
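Once `logistic.2` and `logistic.3` are fitted in the chunk below, the regularization level chosen by `cv.glmnet` can be inspected. A minimal sketch:
```{r inspect cv.glmnet, eval=FALSE}
logistic.2$lambda.min               # lambda minimizing the cross validated error
logistic.2$lambda.1se               # largest lambda within one SE of the minimum
plot(logistic.2)                    # CV error as a function of log(lambda)
coef(logistic.2, s = "lambda.min")  # coefficients at the selected lambda
```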
254 | ```{r Regularized Logistic Regression} 255 | library(glmnet) 256 | # Ridge Regularization with CV selection of regularization: 257 | logistic.2 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 0) 258 | # LASSO Regularization with CV selection of regularization: 259 | logistic.3 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 1) 260 | 261 | 262 | # Train confusion matrix: 263 | .predictions.train <- predict(logistic.2, newx = X.train.spam, type = 'class') 264 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 265 | missclassification(confusion.train) 266 | 267 | .predictions.train <- predict(logistic.3, newx = X.train.spam, type = 'class') 268 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 269 | missclassification(confusion.train) 270 | 271 | # Test confusion matrix: 272 | .predictions.test <- predict(logistic.2, newx = X.test.spam, type='class') 273 | (confusion.test <- table(prediction=.predictions.test, truth=y.test.spam)) 274 | missclassification(confusion.test) 275 | 276 | .predictions.test <- predict(logistic.3, newx = X.test, type='class') 277 | (confusion.test <- table(prediction=.predictions.test, truth=y.test)) 278 | missclassification(confusion.test) 279 | ``` 280 | 281 | 282 | 283 | 284 | # SVM 285 | 286 | ## Classification 287 | ```{r SVM classification} 288 | library(e1071) 289 | svm.1 <- svm(spam~., data = spam.train) 290 | 291 | # Train confusion matrix: 292 | .predictions.train <- predict(svm.1) 293 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 294 | missclassification(confusion.train) 295 | 296 | # Test confusion matrix: 297 | .predictions.test <- predict(svm.1, newdata = spam.test) 298 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 299 | missclassification(confusion.test) 300 | ``` 301 | 302 | 303 | ## Regression 304 | ```{r SVM regression} 305 | svm.2 <- svm(lcavol~., data = prostate.train) 306 | 307 | # Train error: 308 | MSE( predict(svm.2)- prostate.train$lcavol) 309 | # Test error: 310 | MSE( predict(svm.2, newdata = prostate.test)- prostate.test$lcavol) 311 | ``` 312 | 313 | 314 | 315 | 316 | # GAM Regression 317 | ```{r GAM} 318 | # install.packages('mgcv') 319 | library(mgcv) 320 | form.1 <- lcavol~ s(lweight)+ s(age)+s(lbph)+s(svi)+s(lcp)+s(gleason)+s(pgg45)+s(lpsa) 321 | gam.1 <- gam(form.1, data = prostate.train) # the model is too rich. 
let's select a variable subset 322 | 323 | ridge.1 %>% coef %>% abs %>% sort(decreasing = TRUE) # select the most promising coefficients (a very arbitrary practice) 324 | form.2 <- lcavol~ s(lweight)+ s(age)+s(lbph)+s(lcp)+s(pgg45)+s(lpsa) # keep only promising coefficients in model 325 | gam.2 <- gam(form.2, data = prostate.train) 326 | 327 | # Train error: 328 | MSE( predict(gam.2)- prostate.train$lcavol) 329 | # Test error: 330 | MSE( predict(gam.2, newdata = prostate.test)- prostate.test$lcavol) 331 | ``` 332 | 333 | 334 | 335 | 336 | 337 | # Neural Net 338 | 339 | ## Regression 340 | ```{r NNET regression} 341 | library(nnet) 342 | nnet.1 <- nnet(lcavol~., size=20, data=prostate.train, rang = 0.1, decay = 5e-4, maxit = 1000) 343 | 344 | # Train error: 345 | MSE( predict(nnet.1)- prostate.train$lcavol) 346 | # Test error: 347 | MSE( predict(nnet.1, newdata = prostate.test)- prostate.test$lcavol) 348 | ``` 349 | 350 | 351 | Let's automate the network size selection: 352 | ```{r NNET validate} 353 | validate.nnet <- function(size){ 354 | .nnet <- nnet(lcavol~., size=size, data=prostate.train, rang = 0.1, decay = 5e-4, maxit = 200) 355 | .train <- MSE( predict(.nnet)- prostate.train$lcavol) 356 | .test <- MSE( predict(.nnet, newdata = prostate.test)- prostate.test$lcavol) 357 | return(list(train=.train, test=.test)) 358 | } 359 | 360 | validate.nnet(3) 361 | validate.nnet(4) 362 | validate.nnet(20) 363 | validate.nnet(50) 364 | 365 | sizes <- seq(2, 30) 366 | validate.sizes <- rep(NA, length(sizes)) 367 | for (i in seq_along(sizes)){ 368 | validate.sizes[i] <- validate.nnet(sizes[i])$test 369 | } 370 | plot(validate.sizes~sizes, type='l') 371 | ``` 372 | What can I say... This plot is not what I would expect. Could be due to the random nature of the fitting algorithm. 373 | 374 | 375 | 376 | ## Classification 377 | ```{r NNET Classification} 378 | nnet.2 <- nnet(spam~., size=5, data=spam.train, rang = 0.1, decay = 5e-4, maxit = 1000) 379 | 380 | # Train confusion matrix: 381 | .predictions.train <- predict(nnet.2, type='class') 382 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 383 | missclassification(confusion.train) 384 | 385 | # Test confusion matrix: 386 | .predictions.test <- predict(nnet.2, newdata = spam.test, type='class') 387 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 388 | missclassification(confusion.test) 389 | ``` 390 | 391 | 392 | # CART 393 | 394 | 395 | ## Regression 396 | ```{r Tree regression} 397 | library(rpart) 398 | tree.1 <- rpart(lcavol~., data=prostate.train) 399 | 400 | # Train error: 401 | MSE( predict(tree.1)- prostate.train$lcavol) 402 | # Test error: 403 | MSE( predict(tree.1, newdata = prostate.test)- prostate.test$lcavol) 404 | ``` 405 | 406 | At this stage we should prune the tree using `prune()`... 
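Here is a minimal sketch of how such pruning could look, choosing the complexity parameter from `rpart`'s cross-validation table (the simplest possible selection rule, for illustration):
```{r Tree pruning}
printcp(tree.1)  # cross validated error for each candidate complexity parameter
best.cp <- tree.1$cptable[which.min(tree.1$cptable[, "xerror"]), "CP"]
tree.1.pruned <- prune(tree.1, cp = best.cp)

# Test error of the pruned tree:
MSE( predict(tree.1.pruned, newdata = prostate.test)- prostate.test$lcavol)
```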
407 | 408 | ## Classification 409 | ```{r Tree classification} 410 | tree.2 <- rpart(spam~., data=spam.train) 411 | 412 | # Train confusion matrix: 413 | .predictions.train <- predict(tree.2, type='class') 414 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 415 | missclassification(confusion.train) 416 | 417 | # Test confusion matrix: 418 | .predictions.test <- predict(tree.2, newdata = spam.test, type='class') 419 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 420 | missclassification(confusion.test) 421 | ``` 422 | 423 | 424 | 425 | 426 | # Random Forest 427 | TODO 428 | 429 | # Rotation Forest 430 | TODO 431 | 432 | 433 | 434 | 435 | 436 | # Smoothing Splines 437 | I will demonstrate the method with a single predictor, so that we can visualize the smoothing that has been performed: 438 | 439 | ```{r Smoothing Splines} 440 | spline.1 <- smooth.spline(x=X.train, y=y.train) 441 | 442 | # Visualize the non linear hypothesis we have learned: 443 | plot(y.train~X.train, col='red', type='h') 444 | points(spline.1, type='l') 445 | ``` 446 | I am not extracting train and test errors as the output of `smooth.spline` will require some tweaking for that. 447 | 448 | 449 | 450 | # KNN 451 | 452 | ## Classification 453 | ```{r knn classification} 454 | library(class) 455 | knn.1 <- knn(train = X.train.spam, test = X.test.spam, cl =y.train.spam, k = 1) 456 | 457 | # Test confusion matrix: 458 | .predictions.test <- knn.1 459 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 460 | missclassification(confusion.test) 461 | ``` 462 | 463 | And now we would try to optimize `k` by trying different values. 464 | 465 | 466 | # Kernel Regression 467 | Kernel regression includes many particular algorithms. 468 | ```{r kernel} 469 | # install.packages('np') 470 | library(np) 471 | ksmooth.1 <- npreg(txdat =X.train, tydat = y.train) 472 | 473 | # Train error: 474 | MSE( predict(ksmooth.1)- prostate.train$lcavol) 475 | ``` 476 | 477 | There is currently no method to make prediction on test data with this function. 478 | 479 | 480 | 481 | # Stacking 482 | As seen in the class notes, there are many ensemble methods. 483 | Stacking, in my view, is by far the most useful and coolest. It is thus the only one I present here. 484 | 485 | The following example is adapted from [James E. Yonamine](http://jayyonamine.com/?p=456). 
486 | 487 | ```{r Stacking} 488 | #####step 1: train models #### 489 | #logits 490 | logistic.2 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 0) 491 | logistic.3 <- cv.glmnet(x=X.train.spam, y=y.train.spam, family = "binomial", alpha = 1) 492 | 493 | 494 | # Learning Vector Quantization (LVQ) 495 | my.codebook<-lvqinit(x=X.train.spam, cl=y.train.spam, size=10, prior=c(0.5,0.5),k = 2) 496 | my.codebook<-lvq1(x=X.train.spam, cl=y.train.spam, codebk=my.codebook, niter = 100 * nrow(my.codebook$x), alpha = 0.03) 497 | 498 | # SVM 499 | library('e1071') 500 | svm.fit <- svm(y=y.train.spam, x=X.train.spam, probability=TRUE) 501 | 502 | 503 | 504 | #####step 2a: build predictions for data.train#### 505 | train.predict<- cbind( 506 | predict(logistic.2, newx=X.train.spam, type="response"), 507 | predict(logistic.3, newx=X.train.spam, type="response"), 508 | knn1(train=my.codebook$x, test=X.train.spam, cl=my.codebook$cl), 509 | predict(svm.fit, X.train.spam, probability=TRUE) 510 | ) 511 | 512 | ####step 2b: build predictions for data.test#### 513 | test.predict <- cbind( 514 | predict(logistic.2, newx=X.test.spam, type="response"), 515 | predict(logistic.3, newx=X.test.spam, type="response"), 516 | predict(svm.fit, newdata = X.test.spam, probability = TRUE), 517 | knn1(train=my.codebook$x, test=X.test.spam, cl=my.codebook$cl) 518 | ) 519 | 520 | 521 | ####step 3: train SVM on train.predict#### 522 | final <- svm(y=y.train.spam, x=train.predict, probability=TRUE) 523 | 524 | ####step 4: use trained SVM to make predictions with test.predict#### 525 | final.predict <- predict(final, test.predict, probability=TRUE) 526 | results<-as.matrix(final.predict) 527 | table(results, y.test.spam) 528 | ``` 529 | 530 | 531 | 532 | 533 | 534 | # Fisher's LDA 535 | ```{r LDA} 536 | library(MASS) 537 | lda.1 <- lda(spam~., spam.train) 538 | 539 | # Train confusion matrix: 540 | .predictions.train <- predict(lda.1)$class 541 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 542 | missclassification(confusion.train) 543 | 544 | # Test confusion matrix: 545 | .predictions.test <- predict(lda.1, newdata = spam.test)$class 546 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 547 | missclassification(confusion.test) 548 | ``` 549 | 550 | __Caution__: 551 | Both `MASS` have a function called `select`. I will thus try avoid the two packages being loaded at once, or call the functionby its full name: `MASS::select` or `dplyr::select'. 
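For instance, an explicit call that stays unambiguous even with both packages attached (a small illustration):
```{r namespaced select, eval=FALSE}
prostate.train %>% dplyr::select(-lcavol) %>% head
```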
552 | 553 | 554 | 555 | # Naive Bayes 556 | ```{r Naive Bayes} 557 | library(e1071) 558 | nb.1 <- naiveBayes(spam~., data = spam.train) 559 | 560 | # Train confusion matrix: 561 | .predictions.train <- predict(nb.1, newdata = spam.train) 562 | (confusion.train <- table(prediction=.predictions.train, truth=spam.train$spam)) 563 | missclassification(confusion.train) 564 | 565 | # Test confusion matrix: 566 | .predictions.test <- predict(nb.1, newdata = spam.test) 567 | (confusion.test <- table(prediction=.predictions.test, truth=spam.test$spam)) 568 | missclassification(confusion.test) 569 | ``` 570 | 571 | -------------------------------------------------------------------------------- /unsupervised.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Unsupervised Learning" 3 | author: "Jonathan Rosenblatt" 4 | date: "April 12, 2015" 5 | output: html_document 6 | --- 7 | 8 | Some utility functions: 9 | ```{r utility} 10 | l2 <- function(x) x^2 %>% sum %>% sqrt 11 | l1 <- function(x) abs(x) %>% sum 12 | MSE <- function(x) x^2 %>% mean 13 | 14 | # Matrix norms: 15 | frobenius <- function(A) norm(A, type="F") 16 | spectral <- function(A) norm(A, type="2") 17 | ``` 18 | 19 | 20 | __Note__: `foo::bar` means that function `foo` is part of the `bar` package. 21 | With this syntax, there is no need to load (`library`) the package. 22 | If a line does not run, you may need to install the package: `install.packages('bar')`. 23 | Packages that are install from sources other than CRAN (like github or bioconductor) will include a commented installation line. 24 | 25 | __Note__:RStudio currently does not autocomplete function arguments when using the `::` syntax. 26 | 27 | 28 | 29 | 30 | 31 | # Learning Distributions 32 | 33 | ## Gaussian Density Estimation 34 | ```{r generate data} 35 | # Sample from a multivariate Gaussian: 36 | ## Generate a covariance matrix 37 | p <- 10 38 | Sigma <- bayesm::rwishart(nu = 100, V = diag(p))$W 39 | lattice::levelplot(Sigma) 40 | 41 | # Sample from a multivariate Gaussian: 42 | n <- 1e3 43 | means <- 1:p 44 | X1 <- mvtnorm::rmvnorm(n = n, sigma = Sigma, mean = means) 45 | dim(X1) 46 | 47 | # Estiamte parameters and compare to truth: 48 | estim.means <- colMeans(X1) # recall truth is (1,...,10) 49 | plot(estim.means~means); abline(0,1, lty=2) 50 | 51 | estim.cov <- cov(X1) 52 | plot(estim.cov~Sigma); abline(0,1, lty=2) 53 | 54 | estim.cov.errors <- Sigma - estim.cov 55 | lattice::levelplot(estim.cov.errors) 56 | lattice::levelplot(estim.cov.errors/Sigma) # percentage error 57 | 58 | frobenius(estim.cov.errors) 59 | 60 | # Now try the same while playing with n and p. 61 | ``` 62 | 63 | 64 | 65 | Other covariance estimators (robust, fast,...) 
66 | ```{r covariances} 67 | # Robust covariance 68 | estim.cov.1 <- MASS::cov.rob(X1)$cov 69 | estim.cov.errors.1 <- Sigma - estim.cov.1 70 | lattice::levelplot(estim.cov.errors.1) 71 | lattice::levelplot(estim.cov.errors.1/Sigma) # percentage error 72 | 73 | frobenius(estim.cov.errors.1) 74 | 75 | 76 | # Nearest neighbour cleaning of outliers 77 | estim.cov.2 <- covRobust::cov.nnve(X1)$cov 78 | estim.cov.errors.2 <- Sigma - estim.cov.2 79 | lattice::levelplot(estim.cov.errors.2) 80 | frobenius(estim.cov.errors.2) 81 | 82 | 83 | # Regularized covariance estimation 84 | estim.cov.3 <- robustbase::covMcd(X1)$cov 85 | estim.cov.errors.3 <- Sigma - estim.cov.3 86 | lattice::levelplot(estim.cov.errors.3) 87 | frobenius(estim.cov.errors.3) 88 | 89 | 90 | # Another robust covariance estimator 91 | estim.cov.4 <- robustbase::covComed(X1)$cov 92 | estim.cov.errors.4 <- Sigma - estim.cov.4 93 | lattice::levelplot(estim.cov.errors.4) 94 | frobenius(estim.cov.errors.4) 95 | ``` 96 | 97 | ## Non parametric density estimation 98 | There is nothing that will even try dimensions higher than 6. 99 | See [here](http://vita.had.co.nz/papers/density-estimation.pdf) for a review. 100 | 101 | 102 | 103 | ## Graphical Models 104 | [TODO] 105 | See R's graphical modeling [task view](http://cran.r-project.org/web/views/gR.html). 106 | 107 | 108 | 109 | ## Association rules 110 | Note: Visualization examples are taken from the arulesViz [vignette](http://cran.r-project.org/web/packages/arulesViz/vignettes/arulesViz.pdf) 111 | 112 | ```{r association rules} 113 | library(arules) 114 | data("Groceries") 115 | inspect(Groceries[1:2]) 116 | summary(Groceries) 117 | 118 | rules <- arules::apriori(Groceries, parameter = list(support=0.001, confidence=0.5)) 119 | summary(rules) 120 | rules %>% sort(by='lift') %>% head(2) %>% inspect 121 | 122 | # For a rule {A => B} we denote: 123 | # support: P(A AND B) 124 | # confidence: P(B|A) 125 | # lift: P(A,B)/[P(B)P(A)] 126 | 127 | 128 | # Select a subset of rules 129 | rule.subset <- subset(rules, subset = rhs %pin% "yogurt") 130 | inspect(rule.subset) 131 | 132 | # Visualize rules: 133 | library(arulesViz) 134 | plot(rules) 135 | 136 | subrules <- rules[quality(rules)$confidence > 0.8] 137 | plot(subrules, method="matrix", measure="lift", control=list(reorder=TRUE)) 138 | plot(subrules, method="matrix", measure=c("lift", "confidence"), control=list(reorder=TRUE)) 139 | 140 | plot(subrules, method="grouped") 141 | plot(rules, method="grouped", control=list(k=50)) 142 | 143 | subrules2 <- head(sort(rules, by="lift"), 10) 144 | plot(subrules2, method="graph", control=list(type="items")) 145 | plot(subrules2, method="graph") 146 | 147 | # Export rules graph to use with other software: 148 | # saveAsGraph(head(sort(rules, by="lift"),1000), file="rules.graphml") 149 | 150 | rule.1 <- rules[1] 151 | inspect(rule.1) 152 | plot(rule.1, method="doubledecker", data = Groceries) 153 | ``` 154 | 155 | See also the `prim.box` function in the `prim` package for more algorithms to learn association rules 156 | 157 | 158 | 159 | # Dimensionality Reduction 160 | 161 | ## PCA 162 | Note: example is a blend from [Gaston Sanchez](http://gastonsanchez.com/blog/how-to/2012/06/17/PCA-in-R.html) and [Georgia's Geography dept.](http://geog.uoregon.edu/GeogR/topics/pca.html). 
163 | 164 | 165 | Get some data 166 | ```{r PCA data} 167 | ?USArrests 168 | 169 | plot(USArrests) # basic plot 170 | corrplot::corrplot(cor(USArrests), method = "ellipse") # slightly fancier 171 | 172 | 173 | # As a correaltion graph 174 | cor.1 <- cor(USArrests) 175 | qgraph::qgraph(cor.1) 176 | qgraph::qgraph(cor.1, layout = "spring", posCol = "darkgreen", negCol = "darkmagenta") 177 | ``` 178 | 179 | 180 | ```{r prepare data} 181 | USArrests.1 <- USArrests[,-3] %>% scale # note the scaling, which is required by some 182 | ``` 183 | 184 | 185 | ```{r PCA} 186 | # functions down the road... 187 | pca1 <- prcomp(USArrests.1, scale. = TRUE) # The main workhorse. 188 | 189 | pca1$rotation # loadings 190 | 191 | # Now score the states: 192 | USArrests.1[ 193 | pca1$x %>% extract(,1) %>% which.max 194 | ,] # Fewest arrests 195 | USArrests.1[ 196 | pca1$x %>% extract(,1) %>% which.min 197 | ,] # Most arrests 198 | 199 | pca1$x %>% extract(,1) %>% sort %>% head 200 | pca1$x %>% extract(,1) %>% sort %>% tail 201 | ``` 202 | Interpretation: 203 | 204 | - PC1 seems to capture overall crime rate. 205 | - PC2 seems distinguish between sexual and non-sexual crimes 206 | - North Dakota is the most "arrestful" state. Florida is the least. 207 | 208 | 209 | Projecting on first two PCs: 210 | ```{r visualizing PCA} 211 | library(ggplot2) # for graphing 212 | 213 | pcs <- as.data.frame(pca1$x) 214 | ggplot(data = pcs, aes(x = PC1, y = PC2, label = rownames(pcs))) + 215 | geom_hline(yintercept = 0, colour = "gray65") + 216 | geom_vline(xintercept = 0, colour = "gray65") + 217 | geom_text(colour = "red", alpha = 0.8, size = 6) + 218 | ggtitle("PCA plot of USA States - Crime Rates") 219 | ``` 220 | 221 | 222 | The bi-Plot 223 | ```{r biplot} 224 | biplot(pca1) #ugly! 225 | 226 | # library(devtools) 227 | # install_github("vqv/ggbiplot") 228 | ggbiplot::ggbiplot(pca1, labels = rownames(USArrests.1)) # better! 229 | ``` 230 | 231 | 232 | The scree-plot 233 | ```{r screeplot} 234 | screeplot(pca1) 235 | 236 | ggbiplot::ggscreeplot(pca1) 237 | ``` 238 | So clearly the main differentiation is along the first component, which captures the overall crime level in each state (and not a particular type of crime). 239 | 240 | 241 | Visualize the scoring as a projection of the states' attributes onto the factors. 242 | ```{r visualize contributions to factors} 243 | # get parameters of component lines (after Everitt & Rabe-Hesketh) 244 | load <- pca1$rotation 245 | slope <- load[2, ]/load[1, ] 246 | mn <- apply(USArrests.1, 2, mean) 247 | intcpt <- mn[2] - (slope * mn[1]) 248 | 249 | # scatter plot with the two new axes added 250 | dpar(pty = "s") # square plotting frame 251 | USArrests.2 <- USArrests[,1:2] %>% scale 252 | xlim <- range(USArrests.2) # overall min, max 253 | plot(USArrests.2, xlim = xlim, ylim = xlim, pch = 16, col = "purple") # both axes same length 254 | abline(intcpt[1], slope[1], lwd = 2) # first component solid line 255 | abline(intcpt[2], slope[2], lwd = 2, lty = 2) # second component dashed 256 | legend("right", legend = c("PC 1", "PC 2"), lty = c(1, 2), lwd = 2, cex = 1) 257 | 258 | # projections of points onto PCA 1 259 | y1 <- intcpt[1] + slope[1] * USArrests.2[, 1] 260 | x1 <- (USArrests.1[, 2] - intcpt[1])/slope[1] 261 | y2 <- (y1 + USArrests.1[, 2])/2 262 | x2 <- (x1 + USArrests.1[, 1])/2 263 | segments(USArrests.1[, 1], USArrests.1[, 2], x2, y2, lwd = 2, col = "purple") 264 | ``` 265 | 266 | 267 | Visualize the loadings (ok... we are already doing factor analysis without noticing...) 
268 | ```{r visualize PCA} 269 | # install.packages('GPArotation') 270 | pca.qgraph <- qgraph::qgraph.pca(USArrests.1, factors = 2, rotation = "varimax") 271 | plot(pca.qgraph) 272 | 273 | qgraph::qgraph(pca.qgraph, posCol = "darkgreen", layout = "spring", negCol = "darkmagenta", 274 | edge.width = 2, arrows = FALSE) 275 | ``` 276 | 277 | 278 | 279 | 280 | More implementations of PCA: 281 | ```{r many PCA implementations} 282 | # FAST solutions: 283 | gmodels::fast.prcomp() 284 | 285 | # More detail in output: 286 | FactoMineR::PCA() 287 | 288 | # For flexibility in algorithms and visualization: 289 | ade4::dudi.pca() 290 | 291 | # Another one... 292 | amap::acp() 293 | ``` 294 | 295 | 296 | 297 | Principal tensor analysis: 298 | [TODO] 299 | ```{r PTA} 300 | PTAk::PTAk() 301 | ``` 302 | 303 | 304 | 305 | ## sPCA 306 | ```{r sPCA} 307 | # Compute similarity graph 308 | state.similarity <- MASS::cov.rob(USArrests.1)$cov 309 | 310 | spca1 <- elasticnet::spca(state.similarity, K=2,type="Gram",sparse="penalty",trace=TRUE, para=c(0.06,0.16)) 311 | spca1$loadings 312 | ``` 313 | 314 | 315 | ## kPCA 316 | [TODO] 317 | ```{r kPCA} 318 | kernlab::kpca() 319 | ``` 320 | 321 | 322 | ## Random Projections 323 | [TODO] 324 | ```{r Random Projections} 325 | 326 | ``` 327 | 328 | 329 | ## MDS 330 | Classical MDS 331 | ```{r MDS} 332 | # We first need a dissimarity matrix/graph: 333 | state.disimilarity <- dist(USArrests.1) 334 | 335 | mds.1 <- stats::cmdscale(state.disimilarity) 336 | 337 | plot(mds.1, pch = 19) 338 | abline(h=0, v=0, lty=2) 339 | text(mds.1, pos = 4, labels = rownames(USArrests.2), col = 'tomato') 340 | 341 | # Compare with two PCA (first two PCs): 342 | points(pca1$x[,1:2], col='red', pch=19, cex=0.5) 343 | # So classical MDS with Euclidean distance, is the same as PCA on two dimensions! 344 | ``` 345 | Note: Also see the `cluster::daisy` for more dissimilarity measures. 346 | 347 | 348 | Let's try other strain functions for MDS. 349 | 350 | Sammon's strain: 351 | ```{r Sammon MDS} 352 | mds.2 <- MASS::sammon(state.disimilarity) 353 | plot(mds.2$points, pch = 19) 354 | abline(h=0, v=0, lty=2) 355 | text(mds.2$points, pos = 4, labels = rownames(USArrests.2)) 356 | 357 | # Compare with two PCA (first two PCs): 358 | arrows(x0 = mds.2$points[,1], y0 = mds.2$points[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5) 359 | # So Sammon's MDS with Euclidean distance, is *not* the same as PCA on two dimensions. 360 | ``` 361 | 362 | 363 | Kruskal's strain: 364 | ```{r isoMDS} 365 | mds.3 <- MASS::isoMDS(state.disimilarity) 366 | plot(mds.3$points, pch = 19) 367 | abline(h=0, v=0, lty=2) 368 | text(mds.3$points, pos = 4, labels = rownames(USArrests.2)) 369 | 370 | # Compare with two PCA (first two PCs): 371 | arrows(x0 = mds.3$points[,1], y0 = mds.3$points[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5) 372 | # So Kruskal's MDS with Euclidean distance, is *not* the same as PCA on two dimensions. 
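# The quality of the fit can be read off the final stress returned by isoMDS (smaller is better):
# mds.3$stress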
373 | ``` 374 | 375 | 376 | ## Isomap 377 | ```{r Isomap} 378 | # Installing the package: 379 | # source("http://bioconductor.org/biocLite.R") 380 | # biocLite("RDRToolbox") 381 | isomap.1 <- RDRToolbox::Isomap(USArrests.1) 382 | 383 | plot(isomap.1$dim2) 384 | abline(h=0, v=0, lty=2) 385 | text(isomap.1$dim2, pos = 4, labels = rownames(USArrests.2)) 386 | 387 | 388 | # Compare with two PCA (first two PCs): 389 | arrows(x0 = isomap.1$dim2[,1], y0 = isomap.1$dim2[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5) 390 | ``` 391 | 392 | 393 | ## Local Linear Embedding (LLE) 394 | ```{r LLE} 395 | lle.1 <- RDRToolbox::LLE(USArrests.1, k=3) 396 | 397 | plot(lle.1) 398 | abline(h=0, v=0, lty=2) 399 | text(lle.1, pos = 4, labels = rownames(USArrests.2)) 400 | 401 | 402 | # Compare with two PCA (first two PCs): 403 | arrows(x0 = lle.1[,1], y0 = lle.1[,2], x1 = pca1$x[,1], y1 = pca1$x[,2], col='red', pch=19, cex=0.5) 404 | ``` 405 | Well, LLE (with 3 neighbors) clearly disagrees with PCA. Why is this? 406 | 407 | 408 | ## LocalMDS 409 | The only package I found is `localmds` in [here](https://github.com/hadley/localmds/blob/master/R/localmds.r). 410 | It is currently under active development so I am still waiting a stable version. 411 | 412 | 413 | ## Principal Curves & Surfaces 414 | ```{r Principla curves} 415 | princurve.1 <- princurve::principal.curve(USArrests.1, plot=TRUE) 416 | princurve.1$s 417 | 418 | points(princurve.1) # Projections of data on principal curve 419 | whiskers <- function(from, to) segments(from[, 1], from[, 2], to[, 1], to[, 2]) 420 | whiskers(USArrests.1, princurve.1$s) 421 | ``` 422 | 423 | 424 | 425 | 426 | 427 | # Latent Space Generative Models 428 | 429 | ## Factor Analysis (FA) 430 | 431 | No rotation 432 | ```{r FA} 433 | fa.1 <- psych::principal(USArrests.1, nfactors = 2, rotate = "none") 434 | fa.1 435 | summary(fa.1) 436 | biplot(fa.1, labels = rownames(USArrests.1)) 437 | 438 | # Numeric comparison with PCA: 439 | fa.1$loadings 440 | pca1$rotation 441 | 442 | # Graph comparison: loadings encoded in colors 443 | qgraph::qgraph(fa.1) 444 | qgraph::qgraph(pca.qgraph) # for comparison 445 | 446 | 447 | # Geometric coherent graph comparison: loadings encoded in distances and colors 448 | qgraph::qgraph(fa.1) 449 | qgraph::qgraph(pca.qgraph) # for comparison 450 | ``` 451 | 452 | 453 | Varimax rotation 454 | ```{r varimax} 455 | fa.2 <- psych::principal(USArrests.1, nfactors = 2, rotate = "varimax") 456 | 457 | fa.2$loadings 458 | fa.1$loadings 459 | pca1$rotation 460 | ``` 461 | Notice the rotation has changed the interpretation of the factors. 462 | 463 | 464 | ## Independant component analysis (ICA) 465 | ```{r ICA} 466 | 467 | ica.1 <- fastICA::fastICA(USArrests.1, n.com=2) # Also performs projection pursuit 468 | 469 | 470 | plot(ica.1$S) 471 | abline(h=0, v=0, lty=2) 472 | text(ica.1$S, pos = 4, labels = rownames(USArrests.1)) 473 | 474 | # Compare with two PCA (first two PCs): 475 | arrows(x0 = ica.1$S[,1], y0 = ica.1$S[,2], x1 = pca1$x[,2], y1 = pca1$x[,1], col='red', pch=19, cex=0.5) 476 | ``` 477 | 478 | 479 | 480 | ## Exploratory Projection Pursuit 481 | ```{r exploratory projection pursuit} 482 | epp.1 <- REPPlab::EPPlab(USArrests.1) 483 | plot(epp.1) 484 | ``` 485 | 486 | ## Generative Topographic Map (GTP) 487 | [TODO] 488 | 489 | 490 | 491 | ## Finite Mixture 492 | ```{r mixtures} 493 | library(mixtools) 494 | 495 | # Generate data: 496 | # Note that component-wise independence is assumed. 
497 | k <- 2 498 | mix.p <- 4 499 | mix.probs <- rep(1/k,k) 500 | mix.means <- seq(1,k*mix.p) %>% matrix(nrow = k, ncol = mix.p) 501 | mix.sigma <- rep(1,k*p) %>% matrix(nrow = k, ncol = mix.p) 502 | x.mix <- mixtools::rmvnormmix(n=n, lambda =mix.probs, mu=mix.means, sigma = mix.sigma) 503 | x.mix %>% dim 504 | 505 | # Non parametric fit (initializing with true means) 506 | mix.1 <- mixtools::npEM(x.mix, mu0 = mix.means, verb = TRUE) 507 | plot(mix.1) 508 | 509 | # Fit assuming the Gaussian distribution: 510 | matrix2list <- function(x) split(x, rep(1:ncol(x), each = nrow(x))) 511 | mix.means.list <- matrix2list(t(mix.means)) 512 | 513 | mix.2 <- mixtools::mvnormalmixEM(x.mix, k=2, mu=mix.means.list, verb = TRUE, epsilon = 1e-1) 514 | summary(mix.2) 515 | ``` 516 | Read [this](http://www.stat.cmu.edu/~cshalizi/uADA/12/lectures/ch20.pdf) for more information on Finite mixtures. 517 | 518 | 519 | ## Hidden Markov Model (HMM) 520 | ```{r} 521 | # Note: the HiddenMarkov::foo() syntax will not work with this function. We thus load it. 522 | library(HiddenMarkov) 523 | 524 | # Generate data: 525 | (hmm.transition <- matrix(c(1/2, 1/2, 0, 1/3, 1/3, 1/3, 0, 1/2, 1/2), byrow=TRUE, nrow=3)) 526 | hmm.probs <- rep(1,3)/3 527 | hmm.distribution <- 'norm' 528 | hmm.params <- list(mean=c(1, 6, 3), sd=c(0.2, 0.2, 0.2)) 529 | x <- dthmm(x = NULL, Pi = hmm.transition, delta = hmm.probs, distn = hmm.distribution, pm = hmm.params) 530 | x <- simulate(x, nsim=n) 531 | plot(x$x) 532 | # Can you guess when states were changed? 533 | 534 | # Let's make this harder: 535 | hmm.params <- list(mean=c(1, 6, 3), sd=rep(2,3)) 536 | x <- dthmm(NULL, hmm.transition, hmm.probs, hmm.distribution, hmm.params) 537 | x <- simulate(x, nsim=n) 538 | plot(x$x, type='h') 539 | 540 | 541 | # Estimate parameters: 542 | y <- BaumWelch(x) 543 | summary(y) 544 | 545 | # Compare with truth: 546 | hmm.true.state <- x$y 547 | hmm.predict.state <- Viterbi(y) 548 | table(predict=hmm.predict.state, true=hmm.true.state) 549 | ``` 550 | 551 | 552 | 553 | # Clustering: 554 | Some tutorials on clustering with R can be found in 555 | 556 | - [David Hitchcock](http://people.stat.sc.edu/Hitchcock/chapter6_R_examples.txt). 557 | - [QuickR](http://www.statmethods.net/advstats/cluster.html). 558 | - University of California, Riverside, [Institute of Integrative Genome Biology](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R). 559 | - [Phil Spector's](http://www.stat.berkeley.edu/~s133/Cluster2a.html) class notes from Berkeley Stats dept. 560 | - Michigan state university's [Laboratory for Dynamic Synthetic Vegephenomenology](http://ecology.msu.montana.edu/labdsv/R/labs/lab13/lab13.html). 561 | 562 | 563 | 564 | ## K-Means 565 | The following code is an adaptation from [David Hitchcock](http://people.stat.sc.edu/Hitchcock/chapter6_R_examples.txt). 566 | ```{r kmeans} 567 | k <- 2 568 | kmeans.1 <- stats::kmeans(USArrests.1, centers = k) 569 | kmeans.1$cluster # cluster asignments 570 | 571 | # Visualize using scatter plots of the original features 572 | pairs(USArrests.1, panel=function(x,y) text(x,y,kmeans.1$cluster)) 573 | 574 | # Visualize using scatter plots of the original features 575 | plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type ='n', lwd=2) 576 | text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=kmeans.1$cluster) 577 | ``` 578 | 579 | 580 | ## K-Means++ 581 | Recall that K-Means++ is a smart initialization for K-Means. 
The following code is taken from the [r-help](https://stat.ethz.ch/pipermail/r-help/2012-January/300051.html) mailing list.
```{r kmeansPP}
kmpp <- function(X, k) {
  require('pracma') # provides distmat()

  n <- nrow(X)
  C <- numeric(k)
  C[1] <- sample(1:n, 1)

  for (i in 2:k) {
    dm <- distmat(X, X[C, ])
    pr <- apply(dm, 1, min); pr[C] <- 0
    C[i] <- sample(1:n, 1, prob = pr)
  }

  kmeans(X, X[C, ])
}

# Examine the output:
kmeans.2 <- kmpp(USArrests.1, k)
kmeans.2$cluster
```


## K-Medoids
```{r kmedoids}
kmed.1 <- cluster::pam(x = state.disimilarity, k = 2)
kmed.1$clustering

plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type='n', lwd=2)
text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=kmed.1$clustering)
```
Many other similarity measures can be found in `proxy::dist()`.
See `cluster::clara()` for a massive-data implementation of PAM.



## Hierarchical Clustering
```{r hierarchical clustering}
# Single linkage:
hirar.1 <- hclust(state.disimilarity, method='single')
plot(hirar.1, labels=rownames(USArrests.1), ylab="Distance")

# Complete linkage:
hirar.2 <- hclust(state.disimilarity, method='complete')
plot(hirar.2, labels=rownames(USArrests.1), ylab="Distance")

# Average linkage:
hirar.3 <- hclust(state.disimilarity, method='average')
plot(hirar.3, labels=rownames(USArrests.1), ylab="Distance")

# Fixing the number of clusters:
cut.2.2 <- cutree(hirar.2, k=2)
cut.2.2 # printing the "clustering vector"

# Suppose we preferred a 5-cluster solution:
cut.2.5 <- cutree(hirar.2, k=5)
cut.2.5 # printing the "clustering vector"
```

Visualizing clusters:
```{r visualize clusters}
# Visualize using scatter plots of the original features:
pairs(USArrests.1, panel=function(x,y) text(x,y,cut.2.5))

# Visualize in the PC plane:
plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type='n', lwd=2)
text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=cut.2.5)
```


`cluster::agnes()` is an alternative implementation of agglomerative clustering:
```{r agnes}
# install.packages('cluster')
library(cluster)
agnes.1 <- agnes(USArrests.1, method = 'complete')
plot(agnes.1, which.plots = 2) # the dendrogram
```


## QT Clustering
[TODO]
See [here](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R).


## Fuzzy Clustering
[TODO]
See [here](http://manuals.bioinformatics.ucr.edu/home/R_BioCondManual#TOC-Clustering-and-Data-Mining-in-R).


## Self Organizing Maps (SOMs)
The following is adapted from [Shane Lynn](http://shanelynn.ie/index.php/self-organising-maps-for-customer-segmentation-using-r/).
More details in [this paper](http://www.jstatsoft.org/v21/i05/paper).
If you want hexagons instead of circles, see [this](http://stackoverflow.com/questions/19858729/r-package-kohonen-how-to-plot-hexagons-instead-of-circles-as-in-matlab-som-too).
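One practical note (my addition): `kohonen::som()` expects a numeric matrix, and, like other distance-based methods, SOMs are sensitive to the scale of the variables. If `USArrests.1` is still an unscaled data frame (it may already be a scaled matrix, in which case this step is unnecessary), something like the following prepares it:
```{r som data}
# Hedged preparation step: coerce to a matrix and standardize each column.
# Skip if USArrests.1 was already created with scale().
som.data <- scale(as.matrix(USArrests.1))
```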
```{r som}
library(kohonen)
som.1 <- kohonen::som(USArrests.1, grid = somgrid(6, 6, "hexagonal"))
```

To visualize the results, we may want to keep [this figure](notes/art/som_simulation.png) in mind when interpreting SOMs:
```{r som plots}
# Segments plot:
plot(som.1)

# Counts plot:
plot(som.1, type='counts')

# Quality plot:
plot(som.1, type='quality')

# Neighbour distance plot:
plot(som.1, type='dist.neighbours')

# Property plots: the value of each input variable across the map:
property.plot <- function(k) plot(som.1, type='property', property = som.1$codes[,k], main = colnames(som.1$codes)[k])
property.plot(1)
property.plot(2)
property.plot(3)

# Clustering the units by their codebook vectors:
pretty_palette <- c('#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2')
som.1.cluster <- cutree(hclust(dist(som.1$codes)), 5)
plot(som.1, type="mapping", bgcol = pretty_palette[som.1.cluster], main = "Clusters")
add.cluster.boundaries(som.1, som.1.cluster)
```
For fancy visualization of `kohonen` SOMs, see [Seth Spielman's](https://github.com/geoss/som_visualization_r) code.

Other SOM implementations can be found in `som::som()` and `class::SOM()`, but `kohonen` seems the most complete and well documented.

__Note__: many functions are called `som`. Be careful when loading packages, and make use of the `::` syntax.



## Spectral Clustering
```{r spectral clustering}
# install.packages('kernlab')
library(kernlab)

# Spectral clustering with the default RBF kernel, asking for two clusters:
specc.1 <- kernlab::specc(as.matrix(USArrests.1), centers = 2)
specc.1
```




## Model-based (generative) clustering
```{r generative clustering}
library(mclust)
mclust.1 <- Mclust(USArrests.1)
summary(mclust.1)

# By default, the generative Gaussian distributions considered are:
# "EII": spherical, equal volume
# "VII": spherical, unequal volume
# "EEI": diagonal, equal volume and shape
# "VEI": diagonal, varying volume, equal shape
# "EVI": diagonal, equal volume, varying shape
# "VVI": diagonal, varying volume and shape
# "EEE": ellipsoidal, equal volume, shape, and orientation
# "EEV": ellipsoidal, equal volume and equal shape
# "VEV": ellipsoidal, equal shape
# "VVV": ellipsoidal, varying volume, shape, and orientation

# Plotting the BIC values (which is possible for generative methods):
plot(mclust.1, data=USArrests, what="BIC")
# The best solution is VEI with 3 clusters.

# The clustering:
mclust.1$classification

# The probabilities of belonging to each cluster, for every object:
round(mclust.1$z, 2)
```


Visualizing the clusters:
```{r visualize generative clustering}
# Visualize using scatter plots of the original features:
pairs(USArrests.1, panel=function(x,y) text(x, y, mclust.1$classification))

# Visualize in the PC plane:
plot(pca1$x[,1], pca1$x[,2], xlab="PC 1", ylab="PC 2", type='n', lwd=2)
text(pca1$x[,1], pca1$x[,2], labels=rownames(USArrests.1), cex=0.7, lwd=2, col=mclust.1$classification)
```
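
## Comparing the clusterings
As a closing check, we can ask how much the different partitions above agree with one another. Here is a short sketch using `mclust::adjustedRandIndex()` (1 means identical partitions, values near 0 mean chance-level agreement); the objects are the ones created in the sections above:
```{r compare clusterings}
# Pairwise agreement between some of the clusterings computed above:
mclust::adjustedRandIndex(kmeans.1$cluster, kmed.1$clustering)
mclust::adjustedRandIndex(kmeans.1$cluster, mclust.1$classification)
mclust::adjustedRandIndex(cut.2.2, kmeans.1$cluster)

# A cross-table gives a more detailed picture:
table(kmeans = kmeans.1$cluster, mclust = mclust.1$classification)
```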