├── tests └── test.txt ├── _config.yml ├── R ├── Rplot001.png ├── Rplot002.png ├── Rplot003.png ├── Rplot004.png ├── Rplot005.png ├── Rplot006.png ├── Rplot007.png ├── example_1.gif ├── logo.R ├── ml │ ├── bayesMLR.R │ └── data │ │ └── abalone.names ├── pdist.cpp ├── output.json ├── nytimes.R ├── highcharts.R ├── sadler.R ├── helpers.R ├── recursive.R ├── text2vec.R ├── count-and-pipes.R ├── fifa.R ├── applied.R ├── loadURL.R ├── bagboot.R ├── ml1.R ├── analysis2.R ├── json2gif.R ├── knn.R ├── nb.R ├── scraping.R ├── eda1.R ├── analysis1.R ├── nb2.R ├── final.R ├── preprocessing.R └── EDAreg.R ├── fig └── logo.png ├── .gitignore ├── data ├── demographaic.rds └── conv.R ├── .gitattributes ├── pres ├── recursive.Rmd ├── recursive.md ├── copyonmodify.Rmd ├── ml1.Rmd ├── copyonmodify.md ├── ml2.Rmd ├── html-scraping.Rmd ├── biasAndInference.Rmd └── html-scraping.md ├── shell.nix ├── Readme.md └── .Rhistory /tests/test.txt: -------------------------------------------------------------------------------- 1 | this is a test 2 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /R/Rplot001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot001.png -------------------------------------------------------------------------------- /R/Rplot002.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot002.png -------------------------------------------------------------------------------- /R/Rplot003.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot003.png -------------------------------------------------------------------------------- /R/Rplot004.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot004.png -------------------------------------------------------------------------------- /R/Rplot005.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot005.png -------------------------------------------------------------------------------- /R/Rplot006.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot006.png -------------------------------------------------------------------------------- /R/Rplot007.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/Rplot007.png -------------------------------------------------------------------------------- /fig/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/fig/logo.png -------------------------------------------------------------------------------- /R/example_1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/R/example_1.gif 
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | pres/.Rhistory 2 | pres/biasAndInference_cache/ 3 | pres/biasAndInference_files/ 4 | -------------------------------------------------------------------------------- /data/demographaic.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josephsdavid/teachR/HEAD/data/demographaic.rds -------------------------------------------------------------------------------- /data/conv.R: -------------------------------------------------------------------------------- 1 | ?save 2 | library(readr) 3 | read_csv('demo.csv')->df 4 | 5 | saveRDS(df, "demographaic.rds") 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Basic .gitattributes for an R repo. 2 | 3 | # Source files 4 | # ============ 5 | *.Rdata text 6 | *.rdb binary 7 | *.rds binary 8 | *.Rd text 9 | *.Rdx binary 10 | *.Rmd text 11 | *.R text 12 | 13 | *.css linguist-detectable=false 14 | *.cs linguist-detectable=false 15 | *.js linguist-detectable=false 16 | *.html linguist-detectable=false 17 | *.xml linguist-detectable=false 18 | -------------------------------------------------------------------------------- /R/logo.R: -------------------------------------------------------------------------------- 1 | library(magick) 2 | library(tidyverse) 3 | # From the docs for the excellent magick package 4 | bigdata <- image_read('https://jeroen.github.io/images/bigdata.jpg') 5 | frink <- image_read("https://jeroen.github.io/images/frink.png") 6 | logo <- image_read("https://jeroen.github.io/images/Rlogo.png") 7 | combo <- c(bigdata, frink,logo) 8 | combo %>% image_flatten('Add') %>% 9 | image_write(path = "../fig/logo.png", format = "png") 10 | -------------------------------------------------------------------------------- /R/ml/bayesMLR.R: -------------------------------------------------------------------------------- 1 | library(caret) 2 | data(BreastCancer, package = "mlbench") 3 | 4 | bc <- BreastCancer 5 | 6 | 7 | makeSampleIndices <- function(x, perc, seed = NULL) { 8 | set.seed(seed) 9 | smpSize <- floor(perc * nrow(x)) 10 | return(sample(seq_len(nrow(x)), size = smpSize)) 11 | } 12 | 13 | bc$Id <- NULL 14 | trainInd <- makeSampleIndices(bc, 0.7, seed = 47) 15 | 16 | training <- bc[trainInd,] 17 | 18 | testing <- bc[-trainInd,] 19 | 20 | # check for a class imbalance!!! 21 | 22 | # show off automated plots 23 | 24 | 25 | model <- train(Class ~ .)
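# Hedged sketch (an addition, not part of the original script): one way the
# unfinished train() call above could be completed with caret's built-in
# resampling. The "nb" (naive Bayes) method is only a guess from the file name
# and needs the klaR package; any other caret method string slots in the same way.
fit <- train(Class ~ ., data = training,
             method = "nb",
             trControl = trainControl(method = "cv", number = 5),
             na.action = na.omit)   # BreastCancer has missing Bare.nuclei values
predict(fit, newdata = testing)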
26 | -------------------------------------------------------------------------------- /R/pdist.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | // [[Rcpp::depends(RcppArmadillo)]] 3 | 4 | using namespace Rcpp; 5 | 6 | // [[Rcpp::export]] 7 | NumericMatrix fastPdist2(NumericMatrix Ar, NumericMatrix Br) { 8 | int m = Ar.nrow(), 9 | n = Br.nrow(), 10 | k = Ar.ncol(); 11 | arma::mat A = arma::mat(Ar.begin(), m, k, false); 12 | arma::mat B = arma::mat(Br.begin(), n, k, false); 13 | 14 | arma::colvec An = sum(square(A),1); 15 | arma::colvec Bn = sum(square(B),1); 16 | 17 | arma::mat C = -2 * (A * B.t()); 18 | C.each_col() += An; 19 | C.each_row() += Bn.t(); 20 | 21 | return wrap(sqrt(C)); 22 | } 23 | 24 | -------------------------------------------------------------------------------- /pres/recursive.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "recursion" 3 | author: "David Josephs" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include = F} 8 | knitr::read_chunk('../R/recursive.R') 9 | library(knitr) 10 | library(kableExtra) 11 | library(magrittr) 12 | library(pander) 13 | knitr::opts_chunk$set(cache = T, autodep = T) 14 | ``` 15 | 16 | ```{r} 17 | <> 18 | ``` 19 | 20 | ```{r} 21 | <> 22 | 23 | ``` 24 | 25 | Make a countdown function 26 | 27 | ```{r} 28 | # <> 29 | ``` 30 | 31 | 32 | ## work with lists 33 | 34 | ```{r} 35 | <> 36 | ``` 37 | 38 | 39 | ```{r} 40 | <> 41 | ``` 42 | 43 | ```{r} 44 | <> 45 | ``` 46 | 47 | ```{r} 48 | <> 49 | ``` 50 | -------------------------------------------------------------------------------- /R/output.json: -------------------------------------------------------------------------------- 1 | [{"step":0,"bodies":[{"body":0,"pos":{"x":1.000000,"y":1.000000},"mass":10.000000},{"body":1,"pos":{"x":3.000000,"y":3.000000},"mass":10.000000},{"body":2,"pos":{"x":5.000000,"y":5.000000},"mass":100.000000}]},{"step":1,"bodies":[{"body":0,"pos":{"x":2.750000,"y":2.750000},"mass":10.000000},{"body":1,"pos":{"x":3.750000,"y":3.750000},"mass":10.000000},{"body":2,"pos":{"x":4.750000,"y":4.750000},"mass":100.000000}]},{"step":2,"bodies":[{"body":0,"pos":{"x":3.625000,"y":3.625000},"mass":10.000000},{"body":1,"pos":{"x":4.125000,"y":4.125000},"mass":10.000000},{"body":2,"pos":{"x":4.625000,"y":4.625000},"mass":100.000000}]},{"step":3,"bodies":[{"body":0,"pos":{"x":4.062500,"y":4.062500},"mass":10.000000},{"body":1,"pos":{"x":4.312500,"y":4.312500},"mass":10.000000},{"body":2,"pos":{"x":4.562500,"y":4.562500},"mass":100.000000}]},{"step":4,"bodies":[{"body":0,"pos":{"x":4.281250,"y":4.281250},"mass":10.000000},{"body":1,"pos":{"x":4.406250,"y":4.406250},"mass":10.000000},{"body":2,"pos":{"x":4.531250,"y":4.531250},"mass":100.000000}]}] 2 | -------------------------------------------------------------------------------- /R/nytimes.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(plyr) 4 | library(rjson) 5 | library(jsonlite) 6 | library(pander) 7 | 8 | 9 | NYTIMES_KEY = "ULAGIigdFMCFsmJgB5cxcwKmCjnISU6W"; 10 | 11 | # Let's set some parameters 12 | term <- "Trump" # Need to use + to string together separate words 13 | begin_date <- "20190101" 14 | end_date <- "20190106" 15 | 16 | baseurl <- paste0("http://api.nytimes.com/svc/search/v2/articlesearch.json?q=",term, 17 | "&begin_date=",begin_date,"&end_date=",end_date, 18 | "&facet_filter=true&api-key=",NYTIMES_KEY, sep="") 19 | 
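# Hedged aside (an addition, not in the original script): the same request can
# be built without hand-pasting the URL by letting httr encode the query
# parameters. This assumes the httr package is installed; the paste0() +
# fromJSON() approach used below works just as well.
library(httr)
resp <- GET("http://api.nytimes.com/svc/search/v2/articlesearch.json",
            query = list(q = term, begin_date = begin_date, end_date = end_date,
                         `api-key` = NYTIMES_KEY))
initialQuery_httr <- jsonlite::fromJSON(content(resp, as = "text"))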
20 | initialQuery <- jsonlite::fromJSON(baseurl) 21 | pander(head(initialQuery,1)) 22 | str(initialQuery) 23 | maxPages <- round((initialQuery$response$meta$hits[1] / 10)-1) 24 | # [1] 18 25 | 26 | 27 | #for(i in 1:100000000) 28 | #{ 29 | # j = (i + 1 -1 )/i 30 | #} 31 | 32 | ### This will be slow 33 | pages <- list() 34 | for(i in 0:maxPages){ 35 | nytSearch <- jsonlite::fromJSON(paste(baseurl, "&page=", i), flatten = TRUE) %>% data.frame() 36 | message("Retrieving page ", i) 37 | pages[[i+1]] <- nytSearch 38 | Sys.sleep(4) 39 | } 40 | 41 | -------------------------------------------------------------------------------- /R/highcharts.R: -------------------------------------------------------------------------------- 1 | library("highcharter") 2 | library(ggplot2) 3 | library(tidyverse) 4 | data(diamonds, mpg, package = "ggplot2") 5 | 6 | hchart(mpg, "scatter", hcaes(x = displ, y = hwy, group = class)) 7 | 8 | hcmap("countries/nz/nz-all") 9 | 10 | mapdata <- get_data_from_map(download_map_data("countries/us/us-all")) 11 | glimpse(mapdata) 12 | 13 | data_fake <- mapdata %>% 14 | select(code = `hc-a2`) %>% 15 | mutate(value = 1e5 * abs(rt(nrow(.), df = 10))) 16 | hcmap("countries/us/us-all", data = data_fake, value = "value", 17 | joinBy = c("hc-a2", "code"), name = "Fake data", 18 | dataLabels = list(enabled = TRUE, format = '{point.name}'), 19 | borderColor = "#FAFAFA", borderWidth = 0.1, 20 | tooltip = list(valueDecimals = 2, valuePrefix = "$", valueSuffix = " USD")) 21 | 22 | data(unemployment) 23 | 24 | hcmap("countries/us/us-all-all", data = unemployment, 25 | name = "Unemployment", value = "value", joinBy = c("hc-key", "code"), 26 | borderColor = "transparent") %>% 27 | hc_colorAxis(dataClasses = color_classes(c(seq(0, 10, by = 2), 50))) %>% 28 | hc_legend(layout = "vertical", align = "right", 29 | floating = TRUE, valueDecimals = 0, valueSuffix = "%") 30 | 31 | 32 | -------------------------------------------------------------------------------- /R/sadler.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | splitPerc = .75 3 | str(iris) 4 | # 'data.frame': 150 obs. of 5 variables: 5 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 6 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 7 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 8 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 9 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... 10 | # NULL 11 | irisVersVirg = iris %>% filter(Species == "versicolor" | Species == "virginica") 12 | head(irisVersVirg) 13 | str(irisVersVirg) 14 | # 'data.frame': 100 obs. of 5 variables: 15 | # $ Sepal.Length: num 7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ... 16 | # $ Sepal.Width : num 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ... 17 | # $ Petal.Length: num 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ... 18 | # $ Petal.Width : num 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ... 19 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ... 20 | # NULL 21 | irisVersVirg = droplevels(irisVersVirg,exclude = "setosa") 22 | str(irisVersVirg) 23 | # 'data.frame': 100 obs. of 5 variables: 24 | # $ Sepal.Length: num 7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ... 25 | # $ Sepal.Width : num 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ... 26 | # $ Petal.Length: num 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ... 27 | # $ Petal.Width : num 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ... 
28 | # $ Species : Factor w/ 2 levels "versicolor","virginica": 1 1 1 1 1 1 1 1 1 1 ... 29 | # NULL 30 | library(class) 31 | ?knn 32 | -------------------------------------------------------------------------------- /pres/recursive.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "recursion" 3 | author: "David Josephs" 4 | output: html_document 5 | --- 6 | 7 | 8 | 9 | 10 | ```r 11 | is_empty = function(xs) { # xs: a vector of any type 12 | length(xs) == 0 13 | } 14 | 15 | # Get the first element of a vector, raising an exception if the vector is empty. 16 | hd = function(xs) { # xs: a vector of any type 17 | if (is_empty(xs)) stop("Vector is empty.") 18 | else xs[1] 19 | } 20 | 21 | # Get the tail of a vector without its first element, raising an exception if the vector is empty. 22 | tl = function(xs) { # xs: a vector of any type 23 | if (is_empty(xs)) stop("Vector is empty.") 24 | else xs[-1] 25 | } 26 | ``` 27 | 28 | 29 | ```r 30 | recsum <- function(xs) { 31 | if (is_empty(xs)) 0 32 | else hd(xs) + recsum(tl(xs)) 33 | } 34 | ``` 35 | 36 | Make a countdown function 37 | 38 | 39 | ```r 40 | # <> 41 | ``` 42 | 43 | 44 | ## work with lists 45 | 46 | 47 | ```r 48 | a <- list(1:2,3:4) 49 | 50 | is_empty <- function(l){ 51 | length(l)==0 52 | } 53 | 54 | hd <- function(l){ 55 | if (is_empty(l)) stop("List is empty") 56 | else l[[1]] 57 | } 58 | 59 | tl <- function(l){ 60 | if(is_empty(l)) stop("List is empty") 61 | else l[-1] 62 | } 63 | ``` 64 | 65 | 66 | 67 | ```r 68 | sum_pairs <- function(l){ 69 | if (is_empty(l)) 0 70 | else hd(l)[1] + hd(l)[2] + sum_pairs(tl(l)) 71 | } 72 | ``` 73 | 74 | 75 | ```r 76 | firstitems <- function(l) { 77 | if (is_empty(l)) list() 78 | else c(hd(l)[1], firstitems(tl(l))) 79 | } 80 | ``` 81 | 82 | 83 | ```r 84 | #<> 85 | ``` 86 | -------------------------------------------------------------------------------- /R/helpers.R: -------------------------------------------------------------------------------- 1 | # fix up outliers 2 | clean <- forecast::tsclean 3 | 4 | # Generator function to fix up sampling rate to something reasonable 5 | 6 | change_samples <- function(n){ 7 | function(xs){ 8 | # tapply does table apply, or grouped apply 9 | # this time we are grouping by divisors of n 10 | # try 1:20 %/% 5 as an example. 
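# e.g. (1:20 - 1) %/% 5 gives 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3,
# i.e. four groups of five consecutive indices, so the tapply() call below
# sums each block of n observations into a single value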
11 | # we are summing every n things in this case 12 | out <- unname(tapply( 13 | xs, 14 | (seq_along(xs)-1) %/% n, 15 | sum 16 | )) 17 | out <- ts(out, frequency = (8760/n)) 18 | out 19 | } 20 | } 21 | 22 | # daily and weekly sampling, monthly is 4 weeks 23 | to_daily <- change_samples(24) 24 | to_weekly <- change_samples(24*7) 25 | to_monthly <- change_samples(24*7*4) 26 | to_season <- change_samples(24*(365/4)) 27 | 28 | # pipelining final cleaning and conversion, fixing insane outliers 29 | # and negatives 30 | cleandays <- function(xs) { 31 | xs %>>% clean %>>% abs %>>% to_daily 32 | } 33 | 34 | cleanweeks <- function(xs) { 35 | xs %>>% clean %>>% abs %>>% to_weekly 36 | } 37 | cleanmonths <- function(xs) { 38 | xs %>>% clean %>>% abs %>>% to_monthly 39 | } 40 | cleanseas <- function(xs) { 41 | xs %>>% clean %>>% abs %>>% to_season 42 | } 43 | 44 | # some polts 45 | seasonplot <- forecast::ggseasonplot 46 | subseriesplot <- forecast::ggsubseriesplot 47 | lagplot <- forecast::gglagplot 48 | # resampling our dataset with a window, this allows us to git a test set 49 | resample <- function(xs){ 50 | xs %>>% cleandays %>>% window(end = 5) -> day 51 | xs %>>% cleanweeks %>>% window(end = 5) -> week 52 | xs %>>% cleanmonths %>>% window(end = 5) -> month 53 | xs %>>% cleanseas %>>% window(end = 5) -> seas 54 | list(day = day, week = week, month = month, season = seas) 55 | } 56 | 57 | # forecast and assess 58 | 59 | fore_and_assess <- function(...){ 60 | f <- forecast(...) 61 | out <- assess(..., plot = FALSE) 62 | f$ASE <- out 63 | f 64 | } 65 | 66 | getASE <- function(model){ 67 | accuracy(model)[2]^2 68 | } 69 | -------------------------------------------------------------------------------- /R/recursive.R: -------------------------------------------------------------------------------- 1 | ## @knitr setup 2 | 3 | is_empty = function(xs) { # xs: a vector of any type 4 | length(xs) == 0 5 | } 6 | 7 | is_empty(c(1,2,3,4)) 8 | c() 9 | integer() 10 | is_empty(c()) 11 | # Get the first element of a vector, raising an exception if the vector is empty. 12 | hd = function(xs) { # xs: a vector of any type 13 | if (is_empty(xs)) stop("Vector is empty.") 14 | else xs[1] 15 | } 16 | x <- c(2,3,4,5,6) 17 | hd(x) 18 | 19 | # Get the tail of a vector without its first element, raising an exception if the vector is empty. 
20 | tl = function(xs) { # xs: a vector of any type 21 | if (is_empty(xs)) stop("Vector is empty.") 22 | else xs[-1] 23 | } 24 | 25 | 26 | tl(x) 27 | 28 | 29 | ## @knitr first 30 | 31 | recsum <- function(xs) { 32 | if (is_empty(xs)) { 33 | return(0) 34 | } 35 | else { 36 | hd(xs) + recsum(tl(xs)) 37 | } 38 | } 39 | y = c(3,4,5,6) 40 | hd(y) 41 | y2 = c(4,5,6) 42 | hd(y2) 43 | y3=c(5,6) 44 | hd(y3) 45 | y4 = 6 46 | hd(y4) 47 | hd(y)+hd(y2)+hd(y3)+hd(y4) 48 | recsum(y) 49 | ## @knitr listtool 50 | 51 | a <- list(1:2,3:4) 52 | a 53 | is_empty <- function(l){ 54 | length(l)==0 55 | } 56 | 57 | hd <- function(l){ 58 | if (is_empty(l)) stop("List is empty") 59 | else l[[1]] 60 | } 61 | 62 | tl <- function(l){ 63 | if(is_empty(l)) stop("List is empty") 64 | else l[-1] 65 | } 66 | 67 | data(mtcars) 68 | 69 | 70 | mtcars[,-1] 71 | head(a,1) 72 | ## @knitr sumpair 73 | 74 | sum_pairs <- function(l){ 75 | if (is_empty(l)) 0 76 | else hd(l)[1] + hd(l)[2] + sum_pairs(tl(l)) 77 | } 78 | 79 | ## @knitr firstitem 80 | 81 | hd(a)[1] 82 | 83 | list() 84 | 85 | 86 | firstitems <- function(l) { 87 | if (is_empty(l)) list() 88 | else c(hd(l)[1], firstitems(tl(l))) 89 | } 90 | 91 | firstitems(b) 92 | 93 | h1 = hd(b)[1] 94 | 95 | b2 = tl(b) 96 | b2 97 | h2 =hd(b2)[1] 98 | 99 | b3 = tl(b2) 100 | b3 101 | h3 = hd(b3)[1] 102 | list(h1,h2,h3) 103 | b<- list(1:4,4:7,c("cat","dog","mouse")) 104 | b 105 | firstitems(a) 106 | hd(a)[1] 107 | a2 <-tl(a) 108 | hd(a2)[1] 109 | ## @knitr countdown 110 | 111 | countdown <- function(n){ 112 | if (n == 0) integer() 113 | else c(n,countdown(n-1)) 114 | } 115 | 116 | ## @knitr seconds 117 | 118 | 119 | seconds <- function(l) { 120 | if (is_empty(l)) list() 121 | else c(hd(l)[2], seconds(tl(l))) 122 | } 123 | -------------------------------------------------------------------------------- /pres/copyonmodify.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R copy on modify" 3 | author: "David Josephs" 4 | output: rmarkdown::github.document 5 | --- 6 | 7 | For this little lesson, we will learn about the cool things R does when we copy objects, and some of the pitfalls of for loops. 8 | We will be using Hadley's pryr package, which "prys back the covers of R" 9 | 10 | ```{r} 11 | library(pryr) 12 | size_and_addr <- function(x){ 13 | cat(rep('-',30), '\n') 14 | cat("Object size: ", object_size(x),"\n") 15 | cat(rep('-',30), '\n') 16 | cat("Address in memory: ", address(x)) 17 | } 18 | ``` 19 | 20 | First, let us create a simple data frame: 21 | 22 | ```{r} 23 | a <- data.frame(matrix(1:9, nrow=3)) 24 | ``` 25 | 26 | Next, let's see how big it is in your computer's memory (RAM), and where it is stored 27 | 28 | ```{r} 29 | size_and_addr(a) 30 | ``` 31 | 32 | Now lets create a new object, b, which is just a, and then see where it lies in your computer's memory. Lets also create a new object, x, for later use 33 | 34 | ```{r} 35 | x <- a 36 | b <- a 37 | 38 | size_and_addr(b) 39 | size_and_addr(x) 40 | 41 | ``` 42 | 43 | It is in the same place, which is pretty cool and efficient. But what happens if we modify b and leave A the same? 44 | 45 | ```{r} 46 | b$X4 <- c(10:12) 47 | address(a) 48 | size_and_addr(b) 49 | ``` 50 | 51 | So on modification, we create a copy, and put it in a new place. What do you think the size of a and b is? 52 | 53 | ```{r} 54 | object_size(a,b) 55 | ``` 56 | 57 | Pretty cool, R is saving you memory. All the columns that match are stored in the same location, and only the new ones take more space. 
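As a hedged aside (not part of the original lesson), the lobstr package, which this repo's shell.nix already pulls in, can draw those shared column references directly; `lobstr::ref()` here is an illustration added by the editor, not something the lesson uses:

```{r}
# columns X1-X3 of b still point at the same memory as the columns of a
lobstr::ref(a, b)
```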
58 | 59 | ```{r} 60 | address(x) 61 | size_and_addr(a) 62 | object_size(a,x) 63 | x <- rbind(x,1:3) 64 | size_and_addr(x) 65 | object_size(a,x) 66 | ``` 67 | 68 | 69 | Note that this is a little bigger. This is because row indexes take up a little more space than columns. 70 | 71 | Lets now see what happens in a for loop, when we "grow a vector". Lets also run our code effic 72 | 73 | ```{r} 74 | vec <- c() 75 | size_and_addr(vec) 76 | vec[1] <- 1 77 | size_and_addr(vec) 78 | vec[2] <- 2 79 | size_and_addr(vec) 80 | vec[3] <- 3 81 | size_and_addr(vec) 82 | vec[4] <- 4 83 | size_and_addr(vec) 84 | ``` 85 | 86 | Wow, so each time we add an element to a vector, we are not only using more memory, but actually moving our object from point to point. This is not a fast process, which is why doing a dirty for loop takes so long in R. Instead, we should ***IN GENERAL*** use functions which call speedy compiled code, such as the apply family and/or purr::map 87 | 88 | For futher reading, see: 89 | [Row wise modification in a loop](https://milesmcbain.xyz/rstats-anti-pattern-row-wise/) 90 | -------------------------------------------------------------------------------- /R/text2vec.R: -------------------------------------------------------------------------------- 1 | library(stringr) 2 | library(text2vec) 3 | library(LDAvis) 4 | data("movie_review") 5 | str(movie_review) 6 | # select 1000 rows for faster running times 7 | 8 | # cross validation 9 | 10 | # overfitting 11 | 12 | # train test split 13 | movie_review_train <- movie_review[1:700, ] 14 | movie_review_test <- movie_review[701:1000, ] 15 | 16 | 17 | prep_fun <- function(x) { 18 | x %>% 19 | # make text lower case 20 | str_to_lower %>% 21 | # remove non-alphanumeric symbols 22 | str_replace_all("[^[:alpha:]]", " ") %>% 23 | # collapse multiple spaces 24 | str_replace_all("\\s+", " ") 25 | } 26 | str(movie_review_train) 27 | movie_review_train$review <- prep_fun(movie_review_train$review) 28 | str(movie_review_train) 29 | 30 | # 'data.frame': 700 obs. of 3 variables: 31 | # $ id : chr "5814_8" "2381_9" "7759_3" "3630_4" ... 32 | # $ sentiment: int 1 1 0 0 1 1 0 0 0 1 ... 33 | # $ review : chr "with all this stuff going down at the moment with mj i ve started listening to his music watching the odd docum"| __truncated__ " the classic war of the worlds by timothy hines is a very entertaining film that obviously goes to great effort"| __truncated__ "the film starts with a manager nicholas bell giving welcome investors robert carradine to primal park a secret "| __truncated__ "it must be assumed that those who praised this film the greatest filmed opera ever didn t i read somewhere eith"| __truncated__ ... 
34 | # NULL 35 | 36 | # make the dataset in the right format 37 | it <- itoken(movie_review_train$review, progressbar = TRUE) 38 | v <- create_vocabulary(it) %>% 39 | prune_vocabulary(doc_proportion_max = 0.1, term_count_min = 5) 40 | vectorizer <- vocab_vectorizer(v) 41 | dtm <- create_dtm(it, vectorizer) 42 | 43 | 44 | 45 | tokens <- movie_review$review[1:4000] %>% 46 | tolower %>% 47 | word_tokenizer 48 | it <- itoken(tokens, ids = movie_review$id[1:4000], progressbar = FALSE) 49 | v <- create_vocabulary(it) %>% 50 | prune_vocabulary(term_count_min = 10, doc_proportion_max = 0.2) 51 | vectorizer <- vocab_vectorizer(v) 52 | dtm <- create_dtm(it, vectorizer, type = "dgTMatrix") 53 | 54 | 55 | # 56 | 57 | lda_model <- LDA$new(n_topics = 10, doc_topic_prior = 0.1, topic_word_prior = 0.01) 58 | doc_topic_distr <- lda_model$fit_transform(x = dtm, n_iter = 1000, 59 | convergence_tol = 0.001, n_check_convergence = 25, 60 | progressbar = TRUE) 61 | 62 | lda_model$get_top_words(n = 10, topic_number = 1:10, lambda = .4) 63 | new_dtm <- itoken(movie_review$review[4001:5000], tolower, word_tokenizer, ids = movie_review$id[4001:5000]) %>% 64 | create_dtm(vectorizer, type = "dgTMatrix") 65 | new_doc_topic_distr <- lda_model$transform(new_dtm) 66 | 67 | options(browser = "firefox") 68 | lda_model$plot() 69 | -------------------------------------------------------------------------------- /R/count-and-pipes.R: -------------------------------------------------------------------------------- 1 | head(mtcars) 2 | # mpg cyl disp hp drat wt qsec vs am gear carb 3 | # Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 4 | # Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 5 | # Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 6 | # Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 7 | # Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 8 | # Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 9 | library(tidyverse) 10 | mtcars %>% count(cyl,gear) 11 | # # A tibble: 8 x 3 12 | # cyl gear n 13 | # 14 | # 1 4 3 1 15 | # 2 4 4 8 16 | # 3 4 5 2 17 | # 4 6 3 2 18 | # 5 6 4 4 19 | # 6 6 5 1 20 | # 7 8 3 12 21 | # 8 8 5 2 22 | mtcars%>% group_by(cyl) %>% tally 23 | # # A tibble: 3 x 2 24 | # cyl n 25 | # 26 | # 1 4 11 27 | # 2 6 7 28 | # 3 8 14 29 | mtcars %>% group_by(gear) %>% summarise(count=n()) 30 | # # A tibble: 3 x 2 31 | # gear count 32 | # 33 | # 1 3 15 34 | # 2 4 12 35 | # 3 5 5 36 | 37 | # A functio 38 | sumsqrt <- function(x){ 39 | sqrt(sum(x)) 40 | } 41 | 42 | mathfun <- function(m,o,p){ 43 | out <- p+o-m 44 | } 45 | 46 | df <- data.frame(matrix(1:9,nrow=3)) 47 | library(magrittr) 48 | # magrittr is where %>% originally comes from, we have a lot more 49 | # pipe operators which we can show off here 50 | 51 | # %<>% assigns the variable as well as pipes 52 | df %<>% rename(x = X1, y = X2, z = X3) 53 | # equivalent to 54 | # df <- df %>% rename(x = X1, y = X2, z = X3) 55 | 56 | sumsqrt(df$x) 57 | # [1] 2.44949 58 | 59 | mathfun(df$x,df$y,df$z) 60 | # [1] 10 11 12 61 | 62 | # Now this is a little hard to run with the %>% because we dont have a tidy ( fun(data = ...) ) 63 | # API 64 | # How can we do this? 65 | # magrittr has a lovely %$% operator, which looks like a swear word or something, but it is a pipe 66 | # that spreads the dollar sign operator. 
That is, df %$% (a+b-c) is equivalent to: 67 | # df$a + df$b - df$c 68 | 69 | # Let us now show off this new pipe syntax 70 | 71 | # bad 72 | df %>% mathfun(.$x, .$y, .$z) 73 | # This will not run because it is piping df into the function, which doesn't work 74 | # Let's try with our new dollar sign pipe 75 | df %$% mathfun(x, y, z) 76 | # [1] 10 11 12 77 | # amazing 78 | 79 | # We can still chain our pipes together as we do 80 | 81 | df %$% mathfun(x,y,z) %>% sumsqrt 82 | # [1] 5.744563 83 | 84 | # Finally, we can use pipes and dots for quick alternate function definition 85 | # This is nice for working fast, as well as for something we will cover a bit later, 86 | # function closures (a slightly more advanced R topic) 87 | # Let's try quickly defining a new function with the pipes, by putting a . at the leading end 88 | # We tell R that we will give it some data later, but to remember this function 89 | 90 | dotfun <- . %>% cos %>% sin %>% exp %>% sqrt 91 | dotfun(45) 92 | # [1] 1.284983 93 | dotfun(-56) 94 | # [1] 1.457468 95 | # This is the equivalent of, without pipes 96 | plainfun <- function(x){ 97 | sqrt(exp(sin(cos(x)))) 98 | } 99 | plainfun(45) 100 | # [1] 1.284983 101 | plainfun(-56) 102 | # [1] 1.457468 103 | 104 | # We can even include the pipes and the dots within our function definition, to write concise functions 105 | -------------------------------------------------------------------------------- /R/fifa.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | # dplyr, readr, ggplot2, 3 | 4 | fifa <- read_csv("https://raw.githubusercontent.com/BivinSadler/MDS-6306-Doing-Data-Science-Fall-2019/Master/Unit%203/FIFA%20Players.csv") 5 | 6 | fifa$Value %>% str 7 | str(fifa$Value) 8 | # chr [1:18207] "€110.5M" "€77M" "€118.5M" "€72M" "€102M" "€93M" "€67M" ... 9 | # NULL 10 | # chr [1:18207] "€110.5M" "€77M" "€118.5M" "€72M" "€102M" "€93M" "€67M" ...
11 | # NULL 12 | 13 | fifa$Value <- gsub("€", "", fifa$Value) 14 | 15 | head(fifa$Value) 16 | # [1] "110.5M" "77M" "118.5M" "72M" "102M" "93M" 17 | # [1] "60K" "60K" "60K" "60K" "60K" "60K" 18 | 19 | fifa$Value %>% str 20 | 21 | grepl("M", fifa$Value[i]) 22 | 23 | datacleaner <- function(col, ind) { 24 | ifelse( 25 | grepl("M", col[ind]), 26 | return(as.numeric(gsub("M","", col[ind])) * 1e6), 27 | return(as.numeric(gsub("K","", col[ind])) * 1000) 28 | ) 29 | } 30 | 31 | 32 | # always preallocate space 33 | values <- double(nrow(fifa)) 34 | for (i in 1:nrow(fifa)) { 35 | values[i] <- datacleaner(fifa$Value, i) 36 | } 37 | 38 | library(reticulate) 39 | os <- import("os") 40 | pd <- import("pandas") 41 | pd$read_csv 42 | 43 | 44 | 45 | # apply family way 46 | values2 <- vapply(1:nrow(fifa), function(i) datacleaner(fifa$Value, i), numeric(1)) 47 | 48 | all.equal(values2, values) 49 | # [1] TRUE 50 | 51 | sum(abs(values2 - values)) 52 | # [1] 0 53 | 54 | 55 | # apply family of functions 56 | # *apply(data, function, extrargs) 57 | # lapply 58 | data(mtcars) 59 | mtcars 60 | 61 | 62 | vapply(mtcars, mean, numeric(1)) 63 | # mpg cyl disp 64 | # 20.090625 6.187500 230.721875 65 | # hp drat wt 66 | # 146.687500 3.596563 3.217250 67 | # qsec vs am 68 | # 17.848750 0.437500 0.406250 69 | # gear carb 70 | # 3.687500 2.812500 71 | # $mpg 72 | # [1] 20.09062 73 | # 74 | # $cyl 75 | # [1] 6.1875 76 | # 77 | # $disp 78 | # [1] 230.7219 79 | # 80 | # $hp 81 | # [1] 146.6875 82 | # 83 | # $drat 84 | # [1] 3.596563 85 | # 86 | # $wt 87 | # [1] 3.21725 88 | # 89 | # $qsec 90 | # [1] 17.84875 91 | # 92 | # $vs 93 | # [1] 0.4375 94 | # 95 | # $am 96 | # [1] 0.40625 97 | # 98 | # $gear 99 | # [1] 3.6875 100 | # 101 | # $carb 102 | # [1] 2.8125 103 | # 104 | 105 | head(values) 106 | 107 | fifa$Value <- values 108 | 109 | head(fifa$Value) 110 | 111 | hist(fifa$Value) 112 | 113 | fifa <- fifa %>% mutate(logVal = log(Value)) 114 | 115 | hist(fifa$logVal) 116 | 117 | # ggthemes is great! 118 | library(ggthemes) 119 | fifa %>% ggplot(aes(x= logVal, fill = Position)) + geom_histogram() 120 | 121 | as.numeric(gsub("M","", fifa$Value[1])) 122 | 123 | 124 | 125 | 126 | 127 | 128 | x <- 1:3 129 | # [1] 1 2 3 130 | 131 | as.numeric(fifa$Value) 132 | 133 | # tidy solution 134 | 135 | library(tidyverse) 136 | fifa <- read_csv("https://raw.githubusercontent.com/BivinSadler/MDS-6306-Doing-Data-Science-Fall-2019/Master/Unit%203/FIFA%20Players.csv") 137 | fifa$Value <- gsub("€", "", fifa$Value) 138 | v2 <- ifelse(grepl("M", fifa$Value), as.numeric(gsub("M","", fifa$Value))*1e6, as.numeric(gsub("K","", fifa$Value))*1e3) 139 | 140 | fifa$Value <- v2 141 | 142 | 143 | 144 | 145 | fifa <- read_csv("https://raw.githubusercontent.com/BivinSadler/MDS-6306-Doing-Data-Science-Fall-2019/Master/Unit%203/FIFA%20Players.csv") 146 | Value <- as.numeric(gsub("[€MK]", "", fifa$Value)) * ifelse(grepl("M", fifa$Value), 1e6, 1e3) 147 | v2 - Value 148 | -------------------------------------------------------------------------------- /R/applied.R: -------------------------------------------------------------------------------- 1 | # re introducing lapply 2 | # lapply(X,FUN,...) 3 | # X is a list (or data frame, which is also a list) 4 | # FUN is the function 5 | # ... 
are extra arguments to the function 6 | 7 | 8 | a <- list(1:3, c(1.2,4.5,NA,46,-84),c(4,5), 2:25, c(4,5,6,7,8,9,10)) 9 | a 10 | 11 | meanfun <- function(x){ 12 | res <- c() 13 | for(i in seq_along(x)){ 14 | res[i] <- (mean(x[[i]])) 15 | } 16 | return(res) 17 | } 18 | 19 | as.data.frame(lapply(mtcars,mean)) 20 | library(tidyverse) 21 | lapply(mtcars,mean) %>% as.data.frame 22 | meanfun(a) 23 | lapply(a, mean) 24 | ?lapply 25 | data(mtcars) 26 | typeof(mtcars) 27 | 28 | 29 | meanfun2 <- function(x){ 30 | res <- c() 31 | for(i in seq_along(x)){ 32 | res[i] <- (mean(x[[i]], na.rm = T)) 33 | } 34 | return(res) 35 | } 36 | 37 | meanfun2(a) 38 | lapply(a, mean, na.rm = T) %>% as.data.frame 39 | 40 | # nameless functions 41 | data(mtcars) 42 | 43 | # lets say we want to find the square of every sum of mtcars 44 | 45 | squaresum <- function(x){ 46 | (sum(x)^2) 47 | } 48 | 49 | lapply(mtcars, squaresum) %>% as.data.frame 50 | lapply(mtcars, function(x) (sum(x))^2) # they are the same 51 | 52 | # list of functions 53 | 54 | operator <- list( 55 | mea = function(x) mean(x), 56 | med = function(x) median(x), 57 | squaresum = function(x) (sum(x))^2, 58 | summ = function(x) summary(x) 59 | ) 60 | 61 | 62 | x <- rnorm(n = 500,5,2) 63 | operator$mea(x) 64 | operator$med(x) 65 | operator$squaresum(x) 66 | operator$summ(x) 67 | 68 | # useful but how do we go on 69 | 70 | # lets use lapply now, in R, it is a little know fact that everything, 71 | # including data works as a function 72 | # so we can lapply the data onto our list 73 | # wild 74 | 75 | call_fun <- function(f, ...){ 76 | f(...) 77 | } 78 | # lets bring it all together now 79 | lapply(operator, call_fun, x) 80 | 81 | x <- rnorm(n = 500,5,2) 82 | lapply(operator, function(f) f(x)) 83 | 84 | # wild stuff 85 | 86 | # function factories, closures 87 | 88 | power <- function(exponent){ 89 | myfunction <- function(x){ 90 | x ^ exponent 91 | } 92 | return(myfunction) 93 | } 94 | 2^4 95 | 4^7 96 | 4^9 97 | 98 | 99 | eitght <-power(8) 100 | power(13) 101 | 102 | 103 | thirteenth <- power(13) 104 | thirteenth2 <- function(x){ 105 | x^13 106 | } 107 | square2<- function(x){ 108 | x^2 109 | } 110 | thirteenth(2) 111 | thirteenth(3) 112 | thirteenth(4) 113 | 114 | square <- power(2) 115 | cube <- power(3) 116 | quart <- power(4) 117 | square(3) 118 | cube(4) 119 | quart(5) 120 | 121 | # now lets really bring it all together with an in the wild example, a function factory list for time series (apply to your own work, especially in stats :) 122 | 123 | library(tswge) 124 | gen.sigplusnoise.wge(n=200, phi = c(0.2,0.4,-0.2),sn =4) 125 | gen.arima.wge(n=200, d= 1) 126 | gen.aruma.wge(n=200,s=11) 127 | 128 | ?gen.sigplusnoise.wge 129 | 130 | ?lapply 131 | 132 | 133 | 134 | tswgen <- function(n,sn = 0){ 135 | sig <- function(...){ 136 | gen.sigplusnoise.wge(n=n,...,sn=sn) 137 | } 138 | ari <- function(...){ 139 | gen.arima.wge(n=n,...,sn=sn) 140 | } 141 | aru <- function(...){ 142 | gen.aruma.wge(n=n,..., sn=sn) 143 | } 144 | list("sig"=sig,"ari"=ari,"aru"=aru) 145 | } 146 | 147 | 148 | mtcars %>% something %>% something 149 | mtcars %>% somethingelse 150 | 151 | 152 | ts200 <- tswgen(200, 30) 153 | ts200$sig(phi = c (0.2,0.4,-0.2)) 154 | ts200$ari(d = 1) 155 | ts200$aru(s=11) 156 | 157 | phis <- c(0.2,0.4,-0.2) 158 | lapply(ts200, function(f) f(phis)) 159 | ts200_37 <- tswgen(200,sn=2) 160 | ts200_37$sig(phi = c (0.2,0.4,-0.2)) 161 | ts200_37$ari(d = 4) 162 | ts200_37$aru(s=1) 163 | -------------------------------------------------------------------------------- 
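A minimal consolidated sketch of the pattern applied.R builds toward: a function factory whose closures are stored in a named list and applied with lapply. The names (make_power, apply_all) are invented for this illustration and do not appear in the repo.

make_power <- function(exponent) {
  function(x) x ^ exponent              # the returned closure remembers `exponent`
}
powers <- list(square = make_power(2),
               cube   = make_power(3),
               quart  = make_power(4))
apply_all <- function(fns, x) lapply(fns, function(f) f(x))   # call every function on the same input
apply_all(powers, 1:3)
# $square 1 4 9 ; $cube 1 8 27 ; $quart 1 16 81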
/shell.nix: -------------------------------------------------------------------------------- 1 | let 2 | jupyter = import (builtins.fetchGit { 3 | url = https://github.com/tweag/jupyterWith; 4 | rev = ""; 5 | }) {}; 6 | 7 | iPython = jupyter.kernels.iPythonWith { 8 | name = "python"; 9 | packages = p: with p; [ 10 | plotly 11 | numpy 12 | pandas 13 | jupytext 14 | matplotlib 15 | scikitlearn 16 | seaborn 17 | scipy 18 | future 19 | ipywidgets 20 | scikitimage 21 | tzlocal 22 | simplegeneric 23 | pprintpp 24 | ]; 25 | }; 26 | 27 | iR = jupyter.kernels.juniperWith { 28 | name = "R"; 29 | packages = p: with p; [ 30 | mlbench 31 | arules 32 | tidyverse 33 | ggplot2 34 | dplyr 35 | ]; 36 | }; 37 | 38 | iHaskell = jupyter.kernels.iHaskellWith { 39 | name = "haskell"; 40 | packages = p: with p; [ hvega formatting ]; 41 | }; 42 | 43 | jupyterEnvironment = 44 | jupyter.jupyterlabWith { 45 | kernels = [ iPython iHaskell iR]; 46 | # directory = jupyter.mkDirectoryWith { 47 | # extensions = [ 48 | # "jupyterlab-jupytext" 49 | # ]; 50 | # }; 51 | }; 52 | pkgs = import {}; 53 | in 54 | pkgs.mkShell { 55 | name = "scratchwork"; 56 | buildInputs = with pkgs; [ 57 | vscode 58 | # jupyterEnvironment 59 | python37 60 | python37Packages.pandas 61 | python37Packages.numpy 62 | python37Packages.matplotlib 63 | #python37Packages.sqlite 64 | python37Packages.notebook 65 | python37Packages.ipython 66 | python37Packages.jupytext 67 | python37Packages.scikitlearn 68 | python37Packages.seaborn 69 | python37Packages.scipy 70 | python37Packages.plotly 71 | python37Packages.ipywidgets 72 | python37Packages.future 73 | python37Packages.scikitimage 74 | #Todo Package graphlab 75 | python37Packages.tzlocal 76 | rstudio 77 | python37Packages.simplegeneric 78 | R 79 | rPackages.ElemStatLearn 80 | rPackages.mlbench 81 | rPackages.magick 82 | rPackages.foreign 83 | rPackages.data_table 84 | rPackages.magrittr 85 | rPackages.lobstr 86 | rPackages.memery 87 | rPackages.lubridate 88 | rPackages.stringr 89 | rPackages.abind 90 | rPackages.foreign 91 | rPackages.downloader 92 | rPackages.memoise 93 | rPackages.lattice 94 | rPackages.microbenchmark 95 | rPackages.arules 96 | rPackages.tidyverse 97 | rPackages.devtools 98 | rPackages.pander 99 | rPackages.Rcpp 100 | rPackages.RNHANES 101 | rPackages.reticulate 102 | rPackages.humaniformat 103 | rPackages.httr 104 | rPackages.profvis 105 | rPackages.pryr 106 | rPackages.tswge 107 | rPackages.RcppArmadillo 108 | rPackages.benchmarkme 109 | python37Packages.pprintpp 110 | rPackages.jsonlite 111 | openblas 112 | julia 113 | ]; 114 | 115 | # First important part: Add here the dependencies the packages you want to install need 116 | # LD_LIBRARY_PATH="${glfw}/lib:${mesa}/lib:${freetype}/lib:${imagemagick}/lib:${portaudio}/lib:${libsndfile.out}/lib:${libxml2.out}/lib:${expat.out}/lib:${cairo.out}/lib:${pango.out}/lib:${gettext.out}/lib:${glib.out}/lib:${gtk3.out}/lib:${gdk_pixbuf.out}/lib:${cairo.out}:${tk.out}/lib:${tcl.out}/lib:${pkgs.sqlite.out}/lib:${pkgs.zlib}/lib"; 117 | shellHook = '' 118 | echo "#!/usr/bin/env Rscript" > libs.R 119 | echo "devtools::install_github('csgillespie/efficient', build_vignettes=TRUE)" >> libs.R 120 | Rscript libs.R 121 | ''; 122 | } 123 | -------------------------------------------------------------------------------- /R/loadURL.R: -------------------------------------------------------------------------------- 1 | library(jsonlite) 2 | library(RCurl) 3 | library(httr) 4 | 5 | site <- 
"https://public.opendatasoft.com/api/records/1.0/search/?dataset=titanic-passengers&rows=2000&facet=survived&facet=pclass&facet=sex&facet=age&facet=embarked" 6 | 7 | JSON2 <- GET(site) 8 | head(JSON2) 9 | # getURL of the site 10 | # convert that json to an R object with fromJSON 11 | # use str to figure out where the data we want is 12 | # have the data 13 | JSON <- getURL(site) 14 | 15 | titanic <- fromJSON(JSON) 16 | head(titanic) 17 | str(titanic) 18 | # List of 4 19 | # $ nhits : int 891 20 | # $ parameters :List of 5 21 | # ..$ dataset : chr "titanic-passengers" 22 | # ..$ timezone: chr "UTC" 23 | # ..$ rows : int 2000 24 | # ..$ format : chr "json" 25 | # ..$ facet : chr [1:5] "survived" "pclass" "sex" "age" ... 26 | # $ records :'data.frame': 891 obs. of 4 variables: 27 | # ..$ datasetid : chr [1:891] "titanic-passengers" "titanic-passengers" "titanic-passengers" "titanic-passengers" ... 28 | # ..$ recordid : chr [1:891] "eea7ba75804a635bbda037c6f1b0c3d2aa692676" "cd86858c28b1f1089d1da74a4da9c16ee7552a3a" "a9f68f2c2ffa9dc96c153cbeed93383095e5d8e9" "db6e63fcab7b6af79944143027f60f7cb66d846f" ... 29 | # ..$ fields :'data.frame': 891 obs. of 12 variables: 30 | # .. ..$ fare : num [1:891] 7.78 7.92 7.92 18.75 89.1 ... 31 | # .. ..$ name : chr [1:891] "Birkeland, Mr. Hans Martin Monsen" "Heikkinen, Miss. Laina" "Sundman, Mr. Johan Julian" "Richards, Mrs. Sidney (Emily Hocking)" ... 32 | # .. ..$ embarked : chr [1:891] "S" "S" "S" "S" ... 33 | # .. ..$ age : num [1:891] 21 26 44 24 NA 33 28 16 30 32 ... 34 | # .. ..$ parch : int [1:891] 0 0 0 3 0 0 0 0 0 0 ... 35 | # .. ..$ pclass : int [1:891] 3 3 3 2 1 3 2 2 3 2 ... 36 | # .. ..$ sex : chr [1:891] "male" "female" "male" "female" ... 37 | # .. ..$ survived : chr [1:891] "No" "Yes" "Yes" "Yes" ... 38 | # .. ..$ ticket : chr [1:891] "312992" "STON/O2. 3101282" "STON/O 2. 3101269" "29106" ... 39 | # .. ..$ passengerid: int [1:891] 409 3 415 438 850 882 884 792 287 71 ... 40 | # .. ..$ sibsp : int [1:891] 0 0 0 2 1 0 0 0 0 0 ... 41 | # .. ..$ cabin : chr [1:891] NA NA NA NA ... 42 | # ..$ record_timestamp: chr [1:891] "2016-09-20T22:34:51.313000+00:00" "2016-09-20T22:34:51.313000+00:00" "2016-09-20T22:34:51.313000+00:00" "2016-09-20T22:34:51.313000+00:00" ... 43 | # $ facet_groups:'data.frame': 5 obs. of 2 variables: 44 | # ..$ facets:List of 5 45 | # .. ..$ :'data.frame': 88 obs. of 4 variables: 46 | # .. .. ..$ count: int [1:88] 30 27 26 25 25 25 24 23 22 20 ... 47 | # .. .. ..$ path : chr [1:88] "24.0" "22.0" "18.0" "19.0" ... 48 | # .. .. ..$ state: chr [1:88] "displayed" "displayed" "displayed" "displayed" ... 49 | # .. .. ..$ name : chr [1:88] "24.0" "22.0" "18.0" "19.0" ... 50 | # .. ..$ :'data.frame': 2 obs. of 4 variables: 51 | # .. .. ..$ count: int [1:2] 577 314 52 | # .. .. ..$ path : chr [1:2] "male" "female" 53 | # .. .. ..$ state: chr [1:2] "displayed" "displayed" 54 | # .. .. ..$ name : chr [1:2] "male" "female" 55 | # .. ..$ :'data.frame': 2 obs. of 4 variables: 56 | # .. .. ..$ count: int [1:2] 549 342 57 | # .. .. ..$ path : chr [1:2] "No" "Yes" 58 | # .. .. ..$ state: chr [1:2] "displayed" "displayed" 59 | # .. .. ..$ name : chr [1:2] "No" "Yes" 60 | # .. ..$ :'data.frame': 3 obs. of 4 variables: 61 | # .. .. ..$ count: int [1:3] 491 216 184 62 | # .. .. ..$ path : chr [1:3] "3" "1" "2" 63 | # .. .. ..$ state: chr [1:3] "displayed" "displayed" "displayed" 64 | # .. .. ..$ name : chr [1:3] "3" "1" "2" 65 | # .. ..$ :'data.frame': 3 obs. of 4 variables: 66 | # .. .. ..$ count: int [1:3] 644 168 77 67 | # .. .. 
..$ path : chr [1:3] "S" "C" "Q" 68 | # .. .. ..$ state: chr [1:3] "displayed" "displayed" "displayed" 69 | # .. .. ..$ name : chr [1:3] "S" "C" "Q" 70 | # ..$ name : chr [1:5] "age" "sex" "survived" "pclass" ... 71 | # NULL 72 | -------------------------------------------------------------------------------- /R/bagboot.R: -------------------------------------------------------------------------------- 1 | library(party) 2 | library(ranger) 3 | #library(randomforest) 4 | library(gbm) 5 | 6 | 7 | data(BreastCancer, package = "mlbench") 8 | bc <- BreastCancer 9 | 10 | head(bc) 11 | # Id Cl.thickness Cell.size Cell.shape Marg.adhesion Epith.c.size Bare.nuclei Bl.cromatin 12 | # 1 1000025 5 1 1 1 2 1 3 13 | # 2 1002945 5 4 4 5 7 10 3 14 | # 3 1015425 3 1 1 1 2 2 3 15 | # 4 1016277 6 8 8 1 3 4 3 16 | # 5 1017023 4 1 1 3 2 1 3 17 | # 6 1017122 8 10 10 8 7 10 9 18 | # Normal.nucleoli Mitoses Class 19 | # 1 1 1 benign 20 | # 2 2 1 benign 21 | # 3 1 1 benign 22 | # 4 7 1 benign 23 | # 5 1 1 benign 24 | # 6 7 1 malignant 25 | 26 | bc$Id <- NULL 27 | 28 | str(bc) 29 | # 'data.frame': 699 obs. of 10 variables: 30 | # $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ... 31 | # $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ... 32 | # $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ... 33 | # $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ... 34 | # $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ... 35 | # $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ... 36 | # $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ... 37 | # $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ... 38 | # $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ... 39 | # $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ... 
40 | # NULL 41 | 42 | # do a train test split 43 | 44 | trainInd <- sample(1:nrow(bc), floor(0.7*nrow(bc))) 45 | training<- bc[trainInd,] 46 | testing <- bc[-trainInd,] 47 | tree <- ctree(Class ~ ., data = training) 48 | 49 | plot(tree) 50 | 51 | 52 | predict(tree, newdata = testing, type = "prob")[[1]][2] 53 | 54 | plot(tree) 55 | 56 | 57 | # bias variance tradeoff 58 | 59 | 60 | # bootstrap = random sample 61 | # it is here 62 | L_tree <- list() 63 | n <- nrow(df) 64 | for(s in 1:1000){ 65 | idx <- sample(1:nrow(training), size = nrow(training), replace = TRUE) 66 | L_tree[[s]] <- ctree(Class ~ ., data = training[idx,]) 67 | } 68 | 69 | 70 | # agg 71 | 72 | predict2 <- function(df) { 73 | res <- data.frame(lapply(1:1000, function(x) { 74 | predict(L_tree[[x]], newdata = df, type = "prob")[2] 75 | })) 76 | (res) 77 | } 78 | 79 | preds <- predict2(testing) 80 | preds 81 | 82 | head(bc$Class) 83 | 84 | preds_numeric <- ifelse(preds <= 0.5, 1, 2) 85 | 86 | preds_factor <- factor(preds, levels = c("benign","malignant")) 87 | 88 | 89 | caret::confusionMatrix(table(testing$Class, preds_factor)) 90 | 91 | 92 | # talk about forests and boosting now, then show an example 93 | 94 | 95 | 96 | # random forest 97 | 98 | 99 | 100 | rf <- randomForest(Class ~ ., data = na.omit(training), importance = TRUE) 101 | 102 | 103 | varImpPlot(rf) 104 | 105 | 106 | rfPred <- predict(rf, testing) 107 | rfPred 108 | 109 | # boosting 110 | 111 | 112 | 113 | library(xgboost) 114 | library(dplyr) 115 | 116 | features <- training %>% dplyr::select(-Class) 117 | xgb <- xgboost(data = data.matrix(features), label = training$Class, nrounds = 300) 118 | 119 | 120 | features2 <- testing %>% dplyr::select(-Class) %>% data.matrix 121 | predict(xgb, features2) 122 | 123 | 124 | library(mlr) 125 | 126 | 127 | -------------------------------------------------------------------------------- /R/ml1.R: -------------------------------------------------------------------------------- 1 | library(caret) 2 | library(FNN) 3 | library(fastNaiveBayes) 4 | library(tidyverse) 5 | library(doParallel) 6 | library(foreach) 7 | library(functional) 8 | 9 | ## reading data 10 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 11 | wine <- read_csv(dataurl, col_names = F) 12 | 13 | ## Actual columns 14 | good_cols <- c("class", 15 | "alcohol", 16 | 'malic_acid', 17 | 'ash', 18 | 'alkalinity', 19 | 'magnesium', 20 | 'total_phenols', 21 | 'flavanoids', 22 | 'nonflavonoids_phenols', 23 | 'proanthocyanins', 24 | 'color_intensity', 25 | 'hue', 26 | 'dilution', 27 | 'proline' 28 | ) 29 | 30 | ## Column rename (data processing) 31 | fix_cols <- function(df){ 32 | colnames(df) <- good_cols 33 | df$class <- (df$class) 34 | df 35 | } 36 | wine <- fix_cols(wine) 37 | 38 | ## Train test split 39 | split <- function(df, p = 0.75, list = FALSE, ...) 
{ 40 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 41 | cat("creating training dataset...\n") 42 | training <<- df[train_ind, ] 43 | cat("completed training dataset, creating test set\n") 44 | test <<- df[-train_ind, ] 45 | cat("done") 46 | } 47 | split(wine) 48 | 49 | ## Making Model 50 | train_knn <- function(k) { 51 | FNN::knn(train = training[-1], test = test[-1], cl = as.factor(training$class), k) 52 | } 53 | 54 | ## Making metrics 55 | conmat <- function(predicted, expected){ 56 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 57 | cm 58 | } 59 | f1_score <- function(predicted, expected, positive.class="1") { 60 | cm = conmat(predicted, expected) 61 | 62 | precision <- diag(cm) / colSums(cm) 63 | recall <- diag(cm) / rowSums(cm) 64 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 65 | 66 | #Assuming that F1 is zero when it's not possible compute it 67 | f1[is.na(f1)] <- 0 68 | 69 | #Binary F1 or Multi-class macro-averaged F1 70 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 71 | } 72 | 73 | accuracy <- function(predicted, expected){ 74 | cm <- conmat(predicted, expected) 75 | sum(diag(cm)/length(test$class)) 76 | } 77 | 78 | ## Testing 79 | 3 %>% train_knn %>% conmat(test) 80 | 3 %>% train_knn %>% accuracy(test) 81 | 3 %>% train_knn %>% f1_score(test) 82 | get_scores <- function(k){ 83 | predictions <- train_knn(k) 84 | f1 <- f1_score(predictions,test) 85 | acc <- accuracy(predictions,test) 86 | scores <- c(accuracy = acc, f1 = f1) 87 | scores 88 | } 89 | get_scores(3) 90 | 91 | 92 | ## Tuning K without a grid search 93 | 94 | registerDoParallel(detectCores() -1) 95 | 96 | # parallel KNN for k 1:33 97 | # see how it is literally instantaneous 98 | foreach(i = 1:33, .combine = "rbind", .multicombine = T) %dopar% get_scores(i) %>% as_tibble -> scores 99 | 100 | 101 | 102 | 103 | scores$index <- 1:33 104 | 105 | library(ggplot2) 106 | library(reshape2) 107 | 108 | ggplot(data = melt(scores, id.vars = "index", variable.name = "metric"), aes(index, value)) + geom_line(aes(color = metric)) 109 | 110 | # making a pipeline for knn predictions 111 | readfile <- function(url) read_csv(url, col_names = F) 112 | 113 | split2 <- function(df, p = 0.75, list = FALSE, ...) 
{ 114 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 115 | training <- df[train_ind, ] 116 | test <- df[-train_ind, ] 117 | list(training = training, test = test) 118 | } 119 | res <- split2(wine) 120 | 121 | make_knn <- function(k) { 122 | function(lst) { 123 | training <- lst$training 124 | test <- lst$test 125 | train_knn(k) 126 | } 127 | } 128 | 129 | knn11 <- make_knn(11) 130 | knn_pipeline <- Compose(readfile, fix_cols, split2,knn11) 131 | knn_pipeline(dataurl) 132 | 133 | 134 | caret_knn <- function(k) { 135 | knn3Train(train = training[-1], test = test[-1], cl = as.factor(training$class), k = k, prob = F) 136 | 137 | } 138 | caret_knn(11) 139 | train_knn(11) 140 | -------------------------------------------------------------------------------- /R/analysis2.R: -------------------------------------------------------------------------------- 1 | source("helpers.R") 2 | source("preprocessing.R") 3 | library(forecast) 4 | library(ggplot2) 5 | library(cowplot) 6 | china <- preprocess("../data/") 7 | names(china) 8 | # [1] "ChengduPM_" "ShenyangPM_" "ShanghaiPM_" "BeijingPM_" "GuangzhouPM_" 9 | 10 | beij <- china$BeijingPM_ 11 | 12 | # make a multiseasonal time series 13 | # in this section we will look at multiple seasonalities in 14 | # our hourly data, a daily one, a weekly one, and a yearly one 15 | bj <- beij$PM_US %>>% tsclean %>>% abs %>>% 16 | msts(seasonal.periods = c(24, (24*7), (24*365.25))) 17 | 18 | # training set 19 | train <- beij$PM_US %>>% tsclean %>>% abs %>>% 20 | window(end=6) %>>% 21 | msts(seasonal.periods = c(24, (24*7), (24*365.25))) 22 | 23 | p <- autoplot(bj,ylab = "ppm" , xlab = "hour") 24 | 25 | # an example forecast 26 | bj %>>% mstl %>>% autoplot 27 | # long long run time, probably skip 1if you are running this. 
Look up tbats time series to see more 28 | bjbats <- tbats(bj) 29 | # save our model 30 | # save(bjbats, file = "bjbats.Rda") 31 | # load("bjbats.Rda") 32 | plot(bjbats) 33 | summary(bjbats) 34 | # Length Class Mode 35 | # lambda 1 -none- numeric 36 | # alpha 1 -none- numeric 37 | # beta 1 -none- numeric 38 | # damping.parameter 1 -none- numeric 39 | # gamma.one.values 3 -none- numeric 40 | # gamma.two.values 3 -none- numeric 41 | # ar.coefficients 0 -none- NULL 42 | # ma.coefficients 0 -none- NULL 43 | # likelihood 1 -none- numeric 44 | # optim.return.code 1 -none- numeric 45 | # variance 1 -none- numeric 46 | # AIC 1 -none- numeric 47 | # parameters 2 -none- list 48 | # seed.states 24 -none- numeric 49 | # fitted.values 52584 msts numeric 50 | # errors 52584 msts numeric 51 | # x 1262016 -none- numeric 52 | # seasonal.periods 3 -none- numeric 53 | # k.vector 3 -none- numeric 54 | # y 52584 msts numeric 55 | # p 1 -none- numeric 56 | # q 1 -none- numeric 57 | # call 2 -none- call 58 | # series 1 -none- character 59 | # method 1 -none- character 60 | 61 | bjpred <- predict(bjbats, h = 150) 62 | plot(bjpred, include = 365) 63 | 64 | 65 | # instead, lets do harmonic regression 66 | # here, we are doing a normal arima, but we are also going to 67 | # try and use a series of sines and cosines to fit the seasonal shape 68 | bjtest <- auto.arima(train, seasonal = F, lambda = 0, 69 | xreg = fourier(train, K = rep(10,3))) 70 | 71 | # save(bjtest, file = "bjtest.Rda") 72 | # load("bjtest.Rda") 73 | # load("bjharm.Rda") 74 | 75 | # 3 year forecast 76 | bjfore <- forecast( 77 | bjharm, 78 | xreg = fourier(bj, K = rep(10,3)), 79 | h = 365*24*3 80 | ) 81 | 82 | # test set 83 | test <- window(bj, start = 6) 84 | length(test) 85 | bjfor.test <- Arima(test, model = bjtest, 86 | xreg = fourier( 87 | test, 88 | K = rep(10,3) 89 | )) 90 | getASE(bjfor.test) 91 | # [1] 416.4127 92 | bjpredt <- bjfore %>>% forecast(h = length(test)) 93 | str(fitted( bjpredt )) 94 | fitted(bjpredt) %>>% 95 | autoplot + autolayer(test) 96 | 97 | plot train vs test 98 | trainf <- data.frame(ppm = train, type = "actual") 99 | predf <- data.frame(ppm = as.numeric( fitted(bjfor.test) ), type = "predicted", 100 | t = seq_along(fitted(bjfor.test))) 101 | testf <- data.frame(ppm = as.numeric(test), type = "actual",t = seq_along(test)) 102 | 103 | l <- list(testf, predf) 104 | p <- ggplot() + theme_economist() + scale_color_few(palette = "Dark" ) 105 | doplot <- function(df){ 106 | p <<- p + geom_line(data = df, 107 | aes( 108 | x = t, 109 | y = ppm, 110 | color = type 111 | )) 112 | } 113 | out <- lapply(l, doplot) 114 | 115 | out 116 | -------------------------------------------------------------------------------- /R/ml/data/abalone.names: -------------------------------------------------------------------------------- 1 | 1. Title of Database: Abalone data 2 | 3 | 2. Sources: 4 | 5 | (a) Original owners of database: 6 | Marine Resources Division 7 | Marine Research Laboratories - Taroona 8 | Department of Primary Industry and Fisheries, Tasmania 9 | GPO Box 619F, Hobart, Tasmania 7001, Australia 10 | (contact: Warwick Nash +61 02 277277, wnash@dpi.tas.gov.au) 11 | 12 | (b) Donor of database: 13 | Sam Waugh (Sam.Waugh@cs.utas.edu.au) 14 | Department of Computer Science, University of Tasmania 15 | GPO Box 252C, Hobart, Tasmania 7001, Australia 16 | 17 | (c) Date received: December 1995 18 | 19 | 20 | 3. 
Past Usage: 21 | 22 | Sam Waugh (1995) "Extending and benchmarking Cascade-Correlation", PhD 23 | thesis, Computer Science Department, University of Tasmania. 24 | 25 | -- Test set performance (final 1044 examples, first 3133 used for training): 26 | 24.86% Cascade-Correlation (no hidden nodes) 27 | 26.25% Cascade-Correlation (5 hidden nodes) 28 | 21.5% C4.5 29 | 0.0% Linear Discriminate Analysis 30 | 3.57% k=5 Nearest Neighbour 31 | (Problem encoded as a classification task) 32 | 33 | -- Data set samples are highly overlapped. Further information is required 34 | to separate completely using affine combinations. Other restrictions 35 | to data set examined. 36 | 37 | David Clark, Zoltan Schreter, Anthony Adams "A Quantitative Comparison of 38 | Dystal and Backpropagation", submitted to the Australian Conference on 39 | Neural Networks (ACNN'96). Data set treated as a 3-category classification 40 | problem (grouping ring classes 1-8, 9 and 10, and 11 on). 41 | 42 | -- Test set performance (3133 training, 1044 testing as above): 43 | 64% Backprop 44 | 55% Dystal 45 | -- Previous work (Waugh, 1995) on same data set: 46 | 61.40% Cascade-Correlation (no hidden nodes) 47 | 65.61% Cascade-Correlation (5 hidden nodes) 48 | 59.2% C4.5 49 | 32.57% Linear Discriminate Analysis 50 | 62.46% k=5 Nearest Neighbour 51 | 52 | 53 | 4. Relevant Information Paragraph: 54 | 55 | Predicting the age of abalone from physical measurements. The age of 56 | abalone is determined by cutting the shell through the cone, staining it, 57 | and counting the number of rings through a microscope -- a boring and 58 | time-consuming task. Other measurements, which are easier to obtain, are 59 | used to predict the age. Further information, such as weather patterns 60 | and location (hence food availability) may be required to solve the problem. 61 | 62 | From the original data examples with missing values were removed (the 63 | majority having the predicted value missing), and the ranges of the 64 | continuous values have been scaled for use with an ANN (by dividing by 200). 65 | 66 | Data comes from an original (non-machine-learning) study: 67 | 68 | Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and 69 | Wes B Ford (1994) "The Population Biology of Abalone (_Haliotis_ 70 | species) in Tasmania. I. Blacklip Abalone (_H. rubra_) from the North 71 | Coast and Islands of Bass Strait", Sea Fisheries Division, Technical 72 | Report No. 48 (ISSN 1034-3288) 73 | 74 | 75 | 5. Number of Instances: 4177 76 | 77 | 78 | 6. Number of Attributes: 8 79 | 80 | 81 | 7. Attribute information: 82 | 83 | Given is the attribute name, attribute type, the measurement unit and a 84 | brief description. The number of rings is the value to predict: either 85 | as a continuous value or as a classification problem. 86 | 87 | Name Data Type Meas. 
Description 88 | ---- --------- ----- ----------- 89 | Sex nominal M, F, and I (infant) 90 | Length continuous mm Longest shell measurement 91 | Diameter continuous mm perpendicular to length 92 | Height continuous mm with meat in shell 93 | Whole weight continuous grams whole abalone 94 | Shucked weight continuous grams weight of meat 95 | Viscera weight continuous grams gut weight (after bleeding) 96 | Shell weight continuous grams after being dried 97 | Rings integer +1.5 gives the age in years 98 | 99 | Statistics for numeric domains: 100 | 101 | Length Diam Height Whole Shucked Viscera Shell Rings 102 | Min 0.075 0.055 0.000 0.002 0.001 0.001 0.002 1 103 | Max 0.815 0.650 1.130 2.826 1.488 0.760 1.005 29 104 | Mean 0.524 0.408 0.140 0.829 0.359 0.181 0.239 9.934 105 | SD 0.120 0.099 0.042 0.490 0.222 0.110 0.139 3.224 106 | Correl 0.557 0.575 0.557 0.540 0.421 0.504 0.628 1.0 107 | 108 | 109 | 8. Missing Attribute Values: None 110 | 111 | 112 | 9. Class Distribution: 113 | 114 | Class Examples 115 | ----- -------- 116 | 1 1 117 | 2 1 118 | 3 15 119 | 4 57 120 | 5 115 121 | 6 259 122 | 7 391 123 | 8 568 124 | 9 689 125 | 10 634 126 | 11 487 127 | 12 267 128 | 13 203 129 | 14 126 130 | 15 103 131 | 16 67 132 | 17 58 133 | 18 42 134 | 19 32 135 | 20 26 136 | 21 14 137 | 22 6 138 | 23 9 139 | 24 2 140 | 25 1 141 | 26 1 142 | 27 2 143 | 29 1 144 | ----- ---- 145 | Total 4177 146 | -------------------------------------------------------------------------------- /pres/ml1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "machine learning in R" 3 | author: "David Josephs" 4 | --- 5 | 6 | # Setup 7 | 8 | First, we will load all required libraries for these examples: 9 | 10 | ```{r, message = F, warning = F} 11 | library(caret) 12 | library(FNN) 13 | library(fastNaiveBayes) 14 | library(tidyverse) 15 | library(doParallel) 16 | library(foreach) 17 | library(functional) 18 | library(ROCR) 19 | ``` 20 | 21 | ## Data Loading 22 | 23 | ```{r} 24 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 25 | 26 | wine <- read_csv(dataurl, col_names = F) 27 | tail(wine) 28 | ``` 29 | 30 | ### Data Fixing 31 | 32 | We want good column names, so we will follow the names here: [wine analysis link](http://dataaspirant.com/2017/01/09/knn-implementation-r-using-caret-package/) 33 | 34 | ```{r} 35 | good_cols <- c("class", 36 | "alcohol", 37 | 'malic_acid', 38 | 'ash', 39 | 'alkalinity', 40 | 'magnesium', 41 | 'total_phenols', 42 | 'flavanoids', 43 | 'nonflavonoids_phenols', 44 | 'proanthocyanins', 45 | 'color_intensity', 46 | 'hue', 47 | 'dilution', 48 | 'proline' 49 | ) 50 | 51 | fix_cols <- function(df){ 52 | colnames(df) <- good_cols 53 | df$class <- (df$class) 54 | df 55 | } 56 | wine <- fix_cols(wine) 57 | ``` 58 | 59 | ## Train test split 60 | 61 | ```{r} 62 | set.seed(12345) 63 | ## WARNING: Danger function 64 | split <- function(df, p = 0.75, list = FALSE, ...) 
{ 65 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 66 | cat("creating training dataset...\n") 67 | training <<- df[train_ind, ] 68 | cat("completed training dataset, creating test set\n") 69 | test <<- df[-train_ind, ] 70 | cat("done") 71 | } 72 | split(wine) 73 | ``` 74 | 75 | # Exploratory data analysis 76 | 77 | Do this 78 | 79 | # Picking a knn model 80 | 81 | ## what is knn 82 | 83 | ## Picking a value for k 84 | 85 | There are other methods which we will explore later using k-folds CV 86 | 87 | For now, we will use the ASE to find the best value for k 88 | 89 | ```{r} 90 | train_knn <- function(k) { 91 | knn(train = training[-1], test = test[-1], cl = as.factor(training$class), k) 92 | } 93 | ``` 94 | 95 | ### Metrics for classification 96 | 97 | We can't use ASE, so what do we do? 98 | 99 | ```{r} 100 | conmat <- function(predicted, expected){ 101 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 102 | cm 103 | } 104 | f1_score <- function(predicted, expected, positive.class="1") { 105 | cm = conmat(predicted, expected) 106 | 107 | precision <- diag(cm) / colSums(cm) 108 | recall <- diag(cm) / rowSums(cm) 109 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 110 | 111 | #Assuming that F1 is zero when it's not possible compute it 112 | f1[is.na(f1)] <- 0 113 | 114 | #Binary F1 or Multi-class macro-averaged F1 115 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 116 | } 117 | 118 | accuracy <- function(predicted, expected){ 119 | cm <- conmat(predicted, expected) 120 | sum(diag(cm)/length(test$class)) 121 | } 122 | ``` 123 | 124 | lets test it out 125 | ```{r} 126 | 3 %>% train_knn %>% conmat(test) 127 | 3 %>% train_knn %>% accuracy(test) 128 | 3 %>% train_knn %>% f1_score(test) 129 | ``` 130 | 131 | ### Tuning in parallel! 132 | 133 | Lets write a function that then gets us the accuracy and the f1-score for a given model: 134 | 135 | ```{r} 136 | library(glue) 137 | get_scores <- function(k){ 138 | predictions <- train_knn(k) 139 | f1 <- f1_score(predictions,test) 140 | acc <- accuracy(predictions,test) 141 | scores <- c(accuracy = acc, f1 = f1) 142 | scores 143 | } 144 | get_scores(3) 145 | ``` 146 | 147 | Now its time to get wild, we are going to use the foreach and doparallel libraries to get the scores for everyone! 148 | 149 | ```{r} 150 | registerDoParallel(detectCores() -1) 151 | 152 | # parallel KNN for k 1:33 153 | # see how it is literally instantaneous 154 | foreach(i = 1:33, .combine = "rbind", .multicombine = T) %dopar% get_scores(i) %>% as_tibble -> scores 155 | 156 | 157 | 158 | 159 | scores$index <- 1:33 160 | 161 | library(ggplot2) 162 | library(reshape2) 163 | 164 | ggplot(data = melt(scores, id.vars = "index", variable.name = "metric"), aes(index, value)) + geom_line(aes(color = metric)) 165 | ``` 166 | 167 | So it looks like we will pick 11 or 13 nearest neighbors. Now lets make this whole process into a little pipeline 168 | 169 | ## Pipeline: KNN 170 | 171 | ```{r} 172 | 173 | readfile <- function(url) read_csv(url, col_names = F) 174 | 175 | split2 <- function(df, p = 0.75, list = FALSE, ...) 
{ 176 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 177 | training <- df[train_ind, ] 178 | test <- df[-train_ind, ] 179 | list(training = training, test = test) 180 | } 181 | res <- split2(wine) 182 | 183 | make_knn <- function(k) { 184 | function(lst) { 185 | training <- lst$training 186 | test <- lst$test 187 | train_knn(k) 188 | } 189 | } 190 | 191 | knn11 <- make_knn(11) 192 | knn_pipeline <- Compose(readfile, fix_cols, split2,knn11) 193 | knn_pipeline(dataurl) 194 | ``` 195 | -------------------------------------------------------------------------------- /pres/copyonmodify.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R copy on modify" 3 | author: "David Josephs" 4 | output: rmarkdown::github.document 5 | --- 6 | 7 | For this little lesson, we will learn about the cool things R does when we copy objects, and some of the pitfalls of for loops. 8 | We will be using Hadley's pryr package, which "prys back the covers of R" 9 | 10 | 11 | ```r 12 | library(pryr) 13 | size_and_addr <- function(x){ 14 | cat(rep('-',30), '\n') 15 | cat("Object size: ", object_size(x),"\n") 16 | cat(rep('-',30), '\n') 17 | cat("Address in memory: ", address(x)) 18 | } 19 | ``` 20 | 21 | First, let us create a simple data frame: 22 | 23 | 24 | ```r 25 | a <- data.frame(matrix(1:9, nrow=3)) 26 | ``` 27 | 28 | Next, let's see how big it is in your computer's memory (RAM), and where it is stored 29 | 30 | 31 | ```r 32 | size_and_addr(a) 33 | ``` 34 | 35 | ``` 36 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 37 | ## Object size: 1032 38 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 39 | ## Address in memory: 0x2fabf60 40 | ``` 41 | 42 | Now lets create a new object, b, which is just a, and then see where it lies in your computer's memory. Lets also create a new object, x, for later use 43 | 44 | 45 | ```r 46 | x <- a 47 | b <- a 48 | 49 | size_and_addr(b) 50 | ``` 51 | 52 | ``` 53 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 54 | ## Object size: 1032 55 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 56 | ## Address in memory: 0x2f32c88 57 | ``` 58 | 59 | ```r 60 | size_and_addr(x) 61 | ``` 62 | 63 | ``` 64 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 65 | ## Object size: 1032 66 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 67 | ## Address in memory: 0x28c80b0 68 | ``` 69 | 70 | It is in the same place, which is pretty cool and efficient. But what happens if we modify b and leave A the same? 71 | 72 | 73 | ```r 74 | b$X4 <- c(10:12) 75 | address(a) 76 | ``` 77 | 78 | ``` 79 | ## [1] "0x2e813e8" 80 | ``` 81 | 82 | ```r 83 | size_and_addr(b) 84 | ``` 85 | 86 | ``` 87 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 88 | ## Object size: 1152 89 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 90 | ## Address in memory: 0x18ce4b8 91 | ``` 92 | 93 | So on modification, we create a copy, and put it in a new place. What do you think the size of a and b is? 94 | 95 | 96 | ```r 97 | object_size(a,b) 98 | ``` 99 | 100 | ``` 101 | ## 1.48 kB 102 | ``` 103 | 104 | Pretty cool, R is saving you memory. All the columns that match are stored in the same location, and only the new ones take more space. 
105 | 106 | 107 | ```r 108 | address(x) 109 | ``` 110 | 111 | ``` 112 | ## [1] "0x2e813e8" 113 | ``` 114 | 115 | ```r 116 | size_and_addr(a) 117 | ``` 118 | 119 | ``` 120 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 121 | ## Object size: 1032 122 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 123 | ## Address in memory: 0x3f33038 124 | ``` 125 | 126 | ```r 127 | object_size(a,x) 128 | ``` 129 | 130 | ``` 131 | ## 1.03 kB 132 | ``` 133 | 134 | ```r 135 | x <- rbind(x,1:3) 136 | size_and_addr(x) 137 | ``` 138 | 139 | ``` 140 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 141 | ## Object size: 1032 142 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 143 | ## Address in memory: 0x406d818 144 | ``` 145 | 146 | ```r 147 | object_size(a,x) 148 | ``` 149 | 150 | ``` 151 | ## 1.53 kB 152 | ``` 153 | 154 | 155 | Note that this is a little bigger. This is because row indexes take up a little more space than columns. 156 | 157 | Lets now see what happens in a for loop, when we "grow a vector". Lets also run our code effic 158 | 159 | 160 | ```r 161 | vec <- c() 162 | size_and_addr(vec) 163 | ``` 164 | 165 | ``` 166 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 167 | ## Object size: 0 168 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 169 | ## Address in memory: 0x446bd88 170 | ``` 171 | 172 | ```r 173 | vec[1] <- 1 174 | size_and_addr(vec) 175 | ``` 176 | 177 | ``` 178 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 179 | ## Object size: 56 180 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 181 | ## Address in memory: 0x4532240 182 | ``` 183 | 184 | ```r 185 | vec[2] <- 2 186 | size_and_addr(vec) 187 | ``` 188 | 189 | ``` 190 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 191 | ## Object size: 64 192 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 193 | ## Address in memory: 0x45f86f8 194 | ``` 195 | 196 | ```r 197 | vec[3] <- 3 198 | size_and_addr(vec) 199 | ``` 200 | 201 | ``` 202 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 203 | ## Object size: 80 204 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 205 | ## Address in memory: 0x46c6820 206 | ``` 207 | 208 | ```r 209 | vec[4] <- 4 210 | size_and_addr(vec) 211 | ``` 212 | 213 | ``` 214 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 215 | ## Object size: 80 216 | ## - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 217 | ## Address in memory: 0x478ccd8 218 | ``` 219 | 220 | Wow, so each time we add an element to a vector, we are not only using more memory, but actually moving our object from point to point. This is not a fast process, which is why doing a dirty for loop takes so long in R. 
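For example, here is a small sketch (an added illustration, not part of the original lesson) of the same loop written with a "grown" vector and with a preallocated one:

```r
# growing: the vector gets re-allocated and moved on many iterations
grow <- function(n) {
  out <- c()
  for (i in 1:n) out[i] <- i^2
  out
}

# preallocating: the vector is created once, at its final length
prealloc <- function(n) {
  out <- numeric(n)
  for (i in 1:n) out[i] <- i^2
  out
}

identical(grow(10), prealloc(10))
```

Both return the same result, but only the grown version has to keep copying `out` to a new address as it gets longer.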
Instead, we should ***IN GENERAL*** use functions which call speedy compiled code, such as the apply family and/or purr::map 221 | 222 | For futher reading, see: 223 | [Row wise modification in a loop](https://milesmcbain.xyz/rstats-anti-pattern-row-wise/) 224 | -------------------------------------------------------------------------------- /R/json2gif.R: -------------------------------------------------------------------------------- 1 | library(jsonlite) 2 | library(ggplot2) 3 | ## requires: jsonlite, ggplot2 4 | 5 | myjson <- fromJSON("output.json") 6 | 7 | 8 | myjson$bodies 9 | # [[1]] 10 | # body pos.x pos.y mass 11 | # 1 0 1 1 10 12 | # 2 1 3 3 10 13 | # 3 2 5 5 100 14 | # 15 | # [[2]] 16 | # body pos.x pos.y mass 17 | # 1 0 2.75 2.75 10 18 | # 2 1 3.75 3.75 10 19 | # 3 2 4.75 4.75 100 20 | # 21 | # [[3]] 22 | # body pos.x pos.y mass 23 | # 1 0 3.625 3.625 10 24 | # 2 1 4.125 4.125 10 25 | # 3 2 4.625 4.625 100 26 | # 27 | # [[4]] 28 | # body pos.x pos.y mass 29 | # 1 0 4.0625 4.0625 10 30 | # 2 1 4.3125 4.3125 10 31 | # 3 2 4.5625 4.5625 100 32 | # 33 | # [[5]] 34 | # body pos.x pos.y mass 35 | # 1 0 4.28125 4.28125 10 36 | # 2 1 4.40625 4.40625 10 37 | # 3 2 4.53125 4.53125 100 38 | # 39 | 40 | str(myjson$bodies) 41 | 42 | x <- 1:5 43 | y <- 6:10 44 | z <- 11:15 45 | 46 | 47 | xydf <- data.frame(a = x, b = y) 48 | 49 | zdf <- data.frame(c = z) 50 | 51 | xyzdf <- data.frame(xydf, zdf) 52 | # a b c 53 | # 1 1 6 11 54 | # 2 2 7 12 55 | # 3 3 8 13 56 | # 4 4 9 14 57 | # 5 5 10 15 58 | 59 | str(xyzdf) 60 | # 'data.frame': 5 obs. of 3 variables: 61 | # $ a: int 1 2 3 4 5 62 | # $ b: int 6 7 8 9 10 63 | # $ c: int 11 12 13 14 15 64 | 65 | xyzdf 66 | # a b c 67 | # 1 1 6 11 68 | # 2 2 7 12 69 | # 3 3 8 13 70 | # 4 4 9 14 71 | # 5 5 10 15 72 | 73 | 74 | squareSum <- function(vec) { 75 | sum((vec)^2) 76 | } 77 | 78 | 79 | squareSum(z) 80 | # [1] 855 81 | # [1] 330 82 | # [1] 55 83 | 84 | squareSum(xyzdf$a) 85 | # [1] 55 86 | 87 | 88 | # *apply 89 | 90 | 91 | 92 | 93 | #lapply(data, function) 94 | 95 | 96 | 97 | lapply(xyzdf, squareSum) 98 | 99 | str(myjson$bodies[[1]]) 100 | # 'data.frame': 3 obs. of 3 variables: 101 | # $ body: int 0 1 2 102 | # $ pos :'data.frame': 3 obs. of 2 variables: 103 | # ..$ x: num 1 3 5 104 | # ..$ y: num 1 3 5 105 | # $ mass: num 10 10 100 106 | # NULL 107 | 108 | 109 | 110 | str(data.frame(myjson$bodies[[1]])) 111 | # 'data.frame': 3 obs. of 3 variables: 112 | # $ body: int 0 1 2 113 | # $ pos :'data.frame': 3 obs. of 2 variables: 114 | # ..$ x: num 1 3 5 115 | # ..$ y: num 1 3 5 116 | # $ mass: num 10 10 100 117 | # NULL 118 | 119 | 120 | 121 | dfs <- lapply(myjson$bodies, data.frame, stringsAsFactors = FALSE ) 122 | # [[1]] 123 | # body pos.x pos.y mass 124 | # 1 0 1 1 10 125 | # 2 1 3 3 10 126 | # 3 2 5 5 100 127 | # 128 | # [[2]] 129 | # body pos.x pos.y mass 130 | # 1 0 2.75 2.75 10 131 | # 2 1 3.75 3.75 10 132 | # 3 2 4.75 4.75 100 133 | # 134 | # [[3]] 135 | # body pos.x pos.y mass 136 | # 1 0 3.625 3.625 10 137 | # 2 1 4.125 4.125 10 138 | # 3 2 4.625 4.625 100 139 | # 140 | # [[4]] 141 | # body pos.x pos.y mass 142 | # 1 0 4.0625 4.0625 10 143 | # 2 1 4.3125 4.3125 10 144 | # 3 2 4.5625 4.5625 100 145 | # 146 | # [[5]] 147 | # body pos.x pos.y mass 148 | # 1 0 4.28125 4.28125 10 149 | # 2 1 4.40625 4.40625 10 150 | # 3 2 4.53125 4.53125 100 151 | # 152 | 153 | str(dfs) 154 | # List of 5 155 | # $ :'data.frame': 3 obs. of 3 variables: 156 | # ..$ body: int [1:3] 0 1 2 157 | # ..$ pos :'data.frame': 3 obs. of 2 variables: 158 | # .. 
..$ x: num [1:3] 1 3 5 159 | # .. ..$ y: num [1:3] 1 3 5 160 | # ..$ mass: num [1:3] 10 10 100 161 | # $ :'data.frame': 3 obs. of 3 variables: 162 | # ..$ body: int [1:3] 0 1 2 163 | # ..$ pos :'data.frame': 3 obs. of 2 variables: 164 | # .. ..$ x: num [1:3] 2.75 3.75 4.75 165 | # .. ..$ y: num [1:3] 2.75 3.75 4.75 166 | # ..$ mass: num [1:3] 10 10 100 167 | # $ :'data.frame': 3 obs. of 3 variables: 168 | # ..$ body: int [1:3] 0 1 2 169 | # ..$ pos :'data.frame': 3 obs. of 2 variables: 170 | # .. ..$ x: num [1:3] 3.62 4.12 4.62 171 | # .. ..$ y: num [1:3] 3.62 4.12 4.62 172 | # ..$ mass: num [1:3] 10 10 100 173 | # $ :'data.frame': 3 obs. of 3 variables: 174 | # ..$ body: int [1:3] 0 1 2 175 | # ..$ pos :'data.frame': 3 obs. of 2 variables: 176 | # .. ..$ x: num [1:3] 4.06 4.31 4.56 177 | # .. ..$ y: num [1:3] 4.06 4.31 4.56 178 | # ..$ mass: num [1:3] 10 10 100 179 | 180 | p <- ggplot() 181 | 182 | p+geom_point(data = dfs[[1]], aes(x = pos$x, y = pos$y, color = as.factor(body))) 183 | 184 | 185 | myplot <- function(df) { 186 | p + geom_point(data = df, aes(x = pos$x, y = pos$y, color = as.factor(body))) + 187 | coord_cartesian(xlim = c(0,5), ylim = c(0,5)) 188 | } 189 | png() 190 | lapply(dfs,myplot) 191 | dev.off() 192 | 193 | # imageMagick 194 | system("convert -delay 60 *.png example_1.gif") 195 | 196 | 197 | 198 | lazplotter <- function(json) { 199 | myjson <- fromJSON(json) 200 | dfs <- lapply(myjson$bodies, data.frame, stringsAsFactors = FALSE) 201 | p <- ggplot() 202 | plot <- function(df) { 203 | p <- p + geom_point(data = df, aes(pos$x, pos$y, 204 | color = as.factor(body))) + 205 | coord_cartesian(xlim = c(0,5), ylim = c(0,5)) 206 | } 207 | lapply(dfs,plot) 208 | } 209 | 210 | lazplotter("output.json") 211 | 212 | 213 | 214 | library(dplyr) # for bind_rows 215 | tidyplotter <- function(json) { 216 | jason <- fromJSON(json) 217 | dfs <- lapply(jason$bodies, data.frame, stringsAsFactors = FALSE) 218 | 219 | pos2vec <- function(l) { 220 | df <- l$pos 221 | x <- df$x 222 | y <- df$y 223 | data.frame("body" = l$body, "x" = x, "y" = y, "mass" = l$mass) 224 | 225 | } 226 | 227 | dfs <- lapply(dfs,pos2vec) 228 | dfs %>% bind_rows(.id = "step") -> tabl 229 | ggplot(tabl, aes(x,y, label = step)) + geom_point(aes(color = step, size = 5)) + facet_wrap(body~.) + geom_line() +guides(size = F) 230 | } 231 | tidyplotter("output.json") 232 | 233 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # TeachR 2 | ![alt text](https://imgur.com/gOXtA3f.jpg) 3 | ## Repository for lectures and notes from my office hours etc 4 | 5 | ### Contents: 6 | 7 | * [pres](pres/): contains all Rmarkdown and knitted results. Contents below: 8 | * [ml2](pres/ml2.Rmd) 9 | * Contains an overview of caret, knn, and naive bayes 10 | * [ml1](pres/ml1.Rmd) 11 | * contains overcomplicated knn code, good for an excercise in overly fancy R code and not much else 12 | * [html-scraping](pres/html-scraping.md) 13 | * Contains a primer on scraping websites with `rvest`, as well as a slight introduction to pipes, and a few user created functions. Read this then [scraping.R](R/scraping.R) as a followup with more code to play with. 14 | * [copyonmodify](pres/copyonmodify.md) 15 | * Discusses how we should avoid for loops and "growing vectors" in general, due to some of the fun little quirks of R. 16 | * [R](R/): contains all R code. 
Contents below: 17 | * [eda1/R](R/eda1.R) 18 | * Feature elimination tricks in R!! 19 | * [final.R](R/final.R) 20 | * R example of automated EDA, some training of models, from our final office hours :( 21 | * [logo.R](R/logo.R) 22 | * R code used to make the logo for this repository 23 | * [scraping.R](R/scraping.R) 24 | * Example html scraping code 25 | * [tidy1.R](R/tidy1.R) 26 | * a quick primer on dplyr 27 | * [count-and-pipes.R](R/count-and-pipes.R) 28 | * Counting with dplyr and piping with magrittr 29 | * [applied.R](R/applied.R) 30 | * Contains the first really advanced stuff that we will do in here, apply/lapply review, which is the equivalent mathematically of mapping, anonymous functions, lists of functions, "function factories" (closures), and finally brings everything together in one crazy example. Will be made into a .Rmd soon enough 31 | * [lm_1.R](R/lm_1.R) 32 | * Contains the basics of linear modeling 33 | * [ml1.R](R/ml1.R) 34 | * Contains overcomplicated knn 35 | * [json2gif](R/json2gif.R) 36 | * Contains simple sample code where a JSON is used to show the movement of bodies through time 37 | * [src](src/): contains C/C++ code that is used to speed up R. Currently this is empty, and we may not use this directory. Interested parties can make an issue request, email me, or message me on slack and we will work on this. For now, it is enough to know this is part of the structure of a big R project. 38 | * [fig](fig/): contains images and figures generated 39 | * [data](data/): contains minimal data. It is best to save data here not in csv format, but as RData/rda, because it is much much lighter. 40 | 41 | 42 | 43 | 44 | 45 | ## The basic structure of a good R project 46 | 47 | This repository is a glimpse of what a well structured R project looks like. In general, we put R code in the **R** directory, the pretty output in its own directory, images in a directory, and low level code in a src directory. If you intend on developing an R package, which i would be happy to discuss, a good reading is [Hadley's "package structure"](http://r-pkgs.had.co.nz/package.html). This is also just useful information to use on your own R projects. I will provide (opinionated) thoughts on workflow, project structure, etc. later on. 48 | 49 | ## FAQ 50 | 51 | ### Why won't my file knit? 52 | 53 | #### Trying to use CRAN without setting a mirror 54 | 55 | Simple solution: don't put install.packages in Rmarkdown files. 56 | 57 | More complex solution: `install.packages(packagename,repos = "http://cran.us.r-project.org")` 58 | 59 | #### Cannot find my file 60 | 61 | First attempt at answering: setwd() does not work in knitr. Instead, in the R setup chunk, do `knitr::opts_knit$set(root.dir = '/path/to/root/dir/of/project')`, or set the root directory with the R studio GUI 62 | 63 | A simpler, but far less reproducible attempt is to just use the absolute path. But in general, it is better to use relative paths, so see above solution. Setting the root project dir tells knitr to execute your R code in a session where the working directory is what you specified. Then all your paths should work. 64 | 65 | A final solution, is lets say you have a Rmarkdown file in pres, and a data file in data. 
Then, we can in the rmd file, say: 66 | 67 | ```R 68 | df<-load('../data/myfile.RData') 69 | ``` 70 | 71 | ## Links to other useful sites and readings 72 | * [caret documentation](https://topepo.github.io/caret/index.html) 73 | * [ml metrics](https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9) 74 | * [naive bayes overview](https://towardsdatascience.com/whats-so-naive-about-naive-bayes-58166a6a9eba) 75 | * [naive bayes math/fast naive bayes](https://cran.r-project.org/web/packages/fastNaiveBayes/vignettes/fastnaivebayes.html) 76 | * [awesome-msds](https://github.com/drake-smu/awesome-msds-smu) 77 | * a MSDS student's repository containing awesome resources for the program 78 | * [awesome-r](https://awesome-r.com/#awesome-r) 79 | * awesome R packages 80 | * [rmarkdown manual](https://bookdown.org/yihui/rmarkdown/) 81 | * an amazing resource for knitting 82 | * [knitr options](https://yihui.name/knitr/options/) 83 | * More knitting resources 84 | * [why should I use functions](https://nicercode.github.io/guides/functions/) 85 | * Functions not only make your code more readable, but they can also make repitive tasks easier. In my ***opinion***, we should write many small functions and combine them in a bigger function. This makes our code more readable, and more overall useful. See below for an example: 86 | ```R 87 | # let us say we want to be able to take the log of any number, and if it is negative 88 | # we want to make it the absolute value. This is not directly useful, but in math 89 | # it pops up a lot (see differential equations) 90 | square <- function(x){ 91 | x*x 92 | } 93 | 94 | # yes there is an absolute value function, abs(), but this is for demonstration purpuses 95 | absval <- function(x){ 96 | sqrt(square(x)) 97 | } 98 | 99 | # We are including `...` because the log() function can take extra arguments, e.g. 100 | # base. We want to be able to have those be allowed in our function too. 101 | abslog <- function(x,...){ 102 | log(absval(x),...) 103 | } 104 | 105 | abslog(-2) 106 | # [1] 0.6931472 107 | abslog(2) 108 | # [1] 0.6931472 109 | 110 | # Now lets see the ... 111 | abslog(-10, base = 10) 112 | # [1] 1 113 | abslog(3432, base = 2) 114 | # [1] 11.74483 115 | abslog(-3432, base = 2) 116 | # [1] 11.74483 117 | ``` 118 | 119 | 120 | - [ ] more to come 121 | - [ ] even more 122 | -------------------------------------------------------------------------------- /R/knn.R: -------------------------------------------------------------------------------- 1 | # Building our own machine learning library 2 | 3 | # I am following an amazing blog series on this, which I will share. We will go a bit further than the blog posts 4 | # http://enhancedatascience.com/2018/05/23/create-your-machine-learning-library-from-scratch-with-r-3-5-knn/ 5 | # absolutely great stuff 6 | 7 | # Always set a seed, or else you can get in trouble 8 | set.seed(49) 9 | 10 | # create a train test split 11 | # Do this always! 
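# For example, a minimal sketch of a 70/30 split using the seed set above
# (an added illustration; these objects are not used elsewhere in this script):
split_idx  <- sample(seq_len(nrow(iris)), size = floor(0.7 * nrow(iris)))
iris_train <- iris[split_idx, ]
iris_test  <- iris[-split_idx, ]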
12 | # Or something similar 13 | 14 | # create KNN Class (S3 Object) 15 | # DNN for davids nearest neighbors 16 | # sqrt(x^2 + y^2 + z^2 + whatever^2) = distance 17 | # pairwise distance matrix 18 | # rank distances per observation from closest to furthest 19 | 20 | 21 | # overall structure of knn 22 | 23 | # 1 Take in data 24 | # 2 Calculate pairwise distance matrix 25 | # Note euclidean distance 26 | # find nearest neighbors 27 | 28 | # 29 | 30 | 31 | 32 | DNN <- function(x, y, k = 5){ 33 | if (!is.matrix(x)) { 34 | x <- as.matrix(x) 35 | } 36 | if (!is.matrix(y)) { 37 | y <- as.matrix(y) 38 | } 39 | results <- list() 40 | results$points <- x 41 | results$value <- y 42 | results$k <- k 43 | results <- structure(results, class = "DNN") 44 | return(results) 45 | } 46 | 47 | 48 | 49 | 50 | 51 | # f ∥xi−xj∥2=∥xi∥2+∥xj∥2−2(xi⋅xj) 52 | # will not get into math but share link afterwards 53 | # https://www.r-bloggers.com/pairwise-distances-in-r/ 54 | # https://blog.smola.org/post/969195661/in-praise-of-the-second-binomial-formula 55 | compute_pairwise_distance=function(X,Y){ 56 | xn = rowSums(X ** 2) 57 | yn = rowSums(Y ** 2) 58 | outer(xn, yn, '+') - 2 * tcrossprod(X, Y) 59 | } 60 | 61 | # prediction methods 62 | # discuss here 63 | 64 | 65 | predict 66 | # function (object, ...) 67 | # UseMethod("predict") 68 | # 69 | # 70 | 71 | ### 72 | 73 | # do knn by hand 74 | # compute distance graph 75 | 76 | # minkowski distance 77 | # jaccard distance 78 | 79 | predict.DNN = function(my_knn,x, distance = compute_pairwise_distance){ 80 | if (!is.matrix(x)) 81 | { 82 | x = as.matrix(x) 83 | } 84 | ##Compute pairwise distance 85 | dist_pair = distance(x,my_knn$points) 86 | # rank distances computing a lovely graph 87 | crossprod( 88 | apply(dist_pair,1,order) <= my_knn[['k']], 89 | my_knn[["value"]]) / my_knn[['k']] 90 | # turn points more than k neighbors away to zero, otherwise one 91 | } 92 | head(iris) 93 | iris_class <- iris 94 | # Sepal.Length Sepal.Width Petal.Length Petal.Width Species 95 | # 1 5.1 3.5 1.4 0.2 setosa 96 | # 2 4.9 3.0 1.4 0.2 setosa 97 | # 3 4.7 3.2 1.3 0.2 setosa 98 | # 4 4.6 3.1 1.5 0.2 setosa 99 | # 5 5.0 3.6 1.4 0.2 setosa 100 | # 6 5.4 3.9 1.7 0.4 setosa 101 | str(iris_class) 102 | # 'data.frame': 100 obs. of 5 variables: 103 | # $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... 104 | # $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... 105 | # $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... 106 | # $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... 107 | # $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ... 
108 | # NULL 109 | iris_class$Species = (iris_class$Species == "setosa") 110 | head(iris_class$Species) 111 | # [1] TRUE TRUE TRUE TRUE TRUE TRUE 112 | 113 | 114 | head(iris_class, 15) 115 | unique(iris_class$Species) 116 | 117 | head(as.numeric(iris_class$Species)) 118 | 119 | as.numeric(iris_class$Species) 120 | names(iris_class[,1:2]) 121 | # [1] "Sepal.Length" "Sepal.Width" 122 | 123 | 124 | # Set k to an odd number 125 | knn_class <- DNN(iris_class[,1:2], as.numeric(iris_class$Species), k = 7) 126 | 127 | 128 | # do it by hand before we show the predict function in action 129 | dists <- compute_pairwise_distance(knn_class$points, as.matrix(iris_class[,1:2])) 130 | View(dists) 131 | # ranks the distances in the graph and sorts them 132 | sorted_dists <- apply(dists, 2,order) 133 | View(sorted_dists) 134 | onehot_sorted <- sorted_dists <= knn_class$k 135 | View(onehot_sorted) 136 | head(t(onehot_sorted) %*% knn_class$value/knn_class$k) 137 | # [,1] 138 | # [1,] 1.0000000 139 | # [2,] 0.8571429 140 | # [3,] 0.8571429 141 | # [4,] 0.8571429 142 | # [5,] 1.0000000 143 | # [6,] 0.8571429 144 | 145 | 146 | head(predict(knn_class, iris_class[,1:2])) 147 | # [,1] 148 | # [1,] 1.0000000 149 | # [2,] 0.8571429 150 | # [3,] 0.8571429 151 | # [4,] 0.8571429 152 | # [5,] 1.0000000 153 | # [6,] 0.8571429 154 | 155 | 156 | x_coord = seq(min(iris_class[,1]) - 0.2,max(iris_class[,1]) + 0.2,length.out = 200) 157 | length(x_coord) 158 | y_coord = seq(min(iris_class[,2])- 0.2,max(iris_class[,2]) + 0.2 , length.out = 200) 159 | length(y_coord) 160 | coord = expand.grid(x = x_coord, y = y_coord) 161 | 162 | nrow(coord) 163 | # [1] 40000 164 | 165 | #predict probabilities 166 | coord$prob = predict(knn_class, coord[,1:2]) 167 | 168 | library(ggplot2) 169 | ggplot() + 170 | ##Ad tiles according to probabilities 171 | geom_tile(data=coord,mapping=aes(x, y, fill=prob)) + scale_fill_gradient(low = "lightblue", high = "red") + 172 | ##add points 173 | geom_point(data=iris_class,mapping=aes(Sepal.Length,Sepal.Width, shape=Species),size=3 ) + 174 | #add the labels to the plots 175 | xlab('Sepal length') + ylab('Sepal width') + ggtitle('Decision boundaries of KNN')+ 176 | #remove grey border from the tile 177 | scale_x_continuous(expand=c(0,0))+scale_y_continuous(expand=c(0,0)) 178 | 179 | # implement a confusing matrix (confusion matrix) 180 | 181 | p <- predict(knn_class, iris_class[,1:2]) 182 | head(p) 183 | # [,1] 184 | # [1,] 1.0000000 185 | # [2,] 0.8571429 186 | # [3,] 0.8571429 187 | # [4,] 0.8571429 188 | # [5,] 1.0000000 189 | # [6,] 0.8571429 190 | preds <- ifelse(p < 0.5, FALSE, TRUE) 191 | head(preds) 192 | # [,1] 193 | # [1,] TRUE 194 | # [2,] TRUE 195 | # [3,] TRUE 196 | # [4,] TRUE 197 | # [5,] TRUE 198 | # [6,] TRUE 199 | 200 | # Here we go 201 | 202 | (cm <- table(preds, iris_class$Species)) 203 | # 204 | # preds FALSE TRUE 205 | # FALSE 100 4 206 | # TRUE 0 46 207 | 208 | (acc <- sum(diag(cm))/sum(c(cm))) 209 | # [1] 0.9733333 210 | 211 | (precision <- cm[2,2]/(sum(cm[,2]))) 212 | # [1] 0.92 213 | 214 | (recall <- cm[2,2] / sum(cm[2,])) 215 | # [1] 1 216 | 217 | (F1 <- 2 * (precision * recall) / (precision + recall)) 218 | # [1] 0.9583333 219 | 220 | -------------------------------------------------------------------------------- /pres/ml2.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Machine Learning with caret: classification" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup} 7 | library(caret) 8 | library(fastNaiveBayes) 9 | 
library(readr) 10 | library(functional) 11 | library(ggplot2) 12 | library(magrittr) 13 | library(tidyverse) 14 | ``` 15 | 16 | # Classification with R 17 | 18 | ## KNN 19 | 20 | *what is knn?* 21 | 22 | ## Data loading 23 | 24 | 25 | ```{r} 26 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 27 | 28 | wine <- read_csv(dataurl, col_names = F) 29 | ``` 30 | 31 | 32 | 33 | ### Data Fixing 34 | 35 | We want good column names, so we will follow the names here: [wine analysis link](http://dataaspirant.com/2017/01/09/knn-implementation-r-using-caret-package/) 36 | 37 | ```{r} 38 | good_cols <- c("class", 39 | "alcohol", 40 | 'malic_acid', 41 | 'ash', 42 | 'alkalinity', 43 | 'magnesium', 44 | 'total_phenols', 45 | 'flavanoids', 46 | 'nonflavonoids_phenols', 47 | 'proanthocyanins', 48 | 'color_intensity', 49 | 'hue', 50 | 'dilution', 51 | 'proline' 52 | ) 53 | 54 | fix_cols <- function(df){ 55 | colnames(df) <- good_cols 56 | df$class <- as.factor(df$class) 57 | df 58 | } 59 | wine <- fix_cols(wine) 60 | glimpse(wine) 61 | ``` 62 | 63 | 64 | ```{r} 65 | set.seed(3033) 66 | ## WARNING: Danger function 67 | split <- function(df, p = 0.75, list = FALSE, ...) { 68 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 69 | cat("creating training dataset...\n") 70 | training <<- df[train_ind, ] 71 | cat("completed training dataset, creating test set\n") 72 | test <<- df[-train_ind, ] 73 | cat("done") 74 | } 75 | 76 | split(wine) 77 | 78 | ggplot(data = wine, aes(x = malic_acid, fill = class)) + geom_density() 79 | ggplot(data = wine, aes(x = alkalinity, fill = class)) + geom_density() 80 | ggplot(data = wine, aes(x = ash, fill = class)) + geom_density() 81 | ggplot(data = wine, aes(x = magnesium, fill = class)) + geom_density() 82 | ``` 83 | 84 | 85 | ## Picking a value for k 86 | 87 | First, we will set up our computer to process in parallel: 88 | 89 | ```{r} 90 | library(doParallel) 91 | numcores <- parallel::detectCores() - 1 92 | cl <- makePSOCKcluster(numcores) 93 | registerDoParallel(cl) 94 | ``` 95 | 96 | Now we will make a knn model using `caret::train`: 97 | 98 | ```{r} 99 | set.seed(3333) 100 | trainMethod <- trainControl(method = "repeatedcv", 101 | number = 10, 102 | repeats = 3) 103 | # k-folds cross validation 104 | # y ~ x 105 | knn_fit <- train(class ~ ., 106 | data = training, 107 | method = "knn", 108 | trControl = trainMethod, 109 | preProcess = c("center", "scale"), 110 | tuneLength = 10) 111 | 112 | knn_fit 113 | # k-Nearest Neighbors 114 | # 115 | # 135 samples 116 | # 13 predictor 117 | # 3 classes: '1', '2', '3' 118 | # 119 | # Pre-processing: 120 | # centered (13), scaled (13) 121 | # Resampling: Cross-Validated (10 fold, repeated 3 times) 122 | # Summary of sample sizes: 121, 122, 122, 121, 121, 121, ... 123 | # Resampling results across tuning parameters: 124 | # 125 | # k Accuracy Kappa 126 | # 5 0.9700549 0.9548756 127 | # 7 0.9676740 0.9516351 128 | # 9 0.9609280 0.9418362 129 | # 11 0.9579426 0.9370280 130 | # 13 0.9702686 0.9552588 131 | # 15 0.9722527 0.9579543 132 | # 17 0.9752442 0.9625294 133 | # 19 0.9681013 0.9519242 134 | # 21 0.9726496 0.9588742 135 | # 23 0.9726496 0.9589829 136 | # 137 | # Accuracy was used to 138 | # model using the 139 | # largest value. 140 | # The final value used 141 | # for the model was k = 17. 
142 | ``` 143 | 144 | 145 | 146 | Lets see what value for K we chose with our grid search: 147 | 148 | ```{r} 149 | plot(knn_fit) 150 | ``` 151 | 152 | 153 | ### Alternative with a known K: 154 | 155 | 156 | 157 | ```{r} 158 | 159 | 160 | knn_fit2 <- knn3(training, training$class, k = 15, prob = FALSE) 161 | knn_fit2 162 | ``` 163 | 164 | ```{r} 165 | test_pred <- predict(knn_fit, newdata = test) 166 | test_pred 167 | test_pred2 <- predict(knn_fit2, newdata = test, prob = F) 168 | test_pred2 169 | ``` 170 | 171 | # Assessing the model 172 | 173 | We do not have vector, numeric data, so how can we assess what we are doing? 174 | 175 | ## Confusion matrix 176 | 177 | * accuracy 178 | * precision 179 | * recall 180 | * F1 score 181 | 182 | ```{r} 183 | confusionMatrix(test_pred, test$class) 184 | ``` 185 | 186 | ```{r} 187 | 188 | conmat <- function(predicted, expected){ 189 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 190 | cm 191 | } 192 | conmat(test_pred, test) 193 | f1_score <- function(predicted, expected, positive.class="1") { 194 | cm = conmat(predicted, expected) 195 | precision <- diag(cm) / colSums(cm) 196 | recall <- diag(cm) / rowSums(cm) 197 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 198 | #Assuming that F1 is zero when it's not possible compute it 199 | f1[is.na(f1)] <- 0 200 | 201 | #Binary F1 or Multi-class macro-averaged F1 202 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 203 | } 204 | 205 | accuracy <- function(predicted, expected){ 206 | cm <- conmat(predicted, expected) 207 | sum(diag(cm)/length(test$class)) 208 | } 209 | get_scores <- function(predictions, test){ 210 | f1 <- f1_score(predictions,test) 211 | acc <- accuracy(predictions,test) 212 | scores <- c(accuracy = acc, f1 = f1) 213 | scores 214 | } 215 | pander(get_scores(test_pred, test)) 216 | ``` 217 | 218 | # Naive Bayes 219 | 220 | ## What is bayes theorem? 221 | ## why is it naive? 
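In short (a sketch added as an aside, with made-up numbers rather than the wine data): Bayes' theorem says the posterior probability of a class given the features is proportional to the likelihood of the features under that class times the prior probability of the class. The "naive" part is assuming the features are conditionally independent given the class, so the joint likelihood is just the product of the per-feature likelihoods.

```{r}
# toy posterior by hand: two classes, two features assumed independent
prior <- c(class1 = 0.5, class2 = 0.5)              # P(class)
lik   <- c(class1 = 0.2 * 0.7, class2 = 0.6 * 0.1)  # P(x1 | class) * P(x2 | class)
post  <- prior * lik / sum(prior * lik)             # normalize over classes
post
```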
222 | 223 | ## Doing it 224 | 225 | ```{r} 226 | nb_fit <- train(training, 227 | training$class, 228 | trControl = trainMethod, 229 | method = "nb", 230 | tuneLength = 10 231 | ) 232 | nb_fit 233 | plot(nb_fit) 234 | ``` 235 | 236 | ```{r} 237 | nb_pred <- predict(nb_fit, newdata = test) 238 | nb_pred 239 | ``` 240 | 241 | ```{r} 242 | confusionMatrix(nb_pred, test$class) 243 | get_scores(nb_pred, test) 244 | ``` 245 | 246 | 247 | ## Why not do it FAST 248 | 249 | detect the distribution: 250 | ```{r} 251 | library(fastNaiveBayes) 252 | y <- training$class 253 | x <- training[-1] 254 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 255 | dist 256 | ``` 257 | 258 | Make a model: 259 | 260 | ```{r} 261 | fast_nb_fit <- fastNaiveBayes.mixed(x,y) 262 | fast_nb_fit 263 | ``` 264 | 265 | Make a prediction 266 | 267 | ```{r} 268 | fast_pred <- predict(fast_nb_fit, newdata = test[-1]) 269 | fast_pred 270 | ``` 271 | 272 | Assess 273 | ```{r} 274 | confusionMatrix(fast_pred, test$class) 275 | get_scores(fast_pred, test) 276 | ``` 277 | 278 | 279 | # Make a pipeline 280 | 281 | 282 | 283 | ## Make up some new data 284 | 285 | 286 | ## Run new data through your machine learning pipeline 287 | 288 | # Close shop 289 | 290 | 291 | ```{r} 292 | stopCluster(cl) 293 | ``` 294 | -------------------------------------------------------------------------------- /R/nb.R: -------------------------------------------------------------------------------- 1 | library(caret) 2 | library(fastNaiveBayes) 3 | library(readr) 4 | library(ggplot2) 5 | library(tidyverse) 6 | 7 | 8 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 9 | 10 | wine <- read.csv(dataurl, header = F) 11 | 12 | library(skimr) 13 | 14 | skim(wine) 15 | colnames(wine) 16 | # [1] "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10" "V11" 17 | # [12] "V12" "V13" "V14" 18 | good_cols <- c("class", 19 | "alcohol", 20 | 'malic_acid', 21 | 'ash', 22 | 'alkalinity', 23 | 'magnesium', 24 | 'total_phenols', 25 | 'flavanoids', 26 | 'nonflavonoids_phenols', 27 | 'proanthocyanins', 28 | 'color_intensity', 29 | 'hue', 30 | 'dilution', 31 | 'proline' 32 | ) 33 | 34 | unique(wine$V1) 35 | # [1] 1 2 3 36 | 37 | wine <- wine %>% filter(V1 != 3) 38 | 39 | unique(wine$V1) 40 | # [1] 1 2 41 | nrow(wine) 42 | # [1] 130 43 | fix_cols <- function(df){ 44 | colnames(df) <- good_cols 45 | df$class <- as.factor(df$class) 46 | df 47 | } 48 | wine <- fix_cols(wine) 49 | glimpse(wine) 50 | 51 | # train test split 52 | 53 | makeSampleIndices <- function(x, perc, seed = NULL) { 54 | set.seed(seed) 55 | smpSize <- floor(perc * nrow(x)) 56 | return(sample(seq_len(nrow(x)), size = smpSize)) 57 | } 58 | 59 | 60 | trainInd <- makeSampleIndices(wine, 0.7, seed = 47) 61 | 62 | training <- wine[trainInd,] 63 | 64 | test <- wine[-trainInd,] 65 | 66 | nrow(training) 67 | # [1] 91 68 | nrow(test) 69 | # [1] 39 70 | 71 | nrow(training)/nrow(wine) 72 | # [1] 0.7 73 | 74 | nrow(test)/nrow(wine) 75 | # [1] 0.3 76 | 77 | # response is a factor 78 | # explanatory is a number 79 | # density plot 80 | # response is a factor 81 | # explanatory is a number 82 | # density plotss 83 | ggplot(data = training, aes(x = malic_acid, fill = class)) + geom_density() 84 | ggplot(data = training, aes(x = alkalinity, fill = class)) + geom_density() 85 | ggplot(data = training, aes(x = ash, fill = class)) + geom_density() 86 | ggplot(data = training, aes(x = magnesium, fill = class)) + geom_density() 87 | 88 | 89 | 90 | 91 | 92 | 93 | # looping through seeds 94 | 
95 | # for loop 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | # automate 110 | 111 | numPlot <- function(dat, x, y){ 112 | ggplot(data = dat, aes_string(x = x, fill =y)) + geom_density() 113 | } 114 | 115 | 116 | response <- "class" 117 | 118 | # you would worry about types here 119 | 120 | 121 | # lapply through column names 122 | 123 | plotList <- lapply(colnames(training), function(x) numPlot(training, x, response)) 124 | 125 | plotList 126 | 127 | library(cowplot) 128 | 129 | plot_grid(plotlist = plotList, nrow = 3) 130 | 131 | 132 | 133 | # show off automated plots 134 | 135 | 136 | # set up model assessment 137 | 138 | 139 | conmat <- function(predicted, expected){ 140 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 141 | cm 142 | } 143 | f1_score <- function(predicted, expected, positive.class="1") { 144 | cm = conmat(predicted, expected) 145 | precision <- diag(cm) / colSums(cm) 146 | recall <- diag(cm) / rowSums(cm) 147 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 148 | #Assuming that F1 is zero when it's not possible compute it 149 | f1[is.na(f1)] <- 0 150 | 151 | #Binary F1 or Multi-class macro-averaged F1 152 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 153 | } 154 | 155 | accuracy <- function(predicted, expected){ 156 | cm <- conmat(predicted, expected) 157 | sum(diag(cm)/length(test$class)) 158 | } 159 | get_scores <- function(predictions, test){ 160 | f1 <- f1_score(predictions,test) 161 | acc <- accuracy(predictions,test) 162 | scores <- c(accuracy = acc, f1 = f1) 163 | scores 164 | } 165 | pander(get_scores(test_pred, test)) 166 | 167 | 168 | # fit with caret 169 | 170 | # define training method 171 | 172 | trainMethod <- trainControl(method = "cv", number = 3) 173 | 174 | 175 | 176 | nb_fit <- train(class ~ ., data = training, 177 | trControl = trainMethod, 178 | method = "nb", 179 | tuneLength = 10 180 | ) 181 | nb_fit 182 | # k-Nearest Neighbors 183 | # 184 | # 91 samples 185 | # 13 predictors 186 | # 2 classes: '1', '2' 187 | # 188 | # No pre-processing 189 | # Resampling: Cross-Validated (3 fold) 190 | # Summary of sample sizes: 60, 61, 61 191 | # Resampling results across tuning parameters: 192 | # 193 | # k Accuracy Kappa 194 | # 5 0.9448029 0.8890302 195 | # 7 0.9559140 0.9116519 196 | # 9 0.9448029 0.8890302 197 | # 11 0.9555556 0.9105132 198 | # 13 0.9225806 0.8438981 199 | # 15 0.9225806 0.8438981 200 | # 17 0.9444444 0.8880926 201 | # 19 0.9444444 0.8880926 202 | # 21 0.9444444 0.8880926 203 | # 23 0.9444444 0.8880926 204 | # 205 | # Accuracy was used to select the optimal model using the largest value. 206 | # The final value used for the model was k = 7. 207 | # Naive Bayes 208 | # 209 | # 91 samples 210 | # 13 predictors 211 | # 2 classes: '1', '2' 212 | # 213 | # No pre-processing 214 | # Resampling: Cross-Validated (3 fold) 215 | # Summary of sample sizes: 61, 61, 60 216 | # Resampling results across tuning parameters: 217 | # 218 | # usekernel Accuracy Kappa 219 | # FALSE 0.9784946 0.9567643 220 | # TRUE 0.9677419 0.9352818 221 | # 222 | # Tuning parameter 'fL' was held constant at a value of 0 223 | # Tuning 224 | # parameter 'adjust' was held constant at a value of 1 225 | # Accuracy was used to select the optimal model using the largest value. 226 | # The final values used for the model were fL = 0, usekernel = FALSE 227 | # and adjust = 1. 
228 | plot(nb_fit) 229 | 230 | 231 | # model_fit 232 | # modelFit 233 | 234 | nb_pred <- predict(nb_fit, newdata = test) 235 | nb_pred 236 | 237 | 238 | 239 | confusionMatrix(nb_pred, test$class) 240 | get_scores(nb_pred, test) 241 | # accuracy f1 242 | # 0.9743590 0.9737374 243 | 244 | conmat(nb_pred, test) 245 | # Predicted 246 | # Actual 1 2 247 | # 1 16 0 248 | # 2 1 22 249 | 250 | 251 | # lets talk about speedy naivebayes, as well as maybe look at probabalistic programming 252 | 253 | 254 | 255 | # 256 | 257 | seeds <- 1:100 258 | 259 | # looping through seeds 260 | 261 | 262 | # train and test function 263 | 264 | accuracyVector <- numeric(100) 265 | 266 | for (i in 1:length(seeds)) { 267 | 268 | set.seed(seeds[i]) 269 | 270 | 271 | ############################################# 272 | # interchangeable 273 | fit <- train(class ~ ., data = training, 274 | trControl = trainMethod, 275 | method = "nb", 276 | tuneLength = 10) 277 | ########################################### 278 | 279 | 280 | preds <- predict(fit, newdata = test) 281 | 282 | acc <- accuracy(preds, test) 283 | 284 | accuracyVector[i] <- acc 285 | 286 | } 287 | 288 | # apply family 289 | 290 | # foreach 291 | 292 | 293 | 294 | 295 | accuracyVector 296 | 297 | mean(accuracyVector) 298 | # [1] 0.9882051 299 | 300 | 301 | data.frame(seed = seeds, acc = accuracyVector) 302 | 303 | 304 | set.seed(NULL) 305 | 306 | 307 | 308 | -------------------------------------------------------------------------------- /R/scraping.R: -------------------------------------------------------------------------------- 1 | ## @knitr libs 2 | library(rvest) 3 | library(tidyverse) 4 | 5 | 6 | ## @knitr datadef 7 | lotr <- 'https://www.imdb.com/title/tt0120737/fullcredits?ref_=tt_cl_sm#cast' 8 | 9 | ## @knitr html_read-1 10 | read_html(lotr) 11 | ## @knitr html_read-2 12 | rawdata <- read_html(lotr) 13 | 14 | ## @knitr html_read-3 15 | tables <- html_nodes(rawdata, "table") 16 | 17 | 18 | str(tables) 19 | 20 | ## @knitr nodes 21 | ateam <- read_html("http://www.boxofficemojo.com/movies/?id=ateam.htm") 22 | center <- html_nodes(ateam, "center") 23 | 24 | ## @knitr list 25 | x <- list("char" = c("cat","dog"), "nest" = list((1:3),2:4), "int" = 4:5, "logical" = c(T,F,T,F), "float" = c(87.5, -962.4)) 26 | 27 | x[[1]] 28 | # [1] "cat" "dog" 29 | 30 | x[1:3] 31 | 32 | (tables) 33 | 34 | ## @knitr table_choose 35 | table1 <- tables[[1]] 36 | # {html_node} 37 | # 38 | # [1] \n\n\n\n\n
\n Peter Jackson ... 40 | table2 <- tables[[2]] 41 | # {html_node} 42 | # 43 | # [1] \n\n\n\n\n\n
\n J.R.R. Tolk ... 45 | table3 <- tables[[3]] 46 | 47 | ## @knitr table_clean 48 | cast <- html_table(table3) 49 | ## @knitr pipes 50 | 51 | 52 | f(x,y) = x %>% f(y) = f(.,y) 53 | 54 | 55 | mtcars %>% filter(cyl == 4) %>% .$mpg 56 | 57 | 58 | cast <- read_html(lotr) %>% html_nodes("table") %>% .[[3]] %>% html_table 59 | 60 | ## @knitr scraper 61 | tablescraper <- function(url, item){ 62 | out <- read_html(url) %>% html_nodes("table") %>% .[[item]] %>% html_table 63 | return(out) 64 | } 65 | 66 | ## @knitr search 67 | tablescraper(lotr,1) %>% head 68 | # X1 X2 X3 69 | # 1 Peter Jackson NA NA 70 | 71 | tablescraper(lotr,2) %>% head 72 | # X1 X2 X3 73 | # 1 J.R.R. Tolkien ... (novel) 74 | # 2 75 | # 3 Fran Walsh ... (screenplay) & 76 | # 4 Philippa Boyens ... (screenplay) & 77 | # 5 Peter Jackson ... (screenplay) 78 | 79 | tablescraper(lotr,3) %>% head 80 | 81 | cast <- tablescraper(lotr,3) 82 | 83 | head(cast) 84 | # X1 X2 X3 X4 85 | # 1 86 | # 2 Alan Howard ... Voice of the Ring \n \n \n (voice) 87 | # 3 Noel Appleby ... Everard Proudfoot 88 | # 4 Sean Astin ... Sam 89 | # 5 Sala Baker ... Sauron 90 | # 6 Sean Bean ... Boromir 91 | # X1 X2 X3 X4 92 | # 1 93 | # 2 Alan Howard ... Voice of the Ring \n \n \n (voice) 94 | # 3 Noel Appleby ... Everard Proudfoot 95 | # 4 Sean Astin ... Sam 96 | # 5 Sala Baker ... Sauron 97 | # 6 Sean Bean ... Boromir 98 | ## @knitr clean-1 99 | cast$X1 <- NULL 100 | cast$X3 <- NULL 101 | 102 | head(cast) 103 | 104 | ## @knitr rename 105 | cast <- cast %>% rename(Actor = X2, Character = X4) 106 | head(cast) 107 | 108 | animals <- c("cat","dog","mouse","hamster","komodo dragon") 109 | 110 | #grep(pattern, object) 111 | 112 | grep("d", animals) 113 | # [1] 2 5 114 | 115 | grepl("d", animals) 116 | # [1] FALSE TRUE FALSE FALSE TRUE 117 | 118 | animals[grep("d",animals)] 119 | # [1] "dog" "komodo dragon" 120 | 121 | animals[grepl("d", animals)] 122 | # [1] "dog" "komodo dragon" 123 | 124 | 125 | # do it this way 126 | animals[!grepl("d",animals)] 127 | # [1] "cat" "mouse" "hamster" 128 | 129 | animals[-grep("d",animals)] 130 | 131 | 132 | 133 | ## @knitr grepl 134 | 135 | truefalse <- grepl 136 | 137 | # X1 X2 X3 X4 138 | # 1 139 | # 2 Alan Howard ... Voice of the Ring \n \n \n (voice) 140 | # 3 Noel Appleby ... Everard Proudfoot 141 | # 4 Sean Astin ... Sam 142 | # 5 Sala Baker ... Sauron 143 | # 6 Sean Bean ... 
Boromir 144 | 145 | !grepl("Rest of cast listed alphabetically:", cast$Actor) 146 | # [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 147 | # [19] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 148 | # [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE 149 | # [55] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 150 | # [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 151 | # [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 152 | # [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 153 | # [127] FALSE FALSE FALSE FALSE FALSE FALSE FALSE 154 | cast<-cast[!grepl("Rest of cast listed alphabetically:", cast$Actor),] 155 | 156 | 157 | 158 | ## @knitr regex1 159 | 160 | # gsub("pattern","replacement",object) 161 | cast$Character<-gsub("[\r\n]","",cast$Character) 162 | 163 | head(cast) 164 | ## @knitr regex2 165 | cast$Character <- gsub("\\s+"," ",cast$Character) 166 | cast$Character <- str_squish(cast$Character) 167 | 168 | head(cast,10) 169 | # Actor Character 170 | # 1 171 | # 2 Alan Howard Voice of the Ring (voice) 172 | # 3 Noel Appleby Everard Proudfoot 173 | # 4 Sean Astin Sam 174 | # 5 Sala Baker Sauron 175 | # 6 Sean Bean Boromir 176 | # 7 Cate Blanchett Galadriel 177 | # 8 Orlando Bloom Legolas 178 | # 9 Billy Boyd Pippin 179 | # 10 Marton Csokas Celeborn 180 | 181 | head(cast) 182 | html_table(html_nodes(pokemon,table)[[2]]) 183 | ateam <- read_html("http://www.boxofficemojo.com/movies/?id=ateam.htm") 184 | html_nodes(ateam, "center") 185 | 186 | 187 | data(mtcars) 188 | library(ggplot2) 189 | head(mtcars) 190 | ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = gear))+geom_bar(stat ="identity") + theme_minimal() 191 | 192 | 193 | cast$isGoblin <- grepl("Goblin", cast$Character) 194 | 195 | cast$isGoblin <- as.numeric(cast$isGoblin) 196 | 197 | numGoblins <- sum(cast$isGoblin) 198 | # [1] 24 199 | 200 | 201 | 202 | 203 | cast 204 | 205 | # if condition do thing else do other 206 | #ifelse(condition, what to do on true, what to do on false) 207 | cast$isGoblin <- ifelse( 208 | cast$isGoblin == TRUE, 209 | "i am a goblin", 210 | "i am not a goblin" 211 | ) 212 | 213 | View(cast) 214 | -------------------------------------------------------------------------------- /R/eda1.R: -------------------------------------------------------------------------------- 1 | library(mlbench) 2 | library(caret) 3 | library(mlr) 4 | library(tidyverse) 5 | library(ggthemes) 6 | library(gplots) 7 | library(randomForest) 8 | 9 | 10 | library(skimr) # skimr is sweet 11 | 12 | data(BostonHousing) 13 | head(BostonHousing) 14 | # crim zn indus chas nox rm age dis rad tax ptratio b 15 | # 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 16 | # 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 17 | # 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 18 | # 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 19 | # 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 20 | # 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 21 | # lstat medv 22 | # 1 4.98 24.0 23 | # 2 9.14 21.6 24 | # 3 4.03 34.7 25 | # 4 2.94 33.4 26 | # 5 
5.33 36.2 27 | # 6 5.21 28.7 28 | length(BostonHousing) 29 | # [1] 14 30 | skim(BostonHousing) 31 | # Skim summary statistics 32 | # n obs: 506 33 | # n variables: 14 34 | # 35 | # ── Variable type:factor ────────────────────────────────────────────────── 36 | # variable missing complete n n_unique top_counts ordered 37 | # chas 0 506 506 2 0: 471, 1: 35, NA: 0 FALSE 38 | # 39 | # ── Variable type:numeric ───────────────────────────────────────────────── 40 | # variable missing complete n mean sd p0 p25 p50 41 | # age 0 506 506 68.57 28.15 2.9 45.02 77.5 42 | # b 0 506 506 356.67 91.29 0.32 375.38 391.44 43 | # crim 0 506 506 3.61 8.6 0.0063 0.082 0.26 44 | # dis 0 506 506 3.8 2.11 1.13 2.1 3.21 45 | # indus 0 506 506 11.14 6.86 0.46 5.19 9.69 46 | # lstat 0 506 506 12.65 7.14 1.73 6.95 11.36 47 | # medv 0 506 506 22.53 9.2 5 17.02 21.2 48 | # nox 0 506 506 0.55 0.12 0.39 0.45 0.54 49 | # ptratio 0 506 506 18.46 2.16 12.6 17.4 19.05 50 | # rad 0 506 506 9.55 8.71 1 4 5 51 | # rm 0 506 506 6.28 0.7 3.56 5.89 6.21 52 | # tax 0 506 506 408.24 168.54 187 279 330 53 | # zn 0 506 506 11.36 23.32 0 0 0 54 | # p75 p100 hist 55 | # 94.07 100 ▁▂▂▂▂▂▃▇ 56 | # 396.23 396.9 ▁▁▁▁▁▁▁▇ 57 | # 3.68 88.98 ▇▁▁▁▁▁▁▁ 58 | # 5.19 12.13 ▇▅▃▃▂▁▁▁ 59 | # 18.1 27.74 ▃▆▅▁▁▇▁▁ 60 | # 16.96 37.97 ▆▇▆▅▂▁▁▁ 61 | # 25 50 ▂▅▇▆▂▂▁▁ 62 | # 0.62 0.87 ▇▆▇▆▃▅▁▁ 63 | # 20.2 22 ▁▂▂▂▅▅▇▃ 64 | # 24 24 ▂▇▁▁▁▁▁▅ 65 | # 6.62 8.78 ▁▁▂▇▇▂▁▁ 66 | # 666 711 ▃▇▂▅▁▁▁▆ 67 | # 12.5 100 ▇▁▁▁▁▁▁▁ 68 | 69 | 70 | # lapply(df, sd) 71 | 72 | # step 0 73 | # get rid of zero variance variables (ones that only have one value) 74 | # check and make sure categorical variables are stored as factors 75 | # use common sense!!!! 76 | 77 | library(corrplot) 78 | library(tidyverse) 79 | 80 | # library(purrr) 81 | 82 | # cor function: Calculate correlation between columns of a df or matrix 83 | # conditions: 84 | # cant handle not numeric 85 | # cant handle NAs 86 | 87 | 88 | bh <- BostonHousing 89 | 90 | bh2 <- bh 91 | bh2$notNum <- "cat" 92 | bh2 %>% keep(is.numeric) %>% head 93 | # opposite of keep: discard 94 | 95 | #purrr 96 | # keep(condition) 97 | 98 | library(corrplot) 99 | corrplot 100 | 101 | 102 | bh %>% keep(is.numeric) %>% na.omit %>% cor %>% corrplot("upper", addCoef.col = "white", number.digits = 2, 103 | number.cex = 0.5, method="square", 104 | order="hclust", title="Variable Corr Heatmap", 105 | tl.srt=45, tl.cex = 0.8) 106 | 107 | 108 | # function to do this all in one go 109 | correlator <- function(df){ 110 | df %>% 111 | keep(is.numeric) %>% 112 | tidyr::drop_na() %>% 113 | cor %>% 114 | corrplot("upper", addCoef.col = "white", number.digits = 2, 115 | number.cex = 0.5, method="square", 116 | order="hclust", title="Variable Corr Heatmap", 117 | tl.srt=45, tl.cex = 0.8) 118 | } 119 | 120 | 121 | # correlation analysis 122 | # usage: this is step 1! 
Batch elimination of numeric variables 123 | # this can narrow things down a lot 124 | # do not forget to use human logic 125 | 126 | 127 | # key plots 128 | 129 | # x : y 130 | # numeric : categorical 131 | 132 | data(mtcars) 133 | 134 | head(mtcars) 135 | # mpg cyl disp hp drat wt qsec vs am gear carb 136 | # Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 137 | # Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 138 | # Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 139 | # Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 140 | # Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 141 | # Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 142 | 143 | mtcars$cyl <- as.factor(mtcars$cyl) 144 | mtcars$vs <- as.factor(mtcars$vs) 145 | mtcars$am <- as.factor(mtcars$am) 146 | mtcars$gear <- as.factor(mtcars$gear) 147 | mtcars$carb <- as.factor(mtcars$carb) 148 | 149 | skim(mtcars) 150 | 151 | 152 | # x : y 153 | # numeric : categorical 154 | 155 | mtcars$rvar <- rnorm(nrow(mtcars)) 156 | 157 | length(unique(mtcars$vs)) 158 | # [1] 2 159 | 160 | ggplot(data = mtcars) + geom_density(aes_string(x = "mpg", fill = "am"), alpha = 0.5) 161 | 162 | # automated EDA!!!!!!!!!!!! 163 | # step 1, save target variable name 164 | target <- "am" 165 | # step 2, save explanator variable names 166 | numvars <- mtcars %>% keep(is.numeric) %>% colnames 167 | # [1] "mpg" "disp" "hp" "drat" "wt" "qsec" "rvar" 168 | 169 | 170 | numplot <- function(df, explan, resp) { 171 | ggplot(data = df) + geom_density(aes_string(x = explan, fill = resp), alpha = 0.5) 172 | } 173 | 174 | numplot(mtcars, explan = "mpg", resp = "am") 175 | 176 | plotlist <- lapply(numvars, function(x) numplot(mtcars, x, "am")) 177 | library(cowplot) 178 | plot_grid(plotlist = plotlist) 179 | 180 | 181 | png() 182 | lapply(numvars, function(x) numplot(mtcars, x, "am")) 183 | dev.off() 184 | 185 | 186 | 187 | # categorical vs categorical 188 | str(mtcars) 189 | # 'data.frame': 32 obs. of 12 variables: 190 | # $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... 191 | # $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ... 192 | # $ disp: num 160 160 108 258 360 ... 193 | # $ hp : num 110 110 93 110 175 105 245 62 95 123 ... 194 | # $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... 195 | # $ wt : num 2.62 2.88 2.32 3.21 3.44 ... 196 | # $ qsec: num 16.5 17 18.6 19.4 17 ... 197 | # $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ... 198 | # $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ... 199 | # $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ... 200 | # $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ... 201 | # $ rvar: num 0.584 -0.573 0.582 -0.221 0.409 ... 
202 | # NULL
203 | 
204 | 
205 | ggplot(data = mtcars) + geom_bar(aes(x = cyl, fill = am), position = "fill", alpha = 0.9) + coord_flip()
206 | 
207 | 
208 | ones <- rep(1, nrow(mtcars))
209 | zeroes <- rep(0, nrow(mtcars))
210 | onezeroes <- c(ones, zeroes)
211 | 
212 | mtcars$rcat <- sample(onezeroes, nrow(mtcars))
213 | 
214 | 
215 | ggplot(data = mtcars) + geom_bar(aes(x = rcat, fill = am), position = "fill", alpha = 0.9) + coord_flip()
216 | 
217 | # step 1: Name target variable:
218 | 
219 | target <- "am"
220 | 
221 | # step 2: name explanatory vars
222 | 
223 | expls <- mtcars %>% keep(is.factor) %>% colnames
224 | 
225 | 
226 | catplot <- function(df, x,y){
227 |   ggplot(data = df, aes_string(x = x, fill = y)) +
228 |     geom_bar(position = "fill", alpha = 0.9) +
229 |     coord_flip()
230 | }
231 | 
232 | 
233 | plotlist2 <- lapply(expls, function(x) catplot(mtcars, x, target))
234 | plot_grid(plotlist = plotlist2)
235 | 
--------------------------------------------------------------------------------
/pres/html-scraping.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "HTML Scraping in R"
3 | author: "David Josephs"
4 | output: html_document
5 | ---
6 | 
7 | ```{r setup, include = F}
8 | knitr::read_chunk('../R/scraping.R')
9 | library(knitr)
10 | library(kableExtra)
11 | library(magrittr)
12 | library(pander)
13 | knitr::opts_chunk$set(cache = T, autodep = T)
14 | ```
15 | # Lord of the Rings Example
16 | 
17 | ## Setup
18 | 
19 | First, let's load up two libraries which will make our life easier. The first is rvest, a great library for reading HTML; it is basically an extension of the xml2 package. It has some easy syntax and is quite helpful going forwards.
20 | 
21 | The second is, in my opinion, one of the most useful libraries for doing any sort of data science or data analysis in R: the tidyverse. Just google it and see the documentation; it is a set of packages in which all of the functions have similar APIs and arguments, allowing for consistency throughout our programming. They are also all pretty fast, with nice syntax. Examples from the tidyverse are: readr (data loading), dplyr (data analysis/cleaning/general utility), tidyr (data cleaning and reshaping), and ggplot2 (data viz); caret (machine learning) is not strictly part of the tidyverse, but it follows a similarly consistent API.
22 | 
23 | ```{r, message=F}
24 | <>
25 | ```
26 | 
27 | Next, let's load up our data. In this example we will be looking at the IMDB page for Lord of the Rings. So we will assign a variable to the URL of the page we are interested in:
28 | 
29 | ```{r dataload}
30 | <>
31 | ```
32 | 
33 | ## Reading the data
34 | 
35 | ### The pipe operator
36 | 
37 | Before we can read in the data, let's first learn about `%>%` pipes. A pipe basically says: take the thing on the left, and make it an argument of the thing on the right. For example, let's say we want to take the mean of every column of mtcars, the classic R example dataset. We can do that with:
38 | 
39 | ```{r}
40 | lapply(mtcars,mean)
41 | ```
42 | 
43 | This is mapping the mean function over the mtcars dataset. Now, the output of this is not very pretty, so we will turn it into a nice, horizontal data frame:
44 | 
45 | ```{r}
46 | as.data.frame(lapply(mtcars,mean))
47 | ```
48 | 
49 | Still ugly. Let's try using the pander library to make this look nice:
50 | 
51 | ```{r}
52 | pander(as.data.frame(lapply(mtcars,mean)))
53 | ```
54 | 
55 | Much better, but look at how many parentheses we wrote, and how difficult this is to read. Imagine if we had 4 or 5 more steps.
We would have to repeatedly assign things to new variables, and keep putting intermediate results in our computer's memory, just to have readable code. Even then, if we assigned a variable on every step, someone reviewing our code would end up having to read 20 or so lines of code above just to understand the final printing line. This leads to errors and is not reproducible. Instead, let's try it with the pipe operator. Mathematically, `f(x,y) = x %>% f(y)`, if that helps:
56 | 
57 | ```{r}
58 | mtcars %>% lapply(mean) %>% as.data.frame %>% pander
59 | ```
60 | 
61 | This reads from left to right (as we English speakers are in the habit of doing):
62 | First, we take the mtcars dataset. Then, we apply the mean function to every column of the dataset, producing a list. We then turn the list, which is hard to read, into a nice flat data frame, and then we pretty up the data frame in a final step. This is the pipe operator.
63 | 
64 | ### Actually reading in the data
65 | 
66 | So, with our knowledge of the pipe operator, what can we do? Let's use rvest functions to turn the raw XML and/or HTML data into something nice and human readable.
67 | 
68 | First, let's read in the website:
69 | 
70 | ```{r, eval = F}
71 | # not run
72 | read_html(lotr)
73 | ```
74 | 
75 | Next, let's choose all the tables in the raw data (we know all of our data is in tables), with the `html_nodes()` function:
76 | 
77 | ```{r, eval = F}
78 | read_html(lotr) %>% html_nodes("table")
79 | ```
80 | 
81 | Next, let's choose the right table. By looking at the website, we know that the third table contains the info on the cast. To choose the third table of an unnamed object, we are going to have to use the `.` placeholder, which, as we will see, simply stands for the thing on the left of the pipe.
82 | 
83 | 
84 | ```{r, eval = F}
85 | read_html(lotr) %>% html_nodes("table") %>% .[[3]]
86 | ```
87 | 
88 | #### An Aside on lists
89 | Why did we do `[[]]`?
90 | This is because html_nodes outputs a list, and there are three ways we can get items from a list. `$`, for named items, keeps the type of the item if it is some sort of vector. `[]` lets us index the list, but the output is always another list, i.e. the element's data type is not extracted. Third, we have `[[]]`, which lets us index the list and get the element back in its proper data type. Experiment with this by using the following list as well as the built-in `typeof()` function.
91 | 
92 | ```{r}
93 | <>
94 | ```
95 | 
96 | ### Back to Business
97 | 
98 | Now that we understand what `.[[3]]` is doing, we can extract the full dataset:
99 | 
100 | ```{r}
101 | <>
102 | (head(cast))
103 | ```
104 | 
105 | Great. Now, that process was pretty painful and took a lot of typing, and in the future we may not know which table we are looking for, so let's write a nice little function to do this all in one step:
106 | 
107 | 
108 | ```{r}
109 | <>
110 | ```
111 | 
112 | Now that we have a nice function, we can iteratively search through the IMDB site:
113 | 
114 | ```{r}
115 | <>
116 | ```
117 | 
118 | We can even imagine, for a large project, just writing a for loop to do all of this.
119 | Next, let's check out the first and last ten items of cast:
120 | 
121 | ```{r}
122 | ht <- function(x, ...) {
123 |   print(head(x, ...))  # print the head, then return the tail
124 |   tail(x, ...)
125 | }
126 | ht(cast, 10)
127 | ```
128 | 
129 | ***NOTE***: the `...` in our function allows for extra arguments. We do this so we can throw in the extra parameter, `10`, which changes head and tail to show the first and last 10 instead of the first and last 6 items.
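For instance, here is a tiny illustration (separate from the scraping workflow, using the built-in mtcars data) of how `...` just forwards whatever extra arguments we supply to the inner function:

```{r}
# any extra arguments are passed straight through to head()
first_rows <- function(x, ...) {
  head(x, ...)
}
first_rows(mtcars)    # default: first 6 rows
first_rows(mtcars, 3) # the extra 3 becomes head(x, 3)
```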
130 | 
131 | ## Cleaning the data
132 | Wow, this data is a mess. The first thing we see is that the first row is entirely blank, and then that the first and third columns are completely empty. Let's get rid of them:
133 | 
134 | ```{r}
135 | <>
136 | ht(cast)
137 | ```
138 | 
139 | Next, let's rename with dplyr:
140 | 
141 | ```{r}
142 | <>
143 | ```
144 | 
145 | Looking better. Now, we know from the IMDB website that the table contains "Rest of cast listed alphabetically:", so let's get rid of that. To do this, we are going to use `grepl()`.
146 | 
147 | `grepl()` searches for a pattern and then returns a logical (true/false) vector of whether or not there is a match. We can then index `cast` for all rows where the result of `grepl` is not true, eliminating the unwanted line:
148 | 
149 | ```{r}
150 | <>
151 | ```
152 | 
153 | 
154 | As a learning challenge, try to see how this dplyr syntax differs from doing it in base R, and see which one you prefer.
155 | 
156 | Next, let's get rid of those nasty `\n`'s. To do this, let's use `gsub()`, short for global substitute. Since we don't know how the newlines are delimited, we will search for all types of newlines: `\n` (unix), `\r\n` (windows), and `\r` (old web line endings). To do this, we will use the regular expression `[\r\n]`. This allows us to search for `\r`, `\n`, and `\r\n` (that's what the brackets do). Let's turn those all into nothing.
157 | 
158 | ```{r}
159 | <>
160 | ht(cast)
161 | ```
162 | 
163 | Now we have a ton of whitespace. Let's get rid of that. The regular expression for a whitespace character is `\s`, and `\s+` matches one or more whitespace characters in a row. In an R string we have to escape the backslash, so we write it as `\\s+`. This is going to match runs of whitespace of any length. Let's turn each of these into a single space:
164 | 
165 | ```{r}
166 | <>
167 | ht(cast)
168 | ```
169 | 
170 | Excellent work, we have now turned a once very ugly raw data frame into something we can work with.
171 | 
172 | # A challenge:
173 | 
174 | Two challenges here:
175 | 
176 | * Is there another way we could have cleaned up the `\n` or the `\s`? Try out `library(stringr)` and explore the functions there.
177 | 178 | * Try separating first name from last name (eg make a first and last name column), using whatever means necessary (this is in your homework assignment this week) 179 | 180 | # Note 181 | 182 | To see more examples and play around with the source code for this document, see `R/scraping.R` 183 | -------------------------------------------------------------------------------- /R/analysis1.R: -------------------------------------------------------------------------------- 1 | library(tswgewrapped) 2 | library(ggthemes) 3 | library(ggplot2) 4 | library(cowplot) 5 | source("../R/preprocessing.R", echo = TRUE) 6 | source("../R/helpers.R", echo = TRUE) 7 | 8 | 9 | # data import 10 | # imports the data as a hash table 11 | fine_china <- preprocess("../data/") 12 | names(fine_china) 13 | # [1] "ChengduPM_" "ShenyangPM_" "ShanghaiPM_" "BeijingPM_" "GuangzhouPM_" 14 | # 15 | 16 | 17 | # shanghai 18 | shang_US <- fine_china$ShanghaiPM_$PM_US 19 | 20 | usShang <- resample(shang_US) 21 | plotts.sample.wge(usShang$day) 22 | plotts.sample.wge(usShang$week) 23 | plotts.sample.wge(usShang$month) 24 | plotts.sample.wge(usShang$sea) 25 | decompose(usShang$day, "multiplicative") %>>% autoplot+ theme_economist() 26 | decompose(usShang$day, "additive") %>>% autoplot+ theme_economist() 27 | decompose(usShang$week, "multiplicative") %>>% autoplot+ theme_economist() 28 | decompose(usShang$week, "additive") %>>% autoplot+ theme_economist() 29 | decompose(usShang$month, "multiplicative") %>>% autoplot+ theme_economist() 30 | decompose(usShang$month, "multiplicative") %>>% autoplot+ theme_economist() 31 | decompose(usShang$sea, "additive") %>>% autoplot+ theme_economist() 32 | decompose(usShang$sea, "additive") %>>% autoplot+ theme_economist() 33 | usShang$week %>>% lagplot+ theme_economist() 34 | usShang$day %>>% seasonplot + theme_economist() 35 | usShang$day %>>% seasonplot(polar = TRUE) + theme_economist() 36 | usShang$week %>>% seasonplot+ theme_economist() 37 | usShang$week %>>% seasonplot(polar = T)+ theme_economist() 38 | usShang$month %>>% seasonplot+ theme_economist() 39 | usShang$month %>>% seasonplot(polar = T)+ theme_economist() 40 | usShang$seas %>>% seasonplot(polar = T) + theme_economist() 41 | usShang$seas %>>% seasonplot + theme_economist() 42 | 43 | 44 | 45 | # next lets look at ses and holt models 46 | 47 | library(fpp2) 48 | 49 | sesd <- ses(usShang$day) 50 | sesw <- ses(usShang$week) 51 | sesm <- ses(usShang$month) 52 | par(mfrow = c(1,3)) 53 | lapply(list(sesd,sesw,sesm), plot) 54 | 55 | 56 | accuracy(fitted(sesd)) 57 | 58 | accuracy(fitted(sesw)) 59 | 60 | accuracy(fitted(sesm)) 61 | 62 | 63 | ## Problem: implement the above for holt 64 | 65 | 66 | 67 | # below is extra, see analysis2 for more complex fun 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | # We see with the weekly plot we have a lot of seasonality 77 | 78 | 79 | # lets just for fun do some predictions with the daily data 80 | 81 | shang <- usShang$day 82 | plotts.sample.wge(shang) 83 | # we have clear seasonality, and maybe a wandering behavior. 
I believe we have a biannual seasonality, based off of the monthly graph 84 | shang %>>% ( difference(seasonal,., (365)) ) -> shang2 85 | difference(arima, shang2, 1) -> shang3 86 | aics <- shang3 %>>% aicbic(p=0:10) 87 | pander(aics) 88 | # 89 | # 90 | # * 91 | # 92 | # ------------------------ 93 | #   p q aic 94 | # -------- --- --- ------- 95 | # **20** 3 1 13.51 96 | # 97 | # **6** 0 5 13.52 98 | # 99 | # **4** 0 3 13.52 100 | # 101 | # **10** 1 3 13.52 102 | # 103 | # **3** 0 2 13.55 104 | # ------------------------ 105 | # 106 | # * 107 | # 108 | # ------------------------ 109 | #   p q bic 110 | # -------- --- --- ------- 111 | # **20** 3 1 13.54 112 | # 113 | # **4** 0 3 13.55 114 | # 115 | # **6** 0 5 13.55 116 | # 117 | # **10** 1 3 13.56 118 | # 119 | # **3** 0 2 13.57 120 | # ------------------------ 121 | # 122 | # 123 | # 124 | # 125 | # 126 | # NULL 127 | aicss <- shang %>>% ( difference(seasonal,., 365) ) %>>% aicbic(p=0:10) 128 | pander(aics) 129 | # 130 | # 131 | # * 132 | # 133 | # ----------------------- 134 | #   p q aic 135 | # -------- --- --- ------ 136 | # **20** 3 1 13.5 137 | # 138 | # **11** 1 4 13.5 139 | # 140 | # **26** 4 1 13.5 141 | # 142 | # **16** 2 3 13.5 143 | # 144 | # **24** 3 5 13.5 145 | # ----------------------- 146 | # 147 | # * 148 | # 149 | # ------------------------ 150 | #   p q bic 151 | # -------- --- --- ------- 152 | # **13** 2 0 13.53 153 | # 154 | # **3** 0 2 13.53 155 | # 156 | # **8** 1 1 13.53 157 | # 158 | # **7** 1 0 13.53 159 | # 160 | # **20** 3 1 13.53 161 | # ------------------------ 162 | # 163 | # 164 | # 165 | # 166 | # 167 | # NULL 168 | par(mfrow = c(1,1)) 169 | est_shang <- estimate(shang2, p=2, q = 0) 170 | acf(est_shang$res) 171 | ljung_box(est_shang$res, p =2, q =0) 172 | shang_seasonal <- fore_and_assess(type = aruma, 173 | x = shang, 174 | s = 365, 175 | phi = est_shang$phi, 176 | n.ahead = 24, 177 | limits = F 178 | ) 179 | 180 | est_shang2 <- estimate(shang3, p = 3, q = 1) 181 | acf(est_shang2$res) 182 | ljung_box(est_shang2$res, 3, 1) 183 | # [,1] [,2] 184 | # test "Ljung-Box test" "Ljung-Box test" 185 | # K 24 48 186 | # chi.square 14.14806 35.92178 187 | # df 20 44 188 | # pval 0.8229101 0.8017901 189 | 190 | shang_aruma <- fore_and_assess(type = aruma, 191 | x = shang, 192 | s = 365, 193 | d = 1, 194 | phi = est_shang2$phi, 195 | theta = est_shang2$theta, 196 | n.ahead = 24, 197 | limits = F 198 | ) 199 | 200 | shang_seasonal$ASE 201 | # [1] 1154198 202 | shang_aruma$ASE 203 | # [1] 1154911 204 | test <- window(shang_US, start = 7)[1:24] 205 | ase(test, shang_aruma) 206 | # [1] 1977888 207 | ase(test, shang_seasonal) 208 | # [1] 3278672 209 | 210 | forecast(aruma, shang, s = 365, d = 1, phi = est_shang2$phi,theta = est_shang2$theta, n.ahead=500) 211 | forecast(aruma, shang, s = 365, phi = est_shang$phi, n.ahead=500) 212 | 213 | # ok looking damn good with the shang aruma 214 | 215 | 216 | # Beijing 217 | 218 | bj_US <- fine_china$BeijingPM_$PM_US 219 | usBJ <- resample(bj_US) 220 | plotts.sample.wge(usBJ$day) 221 | plotts.sample.wge(usBJ$week) 222 | plotts.sample.wge(usBJ$month) 223 | plotts.sample.wge(usBJ$sea) 224 | decompose(usBJ$day, "multiplicative") %>>% autoplot+ theme_economist() 225 | decompose(usBJ$day, "additive") %>>% autoplot+ theme_economist() 226 | decompose(usBJ$week, "multiplicative") %>>% autoplot+ theme_economist() 227 | decompose(usBJ$week, "additive") %>>% autoplot+ theme_economist() 228 | decompose(usBJ$month, "multiplicative") %>>% autoplot+ theme_economist() 229 | decompose(usBJ$month, 
"multiplicative") %>>% autoplot+ theme_economist() 230 | decompose(usBJ$sea, "additive") %>>% autoplot+ theme_economist() 231 | decompose(usBJ$sea, "additive") %>>% autoplot+ theme_economist() 232 | usBJ$week %>>% lagplot+ theme_economist() 233 | usBJ$day %>>% seasonplot + theme_economist() 234 | usBJ$day %>>% seasonplot(polar = TRUE) + theme_economist() 235 | usBJ$week %>>% seasonplot+ theme_economist() 236 | usBJ$week %>>% seasonplot(polar = T)+ theme_economist() 237 | usBJ$month %>>% seasonplot+ theme_economist() 238 | usBJ$month %>>% seasonplot(polar = T)+ theme_economist() 239 | usBJ$seas %>>% seasonplot(polar = T) + theme_economist() 240 | usBJ$seas %>>% seasonplot + theme_economist() 241 | bj <- usBJ$day 242 | 243 | bj %>>% (difference(seasonal,.,365)) -> bjtr 244 | aicbj <- bj %>>% (difference(seasonal,.,365)) %>>% 245 | aicbic(p = 0:10) 246 | pander(aicbj) 247 | # 248 | # 249 | # * 250 | # 251 | # ------------------------ 252 | #   p q aic 253 | # -------- --- --- ------- 254 | # **41** 6 4 15.31 255 | # 256 | # **53** 8 4 15.31 257 | # 258 | # **59** 9 4 15.31 259 | # 260 | # **47** 7 4 15.31 261 | # 262 | # **60** 9 5 15.31 263 | # ------------------------ 264 | # 265 | # * 266 | # 267 | # ------------------------ 268 | #   p q bic 269 | # -------- --- --- ------- 270 | # **13** 2 0 15.35 271 | # 272 | # **3** 0 2 15.35 273 | # 274 | # **8** 1 1 15.35 275 | # 276 | # **14** 2 1 15.35 277 | # 278 | # **19** 3 0 15.35 279 | # ------------------------ 280 | # 281 | # 282 | # 283 | # 284 | # 285 | # NULL 286 | 287 | est_bj <- estimate(bjtr, 6,4) 288 | acf(est_bj$res) 289 | ljung_box(est_bj$res,6,4) 290 | # [,1] [,2] 291 | # test "Ljung-Box test" "Ljung-Box test" 292 | # K 24 48 293 | # chi.square 28.84052 45.85887 294 | # df 14 38 295 | # pval 0.01098179 0.1784661 296 | 297 | bj_seas <- fore_and_assess(type = aruma, 298 | x = bj, 299 | s = 365, 300 | phi = est_bj$phi, 301 | theta = est_bj$theta, 302 | n.ahead = 24, 303 | limits = F 304 | ) 305 | test <- window(bj_US, start = 7)[1:24] 306 | ase(test, bj_seas) 307 | -------------------------------------------------------------------------------- /R/nb2.R: -------------------------------------------------------------------------------- 1 | library(caret) 2 | 3 | data(BreastCancer, package='mlbench') 4 | 5 | 6 | library(skimr) 7 | 8 | 9 | skim(BreastCancer) 10 | # ── Data Summary ──────────────────────── 11 | # Values 12 | # Name BreastCancer 13 | # Number of rows 699 14 | # Number of columns 11 15 | # _______________________ 16 | # Column type frequency: 17 | # character 1 18 | # factor 10 19 | # ________________________ 20 | # Group variables None 21 | # 22 | # ── Variable type: character ────────────────────────────────────────────── 23 | # skim_variable n_missing complete_rate min max empty n_unique 24 | # 1 Id 0 1 5 8 0 645 25 | # whitespace 26 | # 1 0 27 | # 28 | # ── Variable type: factor ───────────────────────────────────────────────── 29 | # skim_variable n_missing complete_rate ordered n_unique 30 | # 1 Cl.thickness 0 1 TRUE 10 31 | # 2 Cell.size 0 1 TRUE 10 32 | # 3 Cell.shape 0 1 TRUE 10 33 | # 4 Marg.adhesion 0 1 TRUE 10 34 | # 5 Epith.c.size 0 1 TRUE 10 35 | # 6 Bare.nuclei 16 0.977 FALSE 10 36 | # 7 Bl.cromatin 0 1 FALSE 10 37 | # 8 Normal.nucleoli 0 1 FALSE 10 38 | # 9 Mitoses 0 1 FALSE 9 39 | # 10 Class 0 1 FALSE 2 40 | # top_counts 41 | # 1 1: 145, 5: 130, 3: 108, 4: 80 42 | # 2 1: 384, 10: 67, 3: 52, 2: 45 43 | # 3 1: 353, 2: 59, 10: 58, 3: 56 44 | # 4 1: 407, 2: 58, 3: 58, 10: 55 45 | # 5 2: 386, 3: 72, 4: 48, 1: 47 
46 | # 6 1: 402, 10: 132, 2: 30, 5: 30 47 | # 7 2: 166, 3: 165, 1: 152, 7: 73 48 | # 8 1: 443, 10: 61, 3: 44, 2: 36 49 | # 9 1: 579, 2: 35, 3: 33, 10: 14 50 | # 10 ben: 458, mal: 241 51 | 52 | bc <- BreastCancer 53 | bc$ID <- NULL 54 | 55 | bc 56 | 57 | library(tidyverse) 58 | ggplot(data = bc) + geom_bar(aes_string(x = Cell.size, fill = Class), position = "fill", alpha = 0.9) + coord_flip() 59 | 60 | 61 | catplot <- function(df, x,y){ 62 | ggplot(data = df, aes_string(x = x, fill = y)) + 63 | geom_bar(position = "fill", alpha = 0.9) + 64 | coord_flip() 65 | } 66 | 67 | 68 | bc$Id <- NULL 69 | rev(names(bc)) 70 | # [1] "Class" "Mitoses" "Normal.nucleoli" 71 | # [4] "Bl.cromatin" "Bare.nuclei" "Epith.c.size" 72 | # [7] "Marg.adhesion" "Cell.shape" "Cell.size" 73 | # [10] "Cl.thickness" 74 | # [1] "Cl.thickness" "Cell.size" "Cell.shape" 75 | # [4] "Marg.adhesion" "Epith.c.size" "Bare.nuclei" 76 | # [7] "Bl.cromatin" "Normal.nucleoli" "Mitoses" 77 | # [10] "Class" 78 | 79 | explanatory <- rev(names(bc))[2:length(names(bc))] 80 | # [1] "Mitoses" "Normal.nucleoli" "Bl.cromatin" 81 | # [4] "Bare.nuclei" "Epith.c.size" "Marg.adhesion" 82 | # [7] "Cell.shape" "Cell.size" "Cl.thickness" 83 | target <- "Class" 84 | 85 | plotlist <- lapply(explanatory, function(x) catplot(bc, x, target)) 86 | 87 | library(cowplot) 88 | plot_grid(plotlist = plotlist) 89 | 90 | trainIDS <- createDataPartition(bc$Class, list=FALSE, p = 0.7) 91 | 92 | trainIDS 93 | 94 | training <- bc[trainIDS,] 95 | test <- bc[-trainIDS, ] 96 | 97 | table(bc$Class) / sum(table(bc$Class)) 98 | # 99 | # benign malignant 100 | # 0.6552217 0.3447783 101 | 102 | table(training$Class) / sum(table(training$Class)) 103 | # 104 | # benign malignant 105 | # 0.655102 0.344898 106 | table(test$Class) / sum(table(test$Class)) 107 | # 108 | # benign malignant 109 | # 0.6555024 0.3444976 110 | 111 | nrow(test) / nrow(bc) 112 | 113 | trainMethod <- trainControl(method = "cv", number = 3) 114 | # train, validation, test 115 | 116 | nb_fit <- train(Class ~ ., data = na.omit(training), 117 | trControl = trainMethod, 118 | method = "nb", 119 | tuneLength = 10 120 | ) 121 | nb_fit 122 | 123 | 124 | preds <- predict(nb_fit, newdata=test) 125 | 126 | preds 127 | 128 | test <- na.omit(test) 129 | 130 | length(preds) 131 | length(test$Class) 132 | 133 | # Naive Bayes 134 | # 135 | # 478 samples 136 | # 9 predictor 137 | # 2 classes: 'benign', 'malignant' 138 | # 139 | # No pre-processing 140 | # Resampling: Cross-Validated (3 fold) 141 | # Summary of sample sizes: 319, 318, 319 142 | # Resampling results across tuning parameters: 143 | # 144 | # usekernel Accuracy Kappa 145 | # FALSE NaN NaN 146 | # TRUE 0.9602463 0.9142473 147 | # 148 | # Tuning parameter 'fL' was held constant at a value of 0 149 | # Tuning 150 | # parameter 'adjust' was held constant at a value of 1 151 | # Accuracy was used to select the optimal model using the largest value. 152 | # The final values used for the model were fL = 0, usekernel = TRUE 153 | # and adjust = 1. 
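# The summaries pasted below also cover a Random Forest and a k-Nearest
# Neighbours model, but the train() calls that produced them are not shown in
# this script. A minimal sketch of how they could be fit with caret -- assuming
# the same `training` data and `trainMethod` control object defined above -- is:

rf_fit <- train(Class ~ ., data = na.omit(training),
                trControl = trainMethod,
                method = "rf",
                tuneLength = 10)

knn_fit <- train(Class ~ ., data = na.omit(training),
                 trControl = trainMethod,
                 method = "knn",
                 tuneLength = 10)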
154 | # Random Forest 155 | # 156 | # 478 samples 157 | # 9 predictor 158 | # 2 classes: 'benign', 'malignant' 159 | # 160 | # No pre-processing 161 | # Resampling: Cross-Validated (3 fold) 162 | # Summary of sample sizes: 319, 318, 319 163 | # Resampling results across tuning parameters: 164 | # 165 | # mtry Accuracy Kappa 166 | # 2 0.9581892 0.9083623 167 | # 10 0.9540225 0.8989660 168 | # 19 0.9519392 0.8943769 169 | # 28 0.9498428 0.8894839 170 | # 36 0.9540225 0.8988541 171 | # 45 0.9561059 0.9036828 172 | # 54 0.9581892 0.9084716 173 | # 62 0.9623690 0.9178711 174 | # 71 0.9623690 0.9178711 175 | # 80 0.9602725 0.9133694 176 | # 177 | # Accuracy was used to select the optimal model using the largest value. 178 | # The final value used for the model was mtry = 62. 179 | # k-Nearest Neighbors 180 | # 181 | # 478 samples 182 | # 9 predictor 183 | # 2 classes: 'benign', 'malignant' 184 | # 185 | # No pre-processing 186 | # Resampling: Cross-Validated (3 fold) 187 | # Summary of sample sizes: 319, 318, 319 188 | # Resampling results across tuning parameters: 189 | # 190 | # k Accuracy Kappa 191 | # 5 0.9330713 0.8478567 192 | # 7 0.9226022 0.8231873 193 | # 9 0.9142296 0.8029820 194 | # 11 0.9058569 0.7824233 195 | # 13 0.8953878 0.7565026 196 | # 15 0.8932914 0.7513188 197 | # 17 0.8870152 0.7354032 198 | # 19 0.8828223 0.7240868 199 | # 21 0.8786426 0.7124809 200 | # 23 0.8765592 0.7063262 201 | # 202 | 203 | plot(nb_fit) 204 | 205 | # Accuracy was used to select the optimal model using the largest value. 206 | # The final value used for the model was k = 5. 207 | # Naive Bayes 208 | # 209 | # 478 samples 210 | # 9 predictor 211 | # 2 classes: 'benign', 'malignant' 212 | # 213 | # No pre-processing 214 | # Resampling: Cross-Validated (3 fold) 215 | # Summary of sample sizes: 318, 319, 319 216 | # Resampling results across tuning parameters: 217 | # 218 | # usekernel Accuracy Kappa 219 | # FALSE NaN NaN 220 | # TRUE 0.9644392 0.9225881 221 | # 222 | # Tuning parameter 'fL' was held constant at a value of 0 223 | # Tuning 224 | # parameter 'adjust' was held constant at a value of 1 225 | # Accuracy was used to select the optimal model using the largest value. 226 | # The final values used for the model were fL = 0, usekernel = TRUE 227 | # and adjust = 1. 
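# With all three fits in hand, caret's resamples() gives a quick side-by-side
# view of the cross-validated results. This is only a sketch: it assumes the
# hypothetical rf_fit and knn_fit objects from the sketch above (only nb_fit
# is actually created in this script).

model_comparison <- resamples(list(nb = nb_fit, rf = rf_fit, knn = knn_fit))
summary(model_comparison)
dotplot(model_comparison)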
228 | 229 | 230 | conmat <- function(predicted, expected){ 231 | cm <- as.matrix(table(Actual = as.factor(expected), Predicted = predicted)) 232 | cm 233 | } 234 | 235 | cm <- table(preds, test$Class) 236 | # 237 | # preds benign malignant 238 | # benign 128 1 239 | # malignant 6 70 240 | 241 | 242 | accuracy <- sum(diag(cm)) / sum(as.matrix(cm)) 243 | # [1] 0.9704433 244 | 245 | precision <- diag(cm)[2] / rowSums(cm)[2] 246 | # malignant 247 | # 0.9577465 248 | # malignant 249 | # 0.9210526 250 | 251 | recall <- diag(cm)[2] / colSums(cm)[2] 252 | # malignant 253 | # 0.9577465 254 | 255 | specificity <- diag(cm)[1] / colSums(cm)[1] 256 | # benign 257 | # 0.9772727 258 | 259 | confusionMatrix(cm, positive='malignant') 260 | # Confusion Matrix and Statistics 261 | # 262 | # 263 | # preds benign malignant 264 | # benign 129 3 265 | # malignant 3 68 266 | # 267 | # Accuracy : 0.9704 268 | # 95% CI : (0.9368, 0.9891) 269 | # No Information Rate : 0.6502 270 | # P-Value [Acc > NIR] : <2e-16 271 | # 272 | # Kappa : 0.935 273 | # 274 | # Mcnemar's Test P-Value : 1 275 | # 276 | # Sensitivity : 0.9577 277 | # Specificity : 0.9773 278 | # Pos Pred Value : 0.9577 279 | # Neg Pred Value : 0.9773 280 | # Prevalence : 0.3498 281 | # Detection Rate : 0.3350 282 | # Detection Prevalence : 0.3498 283 | # Balanced Accuracy : 0.9675 284 | # 285 | # 'Positive' Class : malignant 286 | # 287 | # Confusion Matrix and Statistics 288 | # 289 | -------------------------------------------------------------------------------- /R/final.R: -------------------------------------------------------------------------------- 1 | library(mlbench) 2 | library(caret) 3 | library(mlr) 4 | library(tidyverse) 5 | library(ggthemes) 6 | library(gplots) 7 | library(randomForest) 8 | library(skimr) # skimr is sweet 9 | 10 | data(BostonHousing) 11 | head(BostonHousing) 12 | length(BostonHousing) 13 | str(BostonHousing) 14 | skim(BostonHousing) 15 | # Skim summary statistics 16 | # n obs: 506 17 | # n variables: 14 18 | # 19 | # ── Variable type:factor ────────────────────────────────────────────────── 20 | # variable missing complete n n_unique top_counts 21 | # chas 0 506 506 2 0: 471, 1: 35, NA: 0 22 | # rad 0 506 506 9 24: 132, 5: 115, 4: 110, 3: 38 23 | # ordered 24 | # FALSE 25 | # FALSE 26 | # 27 | # ── Variable type:numeric ───────────────────────────────────────────────── 28 | # variable missing complete n mean sd p0 p25 p50 29 | # age 0 506 506 68.57 28.15 2.9 45.02 77.5 30 | # b 0 506 506 356.67 91.29 0.32 375.38 391.44 31 | # crim 0 506 506 3.61 8.6 0.0063 0.082 0.26 32 | # dis 0 506 506 3.8 2.11 1.13 2.1 3.21 33 | # indus 0 506 506 11.14 6.86 0.46 5.19 9.69 34 | # lstat 0 506 506 12.65 7.14 1.73 6.95 11.36 35 | # medv 0 506 506 22.53 9.2 5 17.02 21.2 36 | # nox 0 506 506 0.55 0.12 0.39 0.45 0.54 37 | # ptratio 0 506 506 18.46 2.16 12.6 17.4 19.05 38 | # rm 0 506 506 6.28 0.7 3.56 5.89 6.21 39 | # tax 0 506 506 408.24 168.54 187 279 330 40 | # zn 0 506 506 11.36 23.32 0 0 0 41 | # p75 p100 hist 42 | # 94.07 100 ▁▂▂▂▂▂▃▇ 43 | # 396.23 396.9 ▁▁▁▁▁▁▁▇ 44 | # 3.68 88.98 ▇▁▁▁▁▁▁▁ 45 | # 5.19 12.13 ▇▅▃▃▂▁▁▁ 46 | # 18.1 27.74 ▃▆▅▁▁▇▁▁ 47 | # 16.96 37.97 ▆▇▆▅▂▁▁▁ 48 | # 25 50 ▂▅▇▆▂▂▁▁ 49 | # 0.62 0.87 ▇▆▇▆▃▅▁▁ 50 | # 20.2 22 ▁▂▂▂▅▅▇▃ 51 | # 6.62 8.78 ▁▁▂▇▇▂▁▁ 52 | # 666 711 ▃▇▂▅▁▁▁▆ 53 | # 12.5 100 ▇▁▁▁▁▁▁▁ 54 | 55 | 56 | ## Define a function using AES_STRING to put strings in ggplot objects 57 | scatterplotfun <- function(df, x,y){ 58 | ggplot(data = df, aes_string(x = x, y = y))+ geom_point() 59 | } 60 | 61 | ## Define the name of 
the Y variable 62 | yname <- "medv" 63 | 64 | # Get the names off the numeric x variables 65 | BostonNumeric <- BostonHousing %>% keep(is.numeric) 66 | xname <- names(BostonNumeric[-ncol(BostonNumeric)]) 67 | 68 | # ggplot with lapply 69 | lapply(xname, function(x) scatterplotfun(df = BostonNumeric, x = x, y = yname)) 70 | plist <- lapply(xname, function(x) scatterplotfun(df = BostonNumeric, x = x, y = yname)) 71 | 72 | library(cowplot) 73 | plot_grid(plotlist = plist) 74 | 75 | # make this a factor to show an example 76 | BostonHousing$rad <- as.factor(BostonHousing$rad) 77 | str(BostonHousing) 78 | # 'data.frame': 506 obs. of 14 variables: 79 | # $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ... 80 | # $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ... 81 | # $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ... 82 | # $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ... 83 | # $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ... 84 | # $ rm : num 6.58 6.42 7.18 7 7.15 ... 85 | # $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ... 86 | # $ dis : num 4.09 4.97 4.97 6.06 6.06 ... 87 | # $ rad : Factor w/ 9 levels "1","2","3","4",..: 1 2 2 3 3 3 5 5 5 5 ... 88 | # $ tax : num 296 242 242 222 222 222 311 311 311 311 ... 89 | # $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ... 90 | # $ b : num 397 397 393 395 397 ... 91 | # $ lstat : num 4.98 9.14 4.03 2.94 5.33 ... 92 | # $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ... 93 | # NULL 94 | 95 | 96 | # visualizing factor vs factor 97 | 98 | # reverse levels so we can read it in a human way 99 | # save column to vector 100 | chas <- BostonHousing$chas 101 | levels(chas) 102 | # [1] "0" "1" 103 | rev(levels(chas)) 104 | # [1] "1" "0" 105 | BostonHousing$chas <- factor(chas, levels = rev(levels(chas))) 106 | 107 | # position = "fill" is the key here 108 | # please make your plots look good and theme them consistently 109 | ggplot(BostonHousing, aes(x = rad, fill = chas)) + 110 | geom_bar(alpha = 0.9, position = "fill") + 111 | coord_flip() + 112 | labs(x = "rad", y = "Proportion", title = "Income bias based on Education", 113 | subtitle = "Stacked bar plot") + 114 | theme_hc() + scale_fill_hc() 115 | 116 | # coreelation plot function we defined last week 117 | library(corrplot) 118 | correlator <- function(df){ 119 | df %>% 120 | keep(is.numeric) %>% 121 | tidyr::drop_na() %>% 122 | cor %>% 123 | corrplot( addCoef.col = "white", number.digits = 2, 124 | number.cex = 0.5, method="square", 125 | order="hclust", title="Variable Corr Heatmap", 126 | tl.srt=45, tl.cex = 0.8) 127 | } 128 | 129 | 130 | BostonHousing %>%keep(is.numeric) %>%tidyr::drop_na() %>%cor 131 | 132 | ## interpret this 133 | correlator(BostonHousing) 134 | 135 | ## Now you do your LMs 136 | 137 | ## Useful variable importance plot 138 | ## Look at the plot on the left 139 | library(randomForest) 140 | rfreg <- randomForest(medv ~., data = BostonHousing, impotance = TRUE) 141 | varImpPlot(rfreg) 142 | 143 | 144 | 145 | # amazing library for categoricals/factors. 
No examples here but it rocks 146 | library(forcats) 147 | 148 | 149 | ## classification dataset 150 | 151 | data(BreastCancer) 152 | bc <- BreastCancer 153 | skim(bc) 154 | # Skim summary statistics 155 | # n obs: 699 156 | # n variables: 11 157 | # 158 | # ── Variable type:character ─────────────────────────────────────────────── 159 | # variable missing complete n min max empty n_unique 160 | # Id 0 699 699 5 8 0 645 161 | # 162 | # ── Variable type:factor ────────────────────────────────────────────────── 163 | # variable missing complete n n_unique 164 | # Bare.nuclei 16 683 699 10 165 | # Bl.cromatin 0 699 699 10 166 | # Cell.shape 0 699 699 10 167 | # Cell.size 0 699 699 10 168 | # Cl.thickness 0 699 699 10 169 | # Class 0 699 699 2 170 | # Epith.c.size 0 699 699 10 171 | # Marg.adhesion 0 699 699 10 172 | # Mitoses 0 699 699 9 173 | # Normal.nucleoli 0 699 699 10 174 | # top_counts ordered 175 | # 1: 402, 10: 132, 2: 30, 5: 30 FALSE 176 | # 2: 166, 3: 165, 1: 152, 7: 73 FALSE 177 | # 1: 353, 2: 59, 10: 58, 3: 56 TRUE 178 | # 1: 384, 10: 67, 3: 52, 2: 45 TRUE 179 | # 1: 145, 5: 130, 3: 108, 4: 80 TRUE 180 | # ben: 458, mal: 241, NA: 0 FALSE 181 | # 2: 386, 3: 72, 4: 48, 1: 47 TRUE 182 | # 1: 407, 2: 58, 3: 58, 10: 55 TRUE 183 | # 1: 579, 2: 35, 3: 33, 10: 14 FALSE 184 | # 1: 443, 10: 61, 3: 44, 2: 36 FALSE 185 | 186 | # 16 missing 187 | 188 | # get rid of ID column, get rid of NAs 189 | bc <- bc %>% select(-Id) %>% tidyr::drop_na() 190 | str(bc) 191 | # 'data.frame': 683 obs. of 10 variables: 192 | # $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ... 193 | # $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ... 194 | # $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ... 195 | # $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ... 196 | # $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ... 197 | # $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ... 198 | # $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ... 199 | # $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ... 200 | # $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ... 201 | # $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ... 202 | glimpse(bc) 203 | library(pander) 204 | skim(bc) %>% pander 205 | 206 | # practice aes string and categorical plot here 207 | 208 | # catplot = category plot 209 | catplot <- function(df, x,y){ 210 | ggplot(data = df, aes_string(x = x, fill = y)) + 211 | geom_bar(position = "fill", alpha = 0.9) + 212 | coord_flip() 213 | } 214 | 215 | ## Define the name of the Y variable 216 | yname <- "Class" 217 | 218 | # Get the names off the numeric x variables 219 | xname <- names(bc[-ncol(bc)]) 220 | 221 | # ggplot with lapply 222 | lapply(xname, function(x) catplot(df = bc, x = x, y = yname)) 223 | plist <- lapply(xname, function(x) catplot(df = bc, x = x, y = yname)) 224 | plist 225 | cowplot::plot_grid(plotlist = plist) 226 | 227 | # superassignment split practice 228 | split <- function(df, p = 0.75, list = FALSE, ...) 
{ 229 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 230 | cat("creating training dataset...\n") 231 | training <<- df[train_ind, ] 232 | cat("completed training dataset, creating test set\n") 233 | test <<- df[-train_ind, ] 234 | cat("done") 235 | } 236 | 237 | split(bc) 238 | 239 | 240 | # 241 | 242 | library(doParallel) 243 | cores <- parallel::detectCores() 244 | # [1] 12 245 | 246 | # Generally do one less 247 | workers <- makeCluster(11L) 248 | 249 | # register for parallel computation 250 | registerDoParallel(workers) 251 | 252 | # train method for optimized specificity 253 | trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE) 254 | 255 | 256 | # naive bayes classifier 257 | fit.nb <- train(Class ~ . data = training, method = "nb", metric = "Spec", trControl = trainMethod) 258 | 259 | # knn classifier 260 | fit.knn <- train(Class ~ . data = training, method = "knn", metric = "Spec", trControl = trainMethod) 261 | 262 | # learn more about caret 263 | browseURL("https://topepo.github.io/caret/index.html") 264 | # if this fails try: 265 | # getOption("browser") 266 | # options(browser = "firefox") 267 | -------------------------------------------------------------------------------- /R/preprocessing.R: -------------------------------------------------------------------------------- 1 | library(functional) # to compose the preprocessing pipeline 2 | library(data.table)# to read csvs FAST 3 | library(rlist) # for list manipulations 4 | library(pipeR) # fast, dumb pipes 5 | library(imputeTS) # to impute NAs 6 | library(pander) # so i can read the output 7 | library(foreach) # go fast 8 | library(doParallel) # go fast 9 | 10 | # Data import 11 | # datadir <- "../data/" 12 | # function which imports the data as a list, then fixes up the names to be nice 13 | import <- function(path){ 14 | # first we list the files in the directory we specify 15 | files <- list.files(path) 16 | # we fund csvs 17 | files <- files[grepl(files, pattern = ".csv")] 18 | # we paste the path to our filename 19 | filepaths <- sapply(files, function(x) paste0(path,x)) 20 | # We read in all the files 21 | out <- lapply(filepaths, fread) 22 | # we clean up the names 23 | fnames <- gsub(".csv","",files) 24 | fnames <- gsub("[[:digit:]]+","", fnames) 25 | names(out) <- fnames 26 | # return the list of data frames 27 | out 28 | } 29 | # datas <- import(datadir) 30 | 31 | # count the nas of a data frame 32 | 33 | # sum up the NAs and combine into a vector, note we are using percentages here 34 | count_nas_single <- function(df){ 35 | sapply(df, function(x) sum(is.na(x))/length(x)) 36 | 37 | } 38 | 39 | # count the NAs of a whole list 40 | count_nas <- function(xs){ 41 | lapply(xs, count_nas_single) 42 | } 43 | 44 | # pander(count_nas(datas)) 45 | # 46 | # 47 | # * **BeijingPM_**: 48 | # 49 | # --------------------------------------------------------------------- 50 | # No year month day hour season PM_Dongsi PM_Dongsihuan 51 | # ---- ------ ------- ----- ------ -------- ----------- --------------- 52 | # 0 0 0 0 0 0 0.5236 0.61 53 | # --------------------------------------------------------------------- 54 | # 55 | # Table: Table continues below 56 | # 57 | # 58 | # ---------------------------------------------------------------------------- 59 | # PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP 60 | # ----------------- ------------ ----------- ---------- ---------- ----------- 61 | # 0.5259 0.04178 9.509e-05 0.006447 0.006447 
9.509e-05 62 | # ---------------------------------------------------------------------------- 63 | # 64 | # Table: Table continues below 65 | # 66 | # 67 | # -------------------------------------------------- 68 | # cbwd Iws precipitation Iprec 69 | # ----------- ----------- --------------- ---------- 70 | # 9.509e-05 9.509e-05 0.009204 0.009204 71 | # -------------------------------------------------- 72 | # 73 | # * **ChengduPM_**: 74 | # 75 | # --------------------------------------------------------------------- 76 | # No year month day hour season PM_Caotangsi PM_Shahepu 77 | # ---- ------ ------- ----- ------ -------- -------------- ------------ 78 | # 0 0 0 0 0 0 0.5356 0.5323 79 | # --------------------------------------------------------------------- 80 | # 81 | # Table: Table continues below 82 | # 83 | # 84 | # -------------------------------------------------------------------------- 85 | # PM_US Post DEWP HUMI PRES TEMP cbwd Iws 86 | # ------------ --------- --------- ---------- --------- ---------- --------- 87 | # 0.4504 0.01006 0.01017 0.009908 0.01002 0.009908 0.01014 88 | # -------------------------------------------------------------------------- 89 | # 90 | # Table: Table continues below 91 | # 92 | # 93 | # ------------------------ 94 | # precipitation Iprec 95 | # --------------- -------- 96 | # 0.0562 0.0562 97 | # ------------------------ 98 | # 99 | # * **GuangzhouPM_**: 100 | # 101 | # -------------------------------------------------------------- 102 | # No year month day hour season PM_City Station 103 | # ---- ------ ------- ----- ------ ----------- ----------------- 104 | # 0 0 0 0 0 1.902e-05 0.3848 105 | # -------------------------------------------------------------- 106 | # 107 | # Table: Table continues below 108 | # 109 | # 110 | # ----------------------------------------------------------------------- 111 | # PM_5th Middle School PM_US Post DEWP HUMI PRES 112 | # ---------------------- ------------ ----------- ----------- ----------- 113 | # 0.5988 0.3848 1.902e-05 1.902e-05 1.902e-05 114 | # ----------------------------------------------------------------------- 115 | # 116 | # Table: Table continues below 117 | # 118 | # 119 | # --------------------------------------------------------------- 120 | # TEMP cbwd Iws precipitation Iprec 121 | # ----------- ----------- ----------- --------------- ----------- 122 | # 1.902e-05 1.902e-05 1.902e-05 1.902e-05 1.902e-05 123 | # --------------------------------------------------------------- 124 | # 125 | # * **ShanghaiPM_**: 126 | # 127 | # ----------------------------------------------------------------------------- 128 | # No year month day hour season PM_Jingan PM_US Post PM_Xuhui 129 | # ---- ------ ------- ----- ------ -------- ----------- ------------ ---------- 130 | # 0 0 0 0 0 0 0.5303 0.3527 0.521 131 | # ----------------------------------------------------------------------------- 132 | # 133 | # Table: Table continues below 134 | # 135 | # 136 | # ----------------------------------------------------------------------- 137 | # DEWP HUMI PRES TEMP cbwd Iws 138 | # ----------- ----------- ----------- ----------- ----------- ----------- 139 | # 0.0002472 0.0002472 0.0005325 0.0002472 0.0002282 0.0002282 140 | # ----------------------------------------------------------------------- 141 | # 142 | # Table: Table continues below 143 | # 144 | # 145 | # ------------------------- 146 | # precipitation Iprec 147 | # --------------- --------- 148 | # 0.07624 0.07624 149 | # ------------------------- 150 | # 
151 | # * **ShenyangPM_**: 152 | # 153 | # ---------------------------------------------------------------------- 154 | # No year month day hour season PM_Taiyuanjie PM_US Post 155 | # ---- ------ ------- ----- ------ -------- --------------- ------------ 156 | # 0 0 0 0 0 0 0.5362 0.5877 157 | # ---------------------------------------------------------------------- 158 | # 159 | # Table: Table continues below 160 | # 161 | # 162 | # -------------------------------------------------------------------------- 163 | # PM_Xiaoheyan DEWP HUMI PRES TEMP cbwd Iws 164 | # -------------- --------- --------- --------- --------- --------- --------- 165 | # 0.5317 0.01316 0.01293 0.01316 0.01316 0.01316 0.01316 166 | # -------------------------------------------------------------------------- 167 | # 168 | # Table: Table continues below 169 | # 170 | # 171 | # ------------------------ 172 | # precipitation Iprec 173 | # --------------- -------- 174 | # 0.2427 0.2427 175 | # ------------------------ 176 | # 177 | # 178 | # 179 | # 180 | # 181 | # NULL 182 | 183 | # convert a vector to a time series with the proper frequency (we will say years in this case) 184 | tots <- function(v){ 185 | ts(v, frequency = 365*24) 186 | } 187 | # tots(datas[[1]]$PM_US) %>>% tail 188 | 189 | # convert a data frame into a list of time series objects, given column names 190 | totslist <- function(df){ 191 | # vector of column names which we do not want to convert to a time series 192 | badlist <- c( 193 | "No", 194 | "year", 195 | "month", 196 | "day", 197 | "hour", 198 | "season", 199 | "cbwd" 200 | ) 201 | # names of out data frame 202 | nms <- colnames(df) 203 | # coerce to a list 204 | df <- as.list(df) 205 | # if the column at [[name]] is on our list, return it, otherwise, convert 206 | # to a time series. 
This allows us to deal with different amounts of data 207 | # collections of time series data (some series have more PM collecting 208 | # stations than others) 209 | for (name in nms){ 210 | if (name %in% badlist){ 211 | df[[name]] <- df[[name]] 212 | } else { 213 | df[[name]] <- tots(df[[name]]) 214 | } 215 | } 216 | df 217 | 218 | 219 | } 220 | # datas[[1]] %>>% totsdf %>>%str 221 | # datas[[1]] %>>% totslist%>>%str 222 | # turn all data frames in a list of data frames to time series objects 223 | totsall <- function(xs){ 224 | lapply(xs, totslist) 225 | } 226 | # str(datas[[1]]$PM_US) 227 | # datas %>>% totsall -> datas 228 | 229 | # impute NAs of a single list with spline interpolation 230 | # try na.ma but dont fail on error, instead just do standard type checking 231 | # if the output is a time series, impute the NAs, otherwise do nothing 232 | imp_test <- function(v){ 233 | out <- try(na.interpolation(v, "spline")) 234 | ifelse( 235 | is.ts(out), 236 | return(out), 237 | return(v) 238 | ) 239 | } 240 | # impute the NAs of a single list, keep column names (.final) 241 | impute <- function(xs){ 242 | foreach(i = 1:length(xs), 243 | .final = function(x){ 244 | setNames(x, names(xs)) 245 | }) %dopar% 246 | imp_test(xs[[i]]) 247 | } 248 | # example of parallel computing 249 | # cl <- makeCluster(11, type = "FORK") 250 | # registerDoParallel(cl) 251 | # na.ma(datas[[1]][["PM_"]], k=200) 252 | # na.interpolation(datas[[1]][["PM_Dongsi"]], "spline") %>>% head 253 | # impute(datas[[1]]) %>>% names 254 | 255 | # impute NAs of the parent list 256 | impute_list <- function(xs){ 257 | lapply(xs, impute) 258 | } 259 | 260 | # make a fast hash table 261 | # hash tables are an excellent and flexible way to store large amounts of data 262 | # can be indexed with $ and [[]] nicely 263 | # but the data is represented in a memory efficient way, an can be manipulated in complex ways 264 | # Think of it like a realy fast database for searching and inserting 265 | to_hash <- function(xs){ 266 | list2env(xs, envir = NULL, hash = TRUE) 267 | } 268 | # final preprocessing function: 269 | 270 | preprocess <- Compose(import, totsall, impute_list, to_hash) 271 | -------------------------------------------------------------------------------- /pres/biasAndInference.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bias and Inference in Machine Learning" 3 | author: "David" 4 | date: "`r Sys.Date()`" 5 | output: 6 | revealjs::revealjs_presentation: 7 | df_print: paged 8 | theme: white 9 | transition: zoom 10 | self_contained: false 11 | reveal_plugins: ["chalkboard"] 12 | reveal_options: 13 | chalkboard: 14 | theme: whiteboard 15 | --- 16 | 17 | ```{r setup, include = FALSE} 18 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) 19 | # for pretty data frames 20 | library(DT) 21 | # useful alias 22 | DT <- datatable 23 | ``` 24 | 25 | 26 | # Responsible Modeling 27 | 28 | ```{r zoolander, echo = FALSE} 29 | knitr::include_graphics("https://i.imgur.com/uYs6KsN.mp4") 30 | ``` 31 | 32 | ## Bias in AI: A massive problem 33 | 34 | >- Hidden racial bias in machine learning models 35 | >- Gender bias in credit card models (Steve Wozniak wife) 36 | >- Impossible screenings in job applications 37 | >- Any other examples? 
38 | >- We will all be affected (lots of models) 39 | 40 | 41 | ## Potential Solutions 42 | 43 | >- Highly technical options 44 | >- Simpler solutions for humans 45 | 46 | # EXAI: Machine Learning for Humans 47 | 48 | ## Explainable AI 49 | 50 | >- But how can "AI" be explainable? 51 | >- How does this reduce bias? 52 | >- How does this affect my case study?? 53 | 54 | ## (Less important) Less Frustrating Code: Introducing MLR 55 | 56 | >- Uniform API for pretty much all R models 57 | >- Obviously outside of deep learning 58 | >- Readable code is more transparent, better, and arguably more ethical 59 | 60 | # Code walkthrough: Regression 61 | 62 | ## Load in necessary packages {.smaller} 63 | 64 | 65 | ```{r packs, echo = c(-1,-4)} 66 | library(DALEX) # for data for now 67 | library(mlr) # what we are interested in! 68 | data(apartments) 69 | DT(apartments, class = 'compact') 70 | ``` 71 | 72 | 73 | ## Create a task 74 | 75 | ```{r taskSetup} 76 | regr_task <- makeRegrTask(data = apartments, target = "m2.price") 77 | ``` 78 | 79 | >- Tasks allow us to pursue a machine learning task! 80 | >- Specify whether we want to do regression, classification, etc 81 | 82 | 83 | ## Create Learners 84 | 85 | ```{r reglrn} 86 | regr_lrn_knn <- makeLearner("regr.kknn") 87 | regr_lrn_lm <- makeLearner("regr.lm") 88 | regr_lrn_rf <- makeLearner("regr.ranger") 89 | ``` 90 | 91 | ## Get parameter set to optimize! 92 | 93 | ```{r getPars, results = "hold"} 94 | getParamSet(regr_lrn_knn) 95 | getParamSet(regr_lrn_rf) 96 | getParamSet(regr_lrn_lm) 97 | ``` 98 | 99 | ## Tune KNN! 100 | 101 | ```{r} 102 | knn_paramSet <- makeParamSet( 103 | makeIntegerParam("k", lower = 3, upper = 30) 104 | ) 105 | rdesc = makeResampleDesc("CV", iters = 3L) 106 | ctrl <- makeTuneControlGrid() 107 | res <- tuneParams(regr_lrn_knn, 108 | task = regr_task, 109 | resampling=rdesc, 110 | measures = mse, 111 | par.set = knn_paramSet, 112 | control = ctrl) 113 | 114 | res 115 | ``` 116 | 117 | ## Explore KNN tuning! 118 | 119 | ```{r} 120 | khpe <- generateHyperParsEffectData(res) 121 | plotHyperParsEffect(khpe, x = "k", y = "mse.test.mean", plot.type = "line") 122 | ``` 123 | 124 | ## Tune random forest 125 | 126 | We will only tune one parameter, just because y'all havent actually studied this 127 | 128 | ```{r} 129 | rf_paramSet <- makeParamSet( 130 | makeIntegerParam("num.trees", lower = 10, upper = 100, trafo = function(x) 10*x) 131 | ) 132 | res2 <- tuneParams(regr_lrn_rf, 133 | task = regr_task, 134 | resampling=rdesc, 135 | measures = mse, 136 | par.set = rf_paramSet, 137 | control = ctrl) 138 | 139 | res2 140 | ``` 141 | 142 | ## Explore again! 143 | 144 | ```{r} 145 | 146 | rfhpe <- generateHyperParsEffectData(res2, trafo = TRUE) 147 | plotHyperParsEffect(rfhpe, x = "num.trees", y = "mse.test.mean", plot.type = "line") 148 | ``` 149 | 150 | ## Set hyperparameters for model! 151 | 152 | ```{r} 153 | knn_regr_tuned <- setHyperPars(regr_lrn_knn, k = res$x$k) 154 | rf_regr_tuned <- setHyperPars(regr_lrn_rf, num.trees = 600) 155 | ``` 156 | 157 | >- note we can refer around the res object 158 | >- also note we can set them ourselves! 159 | >- always always plot your tuning so you can find a simple model 160 | 161 | ## Train Models! 162 | 163 | ```{r} 164 | model_lm <- train(regr_lrn_lm, regr_task) 165 | model_knn <- train(knn_regr_tuned, regr_task) 166 | model_rf <- train(rf_regr_tuned, regr_task) 167 | ``` 168 | 169 | ## Make predictions!! 
170 | 171 | ```{r} 172 | lm_preds <- predict(model_lm, newdata = apartmentsTest) 173 | knn_preds <- predict(model_knn, newdata = apartmentsTest) 174 | rf_preds <- predict(model_rf, newdata = apartmentsTest) 175 | data.frame("lm" = lm_preds$data$response, 176 | "knn" = knn_preds$data$response, 177 | "rf" = rf_preds$data$response) 178 | ``` 179 | 180 | ## Make this "explainable": introducing DALEX 181 | 182 | ```{r, results = "hide"} 183 | library(DALEX) 184 | library(DALEXtra) 185 | library(ingredients) 186 | lm_explained <- explain_mlr(model_lm, 187 | data = apartmentsTest, 188 | y = apartmentsTest$m2.price, 189 | label = "lm") 190 | knn_explained <- explain_mlr(model_knn, 191 | data = apartmentsTest, 192 | y = apartmentsTest$m2.price, 193 | label = "knn") 194 | rf_explained <- explain_mlr(model_rf, 195 | data = apartmentsTest, 196 | y = apartmentsTest$m2.price, 197 | label = "rf") 198 | explainers <- list(lm_explained, knn_explained, rf_explained) 199 | ``` 200 | 201 | ## Model performance with DALEX 202 | ```{r, fig.show = "hold", echo = 1:3} 203 | model_perfs <- lapply(explainers, model_performance) 204 | p1 <- plot(model_perfs[[1]], model_perfs[[2]], model_perfs[[3]]) 205 | p2 <- plot(model_perfs[[1]], model_perfs[[2]], model_perfs[[3]], geom = "boxplot") 206 | ``` 207 | 208 | ## Model Performance with DALEX 209 | ```{r, echo = F} 210 | cowplot::plot_grid(p1, p2) 211 | ``` 212 | 213 | ## Variable Importance 214 | 215 | >- Typically available in tree-based models such as rf and boosting 216 | >- But we can do it for any model with permutations 217 | >- How does it work? 218 | >- Calculate model scores after slightly altering a single variable 219 | >- repeat 220 | 221 | ## Variable Importance 222 | 223 | ```{r} 224 | library(ingredients) 225 | model_vis <- lapply(explainers, function(x) feature_importance(x, loss_function = loss_root_mean_square, type = "difference")) 226 | 227 | ``` 228 | 229 | ## Variable Importance 230 | 231 | ```{r, echo = FALSE} 232 | plot(model_vis[[1]], model_vis[[2]], model_vis[[3]]) 233 | ``` 234 | 235 | ## Partial Dependence 236 | 237 | Show the relationship between continuos variables and model outcomes! 238 | 239 | 240 | ## Partial Dependence 241 | 242 | 243 | ```{r} 244 | pdps <- lapply(explainers, function(x) partial_dependency(x, variables = "construction.year")) 245 | ``` 246 | 247 | ## Partial Dependence 248 | 249 | ```{r, echo = F} 250 | plot(pdps[[1]], pdps[[2]], pdps[[3]]) 251 | ``` 252 | 253 | ## Merging Paths! 254 | 255 | Show the relationship between a categorical variable and model response! 256 | 257 | 258 | ```{r} 259 | mpps <- lapply(explainers, function(x) variable_response(x, variable = "district", type = "factor")) 260 | ``` 261 | 262 | ## Merging Paths! 263 | 264 | ```{r, echo = F} 265 | plot(mpps[[1]], mpps[[2]], mpps[[3]]) 266 | ``` 267 | 268 | ## Holy Grail funnel plot 269 | 270 | ```{r} 271 | funnel <- funnel_measure(rf_explained, list(knn_explained, lm_explained), 272 | partition_data = cbind(apartmentsTest, 273 | "m2.per.room" = apartmentsTest$surface/apartmentsTest$no.rooms), 274 | nbins = 5, measure_function = loss_root_mean_square, show_info = TRUE) 275 | ``` 276 | 277 | ## Holy Grail Funnel plot 278 | 279 | ```{r, echo = FALSE} 280 | plot(funnel) 281 | ``` 282 | 283 | 284 | # Classification now! 
285 | 286 | ## Step 1: Data 287 | 288 | ```{r} 289 | data(wine, package = "breakDown") 290 | wine$quality <- ifelse(wine$quality>5, 1, 0) 291 | wine$quality <- factor(wine$quality) 292 | train_index <- sample(1:nrow(wine), 0.6 * nrow(wine)) 293 | test_index <- setdiff(1:nrow(wine), train_index) 294 | wineTrain = wine[train_index,] 295 | wineTest <- wine[test_index,] 296 | ``` 297 | 298 | ## Step 2: Task and Learner definition 299 | 300 | ```{r} 301 | class_task <- makeClassifTask(data = wineTrain, target = "quality") 302 | 303 | classif_knn <- makeLearner("classif.kknn", predict.type = "prob") 304 | classif_nb <- makeLearner("classif.naiveBayes", predict.type = "prob") 305 | classif_rf <- makeLearner("classif.ranger", predict.type = "prob") 306 | ``` 307 | 308 | ## Step 3, find params to tune 309 | 310 | ```{r} 311 | lapply(list(classif_knn, classif_nb, classif_rf), getParamSet) 312 | ``` 313 | 314 | ## Step 4, set up global tuning values 315 | 316 | ```{r} 317 | rdesc = makeResampleDesc("CV", iters = 3L) 318 | ctrl <- makeTuneControlRandom(maxit = 20L) 319 | ``` 320 | 321 | ## Tune KNN 322 | 323 | ```{r} 324 | knn_paramSet <- makeParamSet( 325 | makeIntegerParam("k", lower = 3, upper = 40) 326 | ) 327 | 328 | res <- tuneParams(classif_knn, 329 | task = class_task, 330 | resampling=rdesc, 331 | measures = list(tnr, mlr::acc, tpr), 332 | par.set = knn_paramSet, 333 | control = ctrl) 334 | ``` 335 | 336 | ## Look at tuning! 337 | 338 | ```{r} 339 | khpe <- generateHyperParsEffectData(res) 340 | spec <- plotHyperParsEffect(khpe, x = "k", y = "tnr.test.mean", plot.type = "line") 341 | sens <- plotHyperParsEffect(khpe, x = "k", y = "tpr.test.mean", plot.type = "line") 342 | accu <- plotHyperParsEffect(khpe, x = "k", y = "acc.test.mean", plot.type = "line") 343 | ``` 344 | 345 | ## Look at tuning! 346 | 347 | ```{r, echo = F} 348 | cowplot::plot_grid(accu, sens, spec) 349 | ``` 350 | 351 | ## Define Tuned Model 352 | 353 | ```{r} 354 | knn_tuned <- setHyperPars(classif_knn, k = 7) 355 | ``` 356 | 357 | ## Tune Naive Bayes 358 | 359 | The version of naive bayes implemented in mlr honestly sucks, I would use the klaR version, which will ALSO work with DALEX. We will just use default naive bayes, and you can definitely tune the klaR nb yourself. 360 | 361 | ## Tune random forest 362 | 363 | ```{r} 364 | rf_paramSet <- makeParamSet( 365 | makeIntegerParam("num.trees", lower = 10, upper = 100, trafo = function(x) 10*x) 366 | ) 367 | ctrl2 <- makeTuneControlGrid() 368 | res2 <- tuneParams(classif_rf, 369 | task = class_task, 370 | resampling=rdesc, 371 | measures = list(tnr, mlr::acc, tpr), 372 | par.set = rf_paramSet, 373 | control = ctrl2) 374 | 375 | res2 376 | ``` 377 | 378 | 379 | ## Look at tuning! 380 | 381 | ```{r} 382 | rfpe <- generateHyperParsEffectData(res2, trafo = TRUE) 383 | spec <- plotHyperParsEffect(rfpe, x = "num.trees", y = "tnr.test.mean", plot.type = "line") 384 | sens <- plotHyperParsEffect(rfpe, x = "num.trees", y = "tpr.test.mean", plot.type = "line") 385 | accu <- plotHyperParsEffect(rfpe, x = "num.trees", y = "acc.test.mean", plot.type = "line") 386 | ``` 387 | 388 | ## Look at tuning! 
389 | 390 | ```{r, echo = F} 391 | cowplot::plot_grid(accu, sens, spec) 392 | ``` 393 | 394 | ## Define Tuned Model 395 | 396 | ```{r} 397 | rf_tuned <- setHyperPars(classif_rf, num.trees = 240) 398 | ``` 399 | 400 | 401 | ## Train models 402 | 403 | ```{r} 404 | models <- list(rf_tuned, knn_tuned, classif_nb) 405 | trained <- lapply(models, function(x) train(x, class_task)) 406 | model_rf <- trained[[1]] 407 | model_knn <- trained[[2]] 408 | model_nb <- trained[[3]] 409 | ``` 410 | 411 | ## Make predictions 412 | 413 | ```{r} 414 | nb_preds <- predict(model_nb, newdata = wineTest) 415 | knn_preds <- predict(model_knn, newdata = wineTest) 416 | rf_preds <- predict(model_rf, newdata = wineTest) 417 | ``` 418 | 419 | Confusion matrices are left as an exercise ;) 420 | 421 | ## Explain models! 422 | 423 | 424 | ```{r, results = "hide"} 425 | nb_explained <- explain_mlr(model_nb, 426 | data = wineTest, 427 | y = wineTest$quality, 428 | label = "nb") 429 | knn_explained <- explain_mlr(model_knn, 430 | data = wineTest, 431 | y = wineTest$quality, 432 | label = "knn") 433 | rf_explained <- explain_mlr(model_rf, 434 | data = wineTest, 435 | y = wineTest$quality, 436 | label = "rf") 437 | explainers2 <- list(nb_explained, knn_explained, rf_explained) 438 | ``` 439 | 440 | ## Model Performance: residuals 441 | 442 | ```{r} 443 | perfs <- lapply(explainers2, model_performance) 444 | plot1 <- plot(perfs[[1]], perfs[[2]], perfs[[3]]) 445 | plot2 <- plot(perfs[[1]], perfs[[2]], perfs[[3]], geom = "boxplot") 446 | ``` 447 | 448 | ## Model Performance: residuals 449 | 450 | ```{r, echo = F} 451 | cowplot::plot_grid(plot1, plot2) 452 | ``` 453 | 454 | ## More things: display the effect of whichever variables you like 455 | 456 | ```{r} 457 | selected_wines <- select_sample(wineTrain, n = 100) 458 | cps <- lapply(explainers2, function(x) ceteris_paribus(x, selected_wines)) 459 | pdps_sulph_alcohol <- lapply(cps, function(x) aggregate_profiles(x, variables = c("sulphates", "alcohol"))) 460 | pdp_plots <- lapply(pdps_sulph_alcohol, plot) 461 | ``` 462 | 463 | ## Localized Variable response! 464 | 465 | ```{r, echo = F} 466 | cowplot::plot_grid(plotlist = pdp_plots) 467 | ``` 468 | 469 | # Everything Else... 470 | 471 | ## Left as an exercise :) 472 | 473 | >- There are a ton of other things to explore here, especially regarding classification and the true positive and true negative rates. Please refer to the manual 474 | 475 | ## Sources and resources for students: 476 | 477 | >- [Manual for explainable AI in R and python, START HERE](https://pbiecek.github.io/PM_VEE/modelPerformance.html#modelPerformanceIntro) 478 | >- [DrWhy.AI, START HERE FOR PACKAGES](https://github.com/ModelOriented/DrWhy) 479 | >- [DALEX vignette](https://raw.githack.com/pbiecek/DALEX_docs/master/vignettes/DALEX_mlr.html#3_classification_use_case_-_wine_data) 480 | >- [DALEX documentation](https://modeloriented.github.io/DALEX/) 481 | >- [DALEXtra docs](https://modeloriented.github.io/DALEXtra/index.html) 482 | >- Please note that when you move on from R, DALEX works with Python too, as well as Keras and deep learning! This is an important topic to be aware of 483 | >- [AMAZING mlr docs](https://mlr.mlr-org.com/articles/tutorial/task.html) 484 | >- This is a super important topic, which I have BARELY scratched today. If you are going to fall into a rabbit hole over the break, this is where you should go.
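## Appendix: permutation importance by hand

`feature_importance()` does all of this for you (with repetitions and nice plots), but the idea from the variable importance slides fits in a few lines. A bare-bones sketch, not evaluated here; `predict_fun` is a hypothetical stand-in for however your model produces numeric predictions, and you would drop the response column from `data` before calling it:

```{r, eval = FALSE}
rmse <- function(truth, pred) sqrt(mean((truth - pred)^2))

permutation_importance <- function(model, data, y, predict_fun) {
  baseline <- rmse(y, predict_fun(model, data))
  vapply(names(data), function(v) {
    shuffled <- data
    shuffled[[v]] <- sample(shuffled[[v]])            # permute one column
    rmse(y, predict_fun(model, shuffled)) - baseline  # increase in loss = importance
  }, numeric(1))
}
```

Average the result over several shuffles if you want stable numbers; that is roughly what `feature_importance()` reports.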
485 | -------------------------------------------------------------------------------- /.Rhistory: -------------------------------------------------------------------------------- 1 | conmat <- function(predicted, expected){ 2 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 3 | cm 4 | } 5 | f1_score <- function(predicted, expected, positive.class="1") { 6 | cm = conmat(predicted, expected) 7 | precision <- diag(cm) / colSums(cm) 8 | recall <- diag(cm) / rowSums(cm) 9 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 10 | #Assuming that F1 is zero when it's not possible compute it 11 | f1[is.na(f1)] <- 0 12 | #Binary F1 or Multi-class macro-averaged F1 13 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 14 | } 15 | accuracy <- function(predicted, expected){ 16 | cm <- confusionMatrix(predicted, expected) 17 | sum(diag(cm)/length(test$class)) 18 | } 19 | cm <- conmat(predicted, expected) 20 | accuracy <- function(predicted, expected){ 21 | cm <- conmat(predicted, expected) 22 | sum(diag(cm)/length(test$class)) 23 | } 24 | f1_score(test_pred, test$class) 25 | f1_score(test_pred, test$class) 26 | f1_score(test_pred, test) 27 | get_scores <- function(predictions, test){ 28 | f1 <- f1_score(predictions,test) 29 | acc <- accuracy(predictions,test) 30 | scores <- c(accuracy = acc, f1 = f1) 31 | scores 32 | } 33 | pander(get_scores(test_pred, test)) 34 | conmat <- function(predicted, expected){ 35 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 36 | cm 37 | } 38 | f1_score <- function(predicted, expected, positive.class="1") { 39 | cm = conmat(predicted, expected) 40 | precision <- diag(cm) / colSums(cm) 41 | recall <- diag(cm) / rowSums(cm) 42 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 43 | #Assuming that F1 is zero when it's not possible compute it 44 | f1[is.na(f1)] <- 0 45 | #Binary F1 or Multi-class macro-averaged F1 46 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 47 | } 48 | accuracy <- function(predicted, expected){ 49 | cm <- conmat(predicted, expected) 50 | sum(diag(cm)/length(test$class)) 51 | } 52 | get_scores <- function(predictions, test){ 53 | f1 <- f1_score(predictions,test) 54 | acc <- accuracy(predictions,test) 55 | scores <- c(accuracy = acc, f1 = f1) 56 | scores 57 | } 58 | pander(get_scores(test_pred, test)) 59 | nb_fit <- train(training, 60 | training$class, 61 | trControl = trainMethod, 62 | method = "nb", 63 | tuneLength = 10 64 | ) 65 | nb_fit <- train(training, 66 | training$class, 67 | trControl = trainMethod, 68 | method = "nb", 69 | tuneLength = 10 70 | ) 71 | ``` 72 | nb_fit <- train(training, 73 | training$class, 74 | trControl = trainMethod, 75 | method = "nb", 76 | tuneLength = 10 77 | ) 78 | nb_fit 79 | plot(nb_fit) 80 | nb_pred <- predict(nb_fit, test) 81 | nb_pred <- predict(nb_fit, newdata = test) 82 | nb_pred 83 | confusionMatrix(nb_pred, test$class) 84 | get_scores(nb_pred, test) 85 | fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 86 | y <- train$class 87 | x <- train[-1] 88 | y <- trainng$class 89 | x <- training[-1] 90 | y <- training$class 91 | x <- training[-1] 92 | fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 93 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 94 | dist 95 | fast_nb_fit <- fastNaiveBayes.mixed(x,y) 96 | fast_nb_fit 97 | summary(fast_nb_fit) 98 | plot(fast_nb_fit) 99 | fast_pred <- predict(fast_nb_fit, test[-1]) 100 | 
fast_pred 101 | confusionMatrix(fast_pred, test$class) 102 | get_scores(fast_pred, test) 103 | confusionMatrix(fast_pred, test$class) 104 | get_scores(fast_pred, test) 105 | library(caret) 106 | library(fastNaiveBayes) 107 | library(readr) 108 | library(functional) 109 | library(ggplot2) 110 | library(magrittr) 111 | library(doParallel) 112 | library(caret) 113 | library(fastNaiveBayes) 114 | library(readr) 115 | library(functional) 116 | library(ggplot2) 117 | library(magrittr) 118 | library(caret) 119 | library(fastNaiveBayes) 120 | library(readr) 121 | library(functional) 122 | library(ggplot2) 123 | library(magrittr) 124 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 125 | wine <- read_csv(dataurl, col_names = F) 126 | good_cols <- c("class", 127 | "alcohol", 128 | 'malic_acid', 129 | 'ash', 130 | 'alkalinity', 131 | 'magnesium', 132 | 'total_phenols', 133 | 'flavanoids', 134 | 'nonflavonoids_phenols', 135 | 'proanthocyanins', 136 | 'color_intensity', 137 | 'hue', 138 | 'dilution', 139 | 'proline' 140 | ) 141 | fix_cols <- function(df){ 142 | colnames(df) <- good_cols 143 | df$class <- as.factor(df$class) 144 | df 145 | } 146 | wine <- fix_cols(wine) 147 | wine 148 | set.seed(3033) 149 | ## WARNING: Danger function 150 | split <- function(df, p = 0.75, list = FALSE, ...) { 151 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 152 | cat("creating training dataset...\n") 153 | training <<- df[train_ind, ] 154 | cat("completed training dataset, creating test set\n") 155 | test <<- df[-train_ind, ] 156 | cat("done") 157 | } 158 | split(wine) 159 | library(doParallel) 160 | numcores <- parallel::detectCores() -1 161 | cl <- makePSOCKcluster(numcores) 162 | registerDoParallel(cl) 163 | set.seed(3333) 164 | trainMethod <- trainControl(method = "repeatedcv", 165 | number = 10, 166 | repeats = 3) 167 | knn_fit <- train(class ~ ., 168 | data = training, 169 | method = "knn", 170 | trControl = trainMethod, 171 | preProcess = c("center", "scale"), 172 | tuneLength = 10) 173 | library(pander) 174 | knn_fit 175 | plot(knn_fit) 176 | knn_fit2 <- knn3(training, training$class, k = 15) 177 | knn_fit2 178 | test_pred <- predict(knn_fit, newdata = test) 179 | test_pred 180 | confusionMatrix(test_pred, test$class) 181 | conmat <- function(predicted, expected){ 182 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 183 | cm 184 | } 185 | f1_score <- function(predicted, expected, positive.class="1") { 186 | cm = conmat(predicted, expected) 187 | precision <- diag(cm) / colSums(cm) 188 | recall <- diag(cm) / rowSums(cm) 189 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 190 | #Assuming that F1 is zero when it's not possible compute it 191 | f1[is.na(f1)] <- 0 192 | #Binary F1 or Multi-class macro-averaged F1 193 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 194 | } 195 | accuracy <- function(predicted, expected){ 196 | cm <- conmat(predicted, expected) 197 | sum(diag(cm)/length(test$class)) 198 | } 199 | get_scores <- function(predictions, test){ 200 | f1 <- f1_score(predictions,test) 201 | acc <- accuracy(predictions,test) 202 | scores <- c(accuracy = acc, f1 = f1) 203 | scores 204 | } 205 | pander(get_scores(test_pred, test)) 206 | nb_fit <- train(training, 207 | training$class, 208 | trControl = trainMethod, 209 | method = "nb", 210 | tuneLength = 10 211 | ) 212 | nb_fit 213 | plot(nb_fit) 214 | nb_pred <- predict(nb_fit, newdata = test) 215 | nb_pred 216 | 
confusionMatrix(nb_pred, test$class) 217 | get_scores(nb_pred, test) 218 | y <- training$class 219 | x <- training[-1] 220 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 221 | dist 222 | fast_nb_fit <- fastNaiveBayes.mixed(x,y) 223 | fast_nb_fit 224 | summary(fast_nb_fit) 225 | fast_pred <- predict(fast_nb_fit, test[-1]) 226 | fast_pred 227 | confusionMatrix(fast_pred, test$class) 228 | get_scores(fast_pred, test) 229 | nb_fit <- train(training, 230 | training$class, 231 | trControl = trainMethod, 232 | method = "naivebayes", 233 | tuneLength = 10 234 | ) 235 | nb_fit <- train(training, 236 | training$class, 237 | trControl = trainMethod, 238 | method = "naive_bayes", 239 | tuneLength = 10 240 | ) 241 | nb_fit 242 | plot(nb_fit) 243 | nb_pred <- predict(nb_fit, newdata = test) 244 | nb_pred 245 | confusionMatrix(nb_pred, test$class) 246 | get_scores(nb_pred, test) 247 | rm(list = ls()) 248 | library(caret) 249 | library(fastNaiveBayes) 250 | library(readr) 251 | library(functional) 252 | library(ggplot2) 253 | library(magrittr) 254 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 255 | wine <- read_csv(dataurl, col_names = F) 256 | wine 257 | good_cols <- c("class", 258 | "alcohol", 259 | 'malic_acid', 260 | 'ash', 261 | 'alkalinity', 262 | 'magnesium', 263 | 'total_phenols', 264 | 'flavanoids', 265 | 'nonflavonoids_phenols', 266 | 'proanthocyanins', 267 | 'color_intensity', 268 | 'hue', 269 | 'dilution', 270 | 'proline' 271 | ) 272 | fix_cols <- function(df){ 273 | colnames(df) <- good_cols 274 | df$class <- as.factor(df$class) 275 | df 276 | } 277 | wine <- fix_cols(wine) 278 | wine 279 | set.seed(3033) 280 | ## WARNING: Danger function 281 | split <- function(df, p = 0.75, list = FALSE, ...) { 282 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 283 | cat("creating training dataset...\n") 284 | training <- df[train_ind, ] 285 | cat("completed training dataset, creating test set\n") 286 | test <- df[-train_ind, ] 287 | cat("done") 288 | } 289 | split(wine) 290 | x = 2 291 | plus1 <- function() { 292 | x =4 293 | x 294 | } 295 | plus1 296 | plus1() 297 | x 298 | plus1 <- function() { 299 | x <<- 4 300 | x 301 | } 302 | plus1() 303 | x 304 | set.seed(3033) 305 | ## WARNING: Danger function 306 | split <- function(df, p = 0.75, list = FALSE, ...) { 307 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 308 | cat("creating training dataset...\n") 309 | training <- df[train_ind, ] 310 | cat("completed training dataset, creating test set\n") 311 | test <- df[-train_ind, ] 312 | cat("done") 313 | } 314 | split(wine) 315 | set.seed(3033) 316 | ## WARNING: Danger function 317 | split <- function(df, p = 0.75, list = FALSE, ...) { 318 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 319 | cat("creating training dataset...\n") 320 | training <<- df[train_ind, ] 321 | cat("completed training dataset, creating test set\n") 322 | test <<- df[-train_ind, ] 323 | cat("done") 324 | } 325 | split(wine) 326 | square <- function(var){ 327 | x = x^2 328 | x 329 | } 330 | z = 4 331 | square(z) 332 | zsquared <- square(z) 333 | zsquared 334 | ## WARNING: Danger function 335 | split <- function(df, p = 0.75, list = FALSE, ...) 
{ 336 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 337 | cat("creating training dataset...\n") 338 | training <<- df[train_ind, ] 339 | cat("completed training dataset, creating test set\n") 340 | test <<- df[-train_ind, ] 341 | cat("done") 342 | } 343 | split(wine) 344 | library(doParallel) 345 | parallel::detectCores() 346 | numcores <- parallel::detectCores() - 1 347 | cl <- makePSOCKcluster(numcores) 348 | registerDoParallel(cl) 349 | set.seed(3333) 350 | set.seed(3333) 351 | trainMethod <- trainControl(method = "repeatedcv", 352 | number = 10, 353 | repeats = 3) 354 | # k-folds cross validation 355 | # y ~ x 356 | knn_fit <- train(class ~ ., 357 | data = training, 358 | method = "knn", 359 | trControl = trainMethod, 360 | preProcess = c("center", "scale"), 361 | tuneLength = 10) 362 | knn_fit 363 | plot(knn_fit) 364 | set.seed(3333) 365 | trainMethod <- trainControl(method = "repeatedcv", 366 | number = 10, 367 | repeats = 3) 368 | # k-folds cross validation 369 | # y ~ x 370 | knn_fit <- train(class ~ ., 371 | data = training, 372 | method = "knn", 373 | trControl = trainMethod, 374 | preProcess = c("center", "scale"), 375 | tuneLength = 10) 376 | knn_fit 377 | set.seed(3033) 378 | ## WARNING: Danger function 379 | split <- function(df, p = 0.75, list = FALSE, ...) { 380 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 381 | cat("creating training dataset...\n") 382 | training <<- df[train_ind, ] 383 | cat("completed training dataset, creating test set\n") 384 | test <<- df[-train_ind, ] 385 | cat("done") 386 | } 387 | split(wine) 388 | set.seed(3333) 389 | trainMethod <- trainControl(method = "repeatedcv", 390 | number = 10, 391 | repeats = 3) 392 | # k-folds cross validation 393 | # y ~ x 394 | knn_fit <- train(class ~ ., 395 | data = training, 396 | method = "knn", 397 | trControl = trainMethod, 398 | preProcess = c("center", "scale"), 399 | tuneLength = 10) 400 | knn_fit 401 | plot(knn_fit) 402 | knn_fit2 <- knn3(training, training$class, k = 15) 403 | knn_fit2 404 | test_pred <- predict(knn_fit, newdata = test) 405 | test_pred 406 | test_pred2 <- predict(knn_fit2, newdata = test) 407 | test_pred2 408 | confusionMatrix(test_pred, test$class) 409 | confusionMatrix(test_pred2, test$class) 410 | knn_fit2 <- knn3(training, training$class, k = 15, prob = FALSE) 411 | knn_fit2 412 | test_pred2 <- predict(knn_fit2, newdata = test) 413 | test_pred2 414 | test_pred2 <- predict(knn_fit2, newdata = test, prob = F) 415 | test_pred2 416 | confusionMatrix(test_pred2, test$class) 417 | confusionMatrix(test_pred, test$class) 418 | conmat <- function(predicted, expected){ 419 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 420 | cm 421 | } 422 | f1_score <- function(predicted, expected, positive.class="1") { 423 | cm = conmat(predicted, expected) 424 | precision <- diag(cm) / colSums(cm) 425 | recall <- diag(cm) / rowSums(cm) 426 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 427 | #Assuming that F1 is zero when it's not possible compute it 428 | f1[is.na(f1)] <- 0 429 | #Binary F1 or Multi-class macro-averaged F1 430 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 431 | } 432 | accuracy <- function(predicted, expected){ 433 | cm <- conmat(predicted, expected) 434 | sum(diag(cm)/length(test$class)) 435 | } 436 | get_scores <- function(predictions, test){ 437 | f1 <- f1_score(predictions,test) 438 | acc <- accuracy(predictions,test) 439 | scores <- c(accuracy = acc, f1 = 
f1) 440 | scores 441 | } 442 | pander(get_scores(test_pred, test)) 443 | conmat(test_pred, test) 444 | conmat <- function(predicted, expected){ 445 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted)) 446 | cm 447 | } 448 | conmat(test_pred, test) 449 | f1_score <- function(predicted, expected, positive.class="1") { 450 | cm = conmat(predicted, expected) 451 | precision <- diag(cm) / colSums(cm) 452 | recall <- diag(cm) / rowSums(cm) 453 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall)) 454 | #Assuming that F1 is zero when it's not possible compute it 455 | f1[is.na(f1)] <- 0 456 | #Binary F1 or Multi-class macro-averaged F1 457 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1)) 458 | } 459 | accuracy <- function(predicted, expected){ 460 | cm <- conmat(predicted, expected) 461 | sum(diag(cm)/length(test$class)) 462 | } 463 | get_scores <- function(predictions, test){ 464 | f1 <- f1_score(predictions,test) 465 | acc <- accuracy(predictions,test) 466 | scores <- c(accuracy = acc, f1 = f1) 467 | scores 468 | } 469 | pander(get_scores(test_pred, test)) 470 | nb_fit <- train(training, 471 | training$class, 472 | trControl = trainMethod, 473 | method = "naive_bayes", 474 | tuneLength = 10 475 | ) 476 | nb_fit 477 | plot(nb_fit) 478 | nb_fit <- train(training, 479 | training$class, 480 | trControl = trainMethod, 481 | method = "naive_bayes", 482 | tuneLength = 10 483 | ) 484 | nb_fit 485 | plot(nb_fit) 486 | nb_fit <- train(training, 487 | training$class, 488 | trControl = trainMethod, 489 | method = "nb", 490 | tuneLength = 10 491 | ) 492 | nb_fit 493 | plot(nb_fit) 494 | nb_pred <- predict(nb_fit, newdata = test) 495 | nb_pred 496 | confusionMatrix(nb_pred, test$class) 497 | get_scores(nb_pred, test) 498 | library(fastNaiveBayes) 499 | y <- training$class 500 | x <- training[-1] 501 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x)) 502 | dist 503 | fast_nb_fit <- fastNaiveBayes.mixed(x,y) 504 | fast_nb_fit 505 | fast_pred <- predict(fast_nb_fit, test[-1]) 506 | fast_pred 507 | fast_pred <- predict(fast_nb_fit, newdata = test) 508 | fast_pred <- predict(fast_nb_fit, newdata = test[-1]) 509 | fast_pred 510 | confusionMatrix(fast_pred, test$class) 511 | get_scores(fast_pred, test) 512 | stopCluster(cl) 513 | -------------------------------------------------------------------------------- /R/EDAreg.R: -------------------------------------------------------------------------------- 1 | # regression EDA 2 | data(mtcars) 3 | 4 | mtcars 5 | # mpg cyl disp hp drat wt qsec vs am gear carb 6 | # Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 7 | # Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 8 | # Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 9 | # Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 10 | # Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 11 | # Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 12 | # Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 13 | # Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 14 | # Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 15 | # Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 16 | # Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 17 | # Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 18 | # Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 19 | # Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 20 | # Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 21 | # Lincoln Continental 
10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 22 | # Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 23 | # Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 24 | # Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 25 | # Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 26 | # Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 27 | # Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 28 | # AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 29 | # Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 30 | # Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 31 | # Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 32 | # Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 33 | # Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 34 | # Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 35 | # Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 36 | # Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 37 | # Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 38 | mtcars[c("cyl","vs","am","gear","carb")] 39 | # cyl vs am gear carb 40 | # Mazda RX4 6 0 1 4 4 41 | # Mazda RX4 Wag 6 0 1 4 4 42 | # Datsun 710 4 1 1 4 1 43 | # Hornet 4 Drive 6 1 0 3 1 44 | # Hornet Sportabout 8 0 0 3 2 45 | # Valiant 6 1 0 3 1 46 | # Duster 360 8 0 0 3 4 47 | # Merc 240D 4 1 0 4 2 48 | # Merc 230 4 1 0 4 2 49 | # Merc 280 6 1 0 4 4 50 | # Merc 280C 6 1 0 4 4 51 | # Merc 450SE 8 0 0 3 3 52 | # Merc 450SL 8 0 0 3 3 53 | # Merc 450SLC 8 0 0 3 3 54 | # Cadillac Fleetwood 8 0 0 3 4 55 | # Lincoln Continental 8 0 0 3 4 56 | # Chrysler Imperial 8 0 0 3 4 57 | # Fiat 128 4 1 1 4 1 58 | # Honda Civic 4 1 1 4 2 59 | # Toyota Corolla 4 1 1 4 1 60 | # Toyota Corona 4 1 0 3 1 61 | # Dodge Challenger 8 0 0 3 2 62 | # AMC Javelin 8 0 0 3 2 63 | # Camaro Z28 8 0 0 3 4 64 | # Pontiac Firebird 8 0 0 3 2 65 | # Fiat X1-9 4 1 1 4 1 66 | # Porsche 914-2 4 0 1 5 2 67 | # Lotus Europa 4 1 1 5 2 68 | # Ford Pantera L 8 0 1 5 4 69 | # Ferrari Dino 6 0 1 5 6 70 | # Maserati Bora 8 0 1 5 8 71 | # Volvo 142E 4 1 1 4 2 72 | str(mtcars) 73 | # 'data.frame': 32 obs. of 11 variables: 74 | # $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ... 75 | # $ cyl : num 6 6 4 6 8 6 8 4 4 6 ... 76 | # $ disp: num 160 160 108 258 360 ... 77 | # $ hp : num 110 110 93 110 175 105 245 62 95 123 ... 78 | # $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ... 79 | # $ wt : num 2.62 2.88 2.32 3.21 3.44 ... 80 | # $ qsec: num 16.5 17 18.6 19.4 17 ... 81 | # $ vs : num 0 0 1 1 0 1 0 1 1 1 ... 82 | # $ am : num 1 1 1 0 0 0 0 0 0 0 ... 83 | # $ gear: num 4 4 4 3 3 3 3 4 4 4 ... 84 | # $ carb: num 4 4 1 1 2 1 4 2 2 4 ... 
85 | # NULL 86 | mtcars[c("cyl","vs","am","gear","carb")] <- lapply(mtcars[c("cyl","vs","am","gear","carb")], as.factor) 87 | hist(mtcars$mpg) 88 | 89 | # facet 90 | 91 | 92 | str(mtcars) 93 | 94 | library(tidyverse); library(ggthemes); library(cowplot) # ggthemes provides theme_few/scale_fill_few/theme_fivethirtyeight, cowplot provides plot_grid 95 | mtcars %>% keep(is.numeric) %>% gather %>% ggplot(aes(x = value)) + facet_wrap(~key, scales = "free")+ geom_histogram() 96 | plotAllNumeric <- function(df){ 97 | df %>% keep(is.numeric) %>% 98 | gather() %>% 99 | ggplot(aes(value)) + 100 | facet_wrap(~ key, scales = "free") + 101 | geom_density()+geom_histogram() + theme_fivethirtyeight() 102 | } 103 | 104 | plotAllNumeric(mtcars) 105 | 106 | library(RColorBrewer) 107 | library(gplots) 108 | 109 | 110 | # heatmap 111 | 112 | my_palette <- colorRampPalette(c("red", "white", "black")) 113 | heatmapper <- function(df){ 114 | df %>% 115 | keep(is.numeric) %>% 116 | tidyr::drop_na() %>% 117 | cor %>% 118 | heatmap.2(col = my_palette , 119 | density.info = "none", trace = "none", 120 | dendrogram = c("both"), symm = F, 121 | symkey = T, symbreaks = T, scale = "none", 122 | key = T) 123 | } 124 | 125 | data(iris) 126 | heatmapper(iris) 127 | heatmapper(mtcars) 128 | 129 | 130 | 131 | library(corrplot) 132 | 133 | ?corrplot 134 | correlator <- function(df){ 135 | df %>% 136 | keep(is.numeric) %>% 137 | tidyr::drop_na() %>% 138 | cor %>% 139 | corrplot( addCoef.col = "white", number.digits = 2, 140 | number.cex = 0.5, method="square", 141 | order="hclust", title="Variable Corr Heatmap", 142 | tl.srt=45, tl.cex = 0.8) 143 | } 144 | correlator(mtcars) 145 | 146 | 147 | 148 | # Categorical variables 149 | # box plots (assign them so plot_grid below has something to arrange) 150 | mtcars %>% keep(is.factor) %>% names -> label 151 | # [1] "cyl" "vs" "am" "gear" "carb" 152 | p <- ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = cyl)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() 153 | p1 <- ggplot(data = mtcars, aes(x = vs, y = mpg, fill = vs)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() 154 | p2 <- ggplot(data = mtcars, aes(x = am, y = mpg, fill = am)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() 155 | p3 <- ggplot(data = mtcars, aes(x = gear, y = mpg, fill = gear)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() 156 | p4 <- ggplot(data = mtcars, aes(x = carb, y = mpg, fill = carb)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few() 157 | 158 | plot_grid(p,p1,p2,p3,p4, ncol = 3, labels = label) 159 | 160 | 161 | p <-ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = cyl)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few() 162 | p1<-ggplot(data = mtcars, aes(x = vs, y = mpg, fill = vs)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few() 163 | p2<-ggplot(data = mtcars, aes(x = am, y = mpg, fill = am)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few() 164 | p3<-ggplot(data = mtcars, aes(x = gear, y = mpg, fill = gear)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few() 165 | p4<-ggplot(data = mtcars, aes(x = carb, y = mpg, fill = carb)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few() 166 | 167 | plot_grid(p,p1,p2,p3,p4, ncol = 3, labels = label) 168 | 169 | 170 | # disp or weight = eliminate one 171 | # maybe eliminate gear or lm 172 | model1 <- lm(data = mtcars, mpg ~.) 173 | summary(model1) 174 | # 175 | # Call: 176 | # lm(formula = mpg ~ ., data = mtcars) 177 | # 178 | # Residuals: 179 | # Min 1Q Median 3Q Max 180 | # -3.5087 -1.3584 -0.0948 0.7745 4.6251 181 | # 182 | # Coefficients: 183 | # Estimate Std.
Error t value 184 | # (Intercept) 23.87913 20.06582 1.190 185 | # cyl6 -2.64870 3.04089 -0.871 186 | # cyl8 -0.33616 7.15954 -0.047 187 | # disp 0.03555 0.03190 1.114 188 | # hp -0.07051 0.03943 -1.788 189 | # drat 1.18283 2.48348 0.476 190 | # wt -4.52978 2.53875 -1.784 191 | # qsec 0.36784 0.93540 0.393 192 | # vs1 1.93085 2.87126 0.672 193 | # am1 1.21212 3.21355 0.377 194 | # gear4 1.11435 3.79952 0.293 195 | # gear5 2.52840 3.73636 0.677 196 | # carb2 -0.97935 2.31797 -0.423 197 | # carb3 2.99964 4.29355 0.699 198 | # carb4 1.09142 4.44962 0.245 199 | # carb6 4.47757 6.38406 0.701 200 | # carb8 7.25041 8.36057 0.867 201 | # Pr(>|t|) 202 | # (Intercept) 0.2525 203 | # cyl6 0.3975 204 | # cyl8 0.9632 205 | # disp 0.2827 206 | # hp 0.0939 . 207 | # drat 0.6407 208 | # wt 0.0946 . 209 | # qsec 0.6997 210 | # vs1 0.5115 211 | # am1 0.7113 212 | # gear4 0.7733 213 | # gear5 0.5089 214 | # carb2 0.6787 215 | # carb3 0.4955 216 | # carb4 0.8096 217 | # carb6 0.4938 218 | # carb8 0.3995 219 | # --- 220 | # Signif. codes: 221 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 222 | # 0.1 ‘ ’ 1 223 | # 224 | # Residual standard error: 2.833 on 15 degrees of freedom 225 | # Multiple R-squared: 0.8931, Adjusted R-squared: 0.779 226 | # F-statistic: 7.83 on 16 and 15 DF, p-value: 0.000124 227 | # 228 | 229 | mtcars2 <- mtcars %>% keep(is.numeric) 230 | mtcars2$disp <- NULL 231 | 232 | model2 <- lm(data = mtcars2, mpg~.) 233 | 234 | summary(model2) 235 | # 236 | # Call: 237 | # lm(formula = mpg ~ ., data = mtcars2) 238 | # 239 | # Residuals: 240 | # Min 1Q Median 3Q Max 241 | # -3.5775 -1.6626 -0.3417 1.1317 5.4422 242 | # 243 | # Coefficients: 244 | # Estimate Std. Error t value 245 | # (Intercept) 19.25970 10.31545 1.867 246 | # hp -0.01784 0.01476 -1.209 247 | # drat 1.65710 1.21697 1.362 248 | # wt -3.70773 0.88227 -4.202 249 | # qsec 0.52754 0.43285 1.219 250 | # Pr(>|t|) 251 | # (Intercept) 0.072785 . 252 | # hp 0.237319 253 | # drat 0.184561 254 | # wt 0.000259 *** 255 | # qsec 0.233470 256 | # --- 257 | # Signif. codes: 258 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 259 | # 0.1 ‘ ’ 1 260 | # 261 | # Residual standard error: 2.539 on 27 degrees of freedom 262 | # Multiple R-squared: 0.8454, Adjusted R-squared: 0.8225 263 | # F-statistic: 36.91 on 4 and 27 DF, p-value: 1.408e-10 264 | # 265 | 266 | 267 | mtcars3 <- mtcars %>% keep(is.numeric) 268 | 269 | model3 <- lm(data = mtcars3, mpg~.) 270 | summary(model3) 271 | # 272 | # Call: 273 | # lm(formula = mpg ~ ., data = mtcars3) 274 | # 275 | # Residuals: 276 | # Min 1Q Median 3Q Max 277 | # -3.5404 -1.6701 -0.4264 1.1320 5.4996 278 | # 279 | # Coefficients: 280 | # Estimate Std. Error t value 281 | # (Intercept) 16.53357 10.96423 1.508 282 | # disp 0.00872 0.01119 0.779 283 | # hp -0.02060 0.01528 -1.348 284 | # drat 2.01578 1.30946 1.539 285 | # wt -4.38546 1.24343 -3.527 286 | # qsec 0.64015 0.45934 1.394 287 | # Pr(>|t|) 288 | # (Intercept) 0.14362 289 | # disp 0.44281 290 | # hp 0.18936 291 | # drat 0.13579 292 | # wt 0.00158 ** 293 | # qsec 0.17523 294 | # --- 295 | # Signif. 
codes: 296 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 297 | # 0.1 ‘ ’ 1 298 | # 299 | # Residual standard error: 2.558 on 26 degrees of freedom 300 | # Multiple R-squared: 0.8489, Adjusted R-squared: 0.8199 301 | # F-statistic: 29.22 on 5 and 26 DF, p-value: 6.892e-10 302 | # 303 | 304 | 305 | # exploring with base R and lapply 306 | # modify for your own data, this is geared for mtcars 307 | # for example train$income ~ train[[x]] instead of mtcars 308 | plot_vs_response <- function(x){ 309 | plot(mtcars$mpg ~ mtcars[[x]], xlab = x) 310 | lw1 <- loess(mtcars$mpg ~ mtcars[[x]]) 311 | j <- order(mtcars[[x]]) 312 | lines(mtcars[[x]][j],lw1$fitted[j],col="red",lwd=3) 313 | } 314 | mtcars %>% keep(is.numeric) %>% names -> numNames 315 | numNames 316 | # [1] "mpg" "disp" "hp" "drat" "wt" "qsec" 317 | # remove mpg 318 | numNames <- numNames[-1] 319 | length(numNames) 320 | # [1] 5 321 | # set up graphical parameters: 322 | 323 | par(mfrow = c(2,3)) 324 | # plot all numeric variables as x vs response with lapply 325 | # works like 326 | lapply(numNames, plot_vs_response) 327 | 328 | # how do you interpret this? remember how wt and disp are highly correlated?? 329 | 330 | 331 | 332 | # Classification EDA(light example) 333 | 334 | 335 | library(caret) 336 | library(fastNaiveBayes) 337 | library(readr) 338 | library(functional) 339 | library(ggplot2) 340 | library(magrittr) 341 | library(tidyverse) 342 | 343 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data" 344 | 345 | wine <- read_csv(dataurl, col_names = F) 346 | good_cols <- c("class", 347 | "alcohol", 348 | 'malic_acid', 349 | 'ash', 350 | 'alkalinity', 351 | 'magnesium', 352 | 'total_phenols', 353 | 'flavanoids', 354 | 'nonflavonoids_phenols', 355 | 'proanthocyanins', 356 | 'color_intensity', 357 | 'hue', 358 | 'dilution', 359 | 'proline' 360 | ) 361 | 362 | fix_cols <- function(df){ 363 | colnames(df) <- good_cols 364 | df$class <- as.factor(df$class) 365 | df 366 | } 367 | wine <- fix_cols(wine) 368 | glimpse(wine) 369 | 370 | 371 | set.seed(3033) 372 | ## WARNING: Danger function 373 | split <- function(df, p = 0.75, list = FALSE, ...) 
{ 374 | train_ind <- createDataPartition(df[[1]], p = p, list = list) 375 | cat("creating training dataset...\n") 376 | training <<- df[train_ind, ] 377 | cat("completed training dataset, creating test set\n") 378 | test <<- df[-train_ind, ] 379 | cat("done") 380 | } 381 | 382 | split(wine) 383 | 384 | ggplot(data = wine, aes(x = malic_acid, fill = class)) + geom_density() 385 | ggplot(data = wine, aes(x = alkalinity, fill = class)) + geom_density() 386 | ggplot(data = wine, aes(x = ash, fill = class)) + geom_density() 387 | ggplot(data = wine, aes(x = magnesium, fill = class)) + geom_density() 388 | 389 | 390 | library(doParallel) 391 | numcores <- parallel::detectCores() - 1 392 | cl <- makePSOCKcluster(numcores) 393 | registerDoParallel(cl) 394 | 395 | 396 | set.seed(3333) 397 | trainMethod <- trainControl(method = "repeatedcv", 398 | number = 10, 399 | repeats = 3) 400 | # k-folds cross validation 401 | # y ~ x 402 | # use train to do a grid search for best model, see 403 | # https://topepo.github.io/caret/model-training-and-tuning.html#model-training-and-parameter-tuning 404 | knn_fit <- train(class ~ ., 405 | data = training, 406 | method = "knn", 407 | trControl = trainMethod, 408 | preProcess = c("center", "scale"), 409 | tuneLength = 10) 410 | 411 | knn_fit 412 | # k-Nearest Neighbors 413 | # 414 | # 135 samples 415 | # 13 predictor 416 | # 3 classes: '1', '2', '3' 417 | # 418 | # Pre-processing: 419 | # centered (13), scaled (13) 420 | # Resampling: Cross-Validated (10 fold, repeated 3 times) 421 | # Summary of sample sizes: 121, 122, 122, 121, 121, 121, ... 422 | # Resampling results across tuning parameters: 423 | # 424 | # k Accuracy Kappa 425 | # 5 0.9700549 0.9548756 426 | # 7 0.9676740 0.9516351 427 | # 9 0.9609280 0.9418362 428 | # 11 0.9579426 0.9370280 429 | # 13 0.9702686 0.9552588 430 | # 15 0.9722527 0.9579543 431 | # 17 0.9752442 0.9625294 432 | # 19 0.9681013 0.9519242 433 | # 21 0.9726496 0.9588742 434 | # 23 0.9726496 0.9589829 435 | # 436 | # Accuracy was used to select the optimal 437 | # model using the 438 | # largest value. 439 | # The final value used 440 | # for the model was k = 17. 441 | 442 | 443 | plot(knn_fit) 444 | 445 | 446 | test_pred <- predict(knn_fit, newdata = test) 447 | test_pred 448 | 449 | 450 | confusionMatrix(test_pred, test$class) 451 | 452 | # try with different predictors as per your EDA 453 | # an idea to do this programmatically: try writing a function and then lapplying over all the different iterations (may require %dopar% or mclapply instead) 454 | -------------------------------------------------------------------------------- /pres/html-scraping.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "HTML Scraping in R" 3 | author: "David Josephs" 4 | output: html_document 5 | --- 6 | 7 | 8 | # Lord of the Rings Example 9 | 10 | ## Setup 11 | 12 | First, lets load up two libraries which will make our life easier. First is rvest, a great library for reading HTML; it is basically an extension of the xml2 package. It has some easy syntax and is quite helpful going forwards. 13 | 14 | The second one is, in my opinion, one of the most useful libraries for doing any sort of data science or data analysis in R: the tidyverse. Just google it and see the documentation; it is a set of packages in which all of the functions have similar APIs and arguments, allowing for consistency throughout our programming.
Examples from the tidyverse are: readr (data loading), dplyr (data analysis/cleaning/general utility), tidyr (more data cleaning and reshaping), and ggplot2 (data viz); caret (machine learning) is not technically part of the tidyverse, but it plays nicely with it. 15 | 16 | 17 | ```r 18 | library(rvest) 19 | library(tidyverse) 20 | ``` 21 | 22 | Next lets load up our data. In this example we will be looking at the IMDB page for Lord of the Rings. So we will assign a variable to the url of the page we are interested in: 23 | 24 | 25 | ```r 26 | lotr <- 'https://www.imdb.com/title/tt0120737/fullcredits?ref_=tt_cl_sm#cast' 27 | ``` 28 | 29 | ## Reading the data 30 | 31 | ### The pipe operator 32 | 33 | Before we can read in the data, lets first learn about `%>%` pipes. A pipe basically says: take the thing on the left, and make it the (first) argument of the function on the right. For example, lets say we want to take the mean of every column of mtcars, the classic R example dataset. We can do that with: 34 | 35 | 36 | ```r 37 | lapply(mtcars,mean) 38 | ``` 39 | 40 | ``` 41 | ## $mpg 42 | ## [1] 20.09062 43 | ## 44 | ## $cyl 45 | ## [1] 6.1875 46 | ## 47 | ## $disp 48 | ## [1] 230.7219 49 | ## 50 | ## $hp 51 | ## [1] 146.6875 52 | ## 53 | ## $drat 54 | ## [1] 3.596563 55 | ## 56 | ## $wt 57 | ## [1] 3.21725 58 | ## 59 | ## $qsec 60 | ## [1] 17.84875 61 | ## 62 | ## $vs 63 | ## [1] 0.4375 64 | ## 65 | ## $am 66 | ## [1] 0.40625 67 | ## 68 | ## $gear 69 | ## [1] 3.6875 70 | ## 71 | ## $carb 72 | ## [1] 2.8125 73 | ``` 74 | 75 | This is mapping the mean function over every column of the mtcars dataset. Now, the output of this is not very pretty, so we will turn it into a nice, horizontal data frame: 76 | 77 | 78 | ```r 79 | as.data.frame(lapply(mtcars,mean)) 80 | ``` 81 | 82 | ``` 83 | ## mpg cyl disp hp drat wt qsec vs 84 | ## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 85 | ## am gear carb 86 | ## 1 0.40625 3.6875 2.8125 87 | ``` 88 | 89 | Still ugly. Lets use the pander library (load it first with `library(pander)`) to make this look nice: 90 | 91 | 92 | ```r 93 | pander(as.data.frame(lapply(mtcars,mean))) 94 | ``` 95 | 96 | 97 | ----------------------------------------------------------------------------------------- 98 | mpg cyl disp hp drat wt qsec vs am gear carb 99 | ------- ------- ------- ------- ------- ------- ------- -------- -------- ------- ------- 100 | 20.09 6.188 230.7 146.7 3.597 3.217 17.85 0.4375 0.4062 3.688 2.812 101 | ----------------------------------------------------------------------------------------- 102 | 103 | Much better, but look at how many parentheses we wrote, and how difficult this is to read. Imagine if we had 4 or 5 more steps. We would have to repeatedly assign things to new variables, and keep putting things in our computer's memory just to have readable code. Even then, if we assigned a variable on every step, someone reviewing your code would end up having to know 20 or so lines of code above, just to understand the final printing line. This leads to errors and is not reproducible. Instead, lets try it with the pipe operator.
Mathematically, `x %>% f(y)` is the same as `f(x, y)`, if that helps: 104 | 105 | 106 | ```r 107 | mtcars %>% lapply(mean) %>% as.data.frame %>% pander 108 | ``` 109 | 110 | 111 | ----------------------------------------------------------------------------------------- 112 | mpg cyl disp hp drat wt qsec vs am gear carb 113 | ------- ------- ------- ------- ------- ------- ------- -------- -------- ------- ------- 114 | 20.09 6.188 230.7 146.7 3.597 3.217 17.85 0.4375 0.4062 3.688 2.812 115 | ----------------------------------------------------------------------------------------- 116 | 117 | This reads from left to right (as we English speakers are in the habit of doing): 118 | First, we take the mtcars dataset. Then, we apply the mean function onto every column of the dataset, outputting into the form of a list. We then turn the list, which is hard to read, into a nice flat data frame, and then we pretty up the data frame in a final step. This is the pipe operator. 119 | 120 | ### Actually reading in the data 121 | 122 | So, with our knowledge of the pipe operator, what can we do? Lets use rvest functions to turn the raw XML and/or HTML data into something nice and human readable. 123 | 124 | First, lets read in the website: 125 | 126 | 127 | ```r 128 | # not run 129 | read_html(lotr) 130 | ``` 131 | 132 | Next lets choose all the tables (we know all of our data is in tables) in the raw data, with the `html_nodes()` function: 133 | 134 | 135 | ```r 136 | read_html(lotr) %>% html_nodes("table") 137 | ``` 138 | 139 | Next, lets choose the right table. By looking at the website, we know that the third table contains the info on the cast. To choose the third table of an unnamed object, we are going to have to use the `.` operator, which is just a placeholder for the thing on the left of the pipe. 140 | 141 | 142 | 143 | ```r 144 | read_html(lotr) %>% html_nodes("table") %>% .[[3]] 145 | ``` 146 | 147 | #### An Aside on lists 148 | Why did we do `[[]]`? 149 | This is because html_nodes outputs a list, and there are three ways we can get items out of a list. `$` extracts a named item and keeps that item's own type. `[]` lets us index the list, but the output is always itself a list (the element stays wrapped). Third, we have `[[]]`, which lets us index the list and get the element back with its proper data type. Experiment with this by using the following list as well as the built in `typeof()` function. 150 | 151 | 152 | ```r 153 | x <- list("char" = c("cat","dog"), "nest" = list((1:3),2:4), "int" = 4:5, "logical" = c(T,F,T,F), "float" = c(87.5, -962.4)) 154 | ``` 155 | 156 | ### Back to Business 157 | 158 | Now that we understand what `.[[3]]` is doing, we can extract the full dataset: 159 | 160 | 161 | ```r 162 | read_html(lotr) %>% html_nodes("table") %>% .[[3]] %>% html_table -> cast 163 | (head(cast)) 164 | ``` 165 | 166 | ``` 167 | ## X1 X2 X3 X4 168 | ## 1 169 | ## 2 Alan Howard ... Voice of the Ring \n \n \n (voice) 170 | ## 3 Noel Appleby ... Everard Proudfoot 171 | ## 4 Sean Astin ... Sam 172 | ## 5 Sala Baker ... Sauron 173 | ## 6 Sean Bean ... Boromir 174 | ``` 175 | 176 | Great. Now that process was pretty painful, and took a lot of typing, and in the future we may not know which table we are looking for, so lets write a nice little function to do this all in one step: 177 | 178 | 179 | 180 | ```r 181 | tablescraper <- function(url, item){ 182 | read_html(url) %>% html_nodes("table") %>% .
[[item]] %>% html_table -> out 183 | return(out) 184 | } 185 | ``` 186 | 187 | Now that we have a nice function, we can iteratively search through the IMDB site: 188 | 189 | 190 | ```r 191 | tablescraper(lotr,1) %>% head 192 | ``` 193 | 194 | ``` 195 | ## X1 X2 X3 196 | ## 1 Peter Jackson NA NA 197 | ``` 198 | 199 | ```r 200 | tablescraper(lotr,2) %>% head 201 | ``` 202 | 203 | ``` 204 | ## X1 X2 X3 205 | ## 1 J.R.R. Tolkien ... (novel) 206 | ## 2 207 | ## 3 Fran Walsh ... (screenplay) & 208 | ## 4 Philippa Boyens ... (screenplay) & 209 | ## 5 Peter Jackson ... (screenplay) 210 | ``` 211 | 212 | ```r 213 | tablescraper(lotr,3) -> cast 214 | ``` 215 | 216 | We can even imagine, for a large project, just writing a for loop to do all of this. 217 | Next, lets write a little helper to peek at the ends of cast (note the comment: as written, only the last expression is returned, so we only see the tail): 218 | 219 | 220 | ```r 221 | ht <- function(x,...){ 222 | head(x,...) # careful: a function returns only its last expression, so this head() is computed and discarded; wrap it in print() if you want to see both ends 223 | tail(x,...) 224 | } 225 | ht(cast,10) 226 | ``` 227 | 228 | ``` 229 | ## X1 X2 X3 230 | ## 124 Chris Ryan ... 231 | ## 125 Paul Shapcott ... 232 | ## 126 Samuel E. Shore ... 233 | ## 127 Mike Stearne ... 234 | ## 128 Andrew Stehlin ... 235 | ## 129 Ken Stratton ... 236 | ## 130 Jo Surgison ... 237 | ## 131 James Waterhouse-Brown ... 238 | ## 132 Tim Wong ... 239 | ## 133 Robert Young ... 240 | ## X4 241 | ## 124 Breelander \n \n \n (uncredited) 242 | ## 125 Burning Ringwraith \n \n \n (uncredited) 243 | ## 126 Refugee / \n Orc \n \n \n (uncredited) 244 | ## 127 Uruk-hai \n \n \n (uncredited) 245 | ## 128 Uruk-hai \n \n \n (uncredited) 246 | ## 129 Isengard Orc / \n Last Alliance Soldier / \n Morgul Orc / \n Uruk-hai \n \n \n (uncredited) 247 | ## 130 Hobbit \n \n \n (uncredited) 248 | ## 131 Goblin \n \n \n (uncredited) 249 | ## 132 Uruk-hai \n \n \n (uncredited) 250 | ## 133 Goblin \n \n \n (uncredited) 251 | ``` 252 | 253 | ***NOTE***: the `...` in our function allows for extra arguments. We do this so we can throw in the extra parameter, `10`, which changes head and tail to showing the first and last 10 instead of the first and last 6 items. 254 | 255 | ## Cleaning the data 256 | Wow, this data is a mess. The first thing we see is that the first row is entirely blank, and then that the first and third columns are completely empty. Lets get rid of that: 257 | 258 | 259 | ```r 260 | cast <- cast[-1,] 261 | cast$X1 <- NULL 262 | cast$X3 <- NULL 263 | ht(cast) 264 | ``` 265 | 266 | ``` 267 | ## X2 268 | ## 128 Andrew Stehlin 269 | ## 129 Ken Stratton 270 | ## 130 Jo Surgison 271 | ## 131 James Waterhouse-Brown 272 | ## 132 Tim Wong 273 | ## 133 Robert Young 274 | ## X4 275 | ## 128 Uruk-hai \n \n \n (uncredited) 276 | ## 129 Isengard Orc / \n Last Alliance Soldier / \n Morgul Orc / \n Uruk-hai \n \n \n (uncredited) 277 | ## 130 Hobbit \n \n \n (uncredited) 278 | ## 131 Goblin \n \n \n (uncredited) 279 | ## 132 Uruk-hai \n \n \n (uncredited) 280 | ## 133 Goblin \n \n \n (uncredited) 281 | ``` 282 | 283 | Next, lets rename the columns with dplyr: 284 | 285 | 286 | ```r 287 | cast %>% rename(Actor = X2, Character = X4) -> cast 288 | ``` 289 | 290 | Looking better. Now, we know from the IMDB website that the table contains "Rest of cast listed alphabetically:", so lets get rid of that. To do this, we are going to use `grepl()`. 291 | 292 | `grepl()` searches for a pattern and then returns a logical (true/false) vector of whether or not there is a match.
We can then index `cast` for all rows where the result of `grepl` is not true, eliminating the unwanted line: 293 | 294 | 295 | ```r 296 | cast<-cast[!grepl("Rest of cast listed alphabetically:", cast$Actor),] 297 | ``` 298 | 299 | 300 | As a learning challenge, try doing this same filtering with dplyr (for example with `filter()`) and see which syntax you prefer. 301 | 302 | Next lets get rid of those nasty `\n`'s. To do this, lets use `gsub()`, short for global substitute. Since we don't know how all newlines are delimited, we will search for all types of newlines, `\n` (Unix), `\r\n` (Windows) and `\r` (old web line endings). To do this, we will use the regular expression `[\r\n]`. This allows us to search for `\r`, `\n`, and `\r\n` (that's what the brackets do). Lets turn those all into nothing. 303 | 304 | 305 | ```r 306 | cast$Character<-gsub("[\r\n]","",cast$Character) 307 | ht(cast) 308 | ``` 309 | 310 | ``` 311 | ## Actor 312 | ## 128 Andrew Stehlin 313 | ## 129 Ken Stratton 314 | ## 130 Jo Surgison 315 | ## 131 James Waterhouse-Brown 316 | ## 132 Tim Wong 317 | ## 133 Robert Young 318 | ## Character 319 | ## 128 Uruk-hai (uncredited) 320 | ## 129 Isengard Orc / Last Alliance Soldier / Morgul Orc / Uruk-hai (uncredited) 321 | ## 130 Hobbit (uncredited) 322 | ## 131 Goblin (uncredited) 323 | ## 132 Uruk-hai (uncredited) 324 | ## 133 Goblin (uncredited) 325 | ``` 326 | 327 | Now, we have a ton of whitespace. Lets get rid of that. The regular expression for a single whitespace character is `\s`, and to match one or more of them in a row we use `\s+`. One catch: inside an R string the backslash itself has to be escaped, so we write the pattern as `\\s+`. This is going to match any run of whitespace, however long. Lets turn each of these runs into a single space: 328 | 329 | 330 | ```r 331 | cast$Character<-gsub("\\s+"," ",cast$Character) 332 | ht(cast) 333 | ``` 334 | 335 | ``` 336 | ## Actor 337 | ## 128 Andrew Stehlin 338 | ## 129 Ken Stratton 339 | ## 130 Jo Surgison 340 | ## 131 James Waterhouse-Brown 341 | ## 132 Tim Wong 342 | ## 133 Robert Young 343 | ## Character 344 | ## 128 Uruk-hai (uncredited) 345 | ## 129 Isengard Orc / Last Alliance Soldier / Morgul Orc / Uruk-hai (uncredited) 346 | ## 130 Hobbit (uncredited) 347 | ## 131 Goblin (uncredited) 348 | ## 132 Uruk-hai (uncredited) 349 | ## 133 Goblin (uncredited) 350 | ``` 351 | 352 | Excellent work, we have now turned a once very ugly raw data frame into something we can work with. 353 | 354 | # A challenge: 355 | 356 | Two challenges here: 357 | 358 | * Is there another way we could have cleaned up the `\n` or the `\s`? Try out `library(stringr)` and explore the functions there. 359 | 360 | * Try separating first name from last name (e.g. make a first and a last name column), using whatever means necessary (this is in your homework assignment this week) 361 | 362 | # Note 363 | 364 | To see more examples and play around with the source code for this document, see `R/scraping.R` 365 | --------------------------------------------------------------------------------