# \n\n Peter Jackson ...
40 | table2 <- tables[[2]]
41 | # {html_node}
42 | #
43 | # [1] \n\n\n\n\n\n\n J.R.R. Tolk ...
45 | table3 <- tables[[3]]
46 |
47 | ## @knitr table_clean
48 | cast <- html_table(table3)
49 | ## @knitr pipes
50 |
51 |
52 | # f(x, y) == x %>% f(y) == x %>% f(., y)   # the pipe feeds the left-hand side into the first argument (or wherever "." appears)
53 |
54 |
55 | mtcars %>% filter(cyl == 4) %>% .$mpg
56 |
57 |
58 | cast <- read_html(lotr) %>% html_nodes("table") %>% .[[3]] %>% html_table
59 |
60 | ## @knitr scraper
61 | tablescraper <- function(url, item){
62 | out <- read_html(url) %>% html_nodes("table") %>% .[[item]] %>% html_table
63 | return(out)
64 | }
65 |
66 | ## @knitr search
67 | tablescraper(lotr,1) %>% head
68 | # X1 X2 X3
69 | # 1 Peter Jackson NA NA
70 |
71 | tablescraper(lotr,2) %>% head
72 | # X1 X2 X3
73 | # 1 J.R.R. Tolkien ... (novel)
74 | # 2
75 | # 3 Fran Walsh ... (screenplay) &
76 | # 4 Philippa Boyens ... (screenplay) &
77 | # 5 Peter Jackson ... (screenplay)
78 |
79 | tablescraper(lotr,3) %>% head
80 |
81 | cast <- tablescraper(lotr,3)
82 |
83 | head(cast)
84 | # X1 X2 X3 X4
85 | # 1
86 | # 2 Alan Howard ... Voice of the Ring \n \n \n (voice)
87 | # 3 Noel Appleby ... Everard Proudfoot
88 | # 4 Sean Astin ... Sam
89 | # 5 Sala Baker ... Sauron
90 | # 6 Sean Bean ... Boromir
98 | ## @knitr clean-1
99 | cast$X1 <- NULL
100 | cast$X3 <- NULL
101 |
102 | head(cast)
103 |
104 | ## @knitr rename
105 | cast <- cast %>% rename(Actor = X2, Character = X4)
106 | head(cast)
107 |
108 | animals <- c("cat","dog","mouse","hamster","komodo dragon")
109 |
110 | #grep(pattern, object)
111 |
112 | grep("d", animals)
113 | # [1] 2 5
114 |
115 | grepl("d", animals)
116 | # [1] FALSE TRUE FALSE FALSE TRUE
117 |
118 | animals[grep("d",animals)]
119 | # [1] "dog" "komodo dragon"
120 |
121 | animals[grepl("d", animals)]
122 | # [1] "dog" "komodo dragon"
123 |
124 |
125 | # do it this way
126 | animals[!grepl("d",animals)]
127 | # [1] "cat" "mouse" "hamster"
128 |
129 | animals[-grep("d",animals)]
130 |
131 |
132 |
133 | ## @knitr grepl
134 |
135 | truefalse <- grepl("Rest of cast listed alphabetically:", cast$Actor)  # logical vector: TRUE where the junk row matches
136 |
144 |
145 | grepl("Rest of cast listed alphabetically:", cast$Actor)  # TRUE marks the row we want to drop
146 | # [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
147 | # [19] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
148 | # [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
149 | # [55] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
150 | # [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
151 | # [91] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
152 | # [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
153 | # [127] FALSE FALSE FALSE FALSE FALSE FALSE FALSE
154 | cast <- cast[!grepl("Rest of cast listed alphabetically:", cast$Actor), ]
155 |
156 |
157 |
158 | ## @knitr regex1
159 |
160 | # gsub("pattern","replacement",object)
161 | cast$Character <- gsub("[\r\n]", "", cast$Character)
162 |
163 | head(cast)
164 | ## @knitr regex2
165 | cast$Character <- gsub("\\s+", " ", cast$Character)
166 | cast$Character <- stringr::str_squish(cast$Character)  # str_squish() is from stringr; it trims and collapses whitespace
167 |
168 | head(cast,10)
169 | # Actor Character
170 | # 1
171 | # 2 Alan Howard Voice of the Ring (voice)
172 | # 3 Noel Appleby Everard Proudfoot
173 | # 4 Sean Astin Sam
174 | # 5 Sala Baker Sauron
175 | # 6 Sean Bean Boromir
176 | # 7 Cate Blanchett Galadriel
177 | # 8 Orlando Bloom Legolas
178 | # 9 Billy Boyd Pippin
179 | # 10 Marton Csokas Celeborn
180 |
181 | head(cast)
182 | html_table(html_nodes(pokemon, "table")[[2]])  # assumes a 'pokemon' page object created elsewhere with read_html()
183 | ateam <- read_html("http://www.boxofficemojo.com/movies/?id=ateam.htm")
184 | html_nodes(ateam, "center")
185 |
186 |
187 | data(mtcars)
188 | library(ggplot2)
189 | head(mtcars)
190 | ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = gear)) + geom_bar(stat = "identity") + theme_minimal()
191 |
192 |
193 | cast$isGoblin <- grepl("Goblin", cast$Character)
194 |
195 | cast$isGoblin <- as.numeric(cast$isGoblin)
196 |
197 | numGoblins <- sum(cast$isGoblin)
198 | # [1] 24
199 |
200 |
201 |
202 |
203 | cast
204 |
205 | # if condition do thing else do other
206 | #ifelse(condition, what to do on true, what to do on false)
207 | cast$isGoblin <- ifelse(
208 | cast$isGoblin == TRUE,
209 | "i am a goblin",
210 | "i am not a goblin"
211 | )
212 |
213 | View(cast)
214 |
--------------------------------------------------------------------------------
/R/eda1.R:
--------------------------------------------------------------------------------
1 | library(mlbench)
2 | library(caret)
3 | library(mlr)
4 | library(tidyverse)
5 | library(ggthemes)
6 | library(gplots)
7 | library(randomForest)
8 |
9 |
10 | library(skimr) # skimr is sweet
11 |
12 | data(BostonHousing)
13 | head(BostonHousing)
14 | # crim zn indus chas nox rm age dis rad tax ptratio b
15 | # 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
16 | # 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90
17 | # 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83
18 | # 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63
19 | # 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
20 | # 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12
21 | # lstat medv
22 | # 1 4.98 24.0
23 | # 2 9.14 21.6
24 | # 3 4.03 34.7
25 | # 4 2.94 33.4
26 | # 5 5.33 36.2
27 | # 6 5.21 28.7
28 | length(BostonHousing)
29 | # [1] 14
30 | skim(BostonHousing)
31 | # Skim summary statistics
32 | # n obs: 506
33 | # n variables: 14
34 | #
35 | # ── Variable type:factor ──────────────────────────────────────────────────
36 | # variable missing complete n n_unique top_counts ordered
37 | # chas 0 506 506 2 0: 471, 1: 35, NA: 0 FALSE
38 | #
39 | # ── Variable type:numeric ─────────────────────────────────────────────────
40 | # variable missing complete n mean sd p0 p25 p50
41 | # age 0 506 506 68.57 28.15 2.9 45.02 77.5
42 | # b 0 506 506 356.67 91.29 0.32 375.38 391.44
43 | # crim 0 506 506 3.61 8.6 0.0063 0.082 0.26
44 | # dis 0 506 506 3.8 2.11 1.13 2.1 3.21
45 | # indus 0 506 506 11.14 6.86 0.46 5.19 9.69
46 | # lstat 0 506 506 12.65 7.14 1.73 6.95 11.36
47 | # medv 0 506 506 22.53 9.2 5 17.02 21.2
48 | # nox 0 506 506 0.55 0.12 0.39 0.45 0.54
49 | # ptratio 0 506 506 18.46 2.16 12.6 17.4 19.05
50 | # rad 0 506 506 9.55 8.71 1 4 5
51 | # rm 0 506 506 6.28 0.7 3.56 5.89 6.21
52 | # tax 0 506 506 408.24 168.54 187 279 330
53 | # zn 0 506 506 11.36 23.32 0 0 0
54 | # p75 p100 hist
55 | # 94.07 100 ▁▂▂▂▂▂▃▇
56 | # 396.23 396.9 ▁▁▁▁▁▁▁▇
57 | # 3.68 88.98 ▇▁▁▁▁▁▁▁
58 | # 5.19 12.13 ▇▅▃▃▂▁▁▁
59 | # 18.1 27.74 ▃▆▅▁▁▇▁▁
60 | # 16.96 37.97 ▆▇▆▅▂▁▁▁
61 | # 25 50 ▂▅▇▆▂▂▁▁
62 | # 0.62 0.87 ▇▆▇▆▃▅▁▁
63 | # 20.2 22 ▁▂▂▂▅▅▇▃
64 | # 24 24 ▂▇▁▁▁▁▁▅
65 | # 6.62 8.78 ▁▁▂▇▇▂▁▁
66 | # 666 711 ▃▇▂▅▁▁▁▆
67 | # 12.5 100 ▇▁▁▁▁▁▁▁
68 |
69 |
70 | # lapply(df, sd)
71 |
72 | # step 0
73 | # get rid of zero variance variables (ones that only have one value)
74 | # check and make sure categorical variables are stored as factors
75 | # use common sense!!!!
76 |
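# illustrative sketch (not from the original notes): caret::nearZeroVar() flags
# zero- and near-zero-variance columns, which can then be dropped in one step
nzv <- nearZeroVar(BostonHousing)
bh_trimmed <- if (length(nzv) > 0) BostonHousing[, -nzv] else BostonHousing
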
77 | library(corrplot)
78 | library(tidyverse)
79 |
80 | # library(purrr)
81 |
82 | # cor function: Calculate correlation between columns of a df or matrix
83 | # conditions:
84 | # can't handle non-numeric columns
85 | # can't handle NAs
86 |
87 |
88 | bh <- BostonHousing
89 |
90 | bh2 <- bh
91 | bh2$notNum <- "cat"
92 | bh2 %>% keep(is.numeric) %>% head
93 | # opposite of keep: discard
94 |
95 | #purrr
96 | # keep(condition)
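# illustrative aside (not in the original notes): discard() is the complement of keep()
bh2 %>% discard(is.numeric) %>% head
# with this toy data frame, that leaves only the non-numeric column `notNum`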
97 |
98 | library(corrplot)
99 | corrplot
100 |
101 |
102 | bh %>% keep(is.numeric) %>% na.omit %>% cor %>% corrplot(type = "upper", addCoef.col = "white", number.digits = 2,
103 | number.cex = 0.5, method="square",
104 | order="hclust", title="Variable Corr Heatmap",
105 | tl.srt=45, tl.cex = 0.8)
106 |
107 |
108 | # function to do this all in one go
109 | correlator <- function(df){
110 | df %>%
111 | keep(is.numeric) %>%
112 | tidyr::drop_na() %>%
113 | cor %>%
114 |     corrplot(type = "upper", addCoef.col = "white", number.digits = 2,
115 | number.cex = 0.5, method="square",
116 | order="hclust", title="Variable Corr Heatmap",
117 | tl.srt=45, tl.cex = 0.8)
118 | }
119 |
120 |
121 | # correlation analysis
122 | # usage: this is step 1! Batch elimination of numeric variables
123 | # this can narrow things down a lot
124 | # do not forget to use human logic
125 |
126 |
127 | # key plots
128 |
129 | # x : y
130 | # numeric : categorical
131 |
132 | data(mtcars)
133 |
134 | head(mtcars)
135 | # mpg cyl disp hp drat wt qsec vs am gear carb
136 | # Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
137 | # Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
138 | # Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
139 | # Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
140 | # Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
141 | # Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
142 |
143 | mtcars$cyl <- as.factor(mtcars$cyl)
144 | mtcars$vs <- as.factor(mtcars$vs)
145 | mtcars$am <- as.factor(mtcars$am)
146 | mtcars$gear <- as.factor(mtcars$gear)
147 | mtcars$carb <- as.factor(mtcars$carb)
148 |
149 | skim(mtcars)
150 |
151 |
152 | # x : y
153 | # numeric : categorical
154 |
155 | mtcars$rvar <- rnorm(nrow(mtcars))
156 |
157 | length(unique(mtcars$vs))
158 | # [1] 2
159 |
160 | ggplot(data = mtcars) + geom_density(aes_string(x = "mpg", fill = "am"), alpha = 0.5)
161 |
162 | # automated EDA!!!!!!!!!!!!
163 | # step 1, save target variable name
164 | target <- "am"
165 | # step 2, save explanatory variable names
166 | numvars <- mtcars %>% keep(is.numeric) %>% colnames
167 | # [1] "mpg" "disp" "hp" "drat" "wt" "qsec" "rvar"
168 |
169 |
170 | numplot <- function(df, explan, resp) {
171 | ggplot(data = df) + geom_density(aes_string(x = explan, fill = resp), alpha = 0.5)
172 | }
173 |
174 | numplot(mtcars, explan = "mpg", resp = "am")
175 |
176 | plotlist <- lapply(numvars, function(x) numplot(mtcars, x, "am"))
177 | library(cowplot)
178 | plot_grid(plotlist = plotlist)
179 |
180 |
181 | png()
182 | lapply(numvars, function(x) numplot(mtcars, x, "am"))
183 | dev.off()
184 |
185 |
186 |
187 | # categorical vs categorical
188 | str(mtcars)
189 | # 'data.frame': 32 obs. of 12 variables:
190 | # $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
191 | # $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
192 | # $ disp: num 160 160 108 258 360 ...
193 | # $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
194 | # $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
195 | # $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
196 | # $ qsec: num 16.5 17 18.6 19.4 17 ...
197 | # $ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
198 | # $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
199 | # $ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
200 | # $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...
201 | # $ rvar: num 0.584 -0.573 0.582 -0.221 0.409 ...
202 | # NULL
203 |
204 |
205 | ggplot(data = mtcars) + geom_bar(aes(x = cyl, fill = am), position = "fill", alpha = 0.9) + coord_flip()
206 |
207 |
208 | ones <- rep(1, nrow(mtcars))
209 | zeroes <- rep(0, nrow(mtcars))
210 | onezeroes <- c(ones, zeroes)
211 |
212 | mtcars$rcat <- sample(onezeroes, nrow(mtcars))
213 |
214 |
215 | ggplot(data = mtcars) + geom_bar(aes(x = rcat, fill = am), position = "fill", alpha = 0.9) + coord_flip()
216 |
217 | # step 1: Name target variable:
218 |
219 | target <- "am"
220 |
221 | # step 2: name explanatory vars
222 |
223 | expls <- mtcars %>% keep(is.factor) %>% colnames
224 |
225 |
226 | catplot <- function(df, x,y){
227 | ggplot(data = df, aes_string(x = x, fill = y)) +
228 | geom_bar(position = "fill", alpha = 0.9) +
229 | coord_flip()
230 | }
231 |
232 |
233 | plotlist2 <- lapply(expls, function(x) catplot(mtcars, x, target))
234 | plot_grid(plotlist = plotlist2)
235 |
--------------------------------------------------------------------------------
/pres/html-scraping.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "HTML Scraping in R"
3 | author: "David Josephs"
4 | output: html_document
5 | ---
6 |
7 | ```{r setup, include = F}
8 | knitr::read_chunk('../R/scraping.R')
9 | library(knitr)
10 | library(kableExtra)
11 | library(magrittr)
12 | library(pander)
13 | knitr::opts_chunk$set(cache = T, autodep = T)
14 | ```
15 | # Lord of the Rings Example
16 |
17 | ## Setup
18 |
19 | First, let's load two libraries that will make our lives easier. The first is rvest, a great library for reading HTML; it is essentially an extension of the xml2 package, with easy syntax that will be quite helpful going forward.
20 |
21 | The second is, in my opinion, one of the most useful libraries for doing any sort of data science or data analysis in R: the tidyverse. Just google it and see the documentation; it is a set of packages whose functions share similar APIs and arguments, allowing for consistency throughout our programming. They are also all pretty fast, with nice syntax. Examples from the tidyverse are readr (data loading), dplyr (data analysis/cleaning/general utility), tidyr (more data cleaning and reshaping), purrr (functional programming helpers), and ggplot2 (data viz).
22 |
23 | ```{r, message=F}
24 | <>
25 | ```
26 |
27 | Next, let's load our data. In this example we will be looking at the IMDb page for The Lord of the Rings, so we will assign the URL of the page we are interested in to a variable:
28 |
29 | ```{r dataload}
30 | <>
31 | ```
32 |
33 | ## Reading the data
34 |
35 | ### The pipe operator
36 |
37 | Before we can read in the data, let's first learn about `%>%` pipes. A pipe basically says: take the thing on the left and make it the first argument of the function on the right. For example, say we want to take the mean of every column of mtcars, the classic R example dataset. We can do that with:
38 |
39 | ```{r}
40 | lapply(mtcars,mean)
41 | ```
42 |
43 | This is mapping the mean function over the mtcars dataset. Now, the output of this is not very pretty, so we will turn it into a nice, horizontal data frame:
44 |
45 | ```{r}
46 | as.data.frame(lapply(mtcars,mean))
47 | ```
48 |
49 | Still ugly. Let's try using the pander library to make this look nice:
50 |
51 | ```{r}
52 | pander(as.data.frame(lapply(mtcars,mean)))
53 | ```
54 |
55 | Much better, but look at how many parentheses we wrote and how difficult this is to read. Imagine if we had four or five more steps: we would have to repeatedly assign intermediate variables, cluttering our computer's memory just to keep the code readable. Even then, if we assigned a variable at every step, someone reviewing the code would need to understand the twenty or so lines above just to follow the final printing line. This invites errors and hurts reproducibility. Instead, let's try it with the pipe operator. Mathematically, `f(x,y) = x %>% f(y)`, if that helps:
56 |
57 | ```{r}
58 | mtcars %>% lapply(mean) %>% as.data.frame %>% pander
59 | ```
60 |
61 | This reads from left to right (as we English speakers are in the habit of doing):
62 | First, we take the mtcars dataset. Then we apply the mean function to every column, which produces a list. We then turn that hard-to-read list into a nice flat data frame, and finally we pretty up the data frame in one last step. This is the pipe operator.
63 |
64 | ### Actually reading in the data
65 |
66 | So, with our knowledge of the pipe operator, what can we do? Let's use rvest functions to turn the raw XML/HTML data into something nice and human-readable.
67 |
68 | First, let's read in the website:
69 |
70 | ```{r, eval = F}
71 | # not run
72 | read_html(lotr)
73 | ```
74 |
75 | Next, let's select all the tables in the raw data (we know all of our data is in tables) with the `html_nodes()` function:
76 |
77 | ```{r, eval = F}
78 | read_html(lotr) %>% html_nodes("table")
79 | ```
80 |
81 | Next, let's choose the right table. By looking at the website, we know that the third table contains the info on the cast. To choose the third table of an unnamed object, we use the `.` placeholder, which simply stands for the thing on the left of the pipe.
82 |
83 |
84 | ```{r, eval = F}
85 | read_html(lotr) %>% html_nodes("table") %>% .[[3]]
86 | ```
87 |
88 | #### An Aside on lists
89 | Why did we use `[[ ]]`?
90 | Because `html_nodes()` outputs a list, and there are three ways to get items out of a list. `$` works for named items and returns the element itself, keeping its type. `[ ]` indexes the list, but the output is always another list wrapping the element. Finally, `[[ ]]` indexes the list and extracts the element itself, with its proper data type. Experiment with this using the following list as well as the built-in `typeof()` function.
91 |
92 | ```{r}
93 | <>
94 | ```
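
In case it helps, here is a tiny sketch of the same idea on a toy list (illustrative only; `lst` is made up here and is not the object from `scraping.R`):

```{r}
lst <- list(a = 1:3, b = "hello")
typeof(lst["a"])    # "list"    -- single brackets keep the list wrapper
typeof(lst[["a"]])  # "integer" -- double brackets extract the element itself
typeof(lst$a)       # "integer" -- $ behaves like [[ for named items
```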
95 |
96 | ### Back to Business
97 |
98 | Now that we understand what `.[[3]]` is doing, we can extract the full dataset:
99 |
100 | ```{r}
101 | <>
102 | (head(cast))
103 | ```
104 |
105 | Great. That process was pretty painful and took a lot of typing, and in the future we may not know which table we are looking for, so let's write a nice little function to do it all in one step:
106 |
107 |
108 | ```{r}
109 | <>
110 | ```
111 |
112 | Now that we have a nice function, we can iteratively search through the tables on the IMDb page:
113 |
114 | ```{r}
115 | <>
116 | ```
117 |
118 | We can even imagine, for a large project, just writing a for loop to do all of this, as sketched below.
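
A quick sketch of that idea (illustrative only, not part of the original script; it just reuses the `tablescraper()` function and the `lotr` URL from above):

```{r, eval = FALSE}
# illustrative sketch: loop over table indices and collect the results in a list
tables <- list()
for (i in 1:3) {
  tables[[i]] <- tablescraper(lotr, i)
}
```
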
119 | Next, let's check out the first and last ten rows of `cast`:
120 |
121 | ```{r}
122 | ht <- function(x, ...){
123 |   print(head(x, ...))  # print explicitly, otherwise only the last value (the tail) is shown
124 |   tail(x, ...)
125 | }
126 | ht(cast,10)
127 | ```
128 |
129 | ***NOTE***: the `...` in our function passes along extra arguments. We do this so we can throw in the extra parameter `10`, which changes `head()` and `tail()` to show the first and last 10 rows instead of the default 6.
130 |
131 | ## Cleaning the data
132 | Wow, this data is a mess. The first thing we see is that the first row is entirely blank, and that the first and third columns are completely empty. Let's drop those two empty columns:
133 |
134 | ```{r}
135 | <>
136 | ht(cast)
137 | ```
138 |
139 | Next, let's rename the columns with dplyr:
140 |
141 | ```{r}
142 | <>
143 | ```
144 |
145 | Looking better. We also know from the IMDb page that the table contains the line "Rest of cast listed alphabetically:", so let's get rid of that. To do this, we are going to use `grepl()`.
146 |
147 | `grepl()` searches for a pattern and returns a logical (TRUE/FALSE) vector indicating whether there is a match. We can then keep only the rows of `cast` where the result of `grepl()` is not TRUE, eliminating the unwanted line:
148 |
149 | ```{r}
150 | <>
151 | ```
152 |
153 |
154 | As a learning challenge, try doing the same filtering with dplyr and compare it with the base R version above, then see which one you prefer.
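
For reference, a dplyr version might look like this (a sketch, not part of the original chunks):

```{r, eval = FALSE}
# dplyr equivalent of the base R row filter above
cast <- cast %>% filter(!grepl("Rest of cast listed alphabetically:", Actor))
```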
155 |
156 | Next, let's get rid of those nasty `\n`'s. To do this we will use `gsub()`, short for global substitute. Since we don't know exactly how the newlines are delimited, we will search for all types of line endings: `\n` (Unix), `\r\n` (Windows), and `\r` (old Mac-style). To do this we will use the regular expression `[\r\n]`, which matches either a `\r` or an `\n` character (that's what the brackets do), and therefore handles `\r\n` as well. Let's turn them all into nothing.
157 |
158 | ```{r}
159 | <>
160 | ht(cast)
161 | ```
162 |
163 | Now we have a ton of whitespace. Let's get rid of that. The regular expression for a single whitespace character is `\s`, and `\s+` matches one or more whitespace characters in a row. Because backslashes have to be escaped inside an R string, we write the pattern as `"\\s+"`. This will match any run of whitespace; let's replace each run with a single space:
164 |
165 | ```{r}
166 | <>
167 | ht(cast)
168 | ```
169 |
170 | Excellent work: we have now turned a once very ugly raw data frame into something we can work with.
171 |
172 | # A challenge:
173 |
174 | Two challenges here:
175 |
176 | * Is there another way we could have cleaned up the `\n` or the `\s`? Try out `library(stringr)` and explore the functions there.
177 |
178 | * Try separating first name from last name (e.g., make a first-name and a last-name column), using whatever means necessary (this is in your homework assignment this week).
179 |
180 | # Note
181 |
182 | To see more examples and play around with the source code for this document, see `R/scraping.R`
183 |
--------------------------------------------------------------------------------
/R/analysis1.R:
--------------------------------------------------------------------------------
1 | library(tswgewrapped)
2 | library(ggthemes)
3 | library(ggplot2)
4 | library(cowplot)
5 | source("../R/preprocessing.R", echo = TRUE)
6 | source("../R/helpers.R", echo = TRUE)
7 |
8 |
9 | # data import
10 | # imports the data as a hash table
11 | fine_china <- preprocess("../data/")
12 | names(fine_china)
13 | # [1] "ChengduPM_" "ShenyangPM_" "ShanghaiPM_" "BeijingPM_" "GuangzhouPM_"
14 | #
15 |
16 |
17 | # shanghai
18 | shang_US <- fine_china$ShanghaiPM_$PM_US
19 |
20 | usShang <- resample(shang_US)
21 | plotts.sample.wge(usShang$day)
22 | plotts.sample.wge(usShang$week)
23 | plotts.sample.wge(usShang$month)
24 | plotts.sample.wge(usShang$sea)
25 | decompose(usShang$day, "multiplicative") %>>% autoplot+ theme_economist()
26 | decompose(usShang$day, "additive") %>>% autoplot+ theme_economist()
27 | decompose(usShang$week, "multiplicative") %>>% autoplot+ theme_economist()
28 | decompose(usShang$week, "additive") %>>% autoplot+ theme_economist()
29 | decompose(usShang$month, "multiplicative") %>>% autoplot+ theme_economist()
30 | decompose(usShang$month, "additive") %>>% autoplot + theme_economist()
31 | decompose(usShang$sea, "multiplicative") %>>% autoplot + theme_economist()
32 | decompose(usShang$sea, "additive") %>>% autoplot+ theme_economist()
33 | usShang$week %>>% lagplot+ theme_economist()
34 | usShang$day %>>% seasonplot + theme_economist()
35 | usShang$day %>>% seasonplot(polar = TRUE) + theme_economist()
36 | usShang$week %>>% seasonplot+ theme_economist()
37 | usShang$week %>>% seasonplot(polar = T)+ theme_economist()
38 | usShang$month %>>% seasonplot+ theme_economist()
39 | usShang$month %>>% seasonplot(polar = T)+ theme_economist()
40 | usShang$seas %>>% seasonplot(polar = T) + theme_economist()
41 | usShang$seas %>>% seasonplot + theme_economist()
42 |
43 |
44 |
45 | # next lets look at ses and holt models
46 |
47 | library(fpp2)
48 |
49 | sesd <- ses(usShang$day)
50 | sesw <- ses(usShang$week)
51 | sesm <- ses(usShang$month)
52 | par(mfrow = c(1,3))
53 | lapply(list(sesd,sesw,sesm), plot)
54 |
55 |
56 | accuracy(fitted(sesd))
57 |
58 | accuracy(fitted(sesw))
59 |
60 | accuracy(fitted(sesm))
61 |
62 |
63 | ## Problem: implement the above for holt
64 |
65 |
66 |
67 | # below is extra, see analysis2 for more complex fun
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 | # We see with the weekly plot we have a lot of seasonality
77 |
78 |
79 | # lets just for fun do some predictions with the daily data
80 |
81 | shang <- usShang$day
82 | plotts.sample.wge(shang)
83 | # we have clear seasonality, and maybe a wandering behavior. I believe we have a biannual seasonality, based on the monthly graph
84 | shang %>>% ( difference(seasonal,., (365)) ) -> shang2
85 | difference(arima, shang2, 1) -> shang3
86 | aics <- shang3 %>>% aicbic(p=0:10)
87 | pander(aics)
88 | #
89 | #
90 | # *
91 | #
92 | # ------------------------
93 | # p q aic
94 | # -------- --- --- -------
95 | # **20** 3 1 13.51
96 | #
97 | # **6** 0 5 13.52
98 | #
99 | # **4** 0 3 13.52
100 | #
101 | # **10** 1 3 13.52
102 | #
103 | # **3** 0 2 13.55
104 | # ------------------------
105 | #
106 | # *
107 | #
108 | # ------------------------
109 | # p q bic
110 | # -------- --- --- -------
111 | # **20** 3 1 13.54
112 | #
113 | # **4** 0 3 13.55
114 | #
115 | # **6** 0 5 13.55
116 | #
117 | # **10** 1 3 13.56
118 | #
119 | # **3** 0 2 13.57
120 | # ------------------------
121 | #
122 | #
123 | #
124 | #
125 | #
126 | # NULL
127 | aicss <- shang %>>% ( difference(seasonal,., 365) ) %>>% aicbic(p=0:10)
128 | pander(aicss)
129 | #
130 | #
131 | # *
132 | #
133 | # -----------------------
134 | # p q aic
135 | # -------- --- --- ------
136 | # **20** 3 1 13.5
137 | #
138 | # **11** 1 4 13.5
139 | #
140 | # **26** 4 1 13.5
141 | #
142 | # **16** 2 3 13.5
143 | #
144 | # **24** 3 5 13.5
145 | # -----------------------
146 | #
147 | # *
148 | #
149 | # ------------------------
150 | # p q bic
151 | # -------- --- --- -------
152 | # **13** 2 0 13.53
153 | #
154 | # **3** 0 2 13.53
155 | #
156 | # **8** 1 1 13.53
157 | #
158 | # **7** 1 0 13.53
159 | #
160 | # **20** 3 1 13.53
161 | # ------------------------
162 | #
163 | #
164 | #
165 | #
166 | #
167 | # NULL
168 | par(mfrow = c(1,1))
169 | est_shang <- estimate(shang2, p=2, q = 0)
170 | acf(est_shang$res)
171 | ljung_box(est_shang$res, p =2, q =0)
172 | shang_seasonal <- fore_and_assess(type = aruma,
173 | x = shang,
174 | s = 365,
175 | phi = est_shang$phi,
176 | n.ahead = 24,
177 | limits = F
178 | )
179 |
180 | est_shang2 <- estimate(shang3, p = 3, q = 1)
181 | acf(est_shang2$res)
182 | ljung_box(est_shang2$res, 3, 1)
183 | # [,1] [,2]
184 | # test "Ljung-Box test" "Ljung-Box test"
185 | # K 24 48
186 | # chi.square 14.14806 35.92178
187 | # df 20 44
188 | # pval 0.8229101 0.8017901
189 |
190 | shang_aruma <- fore_and_assess(type = aruma,
191 | x = shang,
192 | s = 365,
193 | d = 1,
194 | phi = est_shang2$phi,
195 | theta = est_shang2$theta,
196 | n.ahead = 24,
197 | limits = F
198 | )
199 |
200 | shang_seasonal$ASE
201 | # [1] 1154198
202 | shang_aruma$ASE
203 | # [1] 1154911
204 | test <- window(shang_US, start = 7)[1:24]
205 | ase(test, shang_aruma)
206 | # [1] 1977888
207 | ase(test, shang_seasonal)
208 | # [1] 3278672
209 |
210 | forecast(aruma, shang, s = 365, d = 1, phi = est_shang2$phi,theta = est_shang2$theta, n.ahead=500)
211 | forecast(aruma, shang, s = 365, phi = est_shang$phi, n.ahead=500)
212 |
213 | # ok looking damn good with the shang aruma
214 |
215 |
216 | # Beijing
217 |
218 | bj_US <- fine_china$BeijingPM_$PM_US
219 | usBJ <- resample(bj_US)
220 | plotts.sample.wge(usBJ$day)
221 | plotts.sample.wge(usBJ$week)
222 | plotts.sample.wge(usBJ$month)
223 | plotts.sample.wge(usBJ$sea)
224 | decompose(usBJ$day, "multiplicative") %>>% autoplot+ theme_economist()
225 | decompose(usBJ$day, "additive") %>>% autoplot+ theme_economist()
226 | decompose(usBJ$week, "multiplicative") %>>% autoplot+ theme_economist()
227 | decompose(usBJ$week, "additive") %>>% autoplot+ theme_economist()
228 | decompose(usBJ$month, "multiplicative") %>>% autoplot+ theme_economist()
229 | decompose(usBJ$month, "additive") %>>% autoplot + theme_economist()
230 | decompose(usBJ$sea, "multiplicative") %>>% autoplot + theme_economist()
231 | decompose(usBJ$sea, "additive") %>>% autoplot+ theme_economist()
232 | usBJ$week %>>% lagplot+ theme_economist()
233 | usBJ$day %>>% seasonplot + theme_economist()
234 | usBJ$day %>>% seasonplot(polar = TRUE) + theme_economist()
235 | usBJ$week %>>% seasonplot+ theme_economist()
236 | usBJ$week %>>% seasonplot(polar = T)+ theme_economist()
237 | usBJ$month %>>% seasonplot+ theme_economist()
238 | usBJ$month %>>% seasonplot(polar = T)+ theme_economist()
239 | usBJ$seas %>>% seasonplot(polar = T) + theme_economist()
240 | usBJ$seas %>>% seasonplot + theme_economist()
241 | bj <- usBJ$day
242 |
243 | bj %>>% (difference(seasonal,.,365)) -> bjtr
244 | aicbj <- bj %>>% (difference(seasonal,.,365)) %>>%
245 | aicbic(p = 0:10)
246 | pander(aicbj)
247 | #
248 | #
249 | # *
250 | #
251 | # ------------------------
252 | # p q aic
253 | # -------- --- --- -------
254 | # **41** 6 4 15.31
255 | #
256 | # **53** 8 4 15.31
257 | #
258 | # **59** 9 4 15.31
259 | #
260 | # **47** 7 4 15.31
261 | #
262 | # **60** 9 5 15.31
263 | # ------------------------
264 | #
265 | # *
266 | #
267 | # ------------------------
268 | # p q bic
269 | # -------- --- --- -------
270 | # **13** 2 0 15.35
271 | #
272 | # **3** 0 2 15.35
273 | #
274 | # **8** 1 1 15.35
275 | #
276 | # **14** 2 1 15.35
277 | #
278 | # **19** 3 0 15.35
279 | # ------------------------
280 | #
281 | #
282 | #
283 | #
284 | #
285 | # NULL
286 |
287 | est_bj <- estimate(bjtr, 6,4)
288 | acf(est_bj$res)
289 | ljung_box(est_bj$res,6,4)
290 | # [,1] [,2]
291 | # test "Ljung-Box test" "Ljung-Box test"
292 | # K 24 48
293 | # chi.square 28.84052 45.85887
294 | # df 14 38
295 | # pval 0.01098179 0.1784661
296 |
297 | bj_seas <- fore_and_assess(type = aruma,
298 | x = bj,
299 | s = 365,
300 | phi = est_bj$phi,
301 | theta = est_bj$theta,
302 | n.ahead = 24,
303 | limits = F
304 | )
305 | test <- window(bj_US, start = 7)[1:24]
306 | ase(test, bj_seas)
307 |
--------------------------------------------------------------------------------
/R/nb2.R:
--------------------------------------------------------------------------------
1 | library(caret)
2 |
3 | data(BreastCancer, package='mlbench')
4 |
5 |
6 | library(skimr)
7 |
8 |
9 | skim(BreastCancer)
10 | # ── Data Summary ────────────────────────
11 | # Values
12 | # Name BreastCancer
13 | # Number of rows 699
14 | # Number of columns 11
15 | # _______________________
16 | # Column type frequency:
17 | # character 1
18 | # factor 10
19 | # ________________________
20 | # Group variables None
21 | #
22 | # ── Variable type: character ──────────────────────────────────────────────
23 | # skim_variable n_missing complete_rate min max empty n_unique
24 | # 1 Id 0 1 5 8 0 645
25 | # whitespace
26 | # 1 0
27 | #
28 | # ── Variable type: factor ─────────────────────────────────────────────────
29 | # skim_variable n_missing complete_rate ordered n_unique
30 | # 1 Cl.thickness 0 1 TRUE 10
31 | # 2 Cell.size 0 1 TRUE 10
32 | # 3 Cell.shape 0 1 TRUE 10
33 | # 4 Marg.adhesion 0 1 TRUE 10
34 | # 5 Epith.c.size 0 1 TRUE 10
35 | # 6 Bare.nuclei 16 0.977 FALSE 10
36 | # 7 Bl.cromatin 0 1 FALSE 10
37 | # 8 Normal.nucleoli 0 1 FALSE 10
38 | # 9 Mitoses 0 1 FALSE 9
39 | # 10 Class 0 1 FALSE 2
40 | # top_counts
41 | # 1 1: 145, 5: 130, 3: 108, 4: 80
42 | # 2 1: 384, 10: 67, 3: 52, 2: 45
43 | # 3 1: 353, 2: 59, 10: 58, 3: 56
44 | # 4 1: 407, 2: 58, 3: 58, 10: 55
45 | # 5 2: 386, 3: 72, 4: 48, 1: 47
46 | # 6 1: 402, 10: 132, 2: 30, 5: 30
47 | # 7 2: 166, 3: 165, 1: 152, 7: 73
48 | # 8 1: 443, 10: 61, 3: 44, 2: 36
49 | # 9 1: 579, 2: 35, 3: 33, 10: 14
50 | # 10 ben: 458, mal: 241
51 |
52 | bc <- BreastCancer
53 | bc$Id <- NULL  # the column is "Id"; R is case-sensitive, so bc$ID would do nothing
54 |
55 | bc
56 |
57 | library(tidyverse)
58 | ggplot(data = bc) + geom_bar(aes_string(x = "Cell.size", fill = "Class"), position = "fill", alpha = 0.9) + coord_flip()
59 |
60 |
61 | catplot <- function(df, x,y){
62 | ggplot(data = df, aes_string(x = x, fill = y)) +
63 | geom_bar(position = "fill", alpha = 0.9) +
64 | coord_flip()
65 | }
66 |
67 |
68 | bc$Id <- NULL
69 | rev(names(bc))
70 | # [1] "Class" "Mitoses" "Normal.nucleoli"
71 | # [4] "Bl.cromatin" "Bare.nuclei" "Epith.c.size"
72 | # [7] "Marg.adhesion" "Cell.shape" "Cell.size"
73 | # [10] "Cl.thickness"
74 | # [1] "Cl.thickness" "Cell.size" "Cell.shape"
75 | # [4] "Marg.adhesion" "Epith.c.size" "Bare.nuclei"
76 | # [7] "Bl.cromatin" "Normal.nucleoli" "Mitoses"
77 | # [10] "Class"
78 |
79 | explanatory <- rev(names(bc))[2:length(names(bc))]
80 | # [1] "Mitoses" "Normal.nucleoli" "Bl.cromatin"
81 | # [4] "Bare.nuclei" "Epith.c.size" "Marg.adhesion"
82 | # [7] "Cell.shape" "Cell.size" "Cl.thickness"
83 | target <- "Class"
84 |
85 | plotlist <- lapply(explanatory, function(x) catplot(bc, x, target))
86 |
87 | library(cowplot)
88 | plot_grid(plotlist = plotlist)
89 |
90 | trainIDS <- createDataPartition(bc$Class, list=FALSE, p = 0.7)
91 |
92 | trainIDS
93 |
94 | training <- bc[trainIDS,]
95 | test <- bc[-trainIDS, ]
96 |
97 | table(bc$Class) / sum(table(bc$Class))
98 | #
99 | # benign malignant
100 | # 0.6552217 0.3447783
101 |
102 | table(training$Class) / sum(table(training$Class))
103 | #
104 | # benign malignant
105 | # 0.655102 0.344898
106 | table(test$Class) / sum(table(test$Class))
107 | #
108 | # benign malignant
109 | # 0.6555024 0.3444976
110 |
111 | nrow(test) / nrow(bc)
112 |
113 | trainMethod <- trainControl(method = "cv", number = 3)
114 | # 3-fold cross-validation: each fold takes a turn as the held-out set
115 |
116 | nb_fit <- train(Class ~ ., data = na.omit(training),
117 | trControl = trainMethod,
118 | method = "nb",
119 | tuneLength = 10
120 | )
121 | nb_fit
122 |
123 |
124 | preds <- predict(nb_fit, newdata=test)
125 |
126 | preds
127 |
128 | test <- na.omit(test)
129 |
130 | length(preds)
131 | length(test$Class)
132 |
133 | # Naive Bayes
134 | #
135 | # 478 samples
136 | # 9 predictor
137 | # 2 classes: 'benign', 'malignant'
138 | #
139 | # No pre-processing
140 | # Resampling: Cross-Validated (3 fold)
141 | # Summary of sample sizes: 319, 318, 319
142 | # Resampling results across tuning parameters:
143 | #
144 | # usekernel Accuracy Kappa
145 | # FALSE NaN NaN
146 | # TRUE 0.9602463 0.9142473
147 | #
148 | # Tuning parameter 'fL' was held constant at a value of 0
149 | # Tuning
150 | # parameter 'adjust' was held constant at a value of 1
151 | # Accuracy was used to select the optimal model using the largest value.
152 | # The final values used for the model were fL = 0, usekernel = TRUE
153 | # and adjust = 1.
154 | # Random Forest
155 | #
156 | # 478 samples
157 | # 9 predictor
158 | # 2 classes: 'benign', 'malignant'
159 | #
160 | # No pre-processing
161 | # Resampling: Cross-Validated (3 fold)
162 | # Summary of sample sizes: 319, 318, 319
163 | # Resampling results across tuning parameters:
164 | #
165 | # mtry Accuracy Kappa
166 | # 2 0.9581892 0.9083623
167 | # 10 0.9540225 0.8989660
168 | # 19 0.9519392 0.8943769
169 | # 28 0.9498428 0.8894839
170 | # 36 0.9540225 0.8988541
171 | # 45 0.9561059 0.9036828
172 | # 54 0.9581892 0.9084716
173 | # 62 0.9623690 0.9178711
174 | # 71 0.9623690 0.9178711
175 | # 80 0.9602725 0.9133694
176 | #
177 | # Accuracy was used to select the optimal model using the largest value.
178 | # The final value used for the model was mtry = 62.
179 | # k-Nearest Neighbors
180 | #
181 | # 478 samples
182 | # 9 predictor
183 | # 2 classes: 'benign', 'malignant'
184 | #
185 | # No pre-processing
186 | # Resampling: Cross-Validated (3 fold)
187 | # Summary of sample sizes: 319, 318, 319
188 | # Resampling results across tuning parameters:
189 | #
190 | # k Accuracy Kappa
191 | # 5 0.9330713 0.8478567
192 | # 7 0.9226022 0.8231873
193 | # 9 0.9142296 0.8029820
194 | # 11 0.9058569 0.7824233
195 | # 13 0.8953878 0.7565026
196 | # 15 0.8932914 0.7513188
197 | # 17 0.8870152 0.7354032
198 | # 19 0.8828223 0.7240868
199 | # 21 0.8786426 0.7124809
200 | # 23 0.8765592 0.7063262
201 | #
202 |
203 | plot(nb_fit)
204 |
205 | # Accuracy was used to select the optimal model using the largest value.
206 | # The final value used for the model was k = 5.
207 | # Naive Bayes
208 | #
209 | # 478 samples
210 | # 9 predictor
211 | # 2 classes: 'benign', 'malignant'
212 | #
213 | # No pre-processing
214 | # Resampling: Cross-Validated (3 fold)
215 | # Summary of sample sizes: 318, 319, 319
216 | # Resampling results across tuning parameters:
217 | #
218 | # usekernel Accuracy Kappa
219 | # FALSE NaN NaN
220 | # TRUE 0.9644392 0.9225881
221 | #
222 | # Tuning parameter 'fL' was held constant at a value of 0
223 | # Tuning
224 | # parameter 'adjust' was held constant at a value of 1
225 | # Accuracy was used to select the optimal model using the largest value.
226 | # The final values used for the model were fL = 0, usekernel = TRUE
227 | # and adjust = 1.
228 |
229 |
230 | conmat <- function(predicted, expected){
231 | cm <- as.matrix(table(Actual = as.factor(expected), Predicted = predicted))
232 | cm
233 | }
234 |
235 | cm <- table(preds, test$Class)
236 | #
237 | # preds benign malignant
238 | # benign 128 1
239 | # malignant 6 70
240 |
241 |
242 | accuracy <- sum(diag(cm)) / sum(as.matrix(cm))
243 | # [1] 0.9704433
244 |
245 | precision <- diag(cm)[2] / rowSums(cm)[2]
246 | # malignant
247 | # 0.9577465
248 | # malignant
249 | # 0.9210526
250 |
251 | recall <- diag(cm)[2] / colSums(cm)[2]
252 | # malignant
253 | # 0.9577465
254 |
255 | specificity <- diag(cm)[1] / colSums(cm)[1]
256 | # benign
257 | # 0.9772727
258 |
259 | confusionMatrix(cm, positive='malignant')
260 | # Confusion Matrix and Statistics
261 | #
262 | #
263 | # preds benign malignant
264 | # benign 129 3
265 | # malignant 3 68
266 | #
267 | # Accuracy : 0.9704
268 | # 95% CI : (0.9368, 0.9891)
269 | # No Information Rate : 0.6502
270 | # P-Value [Acc > NIR] : <2e-16
271 | #
272 | # Kappa : 0.935
273 | #
274 | # Mcnemar's Test P-Value : 1
275 | #
276 | # Sensitivity : 0.9577
277 | # Specificity : 0.9773
278 | # Pos Pred Value : 0.9577
279 | # Neg Pred Value : 0.9773
280 | # Prevalence : 0.3498
281 | # Detection Rate : 0.3350
282 | # Detection Prevalence : 0.3498
283 | # Balanced Accuracy : 0.9675
284 | #
285 | # 'Positive' Class : malignant
286 | #
289 |
--------------------------------------------------------------------------------
/R/final.R:
--------------------------------------------------------------------------------
1 | library(mlbench)
2 | library(caret)
3 | library(mlr)
4 | library(tidyverse)
5 | library(ggthemes)
6 | library(gplots)
7 | library(randomForest)
8 | library(skimr) # skimr is sweet
9 |
10 | data(BostonHousing)
11 | head(BostonHousing)
12 | length(BostonHousing)
13 | str(BostonHousing)
14 | skim(BostonHousing)
15 | # Skim summary statistics
16 | # n obs: 506
17 | # n variables: 14
18 | #
19 | # ── Variable type:factor ──────────────────────────────────────────────────
20 | # variable missing complete n n_unique top_counts
21 | # chas 0 506 506 2 0: 471, 1: 35, NA: 0
22 | # rad 0 506 506 9 24: 132, 5: 115, 4: 110, 3: 38
23 | # ordered
24 | # FALSE
25 | # FALSE
26 | #
27 | # ── Variable type:numeric ─────────────────────────────────────────────────
28 | # variable missing complete n mean sd p0 p25 p50
29 | # age 0 506 506 68.57 28.15 2.9 45.02 77.5
30 | # b 0 506 506 356.67 91.29 0.32 375.38 391.44
31 | # crim 0 506 506 3.61 8.6 0.0063 0.082 0.26
32 | # dis 0 506 506 3.8 2.11 1.13 2.1 3.21
33 | # indus 0 506 506 11.14 6.86 0.46 5.19 9.69
34 | # lstat 0 506 506 12.65 7.14 1.73 6.95 11.36
35 | # medv 0 506 506 22.53 9.2 5 17.02 21.2
36 | # nox 0 506 506 0.55 0.12 0.39 0.45 0.54
37 | # ptratio 0 506 506 18.46 2.16 12.6 17.4 19.05
38 | # rm 0 506 506 6.28 0.7 3.56 5.89 6.21
39 | # tax 0 506 506 408.24 168.54 187 279 330
40 | # zn 0 506 506 11.36 23.32 0 0 0
41 | # p75 p100 hist
42 | # 94.07 100 ▁▂▂▂▂▂▃▇
43 | # 396.23 396.9 ▁▁▁▁▁▁▁▇
44 | # 3.68 88.98 ▇▁▁▁▁▁▁▁
45 | # 5.19 12.13 ▇▅▃▃▂▁▁▁
46 | # 18.1 27.74 ▃▆▅▁▁▇▁▁
47 | # 16.96 37.97 ▆▇▆▅▂▁▁▁
48 | # 25 50 ▂▅▇▆▂▂▁▁
49 | # 0.62 0.87 ▇▆▇▆▃▅▁▁
50 | # 20.2 22 ▁▂▂▂▅▅▇▃
51 | # 6.62 8.78 ▁▁▂▇▇▂▁▁
52 | # 666 711 ▃▇▂▅▁▁▁▆
53 | # 12.5 100 ▇▁▁▁▁▁▁▁
54 |
55 |
56 | ## Define a function using AES_STRING to put strings in ggplot objects
57 | scatterplotfun <- function(df, x,y){
58 | ggplot(data = df, aes_string(x = x, y = y))+ geom_point()
59 | }
60 |
61 | ## Define the name of the Y variable
62 | yname <- "medv"
63 |
64 | # Get the names of the numeric x variables
65 | BostonNumeric <- BostonHousing %>% keep(is.numeric)
66 | xname <- names(BostonNumeric[-ncol(BostonNumeric)])
67 |
68 | # ggplot with lapply
69 | lapply(xname, function(x) scatterplotfun(df = BostonNumeric, x = x, y = yname))
70 | plist <- lapply(xname, function(x) scatterplotfun(df = BostonNumeric, x = x, y = yname))
71 |
72 | library(cowplot)
73 | plot_grid(plotlist = plist)
74 |
75 | # make this a factor to show an example
76 | BostonHousing$rad <- as.factor(BostonHousing$rad)
77 | str(BostonHousing)
78 | # 'data.frame': 506 obs. of 14 variables:
79 | # $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
80 | # $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
81 | # $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
82 | # $ chas : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
83 | # $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
84 | # $ rm : num 6.58 6.42 7.18 7 7.15 ...
85 | # $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
86 | # $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
87 | # $ rad : Factor w/ 9 levels "1","2","3","4",..: 1 2 2 3 3 3 5 5 5 5 ...
88 | # $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
89 | # $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
90 | # $ b : num 397 397 393 395 397 ...
91 | # $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
92 | # $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
93 | # NULL
94 |
95 |
96 | # visualizing factor vs factor
97 |
98 | # reverse levels so we can read it in a human way
99 | # save column to vector
100 | chas <- BostonHousing$chas
101 | levels(chas)
102 | # [1] "0" "1"
103 | rev(levels(chas))
104 | # [1] "1" "0"
105 | BostonHousing$chas <- factor(chas, levels = rev(levels(chas)))
106 |
107 | # position = "fill" is the key here
108 | # please make your plots look good and theme them consistently
109 | ggplot(BostonHousing, aes(x = rad, fill = chas)) +
110 | geom_bar(alpha = 0.9, position = "fill") +
111 | coord_flip() +
112 |   labs(x = "rad", y = "Proportion", title = "Proportion of chas by rad",
113 | subtitle = "Stacked bar plot") +
114 | theme_hc() + scale_fill_hc()
115 |
116 | # correlation plot function we defined last week
117 | library(corrplot)
118 | correlator <- function(df){
119 | df %>%
120 | keep(is.numeric) %>%
121 | tidyr::drop_na() %>%
122 | cor %>%
123 | corrplot( addCoef.col = "white", number.digits = 2,
124 | number.cex = 0.5, method="square",
125 | order="hclust", title="Variable Corr Heatmap",
126 | tl.srt=45, tl.cex = 0.8)
127 | }
128 |
129 |
130 | BostonHousing %>% keep(is.numeric) %>% tidyr::drop_na() %>% cor
131 |
132 | ## interpret this
133 | correlator(BostonHousing)
134 |
135 | ## Now you do your LMs
136 |
137 | ## Useful variable importance plot
138 | ## Look at the plot on the left
139 | library(randomForest)
140 | rfreg <- randomForest(medv ~ ., data = BostonHousing, importance = TRUE)
141 | varImpPlot(rfreg)
142 |
143 |
144 |
145 | # amazing library for categoricals/factors. It rocks; a tiny illustrative sketch below
146 | library(forcats)
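# hedged sketch (added; not from the original notes): two handy forcats helpers
fct_count(BostonHousing$chas)        # tally the levels of a factor
head(fct_infreq(BostonHousing$rad))  # reorder levels by frequency (most common first)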
147 |
148 |
149 | ## classification dataset
150 |
151 | data(BreastCancer)
152 | bc <- BreastCancer
153 | skim(bc)
154 | # Skim summary statistics
155 | # n obs: 699
156 | # n variables: 11
157 | #
158 | # ── Variable type:character ───────────────────────────────────────────────
159 | # variable missing complete n min max empty n_unique
160 | # Id 0 699 699 5 8 0 645
161 | #
162 | # ── Variable type:factor ──────────────────────────────────────────────────
163 | # variable missing complete n n_unique
164 | # Bare.nuclei 16 683 699 10
165 | # Bl.cromatin 0 699 699 10
166 | # Cell.shape 0 699 699 10
167 | # Cell.size 0 699 699 10
168 | # Cl.thickness 0 699 699 10
169 | # Class 0 699 699 2
170 | # Epith.c.size 0 699 699 10
171 | # Marg.adhesion 0 699 699 10
172 | # Mitoses 0 699 699 9
173 | # Normal.nucleoli 0 699 699 10
174 | # top_counts ordered
175 | # 1: 402, 10: 132, 2: 30, 5: 30 FALSE
176 | # 2: 166, 3: 165, 1: 152, 7: 73 FALSE
177 | # 1: 353, 2: 59, 10: 58, 3: 56 TRUE
178 | # 1: 384, 10: 67, 3: 52, 2: 45 TRUE
179 | # 1: 145, 5: 130, 3: 108, 4: 80 TRUE
180 | # ben: 458, mal: 241, NA: 0 FALSE
181 | # 2: 386, 3: 72, 4: 48, 1: 47 TRUE
182 | # 1: 407, 2: 58, 3: 58, 10: 55 TRUE
183 | # 1: 579, 2: 35, 3: 33, 10: 14 FALSE
184 | # 1: 443, 10: 61, 3: 44, 2: 36 FALSE
185 |
186 | # 16 missing
187 |
188 | # get rid of ID column, get rid of NAs
189 | bc <- bc %>% select(-Id) %>% tidyr::drop_na()
190 | str(bc)
191 | # 'data.frame': 683 obs. of 10 variables:
192 | # $ Cl.thickness : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 5 5 3 6 4 8 1 2 2 4 ...
193 | # $ Cell.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 1 1 2 ...
194 | # $ Cell.shape : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 4 1 8 1 10 1 2 1 1 ...
195 | # $ Marg.adhesion : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 1 5 1 1 3 8 1 1 1 1 ...
196 | # $ Epith.c.size : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 2 7 2 3 2 7 2 2 2 2 ...
197 | # $ Bare.nuclei : Factor w/ 10 levels "1","2","3","4",..: 1 10 2 4 1 10 10 1 1 1 ...
198 | # $ Bl.cromatin : Factor w/ 10 levels "1","2","3","4",..: 3 3 3 3 3 9 3 3 1 2 ...
199 | # $ Normal.nucleoli: Factor w/ 10 levels "1","2","3","4",..: 1 2 1 7 1 7 1 1 1 1 ...
200 | # $ Mitoses : Factor w/ 9 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 5 1 ...
201 | # $ Class : Factor w/ 2 levels "benign","malignant": 1 1 1 1 1 2 1 1 1 1 ...
202 | glimpse(bc)
203 | library(pander)
204 | skim(bc) %>% pander
205 |
206 | # practice aes string and categorical plot here
207 |
208 | # catplot = category plot
209 | catplot <- function(df, x,y){
210 | ggplot(data = df, aes_string(x = x, fill = y)) +
211 | geom_bar(position = "fill", alpha = 0.9) +
212 | coord_flip()
213 | }
214 |
215 | ## Define the name of the Y variable
216 | yname <- "Class"
217 |
219 | # Get the names of the explanatory x variables
219 | xname <- names(bc[-ncol(bc)])
220 |
221 | # ggplot with lapply
222 | lapply(xname, function(x) catplot(df = bc, x = x, y = yname))
223 | plist <- lapply(xname, function(x) catplot(df = bc, x = x, y = yname))
224 | plist
225 | cowplot::plot_grid(plotlist = plist)
226 |
227 | # superassignment split practice
228 | split <- function(df, p = 0.75, list = FALSE, ...) {
229 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
230 | cat("creating training dataset...\n")
231 | training <<- df[train_ind, ]
232 | cat("completed training dataset, creating test set\n")
233 | test <<- df[-train_ind, ]
234 | cat("done")
235 | }
236 |
237 | split(bc)
238 |
239 |
240 | #
241 |
242 | library(doParallel)
243 | cores <- parallel::detectCores()
244 | # [1] 12
245 |
246 | # Generally do one less
247 | workers <- makeCluster(11L)
248 |
249 | # register for parallel computation
250 | registerDoParallel(workers)
251 |
252 | # train method for optimized specificity
253 | trainMethod <- trainControl( method = "repeatedcv", number = 25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)
254 |
255 |
256 | # naive bayes classifier
257 | fit.nb <- train(Class ~ ., data = training, method = "nb", metric = "Spec", trControl = trainMethod)
258 |
259 | # knn classifier
260 | fit.knn <- train(Class ~ ., data = training, method = "knn", metric = "Spec", trControl = trainMethod)
261 |
262 | # learn more about caret
263 | browseURL("https://topepo.github.io/caret/index.html")
264 | # if this fails try:
265 | # getOption("browser")
266 | # options(browser = "firefox")
267 |
--------------------------------------------------------------------------------
/R/preprocessing.R:
--------------------------------------------------------------------------------
1 | library(functional) # to compose the preprocessing pipeline
2 | library(data.table)# to read csvs FAST
3 | library(rlist) # for list manipulations
4 | library(pipeR) # fast, dumb pipes
5 | library(imputeTS) # to impute NAs
6 | library(pander) # so i can read the output
7 | library(foreach) # go fast
8 | library(doParallel) # go fast
9 |
10 | # Data import
11 | # datadir <- "../data/"
12 | # function which imports the data as a list, then fixes up the names to be nice
13 | import <- function(path){
14 | # first we list the files in the directory we specify
15 | files <- list.files(path)
16 |   # we find the csvs
17 | files <- files[grepl(files, pattern = ".csv")]
18 | # we paste the path to our filename
19 | filepaths <- sapply(files, function(x) paste0(path,x))
20 | # We read in all the files
21 | out <- lapply(filepaths, fread)
22 | # we clean up the names
23 | fnames <- gsub(".csv","",files)
24 | fnames <- gsub("[[:digit:]]+","", fnames)
25 | names(out) <- fnames
26 | # return the list of data frames
27 | out
28 | }
29 | # datas <- import(datadir)
30 |
31 | # count the nas of a data frame
32 |
33 | # sum up the NAs and combine into a vector, note we are using percentages here
34 | count_nas_single <- function(df){
35 | sapply(df, function(x) sum(is.na(x))/length(x))
36 |
37 | }
38 |
39 | # count the NAs of a whole list
40 | count_nas <- function(xs){
41 | lapply(xs, count_nas_single)
42 | }
43 |
44 | # pander(count_nas(datas))
45 | #
46 | #
47 | # * **BeijingPM_**:
48 | #
49 | # ---------------------------------------------------------------------
50 | # No year month day hour season PM_Dongsi PM_Dongsihuan
51 | # ---- ------ ------- ----- ------ -------- ----------- ---------------
52 | # 0 0 0 0 0 0 0.5236 0.61
53 | # ---------------------------------------------------------------------
54 | #
55 | # Table: Table continues below
56 | #
57 | #
58 | # ----------------------------------------------------------------------------
59 | # PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP
60 | # ----------------- ------------ ----------- ---------- ---------- -----------
61 | # 0.5259 0.04178 9.509e-05 0.006447 0.006447 9.509e-05
62 | # ----------------------------------------------------------------------------
63 | #
64 | # Table: Table continues below
65 | #
66 | #
67 | # --------------------------------------------------
68 | # cbwd Iws precipitation Iprec
69 | # ----------- ----------- --------------- ----------
70 | # 9.509e-05 9.509e-05 0.009204 0.009204
71 | # --------------------------------------------------
72 | #
73 | # * **ChengduPM_**:
74 | #
75 | # ---------------------------------------------------------------------
76 | # No year month day hour season PM_Caotangsi PM_Shahepu
77 | # ---- ------ ------- ----- ------ -------- -------------- ------------
78 | # 0 0 0 0 0 0 0.5356 0.5323
79 | # ---------------------------------------------------------------------
80 | #
81 | # Table: Table continues below
82 | #
83 | #
84 | # --------------------------------------------------------------------------
85 | # PM_US Post DEWP HUMI PRES TEMP cbwd Iws
86 | # ------------ --------- --------- ---------- --------- ---------- ---------
87 | # 0.4504 0.01006 0.01017 0.009908 0.01002 0.009908 0.01014
88 | # --------------------------------------------------------------------------
89 | #
90 | # Table: Table continues below
91 | #
92 | #
93 | # ------------------------
94 | # precipitation Iprec
95 | # --------------- --------
96 | # 0.0562 0.0562
97 | # ------------------------
98 | #
99 | # * **GuangzhouPM_**:
100 | #
101 | # --------------------------------------------------------------
102 | # No year month day hour season PM_City Station
103 | # ---- ------ ------- ----- ------ ----------- -----------------
104 | # 0 0 0 0 0 1.902e-05 0.3848
105 | # --------------------------------------------------------------
106 | #
107 | # Table: Table continues below
108 | #
109 | #
110 | # -----------------------------------------------------------------------
111 | # PM_5th Middle School PM_US Post DEWP HUMI PRES
112 | # ---------------------- ------------ ----------- ----------- -----------
113 | # 0.5988 0.3848 1.902e-05 1.902e-05 1.902e-05
114 | # -----------------------------------------------------------------------
115 | #
116 | # Table: Table continues below
117 | #
118 | #
119 | # ---------------------------------------------------------------
120 | # TEMP cbwd Iws precipitation Iprec
121 | # ----------- ----------- ----------- --------------- -----------
122 | # 1.902e-05 1.902e-05 1.902e-05 1.902e-05 1.902e-05
123 | # ---------------------------------------------------------------
124 | #
125 | # * **ShanghaiPM_**:
126 | #
127 | # -----------------------------------------------------------------------------
128 | # No year month day hour season PM_Jingan PM_US Post PM_Xuhui
129 | # ---- ------ ------- ----- ------ -------- ----------- ------------ ----------
130 | # 0 0 0 0 0 0 0.5303 0.3527 0.521
131 | # -----------------------------------------------------------------------------
132 | #
133 | # Table: Table continues below
134 | #
135 | #
136 | # -----------------------------------------------------------------------
137 | # DEWP HUMI PRES TEMP cbwd Iws
138 | # ----------- ----------- ----------- ----------- ----------- -----------
139 | # 0.0002472 0.0002472 0.0005325 0.0002472 0.0002282 0.0002282
140 | # -----------------------------------------------------------------------
141 | #
142 | # Table: Table continues below
143 | #
144 | #
145 | # -------------------------
146 | # precipitation Iprec
147 | # --------------- ---------
148 | # 0.07624 0.07624
149 | # -------------------------
150 | #
151 | # * **ShenyangPM_**:
152 | #
153 | # ----------------------------------------------------------------------
154 | # No year month day hour season PM_Taiyuanjie PM_US Post
155 | # ---- ------ ------- ----- ------ -------- --------------- ------------
156 | # 0 0 0 0 0 0 0.5362 0.5877
157 | # ----------------------------------------------------------------------
158 | #
159 | # Table: Table continues below
160 | #
161 | #
162 | # --------------------------------------------------------------------------
163 | # PM_Xiaoheyan DEWP HUMI PRES TEMP cbwd Iws
164 | # -------------- --------- --------- --------- --------- --------- ---------
165 | # 0.5317 0.01316 0.01293 0.01316 0.01316 0.01316 0.01316
166 | # --------------------------------------------------------------------------
167 | #
168 | # Table: Table continues below
169 | #
170 | #
171 | # ------------------------
172 | # precipitation Iprec
173 | # --------------- --------
174 | # 0.2427 0.2427
175 | # ------------------------
176 | #
177 | #
178 | #
179 | #
180 | #
181 | # NULL
182 |
183 | # convert a vector to a time series with the proper frequency (we will say years in this case)
184 | tots <- function(v){
185 | ts(v, frequency = 365*24)
186 | }
187 | # tots(datas[[1]]$PM_US) %>>% tail
188 |
189 | # convert a data frame into a list of time series objects, given column names
190 | totslist <- function(df){
191 | # vector of column names which we do not want to convert to a time series
192 | badlist <- c(
193 | "No",
194 | "year",
195 | "month",
196 | "day",
197 | "hour",
198 | "season",
199 | "cbwd"
200 | )
201 |   # names of our data frame
202 | nms <- colnames(df)
203 | # coerce to a list
204 | df <- as.list(df)
205 | # if the column at [[name]] is on our list, return it, otherwise, convert
206 | # to a time series. This allows us to deal with different amounts of data
207 | # collections of time series data (some series have more PM collecting
208 | # stations than others)
209 | for (name in nms){
210 | if (name %in% badlist){
211 | df[[name]] <- df[[name]]
212 | } else {
213 | df[[name]] <- tots(df[[name]])
214 | }
215 | }
216 | df
217 |
218 |
219 | }
220 | # datas[[1]] %>>% totsdf %>>%str
221 | # datas[[1]] %>>% totslist%>>%str
222 | # turn all data frames in a list of data frames to time series objects
223 | totsall <- function(xs){
224 | lapply(xs, totslist)
225 | }
226 | # str(datas[[1]]$PM_US)
227 | # datas %>>% totsall -> datas
228 |
229 | # impute NAs of a single series with spline interpolation
230 | # wrap the call in try() so we don't fail on error; instead do standard type checking:
231 | # if the output is a time series, the interpolation worked; otherwise leave the input alone
232 | imp_test <- function(v){
233 |   out <- try(na.interpolation(v, "spline"))
234 |   # if the interpolation worked (a ts came back), return it; otherwise return the input
235 |   if (is.ts(out)) out else v
236 | }
240 | # impute the NAs of a single list, keep column names (.final)
241 | impute <- function(xs){
242 | foreach(i = 1:length(xs),
243 | .final = function(x){
244 | setNames(x, names(xs))
245 | }) %dopar%
246 | imp_test(xs[[i]])
247 | }
248 | # example of parallel computing
249 | # cl <- makeCluster(11, type = "FORK")
250 | # registerDoParallel(cl)
251 | # na.ma(datas[[1]][["PM_"]], k=200)
252 | # na.interpolation(datas[[1]][["PM_Dongsi"]], "spline") %>>% head
253 | # impute(datas[[1]]) %>>% names
254 |
255 | # impute NAs of the parent list
256 | impute_list <- function(xs){
257 | lapply(xs, impute)
258 | }
259 |
260 | # make a fast hash table
261 | # hash tables are an excellent and flexible way to store large amounts of data
262 | # can be indexed with $ and [[]] nicely
# but the data is represented in a memory-efficient way, and can be manipulated in complex ways
# think of it like a really fast database for searching and inserting
265 | to_hash <- function(xs){
266 | list2env(xs, envir = NULL, hash = TRUE)
267 | }
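# quick illustration with toy data (not the PM datasets): elements of the
# resulting environment are looked up by name with $ or [[ ]]
# h <- to_hash(list(beijing = 1:3, shanghai = 4:6))
# h$beijing        # 1 2 3
# h[["shanghai"]]  # 4 5 6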
268 | # final preprocessing function:
269 |
270 | preprocess <- Compose(import, totsall, impute_list, to_hash)
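# functional::Compose chains its arguments left to right, so preprocess(x) is
# to_hash(impute_list(totsall(import(x)))). A tiny sketch of that ordering:
# add1 <- function(x) x + 1
# times2 <- function(x) x * 2
# Compose(add1, times2)(3)  # (3 + 1) * 2 = 8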
271 |
--------------------------------------------------------------------------------
/pres/biasAndInference.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Bias and Inference in Machine Learning"
3 | author: "David"
4 | date: "`r Sys.Date()`"
5 | output:
6 | revealjs::revealjs_presentation:
7 | df_print: paged
8 | theme: white
9 | transition: zoom
10 | self_contained: false
11 | reveal_plugins: ["chalkboard"]
12 | reveal_options:
13 | chalkboard:
14 | theme: whiteboard
15 | ---
16 |
17 | ```{r setup, include = FALSE}
18 | knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
19 | # for pretty data frames
20 | library(DT)
21 | # useful alias
22 | DT <- datatable
23 | ```
24 |
25 |
26 | # Responsible Modeling
27 |
28 | ```{r zoolander, echo = FALSE}
29 | knitr::include_graphics("https://i.imgur.com/uYs6KsN.mp4")
30 | ```
31 |
32 | ## Bias in AI: A massive problem
33 |
34 | >- Hidden racial bias in machine learning models
>- Gender bias in credit card models (e.g., the Apple Card case involving Steve Wozniak's wife)
36 | >- Impossible screenings in job applications
37 | >- Any other examples?
38 | >- We will all be affected (lots of models)
39 |
40 |
41 | ## Potential Solutions
42 |
43 | >- Highly technical options
44 | >- Simpler solutions for humans
45 |
# XAI: Machine Learning for Humans
47 |
48 | ## Explainable AI
49 |
50 | >- But how can "AI" be explainable?
51 | >- How does this reduce bias?
52 | >- How does this affect my case study??
53 |
54 | ## (Less important) Less Frustrating Code: Introducing MLR
55 |
56 | >- Uniform API for pretty much all R models
57 | >- Obviously outside of deep learning
58 | >- Readable code is more transparent, better, and arguably more ethical
59 |
60 | # Code walkthrough: Regression
61 |
62 | ## Load in necessary packages {.smaller}
63 |
64 |
65 | ```{r packs, echo = c(-1,-4)}
66 | library(DALEX) # for data for now
67 | library(mlr) # what we are interested in!
68 | data(apartments)
69 | DT(apartments, class = 'compact')
70 | ```
71 |
72 |
73 | ## Create a task
74 |
75 | ```{r taskSetup}
76 | regr_task <- makeRegrTask(data = apartments, target = "m2.price")
77 | ```
78 |
>- A task defines the machine learning problem we want to solve
>- It specifies whether we want to do regression, classification, etc.
81 |
82 |
83 | ## Create Learners
84 |
85 | ```{r reglrn}
86 | regr_lrn_knn <- makeLearner("regr.kknn")
87 | regr_lrn_lm <- makeLearner("regr.lm")
88 | regr_lrn_rf <- makeLearner("regr.ranger")
89 | ```
90 |
91 | ## Get parameter set to optimize!
92 |
93 | ```{r getPars, results = "hold"}
94 | getParamSet(regr_lrn_knn)
95 | getParamSet(regr_lrn_rf)
96 | getParamSet(regr_lrn_lm)
97 | ```
98 |
99 | ## Tune KNN!
100 |
101 | ```{r}
102 | knn_paramSet <- makeParamSet(
103 | makeIntegerParam("k", lower = 3, upper = 30)
104 | )
105 | rdesc = makeResampleDesc("CV", iters = 3L)
106 | ctrl <- makeTuneControlGrid()
107 | res <- tuneParams(regr_lrn_knn,
108 | task = regr_task,
109 | resampling=rdesc,
110 | measures = mse,
111 | par.set = knn_paramSet,
112 | control = ctrl)
113 |
114 | res
115 | ```
116 |
117 | ## Explore KNN tuning!
118 |
119 | ```{r}
120 | khpe <- generateHyperParsEffectData(res)
121 | plotHyperParsEffect(khpe, x = "k", y = "mse.test.mean", plot.type = "line")
122 | ```
123 |
124 | ## Tune random forest
125 |
We will only tune one parameter, just because y'all haven't actually studied this yet.
127 |
128 | ```{r}
129 | rf_paramSet <- makeParamSet(
130 | makeIntegerParam("num.trees", lower = 10, upper = 100, trafo = function(x) 10*x)
131 | )
132 | res2 <- tuneParams(regr_lrn_rf,
133 | task = regr_task,
134 | resampling=rdesc,
135 | measures = mse,
136 | par.set = rf_paramSet,
137 | control = ctrl)
138 |
139 | res2
140 | ```
141 |
142 | ## Explore again!
143 |
144 | ```{r}
145 |
146 | rfhpe <- generateHyperParsEffectData(res2, trafo = TRUE)
147 | plotHyperParsEffect(rfhpe, x = "num.trees", y = "mse.test.mean", plot.type = "line")
148 | ```
149 |
150 | ## Set hyperparameters for model!
151 |
152 | ```{r}
153 | knn_regr_tuned <- setHyperPars(regr_lrn_knn, k = res$x$k)
154 | rf_regr_tuned <- setHyperPars(regr_lrn_rf, num.trees = 600)
155 | ```
156 |
>- Note we can pull the tuned values straight out of the `res` object
>- Also note we can set them ourselves!
>- Always, always plot your tuning so you can find a simple model
160 |
161 | ## Train Models!
162 |
163 | ```{r}
164 | model_lm <- train(regr_lrn_lm, regr_task)
165 | model_knn <- train(knn_regr_tuned, regr_task)
166 | model_rf <- train(rf_regr_tuned, regr_task)
167 | ```
168 |
169 | ## Make predictions!!
170 |
171 | ```{r}
172 | lm_preds <- predict(model_lm, newdata = apartmentsTest)
173 | knn_preds <- predict(model_knn, newdata = apartmentsTest)
174 | rf_preds <- predict(model_rf, newdata = apartmentsTest)
175 | data.frame("lm" = lm_preds$data$response,
176 | "knn" = knn_preds$data$response,
177 | "rf" = rf_preds$data$response)
178 | ```
179 |
180 | ## Make this "explainable": introducing DALEX
181 |
182 | ```{r, results = "hide"}
183 | library(DALEX)
184 | library(DALEXtra)
185 | library(ingredients)
186 | lm_explained <- explain_mlr(model_lm,
187 | data = apartmentsTest,
188 | y = apartmentsTest$m2.price,
189 | label = "lm")
190 | knn_explained <- explain_mlr(model_knn,
191 | data = apartmentsTest,
192 | y = apartmentsTest$m2.price,
193 | label = "knn")
194 | rf_explained <- explain_mlr(model_rf,
195 | data = apartmentsTest,
196 | y = apartmentsTest$m2.price,
197 | label = "rf")
198 | explainers <- list(lm_explained, knn_explained, rf_explained)
199 | ```
200 |
201 | ## Model performance with DALEX
202 | ```{r, fig.show = "hold", echo = 1:3}
203 | model_perfs <- lapply(explainers, model_performance)
204 | p1 <- plot(model_perfs[[1]], model_perfs[[2]], model_perfs[[3]])
205 | p2 <- plot(model_perfs[[1]], model_perfs[[2]], model_perfs[[3]], geom = "boxplot")
206 | ```
207 |
208 | ## Model Performance with DALEX
209 | ```{r, echo = F}
210 | cowplot::plot_grid(p1, p2)
211 | ```
212 |
213 | ## Variable Importance
214 |
215 | >- Typically available in tree-based models such as rf and boosting
216 | >- But we can do it for any model with permutations
217 | >- How does it work?
>- Calculate the model's loss after permuting (shuffling) a single variable (a rough sketch follows)
>- Repeat for each variable
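
A hand-rolled sketch of the idea for one variable (illustrative only; a much-simplified version of what `ingredients::feature_importance()` does, and it assumes the `model_rf` and `apartmentsTest` objects from earlier in these slides):

```{r permutation-sketch, eval = FALSE}
# shuffle one column, re-score the model, and compare to the baseline loss
permute_loss <- function(model, data, y, variable) {
  baseline <- loss_root_mean_square(y, predict(model, newdata = data)$data$response)
  shuffled <- data
  shuffled[[variable]] <- sample(shuffled[[variable]])
  permuted <- loss_root_mean_square(y, predict(model, newdata = shuffled)$data$response)
  permuted - baseline  # the "difference" type: how much worse the loss gets
}
permute_loss(model_rf, apartmentsTest, apartmentsTest$m2.price, "surface")
```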
220 |
221 | ## Variable Importance
222 |
223 | ```{r}
224 | library(ingredients)
225 | model_vis <- lapply(explainers, function(x) feature_importance(x, loss_function = loss_root_mean_square, type = "difference"))
226 |
227 | ```
228 |
229 | ## Variable Importance
230 |
231 | ```{r, echo = FALSE}
232 | plot(model_vis[[1]], model_vis[[2]], model_vis[[3]])
233 | ```
234 |
235 | ## Partial Dependence
236 |
Show the relationship between continuous variables and model outcomes!
238 |
239 |
240 | ## Partial Dependence
241 |
242 |
243 | ```{r}
244 | pdps <- lapply(explainers, function(x) partial_dependency(x, variables = "construction.year"))
245 | ```
246 |
247 | ## Partial Dependence
248 |
249 | ```{r, echo = F}
250 | plot(pdps[[1]], pdps[[2]], pdps[[3]])
251 | ```
252 |
253 | ## Merging Paths!
254 |
255 | Show the relationship between a categorical variable and model response!
256 |
257 |
258 | ```{r}
259 | mpps <- lapply(explainers, function(x) variable_response(x, variable = "district", type = "factor"))
260 | ```
261 |
262 | ## Merging Paths!
263 |
264 | ```{r, echo = F}
265 | plot(mpps[[1]], mpps[[2]], mpps[[3]])
266 | ```
267 |
268 | ## Holy Grail funnel plot
269 |
270 | ```{r}
271 | funnel <- funnel_measure(rf_explained, list(knn_explained, lm_explained),
272 | partition_data = cbind(apartmentsTest,
273 | "m2.per.room" = apartmentsTest$surface/apartmentsTest$no.rooms),
274 | nbins = 5, measure_function = loss_root_mean_square, show_info = TRUE)
275 | ```
276 |
277 | ## Holy Grail Funnel plot
278 |
279 | ```{r, echo = FALSE}
280 | plot(funnel)
281 | ```
282 |
283 |
284 | # Classification now!
285 |
286 | ## Step 1: Data
287 |
288 | ```{r}
289 | data(wine, package = "breakDown")
290 | wine$quality <- ifelse(wine$quality>5, 1, 0)
291 | wine$quality <- factor(wine$quality)
292 | train_index <- sample(1:nrow(wine), 0.6 * nrow(wine))
293 | test_index <- setdiff(1:nrow(wine), train_index)
294 | wineTrain = wine[train_index,]
295 | wineTest <- wine[test_index,]
296 | ```
297 |
298 | ## Step 2: Task and Learner definition
299 |
300 | ```{r}
301 | class_task <- makeClassifTask(data = wineTrain, target = "quality")
302 |
303 | classif_knn <- makeLearner("classif.kknn", predict.type = "prob")
304 | classif_nb <- makeLearner("classif.naiveBayes", predict.type = "prob")
305 | classif_rf <- makeLearner("classif.ranger", predict.type = "prob")
306 | ```
307 |
308 | ## Step 3, find params to tune
309 |
310 | ```{r}
311 | lapply(list(classif_knn, classif_nb, classif_rf), getParamSet)
312 | ```
313 |
314 | ## Step 4, set up global tuning values
315 |
316 | ```{r}
317 | rdesc = makeResampleDesc("CV", iters = 3L)
318 | ctrl <- makeTuneControlRandom(maxit = 20L)
319 | ```
320 |
321 | ## Tune KNN
322 |
323 | ```{r}
324 | knn_paramSet <- makeParamSet(
325 | makeIntegerParam("k", lower = 3, upper = 40)
326 | )
327 |
328 | res <- tuneParams(classif_knn,
329 | task = class_task,
330 | resampling=rdesc,
331 | measures = list(tnr, mlr::acc, tpr),
332 | par.set = knn_paramSet,
333 | control = ctrl)
334 | ```
335 |
336 | ## Look at tuning!
337 |
338 | ```{r}
339 | khpe <- generateHyperParsEffectData(res)
340 | spec <- plotHyperParsEffect(khpe, x = "k", y = "tnr.test.mean", plot.type = "line")
341 | sens <- plotHyperParsEffect(khpe, x = "k", y = "tpr.test.mean", plot.type = "line")
342 | accu <- plotHyperParsEffect(khpe, x = "k", y = "acc.test.mean", plot.type = "line")
343 | ```
344 |
345 | ## Look at tuning!
346 |
347 | ```{r, echo = F}
348 | cowplot::plot_grid(accu, sens, spec)
349 | ```
350 |
351 | ## Define Tuned Model
352 |
353 | ```{r}
354 | knn_tuned <- setHyperPars(classif_knn, k = 7)
355 | ```
356 |
357 | ## Tune Naive Bayes
358 |
The version of naive Bayes implemented in mlr honestly sucks; I would use the klaR version, which will ALSO work with DALEX. We will just use the default naive Bayes here, and you can definitely tune the klaR NB yourself.
360 |
361 | ## Tune random forest
362 |
363 | ```{r}
364 | rf_paramSet <- makeParamSet(
365 | makeIntegerParam("num.trees", lower = 10, upper = 100, trafo = function(x) 10*x)
366 | )
367 | ctrl2 <- makeTuneControlGrid()
368 | res2 <- tuneParams(classif_rf,
369 | task = class_task,
370 | resampling=rdesc,
371 | measures = list(tnr, mlr::acc, tpr),
372 | par.set = rf_paramSet,
373 | control = ctrl2)
374 |
375 | res2
376 | ```
377 |
378 |
379 | ## Look at tuning!
380 |
381 | ```{r}
382 | rfpe <- generateHyperParsEffectData(res2, trafo = TRUE)
383 | spec <- plotHyperParsEffect(rfpe, x = "num.trees", y = "tnr.test.mean", plot.type = "line")
384 | sens <- plotHyperParsEffect(rfpe, x = "num.trees", y = "tpr.test.mean", plot.type = "line")
385 | accu <- plotHyperParsEffect(rfpe, x = "num.trees", y = "acc.test.mean", plot.type = "line")
386 | ```
387 |
388 | ## Look at tuning!
389 |
390 | ```{r, echo = F}
391 | cowplot::plot_grid(accu, sens, spec)
392 | ```
393 |
394 | ## Define Tuned Model
395 |
396 | ```{r}
397 | rf_tuned <- setHyperPars(classif_rf, num.trees = 240)
398 | ```
399 |
400 |
401 | ## Train models
402 |
403 | ```{r}
404 | models <- list(rf_tuned, knn_tuned, classif_nb)
405 | trained <- lapply(models, function(x) train(x, class_task))
406 | model_rf <- trained[[1]]
407 | model_knn <- trained[[2]]
408 | model_nb <- trained[[3]]
409 | ```
410 |
411 | ## Make predictions
412 |
413 | ```{r}
414 | nb_preds <- predict(model_nb, newdata = wineTest)
415 | knn_preds <- predict(model_knn, newdata = wineTest)
416 | rf_preds <- predict(model_rf, newdata = wineTest)
417 | ```
418 |
419 | Confusion matrices are left as an exercise ;)
420 |
421 | ## Explain models!
422 |
423 |
424 | ```{r, results = "hide"}
nb_explained <- explain_mlr(model_nb,
426 | data = wineTest,
427 | y = wineTest$quality,
428 | label = "nb")
429 | knn_explained <- explain_mlr(model_knn,
430 | data = wineTest,
431 | y = wineTest$quality,
432 | label = "knn")
433 | rf_explained <- explain_mlr(model_rf,
434 | data = wineTest,
435 | y = wineTest$quality,
436 | label = "rf")
437 | explainers2 <- list(nb_explained, knn_explained, rf_explained)
438 | ```
439 |
440 | ## Model Performance: residuals
441 |
442 | ```{r}
443 | perfs <- lapply(explainers2, model_performance)
plot1 <- plot(perfs[[1]], perfs[[2]], perfs[[3]])
plot2 <- plot(perfs[[1]], perfs[[2]], perfs[[3]], geom = "boxplot")
446 | ```
447 |
448 | ## Model Performance: residuals
449 |
450 | ```{r, echo = F}
451 | cowplot::plot_grid(plot1, plot2)
452 | ```
453 |
## More things: display effects per variable
455 |
456 | ```{r}
457 | selected_wines <- select_sample(wineTrain, n = 100)
458 | cps <- lapply(explainers2, function(x) ceteris_paribus(x, selected_wines))
459 | pdps_sulph_alcohol <- lapply(cps, function(x) aggregate_profiles(x, variables = c("sulphates", "alcohol")))
460 | pdp_plots <- lapply(pdps_sulph_alcohol, plot)
461 | ```
462 |
463 | ## Localized Variable response!
464 |
465 | ```{r, echo = F}
466 | cowplot::plot_grid(plotlist = pdp_plots)
467 | ```
468 |
469 | # Everything Else...
470 |
471 | ## Left as an exercise :)
472 |
>- There are a ton of other things to explore here, especially regarding classification and the positive and negative rates. Please refer to the manual.
474 |
475 | ## Sources and resources for students:
476 |
477 | >- [Manual for explainable AI in R and python, START HERE](https://pbiecek.github.io/PM_VEE/modelPerformance.html#modelPerformanceIntro)
478 | >- [DrWhy.AI, START HERE FOR PACKAGES](https://github.com/ModelOriented/DrWhy)
479 | >- [DALEX vignette](https://raw.githack.com/pbiecek/DALEX_docs/master/vignettes/DALEX_mlr.html#3_classification_use_case_-_wine_data)
480 | >- [DALEX documentation](https://modeloriented.github.io/DALEX/)
481 | >- [DALEXtra docs](https://modeloriented.github.io/DALEXtra/index.html)
>- Please note that when you move on from R, DALEX works with Python too, as well as Keras and deep learning! This is an important topic which we all need to be aware of.
483 | >- [AMAZING mlr docs](https://mlr.mlr-org.com/articles/tutorial/task.html)
484 | >- This is a super important topic, which I have BARELY scratched today. If you are going to fall into a rabbit hole over the break, this is where you should go.
485 |
--------------------------------------------------------------------------------
/.Rhistory:
--------------------------------------------------------------------------------
1 | conmat <- function(predicted, expected){
2 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted))
3 | cm
4 | }
5 | f1_score <- function(predicted, expected, positive.class="1") {
6 | cm = conmat(predicted, expected)
7 | precision <- diag(cm) / colSums(cm)
8 | recall <- diag(cm) / rowSums(cm)
9 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
10 | #Assuming that F1 is zero when it's not possible compute it
11 | f1[is.na(f1)] <- 0
12 | #Binary F1 or Multi-class macro-averaged F1
13 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1))
14 | }
15 | accuracy <- function(predicted, expected){
16 | cm <- confusionMatrix(predicted, expected)
17 | sum(diag(cm)/length(test$class))
18 | }
19 | cm <- conmat(predicted, expected)
20 | accuracy <- function(predicted, expected){
21 | cm <- conmat(predicted, expected)
22 | sum(diag(cm)/length(test$class))
23 | }
24 | f1_score(test_pred, test$class)
25 | f1_score(test_pred, test$class)
26 | f1_score(test_pred, test)
27 | get_scores <- function(predictions, test){
28 | f1 <- f1_score(predictions,test)
29 | acc <- accuracy(predictions,test)
30 | scores <- c(accuracy = acc, f1 = f1)
31 | scores
32 | }
33 | pander(get_scores(test_pred, test))
34 | conmat <- function(predicted, expected){
35 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted))
36 | cm
37 | }
38 | f1_score <- function(predicted, expected, positive.class="1") {
39 | cm = conmat(predicted, expected)
40 | precision <- diag(cm) / colSums(cm)
41 | recall <- diag(cm) / rowSums(cm)
42 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
43 | #Assuming that F1 is zero when it's not possible compute it
44 | f1[is.na(f1)] <- 0
45 | #Binary F1 or Multi-class macro-averaged F1
46 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1))
47 | }
48 | accuracy <- function(predicted, expected){
49 | cm <- conmat(predicted, expected)
50 | sum(diag(cm)/length(test$class))
51 | }
52 | get_scores <- function(predictions, test){
53 | f1 <- f1_score(predictions,test)
54 | acc <- accuracy(predictions,test)
55 | scores <- c(accuracy = acc, f1 = f1)
56 | scores
57 | }
58 | pander(get_scores(test_pred, test))
59 | nb_fit <- train(training,
60 | training$class,
61 | trControl = trainMethod,
62 | method = "nb",
63 | tuneLength = 10
64 | )
65 | nb_fit <- train(training,
66 | training$class,
67 | trControl = trainMethod,
68 | method = "nb",
69 | tuneLength = 10
70 | )
71 | ```
72 | nb_fit <- train(training,
73 | training$class,
74 | trControl = trainMethod,
75 | method = "nb",
76 | tuneLength = 10
77 | )
78 | nb_fit
79 | plot(nb_fit)
80 | nb_pred <- predict(nb_fit, test)
81 | nb_pred <- predict(nb_fit, newdata = test)
82 | nb_pred
83 | confusionMatrix(nb_pred, test$class)
84 | get_scores(nb_pred, test)
85 | fastNaiveBayes.detect_distribution(x, nrows = nrow(x))
86 | y <- train$class
87 | x <- train[-1]
88 | y <- trainng$class
89 | x <- training[-1]
90 | y <- training$class
91 | x <- training[-1]
92 | fastNaiveBayes.detect_distribution(x, nrows = nrow(x))
93 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x))
94 | dist
95 | fast_nb_fit <- fastNaiveBayes.mixed(x,y)
96 | fast_nb_fit
97 | summary(fast_nb_fit)
98 | plot(fast_nb_fit)
99 | fast_pred <- predict(fast_nb_fit, test[-1])
100 | fast_pred
101 | confusionMatrix(fast_pred, test$class)
102 | get_scores(fast_pred, test)
103 | confusionMatrix(fast_pred, test$class)
104 | get_scores(fast_pred, test)
105 | library(caret)
106 | library(fastNaiveBayes)
107 | library(readr)
108 | library(functional)
109 | library(ggplot2)
110 | library(magrittr)
111 | library(doParallel)
112 | library(caret)
113 | library(fastNaiveBayes)
114 | library(readr)
115 | library(functional)
116 | library(ggplot2)
117 | library(magrittr)
118 | library(caret)
119 | library(fastNaiveBayes)
120 | library(readr)
121 | library(functional)
122 | library(ggplot2)
123 | library(magrittr)
124 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
125 | wine <- read_csv(dataurl, col_names = F)
126 | good_cols <- c("class",
127 | "alcohol",
128 | 'malic_acid',
129 | 'ash',
130 | 'alkalinity',
131 | 'magnesium',
132 | 'total_phenols',
133 | 'flavanoids',
134 | 'nonflavonoids_phenols',
135 | 'proanthocyanins',
136 | 'color_intensity',
137 | 'hue',
138 | 'dilution',
139 | 'proline'
140 | )
141 | fix_cols <- function(df){
142 | colnames(df) <- good_cols
143 | df$class <- as.factor(df$class)
144 | df
145 | }
146 | wine <- fix_cols(wine)
147 | wine
148 | set.seed(3033)
149 | ## WARNING: Danger function
150 | split <- function(df, p = 0.75, list = FALSE, ...) {
151 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
152 | cat("creating training dataset...\n")
153 | training <<- df[train_ind, ]
154 | cat("completed training dataset, creating test set\n")
155 | test <<- df[-train_ind, ]
156 | cat("done")
157 | }
158 | split(wine)
159 | library(doParallel)
160 | numcores <- parallel::detectCores() -1
161 | cl <- makePSOCKcluster(numcores)
162 | registerDoParallel(cl)
163 | set.seed(3333)
164 | trainMethod <- trainControl(method = "repeatedcv",
165 | number = 10,
166 | repeats = 3)
167 | knn_fit <- train(class ~ .,
168 | data = training,
169 | method = "knn",
170 | trControl = trainMethod,
171 | preProcess = c("center", "scale"),
172 | tuneLength = 10)
173 | library(pander)
174 | knn_fit
175 | plot(knn_fit)
176 | knn_fit2 <- knn3(training, training$class, k = 15)
177 | knn_fit2
178 | test_pred <- predict(knn_fit, newdata = test)
179 | test_pred
180 | confusionMatrix(test_pred, test$class)
181 | conmat <- function(predicted, expected){
182 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted))
183 | cm
184 | }
185 | f1_score <- function(predicted, expected, positive.class="1") {
186 | cm = conmat(predicted, expected)
187 | precision <- diag(cm) / colSums(cm)
188 | recall <- diag(cm) / rowSums(cm)
189 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
190 | #Assuming that F1 is zero when it's not possible compute it
191 | f1[is.na(f1)] <- 0
192 | #Binary F1 or Multi-class macro-averaged F1
193 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1))
194 | }
195 | accuracy <- function(predicted, expected){
196 | cm <- conmat(predicted, expected)
197 | sum(diag(cm)/length(test$class))
198 | }
199 | get_scores <- function(predictions, test){
200 | f1 <- f1_score(predictions,test)
201 | acc <- accuracy(predictions,test)
202 | scores <- c(accuracy = acc, f1 = f1)
203 | scores
204 | }
205 | pander(get_scores(test_pred, test))
206 | nb_fit <- train(training,
207 | training$class,
208 | trControl = trainMethod,
209 | method = "nb",
210 | tuneLength = 10
211 | )
212 | nb_fit
213 | plot(nb_fit)
214 | nb_pred <- predict(nb_fit, newdata = test)
215 | nb_pred
216 | confusionMatrix(nb_pred, test$class)
217 | get_scores(nb_pred, test)
218 | y <- training$class
219 | x <- training[-1]
220 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x))
221 | dist
222 | fast_nb_fit <- fastNaiveBayes.mixed(x,y)
223 | fast_nb_fit
224 | summary(fast_nb_fit)
225 | fast_pred <- predict(fast_nb_fit, test[-1])
226 | fast_pred
227 | confusionMatrix(fast_pred, test$class)
228 | get_scores(fast_pred, test)
229 | nb_fit <- train(training,
230 | training$class,
231 | trControl = trainMethod,
232 | method = "naivebayes",
233 | tuneLength = 10
234 | )
235 | nb_fit <- train(training,
236 | training$class,
237 | trControl = trainMethod,
238 | method = "naive_bayes",
239 | tuneLength = 10
240 | )
241 | nb_fit
242 | plot(nb_fit)
243 | nb_pred <- predict(nb_fit, newdata = test)
244 | nb_pred
245 | confusionMatrix(nb_pred, test$class)
246 | get_scores(nb_pred, test)
247 | rm(list = ls())
248 | library(caret)
249 | library(fastNaiveBayes)
250 | library(readr)
251 | library(functional)
252 | library(ggplot2)
253 | library(magrittr)
254 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
255 | wine <- read_csv(dataurl, col_names = F)
256 | wine
257 | good_cols <- c("class",
258 | "alcohol",
259 | 'malic_acid',
260 | 'ash',
261 | 'alkalinity',
262 | 'magnesium',
263 | 'total_phenols',
264 | 'flavanoids',
265 | 'nonflavonoids_phenols',
266 | 'proanthocyanins',
267 | 'color_intensity',
268 | 'hue',
269 | 'dilution',
270 | 'proline'
271 | )
272 | fix_cols <- function(df){
273 | colnames(df) <- good_cols
274 | df$class <- as.factor(df$class)
275 | df
276 | }
277 | wine <- fix_cols(wine)
278 | wine
279 | set.seed(3033)
280 | ## WARNING: Danger function
281 | split <- function(df, p = 0.75, list = FALSE, ...) {
282 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
283 | cat("creating training dataset...\n")
284 | training <- df[train_ind, ]
285 | cat("completed training dataset, creating test set\n")
286 | test <- df[-train_ind, ]
287 | cat("done")
288 | }
289 | split(wine)
290 | x = 2
291 | plus1 <- function() {
292 | x =4
293 | x
294 | }
295 | plus1
296 | plus1()
297 | x
298 | plus1 <- function() {
299 | x <<- 4
300 | x
301 | }
302 | plus1()
303 | x
304 | set.seed(3033)
305 | ## WARNING: Danger function
306 | split <- function(df, p = 0.75, list = FALSE, ...) {
307 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
308 | cat("creating training dataset...\n")
309 | training <- df[train_ind, ]
310 | cat("completed training dataset, creating test set\n")
311 | test <- df[-train_ind, ]
312 | cat("done")
313 | }
314 | split(wine)
315 | set.seed(3033)
316 | ## WARNING: Danger function
317 | split <- function(df, p = 0.75, list = FALSE, ...) {
318 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
319 | cat("creating training dataset...\n")
320 | training <<- df[train_ind, ]
321 | cat("completed training dataset, creating test set\n")
322 | test <<- df[-train_ind, ]
323 | cat("done")
324 | }
325 | split(wine)
326 | square <- function(var){
327 | x = x^2
328 | x
329 | }
330 | z = 4
331 | square(z)
332 | zsquared <- square(z)
333 | zsquared
334 | ## WARNING: Danger function
335 | split <- function(df, p = 0.75, list = FALSE, ...) {
336 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
337 | cat("creating training dataset...\n")
338 | training <<- df[train_ind, ]
339 | cat("completed training dataset, creating test set\n")
340 | test <<- df[-train_ind, ]
341 | cat("done")
342 | }
343 | split(wine)
344 | library(doParallel)
345 | parallel::detectCores()
346 | numcores <- parallel::detectCores() - 1
347 | cl <- makePSOCKcluster(numcores)
348 | registerDoParallel(cl)
349 | set.seed(3333)
350 | set.seed(3333)
351 | trainMethod <- trainControl(method = "repeatedcv",
352 | number = 10,
353 | repeats = 3)
354 | # k-folds cross validation
355 | # y ~ x
356 | knn_fit <- train(class ~ .,
357 | data = training,
358 | method = "knn",
359 | trControl = trainMethod,
360 | preProcess = c("center", "scale"),
361 | tuneLength = 10)
362 | knn_fit
363 | plot(knn_fit)
364 | set.seed(3333)
365 | trainMethod <- trainControl(method = "repeatedcv",
366 | number = 10,
367 | repeats = 3)
368 | # k-folds cross validation
369 | # y ~ x
370 | knn_fit <- train(class ~ .,
371 | data = training,
372 | method = "knn",
373 | trControl = trainMethod,
374 | preProcess = c("center", "scale"),
375 | tuneLength = 10)
376 | knn_fit
377 | set.seed(3033)
378 | ## WARNING: Danger function
379 | split <- function(df, p = 0.75, list = FALSE, ...) {
380 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
381 | cat("creating training dataset...\n")
382 | training <<- df[train_ind, ]
383 | cat("completed training dataset, creating test set\n")
384 | test <<- df[-train_ind, ]
385 | cat("done")
386 | }
387 | split(wine)
388 | set.seed(3333)
389 | trainMethod <- trainControl(method = "repeatedcv",
390 | number = 10,
391 | repeats = 3)
392 | # k-folds cross validation
393 | # y ~ x
394 | knn_fit <- train(class ~ .,
395 | data = training,
396 | method = "knn",
397 | trControl = trainMethod,
398 | preProcess = c("center", "scale"),
399 | tuneLength = 10)
400 | knn_fit
401 | plot(knn_fit)
402 | knn_fit2 <- knn3(training, training$class, k = 15)
403 | knn_fit2
404 | test_pred <- predict(knn_fit, newdata = test)
405 | test_pred
406 | test_pred2 <- predict(knn_fit2, newdata = test)
407 | test_pred2
408 | confusionMatrix(test_pred, test$class)
409 | confusionMatrix(test_pred2, test$class)
410 | knn_fit2 <- knn3(training, training$class, k = 15, prob = FALSE)
411 | knn_fit2
412 | test_pred2 <- predict(knn_fit2, newdata = test)
413 | test_pred2
414 | test_pred2 <- predict(knn_fit2, newdata = test, prob = F)
415 | test_pred2
416 | confusionMatrix(test_pred2, test$class)
417 | confusionMatrix(test_pred, test$class)
418 | conmat <- function(predicted, expected){
419 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted))
420 | cm
421 | }
422 | f1_score <- function(predicted, expected, positive.class="1") {
423 | cm = conmat(predicted, expected)
424 | precision <- diag(cm) / colSums(cm)
425 | recall <- diag(cm) / rowSums(cm)
426 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
427 | #Assuming that F1 is zero when it's not possible compute it
428 | f1[is.na(f1)] <- 0
429 | #Binary F1 or Multi-class macro-averaged F1
430 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1))
431 | }
432 | accuracy <- function(predicted, expected){
433 | cm <- conmat(predicted, expected)
434 | sum(diag(cm)/length(test$class))
435 | }
436 | get_scores <- function(predictions, test){
437 | f1 <- f1_score(predictions,test)
438 | acc <- accuracy(predictions,test)
439 | scores <- c(accuracy = acc, f1 = f1)
440 | scores
441 | }
442 | pander(get_scores(test_pred, test))
443 | conmat(test_pred, test)
444 | conmat <- function(predicted, expected){
445 | cm <- as.matrix(table(Actual = as.factor(expected$class), Predicted = predicted))
446 | cm
447 | }
448 | conmat(test_pred, test)
449 | f1_score <- function(predicted, expected, positive.class="1") {
450 | cm = conmat(predicted, expected)
451 | precision <- diag(cm) / colSums(cm)
452 | recall <- diag(cm) / rowSums(cm)
453 | f1 <- ifelse(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
454 | #Assuming that F1 is zero when it's not possible compute it
455 | f1[is.na(f1)] <- 0
456 | #Binary F1 or Multi-class macro-averaged F1
457 | ifelse(nlevels(expected) == 2, f1[positive.class], mean(f1))
458 | }
459 | accuracy <- function(predicted, expected){
460 | cm <- conmat(predicted, expected)
461 | sum(diag(cm)/length(test$class))
462 | }
463 | get_scores <- function(predictions, test){
464 | f1 <- f1_score(predictions,test)
465 | acc <- accuracy(predictions,test)
466 | scores <- c(accuracy = acc, f1 = f1)
467 | scores
468 | }
469 | pander(get_scores(test_pred, test))
470 | nb_fit <- train(training,
471 | training$class,
472 | trControl = trainMethod,
473 | method = "naive_bayes",
474 | tuneLength = 10
475 | )
476 | nb_fit
477 | plot(nb_fit)
478 | nb_fit <- train(training,
479 | training$class,
480 | trControl = trainMethod,
481 | method = "naive_bayes",
482 | tuneLength = 10
483 | )
484 | nb_fit
485 | plot(nb_fit)
486 | nb_fit <- train(training,
487 | training$class,
488 | trControl = trainMethod,
489 | method = "nb",
490 | tuneLength = 10
491 | )
492 | nb_fit
493 | plot(nb_fit)
494 | nb_pred <- predict(nb_fit, newdata = test)
495 | nb_pred
496 | confusionMatrix(nb_pred, test$class)
497 | get_scores(nb_pred, test)
498 | library(fastNaiveBayes)
499 | y <- training$class
500 | x <- training[-1]
501 | dist <- fastNaiveBayes.detect_distribution(x, nrows = nrow(x))
502 | dist
503 | fast_nb_fit <- fastNaiveBayes.mixed(x,y)
504 | fast_nb_fit
505 | fast_pred <- predict(fast_nb_fit, test[-1])
506 | fast_pred
507 | fast_pred <- predict(fast_nb_fit, newdata = test)
508 | fast_pred <- predict(fast_nb_fit, newdata = test[-1])
509 | fast_pred
510 | confusionMatrix(fast_pred, test$class)
511 | get_scores(fast_pred, test)
512 | stopCluster(cl)
513 |
--------------------------------------------------------------------------------
/R/EDAreg.R:
--------------------------------------------------------------------------------
1 | # regression EDA
2 | data(mtcars)
3 |
4 | mtcars
5 | # mpg cyl disp hp drat wt qsec vs am gear carb
6 | # Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
7 | # Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
8 | # Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
9 | # Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
10 | # Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
11 | # Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
12 | # Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
13 | # Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
14 | # Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
15 | # Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
16 | # Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
17 | # Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
18 | # Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
19 | # Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
20 | # Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
21 | # Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
22 | # Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
23 | # Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
24 | # Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
25 | # Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
26 | # Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
27 | # Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
28 | # AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
29 | # Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
30 | # Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
31 | # Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
32 | # Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
33 | # Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
34 | # Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
35 | # Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
36 | # Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
37 | # Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
38 | mtcars[c("cyl","vs","am","gear","carb")]
39 | # cyl vs am gear carb
40 | # Mazda RX4 6 0 1 4 4
41 | # Mazda RX4 Wag 6 0 1 4 4
42 | # Datsun 710 4 1 1 4 1
43 | # Hornet 4 Drive 6 1 0 3 1
44 | # Hornet Sportabout 8 0 0 3 2
45 | # Valiant 6 1 0 3 1
46 | # Duster 360 8 0 0 3 4
47 | # Merc 240D 4 1 0 4 2
48 | # Merc 230 4 1 0 4 2
49 | # Merc 280 6 1 0 4 4
50 | # Merc 280C 6 1 0 4 4
51 | # Merc 450SE 8 0 0 3 3
52 | # Merc 450SL 8 0 0 3 3
53 | # Merc 450SLC 8 0 0 3 3
54 | # Cadillac Fleetwood 8 0 0 3 4
55 | # Lincoln Continental 8 0 0 3 4
56 | # Chrysler Imperial 8 0 0 3 4
57 | # Fiat 128 4 1 1 4 1
58 | # Honda Civic 4 1 1 4 2
59 | # Toyota Corolla 4 1 1 4 1
60 | # Toyota Corona 4 1 0 3 1
61 | # Dodge Challenger 8 0 0 3 2
62 | # AMC Javelin 8 0 0 3 2
63 | # Camaro Z28 8 0 0 3 4
64 | # Pontiac Firebird 8 0 0 3 2
65 | # Fiat X1-9 4 1 1 4 1
66 | # Porsche 914-2 4 0 1 5 2
67 | # Lotus Europa 4 1 1 5 2
68 | # Ford Pantera L 8 0 1 5 4
69 | # Ferrari Dino 6 0 1 5 6
70 | # Maserati Bora 8 0 1 5 8
71 | # Volvo 142E 4 1 1 4 2
72 | str(mtcars)
73 | # 'data.frame': 32 obs. of 11 variables:
74 | # $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
75 | # $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
76 | # $ disp: num 160 160 108 258 360 ...
77 | # $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
78 | # $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
79 | # $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
80 | # $ qsec: num 16.5 17 18.6 19.4 17 ...
81 | # $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
82 | # $ am : num 1 1 1 0 0 0 0 0 0 0 ...
83 | # $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
84 | # $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
85 | # NULL
86 | mtcars[c("cyl","vs","am","gear","carb")] <- lapply(mtcars[c("cyl","vs","am","gear","carb")], as.factor)
87 | hist(mtcars$mpg)
88 |
89 | # facet
90 |
91 |
92 | str(mtcars)
93 |
library(tidyverse)
library(ggthemes)  # theme_fivethirtyeight(), theme_few(), scale_fill_few()
library(cowplot)   # plot_grid()
mtcars %>% keep(is.numeric) %>% gather() %>% ggplot(aes(x = value)) + facet_wrap(~key, scales = "free") + geom_histogram()
96 | plotAllNumeric <- function(df){
97 | df%>%keep(is.numeric) %>%
98 | gather() %>%
99 | ggplot(aes(value)) +
100 | facet_wrap(~ key, scales = "free") +
101 | geom_density()+geom_histogram() + theme_fivethirtyeight()
102 | }
103 |
104 | plotAllNumeric(mtcars)
105 |
106 | library(RColorBrewer)
107 | library(gplots)
108 |
109 |
110 | # heatmap
111 |
112 | my_palette <- colorRampPalette(c("red", "white", "black"))
113 | heatmapper <- function(df){
114 | df %>%
115 | keep(is.numeric) %>%
116 | tidyr::drop_na() %>%
117 | cor %>%
118 | heatmap.2(col = my_palette ,
119 | density.info = "none", trace = "none",
            dendrogram = "both", symm = F,
121 | symkey = T, symbreaks = T, scale = "none",
122 | key = T)
123 | }
124 |
125 | data(iris)
126 | heatmapper(iris)
127 | heatmapper(mtcars)
128 |
129 |
130 |
131 | library(corrplot)
132 |
133 | ?corrplot
134 | correlator <- function(df){
135 | df %>%
136 | keep(is.numeric) %>%
137 | tidyr::drop_na() %>%
138 | cor %>%
139 | corrplot( addCoef.col = "white", number.digits = 2,
140 | number.cex = 0.5, method="square",
141 | order="hclust", title="Variable Corr Heatmap",
142 | tl.srt=45, tl.cex = 0.8)
143 | }
144 | correlator(mtcars)
145 |
146 |
147 |
148 | # Categorical variables
149 | # box plots
150 | mtcars %>% keep(is.factor) %>% names -> label
151 | # [1] "cyl" "vs" "am" "gear" "carb"
p  <- ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = cyl)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
p1 <- ggplot(data = mtcars, aes(x = vs, y = mpg, fill = vs)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
p2 <- ggplot(data = mtcars, aes(x = am, y = mpg, fill = am)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
p3 <- ggplot(data = mtcars, aes(x = gear, y = mpg, fill = gear)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()
p4 <- ggplot(data = mtcars, aes(x = carb, y = mpg, fill = carb)) + geom_boxplot() + scale_fill_few(palette = "Dark") + theme_few()

plot_grid(p, p1, p2, p3, p4, ncol = 3, labels = label)
159 |
160 |
161 | p <-ggplot(data = mtcars, aes(x = cyl, y = mpg, fill = cyl)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few()
162 | p1<-ggplot(data = mtcars, aes(x = vs, y = mpg, fill = vs)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few()
163 | p2<-ggplot(data = mtcars, aes(x = am, y = mpg, fill = am)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few()
164 | p3<-ggplot(data = mtcars, aes(x = gear, y = mpg, fill = gear)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few()
165 | p4<-ggplot(data = mtcars, aes(x = carb, y = mpg, fill = carb)) + geom_violin() + scale_fill_few(palette = "Dark") + theme_few()
166 |
167 | plot_grid(p,p1,p2,p3,p4, ncol = 3, labels = label)
168 |
169 |
170 | # disp or weight = eliminate one
171 | # maybe eliminate gear or lm
172 | model1 <- lm(data = mtcars, mpg ~.)
173 | summary(model1)
174 | #
175 | # Call:
176 | # lm(formula = mpg ~ ., data = mtcars)
177 | #
178 | # Residuals:
179 | # Min 1Q Median 3Q Max
180 | # -3.5087 -1.3584 -0.0948 0.7745 4.6251
181 | #
182 | # Coefficients:
183 | # Estimate Std. Error t value
184 | # (Intercept) 23.87913 20.06582 1.190
185 | # cyl6 -2.64870 3.04089 -0.871
186 | # cyl8 -0.33616 7.15954 -0.047
187 | # disp 0.03555 0.03190 1.114
188 | # hp -0.07051 0.03943 -1.788
189 | # drat 1.18283 2.48348 0.476
190 | # wt -4.52978 2.53875 -1.784
191 | # qsec 0.36784 0.93540 0.393
192 | # vs1 1.93085 2.87126 0.672
193 | # am1 1.21212 3.21355 0.377
194 | # gear4 1.11435 3.79952 0.293
195 | # gear5 2.52840 3.73636 0.677
196 | # carb2 -0.97935 2.31797 -0.423
197 | # carb3 2.99964 4.29355 0.699
198 | # carb4 1.09142 4.44962 0.245
199 | # carb6 4.47757 6.38406 0.701
200 | # carb8 7.25041 8.36057 0.867
201 | # Pr(>|t|)
202 | # (Intercept) 0.2525
203 | # cyl6 0.3975
204 | # cyl8 0.9632
205 | # disp 0.2827
206 | # hp 0.0939 .
207 | # drat 0.6407
208 | # wt 0.0946 .
209 | # qsec 0.6997
210 | # vs1 0.5115
211 | # am1 0.7113
212 | # gear4 0.7733
213 | # gear5 0.5089
214 | # carb2 0.6787
215 | # carb3 0.4955
216 | # carb4 0.8096
217 | # carb6 0.4938
218 | # carb8 0.3995
219 | # ---
220 | # Signif. codes:
221 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’
222 | # 0.1 ‘ ’ 1
223 | #
224 | # Residual standard error: 2.833 on 15 degrees of freedom
225 | # Multiple R-squared: 0.8931, Adjusted R-squared: 0.779
226 | # F-statistic: 7.83 on 16 and 15 DF, p-value: 0.000124
227 | #
228 |
229 | mtcars2 <- mtcars %>% keep(is.numeric)
230 | mtcars2$disp <- NULL
231 |
232 | model2 <- lm(data = mtcars2, mpg~.)
233 |
234 | summary(model2)
235 | #
236 | # Call:
237 | # lm(formula = mpg ~ ., data = mtcars2)
238 | #
239 | # Residuals:
240 | # Min 1Q Median 3Q Max
241 | # -3.5775 -1.6626 -0.3417 1.1317 5.4422
242 | #
243 | # Coefficients:
244 | # Estimate Std. Error t value
245 | # (Intercept) 19.25970 10.31545 1.867
246 | # hp -0.01784 0.01476 -1.209
247 | # drat 1.65710 1.21697 1.362
248 | # wt -3.70773 0.88227 -4.202
249 | # qsec 0.52754 0.43285 1.219
250 | # Pr(>|t|)
251 | # (Intercept) 0.072785 .
252 | # hp 0.237319
253 | # drat 0.184561
254 | # wt 0.000259 ***
255 | # qsec 0.233470
256 | # ---
257 | # Signif. codes:
258 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’
259 | # 0.1 ‘ ’ 1
260 | #
261 | # Residual standard error: 2.539 on 27 degrees of freedom
262 | # Multiple R-squared: 0.8454, Adjusted R-squared: 0.8225
263 | # F-statistic: 36.91 on 4 and 27 DF, p-value: 1.408e-10
264 | #
265 |
266 |
267 | mtcars3 <- mtcars %>% keep(is.numeric)
268 |
269 | model3 <- lm(data = mtcars3, mpg~.)
270 | summary(model3)
271 | #
272 | # Call:
273 | # lm(formula = mpg ~ ., data = mtcars3)
274 | #
275 | # Residuals:
276 | # Min 1Q Median 3Q Max
277 | # -3.5404 -1.6701 -0.4264 1.1320 5.4996
278 | #
279 | # Coefficients:
280 | # Estimate Std. Error t value
281 | # (Intercept) 16.53357 10.96423 1.508
282 | # disp 0.00872 0.01119 0.779
283 | # hp -0.02060 0.01528 -1.348
284 | # drat 2.01578 1.30946 1.539
285 | # wt -4.38546 1.24343 -3.527
286 | # qsec 0.64015 0.45934 1.394
287 | # Pr(>|t|)
288 | # (Intercept) 0.14362
289 | # disp 0.44281
290 | # hp 0.18936
291 | # drat 0.13579
292 | # wt 0.00158 **
293 | # qsec 0.17523
294 | # ---
295 | # Signif. codes:
296 | # 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’
297 | # 0.1 ‘ ’ 1
298 | #
299 | # Residual standard error: 2.558 on 26 degrees of freedom
300 | # Multiple R-squared: 0.8489, Adjusted R-squared: 0.8199
301 | # F-statistic: 29.22 on 5 and 26 DF, p-value: 6.892e-10
302 | #
303 |
304 |
305 | # exploring with base R and lapply
306 | # modify for your own data, this is geared for mtcars
307 | # for example train$income ~ train[[x]] instead of mtcars
308 | plot_vs_response <- function(x){
309 | plot(mtcars$mpg ~ mtcars[[x]], xlab = x)
310 | lw1 <- loess(mtcars$mpg ~ mtcars[[x]])
311 | j <- order(mtcars[[x]])
312 | lines(mtcars[[x]][j],lw1$fitted[j],col="red",lwd=3)
313 | }
314 | mtcars %>% keep(is.numeric) %>% names -> numNames
315 | numNames
316 | # [1] "mpg" "disp" "hp" "drat" "wt" "qsec"
317 | # remove mpg
318 | numNames <- numNames[-1]
319 | length(numNames)
320 | # [1] 5
321 | # set up graphical parameters:
322 |
323 | par(mfrow = c(2,3))
324 | # plot all numeric variables as x vs response with lapply
# works like a for loop over the column names (see the sketch below)
326 | lapply(numNames, plot_vs_response)
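# for reference, the same thing with an explicit for loop (commented out here
# so the plots are not drawn twice):
# for (nm in numNames) plot_vs_response(nm)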
327 |
328 | # how do you interpret this? remember how wt and disp are highly correlated??
329 |
330 |
331 |
# Classification EDA (light example)
333 |
334 |
335 | library(caret)
336 | library(fastNaiveBayes)
337 | library(readr)
338 | library(functional)
339 | library(ggplot2)
340 | library(magrittr)
341 | library(tidyverse)
342 |
343 | dataurl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
344 |
345 | wine <- read_csv(dataurl, col_names = F)
346 | good_cols <- c("class",
347 | "alcohol",
348 | 'malic_acid',
349 | 'ash',
350 | 'alkalinity',
351 | 'magnesium',
352 | 'total_phenols',
353 | 'flavanoids',
354 | 'nonflavonoids_phenols',
355 | 'proanthocyanins',
356 | 'color_intensity',
357 | 'hue',
358 | 'dilution',
359 | 'proline'
360 | )
361 |
362 | fix_cols <- function(df){
363 | colnames(df) <- good_cols
364 | df$class <- as.factor(df$class)
365 | df
366 | }
367 | wine <- fix_cols(wine)
368 | glimpse(wine)
369 |
370 |
371 | set.seed(3033)
372 | ## WARNING: Danger function
373 | split <- function(df, p = 0.75, list = FALSE, ...) {
374 | train_ind <- createDataPartition(df[[1]], p = p, list = list)
375 | cat("creating training dataset...\n")
376 | training <<- df[train_ind, ]
377 | cat("completed training dataset, creating test set\n")
378 | test <<- df[-train_ind, ]
379 | cat("done")
380 | }
381 |
382 | split(wine)
383 |
384 | ggplot(data = wine, aes(x = malic_acid, fill = class)) + geom_density()
385 | ggplot(data = wine, aes(x = alkalinity, fill = class)) + geom_density()
386 | ggplot(data = wine, aes(x = ash, fill = class)) + geom_density()
387 | ggplot(data = wine, aes(x = magnesium, fill = class)) + geom_density()
388 |
389 |
390 | library(doParallel)
391 | numcores <- parallel::detectCores() - 1
392 | cl <- makePSOCKcluster(numcores)
393 | registerDoParallel(cl)
394 |
395 |
396 | set.seed(3333)
397 | trainMethod <- trainControl(method = "repeatedcv",
398 | number = 10,
399 | repeats = 3)
400 | # k-folds cross validation
401 | # y ~ x
402 | # use train to do a grid search for best model, see
403 | # https://topepo.github.io/caret/model-training-and-tuning.html#model-training-and-parameter-tuning
404 | knn_fit <- train(class ~ .,
405 | data = training,
406 | method = "knn",
407 | trControl = trainMethod,
408 | preProcess = c("center", "scale"),
409 | tuneLength = 10)
410 |
411 | knn_fit
412 | # k-Nearest Neighbors
413 | #
414 | # 135 samples
415 | # 13 predictor
416 | # 3 classes: '1', '2', '3'
417 | #
418 | # Pre-processing:
419 | # centered (13), scaled (13)
420 | # Resampling: Cross-Validated (10 fold, repeated 3 times)
421 | # Summary of sample sizes: 121, 122, 122, 121, 121, 121, ...
422 | # Resampling results across tuning parameters:
423 | #
424 | # k Accuracy Kappa
425 | # 5 0.9700549 0.9548756
426 | # 7 0.9676740 0.9516351
427 | # 9 0.9609280 0.9418362
428 | # 11 0.9579426 0.9370280
429 | # 13 0.9702686 0.9552588
430 | # 15 0.9722527 0.9579543
431 | # 17 0.9752442 0.9625294
432 | # 19 0.9681013 0.9519242
433 | # 21 0.9726496 0.9588742
434 | # 23 0.9726496 0.9589829
435 | #
# Accuracy was used to select the optimal model using the largest value.
# The final value used for the model was k = 17.
441 |
442 |
443 | plot(knn_fit)
444 |
445 |
446 | test_pred <- predict(knn_fit, newdata = test)
447 | test_pred
448 |
449 |
450 | confusionMatrix(test_pred, test$class)
451 |
452 | # try with different predictors as per your EDA
# an idea to do this programmatically: write a function and then lapply over all the
# different predictor sets (may instead require %dopar% or mclapply); a rough sketch below
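# a rough sketch of that idea; the predictor subsets here are made up purely for illustration
# predictor_sets <- list(c("alcohol", "proline"),
#                        c("flavanoids", "color_intensity", "hue"))
# fit_knn <- function(vars) {
#   train(x = training[vars], y = training$class,
#         method = "knn", trControl = trainMethod,
#         preProcess = c("center", "scale"), tuneLength = 10)
# }
# fits <- lapply(predictor_sets, fit_knn)  # or parallel::mclapply(predictor_sets, fit_knn)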
454 |
--------------------------------------------------------------------------------
/pres/html-scraping.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "HTML Scraping in R"
3 | author: "David Josephs"
4 | output: html_document
5 | ---
6 |
7 |
8 | # Lord of the Rings Example
9 |
10 | ## Setup
11 |
First, let's load up two libraries which will make our life easier. The first is rvest, a great library for reading HTML; it is basically an extension of the xml2 package, with easy syntax that will be quite helpful going forward.
13 |
The second one is, in my opinion, one of the most useful libraries for doing any sort of data science or data analysis in R: the tidyverse. Just google it and see the documentation; it is a set of packages in which all of the functions have similar APIs and arguments, allowing for consistency throughout our programming. They are also all pretty fast, with nice syntax. Examples from the tidyverse are: readr (data loading), dplyr (data analysis/cleaning/general utility), tidyr (data cleaning again, reshaping), purrr (functional programming), and ggplot2 (data viz).
15 |
16 |
17 | ```r
18 | library(rvest)
19 | library(tidyverse)
20 | ```
21 |
Next, let's load up our data. In this example we will be looking at the IMDb page for The Lord of the Rings, so we will assign a variable to the URL of the page we are interested in:
23 |
24 |
25 | ```r
26 | lotr <- 'https://www.imdb.com/title/tt0120737/fullcredits?ref_=tt_cl_sm#cast'
27 | ```
28 |
29 | ## Reading the data
30 |
31 | ### The pipe operator
32 |
Before we can read in the data, let's first learn about `%>%` pipes. A pipe basically says: take the thing on the left and make it an argument of the thing on the right. For example, let's say we want to take the mean of every column of mtcars, the classic R example dataset. We can do that with:
34 |
35 |
36 | ```r
37 | lapply(mtcars,mean)
38 | ```
39 |
40 | ```
41 | ## $mpg
42 | ## [1] 20.09062
43 | ##
44 | ## $cyl
45 | ## [1] 6.1875
46 | ##
47 | ## $disp
48 | ## [1] 230.7219
49 | ##
50 | ## $hp
51 | ## [1] 146.6875
52 | ##
53 | ## $drat
54 | ## [1] 3.596563
55 | ##
56 | ## $wt
57 | ## [1] 3.21725
58 | ##
59 | ## $qsec
60 | ## [1] 17.84875
61 | ##
62 | ## $vs
63 | ## [1] 0.4375
64 | ##
65 | ## $am
66 | ## [1] 0.40625
67 | ##
68 | ## $gear
69 | ## [1] 3.6875
70 | ##
71 | ## $carb
72 | ## [1] 2.8125
73 | ```
74 |
75 | This is mapping the mean function over the mtcars dataset. Now, the output of this is not very pretty, so we will turn it into a nice, horizontal data frame:
76 |
77 |
78 | ```r
79 | as.data.frame(lapply(mtcars,mean))
80 | ```
81 |
82 | ```
83 | ## mpg cyl disp hp drat wt qsec vs
84 | ## 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375
85 | ## am gear carb
86 | ## 1 0.40625 3.6875 2.8125
87 | ```
88 |
Still ugly. Let's try using the pander library to make this look nice:
90 |
91 |
92 | ```r
library(pander)
pander(as.data.frame(lapply(mtcars, mean)))
94 | ```
95 |
96 |
97 | -----------------------------------------------------------------------------------------
98 | mpg cyl disp hp drat wt qsec vs am gear carb
99 | ------- ------- ------- ------- ------- ------- ------- -------- -------- ------- -------
100 | 20.09 6.188 230.7 146.7 3.597 3.217 17.85 0.4375 0.4062 3.688 2.812
101 | -----------------------------------------------------------------------------------------
102 |
Much better, but look at how many parentheses we wrote, and how difficult this is to read. Imagine if we had 4 or 5 more steps. We would have to repeatedly assign things to new variables, cluttering our computer's memory, just to keep the code readable. Even then, if we assigned a variable on every step, someone reviewing your code would have to read 20 or so lines above just to understand the final printing line. This leads to errors and is not reproducible. Instead, let's try it with the pipe operator. Mathematically, `f(x,y) = x %>% f(y)`, if that helps:
104 |
105 |
106 | ```r
107 | mtcars %>% lapply(mean) %>% as.data.frame %>% pander
108 | ```
109 |
110 |
111 | -----------------------------------------------------------------------------------------
112 | mpg cyl disp hp drat wt qsec vs am gear carb
113 | ------- ------- ------- ------- ------- ------- ------- -------- -------- ------- -------
114 | 20.09 6.188 230.7 146.7 3.597 3.217 17.85 0.4375 0.4062 3.688 2.812
115 | -----------------------------------------------------------------------------------------
116 |
This reads from left to right (as we English speakers are in the habit of doing):
First, we take the mtcars dataset. Then, we apply the mean function to every column of the dataset, which outputs a list. We then turn the list, which is hard to read, into a nice flat data frame, and finally we pretty up the data frame. This is the pipe operator.
119 |
120 | ### Actually reading in the data
121 |
So, with our knowledge of the pipe operator, what can we do? Let's use rvest functions to turn the raw XML and/or HTML data into something nice and human readable.
123 |
First, let's read in the website:
125 |
126 |
127 | ```r
128 | # not run
129 | read_html(lotr)
130 | ```
131 |
Next, let's select all the tables in the raw data (we know all of our data is in tables) with the `html_nodes()` function:
133 |
134 |
135 | ```r
136 | read_html(lotr) %>% html_nodes("table")
137 | ```
138 |
Next, let's choose the right table. By looking at the website, we know that the third table contains the info on the cast. To choose the third table of an unnamed object, we are going to have to use the `.` placeholder, which simply stands for the thing coming down the pipe from the left.
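
As a tiny toy example of the dot placeholder on its own (separate from our scraping):

```r
c(10, 20, 30) %>% .[[2]]  # the dot stands for the vector coming down the pipe; returns 20
```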
140 |
141 |
142 |
143 | ```r
144 | read_html(lotr) %>% html_nodes("table") %>% .[[3]]
145 | ```
146 |
147 | #### An Aside on lists
148 | Why did we do `[[]]`?
This is because `html_nodes()` outputs a list, and there are three ways to get items out of a list. `$` works for named items and keeps the type of the item if it is some sort of vector. `[]` lets us index the list, but the output is always itself a list (the element stays wrapped). Third, we have `[[]]`, which lets us index the list and get the element back in its proper data type. Experiment with this by using the following list as well as the built-in `typeof()` function.
150 |
151 |
152 | ```r
153 | x <- list("char" = c("cat","dog"), "nest" = list((1:3),2:4), "int" = 4:5, "logical" = c(T,F,T,F), "float" = c(87.5, -962.4))
154 | ```
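
For instance, a quick check of the difference using the list above (a small sketch; try it yourself):

```r
typeof(x["int"])    # "list": single brackets keep the list wrapper
typeof(x[["int"]])  # "integer": double brackets extract the element itself
identical(x$char, x[["char"]])  # TRUE: $ and [[ ]] agree for named elements
```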
155 |
156 | ### Back to Business
157 |
158 | Now that we understand what `.[[3]]` is doing, we can now extract the full dataset:
159 |
160 |
161 | ```r
162 | read_html(lotr) %>% html_nodes("table") %>% .[[3]] %>% html_table -> cast
163 | (head(cast))
164 | ```
165 |
166 | ```
167 | ## X1 X2 X3 X4
168 | ## 1
169 | ## 2 Alan Howard ... Voice of the Ring \n \n \n (voice)
170 | ## 3 Noel Appleby ... Everard Proudfoot
171 | ## 4 Sean Astin ... Sam
172 | ## 5 Sala Baker ... Sauron
173 | ## 6 Sean Bean ... Boromir
174 | ```
175 |
Great. Now, that process was pretty painful and took a lot of typing, and in the future we may not know which table we are looking for, so let's write a nice little function to do it all in one step:
177 |
178 |
179 |
180 | ```r
181 | tablescraper <- function(url, item){
  read_html(url) %>% html_nodes("table") %>% .[[item]] %>% html_table -> out
183 | return(out)
184 | }
185 | ```
186 |
187 | Now that we have a nice function, we can iteratively search through the IMDB site:
188 |
189 |
190 | ```r
191 | tablescraper(lotr,1) %>% head
192 | ```
193 |
194 | ```
195 | ## X1 X2 X3
196 | ## 1 Peter Jackson NA NA
197 | ```
198 |
199 | ```r
200 | tablescraper(lotr,2) %>% head
201 | ```
202 |
203 | ```
204 | ## X1 X2 X3
205 | ## 1 J.R.R. Tolkien ... (novel)
206 | ## 2
207 | ## 3 Fran Walsh ... (screenplay) &
208 | ## 4 Philippa Boyens ... (screenplay) &
209 | ## 5 Peter Jackson ... (screenplay)
210 | ```
211 |
212 | ```r
213 | tablescraper(lotr,3) -> cast
214 | ```
215 |
We can even imagine, for a large project, just writing a loop to do all of this; a rough sketch of that idea follows.
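
As a sketch (purely illustrative), grabbing the first three tables on the page in one go:

```r
all_tables <- lapply(1:3, function(i) tablescraper(lotr, i))
```
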
Next, let's check out the first and last ten items of cast:
218 |
219 |
220 | ```r
ht <- function(x, ...){
  # print head explicitly; otherwise only the last expression (tail) is returned and shown
  print(head(x, ...))
  tail(x, ...)
}
225 | ht(cast,10)
226 | ```
227 |
228 | ```
229 | ## X1 X2 X3
230 | ## 124 Chris Ryan ...
231 | ## 125 Paul Shapcott ...
232 | ## 126 Samuel E. Shore ...
233 | ## 127 Mike Stearne ...
234 | ## 128 Andrew Stehlin ...
235 | ## 129 Ken Stratton ...
236 | ## 130 Jo Surgison ...
237 | ## 131 James Waterhouse-Brown ...
238 | ## 132 Tim Wong ...
239 | ## 133 Robert Young ...
240 | ## X4
241 | ## 124 Breelander \n \n \n (uncredited)
242 | ## 125 Burning Ringwraith \n \n \n (uncredited)
243 | ## 126 Refugee / \n Orc \n \n \n (uncredited)
244 | ## 127 Uruk-hai \n \n \n (uncredited)
245 | ## 128 Uruk-hai \n \n \n (uncredited)
246 | ## 129 Isengard Orc / \n Last Alliance Soldier / \n Morgul Orc / \n Uruk-hai \n \n \n (uncredited)
247 | ## 130 Hobbit \n \n \n (uncredited)
248 | ## 131 Goblin \n \n \n (uncredited)
249 | ## 132 Uruk-hai \n \n \n (uncredited)
250 | ## 133 Goblin \n \n \n (uncredited)
251 | ```
252 |
***NOTE***: the `...` in our function allows for extra arguments. We do this so we can pass in the extra parameter `10`, which changes head and tail to show the first and last 10 rows instead of the default 6.
254 |
255 | ## Cleaning the data
Wow, this data is a mess. The first thing we see is that the first row is entirely blank, and that the first and third columns are completely empty. Let's get rid of those:
257 |
258 |
259 | ```r
260 | cast <- cast[-1,]
261 | cast$X1 <- NULL
262 | cast$X3 <- NULL
263 | ht(cast)
264 | ```
265 |
266 | ```
267 | ## X2
268 | ## 128 Andrew Stehlin
269 | ## 129 Ken Stratton
270 | ## 130 Jo Surgison
271 | ## 131 James Waterhouse-Brown
272 | ## 132 Tim Wong
273 | ## 133 Robert Young
274 | ## X4
275 | ## 128 Uruk-hai \n \n \n (uncredited)
276 | ## 129 Isengard Orc / \n Last Alliance Soldier / \n Morgul Orc / \n Uruk-hai \n \n \n (uncredited)
277 | ## 130 Hobbit \n \n \n (uncredited)
278 | ## 131 Goblin \n \n \n (uncredited)
279 | ## 132 Uruk-hai \n \n \n (uncredited)
280 | ## 133 Goblin \n \n \n (uncredited)
281 | ```
282 |
283 | Next, let's rename the columns with dplyr's `rename()`:
284 |
285 |
286 | ```r
287 | cast %>% rename(Actor = X2, Character = X4) -> cast
288 | ```
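For comparison, a base R way to do the same renaming might look like this (it assumes the two remaining columns are in this order):

```r
# assign a new names vector; the order must match the columns
names(cast) <- c("Actor", "Character")
```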
289 |
290 | Looking better. We know from the IMDb page that the table contains a row reading "Rest of cast listed alphabetically:", so let's get rid of that. To do this, we are going to use `grepl()`.
291 |
292 | `grepl()` searches for a pattern and returns a logical (TRUE/FALSE) vector indicating, element by element, whether there is a match. We can then keep only the rows of `cast` where the result of `grepl()` is not TRUE, eliminating the unwanted line:
293 |
294 |
295 | ```r
296 | cast <- cast[!grepl("Rest of cast listed alphabetically:", cast$Actor), ]
297 | ```
298 |
299 |
300 | As a learning challenge, try doing the same filtering with dplyr and see how its syntax differs from this base R approach; one possible version is sketched below. See which one you prefer.
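This sketch assumes dplyr is loaded (we already used it for `rename()`) and should give the same result:

```r
# filter() keeps only the rows where the condition is TRUE
cast <- cast %>% filter(!grepl("Rest of cast listed alphabetically:", Actor))
```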
301 |
302 | Next, let's get rid of those nasty `\n`'s. To do this, let's use `gsub()`, short for global substitute. Since we don't know how the newlines are delimited, we will search for every kind of line ending: `\n` (Unix), `\r\n` (Windows), and `\r` (old Mac/web line endings). To do this, we will use the regular expression `[\r\n]`. The square brackets define a character class, so it matches any single `\r` or `\n` character (and therefore handles `\r\n` as well). Let's turn them all into nothing:
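A quick toy example of what that pattern does (the string here is made up):

```r
gsub("[\r\n]", "", "Sam \r\n the \n Brave")
# [1] "Sam  the  Brave"
```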
303 |
304 |
305 | ```r
306 | cast$Character<-gsub("[\r\n]","",cast$Character)
307 | ht(cast)
308 | ```
309 |
310 | ```
311 | ## Actor
312 | ## 128 Andrew Stehlin
313 | ## 129 Ken Stratton
314 | ## 130 Jo Surgison
315 | ## 131 James Waterhouse-Brown
316 | ## 132 Tim Wong
317 | ## 133 Robert Young
318 | ## Character
319 | ## 128 Uruk-hai (uncredited)
320 | ## 129 Isengard Orc / Last Alliance Soldier / Morgul Orc / Uruk-hai (uncredited)
321 | ## 130 Hobbit (uncredited)
322 | ## 131 Goblin (uncredited)
323 | ## 132 Uruk-hai (uncredited)
324 | ## 133 Goblin (uncredited)
325 | ```
326 |
327 | Now we have a ton of whitespace; let's get rid of that. The regular expression for a single whitespace character is `\s`, and adding `+` gives `\s+`, which matches one or more whitespace characters in a row. Inside an R string the backslash itself has to be escaped, so we write the pattern as `"\\s+"`. This will match any run of whitespace, however long. Let's turn each of those runs into a single space:
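Again, a quick toy example (the string is made up):

```r
gsub("\\s+", " ", "Goblin      (uncredited)")
# [1] "Goblin (uncredited)"
```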
328 |
329 |
330 | ```r
331 | cast$Character<-gsub("\\s+"," ",cast$Character)
332 | ht(cast)
333 | ```
334 |
335 | ```
336 | ## Actor
337 | ## 128 Andrew Stehlin
338 | ## 129 Ken Stratton
339 | ## 130 Jo Surgison
340 | ## 131 James Waterhouse-Brown
341 | ## 132 Tim Wong
342 | ## 133 Robert Young
343 | ## Character
344 | ## 128 Uruk-hai (uncredited)
345 | ## 129 Isengard Orc / Last Alliance Soldier / Morgul Orc / Uruk-hai (uncredited)
346 | ## 130 Hobbit (uncredited)
347 | ## 131 Goblin (uncredited)
348 | ## 132 Uruk-hai (uncredited)
349 | ## 133 Goblin (uncredited)
350 | ```
351 |
352 | Excellent work: we have turned a once very ugly raw table into a tidy data frame we can actually work with.
353 |
354 | # Challenges
355 |
356 | Two challenges here:
357 |
358 | * Is there another way we could have cleaned up the `\n`'s or the extra whitespace? Try out `library(stringr)` and explore the functions there; one possible starting point is sketched after this list.
359 | 
360 | * Try separating first name from last name (e.g., make a first-name and a last-name column), using whatever means necessary (this is in your homework assignment this week).
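
For the first challenge, one possible stringr sketch (assuming the package is installed) is:

```r
library(stringr)
# str_squish() trims leading/trailing whitespace and collapses internal runs,
# so it handles both the newlines and the repeated spaces in one pass
cast$Character <- str_squish(cast$Character)
```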
361 |
362 | # Note
363 |
364 | To see more examples and play around with the source code for this document, see `R/scraping.R`.
365 |