├── .gitignore
├── 00_data
├── .DS_Store
├── HomeCredit_columns_description.csv.zip
└── application_train.csv.zip
├── 00_images
├── DS4B_201_R_Course.png
├── kaggle_credit_default.png
└── rstudio_server.png
├── 00_scripts
└── c1.R
├── 01_machine_learning_h2o
└── 01_machine_learning_h2o.R
├── Dockerfile
├── README.html
├── README.md
└── workshop_2018_dsgo.Rproj
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | Icon*
6 | *.csv
7 | *.DS_Store
--------------------------------------------------------------------------------
/00_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/.DS_Store
--------------------------------------------------------------------------------
/00_data/HomeCredit_columns_description.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/HomeCredit_columns_description.csv.zip
--------------------------------------------------------------------------------
/00_data/application_train.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/application_train.csv.zip
--------------------------------------------------------------------------------
/00_images/DS4B_201_R_Course.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/DS4B_201_R_Course.png
--------------------------------------------------------------------------------
/00_images/kaggle_credit_default.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/kaggle_credit_default.png
--------------------------------------------------------------------------------
/00_images/rstudio_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/rstudio_server.png
--------------------------------------------------------------------------------
/00_scripts/c1.R:
--------------------------------------------------------------------------------
1 | # CHALLENGE SOLUTION ----
2 |
3 | # Train a feed-forward deep neural net on the same predictors/response
4 | # prepared by the main script (x, y, train_h2o, test_h2o are defined there).
5 | t0 <- Sys.time()
6 | h2o_deeplearning <- h2o.deeplearning(
7 |     x = x,
8 |     y = y,
9 |     training_frame = train_h2o,
10 |     validation_frame = test_h2o,
11 |     nfolds = 5,
12 |     seed = 1234,
13 |
14 |     # Deep Learning: 10 passes over the data, 3 hidden layers
15 |     epochs = 10,
16 |     hidden = c(100, 50, 10)
17 |     )
18 | Sys.time() - t0
19 | # Time difference of 59.41523 secs
20 |
21 | # Validation AUC for the challenge model
22 | h2o.auc(h2o_deeplearning, valid = TRUE)
23 | # [1] 0.7098785
24 |
--------------------------------------------------------------------------------
/01_machine_learning_h2o/01_machine_learning_h2o.R:
--------------------------------------------------------------------------------
1 | # MACHINE LEARNING ----
2 |
3 | # Objectives:
4 | #   Size the problem
5 | #   Prepare the data for Binary Classification
6 | #   Build models with H2O: GLM, GBM, RF
7 | #   Inspect Features with LIME
8 |
9 | # Estimated time: 2-3 hours
10 |
11 |
12 |
13 | # 1.0 LIBRARIES ----
14 | library(tidyverse) # Workhorse with dplyr, ggplot2, etc
15 | library(h2o)       # High Performance Machine Learning
16 | library(recipes)   # Preprocessing
17 | library(rsample)   # Sampling
18 | library(lime)      # Black-box explanations
19 |
20 |
21 | # 2.0 DATA ----
22 |
23 | # Extract the zipped CSVs next to the archives
24 | unzip("00_data/application_train.csv.zip", exdir = "00_data/")
25 | unzip("00_data/HomeCredit_columns_description.csv.zip", exdir = "00_data/")
26 |
27 | # Loan Applications (50% of data)
28 | application_train_raw_tbl <- read_csv("00_data/application_train.csv")
29 |
30 | application_train_raw_tbl
31 |
32 | glimpse(application_train_raw_tbl)
33 |
34 | # Column (Feature) Descriptions
35 | feature_description_tbl <- read_csv("00_data/HomeCredit_columns_description.csv")
36 |
37 | feature_description_tbl
38 |
39 | # 3.0 SIZE THE PROBLEM ----
40 |
41 | # How many defaulters?
42 | # NOTE(review): n / 0.15 scales the sample up, assuming it represents ~15%
43 | # of annual application volume -- confirm this assumption with the business.
44 | application_train_raw_tbl %>%
45 |     count(TARGET) %>%
46 |     mutate(
47 |         n_total  = n / 0.15,
48 |         pct      = n_total / sum(n_total),
49 |         pct_text = scales::percent(pct)
50 |     )
51 |
52 | # Size the problem financially $$$
53 | size_problem_tbl <- application_train_raw_tbl %>%
54 |     count(TARGET) %>%
55 |     filter(TARGET == 1) %>%
56 |     mutate(
57 |         # approximate number of annual defaults
58 |         prop         = 0.15,
59 |         n_total      = n / prop,
60 |         # cost of a single default: avg loan size, 40% recovered
61 |         avg_loan     = 15000,
62 |         avg_recovery = 0.40 * avg_loan,
63 |         avg_loss     = avg_loan - avg_recovery,
64 |         # annualized total loss, formatted for reporting
65 |         total_loss   = n_total * avg_loss,
66 |         total_loss_text = scales::dollar(total_loss)
67 |     )
68 |
69 | size_problem_tbl
70 |
71 |
72 | # 4.0 EXPLORATORY DATA ANALYSIS (SKIPPED) ----
73 | #   SKIPPED - Very Important!
74 | #   Efficient exploration of features to find which to focus on
75 | #   Critical Step in Business Science Problem Framework
76 | #   Taught in my DS4B 201-R Course
77 | #   IMPORTANT: ATTEND MY TALK TOMORROW
78 |
72 |
73 | # 5.0 SPLIT DATA ----
74 |
75 | # Resource: https://tidymodels.github.io/rsample/
76 |
77 | # First take a stratified 20% sample of the full data, then split that
78 | # sample 80/20 into train/test (stratified on TARGET both times).
79 | set.seed(1234)
80 | sample_split <- initial_split(application_train_raw_tbl, strata = "TARGET", prop = 0.2)
81 |
82 | set.seed(1234)
83 | train_test_split <- initial_split(training(sample_split), strata = "TARGET", prop = 0.8)
84 |
85 | # Working with 20% sample of "Big Data"
86 | train_raw_tbl <- training(train_test_split) # 80% of Data
87 | test_raw_tbl  <- testing(train_test_split)  # 20% of Data
88 |
89 | # Verify stratification kept the TARGET proportions intact
90 | target_proportions <- function(data) {
91 |     data %>%
92 |         count(TARGET) %>%
93 |         mutate(prop = n / sum(n))
94 | }
95 |
96 | target_proportions(train_raw_tbl)
97 |
98 | target_proportions(test_raw_tbl)
99 |
98 | # 6.0 PREPROCESSING ----
99 |
100 | # Fix issues with data:
101 | #  Some Numeric data with low number of unique values should be Factor (Categorical)
102 | #  All Character data should be Factor (Categorical)
103 | #  NA's (imputation)
104 |
105 | # 6.1 Handle Categorical ----
106 |
107 | # Numeric columns with <= 6 distinct values are really categorical codes
108 | num2factor_names <- train_raw_tbl %>%
109 |     select_if(is.numeric) %>%
110 |     map_df(~ unique(.) %>% length()) %>%
111 |     gather() %>%
112 |     arrange(value) %>%
113 |     filter(value <= 6) %>%
114 |     pull(key)
115 |
116 | num2factor_names
117 |
118 | # Character columns all become factors
119 | string2factor_names <- train_raw_tbl %>%
120 |     select_if(is.character) %>%
121 |     names()
122 |
123 | string2factor_names
124 |
125 |
126 | # 6.2 Missing Data ----
127 |
128 | # Proportion of NA's per column, highest first
129 | missing_tbl <- train_raw_tbl %>%
130 |     summarize_all(.funs = funs(sum(is.na(.)) / length(.))) %>% # NOTE(review): funs() is deprecated in dplyr >= 0.8; keep while pinned to rocker 3.4.3
131 |     gather() %>%
132 |     arrange(desc(value))
133 |
134 | missing_tbl
135 |
136 | # Visualize: lollipop chart of missingness per feature
137 | missing_tbl %>%
138 |     filter(value > 0) %>%
139 |     mutate(key = as_factor(key) %>% fct_rev()) %>%
140 |     ggplot(aes(x = value, y = key)) +
141 |     geom_point() +
142 |     geom_segment(aes(xend = 0, yend = key)) +
143 |     expand_limits(x = c(0, 1)) +
144 |     scale_x_continuous(labels = scales::percent) +
145 |     labs(title = "Percentage Missing")
146 |
147 |
148 | # 6.3 Recipes ----
149 |
150 | # Resource: https://tidymodels.github.io/recipes/
151 |
152 | # recipe: factor conversions, then mean/mode imputation of NA's,
153 | rec_obj <- recipe(TARGET ~ ., data = train_raw_tbl) %>%
154 |     step_num2factor(num2factor_names) %>%
155 |     step_string2factor(string2factor_names) %>%
156 |     step_meanimpute(all_numeric()) %>%
157 |     step_modeimpute(all_nominal()) %>%
158 |     prep(stringsAsFactors = FALSE)
159 |
160 | # bake: apply the trained recipe to both splits
161 | train_tbl <- bake(rec_obj, train_raw_tbl)
162 | test_tbl  <- bake(rec_obj, test_raw_tbl)
163 |
164 | train_tbl %>%
165 |     glimpse()
166 |
167 | # 7.0 MODELING -----
168 |
169 | # 7.1 H2O Setup ----
170 |
171 | # H2O Docs: http://docs.h2o.ai
172 |
173 | # Start (or connect to) the local H2O cluster
174 | h2o.init()
175 |
176 | # Ship the preprocessed splits into the cluster
177 | train_h2o <- as.h2o(train_tbl)
178 | test_h2o  <- as.h2o(test_tbl)
179 |
180 | # Response / predictor column names
181 | y <- "TARGET"
182 | x <- setdiff(names(train_h2o), y)
183 |
184 | # 7.2 H2O Models ----
185 |
186 | # 7.2.1 GLM (Elastic Net) ----
187 |
188 | t0 <- Sys.time()
189 | h2o_glm <- h2o.glm(
190 |     x = x,
191 |     y = y,
192 |     training_frame = train_h2o,
193 |     validation_frame = test_h2o,
194 |     nfolds = 5,
195 |     seed = 1234,
196 |
197 |     # GLM: binomial family for binary classification
198 |     family = "binomial"
199 |
200 | )
201 | Sys.time() - t0
202 | # Time difference of 6.508575 secs
203 |
204 | # Validation AUC
205 | h2o.auc(h2o.performance(h2o_glm, valid = TRUE))
206 | # [1] 0.7384649
207 |
208 | h2o_glm@allparameters
209 |
210 | # 7.2.2 GBM ----
211 |
212 | # Resource: https://blog.h2o.ai/2016/06/h2o-gbm-tuning-tutorial-for-r/
213 |
214 | t0 <- Sys.time()
215 | h2o_gbm <- h2o.gbm(
216 |     x = x,
217 |     y = y,
218 |     training_frame = train_h2o,
219 |     validation_frame = test_h2o,
220 |     nfolds = 5,
221 |     seed = 1234,
222 |
223 |     # GBM: 100 trees, depth 5, shrinkage 0.1
224 |     ntrees = 100,
225 |     max_depth = 5,
226 |     learn_rate = 0.1
227 | )
228 | Sys.time() - t0
229 | # Time difference of 29.29766 secs
230 |
231 | # Validation AUC
232 | h2o.auc(h2o.performance(h2o_gbm, valid = TRUE))
233 | # [1] 0.7369739
234 |
235 | h2o_gbm@allparameters
236 |
237 | # 7.2.3 Random Forest ----
238 |
239 | t0 <- Sys.time()
240 | h2o_rf <- h2o.randomForest(
241 |     x = x,
242 |     y = y,
243 |     training_frame = train_h2o,
244 |     validation_frame = test_h2o,
245 |     nfolds = 5,
246 |     seed = 1234,
247 |
248 |     # RF: 100 trees, depth 5
249 |     ntrees = 100,
250 |     max_depth = 5
251 |
252 | )
253 | Sys.time() - t0
254 | # Time difference of 27.21049 secs
255 |
256 | # Validation AUC
257 | h2o.auc(h2o.performance(h2o_rf, valid = TRUE))
258 | # [1] 0.7259596
259 |
260 | h2o_rf@allparameters
261 |
259 |
260 | # CHALLENGE: DEEP LEARNING ----
261 |
262 | # 10 Minutes
263 | # Create a Deep Learning Algorithm with H2O
264 | # h2o.deeplearning
265 | # 10 epochs
266 | # 3 hidden layers: 100, 50, 10,
267 |
268 |
269 |
270 | # 7.3 Saving & Loading Models ----
271 |
272 | # h2o.saveModel() returns the on-disk path of the saved model; capture it
273 | # so the model can be reloaded without hard-coding a path (the original
274 | # passed "" to h2o.loadModel(), which fails on an empty path).
275 | model_path <- h2o.saveModel(h2o_gbm, "00_models")
276 |
277 | h2o.loadModel(model_path)
278 |
279 | # 8.0 Making Predictions -----
280 |
281 | # Score the hold-out test frame with the GBM
282 | prediction_h2o <- h2o.predict(h2o_gbm, newdata = test_h2o)
283 |
284 | # Pull predictions back into R and attach the actuals + applicant ids
285 | prediction_tbl <- prediction_h2o %>%
286 |     as_tibble() %>%  # FIX: as.tibble() is deprecated; as_tibble() is the supported name
287 |     bind_cols(
288 |         test_tbl %>% select(TARGET, SK_ID_CURR)
289 |     )
290 |
291 | prediction_tbl
292 |
293 | # Predictions for the actual defaulters
294 | prediction_tbl %>%
295 |     filter(TARGET == "1")
296 |
291 |
292 | # 9.0 PERFORMANCE (SKIPPING) -----
293 |
294 | # Very Important topics (not covered here):
295 | #   Adjusting Threshold
296 | #   ROC Plot, Precision vs Recall
297 | #   Gain & Lift - Important for executives
298 |
299 | # Full validation performance report for the GBM
300 | h2o.performance(h2o_gbm, valid = TRUE)
301 |
302 |
303 | # 10.0 EXPLANATIONS LIME ----
304 |
305 | # Create explainer: lime learns the training feature distributions
306 | # (continuous features binned into quartiles) so it can perturb inputs
307 | explainer <- train_tbl %>%
308 |     select(-TARGET) %>%
309 |     lime(
310 |         model = h2o_gbm,
311 |         bin_continuous = TRUE,
312 |         n_bins = 4,
313 |         quantile_bins = TRUE
314 |     )
315 |
316 | # Create explanation for the first defaulter in the test set
317 | explanation <- test_tbl %>%
318 |     filter(TARGET == "1") %>%
319 |     slice(1) %>%
320 |     select(-TARGET) %>%
321 |     lime::explain(
322 |         explainer = explainer,
323 |         n_features = 8,
324 |         n_permutations = 10000,
325 |         dist_fun = "gower",
326 |         kernel_width = 1.5,
327 |         feature_select = "lasso_path",
328 |         # n_labels = 2,
329 |         labels = "p1"
330 |     )
331 |
332 | explanation %>%
333 |     as_tibble() %>%  # FIX: as.tibble() is deprecated; as_tibble() is the supported name
334 |     glimpse()
335 |
336 | # Visualize
337 | plot_features(explanation)
338 |
339 |
340 | # What are Ext_Source?
341 |
342 | feature_description_tbl %>%
343 |     filter(str_detect(Row, "EXT_SOURCE")) %>%
344 |     View()
345 |
346 | # Equifax, Experian, TransUnion
347 | # 11.0 OPTIMIZATION (SKIPPING) ----
348 |
349 | # Expected Value
350 | # Threshold Optimization - Find the balance of False Positives & False Negatives that maximizes revenue
351 | #   Sensitivity Analysis - Taking into account what assumptions we are inputting into the model
352 |
353 |
354 | # 12.0 RECOMMENDATION ALGORITHMS (SKIPPING) ----
355 |
356 | # 3 Step Process:
357 | # 1. Discretized Correlation Visualization (Correlation Funnel)
358 | # 2. Fill out our Recommendation Algorithm Worksheet
359 | # 3. Implement Strategies into R Code
360 | # Correlation Funnel - S&P Loved This!!
361 |
362 | # BONUS #1: GRIDSEARCH ----
363 |
364 | # GBM hyperparameters to sweep (cartesian: 2 learn rates x 3 depths = 6 models)
365 | gbm_params <- list(learn_rate = c(0.01, 0.1),
366 |                    max_depth = c(3, 5, 9))
367 | gbm_params
368 |
369 | # Train and validate a cartesian grid of GBMs
370 | gbm_grid <- h2o.grid("gbm",
371 |                      x = x,
372 |                      y = y,
373 |                      grid_id = "gbm_grid1",
374 |                      training_frame = train_h2o,
375 |                      validation_frame = test_h2o,
376 |                      ntrees = 100,
377 |                      seed = 1234,
378 |                      hyper_params = gbm_params)
379 |
380 | # FIX: the grid above is registered as "gbm_grid1"; the original queried
381 | # "gbm_grid", an id that does not exist, so h2o.getGrid() errored.
382 | h2o.getGrid(grid_id = "gbm_grid1",
383 |             sort_by = "auc",
384 |             decreasing = TRUE)
385 |
386 | # Validation AUC of the top-ranked grid model
387 | h2o.getModel("gbm_grid1_model_1") %>%
388 |     h2o.auc(valid = TRUE)
389 | # [1] 0.7459666
390 |
388 |
389 | # BONUS #2: AUTOML ----
390 |
391 | # Let H2O search across model families automatically for up to 5 minutes
392 | t0 <- Sys.time()
393 | h2o_automl <- h2o.automl(
394 |     x = x,
395 |     y = y,
396 |     training_frame = train_h2o,
397 |     validation_frame = test_h2o,
398 |     nfolds = 5,
399 |     seed = 1234,
400 |
401 |     # AutoML
402 |     max_runtime_secs = 300
403 | )
404 | Sys.time() - t0
405 | # Time difference of 5.243099 mins
406 |
407 |
408 | # Ranked leaderboard of every model AutoML trained
409 | h2o_automl@leaderboard
410 |
411 | # Validation AUC of the single best model
412 | h2o.auc(h2o_automl@leader, valid = TRUE)
413 | # [1] 0.7423596
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | ####### Dockerfile #######
2 | FROM rocker/tidyverse:3.4.3
3 |
4 | RUN apt-get update -qq \
5 | && apt-get -y --no-install-recommends install \
6 | libglu1-mesa-dev \
7 | liblzma-dev \
8 | libbz2-dev \
9 | clang \
10 | ccache \
11 | default-jdk \
12 | default-jre \
13 | libmagick++-dev \
14 | && R CMD javareconf \
15 | && install2.r --error --deps TRUE \
16 | h2o \
17 | recipes \
18 | rsample \
19 | lime \
20 | tidyquant
21 |
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DSGO 2018 - Machine Learning With R + H2O Workshop
2 |
3 | ___Get ready to learn how to predict credit defaults with `R` + `H2O`!___
4 |
5 | ## Program
6 |
7 |
8 |
9 |
10 |
11 | - Data is Credit Loan Applications to a Bank.
12 |
13 | - Objective is to assess Risk Of Default, prevent bad loans, save bank lots of \$\$\$
14 |
15 | - Best Kagglers got 0.80 AUC with 100's of man-hours of feature engineering and combining more data sets
16 |
17 | - We'll get 0.74 AUC in 30 minutes of coding (+1.5 hours of explaining)
18 |
19 |
20 |
21 |
22 | ## Data
23 |
24 | - Kaggle Competition: [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)
25 |
26 | - Data is large (166MB unzipped, 308K rows, 122 columns)
27 |
28 | - Will work with sampled data 20% to keep manageable
29 |
30 |
31 | ## Machine Learning With H2O
32 |
33 | The goal of ___Machine Learning with H2O___ is to get you experience with:
34 |
35 | 1. The R programming language
36 |
37 | 2. `h2o` for machine learning
38 |
39 | 3. `lime` for feature explanation
40 |
41 | 4. `recipes` for preprocessing
42 |
43 | ### Becoming A Data Science Rockstar
44 |
45 |
46 |
47 |
48 |
49 | - This 3 hour workshop will teach you some of the latest tools & techniques for Machine Learning in business
50 |
51 | - With this said, you will spend 5% of your time on modeling (machine learning) & 95% of your time:
52 |
53 | - Managing projects
54 | - Collecting & working with data (manipulating, combining, cleaning)
55 | - Visualizing information - showing the size of problems and what is likely contributing
56 | - Communicating results in terms the business cares about
57 | - Recommending actions that improve the business
58 |
59 | - Further, your organization will be keenly aware of what you contribute __financially__. You need to show them __Return on Investment (ROI)__. They are making an investment in having a data science team. They expect __tangible results__.
60 |
61 | - Important Actions:
62 |
63 | - Attend my talk on the [Business Science Problem Framework](https://www.business-science.io/bspf.html) tomorrow. The BSPF is the essential system that enables driving ROI with data science.
64 |
65 |     - Take my [DS4B 201-R course](https://university.business-science.io/p/hr201-using-machine-learning-h2o-lime-to-predict-employee-turnover/?product_id=635023&coupon_code=DSGO20). This teaches you a 10-Week Program that has cut data science projects in half for consultants and has progressed data scientists more than any other course they've taken. ___You will get 20% OFF (expires after DSGO conference).___
66 |
67 |
68 | ---
69 |
70 | ## Installation Instructions
71 |
72 | ### Option 1: RStudio IDE Desktop + Install R Packages
73 |
74 | ###### Step 1: Install R and RStudio IDE
75 |
76 | - [Download and Install R](https://cloud.r-project.org/)
77 |
78 | - [Download RStudio IDE Desktop](https://www.rstudio.com/products/rstudio/download/)
79 |
80 | ###### Step 2: Open RStudio and run the following script
81 |
82 | ```
83 | pkgs <- c("h2o", "tidyverse", "rsample", "recipes", "lime")
84 | install.packages(pkgs)
85 | ```
86 |
87 | Test H2O - You may need the [Java Developer Kit](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#requirements)
88 |
89 | ```
90 | library(h2o)
91 | h2o.init()
92 | ```
93 |
94 | If H2O cannot connect, you probably need to install Java.
95 |
96 | ###### Step 3: Load the Project From GitHub
97 |
98 | _Wait for instructions from Matt._
99 |
100 | The URL for the GitHub project is:
101 |
102 | https://github.com/business-science/workshop_2018_dsgo
103 |
104 | ### Option 2: If You Have Docker Installed
105 |
106 | ###### Step 0: Docker Installation (Takes Time)
107 |
108 | _Skip this step if you already have Docker Community Edition installed_
109 |
110 | [Docker Community Edition Installation Instructions](https://store.docker.com/search?offering=community&type=edition)
111 |
112 |
113 | ###### Step 1: Run the DSGO Workshop Docker Image
114 |
115 | In a terminal / command line, run the following command to download and install the workshop container. This will take a few minutes to load.
116 |
117 | ```
118 | docker run -d -p 8787:8787 -v "`pwd`":/home/rstudio/working -e PASSWORD=rstudio -e ROOT=TRUE mdancho/workshop_2018_dsgo
119 | ```
120 |
121 | ###### Step 2: Fire Up RStudio IDE in your Browser
122 |
123 | Go into your favorite browser (I'll be using Chrome), and enter the following in the web address field.
124 |
125 | ```
126 | localhost:8787
127 | ```
128 |
129 | ###### Step 3: Log into RStudio Server
130 |
131 |
132 |
133 |
134 |
135 | Use the following credentials.
136 |
137 | - __User Name:__ rstudio
138 | - __Password:__ rstudio
139 |
140 |