├── .gitignore
├── 00_data
│   ├── .DS_Store
│   ├── HomeCredit_columns_description.csv.zip
│   └── application_train.csv.zip
├── 00_images
│   ├── DS4B_201_R_Course.png
│   ├── kaggle_credit_default.png
│   └── rstudio_server.png
├── 00_scripts
│   └── c1.R
├── 01_machine_learning_h2o
│   └── 01_machine_learning_h2o.R
├── Dockerfile
├── README.html
├── README.md
└── workshop_2018_dsgo.Rproj

/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.RData
.Ruserdata
Icon*
*.csv
*.DS_Store
--------------------------------------------------------------------------------
/00_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/.DS_Store
--------------------------------------------------------------------------------
/00_data/HomeCredit_columns_description.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/HomeCredit_columns_description.csv.zip
--------------------------------------------------------------------------------
/00_data/application_train.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/application_train.csv.zip
--------------------------------------------------------------------------------
/00_images/DS4B_201_R_Course.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/DS4B_201_R_Course.png
--------------------------------------------------------------------------------
/00_images/kaggle_credit_default.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/kaggle_credit_default.png
--------------------------------------------------------------------------------
/00_images/rstudio_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/rstudio_server.png
--------------------------------------------------------------------------------
/00_scripts/c1.R:
--------------------------------------------------------------------------------
# CHALLENGE SOLUTION ----

# Requires x, y, train_h2o, and test_h2o from 01_machine_learning_h2o.R

start <- Sys.time()
h2o_deeplearning <- h2o.deeplearning(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # Deep Learning
    epochs = 10,
    hidden = c(100, 50, 10)
)
Sys.time() - start
# Time difference of 59.41523 secs

h2o_deeplearning %>% h2o.auc(valid = TRUE)
# [1] 0.7098785
--------------------------------------------------------------------------------
/01_machine_learning_h2o/01_machine_learning_h2o.R:
--------------------------------------------------------------------------------
# MACHINE LEARNING ----

# Objectives:
#   Size the problem
#   Prepare the data for Binary Classification
#   Build models with H2O: GLM, GBM, RF
#   Inspect features with LIME

# Estimated time: 2-3 hours



# 1.0 LIBRARIES ----
library(tidyverse)   # Workhorse with dplyr, ggplot2, etc.
library(h2o)         # High Performance Machine Learning
library(recipes)     # Preprocessing
library(rsample)     # Sampling
library(lime)        # Black-box explanations


# 2.0 DATA ----

unzip("00_data/application_train.csv.zip", exdir = "00_data/")
unzip("00_data/HomeCredit_columns_description.csv.zip", exdir = "00_data/")

# Loan Applications (50% of data)
application_train_raw_tbl <- read_csv("00_data/application_train.csv")

application_train_raw_tbl

glimpse(application_train_raw_tbl)


# Column (Feature) Descriptions
feature_description_tbl <- read_csv("00_data/HomeCredit_columns_description.csv")

feature_description_tbl

# 3.0 SIZE THE PROBLEM ----

# How many defaulters?
application_train_raw_tbl %>%
    count(TARGET) %>%
    mutate(n_total = n / 0.15) %>%
    mutate(pct = n_total / sum(n_total)) %>%
    mutate(pct_text = scales::percent(pct))

# Size the problem financially $$$
size_problem_tbl <- application_train_raw_tbl %>%
    count(TARGET) %>%
    filter(TARGET == 1) %>%
    # Approximate number of annual defaults
    mutate(prop = 0.15,
           n_total = n / prop) %>%
    # Cost of default
    mutate(avg_loan     = 15000,
           avg_recovery = 0.40 * avg_loan,
           avg_loss     = avg_loan - avg_recovery) %>%
    mutate(total_loss = n_total * avg_loss) %>%
    mutate(total_loss_text = scales::dollar(total_loss))

size_problem_tbl


# 4.0 EXPLORATORY DATA ANALYSIS (SKIPPED) ----
# SKIPPED - Very Important!
# Efficient exploration of features to find which to focus on
# Critical step in the Business Science Problem Framework
# Taught in my DS4B 201-R Course
# IMPORTANT: ATTEND MY TALK TOMORROW
# (a minimal sketch of the idea follows below)
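
# Added illustration - NOT the workshop's EDA. A minimal sketch of one quick
# exploration step, assuming the CODE_GENDER column from the Kaggle data set:
# compare the default rate across the levels of a single categorical feature,
# then repeat for other candidate features to find which ones to focus on.
application_train_raw_tbl %>%
    count(CODE_GENDER, TARGET) %>%
    group_by(CODE_GENDER) %>%
    mutate(pct = n / sum(n)) %>%
    ungroup() %>%
    filter(TARGET == 1)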


# 5.0 SPLIT DATA ----

# Resource: https://tidymodels.github.io/rsample/

set.seed(1234)
split_obj_1 <- initial_split(application_train_raw_tbl, strata = "TARGET", prop = 0.2)

set.seed(1234)
split_obj_2 <- initial_split(training(split_obj_1), strata = "TARGET", prop = 0.8)

# Working with a 20% sample of the "Big Data"
train_raw_tbl <- training(split_obj_2)   # 80% of the sample
test_raw_tbl  <- testing(split_obj_2)    # 20% of the sample

# Verify proportions have been maintained
train_raw_tbl %>%
    count(TARGET) %>%
    mutate(prop = n / sum(n))

test_raw_tbl %>%
    count(TARGET) %>%
    mutate(prop = n / sum(n))



# 6.0 PREPROCESSING ----

# Fix issues with the data:
#   Numeric data with a low number of unique values should be Factor (Categorical)
#   All Character data should be Factor (Categorical)
#   NA's (imputation)

# 6.1 Handle Categorical ----

# Numeric
num2factor_names <- train_raw_tbl %>%
    select_if(is.numeric) %>%
    map_df(~ unique(.) %>% length()) %>%
    gather() %>%
    arrange(value) %>%
    filter(value <= 6) %>%
    pull(key)

num2factor_names

# Character
string2factor_names <- train_raw_tbl %>%
    select_if(is.character) %>%
    names()

string2factor_names


# 6.2 Missing Data ----

# Transform
missing_tbl <- train_raw_tbl %>%
    summarize_all(.funs = funs(sum(is.na(.)) / length(.))) %>%
    gather() %>%
    arrange(desc(value))

missing_tbl

# Visualize
missing_tbl %>%
    filter(value > 0) %>%
    mutate(key = as_factor(key) %>% fct_rev()) %>%
    ggplot(aes(x = value, y = key)) +
    geom_point() +
    geom_segment(aes(xend = 0, yend = key)) +
    expand_limits(x = c(0, 1)) +
    scale_x_continuous(labels = scales::percent) +
    labs(title = "Percentage Missing")


# 6.3 Recipes ----

# Resource: https://tidymodels.github.io/recipes/

# recipe
rec_obj <- recipe(TARGET ~ ., data = train_raw_tbl) %>%
    step_num2factor(num2factor_names) %>%
    step_string2factor(string2factor_names) %>%
    step_meanimpute(all_numeric()) %>%
    step_modeimpute(all_nominal()) %>%
    prep(stringsAsFactors = FALSE)

# bake
train_tbl <- bake(rec_obj, train_raw_tbl)
test_tbl  <- bake(rec_obj, test_raw_tbl)

train_tbl %>%
    glimpse()

# 7.0 MODELING ----

# 7.1 H2O Setup ----

# H2O Docs: http://docs.h2o.ai

h2o.init()

train_h2o <- as.h2o(train_tbl)
test_h2o  <- as.h2o(test_tbl)

y <- "TARGET"
x <- setdiff(names(train_h2o), y)

# 7.2 H2O Models ----

# 7.2.1 GLM (Elastic Net) ----

start <- Sys.time()
h2o_glm <- h2o.glm(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # GLM
    family = "binomial"
)
Sys.time() - start
# Time difference of 6.508575 secs

h2o.performance(h2o_glm, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7384649

h2o_glm@allparameters

# 7.2.2 GBM ----

# Resource: https://blog.h2o.ai/2016/06/h2o-gbm-tuning-tutorial-for-r/

start <- Sys.time()
h2o_gbm <- h2o.gbm(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # GBM
    ntrees = 100,
    max_depth = 5,
    learn_rate = 0.1
)
Sys.time() - start
# Time difference of 29.29766 secs

h2o.performance(h2o_gbm, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7369739

h2o_gbm@allparameters

# 7.2.3 Random Forest ----

start <- Sys.time()
h2o_rf <- h2o.randomForest(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # RF
    ntrees = 100,
    max_depth = 5
)
Sys.time() - start
# Time difference of 27.21049 secs

h2o.performance(h2o_rf, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7259596

h2o_rf@allparameters


# CHALLENGE: DEEP LEARNING ----

# 10 Minutes
# Create a Deep Learning model with H2O (solution in 00_scripts/c1.R):
#   h2o.deeplearning
#   10 epochs
#   3 hidden layers: 100, 50, 10



# 7.3 Saving & Loading Models ----

h2o.saveModel(h2o_gbm, "00_models")

h2o.loadModel("")   # Paste the path returned by h2o.saveModel() here

# 8.0 MAKING PREDICTIONS ----

prediction_h2o <- h2o.predict(h2o_gbm, newdata = test_h2o)

prediction_tbl <- prediction_h2o %>%
    as_tibble() %>%
    bind_cols(
        test_tbl %>% select(TARGET, SK_ID_CURR)
    )

prediction_tbl

prediction_tbl %>%
    filter(TARGET == "1")


# 9.0 PERFORMANCE (SKIPPING) ----

# Very Important:
#   Adjusting the Threshold
#   ROC Plot, Precision vs Recall
#   Gain & Lift - Important for executives
# (a sketch of these steps follows below)

h2o_gbm %>%
    h2o.performance(valid = TRUE)
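
# Added illustration - a sketch of the skipped performance steps using
# standard h2o 3.x accessors (exact output columns may vary by h2o version).
perf_gbm <- h2o.performance(h2o_gbm, valid = TRUE)

h2o.confusionMatrix(perf_gbm)          # Confusion matrix at the max-F1 threshold
h2o.metric(perf_gbm) %>%               # Precision, recall, etc. at every threshold
    as.data.frame() %>%
    as_tibble()
h2o.gainsLift(h2o_gbm, valid = TRUE)   # Gain & Lift table
plot(perf_gbm, type = "roc")           # ROC plot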


# 10.0 EXPLANATIONS WITH LIME ----

# Create explainer
explainer <- train_tbl %>%
    select(-TARGET) %>%
    lime(
        model = h2o_gbm,
        bin_continuous = TRUE,
        n_bins = 4,
        quantile_bins = TRUE
    )

# Create explanation
explanation <- test_tbl %>%
    filter(TARGET == "1") %>%
    slice(1) %>%
    select(-TARGET) %>%
    lime::explain(
        explainer = explainer,
        n_features = 8,
        n_permutations = 10000,
        dist_fun = "gower",
        kernel_width = 1.5,
        feature_select = "lasso_path",
        # n_labels = 2,
        labels = "p1"
    )

explanation %>%
    as_tibble() %>%
    glimpse()

# Visualize
plot_features(explanation)


# What are the EXT_SOURCE features?

feature_description_tbl %>%
    filter(str_detect(Row, "EXT_SOURCE")) %>%
    View()

# Equifax, Experian, TransUnion

# 11.0 OPTIMIZATION (SKIPPING) ----

# Expected Value
#   Threshold Optimization - Find the balance of False Positives & False Negatives that maximizes revenue
#   Sensitivity Analysis - Take into account the assumptions we are inputting into the model
# (a hypothetical expected-value sketch follows below)
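
# Added illustration - a hypothetical expected-value sketch with ASSUMED
# costs, not from the workshop. h2o.metric() returns per-threshold confusion
# counts (tps, fps, fns, tns); pick the threshold minimizing expected cost.
expected_cost_tbl <- h2o.performance(h2o_gbm, valid = TRUE) %>%
    h2o.metric() %>%
    as.data.frame() %>%
    as_tibble() %>%
    mutate(
        cost_fn    = fns * 9000,   # Assumed loss per missed default (avg_loss, Section 3.0)
        cost_fp    = fps * 1000,   # Assumed opportunity cost per rejected good loan
        total_cost = cost_fn + cost_fp
    ) %>%
    arrange(total_cost)

expected_cost_tbl %>% slice(1)     # Threshold with the lowest expected cost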


# 12.0 RECOMMENDATION ALGORITHMS (SKIPPING) ----

# 3-Step Process (a simplified sketch of step 1 follows below):
#   1. Discretized Correlation Visualization (Correlation Funnel)
#   2. Fill out our Recommendation Algorithm Worksheet
#   3. Implement the strategies in R code
# Correlation Funnel - S&P Loved This!!
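
# Added illustration - a simplified stand-in for step 1: plain numeric
# correlation with TARGET rather than the discretized Correlation Funnel.
# TARGET itself appears first with correlation 1; large negative
# correlations are just as interesting as positive ones.
train_tbl %>%
    select_if(is.numeric) %>%
    mutate(TARGET = as.numeric(as.character(train_tbl$TARGET))) %>%
    cor(use = "pairwise.complete.obs") %>%
    .[, "TARGET"] %>%
    sort(decreasing = TRUE) %>%
    head(10)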


# BONUS #1: GRID SEARCH ----

# GBM hyperparameters
gbm_params <- list(learn_rate = c(0.01, 0.1),
                   max_depth  = c(3, 5, 9))
gbm_params

# Train and validate a cartesian grid of GBMs
gbm_grid <- h2o.grid("gbm",
                     x = x,
                     y = y,
                     grid_id = "gbm_grid1",
                     training_frame = train_h2o,
                     validation_frame = test_h2o,
                     ntrees = 100,
                     seed = 1234,
                     hyper_params = gbm_params)

h2o.getGrid(grid_id = "gbm_grid1",   # Must match the grid_id used above
            sort_by = "auc",
            decreasing = TRUE)

h2o.getModel("gbm_grid1_model_1") %>%
    h2o.auc(valid = TRUE)
# [1] 0.7459666


# BONUS #2: AUTOML ----

start <- Sys.time()
h2o_automl <- h2o.automl(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # AutoML
    max_runtime_secs = 300
)
Sys.time() - start
# Time difference of 5.243099 mins


h2o_automl@leaderboard

h2o_automl@leader %>%
    h2o.auc(valid = TRUE)
# [1] 0.7423596
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
####### Dockerfile #######
FROM rocker/tidyverse:3.4.3

RUN apt-get update -qq \
  && apt-get -y --no-install-recommends install \
    libglu1-mesa-dev \
    liblzma-dev \
    libbz2-dev \
    clang \
    ccache \
    default-jdk \
    default-jre \
    libmagick++-dev \
  && R CMD javareconf \
  && install2.r --error --deps TRUE \
    h2o \
    recipes \
    rsample \
    lime \
    tidyquant
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DSGO 2018 - Machine Learning With R + H2O Workshop

___Get ready to learn how to predict credit defaults with `R` + `H2O`!___

## Program

- Data is credit loan applications to a bank.

- Objective is to assess the Risk of Default, prevent bad loans, and save the bank lots of \$\$\$.

- The best Kagglers got 0.80 AUC with 100's of man-hours of feature engineering and combining additional data sets.

- We'll get 0.74 AUC in 30 minutes of coding (+1.5 hours of explanation).

## Data

- Kaggle Competition: [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)

- Data is large (166 MB unzipped, 308K rows, 122 columns)

- We'll work with a 20% sample to keep it manageable

## Machine Learning With H2O

The goal of ___Machine Learning with H2O___ is to get you experience with:

1. The R programming language

2. `h2o` for machine learning

3. `lime` for feature explanation

4. `recipes` for preprocessing

### Becoming A Data Science Rockstar

- This 3-hour workshop will teach you some of the latest tools & techniques for machine learning in business

- With that said, you will spend 5% of your time on modeling (machine learning) & 95% of your time:

    - Managing projects
    - Collecting & working with data (manipulating, combining, cleaning)
    - Visualizing information - showing the size of problems and what is likely contributing
    - Communicating results in terms the business cares about
    - Recommending actions that improve the business

- Further, your organization will be keenly aware of what you contribute __financially__. You need to show them __Return on Investment (ROI)__. They are making an investment in having a data science team. They expect __tangible results__.

- Important Actions:

    - Attend my talk on the [Business Science Problem Framework](https://www.business-science.io/bspf.html) tomorrow. The BSPF is the essential system that enables driving ROI with data science.

    - Take my [DS4B 201-R course](https://university.business-science.io/p/hr201-using-machine-learning-h2o-lime-to-predict-employee-turnover/?product_id=635023&coupon_code=DSGO20). This 10-week program has cut data science project times in half for consultants and has progressed data scientists more than any other course they've taken. ___You will get 20% OFF (expires after the DSGO conference).___


---

## Installation Instructions

### Option 1: RStudio IDE Desktop + Install R Packages

###### Step 1: Install R and RStudio IDE

- [Download and Install R](https://cloud.r-project.org/)

- [Download RStudio IDE Desktop](https://www.rstudio.com/products/rstudio/download/)

###### Step 2: Open RStudio and run the following script

```
pkgs <- c("h2o", "tidyverse", "rsample", "recipes", "lime")
install.packages(pkgs)
```

Test H2O - you may need the [Java Developer Kit](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#requirements)

```
library(h2o)
h2o.init()
```

If H2O cannot connect, you probably need to install Java.

###### Step 3: Load the Project From GitHub

_Wait for instructions from Matt._

The URL for the GitHub project is:

https://github.com/business-science/workshop_2018_dsgo

### Option 2: If You Have Docker Installed

###### Step 0: Docker Installation (Takes Time)

_Skip this step if you already have Docker Community Edition installed._

[Docker Community Edition Installation Instructions](https://store.docker.com/search?offering=community&type=edition)

###### Step 1: Run the DSGO Workshop Docker Image

In a terminal / command line, run the following command to download and install the workshop container. This will take a few minutes to load.

```
docker run -d -p 8787:8787 -v "`pwd`":/home/rstudio/working -e PASSWORD=rstudio -e ROOT=TRUE mdancho/workshop_2018_dsgo
```

###### Step 2: Fire Up the RStudio IDE in Your Browser

Go to your favorite browser (I'll be using Chrome) and enter the following in the address bar.

```
localhost:8787
```

###### Step 3: Log into RStudio Server

Use the following credentials.

- __User Name:__ rstudio
- __Password:__ rstudio

###### Step 4: Load the Project From GitHub

_Wait for instructions from Matt._

The URL for the GitHub project is:

https://github.com/business-science/workshop_2018_dsgo


---

## Further Resources

- `tidyverse`: A meta-package for data wrangling and visualization. Loads `dplyr`, `ggplot2`, and a number of essential packages for working with data. Documentation: https://www.tidyverse.org/

- `recipes`: A preprocessing package that includes many standard preprocessing steps. Documentation: https://tidymodels.github.io/recipes/

- `h2o`: A high-performance machine learning library that is scalable and optimized for performance. Documentation: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html

    - GLM: Elastic Net (Generalized Linear Model with L1 + L2 Regularization)

    - GBM: Gradient Boosted Machines (Tree-Based + Boosting)

    - Random Forest: Tree-Based + Bagging

    - Deep Learning: Neural Network

    - Automated Machine Learning: Stacked Ensembles (All Models and Best of Family)

- `lime`: A package for explaining black-box models. LIME Tutorial: https://www.business-science.io/business/2018/06/25/lime-local-feature-interpretation.html
--------------------------------------------------------------------------------
/workshop_2018_dsgo.Rproj:
--------------------------------------------------------------------------------
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
--------------------------------------------------------------------------------