├── .gitignore
├── 00_data
│   ├── .DS_Store
│   ├── HomeCredit_columns_description.csv.zip
│   └── application_train.csv.zip
├── 00_images
│   ├── DS4B_201_R_Course.png
│   ├── kaggle_credit_default.png
│   └── rstudio_server.png
├── 00_scripts
│   └── c1.R
├── 01_machine_learning_h2o
│   └── 01_machine_learning_h2o.R
├── Dockerfile
├── README.html
├── README.md
└── workshop_2018_dsgo.Rproj

/.gitignore:
--------------------------------------------------------------------------------
.Rproj.user
.Rhistory
.RData
.Ruserdata
Icon*
*.csv
*.DS_Store
--------------------------------------------------------------------------------
/00_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/.DS_Store
--------------------------------------------------------------------------------
/00_data/HomeCredit_columns_description.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/HomeCredit_columns_description.csv.zip
--------------------------------------------------------------------------------
/00_data/application_train.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_data/application_train.csv.zip
--------------------------------------------------------------------------------
/00_images/DS4B_201_R_Course.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/DS4B_201_R_Course.png
--------------------------------------------------------------------------------
/00_images/kaggle_credit_default.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/kaggle_credit_default.png
--------------------------------------------------------------------------------
/00_images/rstudio_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/workshop_2018_dsgo/2b7b0e95828b5a95e0e783b7ea764e5962fb3fb4/00_images/rstudio_server.png
--------------------------------------------------------------------------------
/00_scripts/c1.R:
--------------------------------------------------------------------------------
# CHALLENGE SOLUTION ----

# Requires x, y, train_h2o, and test_h2o from 01_machine_learning_h2o.R

start <- Sys.time()
h2o_deeplearning <- h2o.deeplearning(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # Deep Learning
    epochs = 10,
    hidden = c(100, 50, 10)
)
Sys.time() - start
# Time difference of 59.41523 secs

h2o_deeplearning %>% h2o.auc(valid = TRUE)
# [1] 0.7098785
--------------------------------------------------------------------------------
/01_machine_learning_h2o/01_machine_learning_h2o.R:
--------------------------------------------------------------------------------
# MACHINE LEARNING ----

# Objectives:
#   Size the problem
#   Prepare the data for Binary Classification
#   Build models with H2O: GLM, GBM, RF
#   Inspect features with LIME

# Estimated time: 2-3 hours



# 1.0 LIBRARIES ----
library(tidyverse)   # Workhorse with dplyr, ggplot2, etc.
library(h2o)         # High Performance Machine Learning
library(recipes)     # Preprocessing
library(rsample)     # Sampling
library(lime)        # Black-box explanations


# 2.0 DATA ----

unzip("00_data/application_train.csv.zip", exdir = "00_data/")
unzip("00_data/HomeCredit_columns_description.csv.zip", exdir = "00_data/")

# Loan Applications (50% of data)
application_train_raw_tbl <- read_csv("00_data/application_train.csv")

application_train_raw_tbl

glimpse(application_train_raw_tbl)


# Column (Feature) Descriptions
feature_description_tbl <- read_csv("00_data/HomeCredit_columns_description.csv")

feature_description_tbl

# 3.0 SIZE THE PROBLEM ----

# How many defaulters?
application_train_raw_tbl %>%
    count(TARGET) %>%
    mutate(n_total = n / 0.15) %>%
    mutate(pct = n_total / sum(n_total)) %>%
    mutate(pct_text = scales::percent(pct))

# Size the problem financially $$$
size_problem_tbl <- application_train_raw_tbl %>%
    count(TARGET) %>%
    filter(TARGET == 1) %>%
    # Approximate number of annual defaults
    mutate(prop = 0.15,
           n_total = n / prop) %>%
    # Cost of default
    mutate(avg_loan     = 15000,
           avg_recovery = 0.40 * avg_loan,
           avg_loss     = avg_loan - avg_recovery) %>%
    mutate(total_loss = n_total * avg_loss) %>%
    mutate(total_loss_text = scales::dollar(total_loss))

size_problem_tbl


# 4.0 EXPLORATORY DATA ANALYSIS (SKIPPED) ----
# SKIPPED - Very Important!
# Efficient exploration of features to find which to focus on
# Critical step in the Business Science Problem Framework
# Taught in my DS4B 201-R Course
# IMPORTANT: ATTEND MY TALK TOMORROW
# (a minimal sketch of the idea follows below)
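
# Added illustration - NOT the workshop's EDA. A minimal sketch of one quick
# exploration step, assuming the CODE_GENDER column from the Kaggle data set:
# compare the default rate across the levels of a single categorical feature,
# then repeat for other candidate features to find which ones to focus on.
application_train_raw_tbl %>%
    count(CODE_GENDER, TARGET) %>%
    group_by(CODE_GENDER) %>%
    mutate(pct = n / sum(n)) %>%
    ungroup() %>%
    filter(TARGET == 1)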


# 5.0 SPLIT DATA ----

# Resource: https://tidymodels.github.io/rsample/

set.seed(1234)
split_obj_1 <- initial_split(application_train_raw_tbl, strata = "TARGET", prop = 0.2)

set.seed(1234)
split_obj_2 <- initial_split(training(split_obj_1), strata = "TARGET", prop = 0.8)

# Working with a 20% sample of the "Big Data"
train_raw_tbl <- training(split_obj_2)   # 80% of the sample
test_raw_tbl  <- testing(split_obj_2)    # 20% of the sample

# Verify proportions have been maintained
train_raw_tbl %>%
    count(TARGET) %>%
    mutate(prop = n / sum(n))

test_raw_tbl %>%
    count(TARGET) %>%
    mutate(prop = n / sum(n))



# 6.0 PREPROCESSING ----

# Fix issues with the data:
#   Numeric data with a low number of unique values should be Factor (Categorical)
#   All Character data should be Factor (Categorical)
#   NA's (imputation)

# 6.1 Handle Categorical ----

# Numeric
num2factor_names <- train_raw_tbl %>%
    select_if(is.numeric) %>%
    map_df(~ unique(.) %>% length()) %>%
    gather() %>%
    arrange(value) %>%
    filter(value <= 6) %>%
    pull(key)

num2factor_names

# Character
string2factor_names <- train_raw_tbl %>%
    select_if(is.character) %>%
    names()

string2factor_names


# 6.2 Missing Data ----

# Transform
missing_tbl <- train_raw_tbl %>%
    summarize_all(.funs = funs(sum(is.na(.)) / length(.))) %>%
    gather() %>%
    arrange(desc(value))

missing_tbl

# Visualize
missing_tbl %>%
    filter(value > 0) %>%
    mutate(key = as_factor(key) %>% fct_rev()) %>%
    ggplot(aes(x = value, y = key)) +
    geom_point() +
    geom_segment(aes(xend = 0, yend = key)) +
    expand_limits(x = c(0, 1)) +
    scale_x_continuous(labels = scales::percent) +
    labs(title = "Percentage Missing")


# 6.3 Recipes ----

# Resource: https://tidymodels.github.io/recipes/

# recipe
rec_obj <- recipe(TARGET ~ ., data = train_raw_tbl) %>%
    step_num2factor(num2factor_names) %>%
    step_string2factor(string2factor_names) %>%
    step_meanimpute(all_numeric()) %>%
    step_modeimpute(all_nominal()) %>%
    prep(stringsAsFactors = FALSE)

# bake
train_tbl <- bake(rec_obj, train_raw_tbl)
test_tbl  <- bake(rec_obj, test_raw_tbl)

train_tbl %>%
    glimpse()

# 7.0 MODELING ----

# 7.1 H2O Setup ----

# H2O Docs: http://docs.h2o.ai

h2o.init()

train_h2o <- as.h2o(train_tbl)
test_h2o  <- as.h2o(test_tbl)

y <- "TARGET"
x <- setdiff(names(train_h2o), y)

# 7.2 H2O Models ----

# 7.2.1 GLM (Elastic Net) ----

start <- Sys.time()
h2o_glm <- h2o.glm(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # GLM
    family = "binomial"
)
Sys.time() - start
# Time difference of 6.508575 secs

h2o.performance(h2o_glm, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7384649

h2o_glm@allparameters

# 7.2.2 GBM ----

# Resource: https://blog.h2o.ai/2016/06/h2o-gbm-tuning-tutorial-for-r/

start <- Sys.time()
h2o_gbm <- h2o.gbm(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # GBM
    ntrees = 100,
    max_depth = 5,
    learn_rate = 0.1
)
Sys.time() - start
# Time difference of 29.29766 secs

h2o.performance(h2o_gbm, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7369739

h2o_gbm@allparameters

# 7.2.3 Random Forest ----

start <- Sys.time()
h2o_rf <- h2o.randomForest(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # RF
    ntrees = 100,
    max_depth = 5
)
Sys.time() - start
# Time difference of 27.21049 secs

h2o.performance(h2o_rf, valid = TRUE) %>%
    h2o.auc()
# [1] 0.7259596

h2o_rf@allparameters


# CHALLENGE: DEEP LEARNING ----

# 10 Minutes
# Create a Deep Learning model with H2O (solution in 00_scripts/c1.R):
#   h2o.deeplearning
#   10 epochs
#   3 hidden layers: 100, 50, 10



# 7.3 Saving & Loading Models ----

h2o.saveModel(h2o_gbm, "00_models")

h2o.loadModel("")   # Paste the path returned by h2o.saveModel() here

# 8.0 MAKING PREDICTIONS ----

prediction_h2o <- h2o.predict(h2o_gbm, newdata = test_h2o)

prediction_tbl <- prediction_h2o %>%
    as_tibble() %>%
    bind_cols(
        test_tbl %>% select(TARGET, SK_ID_CURR)
    )

prediction_tbl

prediction_tbl %>%
    filter(TARGET == "1")


# 9.0 PERFORMANCE (SKIPPING) ----

# Very Important:
#   Adjusting the Threshold
#   ROC Plot, Precision vs Recall
#   Gain & Lift - Important for executives
# (a sketch of these steps follows below)

h2o_gbm %>%
    h2o.performance(valid = TRUE)
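
# Added illustration - a sketch of the skipped performance steps using
# standard h2o 3.x accessors (exact output columns may vary by h2o version).
perf_gbm <- h2o.performance(h2o_gbm, valid = TRUE)

h2o.confusionMatrix(perf_gbm)          # Confusion matrix at the max-F1 threshold
h2o.metric(perf_gbm) %>%               # Precision, recall, etc. at every threshold
    as.data.frame() %>%
    as_tibble()
h2o.gainsLift(h2o_gbm, valid = TRUE)   # Gain & Lift table
plot(perf_gbm, type = "roc")           # ROC plot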


# 10.0 EXPLANATIONS WITH LIME ----

# Create explainer
explainer <- train_tbl %>%
    select(-TARGET) %>%
    lime(
        model = h2o_gbm,
        bin_continuous = TRUE,
        n_bins = 4,
        quantile_bins = TRUE
    )

# Create explanation
explanation <- test_tbl %>%
    filter(TARGET == "1") %>%
    slice(1) %>%
    select(-TARGET) %>%
    lime::explain(
        explainer = explainer,
        n_features = 8,
        n_permutations = 10000,
        dist_fun = "gower",
        kernel_width = 1.5,
        feature_select = "lasso_path",
        # n_labels = 2,
        labels = "p1"
    )

explanation %>%
    as_tibble() %>%
    glimpse()

# Visualize
plot_features(explanation)


# What are the EXT_SOURCE features?

feature_description_tbl %>%
    filter(str_detect(Row, "EXT_SOURCE")) %>%
    View()

# Equifax, Experian, TransUnion

# 11.0 OPTIMIZATION (SKIPPING) ----

# Expected Value
#   Threshold Optimization - Find the balance of False Positives & False Negatives that maximizes revenue
#   Sensitivity Analysis - Take into account the assumptions we are inputting into the model
# (a hypothetical expected-value sketch follows below)
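
# Added illustration - a hypothetical expected-value sketch with ASSUMED
# costs, not from the workshop. h2o.metric() returns per-threshold confusion
# counts (tps, fps, fns, tns); pick the threshold minimizing expected cost.
expected_cost_tbl <- h2o.performance(h2o_gbm, valid = TRUE) %>%
    h2o.metric() %>%
    as.data.frame() %>%
    as_tibble() %>%
    mutate(
        cost_fn    = fns * 9000,   # Assumed loss per missed default (avg_loss, Section 3.0)
        cost_fp    = fps * 1000,   # Assumed opportunity cost per rejected good loan
        total_cost = cost_fn + cost_fp
    ) %>%
    arrange(total_cost)

expected_cost_tbl %>% slice(1)     # Threshold with the lowest expected cost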


# 12.0 RECOMMENDATION ALGORITHMS (SKIPPING) ----

# 3-Step Process (a simplified sketch of step 1 follows below):
#   1. Discretized Correlation Visualization (Correlation Funnel)
#   2. Fill out our Recommendation Algorithm Worksheet
#   3. Implement the strategies in R code
# Correlation Funnel - S&P Loved This!!
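
# Added illustration - a simplified stand-in for step 1: plain numeric
# correlation with TARGET rather than the discretized Correlation Funnel.
# TARGET itself appears first with correlation 1; large negative
# correlations are just as interesting as positive ones.
train_tbl %>%
    select_if(is.numeric) %>%
    mutate(TARGET = as.numeric(as.character(train_tbl$TARGET))) %>%
    cor(use = "pairwise.complete.obs") %>%
    .[, "TARGET"] %>%
    sort(decreasing = TRUE) %>%
    head(10)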


# BONUS #1: GRID SEARCH ----

# GBM hyperparameters
gbm_params <- list(learn_rate = c(0.01, 0.1),
                   max_depth  = c(3, 5, 9))
gbm_params

# Train and validate a cartesian grid of GBMs
gbm_grid <- h2o.grid("gbm",
                     x = x,
                     y = y,
                     grid_id = "gbm_grid1",
                     training_frame = train_h2o,
                     validation_frame = test_h2o,
                     ntrees = 100,
                     seed = 1234,
                     hyper_params = gbm_params)

h2o.getGrid(grid_id = "gbm_grid1",   # Must match the grid_id used above
            sort_by = "auc",
            decreasing = TRUE)

h2o.getModel("gbm_grid1_model_1") %>%
    h2o.auc(valid = TRUE)
# [1] 0.7459666


# BONUS #2: AUTOML ----

start <- Sys.time()
h2o_automl <- h2o.automl(
    x = x,
    y = y,
    training_frame = train_h2o,
    validation_frame = test_h2o,
    nfolds = 5,
    seed = 1234,

    # AutoML
    max_runtime_secs = 300
)
Sys.time() - start
# Time difference of 5.243099 mins


h2o_automl@leaderboard

h2o_automl@leader %>%
    h2o.auc(valid = TRUE)
# [1] 0.7423596
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
####### Dockerfile #######
FROM rocker/tidyverse:3.4.3

RUN apt-get update -qq \
  && apt-get -y --no-install-recommends install \
    libglu1-mesa-dev \
    liblzma-dev \
    libbz2-dev \
    clang \
    ccache \
    default-jdk \
    default-jre \
    libmagick++-dev \
  && R CMD javareconf \
  && install2.r --error --deps TRUE \
    h2o \
    recipes \
    rsample \
    lime \
    tidyquant
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# DSGO 2018 - Machine Learning With R + H2O Workshop

___Get ready to learn how to predict credit defaults with `R` + `H2O`!___

## Program

- Data is credit loan applications to a bank.

- Objective is to assess the Risk of Default, prevent bad loans, and save the bank lots of \$\$\$.

- The best Kagglers got 0.80 AUC with 100's of man-hours of feature engineering and combining additional data sets.

- We'll get 0.74 AUC in 30 minutes of coding (+1.5 hours of explanation).

## Data

- Kaggle Competition: [Home Credit Default Risk](https://www.kaggle.com/c/home-credit-default-risk)

- Data is large (166 MB unzipped, 308K rows, 122 columns)

- We'll work with a 20% sample to keep it manageable

## Machine Learning With H2O

The goal of ___Machine Learning with H2O___ is to get you experience with:

1. The R programming language

2. `h2o` for machine learning

3. `lime` for feature explanation

4. `recipes` for preprocessing

### Becoming A Data Science Rockstar

- This 3-hour workshop will teach you some of the latest tools & techniques for machine learning in business

- With that said, you will spend 5% of your time on modeling (machine learning) & 95% of your time:

    - Managing projects
    - Collecting & working with data (manipulating, combining, cleaning)
    - Visualizing information - showing the size of problems and what is likely contributing
    - Communicating results in terms the business cares about
    - Recommending actions that improve the business

- Further, your organization will be keenly aware of what you contribute __financially__. You need to show them __Return on Investment (ROI)__. They are making an investment in having a data science team. They expect __tangible results__.

- Important Actions:

    - Attend my talk on the [Business Science Problem Framework](https://www.business-science.io/bspf.html) tomorrow. The BSPF is the essential system that enables driving ROI with data science.

    - Take my [DS4B 201-R course](https://university.business-science.io/p/hr201-using-machine-learning-h2o-lime-to-predict-employee-turnover/?product_id=635023&coupon_code=DSGO20). This 10-week program has cut data science project times in half for consultants and has progressed data scientists more than any other course they've taken. ___You will get 20% OFF (expires after the DSGO conference).___


---

## Installation Instructions

### Option 1: RStudio IDE Desktop + Install R Packages

###### Step 1: Install R and RStudio IDE

- [Download and Install R](https://cloud.r-project.org/)

- [Download RStudio IDE Desktop](https://www.rstudio.com/products/rstudio/download/)

###### Step 2: Open RStudio and run the following script

```
pkgs <- c("h2o", "tidyverse", "rsample", "recipes", "lime")
install.packages(pkgs)
```

Test H2O - you may need the [Java Developer Kit](http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#requirements)

```
library(h2o)
h2o.init()
```

If H2O cannot connect, you probably need to install Java.

###### Step 3: Load the Project From GitHub

_Wait for instructions from Matt._

The URL for the GitHub project is:

https://github.com/business-science/workshop_2018_dsgo

### Option 2: If You Have Docker Installed

###### Step 0: Docker Installation (Takes Time)

_Skip this step if you already have Docker Community Edition installed._

[Docker Community Edition Installation Instructions](https://store.docker.com/search?offering=community&type=edition)

###### Step 1: Run the DSGO Workshop Docker Image

In a terminal / command line, run the following command to download and install the workshop container. This will take a few minutes to load.

```
docker run -d -p 8787:8787 -v "`pwd`":/home/rstudio/working -e PASSWORD=rstudio -e ROOT=TRUE mdancho/workshop_2018_dsgo
```

###### Step 2: Fire Up the RStudio IDE in Your Browser

Go to your favorite browser (I'll be using Chrome) and enter the following in the address bar.

```
localhost:8787
```

###### Step 3: Log into RStudio Server

Use the following credentials.

- __User Name:__ rstudio
- __Password:__ rstudio

###### Step 4: Load the Project From GitHub

_Wait for instructions from Matt._

The URL for the GitHub project is:

https://github.com/business-science/workshop_2018_dsgo


---

## Further Resources

- `tidyverse`: A meta-package for data wrangling and visualization. Loads `dplyr`, `ggplot2`, and a number of essential packages for working with data. Documentation: https://www.tidyverse.org/

- `recipes`: A preprocessing package that includes many standard preprocessing steps. Documentation: https://tidymodels.github.io/recipes/

- `h2o`: A high-performance machine learning library that is scalable and optimized for performance. Documentation: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/index.html

    - GLM: Elastic Net (Generalized Linear Model with L1 + L2 Regularization)

    - GBM: Gradient Boosted Machines (Tree-Based + Boosting)

    - Random Forest: Tree-Based + Bagging

    - Deep Learning: Neural Network

    - Automated Machine Learning: Stacked Ensembles (All Models and Best of Family)

- `lime`: A package for explaining black-box models. LIME Tutorial: https://www.business-science.io/business/2018/06/25/lime-local-feature-interpretation.html
--------------------------------------------------------------------------------
/workshop_2018_dsgo.Rproj:
--------------------------------------------------------------------------------
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
--------------------------------------------------------------------------------