├── Kuhn.pdf ├── emr.RData ├── README.md ├── okc_data.R ├── Kuhn.R └── LICENSE /Kuhn.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/useR2016/HEAD/Kuhn.pdf -------------------------------------------------------------------------------- /emr.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/topepo/useR2016/HEAD/emr.RData -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Slides and code for the 2016 useR! tutorial "Never Tell Me the Odds! Machine Learning with Class Imbalances" 3 | -------------------------------------------------------------------------------- /okc_data.R: -------------------------------------------------------------------------------- 1 | ################################################################### 2 | ## Code for the useR 2016 tutorial "Never Tell Me the Odds! 
## Machine Learning with Class Imbalances" by Max Kuhn
##
## Slides and this code can be found at
##    https://github.com/topepo/useR2016
##
## packages used here are: lubridate
##
## Data are at: https://github.com/rudeboybert/JSE_OkCupid

library(lubridate)

###################################################################
## Some levels of predictors have spaces or symbols; fix these
## and replace "" with "missing"
##
## Arguments:
##   x: a character vector (or something `gsub` can coerce to one)
## Value:
##   a factor whose first level is "missing" (see `resort_lvl`)

fix_levels <- function(x) {
  ## drop typographic apostrophes outright instead of turning them
  ## into underscores
  x <- gsub("’", "", x)
  x <- gsub("[[:space:]]", "_", x)
  x <- gsub("[[:punct:]]", "_", x)
  ## collapse any run of underscores, whatever its length, into one
  x <- gsub("_+", "_", x)
  x[x == ""] <- "missing"
  resort_lvl(x)
}

###################################################################
## resort categorical predictors so that "missing" is the first level
##
## Arguments:
##   x: a vector that is converted with `as.character`
## Value:
##   a factor with levels "missing" followed by the sorted unique
##   values of `x`

resort_lvl <- function(x) {
  x <- as.character(x)
  lv <- c("missing", sort(unique(x[x != "missing"])))
  factor(x, levels = lv)
}

###################################################################
## Unpackage the data and read in.
## A compressed version of the csv
## file is at https://github.com/rudeboybert/JSE_OkCupid

raw <- read.csv("profiles.csv", stringsAsFactors = FALSE)

## the free-text essay columns are not used as predictors
is_essay <- grepl("^essay", names(raw))
raw <- raw[, !is_essay]

###################################################################
## Compute the number of days since last online

## only the first 10 characters of the time stamp are parsed; the
## reference point is the most recent date found in the data
last_seen <- ymd(substring(raw$last_online, 1, 10))
raw$last_online <- as.numeric(difftime(max(last_seen), last_seen,
                                       units = "days"))

###################################################################
## encode the "easy" categorical predictors

easy_cols <- c("body_type", "drinks", "drugs", "education", "diet",
               "job", "offspring", "pets", "orientation", "sex",
               "smokes", "status")
raw[easy_cols] <- lapply(raw[easy_cols], fix_levels)

###################################################################
## Income is basically encoded categorical so we will make it a factor

## -1 is the "not reported" code; turn it into NA first so that it
## does not get an "inc" level of its own
income_val <- replace(raw$income, raw$income %in% -1, NA)
income_lvl <- c("missing", paste0("inc", sort(unique(income_val))))
income_fac <- factor(paste0("inc", income_val), levels = income_lvl)
## "incNA" is not among the levels, so those entries are NA here
income_fac[is.na(income_fac)] <- "missing"
raw$income <- income_fac

###################################################################
## Split their location into city and state.
## There are some R functions
## (ahem, randomForest) that can only handle predictors with <=52
## levels so we take the long tail of the distribution and truncate
## some cities to "other"

## number of most frequent towns that keep their own level
max_towns <- 50

tmp_where <- strsplit(raw$location, split = ", ")
## only locations of the form "town, state" are used; anything else
## is recorded as "missing"
where_state <- vapply(tmp_where,
                      function(x) if (length(x) == 2) x[2] else "missing",
                      character(1))
where_town <- vapply(tmp_where,
                     function(x) if (length(x) == 2) x[1] else "missing",
                     character(1))

## negate the counts so that an ascending sort puts the most
## frequent towns first
town_tab <- sort(-table(where_town))
## head() copes with fewer than `max_towns` distinct towns
keep_towns <- head(names(town_tab), max_towns)
where_town[!(where_town %in% keep_towns)] <- "other"

raw$where_state <- factor(gsub(" ", "_", where_state))
raw$where_town <- factor(gsub(" ", "_", where_town))
raw$location <- NULL

###################################################################
## Some predictors have values and modifiers that describe how
## serious they are about their choice. We will create predictors
## for both characteristics of their answer

## First word of each split answer; empty or absent answers become
## "missing". `parts` is a list of character vectors from strsplit().
first_word <- function(parts) {
  out <- vapply(parts, function(x) x[1], character(1))
  out[is.na(out) | out == ""] <- "missing"
  out
}

## Everything after the first word, joined by "_"; "missing" when
## there is no second word.
rest_words <- function(parts) {
  vapply(parts,
         function(x) {
           if (length(x) > 1) paste(x[-1], collapse = "_") else "missing"
         },
         character(1))
}

## 0/1 indicator of whether `value` occurs in each element of `parts`
has_value <- function(parts, value) {
  as.numeric(vapply(parts, function(x) any(x == value), logical(1)))
}

## for religion, split religion and modifier
tmp_relig_split <- strsplit(raw$religion, split = " ")
raw$religion <- resort_lvl(first_word(tmp_relig_split))
raw$religion_modifer <- resort_lvl(rest_words(tmp_relig_split))

###################################################################
## Same for sign

raw$sign <- gsub("’", "", raw$sign)
tmp_sign_split <- strsplit(raw$sign, split = " ")
raw$sign <- resort_lvl(first_word(tmp_sign_split))
raw$sign_modifer <- resort_lvl(rest_words(tmp_sign_split))

###################################################################
## They are allowed to list multiple languages so we will pre-split
## these into dummy variables since they might have multiple choices.
## Also, "c++" and "lisp" !

tmp_speaks <- gsub("(", "", raw$speaks, fixed = TRUE)
tmp_speaks <- gsub(")", "", tmp_speaks, fixed = TRUE)
tmp_speaks <- gsub("c++", "cpp", tmp_speaks, fixed = TRUE)
tmp_speaks_split <- strsplit(tmp_speaks, split = ",")
## strip the single leading blank left by ", " and make the rest of
## each entry a syntactically friendly column name
tmp_speaks_split <- lapply(tmp_speaks_split,
                           function(x) gsub(" ", "_", gsub("^ ", "", x)))
speaks_values <- sort(unique(unlist(tmp_speaks_split)))
for (i in speaks_values) {
  raw[[i]] <- has_value(tmp_speaks_split, i)
}
raw$speaks <- NULL

###################################################################
## Similarly, ethnicity is pre-split into dummy variables

tmp_eth <- gsub(", ", ",", raw$ethnicity)
tmp_eth <- gsub("/ ", "", tmp_eth)
tmp_eth <- gsub(" ", "_", tmp_eth)
tmp_eth_split <- strsplit(tmp_eth, split = ",")
eth_lvl <- sort(unique(unlist(tmp_eth_split)))
## The indicators must be computed from the ethnicity lists; the
## earlier code tested the language lists here by mistake.
## NOTE(review): a value that is both a language and an ethnicity
## would reuse (and overwrite) the same column name - confirm that
## no such value occurs in the data.
for (i in eth_lvl) {
  raw[[i]] <- has_value(tmp_eth_split, i)
}
raw$ethnicity <- NULL

###################################################################
## There are very few missing values for continuous fields so
## remove them and convert the job field to the outcome.
okc <- raw[complete.cases(raw), ]

## a profile is "stem" when its (cleaned) job level mentions
## "computer" or "science"; "stem" is the first factor level
okc$Class <- factor(ifelse(grepl("(computer)|(science)", okc$job),
                           "stem", "other"),
                    levels = c("stem", "other"))

## profiles without a job cannot be labelled, so drop them, then
## remove the field that defines the outcome
okc <- okc[okc$job != "missing", ]
okc$job <- NULL

table(okc$Class) / nrow(okc)

save(okc, file = "okc.RData")

--------------------------------------------------------------------------------
/Kuhn.R:
--------------------------------------------------------------------------------
###################################################################
## Code for the useR 2016 tutorial "Never Tell Me the Odds! Machine
## Learning with Class Imbalances" by Max Kuhn
##
## Slides and this code can be found at
##    https://github.com/topepo/useR2016
##
## packages used here are: caret, pROC, rpart, partykit, randomForest,
##    AppliedPredictiveModeling, DMwR, ROSE, C50, kernlab, ggthemes,
##    plyr
##
## Session info is at the bottom of this document
##
## Data are at: https://github.com/rudeboybert/JSE_OkCupid
##              https://github.com/topepo/useR2016
##
## OkC data are created in the file okc_data.R in the useR2016 repo
##


###################################################################
## Create toy data used throughout the slides

library(AppliedPredictiveModeling)
set.seed(14034)
ex_dat <- easyBoundaryFunc(250, intercept = -6, interaction = 1.5)

library(ggplot2)
## scale_colour_tableau() comes from ggthemes, which has to be
## attached before the plot is built
library(ggthemes)
ggplot(ex_dat, aes(x = X1, y = X2)) +
  geom_point(aes(color = class), cex = 3, alpha = .5) +
  theme(legend.position = "top") +
  scale_colour_tableau() +
  xlab("Predictor A") + ylab("Predictor B")

###################################################################
## Slide 22 "Example Data - Electronic Medical Records"

load("emr.RData")

str(emr, list.len = 20)

###################################################################
## Slide 23 "Example Data - Electronic Medical Records"

library(caret)

set.seed(1732)
emr_ind <- createDataPartition(emr$Class, p = 2/3, list = FALSE)
emr_train <- emr[ emr_ind, ]
emr_test  <- emr[-emr_ind, ]

mean(emr_train$Class == "event")
mean(emr_test$Class == "event")

table(emr_train$Class)
table(emr_test$Class)

###################################################################
## Slide 25 "Example Data - OKCupid"

load("okc.RData") ## create this using the file "okc_data.R"
str(okc, list.len = 20, vec.len = 2)

###################################################################
## Slide 26 "Example Data - OKCupid"

set.seed(1732)
okc_ind <- createDataPartition(okc$Class, p = 2/3, list = FALSE)
okc_train <- okc[ okc_ind, ]
okc_test  <- okc[-okc_ind, ]

mean(okc_train$Class == "stem")
mean(okc_test$Class == "stem")

###################################################################
## Slide 40 and 43 "A Single Shallow Tree"

library(rpart)
library(partykit)
rp1 <- rpart(Class ~ ., data = emr_train,
             control = rpart.control(maxdepth = 3, cp = 0))
plot(as.party(rp1))

###################################################################
## Slide 44 "A Single Shallow Tree (Bootstrapped)"

n_train <- nrow(emr_train)

set.seed(9595)
dat2 <- emr_train[sample(seq_len(n_train), n_train, replace = TRUE), ]
rp2 <- rpart(Class ~ ., data = dat2,
             control = rpart.control(maxdepth = 3, cp = 0))
plot(as.party(rp2))

###################################################################
## Slide 45 "A Single Shallow Tree (Bootstrapped)"

set.seed(1976)
dat3 <- emr_train[sample(seq_len(n_train), n_train, replace = TRUE), ]
rp3 <- rpart(Class ~ ., data = dat3,
             control = rpart.control(maxdepth = 3, cp = 0))
plot(as.party(rp3))

###################################################################
## Slide 47 "Random Forests with the EMR Data"

## on OS X, I ran in parallel using
##    library(doMC)
##    registerDoMC(cores=8)
## on Windows, try the doParallel package
## **if** your computer has multiple cores and sufficient memory

ctrl <- trainControl(method = "repeatedcv",
                     repeats = 5,
                     classProbs = TRUE,
                     savePredictions = TRUE,
                     summaryFunction = twoClassSummary)
emr_grid <- data.frame(mtry = c(1:15, (4:9)*5))

set.seed(1537)
rf_emr_mod <- train(Class ~ .,
                    data = emr_train,
                    method = "rf",
                    metric = "ROC",
                    tuneGrid = emr_grid,
                    ntree = 1000,
                    trControl = ctrl)

###################################################################
## Back to Slide 37 "ROC Curve" to plot the **test set data**

## roc() and auc() are used unqualified below, so pROC has to be
## attached before this point
library(pROC)

## NOTE(review): `ex_probs` is not created anywhere in this script.
## Presumably it holds the test set "event" probabilities in a
## column called `Prob` - define it before running this section.
exRoc <- roc(emr_test$Class, ex_probs$Prob,
             levels = rev(levels(emr_test$Class)))
plot(exRoc, legacy.axes = FALSE,
     print.thres=c(.2, .5, 1),
     print.thres.pattern = "%.2f (Sp = %.3f, Sn = %.3f)",
     print.thres.cex = .8)

###################################################################
## Slide 50 "Random Forest Results - EMR Example"

ggplot(rf_emr_mod)

###################################################################
## Slide 51 "Approximate Random Forest Resampled ROC Curve"

## This function averages the class probability values per sample
## across the hold-outs to get an averaged ROC curve
##
## Arguments:
##   object:    a classification model object as returned by
##              `train`, fit with saved hold-out predictions
##   best_only: when TRUE, only the hold-out predictions matching
##              `object$bestTune` are used
##   ...:       not used
## Value:
##   one `roc` object per tuning parameter combination (a list), or
##   the `roc` object itself when there is only one combination
## Errors:
##   stops for non-classification models, models without a `levels`
##   module, problems with other than two classes, or when the
##   hold-out predictions were not saved

roc_train <- function(object, best_only = TRUE, ...) {
  library("pROC")
  library("plyr")

  if (object$modelType != "Classification")
    stop("ROC curves are only available for classification models")
  if (!any(names(object$modelInfo) == "levels"))
    stop(paste("The model's code is required to have a 'levels' module.",
               "See http://topepo.github.io/caret/custom_models.html#Components"))
  lvs <- object$modelInfo$levels(object$finalModel)
  if (length(lvs) != 2)
    stop("ROC curves are only implemented here for two class problems")

  ## check for predictions
  if (is.null(object$pred))
    stop(paste("The out of sample predictions are required.",
               "See the `savePredictions` argument of `trainControl`"))

  if (best_only) {
    ## merge() joins on the shared columns, which keeps only the rows
    ## for the selected tuning parameter values
    object$pred <- merge(object$pred, object$bestTune)
  }
  ## find tuning parameter names
  p_names <- as.character(object$modelInfo$parameters$parameter)

  ## average probabilities across resamples
  object$pred <- plyr::ddply(.data = object$pred,
                             .variables = c("obs", "rowIndex", p_names),
                             .fun = function(dat, lvls = lvs) {
                               out <- mean(dat[, lvls[1]])
                               names(out) <- lvls[1]
                               out
                             })

  ## ROC curve for one tuning parameter combination; the parameter
  ## values are attached to the result as `model_param`
  make_roc <- function(x, lvls = lvs, nms = NULL, ...) {
    out <- pROC::roc(response = x$obs,
                     predictor = x[, lvls[1]],
                     levels = rev(lvls))

    out$model_param <- x[1, nms, drop = FALSE]
    out
  }
  out <- plyr::dlply(.data = object$pred,
                     .variables = p_names,
                     .fun = make_roc,
                     lvls = lvs,
                     nms = p_names)
  if (length(out) == 1) out <- out[[1]]
  out
}

plot(roc_train(rf_emr_mod),
     legacy.axes = TRUE,
     print.thres = .5,
     print.thres.pattern=" <- default %.1f threshold")

###################################################################
## Slide 52 "A Better Cutoff"

plot(roc_train(rf_emr_mod),
     legacy.axes = TRUE,
     print.thres.pattern = "Cutoff: %.2f (Sp = %.2f, Sn = %.2f)",
     print.thres = "best")

###################################################################
## Slide 59 "Down-Sampling - EMR Data"

down_ctrl <- ctrl
down_ctrl$sampling <- "down"
set.seed(1537)
rf_emr_down <- train(Class ~ .,
                     data = emr_train,
                     method = "rf",
                     metric = "ROC",
                     tuneGrid = emr_grid,
                     ntree = 1000,
                     trControl = down_ctrl)

###################################################################
## Slide 60 "Down-Sampling - EMR Data"

ggplot(rf_emr_down)

###################################################################
## Slide 61 "Approximate Resampled ROC Curve with Down-Sampling"

plot(roc_train(rf_emr_down),
     legacy.axes = TRUE,
     print.thres = .5,
     print.thres.pattern=" <- default %.1f threshold")

###################################################################
## Slide 63 "Internal Down-Sampling - EMR Data"

set.seed(1537)
rf_emr_down_int <- train(Class ~ .,
                         data = emr_train,
                         method = "rf",
                         metric = "ROC",
                         ntree = 1000,
                         tuneGrid = emr_grid,
                         trControl = ctrl,
                         ## These are passed to `randomForest`
                         strata = emr_train$Class,
                         sampsize = rep(sum(emr_train$Class == "event"), 2))

###################################################################
## Slide 64 "Internal Down-Sampling - EMR Data"

ggplot(rf_emr_down_int)

###################################################################
## Slide 67 "Up-Sampling - EMR Data"

up_ctrl <- ctrl
up_ctrl$sampling <- "up"
set.seed(1537)
rf_emr_up <- train(Class ~ .,
                   data = emr_train,
                   method = "rf",
                   tuneGrid = emr_grid,
                   ntree = 1000,
                   metric = "ROC",
                   trControl = up_ctrl)

###################################################################
## Slide 68 "Up-Sampling - EMR Data"

ggplot(rf_emr_up)

###################################################################
## Slide 73 "SMOTE - EMR Data"

smote_ctrl <- ctrl
smote_ctrl$sampling <- "smote"
set.seed(1537)
rf_emr_smote <- train(Class ~ .,
                      data = emr_train,
                      method = "rf",
                      tuneGrid = emr_grid,
                      ntree = 1000,
                      metric = "ROC",
                      trControl = smote_ctrl)

###################################################################
## Slide 74 "SMOTE - EMR Data"

ggplot(rf_emr_smote)

###################################################################
## Slide 75 "SMOTE - EMR Data"

## test set "event" probabilities, one column per model
emr_test_pred <- data.frame(Class = emr_test$Class)
emr_test_pred$normal <- predict(rf_emr_mod, emr_test, type = "prob")[, "event"]
emr_test_pred$down <- predict(rf_emr_down, emr_test, type = "prob")[, "event"]
emr_test_pred$down_int <- predict(rf_emr_down_int, emr_test, type = "prob")[, "event"]
emr_test_pred$up <- predict(rf_emr_up, emr_test, type = "prob")[, "event"]
emr_test_pred$smote <- predict(rf_emr_smote, emr_test, type = "prob")[, "event"]

## area under the ROC curve for predictions `pred` against the
## factor `ref`; the levels of `ref` are passed to roc() reversed
get_auc <- function(pred, ref) auc(roc(ref, pred, levels = rev(levels(ref))))

## iterate over the probability columns directly rather than
## coercing the data frame to a matrix with apply()
vapply(emr_test_pred[, -1],
       function(p) as.numeric(get_auc(p, ref = emr_test_pred$Class)),
       numeric(1))

###################################################################
## Slide 81 "CART and Costs - OkC Data"

## Summary function for `trainControl`: the output of postResample()
## followed by sensitivity (for the first level of `lev`) and
## specificity (for the second), named "Sens" and "Spec". `data` has
## columns `obs` and `pred`; `model` is not used.
fourStats <- function (data, lev = levels(data$obs), model = NULL) {
  accKapp <- postResample(data[, "pred"], data[, "obs"])
  out <- c(accKapp,
           sensitivity(data[, "pred"], data[, "obs"], lev[1]),
           specificity(data[, "pred"], data[, "obs"], lev[2]))
  names(out)[3:4] <- c("Sens", "Spec")
  out
}

ctrl_cost <- trainControl(method = "repeatedcv",
                          repeats = 5,
                          savePredictions = TRUE,
                          summaryFunction = fourStats)

###################################################################
## Slide 82 "CART and Costs - OkC Data"

## Get an initial grid of Cp values
rpart_init <- rpart(Class ~ ., data = okc_train, cp = 0)$cptable

cost_grid <- expand.grid(cp = rpart_init[, "CP"], Cost = 1:5)

## Use the non-formula method. Many of the predictors are factors and
## this will preserve the factor encoding instead of using dummy
## variables.
## every column except the outcome is a predictor
is_predictor <- names(okc_train) != "Class"
okc_x <- okc_train[, is_predictor]

set.seed(1537)
rpart_costs <- train(x = okc_x,
                     y = okc_train$Class,
                     method = "rpartCost",
                     tuneGrid = cost_grid,
                     metric = "Kappa",
                     trControl = ctrl_cost)

###################################################################
## Slide 84 "CART and Costs - OkC Data"

## the same axis and legend settings are reused for all three plots
cp_axis <- scale_x_log10()
legend_top <- theme(legend.position = "top")

ggplot(rpart_costs) + cp_axis + legend_top

###################################################################
## Slide 85 "CART and Costs - OkC Data"

ggplot(rpart_costs, metric = "Sens") + cp_axis + legend_top

###################################################################
## Slide 86 "CART and Costs - OkC Data"

ggplot(rpart_costs, metric = "Spec") + cp_axis + legend_top

###################################################################
## Slide 87 "C5.0 and Costs - OkC Data"

## the grid name is reused; the CART grid is no longer needed
cost_grid <- expand.grid(trials = c(1:10, 20, 30),
                         winnow = FALSE, model = "tree",
                         cost = c(1, 5, 10, 15))
set.seed(1537)
c5_costs <- train(x = okc_x,
                  y = okc_train$Class,
                  method = "C5.0Cost",
                  tuneGrid = cost_grid,
                  metric = "Kappa",
                  trControl = ctrl_cost)

###################################################################
## Slide 89 "C5.0 and Costs - OkC Data"

ggplot(c5_costs) + legend_top

###################################################################
## Slide 90 "OkC Test Results - CART"

rp_pred <- predict(rpart_costs, newdata = okc_test)
confusionMatrix(rp_pred, okc_test$Class)

###################################################################
## Slide 91 "OkC Test Results - C5.0"

c5_pred <- predict(c5_costs, newdata = okc_test)
confusionMatrix(c5_pred, okc_test$Class)

###################################################################
## Slide 103 "CART and Costs and Probabilities"

## 2 x 2 loss matrix, filled column by column, with rows and
## columns labelled by the class levels
class_lvl <- levels(okc_train$Class)
cost_mat <- matrix(c(0, 1, 5, 0), ncol = 2,
                   dimnames = list(class_lvl, class_lvl))
rp_mod <- rpart(Class ~ ., data = okc_train, parms = list(loss = cost_mat))

## class predictions from the tree versus classes obtained by
## cutting the predicted "stem" probability at 50%
pred_1 <- predict(rp_mod, okc_test, type = "class")
stem_prob <- predict(rp_mod, okc_test)[, "stem"]
pred_2 <- factor(ifelse(stem_prob >= .5, "stem", "other"),
                 levels = levels(pred_1))

table(pred_1, pred_2)

###################################################################
## Session info:

# R Under development (unstable) (2016-06-07 r70726)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: OS X 10.10.5 (Yosemite)
#
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#
# attached base packages:
# [1] parallel grid stats graphics grDevices utils datasets
# [8] methods base
#
# other attached packages:
# [1] vcd_1.4-1 ggthemes_3.0.3
# [3] kernlab_0.9-24 RColorBrewer_1.1-2
# [5] randomForest_4.6-12 doMC_1.3.4
# [7] iterators_1.0.8 foreach_1.4.3
# [9] inTrees_1.1 C50_0.1.0-24
# [11] plyr_1.8.4 ROSE_0.0-3
# [13] DMwR_0.4.1 proxy_0.4-15
# [15] AppliedPredictiveModeling_1.1-6 partykit_1.0-5
# [17] rpart_4.1-10 nnet_7.3-12
# [19] Hmisc_3.17-4 Formula_1.2-1
# [21] survival_2.39-2 caret_6.0-70
# [23] ggplot2_2.1.0 lattice_0.20-33
# [25] pROC_1.8 knitr_1.13
#
# loaded via a namespace (and not attached):
# [1] splines_3.4.0 gtools_3.5.0 assertthat_0.1
# [4] TTR_0.23-1 highr_0.5.1 stats4_3.4.0
# [7] latticeExtra_0.6-28 arules_1.4-1 quantreg_5.21
# [10] chron_2.3-47 digest_0.6.9
minqa_1.2.4 442 | # [13] RRF_1.6 colorspace_1.2-6 gbm_2.1.1 443 | # [16] Matrix_1.2-6 SparseM_1.7 xtable_1.8-2 444 | # [19] scales_0.4.0 gdata_2.17.0 lme4_1.1-12 445 | # [22] MatrixModels_0.4-1 mgcv_1.8-12 car_2.1-2 446 | # [25] ROCR_1.0-7 pbkrtest_0.4-6 quantmod_0.4-5 447 | # [28] magrittr_1.5 evaluate_0.8.3 CORElearn_1.47.1 448 | # [31] nlme_3.1-127 MASS_7.3-45 gplots_3.0.1 449 | # [34] xts_0.9-7 foreign_0.8-66 class_7.3-14 450 | # [37] tools_3.4.0 data.table_1.9.6 formatR_1.3 451 | # [40] stringr_1.0.0 munsell_0.4.3 cluster_2.0.4 452 | # [43] compiler_3.4.0 e1071_1.6-7 caTools_1.17.1 453 | # [46] nloptr_1.0.4 bitops_1.0-6 labeling_0.3 454 | # [49] gtable_0.2.0 codetools_0.2-14 abind_1.4-3 455 | # [52] reshape2_1.4.1 gridExtra_2.2.1 zoo_1.7-12 456 | # [55] KernSmooth_2.23-15 stringi_1.0-1 Rcpp_0.12.4 457 | # [58] acepack_1.3-3.3 lmtest_0.9-34 458 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 
20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. 
To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 
86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. 
But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. 
(This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. 
These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 
214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. 
If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. 
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 
305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | --------------------------------------------------------------------------------