├── .gitignore ├── README.md ├── RedundantScripts └── h2o_model.R ├── log.txt └── model_building.R /.gitignore: -------------------------------------------------------------------------------- 1 | *.Rproj* 2 | *.csv 3 | Submissions 4 | .Rhistory 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The Smart Recruits 2 | ## Data Science competition by Analytics Vidhya 3 | 4 | Private LB: Top 15% finish (47/379 participants) 5 | 6 | - Score on private LB: AUC- 0.615203707604 7 | - Final model is trained using the XGBoost algorithm. 8 | - Used 5-fold CV 9 | 10 | _Associated blog post: [Data Science Competitions 101: Anatomy and Approach](https://techandmortals.wordpress.com/2016/07/27/data-science-competitions-101-anatomy-and-approach/)_ 11 | -------------------------------------------------------------------------------- /RedundantScripts/h2o_model.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | library(Matrix) 3 | library(caret) 4 | library(ROCR) 5 | library(tidyr) 6 | library(mice) 7 | 8 | set.seed(1234) 9 | 10 | train <- read.csv(file = "Train_pjb2QcD.csv", stringsAsFactors = FALSE) 11 | test <- read.csv(file = "Test_wyCirpO.csv", stringsAsFactors = FALSE) 12 | 13 | test.id <- test$ID 14 | train.id <- train$ID 15 | train.y <- train$Business_Sourced 16 | 17 | ######################################################################################### 18 | ## FEATURE ENGINEERING 19 | ## ------------------- 20 | ## ------------------- 21 | train <- train[,-c(1, 4)] # NOTE(review): positional drop - presumably ID and Applicant_City_PIN; confirm against the CSV header 22 | test <- test[,-c(1, 4)] 23 | 24 | train$Business_Sourced <- NULL 25 | 26 | # Separating Application_Receipt_Date into date, month, year 27 | # ---------------------------------------------------------- 28 | # train <- separate(data = train, col = Application_Receipt_Date, 29 | # into = c("Receipt_month", "Receipt_date",
"Receipt_year")) 30 | # train$Receipt_date <- as.numeric(train$Receipt_date) 31 | # train$Receipt_month <- as.numeric(train$Receipt_month) 32 | # train$Receipt_year <- as.numeric(train$Receipt_year) 33 | # 34 | # test <- separate(data = test, col = Application_Receipt_Date, 35 | # into = c("Receipt_month", "Receipt_date", "Receipt_year")) 36 | # test$Receipt_date <- as.numeric(test$Receipt_date) 37 | # test$Receipt_month <- as.numeric(test$Receipt_month) 38 | # test$Receipt_year <- as.numeric(test$Receipt_year) 39 | # 40 | # # Separating Applicant_BirthDate into date, month, year 41 | # # ---------------------------------------------------------- 42 | # train <- separate(data = train, col = Applicant_BirthDate, 43 | # into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year")) 44 | # # train$Applicant_Birth_date <- as.numeric(train$Applicant_Birth_date) 45 | # # train$Applicant_Birth_date[is.na(train$Applicant_Birth_date)] <- 46 | # # as.numeric(names(which.max((table(train$Applicant_Birth_date))))) 47 | # # 48 | # # train$Applicant_Birth_month <- as.numeric(train$Applicant_Birth_month) 49 | # # train$Applicant_Birth_month[is.na(train$Applicant_Birth_month)] <- 50 | # # as.numeric(names(which.max((table(train$Applicant_Birth_month))))) 51 | # train$Applicant_Birth_date <- NULL 52 | # train$Applicant_Birth_month <- NULL 53 | # train$Applicant_Birth_year <- as.numeric(train$Applicant_Birth_year) 54 | # train$Applicant_Birth_year[is.na(train$Applicant_Birth_year)] <- 55 | # as.numeric(names(which.max((table(train$Applicant_Birth_year))))) 56 | # 57 | # test <- separate(data = test, col = Applicant_BirthDate, 58 | # into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year")) 59 | # # test$Applicant_Birth_date <- as.numeric(test$Applicant_Birth_date) 60 | # # test$Applicant_Birth_date[is.na(test$Applicant_Birth_date)] <- 61 | # # as.numeric(names(which.max((table(test$Applicant_Birth_date))))) 62 | # # 63 | # # 
test$Applicant_Birth_month <- as.numeric(test$Applicant_Birth_month) 64 | # # test$Applicant_Birth_month[is.na(test$Applicant_Birth_month)] <- 65 | # # as.numeric(names(which.max((table(test$Applicant_Birth_month))))) 66 | # test$Applicant_Birth_date <- NULL 67 | # test$Applicant_Birth_month <- NULL 68 | # test$Applicant_Birth_year <- as.numeric(test$Applicant_Birth_year) 69 | # test$Applicant_Birth_year[is.na(test$Applicant_Birth_year)] <- 70 | # as.numeric(names(which.max((table(test$Applicant_Birth_year))))) 71 | # 72 | # # Create Applicant_age variable 73 | # # --------------------- 74 | # train$Applicant_Age <- (2008 - train$Applicant_Birth_year) 75 | # train$Applicant_Birth_year <- NULL 76 | # test$Applicant_Age <- (2008 - test$Applicant_Birth_year) 77 | # test$Applicant_Birth_year <- NULL 78 | # 79 | # # Separating Manager_DOJ into date, month, year 80 | # # ---------------------------------------------------------- 81 | # train <- separate(data = train, col = Manager_DOJ, 82 | # into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year")) 83 | # train$Manager_Join_date <- as.numeric(train$Manager_Join_date) 84 | # train$Manager_Join_date[is.na(train$Manager_Join_date)] <- 85 | # median(train$Manager_Join_date, na.rm = T) 86 | # 87 | # train$Manager_Join_month <- as.numeric(train$Manager_Join_month) 88 | # train$Manager_Join_month[is.na(train$Manager_Join_month)] <- 89 | # median(train$Manager_Join_month, na.rm = T) 90 | # 91 | # # train$Manager_Join_date <- NULL 92 | # # train$Manager_Join_month <- NULL 93 | # train$Manager_Join_year <- as.numeric(train$Manager_Join_year) 94 | # train$Manager_Join_year[is.na(train$Manager_Join_year)] <- 95 | # median(train$Manager_Join_year, na.rm = T) 96 | # 97 | # test <- separate(data = test, col = Manager_DOJ, 98 | # into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year")) 99 | # test$Manager_Join_date <- as.numeric(test$Manager_Join_date) 100 | # 
test$Manager_Join_date[is.na(test$Manager_Join_date)] <- 101 | # median(test$Manager_Join_date, na.rm = T) 102 | # 103 | # test$Manager_Join_month <- as.numeric(test$Manager_Join_month) 104 | # test$Manager_Join_month[is.na(test$Manager_Join_month)] <- 105 | # median(test$Manager_Join_month, na.rm = T) 106 | # 107 | # # test$Manager_Join_date <- NULL 108 | # # test$Manager_Join_month <- NULL 109 | # test$Manager_Join_year <- as.numeric(test$Manager_Join_year) 110 | # test$Manager_Join_year[is.na(test$Manager_Join_year)] <- 111 | # median(test$Manager_Join_year, na.rm = T) 112 | # 113 | # # Create Manager_Experience variable 114 | # # ---------------------------------- 115 | # # train$Manager_Experience <- (2008 - train$Manager_Join_year) 116 | # # train$Manager_Join_year <- NULL 117 | # # test$Manager_Experience <- (2008 - test$Manager_Join_year) 118 | # # test$Manager_Join_year <- NULL 119 | # 120 | # # Separating Manager_DoB into date, month, year 121 | # # ---------------------------------------------------------- 122 | # train <- separate(data = train, col = Manager_DoB, 123 | # into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year")) 124 | # # train$Manager_Birth_date <- as.numeric(train$Manager_Birth_date) 125 | # # train$Manager_Birth_date[is.na(train$Manager_Birth_date)] <- 126 | # # median(train$Manager_Birth_date, na.rm = T) 127 | # # 128 | # # train$Manager_Birth_month <- as.numeric(train$Manager_Birth_month) 129 | # # train$Manager_Birth_month[is.na(train$Manager_Birth_month)] <- 130 | # # median(train$Manager_Birth_month, na.rm = T) 131 | # 132 | # train$Manager_Birth_date <- NULL 133 | # train$Manager_Birth_month <- NULL 134 | # train$Manager_Birth_year <- as.numeric(train$Manager_Birth_year) 135 | # train$Manager_Birth_year[is.na(train$Manager_Birth_year)] <- 136 | # median(train$Manager_Birth_year, na.rm = T) 137 | # 138 | # test <- separate(data = test, col = Manager_DoB, 139 | # into = c("Manager_Birth_month", 
"Manager_Birth_date", "Manager_Birth_year")) 140 | # # test$Manager_Birth_date <- as.numeric(test$Manager_Birth_date) 141 | # # test$Manager_Birth_date[is.na(test$Manager_Birth_date)] <- 142 | # # median(test$Manager_Birth_date, na.rm = T) 143 | # # 144 | # # test$Manager_Birth_month <- as.numeric(test$Manager_Birth_month) 145 | # # test$Manager_Birth_month[is.na(test$Manager_Birth_month)] <- 146 | # # median(test$Manager_Birth_month, na.rm = T) 147 | # 148 | # test$Manager_Birth_date <- NULL 149 | # test$Manager_Birth_month <- NULL 150 | # test$Manager_Birth_year <- as.numeric(test$Manager_Birth_year) 151 | # test$Manager_Birth_year[is.na(test$Manager_Birth_year)] <- 152 | # median(test$Manager_Birth_year, na.rm = T) 153 | # 154 | # # Create Manager_Age variable 155 | # # --------------------- 156 | # train$Manager_Age <- (2008 - train$Manager_Birth_year) 157 | # train$Manager_Birth_year <- NULL 158 | # test$Manager_Age <- (2008 - test$Manager_Birth_year) 159 | # test$Manager_Birth_year <- NULL 160 | # 161 | # # Encoding Applicant_Gender 162 | # # ---------------------------------------------------------- 163 | # train$Applicant_Gender[train$Applicant_Gender == "F"] <- 1 164 | # train$Applicant_Gender[train$Applicant_Gender == "M"] <- 2 165 | # train$Applicant_Gender[train$Applicant_Gender == ""] <- 3 166 | # train$Applicant_Gender <- as.numeric(train$Applicant_Gender) 167 | # 168 | # test$Applicant_Gender[test$Applicant_Gender == "F"] <- 1 169 | # test$Applicant_Gender[test$Applicant_Gender == "M"] <- 2 170 | # test$Applicant_Gender[test$Applicant_Gender == ""] <- 3 171 | # test$Applicant_Gender <- as.numeric(test$Applicant_Gender) 172 | # 173 | # # train$Female <- as.numeric(train$Applicant_Gender != "M") 174 | # # train$Male <- as.numeric(!train$Female) 175 | # # test$Female <- as.numeric(test$Applicant_Gender != "M") 176 | # # test$Male <- as.numeric(!test$Female) 177 | # # 178 | # # train$Applicant_Gender <- NULL 179 | # # test$Applicant_Gender <- NULL 180 | 
# 181 | # # Encoding Applicant_Marital_Status 182 | # # ---------------------------------------------------------- 183 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "M"] <- 1 184 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "S"] <- 2 185 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "D"] <- 3 186 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "W" | 187 | # train$Applicant_Marital_Status == "D" | 188 | # train$Applicant_Marital_Status == ""] <- 3 189 | # train$Applicant_Marital_Status <- as.numeric(train$Applicant_Marital_Status) 190 | # 191 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "M"] <- 1 192 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "S"] <- 2 193 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "D"] <- 3 194 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "W" | 195 | # test$Applicant_Marital_Status == "D" | 196 | # test$Applicant_Marital_Status == ""] <- 3 197 | # test$Applicant_Marital_Status <- as.numeric(test$Applicant_Marital_Status) 198 | # 199 | # # Encoding Applicant_Occupation 200 | # # ---------------------------------------------------------- 201 | # train$Applicant_Occupation[train$Applicant_Occupation == "Salaried"] <- 1 202 | # train$Applicant_Occupation[train$Applicant_Occupation == "Business"] <- 2 203 | # train$Applicant_Occupation[train$Applicant_Occupation == "Others" | 204 | # train$Applicant_Occupation == ""] <- 3 205 | # train$Applicant_Occupation[train$Applicant_Occupation == "Self Employed" | 206 | # train$Applicant_Occupation == "Student"] <- 4 207 | # train$Applicant_Occupation <- as.numeric(train$Applicant_Occupation) 208 | # 209 | # test$Applicant_Occupation[test$Applicant_Occupation == "Salaried"] <- 1 210 | # test$Applicant_Occupation[test$Applicant_Occupation == "Business"] <- 2 211 | # test$Applicant_Occupation[test$Applicant_Occupation == "Others" | 212 
| # test$Applicant_Occupation == ""] <- 3 213 | # test$Applicant_Occupation[test$Applicant_Occupation == "Self Employed" | 214 | # test$Applicant_Occupation == "Student"] <- 4 215 | # test$Applicant_Occupation <- as.numeric(test$Applicant_Occupation) 216 | # 217 | # # Encoding Applicant_Qualification 218 | # # ---------------------------------------------------------- 219 | # train$Applicant_Qualification[train$Applicant_Qualification == "Class XII"] <- 1 220 | # train$Applicant_Qualification[train$Applicant_Qualification == "Graduate"] <- 2 221 | # train$Applicant_Qualification[train$Applicant_Qualification == "Class X"] <- 3 222 | # train$Applicant_Qualification[train$Applicant_Qualification != 1 & 223 | # train$Applicant_Qualification != 2 & 224 | # train$Applicant_Qualification != 3] <- 4 225 | # train$Applicant_Qualification <- as.numeric(train$Applicant_Qualification) 226 | # 227 | # test$Applicant_Qualification[test$Applicant_Qualification == "Class XII"] <- 1 228 | # test$Applicant_Qualification[test$Applicant_Qualification == "Graduate"] <- 2 229 | # test$Applicant_Qualification[test$Applicant_Qualification == "Class X"] <- 3 230 | # test$Applicant_Qualification[test$Applicant_Qualification != 1 & 231 | # test$Applicant_Qualification != 2 & 232 | # test$Applicant_Qualification != 3] <- 4 233 | # test$Applicant_Qualification <- as.numeric(test$Applicant_Qualification) 234 | # 235 | # # Encoding Manager_Joining_Designation 236 | # # ---------------------------------------------------------- 237 | # temp_joining_des <- train$Manager_Joining_Designation 238 | # train$Manager_Joining_Designation <- 0 239 | # train$Manager_Joining_Designation[temp_joining_des == "Level 1"| 240 | # temp_joining_des == "Other"] <- 1 241 | # train$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2 242 | # train$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3 243 | # train$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4 244 | # 
train$Manager_Joining_Designation[temp_joining_des == "Level 5" | 245 | # temp_joining_des == "Level 6" | 246 | # temp_joining_des == "Level 7"] <- 5 247 | # rm(temp_joining_des) 248 | # 249 | # temp_joining_des <- test$Manager_Joining_Designation 250 | # test$Manager_Joining_Designation <- 0 251 | # test$Manager_Joining_Designation[temp_joining_des == "Level 1"| 252 | # temp_joining_des == "Other"] <- 1 253 | # test$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2 254 | # test$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3 255 | # test$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4 256 | # test$Manager_Joining_Designation[temp_joining_des == "Level 5" | 257 | # temp_joining_des == "Level 6" | 258 | # temp_joining_des == "Level 7"] <- 5 259 | # rm(temp_joining_des) 260 | # 261 | # # Encoding Manager_Current_Designation 262 | # # ---------------------------------------------------------- 263 | # temp_current_des <- train$Manager_Current_Designation 264 | # train$Manager_Current_Designation <- 0 265 | # train$Manager_Current_Designation[temp_current_des == "Level 1"| 266 | # temp_current_des == "Other"] <- 1 267 | # train$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2 268 | # train$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3 269 | # train$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4 270 | # train$Manager_Current_Designation[temp_current_des == "Level 5" | 271 | # temp_current_des == "Level 6" | 272 | # temp_current_des == "Level 7"] <- 5 273 | # rm(temp_current_des) 274 | # 275 | # temp_current_des <- test$Manager_Current_Designation 276 | # test$Manager_Current_Designation <- 0 277 | # test$Manager_Current_Designation[temp_current_des == "Level 1" | 278 | # temp_current_des == "Other"] <- 1 279 | # test$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2 280 | # test$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3 281 | # 
test$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4 282 | # test$Manager_Current_Designation[temp_current_des == "Level 5" | 283 | # temp_current_des == "Level 6" | 284 | # temp_current_des == "Level 7"] <- 5 285 | # rm(temp_current_des) 286 | # 287 | # # Creating Manager_Progress variable 288 | # # ---------------------------------- 289 | # train$Manager_Progress <- (train$Manager_Current_Designation - 290 | # train$Manager_Joining_Designation) 291 | # train$Manager_Progress[train$Manager_Joining_Designation == 0] <- -2 292 | # 293 | # test$Manager_Progress <- (test$Manager_Current_Designation - 294 | # test$Manager_Joining_Designation) 295 | # test$Manager_Progress[test$Manager_Joining_Designation == 0] <- -2 296 | # 297 | # # Encoding Manager_Status 298 | # # ---------------------------------------------------------- 299 | # train$Manager_Status[train$Manager_Status == "Confirmation"] <- 1 300 | # train$Manager_Status[train$Manager_Status == "Probation"] <- 2 301 | # train$Manager_Status[train$Manager_Status == ""] <- 3 302 | # train$Manager_Status <- as.numeric(train$Manager_Status) 303 | # 304 | # test$Manager_Status[test$Manager_Status == "Confirmation"] <- 1 305 | # test$Manager_Status[test$Manager_Status == "Probation"] <- 2 306 | # test$Manager_Status[test$Manager_Status == ""] <- 3 307 | # test$Manager_Status <- as.numeric(test$Manager_Status) 308 | # 309 | # # Encoding Manager_Gender 310 | # # ---------------------------------------------------------- 311 | # train$Manager_Gender[train$Manager_Gender == "F"] <- 1 312 | # train$Manager_Gender[train$Manager_Gender == "M"] <- 2 313 | # train$Manager_Gender[train$Manager_Gender == ""] <- 3 314 | # train$Manager_Gender <- as.numeric(train$Manager_Gender) 315 | # 316 | # test$Manager_Gender[test$Manager_Gender == "F"] <- 1 317 | # test$Manager_Gender[test$Manager_Gender == "M"] <- 2 318 | # test$Manager_Gender[test$Manager_Gender == ""] <- 3 319 | # test$Manager_Gender <- 
as.numeric(test$Manager_Gender) 320 | # 321 | # # Imputing the numeric variables 322 | # # ---------------------------------------------------------- 323 | # train$Manager_Grade[!complete.cases(train)] <- median(train$Manager_Grade, na.rm = T) 324 | # train$Manager_Num_Application[!complete.cases(train)] <- 2.00 325 | # train$Manager_Num_Coded[!complete.cases(train)] <- mean(train$Manager_Num_Coded, na.rm = T) 326 | # train$Manager_Business[!complete.cases(train)] <- mean(train$Manager_Business, na.rm = T) 327 | # train$Manager_Num_Products[!complete.cases(train)] <- mean(train$Manager_Num_Products, 328 | # na.rm = T) 329 | # train$Manager_Business2[!complete.cases(train)] <- median(train$Manager_Business2, 330 | # na.rm = T) 331 | # train$Manager_Num_Products2[!complete.cases(train)] <- median(train$Manager_Num_Products2, 332 | # na.rm = T) 333 | # 334 | # test$Manager_Grade[!complete.cases(test)] <- median(test$Manager_Grade, na.rm = T) 335 | # test$Manager_Num_Application[!complete.cases(test)] <- 2.00 336 | # test$Manager_Num_Coded[!complete.cases(test)] <- mean(test$Manager_Num_Coded, na.rm = T) 337 | # test$Manager_Business[!complete.cases(test)] <- mean(test$Manager_Business, na.rm = T) 338 | # test$Manager_Num_Products[!complete.cases(test)] <- mean(test$Manager_Num_Products, 339 | # na.rm = T) 340 | # test$Manager_Business2[!complete.cases(test)] <- median(test$Manager_Business2, 341 | # na.rm = T) 342 | # test$Manager_Num_Products2[!complete.cases(test)] <- median(test$Manager_Num_Products2, 343 | # na.rm = T) 344 | # names_numeric <- names(train[,c(17:22)]) 345 | # 346 | # train_init = mice(train[,names_numeric], maxit = 0) 347 | # meth = train_init$method 348 | # predM = train_init$predictorMatrix 349 | # meth[names_numeric] = "rf" 350 | # imputed = mice(train[,names_numeric], method=meth, predictorMatrix=predM, m=5) 351 | # imputed <- complete(imputed) 352 | # train[,names_numeric] <- imputed 353 | # 354 | # test_init = mice(test[,names_numeric], maxit 
= 0) 355 | # meth = test_init$method 356 | # predM = test_init$predictorMatrix 357 | # meth[names_numeric] = "rf" 358 | # imputed = mice(test[,names_numeric], method=meth, predictorMatrix=predM, m=5) 359 | # imputed <- complete(imputed) 360 | # test[,names_numeric] <- imputed 361 | 362 | # Create variables for business from Category A advisor 363 | # ----------------------------------------------------- 364 | # train$Manager_Business1 <- (train$Manager_Business - train$Manager_Business2) 365 | # train$Manager_Num_Products1 <- (train$Manager_Num_Products - train$Manager_Num_Products2) 366 | # 367 | # test$Manager_Business1 <- (test$Manager_Business - test$Manager_Business2) 368 | # test$Manager_Num_Products1 <- (test$Manager_Num_Products - test$Manager_Num_Products2) 369 | 370 | # ---------------------------------------------- 371 | # train$Manager_Gender <- NULL 372 | # test$Manager_Gender <- NULL 373 | # train$Applicant_Marital_Status <- NULL 374 | # test$Applicant_Marital_Status <- NULL 375 | 376 | # ---------------------------------------------- 377 | # train$Applicant_City_PIN[is.na(train$Applicant_City_PIN)] <- 378 | # mean(train$Applicant_City_PIN, na.rm = T) 379 | # test$Applicant_City_PIN[is.na(test$Applicant_City_PIN)] <- 380 | # mean(test$Applicant_City_PIN, na.rm = T) 381 | 382 | # Add back the Target variable, Business_Sourced 383 | # ---------------------------------------------- 384 | train$Business_Sourced <- train.y 385 | 386 | ########################################################################################### 387 | 388 | set.seed(21) 389 | split <- createDataPartition(train$Business_Sourced, p = 0.7, list = FALSE) 390 | training <- train[split,] 391 | testing <- train[-split,] 392 | rm(split) 393 | 394 | library(h2o) 395 | h2o.init(nthreads = -1) 396 | training.hex <- as.h2o(training) 397 | testing.hex <- as.h2o(testing) 398 | test.hex <- as.h2o(test) 399 | 400 | features <- setdiff(colnames(training), "Business_Sourced") # predictors chosen by name; the old positional index (-26) depended on the column count 401 | gbm_model1 <- h2o.gbm(x =
features, 402 | y = "Business_Sourced", 403 | training_frame = training.hex, 404 | ntrees = 30, 405 | max_depth = 6, 406 | col_sample_rate = 0.2) 407 | 408 | pred_gbm_h2o <- as.data.frame(h2o.predict(gbm_model1,newdata = testing.hex)) 409 | # table(pred_gbm_h2o$predict > 0.5, testing$target) 410 | # 411 | library("pROC") 412 | auc(testing$Business_Sourced, pred_gbm_h2o$predict) 413 | 414 | h2o.varimp(gbm_model1) 415 | 416 | final_pred_gbm <- as.data.frame(h2o.predict(gbm_model1, newdata = test.hex)) 417 | dir.create("Submissions", showWarnings = FALSE) # output folder is git-ignored, so it may not exist on a fresh checkout 418 | submission <- data.frame(ID=test.id, Business_Sourced=final_pred_gbm$predict) 419 | write.csv(submission, "Submissions/submission_h202.csv", row.names = FALSE) 420 | -------------------------------------------------------------------------------- /log.txt: -------------------------------------------------------------------------------- 1 | Log.txt 2 | 3 | 1 ------------------- 4 | > print(fold_auc) 5 | [1] 0.5585568 0.5758749 0.5922487 0.5837894 0.5647606 6 | > mean(fold_auc) 7 | [1] 0.5750461 8 | > sd(fold_auc) 9 | [1] 0.0136999 10 | 11 | Public LB: 0.52 12 | 13 | 2 ------------------ 14 | Added Male, Female variables 15 | Removed Applicant_Gender 16 | Changed nrounds to 125 from 150 17 | 18 | > print(fold_auc) 19 | [1] 0.5571820 0.5794472 0.5931177 0.5792398 0.5634502 20 | > mean(fold_auc) 21 | [1] 0.5744874 22 | > sd(fold_auc) 23 | [1] 0.01427908 24 | 25 | Public LB: 0.53063342592 26 | 27 | 3 ------------------ 28 | Encoded variables 29 | 30 | > print(fold_auc) 31 | [1] 0.5803248 0.5967548 0.5968560 0.5836218 0.5890557 32 | > sd(fold_auc) 33 | [1] 0.007508699 34 | > mean(fold_auc) 35 | [1] 0.5893226 36 | 37 | 4 ------------------ 38 | Changed depth from 5 to 8 39 | 40 | > print(fold_auc) 41 | [1] 0.5717975 0.6062098 0.6135852 0.5938706 0.5818983 42 | > mean(fold_auc) 43 | [1] 0.5934723 44 | > sd(fold_auc) 45 | [1] 0.01710765 46 | 47 | 5 ----------------- 48 | Changed depth to 2 49 | 50 | > print(fold_auc) 51 | [1] 0.5776977 0.5700044 0.5881579 0.5864932
0.5750182 52 | > mean(fold_auc) 53 | [1] 0.5794743 54 | > sd(fold_auc) 55 | [1] 0.007703269 56 | 57 | Public LB: 0.50 58 | 59 | 6 ----------------- 60 | Included Applicant Date variables 61 | 62 | > print(fold_auc) 63 | [1] 0.5898055 0.6228925 0.6145244 0.6084384 0.6064969 64 | > mean(fold_auc) 65 | [1] 0.6084315 66 | > sd(fold_auc) 67 | [1] 0.01221593 68 | 69 | Public LB: 0.54 70 | 71 | 7 ----------------- 72 | Removed Applicant BirthDate info; Created an Age variable instead 73 | 74 | > print(fold_auc) 75 | [1] 0.5886343 0.6196701 0.6181840 0.6079076 0.6105778 76 | > mean(fold_auc) 77 | [1] 0.6089948 78 | > sd(fold_auc) 79 | [1] 0.01241613 80 | 81 | Public LB: 0.578666756995 82 | 83 | 8 ------------------ 84 | Added Manager Join Date info 85 | 86 | > print(fold_auc) 87 | [1] 0.6042491 0.6308430 0.6204544 0.6037996 0.6208891 88 | > mean(fold_auc) 89 | [1] 0.6160471 90 | > sd(fold_auc) 91 | [1] 0.0117365 92 | 93 | public LB: 0.58 94 | 95 | 9 ------------------- 96 | Used Manager Birth Date info to add Manager_Age 97 | 98 | > print(fold_auc) 99 | [1] 0.6017031 0.6328631 0.6249353 0.6072667 0.6252992 100 | > mean(fold_auc) 101 | [1] 0.6184135 102 | > sd(fold_auc) 103 | [1] 0.01324977 104 | 105 | public LB: 0.57 106 | 107 | 10 ------------------ 108 | Added Office_PIN variable 109 | 110 | > print(fold_auc) 111 | [1] 0.6193239 0.6510257 0.6389947 0.6185533 0.6404322 112 | > mean(fold_auc) 113 | [1] 0.633666 114 | > sd(fold_auc) 115 | [1] 0.01422695 116 | 117 | public LB: 0.597 118 | 119 | 11 ------------------ 120 | Removed Manager Gender and Applicant Marital Status 121 | Added Business by Class A Advisor 122 | 123 | > print(fold_auc) 124 | [1] 0.6193533 0.6516049 0.6378345 0.6170152 0.6399937 125 | > mean(fold_auc) 126 | [1] 0.6331603 127 | > sd(fold_auc) 128 | [1] 0.01466326 129 | 130 | public LB: 0.59 131 | 132 | 12 ------------------- 133 | Added ApplicantCity_PIN 134 | 135 | > print(fold_auc) 136 | [1] 0.6039560 0.6296482 0.6273919 0.6218840 0.6367833 137 | > 
mean(fold_auc) 138 | [1] 0.6239327 139 | > sd(fold_auc) 140 | [1] 0.01237979 141 | 142 | public LB: 0.58 143 | 144 | 13 ------------------- 145 | Tuned Hyperparameters using Davut Polat's approach described here: 146 | https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19083/best-practices-for-parameter-tuning-on-models/108783#post108783 147 | 148 | > print(fold_auc) 149 | [1] 0.6369154 0.6348540 0.6456727 0.6688282 0.6384908 150 | > mean(fold_auc) 151 | [1] 0.6449522 152 | > sd(fold_auc) 153 | [1] 0.01395436 154 | 155 | public LB: 0.602131291918 156 | 157 | 14 -------------------- 158 | Changed nrounds from 200 to 150 159 | 160 | > print(fold_auc) 161 | [1] 0.6430740 0.6578375 0.6417713 0.6356861 0.6579393 162 | > mean(fold_auc) 163 | [1] 0.6472617 164 | > sd(fold_auc) 165 | [1] 0.01009375 166 | 167 | public LB: 0.59 168 | 169 | 15 --------------------- 170 | Added Manager_Progress variable 171 | 172 | > print(fold_auc) 173 | [1] 0.6363332 0.6566584 0.6507521 0.6306357 0.6500852 174 | > mean(fold_auc) 175 | [1] 0.6448929 176 | > sd(fold_auc) 177 | [1] 0.01091162 178 | 179 | public LB: 0.616988460583 180 | 181 | 16 ---------------------- 182 | Created Manager_Experience variable 183 | 184 | > print(fold_auc) 185 | [1] 0.6363271 0.6504765 0.6544330 0.6309831 0.6527037 186 | > mean(fold_auc) 187 | [1] 0.6449847 188 | > sd(fold_auc) 189 | [1] 0.01060677 190 | 191 | public LB: 0.59 192 | 193 | 17 ----------------------- 194 | Used MICE imputation for numeric variables 195 | 196 | > print(fold_auc) 197 | [1] 0.6507499 0.6268592 0.6239240 0.6519003 0.6439642 198 | > mean(fold_auc) 199 | [1] 0.6394795 200 | > sd(fold_auc) 201 | [1] 0.01325382 202 | 203 | public LB: 0.57 204 | 205 | 18 ----------------------- 206 | Used MICE imputation with 'rf' mode 207 | 208 | > print(fold_auc) 209 | [1] 0.6441430 0.6416233 0.6237396 0.6404840 0.6581744 210 | > mean(fold_auc) 211 | [1] 0.6416329 212 | > sd(fold_auc) 213 | [1] 0.01226186 214 | 215 | public LB: 
0.597276155635 216 | 217 | 19 ----------------------- 218 | Removed Manager_Experience 219 | Used MICE imputation with 'rf' mode 220 | Changed nrounds to 175 from 150 221 | 222 | > print(fold_auc) 223 | [1] 0.6549643 0.6610666 0.6517388 0.6380177 0.6282266 224 | > mean(fold_auc) 225 | [1] 0.6468028 226 | > sd(fold_auc) 227 | [1] 0.01338547 228 | 229 | public LB: 0.597232798139 -------------------------------------------------------------------------------- /model_building.R: -------------------------------------------------------------------------------- 1 | library(xgboost) 2 | library(Matrix) 3 | library(caret) 4 | library(ROCR) 5 | library(tidyr) 6 | library(mice) 7 | 8 | set.seed(1234) 9 | 10 | train <- read.csv(file = "Data/Train_pjb2QcD.csv", stringsAsFactors = F) 11 | test <- read.csv(file = "Data/Test_wyCirpO.csv", stringsAsFactors = F) 12 | 13 | test.id <- test$ID 14 | train.id <- train$ID 15 | train.y <- train$Business_Sourced 16 | 17 | ######################################################################################### 18 | ## FEATURE ENGINEERING 19 | ## ------------------- 20 | ## ------------------- 21 | train <- train[,-c(1, 4, 7, 15)] 22 | test <- test[,-c(1, 4, 7, 15)] 23 | 24 | train$Business_Sourced <- NULL 25 | 26 | # Separating Application_Receipt_Date into date, month, year 27 | # ---------------------------------------------------------- 28 | train <- separate(data = train, col = Application_Receipt_Date, 29 | into = c("Receipt_month", "Receipt_date", "Receipt_year")) 30 | train$Receipt_date <- as.numeric(train$Receipt_date) 31 | train$Receipt_month <- as.numeric(train$Receipt_month) 32 | train$Receipt_year <- as.numeric(train$Receipt_year) 33 | 34 | test <- separate(data = test, col = Application_Receipt_Date, 35 | into = c("Receipt_month", "Receipt_date", "Receipt_year")) 36 | test$Receipt_date <- as.numeric(test$Receipt_date) 37 | test$Receipt_month <- as.numeric(test$Receipt_month) 38 | test$Receipt_year <- 
as.numeric(test$Receipt_year) 39 | 40 | # Separating Applicant_BirthDate into date, month, year 41 | # ---------------------------------------------------------- 42 | train <- separate(data = train, col = Applicant_BirthDate, 43 | into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year")) 44 | train$Applicant_Birth_date <- NULL 45 | train$Applicant_Birth_month <- NULL 46 | train$Applicant_Birth_year <- as.numeric(train$Applicant_Birth_year) 47 | train$Applicant_Birth_year[is.na(train$Applicant_Birth_year)] <- 48 | as.numeric(names(which.max((table(train$Applicant_Birth_year))))) 49 | 50 | test <- separate(data = test, col = Applicant_BirthDate, 51 | into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year")) 52 | test$Applicant_Birth_date <- NULL 53 | test$Applicant_Birth_month <- NULL 54 | test$Applicant_Birth_year <- as.numeric(test$Applicant_Birth_year) 55 | test$Applicant_Birth_year[is.na(test$Applicant_Birth_year)] <- 56 | as.numeric(names(which.max((table(test$Applicant_Birth_year))))) 57 | 58 | # Create Applicant_age variable 59 | # --------------------- 60 | train$Applicant_Age <- (2008 - train$Applicant_Birth_year) 61 | train$Applicant_Birth_year <- NULL 62 | test$Applicant_Age <- (2008 - test$Applicant_Birth_year) 63 | test$Applicant_Birth_year <- NULL 64 | 65 | # Separating Manager_DOJ into date, month, year 66 | # ---------------------------------------------------------- 67 | train <- separate(data = train, col = Manager_DOJ, 68 | into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year")) 69 | train$Manager_Join_date <- as.numeric(train$Manager_Join_date) 70 | train$Manager_Join_date[is.na(train$Manager_Join_date)] <- 71 | median(train$Manager_Join_date, na.rm = T) 72 | 73 | train$Manager_Join_month <- as.numeric(train$Manager_Join_month) 74 | train$Manager_Join_month[is.na(train$Manager_Join_month)] <- 75 | median(train$Manager_Join_month, na.rm = T) 76 | 77 | train$Manager_Join_year <- 
as.numeric(train$Manager_Join_year) 78 | train$Manager_Join_year[is.na(train$Manager_Join_year)] <- 79 | median(train$Manager_Join_year, na.rm = T) 80 | 81 | test <- separate(data = test, col = Manager_DOJ, 82 | into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year")) 83 | test$Manager_Join_date <- as.numeric(test$Manager_Join_date) 84 | test$Manager_Join_date[is.na(test$Manager_Join_date)] <- 85 | median(test$Manager_Join_date, na.rm = T) 86 | 87 | test$Manager_Join_month <- as.numeric(test$Manager_Join_month) 88 | test$Manager_Join_month[is.na(test$Manager_Join_month)] <- 89 | median(test$Manager_Join_month, na.rm = T) 90 | 91 | test$Manager_Join_year <- as.numeric(test$Manager_Join_year) 92 | test$Manager_Join_year[is.na(test$Manager_Join_year)] <- 93 | median(test$Manager_Join_year, na.rm = T) 94 | 95 | # Separating Manager_DoB into date, month, year 96 | # ---------------------------------------------------------- 97 | train <- separate(data = train, col = Manager_DoB, 98 | into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year")) 99 | train$Manager_Birth_date <- NULL 100 | train$Manager_Birth_month <- NULL 101 | train$Manager_Birth_year <- as.numeric(train$Manager_Birth_year) 102 | train$Manager_Birth_year[is.na(train$Manager_Birth_year)] <- 103 | median(train$Manager_Birth_year, na.rm = T) 104 | 105 | test <- separate(data = test, col = Manager_DoB, 106 | into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year")) 107 | test$Manager_Birth_date <- NULL 108 | test$Manager_Birth_month <- NULL 109 | test$Manager_Birth_year <- as.numeric(test$Manager_Birth_year) 110 | test$Manager_Birth_year[is.na(test$Manager_Birth_year)] <- 111 | median(test$Manager_Birth_year, na.rm = T) 112 | 113 | # Create Manager_Age variable 114 | # --------------------- 115 | train$Manager_Age <- (2008 - train$Manager_Birth_year) 116 | train$Manager_Birth_year <- NULL 117 | test$Manager_Age <- (2008 - test$Manager_Birth_year) 118 | 
test$Manager_Birth_year <- NULL

# Recode a character vector into numeric category codes.
#
# `labels_by_code[[k]]` holds the label(s) that map to code k. Labels are
# replaced code by code and the result is then coerced with as.numeric(),
# so a value that is neither listed nor numeric-looking ends up as NA
# (with the usual coercion warning).
recode_labels <- function(values, labels_by_code) {
  for (code in seq_along(labels_by_code)) {
    values[values %in% labels_by_code[[code]]] <- code
  }
  as.numeric(values)
}

# Encoding Applicant_Gender: F = 1, M = 2, blank = 3
# ----------------------------------------------------------
gender_labels <- list("F", "M", "")
train$Applicant_Gender <- recode_labels(train$Applicant_Gender, gender_labels)
test$Applicant_Gender <- recode_labels(test$Applicant_Gender, gender_labels)

# Encoding Applicant_Occupation:
# Salaried = 1, Business = 2, Others/blank = 3, Self Employed/Student = 4
# ----------------------------------------------------------
occupation_labels <- list(
  "Salaried",
  "Business",
  c("Others", ""),
  c("Self Employed", "Student")
)
train$Applicant_Occupation <-
  recode_labels(train$Applicant_Occupation, occupation_labels)
test$Applicant_Occupation <-
  recode_labels(test$Applicant_Occupation, occupation_labels)

# Encoding Applicant_Qualification
# ----------------------------------------------------------
# First step only: "Class XII" -> 1. The column stays character here; the
# remaining levels are handled by the statements that follow.
train$Applicant_Qualification <- replace(
  train$Applicant_Qualification,
  train$Applicant_Qualification == "Class XII",
  1
)
# Encoding Applicant_Qualification (continued)
# ----------------------------------------------------------
# Class XII = 1, Graduate = 2, Class X = 3, anything else = 4.
# The steps run in this order on the character column; a value that is
# already "1" (train's "Class XII" rows were recoded by the statement just
# before this point) is left untouched, so the function is safe for both sets.
encode_qualification <- function(qualification) {
  qualification[qualification == "Class XII"] <- 1
  qualification[qualification == "Graduate"] <- 2
  qualification[qualification == "Class X"] <- 3
  is_other <- qualification != 1 &
    qualification != 2 &
    qualification != 3
  qualification[is_other] <- 4
  as.numeric(qualification)
}

train$Applicant_Qualification <-
  encode_qualification(train$Applicant_Qualification)
test$Applicant_Qualification <-
  encode_qualification(test$Applicant_Qualification)

# Encoding Manager_Joining_Designation / Manager_Current_Designation
# ----------------------------------------------------------
# Level 1/Other = 1, Level 2 = 2, Level 3 = 3, Level 4 = 4,
# Level 5/6/7 = 5, everything else (including blank) = 0.
encode_designation <- function(designation) {
  level <- numeric(length(designation))
  level[designation %in% c("Level 1", "Other")] <- 1
  level[designation %in% "Level 2"] <- 2
  level[designation %in% "Level 3"] <- 3
  level[designation %in% "Level 4"] <- 4
  level[designation %in% c("Level 5", "Level 6", "Level 7")] <- 5
  level
}

train$Manager_Joining_Designation <-
  encode_designation(train$Manager_Joining_Designation)
test$Manager_Joining_Designation <-
  encode_designation(test$Manager_Joining_Designation)

train$Manager_Current_Designation <-
  encode_designation(train$Manager_Current_Designation)
test$Manager_Current_Designation <-
  encode_designation(test$Manager_Current_Designation)

# Creating Manager_Progress variable
# ----------------------------------
# Current minus joining level; rows whose joining designation is unknown
# (encoded 0) get the sentinel -2 instead of a meaningless difference.
train$Manager_Progress <-
  train$Manager_Current_Designation - train$Manager_Joining_Designation
train$Manager_Progress[train$Manager_Joining_Designation == 0] <- -2

test$Manager_Progress <-
  test$Manager_Current_Designation - test$Manager_Joining_Designation
test$Manager_Progress[test$Manager_Joining_Designation == 0] <- -2

# Encoding Manager_Status: Confirmation = 1, Probation = 2, blank = 3
# ----------------------------------------------------------
encode_status <- function(status) {
  status[status == "Confirmation"] <- 1
  status[status == "Probation"] <- 2
  status[status == ""] <- 3
  as.numeric(status)
}

train$Manager_Status <- encode_status(train$Manager_Status)
test$Manager_Status <- encode_status(test$Manager_Status)

# Imputing the numeric variables
# ----------------------------------------------------------
# Each rule returns the fill value for one column, computed from that
# column's observed values within the same data set.
manager_fill_rules <- list(
  Manager_Grade = function(x) median(x, na.rm = TRUE),
  Manager_Num_Application = function(x) 2.00,
  Manager_Num_Coded = function(x) mean(x, na.rm = TRUE),
  Manager_Business = function(x) mean(x, na.rm = TRUE),
  Manager_Num_Products = function(x) mean(x, na.rm = TRUE),
  Manager_Business2 = function(x) median(x, na.rm = TRUE),
  Manager_Num_Products2 = function(x) median(x, na.rm = TRUE)
)

# Fill only the cells that are actually missing in each column.
# (Indexing with !complete.cases(df) would also overwrite valid values in
# every row that has an NA in some unrelated column.)
impute_columns <- function(df, rules) {
  for (column in names(rules)) {
    values <- df[[column]]
    values[is.na(values)] <- rules[[column]](values)
    df[[column]] <- values
  }
  df
}

train <- impute_columns(train, manager_fill_rules)

# Manager_Business2 and Manager_Num_Products2 of the test set are imputed by
# the statements that immediately follow this section.
test <- impute_columns(
  test,
  manager_fill_rules[c(
    "Manager_Grade", "Manager_Num_Application", "Manager_Num_Coded",
    "Manager_Business", "Manager_Num_Products"
  )]
)
# Finish imputing the test-set manager metrics: fill only the cells that are
# missing in the column itself (not every row flagged by complete.cases()).
test$Manager_Business2[is.na(test$Manager_Business2)] <-
  median(test$Manager_Business2, na.rm = TRUE)
test$Manager_Num_Products2[is.na(test$Manager_Num_Products2)] <-
  median(test$Manager_Num_Products2, na.rm = TRUE)

# Add back the Target variable, Business_Sourced
# ----------------------------------------------
train$Business_Sourced <- train.y

###############################################################################

## PARAMETER TUNING (single 80/20 hold-out; disabled, kept for reference)

# split <- createDataPartition(train$Business_Sourced, p = 0.8, list = F)
# x_train <- train[split,]
# x_train.y <- train$Business_Sourced[split]
# x_test <- train[-split,]
# x_test.y <- train$Business_Sourced[-split]
#
# x_train <- sparse.model.matrix(Business_Sourced ~ ., data= x_train)
# x_test <- sparse.model.matrix(Business_Sourced ~ ., data = x_test)
#
# d_train <- xgb.DMatrix(data = x_train, label = x_train.y)
# d_test <- xgb.DMatrix(data = x_test, label = x_test.y)
# watchlist <- list(train=d_train, test=d_test)
#
# param <- list(  objective           = "binary:logistic",
#                 booster             = "gbtree",
#                 eval_metric         = "auc",
#                 subsample           = 0.8,
#                 min_child_weight    = 5,
#                 colsample_bytree    = 0.2,
#                 eta                 = 0.05,
#                 max_depth           = 8
# )
#
# clf <- xgb.train(   params              = param,
#                     data                = d_train,
#                     nrounds             = 300,
#                     verbose             = 2,
#                     watchlist           = watchlist
# )
#
# fold_pred <- predict(clf, x_test)
# pred <- prediction(fold_pred, x_test.y)
# auc <- performance(pred, measure = "auc")

###############################################################################

# XGBoost settings shared by the CV runs and the final model.
param <- list(
  objective = "binary:logistic",
  booster = "gbtree",
  eval_metric = "auc",
  subsample = 0.8,
  min_child_weight = 5,
  colsample_bytree = 0.2,
  eta = 0.05,
  max_depth = 8
)
n_rounds <- 150

## BUILD MODEL WITH STRATIFIED K-FOLD CV
# createFolds() stratifies on the factor target; each element of `folds`
# holds the row indices held out for that fold.
folds <- createFolds(as.factor(train$Business_Sourced), k = 5)
fold_auc <- numeric(length(folds))

for (i in seq_along(folds)) {
  fold <- folds[[i]]

  x_train <- train[-fold, ]
  x_train.y <- train$Business_Sourced[-fold]
  x_test <- train[fold, ]
  x_test.y <- train$Business_Sourced[fold]

  # Class balance of the two splits, to check the stratification.
  print("Split info")
  print(table(x_train$Business_Sourced) / nrow(x_train))
  print(table(x_test$Business_Sourced) / nrow(x_test))

  x_train <- sparse.model.matrix(Business_Sourced ~ ., data = x_train)
  x_test <- sparse.model.matrix(Business_Sourced ~ ., data = x_test)

  d_train <- xgb.DMatrix(data = x_train, label = x_train.y)
  d_test <- xgb.DMatrix(data = x_test, label = x_test.y)
  watchlist <- list(train = d_train, test = d_test)

  clf <- xgb.train(
    params = param,
    data = d_train,
    nrounds = n_rounds,
    verbose = 2,
    watchlist = watchlist
  )

  # AUC on the held-out fold.
  fold_pred <- predict(clf, x_test)
  pred <- prediction(fold_pred, x_test.y)
  auc <- performance(pred, measure = "auc")
  fold_auc[i] <- auc@y.values[[1]]
}

print(fold_auc)
########################################

## FINAL MODEL ON THE FULL TRAINING SET
# NOTE(review): sparse.model.matrix() builds a model frame, which by default
# drops rows that still contain NAs -- confirm none remain, otherwise the
# labels / test IDs will no longer line up with the matrices.
train <- sparse.model.matrix(Business_Sourced ~ ., data = train)
dtrain <- xgb.DMatrix(data = train, label = train.y)
watchlist <- list(train = dtrain)

# Train on `dtrain` (all rows). Using `d_train` here would silently reuse the
# last CV fold's 80% subset left over from the loop above.
clf <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = n_rounds,
  verbose = 2,
  watchlist = watchlist
)

# The formula interface needs a response column; -1 is only a placeholder.
test$target <- -1
test <- sparse.model.matrix(target ~ ., data = test)

preds <- predict(clf, test)
submission <- data.frame(ID = test.id, Business_Sourced = preds)
# Write the predictions to Submissions/submission.csv.
# `Submissions` is listed in .gitignore, so the directory is absent in a fresh
# checkout; create it first so the script does not fail after training.
# dir.create() leaves an existing directory untouched.
dir.create("Submissions", showWarnings = FALSE)

cat("saving the submission file\n")
write.csv(submission, "Submissions/submission.csv", row.names = FALSE)