├── .gitignore
├── README.md
├── RedundantScripts
    └── h2o_model.R
├── log.txt
└── model_building.R


/.gitignore:
--------------------------------------------------------------------------------
1 | *.Rproj*
2 | *.csv
3 | Submissions
4 | .Rhistory
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The Smart Recruits
 2 | ## Data Science competition by Analytics Vidhya
 3 | 
 4 | Private LB: Top 15% finish (47/379 participants)
 5 | 
 6 | - Score on private LB: AUC = 0.615203707604
 7 | - Final model is trained using the XGBoost algorithm.
 8 | - Used 5-fold CV
 9 | 
10 | _Associated blog post: [Data Science Competitions 101: Anatomy and Approach](https://techandmortals.wordpress.com/2016/07/27/data-science-competitions-101-anatomy-and-approach/)_
11 | 


--------------------------------------------------------------------------------
/RedundantScripts/h2o_model.R:
--------------------------------------------------------------------------------
  1 | library(xgboost)
  2 | library(Matrix)
  3 | library(caret)
  4 | library(ROCR)
  5 | library(tidyr)
  6 | library(mice)
  7 | 
  8 | set.seed(1234)
  9 | 
 10 | train <- read.csv(file = "Train_pjb2QcD.csv", stringsAsFactors = FALSE)
 11 | test <- read.csv(file = "Test_wyCirpO.csv", stringsAsFactors = FALSE)
 12 | 
 13 | test.id <- test$ID  # reused as the ID column of the submission file
 14 | train.id <- train$ID
 15 | train.y <- train$Business_Sourced  # target; re-attached to train further down
 16 | 
 17 | #########################################################################################
 18 | ## FEATURE ENGINEERING
 19 | ## -------------------
 20 | ## (the feature-engineering steps that follow are all commented out in this script)
 21 | train <- train[,-c(1, 4)]  # drop columns 1 and 4 by position
 22 | test <- test[,-c(1, 4)]
 23 | 
 24 | train$Business_Sourced <- NULL  # removed here, added back from train.y later
 25 | 
 26 | # Separating Application_Receipt_Date into date, month, year
 27 | # ----------------------------------------------------------
 28 | # train <- separate(data = train, col = Application_Receipt_Date, 
 29 | #                   into = c("Receipt_month", "Receipt_date", "Receipt_year"))
 30 | # train$Receipt_date <- as.numeric(train$Receipt_date)
 31 | # train$Receipt_month <- as.numeric(train$Receipt_month)
 32 | # train$Receipt_year <- as.numeric(train$Receipt_year)
 33 | # 
 34 | # test <- separate(data = test, col = Application_Receipt_Date, 
 35 | #                  into = c("Receipt_month", "Receipt_date", "Receipt_year"))
 36 | # test$Receipt_date <- as.numeric(test$Receipt_date)
 37 | # test$Receipt_month <- as.numeric(test$Receipt_month)
 38 | # test$Receipt_year <- as.numeric(test$Receipt_year)
 39 | # 
 40 | # # Separating Applicant_BirthDate into date, month, year
 41 | # # ----------------------------------------------------------
 42 | # train <- separate(data = train, col = Applicant_BirthDate, 
 43 | #                   into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year"))
 44 | # # train$Applicant_Birth_date <- as.numeric(train$Applicant_Birth_date)
 45 | # # train$Applicant_Birth_date[is.na(train$Applicant_Birth_date)] <- 
 46 | # #     as.numeric(names(which.max((table(train$Applicant_Birth_date)))))
 47 | # # 
 48 | # # train$Applicant_Birth_month <- as.numeric(train$Applicant_Birth_month)
 49 | # # train$Applicant_Birth_month[is.na(train$Applicant_Birth_month)] <- 
 50 | # #     as.numeric(names(which.max((table(train$Applicant_Birth_month)))))
 51 | # train$Applicant_Birth_date <- NULL
 52 | # train$Applicant_Birth_month <- NULL
 53 | # train$Applicant_Birth_year <- as.numeric(train$Applicant_Birth_year)
 54 | # train$Applicant_Birth_year[is.na(train$Applicant_Birth_year)] <- 
 55 | #     as.numeric(names(which.max((table(train$Applicant_Birth_year)))))
 56 | # 
 57 | # test <- separate(data = test, col = Applicant_BirthDate, 
 58 | #                  into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year"))
 59 | # # test$Applicant_Birth_date <- as.numeric(test$Applicant_Birth_date)
 60 | # # test$Applicant_Birth_date[is.na(test$Applicant_Birth_date)] <- 
 61 | # #     as.numeric(names(which.max((table(test$Applicant_Birth_date)))))
 62 | # # 
 63 | # # test$Applicant_Birth_month <- as.numeric(test$Applicant_Birth_month)
 64 | # # test$Applicant_Birth_month[is.na(test$Applicant_Birth_month)] <- 
 65 | # #     as.numeric(names(which.max((table(test$Applicant_Birth_month)))))
 66 | # test$Applicant_Birth_date <- NULL
 67 | # test$Applicant_Birth_month <- NULL
 68 | # test$Applicant_Birth_year <- as.numeric(test$Applicant_Birth_year)
 69 | # test$Applicant_Birth_year[is.na(test$Applicant_Birth_year)] <- 
 70 | #     as.numeric(names(which.max((table(test$Applicant_Birth_year)))))
 71 | # 
 72 | # # Create Applicant_age variable
 73 | # # ---------------------
 74 | # train$Applicant_Age <- (2008 - train$Applicant_Birth_year)
 75 | # train$Applicant_Birth_year <- NULL
 76 | # test$Applicant_Age <- (2008 - test$Applicant_Birth_year)
 77 | # test$Applicant_Birth_year <- NULL
 78 | # 
 79 | # # Separating Manager_DOJ into date, month, year
 80 | # # ----------------------------------------------------------
 81 | # train <- separate(data = train, col = Manager_DOJ, 
 82 | #                   into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year"))
 83 | # train$Manager_Join_date <- as.numeric(train$Manager_Join_date)
 84 | # train$Manager_Join_date[is.na(train$Manager_Join_date)] <- 
 85 | #     median(train$Manager_Join_date, na.rm = T)
 86 | # 
 87 | # train$Manager_Join_month <- as.numeric(train$Manager_Join_month)
 88 | # train$Manager_Join_month[is.na(train$Manager_Join_month)] <- 
 89 | #     median(train$Manager_Join_month, na.rm = T)
 90 | # 
 91 | # # train$Manager_Join_date <- NULL
 92 | # # train$Manager_Join_month <- NULL
 93 | # train$Manager_Join_year <- as.numeric(train$Manager_Join_year)
 94 | # train$Manager_Join_year[is.na(train$Manager_Join_year)] <- 
 95 | #     median(train$Manager_Join_year, na.rm = T)
 96 | # 
 97 | # test <- separate(data = test, col = Manager_DOJ, 
 98 | #                  into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year"))
 99 | # test$Manager_Join_date <- as.numeric(test$Manager_Join_date)
100 | # test$Manager_Join_date[is.na(test$Manager_Join_date)] <- 
101 | #     median(test$Manager_Join_date, na.rm = T)
102 | # 
103 | # test$Manager_Join_month <- as.numeric(test$Manager_Join_month)
104 | # test$Manager_Join_month[is.na(test$Manager_Join_month)] <- 
105 | #     median(test$Manager_Join_month, na.rm = T)
106 | # 
107 | # # test$Manager_Join_date <- NULL
108 | # # test$Manager_Join_month <- NULL
109 | # test$Manager_Join_year <- as.numeric(test$Manager_Join_year)
110 | # test$Manager_Join_year[is.na(test$Manager_Join_year)] <- 
111 | #     median(test$Manager_Join_year, na.rm = T)
112 | # 
113 | # # Create Manager_Experience variable
114 | # # ----------------------------------
115 | # # train$Manager_Experience <- (2008 - train$Manager_Join_year)
116 | # # train$Manager_Join_year <- NULL
117 | # # test$Manager_Experience <- (2008 - test$Manager_Join_year)
118 | # # test$Manager_Join_year <- NULL
119 | # 
120 | # # Separating Manager_DoB into date, month, year
121 | # # ----------------------------------------------------------
122 | # train <- separate(data = train, col = Manager_DoB, 
123 | #                   into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year"))
124 | # # train$Manager_Birth_date <- as.numeric(train$Manager_Birth_date)
125 | # # train$Manager_Birth_date[is.na(train$Manager_Birth_date)] <- 
126 | # #     median(train$Manager_Birth_date, na.rm = T)
127 | # # 
128 | # # train$Manager_Birth_month <- as.numeric(train$Manager_Birth_month)
129 | # # train$Manager_Birth_month[is.na(train$Manager_Birth_month)] <- 
130 | # #     median(train$Manager_Birth_month, na.rm = T)
131 | # 
132 | # train$Manager_Birth_date <- NULL
133 | # train$Manager_Birth_month <- NULL
134 | # train$Manager_Birth_year <- as.numeric(train$Manager_Birth_year)
135 | # train$Manager_Birth_year[is.na(train$Manager_Birth_year)] <- 
136 | #     median(train$Manager_Birth_year, na.rm = T)
137 | # 
138 | # test <- separate(data = test, col = Manager_DoB, 
139 | #                  into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year"))
140 | # # test$Manager_Birth_date <- as.numeric(test$Manager_Birth_date)
141 | # # test$Manager_Birth_date[is.na(test$Manager_Birth_date)] <- 
142 | # #     median(test$Manager_Birth_date, na.rm = T)
143 | # # 
144 | # # test$Manager_Birth_month <- as.numeric(test$Manager_Birth_month)
145 | # # test$Manager_Birth_month[is.na(test$Manager_Birth_month)] <- 
146 | # #     median(test$Manager_Birth_month, na.rm = T)
147 | # 
148 | # test$Manager_Birth_date <- NULL
149 | # test$Manager_Birth_month <- NULL
150 | # test$Manager_Birth_year <- as.numeric(test$Manager_Birth_year)
151 | # test$Manager_Birth_year[is.na(test$Manager_Birth_year)] <- 
152 | #     median(test$Manager_Birth_year, na.rm = T)
153 | # 
154 | # # Create Manager_Age variable
155 | # # ---------------------
156 | # train$Manager_Age <- (2008 - train$Manager_Birth_year)
157 | # train$Manager_Birth_year <- NULL
158 | # test$Manager_Age <- (2008 - test$Manager_Birth_year)
159 | # test$Manager_Birth_year <- NULL
160 | # 
161 | # # Encoding Applicant_Gender
162 | # # ----------------------------------------------------------
163 | # train$Applicant_Gender[train$Applicant_Gender == "F"] <- 1
164 | # train$Applicant_Gender[train$Applicant_Gender == "M"] <- 2
165 | # train$Applicant_Gender[train$Applicant_Gender == ""] <- 3
166 | # train$Applicant_Gender <- as.numeric(train$Applicant_Gender)
167 | # 
168 | # test$Applicant_Gender[test$Applicant_Gender == "F"] <- 1
169 | # test$Applicant_Gender[test$Applicant_Gender == "M"] <- 2
170 | # test$Applicant_Gender[test$Applicant_Gender == ""] <- 3
171 | # test$Applicant_Gender <- as.numeric(test$Applicant_Gender)
172 | # 
173 | # # train$Female <- as.numeric(train$Applicant_Gender != "M")
174 | # # train$Male <- as.numeric(!train$Female)
175 | # # test$Female <- as.numeric(test$Applicant_Gender != "M")
176 | # # test$Male <- as.numeric(!test$Female)
177 | # # 
178 | # # train$Applicant_Gender <- NULL
179 | # # test$Applicant_Gender <- NULL
180 | # 
181 | # # Encoding Applicant_Marital_Status
182 | # # ----------------------------------------------------------
183 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "M"] <- 1
184 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "S"] <- 2
185 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "D"] <- 3
186 | # train$Applicant_Marital_Status[train$Applicant_Marital_Status == "W" |
187 | #                                    train$Applicant_Marital_Status == "D" |
188 | #                                    train$Applicant_Marital_Status == ""] <- 3
189 | # train$Applicant_Marital_Status <- as.numeric(train$Applicant_Marital_Status)
190 | # 
191 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "M"] <- 1
192 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "S"] <- 2
193 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "D"] <- 3
194 | # test$Applicant_Marital_Status[test$Applicant_Marital_Status == "W" |
195 | #                                   test$Applicant_Marital_Status == "D" |
196 | #                                   test$Applicant_Marital_Status == ""] <- 3
197 | # test$Applicant_Marital_Status <- as.numeric(test$Applicant_Marital_Status)
198 | # 
199 | # # Encoding Applicant_Occupation
200 | # # ----------------------------------------------------------
201 | # train$Applicant_Occupation[train$Applicant_Occupation == "Salaried"] <- 1
202 | # train$Applicant_Occupation[train$Applicant_Occupation == "Business"] <- 2
203 | # train$Applicant_Occupation[train$Applicant_Occupation == "Others" |
204 | #                                train$Applicant_Occupation == ""] <- 3
205 | # train$Applicant_Occupation[train$Applicant_Occupation == "Self Employed" |
206 | #                                train$Applicant_Occupation == "Student"] <- 4
207 | # train$Applicant_Occupation <- as.numeric(train$Applicant_Occupation)
208 | # 
209 | # test$Applicant_Occupation[test$Applicant_Occupation == "Salaried"] <- 1
210 | # test$Applicant_Occupation[test$Applicant_Occupation == "Business"] <- 2
211 | # test$Applicant_Occupation[test$Applicant_Occupation == "Others" |
212 | #                               test$Applicant_Occupation == ""] <- 3
213 | # test$Applicant_Occupation[test$Applicant_Occupation == "Self Employed" |
214 | #                               test$Applicant_Occupation == "Student"] <- 4
215 | # test$Applicant_Occupation <- as.numeric(test$Applicant_Occupation)
216 | # 
217 | # # Encoding Applicant_Qualification
218 | # # ----------------------------------------------------------
219 | # train$Applicant_Qualification[train$Applicant_Qualification == "Class XII"] <- 1
220 | # train$Applicant_Qualification[train$Applicant_Qualification == "Graduate"] <- 2
221 | # train$Applicant_Qualification[train$Applicant_Qualification == "Class X"] <- 3
222 | # train$Applicant_Qualification[train$Applicant_Qualification != 1 &
223 | #                                   train$Applicant_Qualification != 2 &
224 | #                                   train$Applicant_Qualification != 3] <- 4
225 | # train$Applicant_Qualification <- as.numeric(train$Applicant_Qualification)
226 | # 
227 | # test$Applicant_Qualification[test$Applicant_Qualification == "Class XII"] <- 1
228 | # test$Applicant_Qualification[test$Applicant_Qualification == "Graduate"] <- 2
229 | # test$Applicant_Qualification[test$Applicant_Qualification == "Class X"] <- 3
230 | # test$Applicant_Qualification[test$Applicant_Qualification != 1 &
231 | #                                  test$Applicant_Qualification != 2 &
232 | #                                  test$Applicant_Qualification != 3] <- 4
233 | # test$Applicant_Qualification <- as.numeric(test$Applicant_Qualification)
234 | # 
235 | # # Encoding Manager_Joining_Designation
236 | # # ----------------------------------------------------------
237 | # temp_joining_des <- train$Manager_Joining_Designation
238 | # train$Manager_Joining_Designation <- 0
239 | # train$Manager_Joining_Designation[temp_joining_des == "Level 1"|
240 | #                                       temp_joining_des == "Other"] <- 1
241 | # train$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2
242 | # train$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3
243 | # train$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4
244 | # train$Manager_Joining_Designation[temp_joining_des == "Level 5" |
245 | #                                       temp_joining_des == "Level 6" |
246 | #                                       temp_joining_des == "Level 7"] <- 5
247 | # rm(temp_joining_des)
248 | # 
249 | # temp_joining_des <- test$Manager_Joining_Designation
250 | # test$Manager_Joining_Designation <- 0
251 | # test$Manager_Joining_Designation[temp_joining_des == "Level 1"|
252 | #                                      temp_joining_des == "Other"] <- 1
253 | # test$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2
254 | # test$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3
255 | # test$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4
256 | # test$Manager_Joining_Designation[temp_joining_des == "Level 5" |
257 | #                                      temp_joining_des == "Level 6" |
258 | #                                      temp_joining_des == "Level 7"] <- 5
259 | # rm(temp_joining_des)
260 | # 
261 | # # Encoding Manager_Current_Designation
262 | # # ----------------------------------------------------------
263 | # temp_current_des <- train$Manager_Current_Designation
264 | # train$Manager_Current_Designation <- 0
265 | # train$Manager_Current_Designation[temp_current_des == "Level 1"|
266 | #                                       temp_current_des == "Other"] <- 1
267 | # train$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2
268 | # train$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3
269 | # train$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4
270 | # train$Manager_Current_Designation[temp_current_des == "Level 5" |
271 | #                                       temp_current_des == "Level 6" |
272 | #                                       temp_current_des == "Level 7"] <- 5
273 | # rm(temp_current_des)
274 | # 
275 | # temp_current_des <- test$Manager_Current_Designation
276 | # test$Manager_Current_Designation <- 0
277 | # test$Manager_Current_Designation[temp_current_des == "Level 1" |
278 | #                                      temp_current_des == "Other"] <- 1
279 | # test$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2
280 | # test$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3
281 | # test$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4
282 | # test$Manager_Current_Designation[temp_current_des == "Level 5" |
283 | #                                      temp_current_des == "Level 6" |
284 | #                                      temp_current_des == "Level 7"] <- 5
285 | # rm(temp_current_des)
286 | # 
287 | # # Creating Manager_Progress variable
288 | # # ----------------------------------
289 | # train$Manager_Progress <- (train$Manager_Current_Designation - 
290 | #                                train$Manager_Joining_Designation)
291 | # train$Manager_Progress[train$Manager_Joining_Designation == 0] <- -2
292 | # 
293 | # test$Manager_Progress <- (test$Manager_Current_Designation - 
294 | #                               test$Manager_Joining_Designation)
295 | # test$Manager_Progress[test$Manager_Joining_Designation == 0] <- -2
296 | # 
297 | # # Encoding Manager_Status
298 | # # ----------------------------------------------------------
299 | # train$Manager_Status[train$Manager_Status == "Confirmation"] <- 1
300 | # train$Manager_Status[train$Manager_Status == "Probation"] <- 2
301 | # train$Manager_Status[train$Manager_Status == ""] <- 3
302 | # train$Manager_Status <- as.numeric(train$Manager_Status)
303 | # 
304 | # test$Manager_Status[test$Manager_Status == "Confirmation"] <- 1
305 | # test$Manager_Status[test$Manager_Status == "Probation"] <- 2
306 | # test$Manager_Status[test$Manager_Status == ""] <- 3
307 | # test$Manager_Status <- as.numeric(test$Manager_Status)
308 | # 
309 | # # Encoding Manager_Gender
310 | # # ----------------------------------------------------------
311 | # train$Manager_Gender[train$Manager_Gender == "F"] <- 1
312 | # train$Manager_Gender[train$Manager_Gender == "M"] <- 2
313 | # train$Manager_Gender[train$Manager_Gender == ""] <- 3
314 | # train$Manager_Gender <- as.numeric(train$Manager_Gender)
315 | # 
316 | # test$Manager_Gender[test$Manager_Gender == "F"] <- 1
317 | # test$Manager_Gender[test$Manager_Gender == "M"] <- 2
318 | # test$Manager_Gender[test$Manager_Gender == ""] <- 3
319 | # test$Manager_Gender <- as.numeric(test$Manager_Gender)
320 | # 
321 | # # Imputing the numeric variables
322 | # # ----------------------------------------------------------
323 | # train$Manager_Grade[!complete.cases(train)] <- median(train$Manager_Grade, na.rm = T)
324 | # train$Manager_Num_Application[!complete.cases(train)] <- 2.00
325 | # train$Manager_Num_Coded[!complete.cases(train)] <- mean(train$Manager_Num_Coded, na.rm = T)
326 | # train$Manager_Business[!complete.cases(train)] <- mean(train$Manager_Business, na.rm = T)
327 | # train$Manager_Num_Products[!complete.cases(train)] <- mean(train$Manager_Num_Products, 
328 | #                                                              na.rm = T)
329 | # train$Manager_Business2[!complete.cases(train)] <- median(train$Manager_Business2, 
330 | #                                                           na.rm = T)
331 | # train$Manager_Num_Products2[!complete.cases(train)] <- median(train$Manager_Num_Products2, 
332 | #                                                           na.rm = T)
333 | # 
334 | # test$Manager_Grade[!complete.cases(test)] <- median(test$Manager_Grade, na.rm = T)
335 | # test$Manager_Num_Application[!complete.cases(test)] <- 2.00
336 | # test$Manager_Num_Coded[!complete.cases(test)] <- mean(test$Manager_Num_Coded, na.rm = T)
337 | # test$Manager_Business[!complete.cases(test)] <- mean(test$Manager_Business, na.rm = T)
338 | # test$Manager_Num_Products[!complete.cases(test)] <- mean(test$Manager_Num_Products, 
339 | #                                                            na.rm = T)
340 | # test$Manager_Business2[!complete.cases(test)] <- median(test$Manager_Business2, 
341 | #                                                           na.rm = T)
342 | # test$Manager_Num_Products2[!complete.cases(test)] <- median(test$Manager_Num_Products2, 
343 | #                                                               na.rm = T)
344 | # names_numeric <- names(train[,c(17:22)])
345 | # 
346 | # train_init = mice(train[,names_numeric], maxit = 0)
347 | # meth = train_init$method
348 | # predM = train_init$predictorMatrix
349 | # meth[names_numeric] = "rf"
350 | # imputed = mice(train[,names_numeric], method=meth, predictorMatrix=predM, m=5)
351 | # imputed <- complete(imputed)
352 | # train[,names_numeric] <- imputed
353 | # 
354 | # test_init = mice(test[,names_numeric], maxit = 0)
355 | # meth = test_init$method
356 | # predM = test_init$predictorMatrix
357 | # meth[names_numeric] = "rf"
358 | # imputed = mice(test[,names_numeric], method=meth, predictorMatrix=predM, m=5)
359 | # imputed <- complete(imputed)
360 | # test[,names_numeric] <- imputed
361 | 
362 | # Create variables for business from Category A advisor
363 | # -----------------------------------------------------
364 | # train$Manager_Business1 <- (train$Manager_Business - train$Manager_Business2)
365 | # train$Manager_Num_Products1 <- (train$Manager_Num_Products - train$Manager_Num_Products2)
366 | # 
367 | # test$Manager_Business1 <- (test$Manager_Business - test$Manager_Business2)
368 | # test$Manager_Num_Products1 <- (test$Manager_Num_Products - test$Manager_Num_Products2)
369 | 
370 | # ----------------------------------------------
371 | # train$Manager_Gender <- NULL
372 | # test$Manager_Gender <- NULL
373 | # train$Applicant_Marital_Status <- NULL
374 | # test$Applicant_Marital_Status <- NULL
375 | 
376 | # ----------------------------------------------
377 | # train$Applicant_City_PIN[is.na(train$Applicant_City_PIN)] <- 
378 | #     mean(train$Applicant_City_PIN, na.rm = T)
379 | # test$Applicant_City_PIN[is.na(test$Applicant_City_PIN)] <- 
380 | #     mean(test$Applicant_City_PIN, na.rm = T)
381 | 
382 | # Add back the Target variable, Business_Sourced
383 | # ----------------------------------------------
384 | train$Business_Sourced <- train.y
385 | 
386 | ###########################################################################################
387 | 
388 | # Local validation split of the labelled data (p = 0.7 goes to `training`).
389 | set.seed(21)
390 | split <- createDataPartition(train$Business_Sourced, p = 0.7, list = FALSE)
391 | training <- train[split,]
392 | testing <- train[-split,]
393 | rm(split)
394 | 
395 | library(h2o)
396 | h2o.init(nthreads = -1)
397 | training.hex <- as.h2o(training)
398 | testing.hex <- as.h2o(testing)
399 | test.hex <- as.h2o(test)
400 | 
401 | # Pick predictors by name instead of a hard-coded column position, so the
402 | # target is excluded regardless of how many feature columns the frame has.
403 | features <- setdiff(colnames(training), "Business_Sourced")
404 | gbm_model1 <- h2o.gbm(x = features,
405 |                       y = "Business_Sourced",
406 |                       training_frame = training.hex,
407 |                       ntrees = 30,
408 |                       max_depth = 6,
409 |                       col_sample_rate = 0.2)
410 | 
411 | # Score the held-out rows and report their AUC.
412 | pred_gbm_h2o <- as.data.frame(h2o.predict(gbm_model1,newdata = testing.hex))
413 | # table(pred_gbm_h2o$predict > 0.5, testing$target)
414 | # 
415 | library("pROC")
416 | auc(testing$Business_Sourced, pred_gbm_h2o$predict)
417 | 
418 | h2o.varimp(gbm_model1)
419 | 
420 | # Score the competition test set and write the submission file.
421 | final_pred_gbm <- as.data.frame(h2o.predict(gbm_model1, newdata = test.hex))
422 | 
423 | submission <- data.frame(ID=test.id, Business_Sourced=final_pred_gbm$predict)
424 | # Submissions/ is git-ignored, so it does not exist in a fresh checkout.
425 | dir.create("Submissions", showWarnings = FALSE)
426 | write.csv(submission, "Submissions/submission_h202.csv", row.names = FALSE)
427 | 


--------------------------------------------------------------------------------
/log.txt:
--------------------------------------------------------------------------------
  1 | Log.txt
  2 | 
  3 | 1 -------------------
  4 | > print(fold_auc)
  5 | [1] 0.5585568 0.5758749 0.5922487 0.5837894 0.5647606
  6 | > mean(fold_auc)
  7 | [1] 0.5750461
  8 | > sd(fold_auc)
  9 | [1] 0.0136999
 10 | 
 11 | Public LB: 0.52
 12 | 
 13 | 2 ------------------
 14 | Added Male, Female variables
 15 | Removed Applicant_Gender
 16 | Changed nrounds to 125 from 150
 17 | 
 18 | > print(fold_auc)
 19 | [1] 0.5571820 0.5794472 0.5931177 0.5792398 0.5634502
 20 | > mean(fold_auc)
 21 | [1] 0.5744874
 22 | > sd(fold_auc)
 23 | [1] 0.01427908
 24 | 
 25 | Public LB: 0.53063342592
 26 | 
 27 | 3 ------------------
 28 | Encoded variables
 29 | 
 30 | > print(fold_auc)
 31 | [1] 0.5803248 0.5967548 0.5968560 0.5836218 0.5890557
 32 | > sd(fold_auc)
 33 | [1] 0.007508699
 34 | > mean(fold_auc)
 35 | [1] 0.5893226
 36 | 
 37 | 4 ------------------
 38 | Changed depth from 5 to 8
 39 | 
 40 | > print(fold_auc)
 41 | [1] 0.5717975 0.6062098 0.6135852 0.5938706 0.5818983
 42 | > mean(fold_auc)
 43 | [1] 0.5934723
 44 | > sd(fold_auc)
 45 | [1] 0.01710765
 46 | 
 47 | 5 -----------------
 48 | Changed depth to 2
 49 | 
 50 | > print(fold_auc)
 51 | [1] 0.5776977 0.5700044 0.5881579 0.5864932 0.5750182
 52 | > mean(fold_auc)
 53 | [1] 0.5794743
 54 | > sd(fold_auc)
 55 | [1] 0.007703269
 56 | 
 57 | Public LB: 0.50
 58 | 
 59 | 6 -----------------
 60 | Included Applicant Date variables
 61 | 
 62 | > print(fold_auc)
 63 | [1] 0.5898055 0.6228925 0.6145244 0.6084384 0.6064969
 64 | > mean(fold_auc)
 65 | [1] 0.6084315
 66 | > sd(fold_auc)
 67 | [1] 0.01221593
 68 | 
 69 | Public LB: 0.54
 70 | 
 71 | 7 -----------------
 72 | Removed Applicant BirthDate info; Created an Age variable instead
 73 | 
 74 | > print(fold_auc)
 75 | [1] 0.5886343 0.6196701 0.6181840 0.6079076 0.6105778
 76 | > mean(fold_auc)
 77 | [1] 0.6089948
 78 | > sd(fold_auc)
 79 | [1] 0.01241613
 80 | 
 81 | Public LB: 0.578666756995
 82 | 
 83 | 8 ------------------
 84 | Added Manager Join Date info
 85 | 
 86 | > print(fold_auc)
 87 | [1] 0.6042491 0.6308430 0.6204544 0.6037996 0.6208891
 88 | > mean(fold_auc)
 89 | [1] 0.6160471
 90 | > sd(fold_auc)
 91 | [1] 0.0117365
 92 | 
 93 | public LB: 0.58
 94 | 
 95 | 9 -------------------
 96 | Used Manager Birth Date info to add Manager_Age
 97 | 
 98 | > print(fold_auc)
 99 | [1] 0.6017031 0.6328631 0.6249353 0.6072667 0.6252992
100 | > mean(fold_auc)
101 | [1] 0.6184135
102 | > sd(fold_auc)
103 | [1] 0.01324977
104 | 
105 | public LB: 0.57
106 | 
107 | 10 ------------------
108 | Added Office_PIN variable
109 | 
110 | > print(fold_auc)
111 | [1] 0.6193239 0.6510257 0.6389947 0.6185533 0.6404322
112 | > mean(fold_auc)
113 | [1] 0.633666
114 | > sd(fold_auc)
115 | [1] 0.01422695
116 | 
117 | public LB: 0.597
118 | 
119 | 11 ------------------
120 | Removed Manager Gender and Applicant Marital Status
121 | Added Business by Class A Advisor
122 | 
123 | > print(fold_auc)
124 | [1] 0.6193533 0.6516049 0.6378345 0.6170152 0.6399937
125 | > mean(fold_auc)
126 | [1] 0.6331603
127 | > sd(fold_auc)
128 | [1] 0.01466326
129 | 
130 | public LB: 0.59
131 | 
132 | 12 -------------------
133 | Added ApplicantCity_PIN
134 | 
135 | > print(fold_auc)
136 | [1] 0.6039560 0.6296482 0.6273919 0.6218840 0.6367833
137 | > mean(fold_auc)
138 | [1] 0.6239327
139 | > sd(fold_auc)
140 | [1] 0.01237979
141 | 
142 | public LB: 0.58
143 | 
144 | 13 -------------------
145 | Tuned Hyperparameters using Davut Polat's approach described here:
146 | https://www.kaggle.com/c/bnp-paribas-cardif-claims-management/forums/t/19083/best-practices-for-parameter-tuning-on-models/108783#post108783
147 | 
148 | > print(fold_auc)
149 | [1] 0.6369154 0.6348540 0.6456727 0.6688282 0.6384908
150 | > mean(fold_auc)
151 | [1] 0.6449522
152 | > sd(fold_auc)
153 | [1] 0.01395436
154 | 
155 | public LB: 0.602131291918
156 | 
157 | 14 --------------------
158 | Changed nrounds from 200 to 150
159 | 
160 | > print(fold_auc)
161 | [1] 0.6430740 0.6578375 0.6417713 0.6356861 0.6579393
162 | > mean(fold_auc)
163 | [1] 0.6472617
164 | > sd(fold_auc)
165 | [1] 0.01009375
166 | 
167 | public LB: 0.59
168 | 
169 | 15 ---------------------
170 | Added Manager_Progress variable
171 | 
172 | > print(fold_auc)
173 | [1] 0.6363332 0.6566584 0.6507521 0.6306357 0.6500852
174 | > mean(fold_auc)
175 | [1] 0.6448929
176 | > sd(fold_auc)
177 | [1] 0.01091162
178 | 
179 | public LB: 0.616988460583
180 | 
181 | 16 ----------------------
182 | Created Manager_Experience variable
183 | 
184 | > print(fold_auc)
185 | [1] 0.6363271 0.6504765 0.6544330 0.6309831 0.6527037
186 | > mean(fold_auc)
187 | [1] 0.6449847
188 | > sd(fold_auc)
189 | [1] 0.01060677
190 | 
191 | public LB: 0.59
192 | 
193 | 17 -----------------------
194 | Used MICE imputation for numeric variables
195 | 
196 | > print(fold_auc)
197 | [1] 0.6507499 0.6268592 0.6239240 0.6519003 0.6439642
198 | > mean(fold_auc)
199 | [1] 0.6394795
200 | > sd(fold_auc)
201 | [1] 0.01325382
202 | 
203 | public LB: 0.57
204 | 
205 | 18 -----------------------
206 | Used MICE imputation with 'rf' mode
207 | 
208 | > print(fold_auc)
209 | [1] 0.6441430 0.6416233 0.6237396 0.6404840 0.6581744
210 | > mean(fold_auc)
211 | [1] 0.6416329
212 | > sd(fold_auc)
213 | [1] 0.01226186
214 | 
215 | public LB: 0.597276155635
216 | 
217 | 19 -----------------------
218 | Removed Manager_Experience
219 | Used MICE imputation with 'rf' mode
220 | Changed nrounds to 175 from 150
221 | 
222 | > print(fold_auc)
223 | [1] 0.6549643 0.6610666 0.6517388 0.6380177 0.6282266
224 | > mean(fold_auc)
225 | [1] 0.6468028
226 | > sd(fold_auc)
227 | [1] 0.01338547
228 | 
229 | public LB: 0.597232798139


--------------------------------------------------------------------------------
/model_building.R:
--------------------------------------------------------------------------------
  1 | library(xgboost)
  2 | library(Matrix)
  3 | library(caret)
  4 | library(ROCR)
  5 | library(tidyr)
  6 | library(mice)
  7 | 
  8 | set.seed(1234)
  9 | 
 10 | train <- read.csv(file = "Data/Train_pjb2QcD.csv", stringsAsFactors = F)
 11 | test <- read.csv(file = "Data/Test_wyCirpO.csv", stringsAsFactors = F)
 12 | 
 13 | test.id <- test$ID
 14 | train.id <- train$ID
 15 | train.y <- train$Business_Sourced
 16 | 
 17 | #########################################################################################
 18 | ## FEATURE ENGINEERING
 19 | ## -------------------
 20 | ## -------------------
 21 | train <- train[,-c(1, 4, 7, 15)]
 22 | test <- test[,-c(1, 4, 7, 15)]
 23 | 
 24 | train$Business_Sourced <- NULL
 25 | 
 26 | # Separating Application_Receipt_Date into date, month, year
 27 | # ----------------------------------------------------------
 28 | train <- separate(data = train, col = Application_Receipt_Date, 
 29 |          into = c("Receipt_month", "Receipt_date", "Receipt_year"))
 30 | train$Receipt_date <- as.numeric(train$Receipt_date)
 31 | train$Receipt_month <- as.numeric(train$Receipt_month)
 32 | train$Receipt_year <- as.numeric(train$Receipt_year)
 33 | 
 34 | test <- separate(data = test, col = Application_Receipt_Date, 
 35 |                   into = c("Receipt_month", "Receipt_date", "Receipt_year"))
 36 | test$Receipt_date <- as.numeric(test$Receipt_date)
 37 | test$Receipt_month <- as.numeric(test$Receipt_month)
 38 | test$Receipt_year <- as.numeric(test$Receipt_year)
 39 | 
 40 | # Separating Applicant_BirthDate into date, month, year; only the year is kept
 41 | # ----------------------------------------------------------
 42 | train <- separate(data = train, col = Applicant_BirthDate, 
 43 |                   into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year"))
 44 | train$Applicant_Birth_date <- NULL
 45 | train$Applicant_Birth_month <- NULL
 46 | train$Applicant_Birth_year <- as.numeric(train$Applicant_Birth_year)
 47 | train$Applicant_Birth_year[is.na(train$Applicant_Birth_year)] <- 
 48 |     as.numeric(names(which.max((table(train$Applicant_Birth_year)))))  # missing years get the most frequent year (mode)
 49 | # Same for test; the fill value is the mode of the test set itself, not of train.
 50 | test <- separate(data = test, col = Applicant_BirthDate, 
 51 |                  into = c("Applicant_Birth_month", "Applicant_Birth_date", "Applicant_Birth_year"))
 52 | test$Applicant_Birth_date <- NULL
 53 | test$Applicant_Birth_month <- NULL
 54 | test$Applicant_Birth_year <- as.numeric(test$Applicant_Birth_year)
 55 | test$Applicant_Birth_year[is.na(test$Applicant_Birth_year)] <- 
 56 |     as.numeric(names(which.max((table(test$Applicant_Birth_year)))))
 57 | # NOTE(review): 2008 below is a hard-coded reference year -- presumably the application period; confirm.
 58 | # Create Applicant_age variable (2008 minus birth year); the birth-year column is then removed
 59 | # ---------------------
 60 | train$Applicant_Age <- (2008 - train$Applicant_Birth_year)
 61 | train$Applicant_Birth_year <- NULL
 62 | test$Applicant_Age <- (2008 - test$Applicant_Birth_year)
 63 | test$Applicant_Birth_year <- NULL
 64 | 
 65 | # Separating Manager_DOJ into date, month, year; each part is made numeric and
 66 | # missing parts are filled with that column's median (computed per data set)
 67 | train <- separate(data = train, col = Manager_DOJ,
 68 |                   into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year"))
 69 | train$Manager_Join_date <- as.numeric(train$Manager_Join_date)
 70 | train$Manager_Join_date[is.na(train$Manager_Join_date)] <-
 71 |     median(train$Manager_Join_date, na.rm = TRUE)
 72 | 
 73 | train$Manager_Join_month <- as.numeric(train$Manager_Join_month)
 74 | train$Manager_Join_month[is.na(train$Manager_Join_month)] <-
 75 |     median(train$Manager_Join_month, na.rm = TRUE)
 76 | 
 77 | train$Manager_Join_year <- as.numeric(train$Manager_Join_year)
 78 | train$Manager_Join_year[is.na(train$Manager_Join_year)] <-
 79 |     median(train$Manager_Join_year, na.rm = TRUE)
 80 | # Same treatment for test; the medians come from the test set itself.
 81 | test <- separate(data = test, col = Manager_DOJ,
 82 |                  into = c("Manager_Join_month", "Manager_Join_date", "Manager_Join_year"))
 83 | test$Manager_Join_date <- as.numeric(test$Manager_Join_date)
 84 | test$Manager_Join_date[is.na(test$Manager_Join_date)] <-
 85 |     median(test$Manager_Join_date, na.rm = TRUE)
 86 | 
 87 | test$Manager_Join_month <- as.numeric(test$Manager_Join_month)
 88 | test$Manager_Join_month[is.na(test$Manager_Join_month)] <-
 89 |     median(test$Manager_Join_month, na.rm = TRUE)
 90 | 
 91 | test$Manager_Join_year <- as.numeric(test$Manager_Join_year)
 92 | test$Manager_Join_year[is.na(test$Manager_Join_year)] <-
 93 |     median(test$Manager_Join_year, na.rm = TRUE)
 94 | 
 95 | # Separating Manager_DoB into date, month, year; only the year is kept and
 96 | # missing years are filled with the median year of the same data set
 97 | train <- separate(data = train, col = Manager_DoB,
 98 |                   into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year"))
 99 | train$Manager_Birth_date <- NULL
100 | train$Manager_Birth_month <- NULL
101 | train$Manager_Birth_year <- as.numeric(train$Manager_Birth_year)
102 | train$Manager_Birth_year[is.na(train$Manager_Birth_year)] <-
103 |     median(train$Manager_Birth_year, na.rm = TRUE)
104 | # Same treatment for test; the median comes from the test set itself.
105 | test <- separate(data = test, col = Manager_DoB,
106 |                  into = c("Manager_Birth_month", "Manager_Birth_date", "Manager_Birth_year"))
107 | test$Manager_Birth_date <- NULL
108 | test$Manager_Birth_month <- NULL
109 | test$Manager_Birth_year <- as.numeric(test$Manager_Birth_year)
110 | test$Manager_Birth_year[is.na(test$Manager_Birth_year)] <-
111 |     median(test$Manager_Birth_year, na.rm = TRUE)
112 | # NOTE(review): 2008 below is a hard-coded reference year -- presumably the application period; confirm.
113 | # Create Manager_Age variable (2008 minus birth year); the birth-year column is then removed
114 | # ---------------------
115 | train$Manager_Age <- (2008 - train$Manager_Birth_year)
116 | train$Manager_Birth_year <- NULL
117 | test$Manager_Age <- (2008 - test$Manager_Birth_year)
118 | test$Manager_Birth_year <- NULL
119 | 
120 | # Encoding Applicant_Gender: "F" -> 1, "M" -> 2, empty string -> 3
121 | # ----------------------------------------------------------
122 | train$Applicant_Gender[train$Applicant_Gender == "F"] <- 1
123 | train$Applicant_Gender[train$Applicant_Gender == "M"] <- 2
124 | train$Applicant_Gender[train$Applicant_Gender == ""] <- 3
125 | train$Applicant_Gender <- as.numeric(train$Applicant_Gender)  # column is still character after the replacements
126 | # Same mapping for the test set.
127 | test$Applicant_Gender[test$Applicant_Gender == "F"] <- 1
128 | test$Applicant_Gender[test$Applicant_Gender == "M"] <- 2
129 | test$Applicant_Gender[test$Applicant_Gender == ""] <- 3
130 | test$Applicant_Gender <- as.numeric(test$Applicant_Gender)
131 | 
132 | # Encoding Applicant_Occupation: Salaried -> 1, Business -> 2, Others/empty -> 3, Self Employed/Student -> 4
133 | # ----------------------------------------------------------
134 | train$Applicant_Occupation[train$Applicant_Occupation == "Salaried"] <- 1
135 | train$Applicant_Occupation[train$Applicant_Occupation == "Business"] <- 2
136 | train$Applicant_Occupation[train$Applicant_Occupation == "Others" |
137 |                                train$Applicant_Occupation == ""] <- 3
138 | train$Applicant_Occupation[train$Applicant_Occupation == "Self Employed" |
139 |                                train$Applicant_Occupation == "Student"] <- 4
140 | train$Applicant_Occupation <- as.numeric(train$Applicant_Occupation)  # any label not listed above would become NA here
141 | # Same mapping for the test set.
142 | test$Applicant_Occupation[test$Applicant_Occupation == "Salaried"] <- 1
143 | test$Applicant_Occupation[test$Applicant_Occupation == "Business"] <- 2
144 | test$Applicant_Occupation[test$Applicant_Occupation == "Others" |
145 |                                test$Applicant_Occupation == ""] <- 3
146 | test$Applicant_Occupation[test$Applicant_Occupation == "Self Employed" |
147 |                                test$Applicant_Occupation == "Student"] <- 4
148 | test$Applicant_Occupation <- as.numeric(test$Applicant_Occupation)
149 | 
150 | # Encoding Applicant_Qualification: Class XII -> 1, Graduate -> 2, Class X -> 3, every other value -> 4
151 | # ----------------------------------------------------------
152 | train$Applicant_Qualification[train$Applicant_Qualification == "Class XII"] <- 1
153 | train$Applicant_Qualification[train$Applicant_Qualification == "Graduate"] <- 2
154 | train$Applicant_Qualification[train$Applicant_Qualification == "Class X"] <- 3
155 | train$Applicant_Qualification[train$Applicant_Qualification != 1 &
156 |                                   train$Applicant_Qualification != 2 &
157 |                                   train$Applicant_Qualification != 3] <- 4  # catch-all: whatever was not recoded above (column is character here, so the codes compare as "1"/"2"/"3")
158 | train$Applicant_Qualification <- as.numeric(train$Applicant_Qualification)
159 | # Same mapping for the test set.
160 | test$Applicant_Qualification[test$Applicant_Qualification == "Class XII"] <- 1
161 | test$Applicant_Qualification[test$Applicant_Qualification == "Graduate"] <- 2
162 | test$Applicant_Qualification[test$Applicant_Qualification == "Class X"] <- 3
163 | test$Applicant_Qualification[test$Applicant_Qualification != 1 &
164 |                                   test$Applicant_Qualification != 2 &
165 |                                   test$Applicant_Qualification != 3] <- 4
166 | test$Applicant_Qualification <- as.numeric(test$Applicant_Qualification)
167 | 
168 | # Encoding Manager_Joining_Designation as a numeric level (0 = value not matched by any rule below)
169 | # ----------------------------------------------------------
170 | temp_joining_des <- train$Manager_Joining_Designation  # keep the original labels; the column itself is overwritten next
171 | train$Manager_Joining_Designation <- 0
172 | train$Manager_Joining_Designation[temp_joining_des == "Level 1"|
173 |                                       temp_joining_des == "Other"] <- 1
174 | train$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2
175 | train$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3
176 | train$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4
177 | train$Manager_Joining_Designation[temp_joining_des == "Level 5" |
178 |                                       temp_joining_des == "Level 6" |
179 |                                       temp_joining_des == "Level 7"] <- 5  # levels 5-7 are pooled into one code
180 | rm(temp_joining_des)
181 | # Same mapping for the test set.
182 | temp_joining_des <- test$Manager_Joining_Designation
183 | test$Manager_Joining_Designation <- 0
184 | test$Manager_Joining_Designation[temp_joining_des == "Level 1"|
185 |                                      temp_joining_des == "Other"] <- 1
186 | test$Manager_Joining_Designation[temp_joining_des == "Level 2"] <- 2
187 | test$Manager_Joining_Designation[temp_joining_des == "Level 3"] <- 3
188 | test$Manager_Joining_Designation[temp_joining_des == "Level 4"] <- 4
189 | test$Manager_Joining_Designation[temp_joining_des == "Level 5" |
190 |                                       temp_joining_des == "Level 6" |
191 |                                       temp_joining_des == "Level 7"] <- 5
192 | rm(temp_joining_des)
193 | 
194 | # Encoding Manager_Current_Designation as a numeric level (0 = value not matched by any rule below)
195 | # ----------------------------------------------------------
196 | temp_current_des <- train$Manager_Current_Designation  # keep the original labels; the column itself is overwritten next
197 | train$Manager_Current_Designation <- 0
198 | train$Manager_Current_Designation[temp_current_des == "Level 1"|
199 |                                       temp_current_des == "Other"] <- 1
200 | train$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2
201 | train$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3
202 | train$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4
203 | train$Manager_Current_Designation[temp_current_des == "Level 5" |
204 |                                       temp_current_des == "Level 6" |
205 |                                       temp_current_des == "Level 7"] <- 5  # levels 5-7 are pooled into one code
206 | rm(temp_current_des)
207 | # Same mapping for the test set.
208 | temp_current_des <- test$Manager_Current_Designation
209 | test$Manager_Current_Designation <- 0
210 | test$Manager_Current_Designation[temp_current_des == "Level 1" |
211 |                                      temp_current_des == "Other"] <- 1
212 | test$Manager_Current_Designation[temp_current_des == "Level 2"] <- 2
213 | test$Manager_Current_Designation[temp_current_des == "Level 3"] <- 3
214 | test$Manager_Current_Designation[temp_current_des == "Level 4"] <- 4
215 | test$Manager_Current_Designation[temp_current_des == "Level 5" |
216 |                                       temp_current_des == "Level 6" |
217 |                                       temp_current_des == "Level 7"] <- 5
218 | rm(temp_current_des)
219 | 
220 | # Creating Manager_Progress variable: current designation code minus joining designation code
221 | # ----------------------------------
222 | train$Manager_Progress <- (train$Manager_Current_Designation - 
223 |                                train$Manager_Joining_Designation)
224 | train$Manager_Progress[train$Manager_Joining_Designation == 0] <- -2  # joining code 0 (unmatched label): use the sentinel -2 instead of a difference
225 | # Same derivation for the test set.
226 | test$Manager_Progress <- (test$Manager_Current_Designation - 
227 |                               test$Manager_Joining_Designation)
228 | test$Manager_Progress[test$Manager_Joining_Designation == 0] <- -2
229 | 
230 | # Encoding Manager_Status: Confirmation -> 1, Probation -> 2, empty string -> 3
231 | # ----------------------------------------------------------
232 | train$Manager_Status[train$Manager_Status == "Confirmation"] <- 1
233 | train$Manager_Status[train$Manager_Status == "Probation"] <- 2
234 | train$Manager_Status[train$Manager_Status == ""] <- 3
235 | train$Manager_Status <- as.numeric(train$Manager_Status)  # column is still character after the replacements
236 | # Same mapping for the test set.
237 | test$Manager_Status[test$Manager_Status == "Confirmation"] <- 1
238 | test$Manager_Status[test$Manager_Status == "Probation"] <- 2
239 | test$Manager_Status[test$Manager_Status == ""] <- 3
240 | test$Manager_Status <- as.numeric(test$Manager_Status)
241 | 
242 | # Imputing the numeric variables: each column is filled only where that column itself is NA,
243 | # so observed values are never overwritten because some other column in the row is missing
244 | train$Manager_Grade[is.na(train$Manager_Grade)] <- median(train$Manager_Grade, na.rm = TRUE)
245 | train$Manager_Num_Application[is.na(train$Manager_Num_Application)] <- 2.00
246 | train$Manager_Num_Coded[is.na(train$Manager_Num_Coded)] <- mean(train$Manager_Num_Coded, na.rm = TRUE)
247 | train$Manager_Business[is.na(train$Manager_Business)] <- mean(train$Manager_Business, na.rm = TRUE)
248 | train$Manager_Num_Products[is.na(train$Manager_Num_Products)] <- mean(train$Manager_Num_Products,
249 |                                                                      na.rm = TRUE)
250 | train$Manager_Business2[is.na(train$Manager_Business2)] <- median(train$Manager_Business2,
251 |                                                                   na.rm = TRUE)
252 | train$Manager_Num_Products2[is.na(train$Manager_Num_Products2)] <- median(train$Manager_Num_Products2,
253 |                                                                           na.rm = TRUE)
254 | # Same fill rules for test; the statistics are computed on the test set itself.
255 | test$Manager_Grade[is.na(test$Manager_Grade)] <- median(test$Manager_Grade, na.rm = TRUE)
256 | test$Manager_Num_Application[is.na(test$Manager_Num_Application)] <- 2.00
257 | test$Manager_Num_Coded[is.na(test$Manager_Num_Coded)] <- mean(test$Manager_Num_Coded, na.rm = TRUE)
258 | test$Manager_Business[is.na(test$Manager_Business)] <- mean(test$Manager_Business, na.rm = TRUE)
259 | test$Manager_Num_Products[is.na(test$Manager_Num_Products)] <- mean(test$Manager_Num_Products,
260 |                                                                    na.rm = TRUE)
261 | test$Manager_Business2[is.na(test$Manager_Business2)] <- median(test$Manager_Business2,
262 |                                                                 na.rm = TRUE)
263 | test$Manager_Num_Products2[is.na(test$Manager_Num_Products2)] <- median(test$Manager_Num_Products2,
264 |                                                                         na.rm = TRUE)
265 | 
266 | # Add back the Target variable, Business_Sourced
267 | # ----------------------------------------------
268 | train$Business_Sourced <- train.y  # train.y was captured right after loading; rows of train are never reordered or dropped in between
269 | 
270 | ###########################################################################################
271 | 
272 | ## PARAMETER TUNING (disabled: single 80/20 hold-out run, kept commented out for reference)
273 | 
274 | # split <- createDataPartition(train$Business_Sourced, p = 0.8, list = F)
275 | # x_train <- train[split,]
276 | # x_train.y <- train$Business_Sourced[split]
277 | # x_test <- train[-split,]
278 | # x_test.y <- train$Business_Sourced[-split]
279 | # 
280 | # x_train <- sparse.model.matrix(Business_Sourced ~ ., data= x_train) 
281 | # x_test <- sparse.model.matrix(Business_Sourced ~ ., data = x_test)
282 | # 
283 | # d_train <- xgb.DMatrix(data = x_train, label = x_train.y)
284 | # d_test <- xgb.DMatrix(data = x_test, label = x_test.y)
285 | # watchlist <- list(train=d_train, test=d_test)
286 | # 
287 | # param <- list( objective    = "binary:logistic",
288 | #                booster      = "gbtree",
289 | #                eval_metric  = "auc",
290 | #                subsample = 0.8,
291 | #                min_child_weight = 5,
292 | #                colsample_bytree = 0.2,
293 | #                eta          = 0.05,
294 | #                max_depth    = 8
295 | # )
296 | # 
297 | # clf <- xgb.train(   params              = param, 
298 | #                     data                = d_train, 
299 | #                     nrounds             = 300, 
300 | #                     verbose             = 2,
301 | #                     watchlist           = watchlist
302 | # )
303 | # 
304 | # fold_pred <- predict(clf, x_test)
305 | # pred <- prediction(fold_pred, x_test.y)
306 | # auc <- performance(pred, measure = "auc")
307 | 
308 | ###########################################################################################
309 | 
310 | ## BUILD MODEL WITH STRATIFIED K-FOLD CV
311 | folds <- createFolds(as.factor(train$Business_Sourced), k = 5)  # each list element is used below as the held-out row indices of one fold
312 | fold_auc <- c()  # collects one AUC per fold
313 | 
314 | for (fold in folds) {
315 |     x_train <- train[-fold, ]
316 |     x_train.y <- train$Business_Sourced[-fold]
317 |     x_test <- train[fold, ]
318 |     x_test.y <- train$Business_Sourced[fold]
319 |     # Print the class proportions of both parts so the split balance can be checked in the log.
320 |     print("Split info")
321 |     print(table(x_train$Business_Sourced)/nrow(x_train))
322 |     print(table(x_test$Business_Sourced)/nrow(x_test))
323 |     # The formula keeps Business_Sourced out of the feature matrix; labels are passed separately.
324 |     x_train <- sparse.model.matrix(Business_Sourced ~ ., data= x_train) 
325 |     x_test <- sparse.model.matrix(Business_Sourced ~ ., data = x_test)
326 |     # d_train / d_test are overwritten on every iteration and stay defined after the loop ends.
327 |     d_train <- xgb.DMatrix(data = x_train, label = x_train.y)
328 |     d_test <- xgb.DMatrix(data = x_test, label = x_test.y)
329 |     watchlist <- list(train=d_train, test=d_test)
330 |     # Parameters are identical for every fold.
331 |     param <- list( objective    = "binary:logistic",
332 |                    booster      = "gbtree",
333 |                    eval_metric  = "auc",
334 |                    subsample = 0.8,
335 |                    min_child_weight = 5,
336 |                    colsample_bytree = 0.2,
337 |                    eta          = 0.05,
338 |                    max_depth    = 8
339 |     )
340 |     # Fixed number of rounds; the watchlist is only used for progress output.
341 |     clf <- xgb.train(   params              = param, 
342 |                         data                = d_train, 
343 |                         nrounds             = 150, 
344 |                         verbose             = 2,
345 |                         watchlist           = watchlist
346 |     )    
347 |     fold_pred <- predict(clf, x_test)
348 |     pred <- prediction(fold_pred, x_test.y)
349 |     auc <- performance(pred, measure = "auc")
350 |     fold_auc <- c(fold_auc, auc@y.values[[1]])  # AUC on the held-out fold
351 | }
352 | 
353 | print(fold_auc)
354 | ########################################
355 | 
356 | train <- sparse.model.matrix(Business_Sourced ~ ., data = train)
357 | dtrain <- xgb.DMatrix(data=train, label=train.y)
358 | watchlist <- list(train=dtrain)
359 | # Final model: same parameters as in cross-validation, fitted on the full training set.
360 | param <- list( objective    = "binary:logistic",
361 |                booster      = "gbtree",
362 |                eval_metric  = "auc",
363 |                subsample = 0.8,
364 |                min_child_weight = 5,
365 |                colsample_bytree = 0.2,
366 |                eta          = 0.05,
367 |                max_depth    = 8
368 | )
369 | # Train on dtrain (all rows), not on d_train, which is the leftover matrix of the last CV fold.
370 | clf <- xgb.train(   params              = param,
371 |                     data                = dtrain,
372 |                     nrounds             = 150,
373 |                     verbose             = 2,
374 |                     watchlist           = watchlist
375 | )
376 | 
377 | test$target <- -1  # dummy response so the formula below can build the test design matrix
378 | test <- sparse.model.matrix(target ~ ., data = test)
379 | # Score the test rows and pair the predictions with the IDs saved before feature engineering.
380 | preds <- predict(clf, test)
381 | submission <- data.frame(ID=test.id, Business_Sourced=preds)
382 | cat("saving the submission file\n")
383 | write.csv(submission, "Submissions/submission.csv", row.names = FALSE)  # the Submissions/ directory must already exist
384 | 


--------------------------------------------------------------------------------