├── README.md
├── LICENSE
├── prepare_submission_file.py
└── model_xgb.R

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Santander Product Recommendation competition on Kaggle
Please download the datasets from the [Kaggle competition page](https://www.kaggle.com/c/santander-product-recommendation) before running the code.

Edit the file paths in *model_xgb.R* and *prepare_submission_file.py* before running them.

**model_xgb.R** imports the raw data, creates the features, trains the two xgboost models and generates the predictions for the test data.
**prepare_submission_file.py** creates the final submission file in Kaggle's required format, after removing products already owned by customers.

This solution scores ~0.03102 on the private LB, finishing 11th.
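A minimal sketch of the intended workflow (assuming the default file names hard-coded in both scripts; adjust the paths to your setup):

```r
## step 1: run the R pipeline -- reads train.csv and test.csv, writes preds.csv
source("model_xgb.R")

## step 2: build the submission from preds.csv and train.csv, e.g. from a shell:
##   python prepare_submission_file.py   # writes submit.csv
```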
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Rohan Rao

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/prepare_submission_file.py:
--------------------------------------------------------------------------------
# file paths (edit the paths before running)
path_train = 'train.csv'
path_preds = 'preds.csv'
path_submission = 'submit.csv'

# loading libraries
import numpy as np
import pandas as pd

# the 19 products modelled in model_xgb.R, in the same order as the
# prediction columns of preds.csv
target_cols = ['ind_cco_fin_ult1','ind_cder_fin_ult1','ind_cno_fin_ult1','ind_ctju_fin_ult1',
               'ind_ctma_fin_ult1','ind_ctop_fin_ult1','ind_ctpp_fin_ult1','ind_dela_fin_ult1',
               'ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1','ind_plan_fin_ult1',
               'ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1','ind_valo_fin_ult1',
               'ind_nomina_ult1','ind_nom_pens_ult1','ind_recibo_ult1']

# keep only each customer's last (most recent) row of product holdings
last_instance_df = pd.read_csv(path_train, usecols=['ncodpers'] + target_cols)
last_instance_df = last_instance_df.drop_duplicates('ncodpers', keep='last')

# build a customer -> set-of-owned-products lookup
cust_dict = {}
target_cols = np.array(target_cols)

for ind, row in last_instance_df.iterrows():
    cust = row['ncodpers']
    used_products = set(target_cols[np.array(row[target_cols]) == 1])
    cust_dict[cust] = used_products

del last_instance_df

# rank the 19 predicted probabilities per customer, highest first
preds = pd.read_csv(path_preds)

test_id = np.array(preds['ncodpers'])

preds.drop(['ncodpers'], axis=1, inplace=True)
preds = np.argsort(preds.values, axis=1)
preds = np.fliplr(preds)

# for each customer, keep the top 7 ranked products they do not already own
final_preds = []

for ind, pred in enumerate(preds):
    cust = test_id[ind]
    top_products = target_cols[pred]
    used_products = cust_dict.get(cust, set())
    new_top_products = []
    for product in top_products:
        if product not in used_products:
            new_top_products.append(product)
        if len(new_top_products) == 7:
            break
    final_preds.append(" ".join(new_top_products))

submission = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds})
submission.to_csv(path_submission, index=False)
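# A toy illustration of the ranking step above (hypothetical numbers, not part
# of the pipeline): for a three-product universe ['a', 'b', 'c'] and one
# customer with predicted probabilities [0.2, 0.9, 0.5] who already owns 'b':
#   np.fliplr(np.argsort([[0.2, 0.9, 0.5]], axis=1)) -> [[1, 2, 0]]
# i.e. the ranking 'b', 'c', 'a'; dropping the owned 'b' leaves "c a" as the
# recommendation string.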
--------------------------------------------------------------------------------
/model_xgb.R:
--------------------------------------------------------------------------------
#### Santander Product Recommendation on Kaggle
#### Team 'SRRRK' (Sudalai Raj Kumar and Rohan Rao)
#### 11th Rank

## setting file paths and seed (edit the paths before running)
path_train <- "train.csv"
path_test <- "test.csv"
path_preds <- "preds.csv"

## put your favourite number as seed
seed <- 123
set.seed(seed)

## loading libraries
library(data.table)
library(xgboost)

## loading raw data
train <- fread(path_train, showProgress = T)
test <- fread(path_test, showProgress = T)

## removing five products that are not modelled
train[, ind_ahor_fin_ult1 := NULL]
train[, ind_aval_fin_ult1 := NULL]
train[, ind_deco_fin_ult1 := NULL]
train[, ind_deme_fin_ult1 := NULL]
train[, ind_viv_fin_ult1 := NULL]

## extracting the train rows of each product and rbinding them together with a single multiclass label
i <- 0
target_cols <- names(train)[which(regexpr("ult1", names(train)) > 0)]

for (target_col in target_cols)
{
  i <- i + 1

  ## trainK holds the rows where the K-th product is owned
  S <- paste0("train", i, " <- train[", target_col, " > 0]")
  eval(parse(text = S))
}

rm(train)
gc()

for (i in 1:19)
{
  ## drop the 19 product columns and attach the multiclass label (0-18)
  S1 <- paste0("train", i, " <- train", i, "[, !target_cols, with = F]")
  eval(parse(text = S1))

  S2 <- paste0("train", i, "[, target := ", i - 1, "]")
  eval(parse(text = S2))
}

X_train <- rbind(train1, train2, train3, train4, train5, train6, train7, train8, train9, train10,
                 train11, train12, train13, train14, train15, train16, train17, train18, train19)

rm(train1, train2, train3, train4, train5, train6, train7, train8, train9, train10,
   train11, train12, train13, train14, train15, train16, train17, train18, train19)
gc()

## rbinding train and test data
X_panel <- rbind(X_train, test, use.names = T, fill = T)

## adding the corresponding numeric month (1-18) to fecha_dato
X_panel[, month := as.numeric(as.factor(fecha_dato))]

## creating the user-product matrix (one 0/1 column per target class)
X_user_target <- dcast(X_panel[!is.na(target)], ncodpers + month ~ target, length, value.var = "target", fill = 0)

## creating product lag-variables up to order 12 and merging them with the data:
## shifting month forward by one before each merge aligns every row with the
## previous month's product holdings
X_user_target[, month := month + 1]

setnames(X_user_target,
         c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
           "16", "17", "18"),
         c("prev_0", "prev_1", "prev_2", "prev_3", "prev_4", "prev_5", "prev_6", "prev_7",
           "prev_8", "prev_9", "prev_10", "prev_11", "prev_12", "prev_13", "prev_14", "prev_15",
           "prev_16", "prev_17", "prev_18"))

X_panel <- merge(X_panel, X_user_target, all.x = T, by = c("ncodpers", "month"))

for (i in 2:12)
{
  ## each pass shifts the holdings one more month ahead and prefixes another
  ## "prev_", producing the lag-2 ... lag-12 columns
  X_user_target[, month := month + 1]

  lag_cols <- names(X_user_target)[which(regexpr("prev", names(X_user_target)) > 0)]
  setnames(X_user_target, lag_cols, paste0("prev_", lag_cols))

  X_panel <- merge(X_panel, X_user_target, all.x = T, by = c("ncodpers", "month"))
}

X_panel[is.na(X_panel)] <- 0
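## To make the month-shift trick above concrete, a self-contained toy example
## (invented data, not part of the pipeline; shown commented-out):
##   toy <- data.table(ncodpers = 1, month = 1:2, prev_0 = c(1, 0))
##   toy[, month := month + 1]
## merging toy onto a panel by (ncodpers, month) now attaches the month-1
## ownership (prev_0 = 1) to that customer's month-2 row, i.e. a lag-1 feature.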
## calculating rolling means of the product lag-variables over windows of 2, 3, 4, 5, 6, 9 and 12 months
for (i in 0:18)
{
  S1 <- paste0("X_panel[, prev2_", i, " := (prev_", i, " + prev_prev_", i, ") / 2]")
  eval(parse(text = S1))

  S2 <- paste0("X_panel[, prev3_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, ") / 3]")
  eval(parse(text = S2))

  S3 <- paste0("X_panel[, prev4_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, " + prev_prev_prev_prev_", i, ") / 4]")
  eval(parse(text = S3))

  S4 <- paste0("X_panel[, prev5_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, " + prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_", i, ") / 5]")
  eval(parse(text = S4))

  S5 <- paste0("X_panel[, prev6_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, " + prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_", i, ") / 6]")
  eval(parse(text = S5))

  S6 <- paste0("X_panel[, prev9_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, " + prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_", i,
               " + prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, ") / 9]")
  eval(parse(text = S6))

  S7 <- paste0("X_panel[, prev12_", i, " := (prev_", i, " + prev_prev_", i, " + prev_prev_prev_", i, " + prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_", i,
               " + prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_prev_", i,
               " + prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " + prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, ") / 12]")
  eval(parse(text = S7))

  ## drop the raw lag-2 ... lag-12 columns, keeping lag-1 and the rolling means
  S8 <- paste0("X_panel[, ':='(prev_prev_", i, " = NULL, prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_prev_", i, " = NULL,
               prev_prev_prev_prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " = NULL,
               prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " = NULL, prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_prev_", i, " = NULL)]")
  eval(parse(text = S8))
}
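## An equivalent, eval(parse)-free way to build the same rolling means (a
## sketch assuming the prev_* naming above; shown commented-out, not run):
## gather the lag columns of one product and take rowMeans, e.g. for
## product 0 over a 3-month window:
##   lag3_cols <- c("prev_0", "prev_prev_0", "prev_prev_prev_0")
##   X_panel[, prev3_0 := rowMeans(.SD), .SDcols = lag3_cols]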
## cleaning raw features
X_panel[, ":="(ind_empleado = as.numeric(as.factor(ind_empleado)),
               pais_residencia = as.numeric(as.factor(pais_residencia)),
               sexo = as.numeric(as.factor(sexo)),
               year_joining = year(as.Date(fecha_alta)),
               month_joining = month(as.Date(fecha_alta)),
               fecha_alta = as.numeric(as.Date(fecha_alta) - as.Date("2016-05-31")),
               ult_fec_cli_1t = ifelse(ult_fec_cli_1t == "", 0, 1),
               indrel_1mes = as.numeric(as.factor(indrel_1mes)),
               tiprel_1mes = as.numeric(as.factor(tiprel_1mes)),
               indresi = as.numeric(as.factor(indresi)),
               indext = as.numeric(as.factor(indext)),
               conyuemp = as.numeric(as.factor(conyuemp)),
               canal_entrada = as.numeric(as.factor(canal_entrada)),
               indfall = as.numeric(as.factor(indfall)),
               tipodom = NULL,
               cod_prov = as.numeric(as.factor(cod_prov)),
               nomprov = NULL,
               segmento = as.numeric(as.factor(segmento)))]

## calculating the product count of the previous month
X_panel[, count_prev1_products := (prev_0 + prev_1 + prev_2 + prev_3 + prev_4 + prev_5 + prev_6
                                   + prev_7 + prev_8 + prev_9 + prev_10 + prev_11 + prev_12
                                   + prev_13 + prev_14 + prev_15 + prev_16 + prev_17 + prev_18)]

## label encoding the binary string of the previous month's products (in order of popularity)
X_panel[, prev_products := as.numeric(as.factor(paste0(prev_0, prev_18, prev_5, prev_2, prev_8,
                                                       prev_17, prev_16, prev_13, prev_14,
                                                       prev_15, prev_6, prev_7, prev_9, prev_4,
                                                       prev_3, prev_11, prev_10, prev_12, prev_1)))]

## creating train and test data for the June-15 (seasonality) and May-16 (trend) models
X_train_1 <- X_panel[fecha_dato %in% c("2015-06-28")]
X_train_2 <- X_panel[fecha_dato %in% c("2016-05-28")]

X_test_1 <- X_panel[fecha_dato %in% c("2016-06-28")]
X_test_2 <- X_panel[fecha_dato %in% c("2016-06-28")]

X_test_order <- X_test_1$ncodpers

## creating a binary flag for new products; test rows are always 1 since only newly added products are predicted
X_train_1$flag_new <- 0
X_train_2$flag_new <- 0

X_test_1$flag_new <- 1
X_test_2$flag_new <- 1

for (i in 0:18)
{
  ## a train row is "new" if its target product was not owned in the previous month
  S1 <- paste0("X_train_1$flag_new[X_train_1$prev_", i, " == 0 & X_train_1$target == ", i, "] <- 1")
  eval(parse(text = S1))

  S2 <- paste0("X_train_2$flag_new[X_train_2$prev_", i, " == 0 & X_train_2$target == ", i, "] <- 1")
  eval(parse(text = S2))
}

## removing the 6, 9 and 12-month rolling-mean variables from the seasonality model
## (the June-15 rows only have five months of history in the data)
for (col in names(X_train_1)[which(regexpr("prev6", names(X_train_1)) > 0 | regexpr("prev9", names(X_train_1)) > 0 | regexpr("prev12", names(X_train_1)) > 0)])
{
  S1 <- paste0("X_train_1[, ", col, " := NULL]")
  eval(parse(text = S1))

  S2 <- paste0("X_test_1[, ", col, " := NULL]")
  eval(parse(text = S2))
}

## extracting labels
X_target_1 <- X_train_1$target
X_target_2 <- X_train_2$target

## removing redundant columns
X_train_1[, ":="(fecha_dato = NULL, ncodpers = NULL, month = NULL, target = NULL)]
X_train_2[, ":="(fecha_dato = NULL, ncodpers = NULL, month = NULL, target = NULL)]

X_test_1[, ":="(fecha_dato = NULL, ncodpers = NULL, month = NULL, target = NULL)]
X_test_2[, ":="(fecha_dato = NULL, ncodpers = NULL, month = NULL, target = NULL)]

## creating xgb.DMatrix
xgtrain1 <- xgb.DMatrix(as.matrix(X_train_1), label = X_target_1, missing = NA)
xgtrain2 <- xgb.DMatrix(as.matrix(X_train_2), label = X_target_2, missing = NA)

xgtest1 <- xgb.DMatrix(as.matrix(X_test_1), missing = NA)
xgtest2 <- xgb.DMatrix(as.matrix(X_test_2), missing = NA)

## xgboost parameters
params <- list()
params$objective <- "multi:softprob"
params$num_class <- 19
params$eta <- 0.1
params$max_depth <- 5
params$subsample <- 0.8
params$colsample_bytree <- 0.8
params$min_child_weight <- 3
params$eval_metric <- "mlogloss"

## xgboost training
model_xgb_1 <- xgb.train(params = params, xgtrain1, nrounds = 140, nthread = -1)
model_xgb_2 <- xgb.train(params = params, xgtrain2, nrounds = 140, nthread = -1)

## xgboost predictions
pred_1 <- predict(model_xgb_1, xgtest1)
pred_2 <- predict(model_xgb_2, xgtest2)

## reshaping the flat softprob vectors into 19 product columns (one row per customer)
pred_matrix_1 <- data.table(matrix(pred_1, ncol = 19, byrow = T))
pred_matrix_2 <- data.table(matrix(pred_2, ncol = 19, byrow = T))
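## blending the two models with per-product weights: each product's final
## score is w * seasonality_pred + (1 - w) * trend_pred. With illustrative
## (invented) numbers and the cco weight of 0.9 below, seasonality_pred = 0.6
## and trend_pred = 0.2 blend to 0.9 * 0.6 + 0.1 * 0.2 = 0.56.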
## per-product blend weights for model 1 (June-15 seasonality); model 2
## (May-16 trend) always gets the complementary weight (1 - w)
## order: cco, cder, cno, ctju, ctma, ctop, ctpp, dela, ecue, fond,
##        hip, plan, pres, reca, tjcr, valo, nomina, nom_pens, recibo
w <- c(0.9, 0.5, 0.3, 0.4, 0.3, 0.5, 0.4, 0.1, 0.3, 0.1,
       0.5, 0.5, 0.5, 0.9, 0.5, 0.6, 0.8, 0.8, 0.4)

for (j in 1:19)
{
  set(pred_matrix_1, j = j, value = w[j] * pred_matrix_1[[j]])
  set(pred_matrix_2, j = j, value = (1 - w[j]) * pred_matrix_2[[j]])
}

## saving preds to csv
pred_matrix <- pred_matrix_1 + pred_matrix_2
pred_matrix[, ncodpers := X_test_order]

fwrite(pred_matrix, path_preds)

--------------------------------------------------------------------------------