├── AV-Black-Friday ├── Code.py └── README.md ├── AV-Hackathon-3.X ├── Data_Preprocessing.R ├── Hack_3x_Modelling.py └── README.md ├── AV-Hackathon-3 ├── AV_wog.R ├── Feature Engineering.R └── README.md ├── Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- ├── FactorVariables.R ├── Final_Model.R ├── LoadData.R └── README.md ├── Analytics_Vidhya_3.X_Hackathon ├── README.md ├── Weekend │ ├── _1_preprocessing.py │ ├── _2_train_xgb.py │ ├── _3_preprocessing_ftrl.py │ ├── _4_train_ftrl.py │ ├── _5_postprocessing_ftrl.py │ ├── _6_ensemble.py │ └── av_script.sh ├── Weeklong │ ├── av_final.sh │ ├── ensemble_rank_final.py │ ├── postprocessing_RF.py │ ├── postprocessing_XGB_1.py │ ├── postprocessing_XGB_2.py │ ├── postprocessing_ftrl.py │ ├── preprocessing.py │ ├── preprocessing2.py │ ├── preprocessing_ftrl.py │ ├── script_ftrl.py │ ├── script_ftrl2.py │ ├── script_ftrl3.py │ ├── script_ftrl4.py │ ├── script_ftrl5.py │ ├── shuffle.py │ ├── train_2xgb1.py │ ├── train_2xgb2.py │ ├── train_2xgb3.py │ ├── train_2xgb4.py │ ├── train_2xgb5.py │ ├── train_rf.py │ ├── train_xgb.py │ ├── train_xgb2.py │ ├── train_xgb3.py │ ├── train_xgb4.py │ └── train_xgb5.py ├── XGB Tuning guide.md └── requirements.md ├── D-hack ├── Code.py └── README.md ├── Hacker-Earth---Will-Bill-Solve-it- ├── Final-Code.R └── README.md ├── README.md └── minnemudac ├── 1999_2014_monitoring_data.csv ├── AvgWaterQualityByLake&Season.sql ├── AvgWaterQualityByLake.sql ├── DuplicatePropertyCheck.sql ├── NumberPropertyTypesPerLake.sql ├── PropertiesByYear.csv ├── PropertiesPctChangeByYear.csv ├── PropertiesPctOfTotalByYear.csv ├── README.md ├── Top&Bottom10LakesPerYear.sql └── seasonal.csv /AV-Black-Friday/Code.py: -------------------------------------------------------------------------------- 1 | #importing libraries 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.ensemble import RandomForestRegressor 5 | import xgboost as xgb 6 | from sklearn.ensemble import GradientBoostingRegressor 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn import preprocessing 9 | from sklearn import ensemble 10 | 11 | # setting the input path and reading the data into dataframe # 12 | data_path = "E:/DS/AV Black Friday/" 13 | train = pd.read_csv(data_path+"Train.csv") 14 | test = pd.read_csv(data_path+"Test.csv") 15 | 16 | ## categical column name list ## 17 | categorical_columns = ["Product_ID","Gender","Age","Occupation","City_Category","Stay_In_Current_City_Years","Marital_Status","Product_Category_1","Product_Category_2","Product_Category_3"] 18 | 19 | ## Getting the ID and DV from the data frame ## 20 | train_y = np.array(train["Purchase"]) 21 | 22 | ## Creating the IDVs from the train and test dataframe ## 23 | train_X = train.copy() 24 | test_X = test.copy() 25 | 26 | ## Fill up the na values with -999 ## 27 | train_X = train_X.fillna(-999) 28 | test_X = test_X.fillna(-999) 29 | 30 | #encoding categorical variable 31 | for var in categorical_columns: 32 | lb = preprocessing.LabelEncoder() 33 | full_var_data = pd.concat((train_X[var],test_X[var]),axis=0).astype('str') 34 | lb.fit( full_var_data ) 35 | train_X[var] = lb.transform(train_X[var].astype('str')) 36 | test_X[var] = lb.transform(test_X[var].astype('str')) 37 | 38 | ## Dropping the unnecessary columns from IDVs ## 39 | train_X = np.array( train_X.drop(['Purchase'],axis=1) ) 40 | print "Train shape is : ",train_X.shape 41 | print "Test shape is : ",test_X.shape 42 | 43 | print "Building XGB1" 44 | params = {} 45 | params["objective"] = "reg:linear" 46 | 
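# (Illustrative notes; the sketches in this comment block are not part of the original script.)
# - Only objective, eta and seed are set, so every other XGBoost parameter
#   (max_depth, subsample, colsample_bytree, ...) keeps its default value; newer
#   xgboost releases name this objective "reg:squarederror".
# - With eta = 0.05, the hard-coded num_rounds = 5667 further down is the kind of
#   value usually picked by cross-validation once xgtrain exists, e.g.:
#       cv = xgb.cv(params, xgtrain, num_boost_round=10000, nfold=4,
#                   metrics="rmse", early_stopping_rounds=50, seed=0)
#       num_rounds = cv.shape[0]
# - The README below mentions a per-product mean-purchase feature built in Excel;
#   a pandas equivalent (not in this script) would be
#       prod_mean = train.groupby("Product_ID")["Purchase"].mean()
#       train["Product_Mean"] = train["Product_ID"].map(prod_mean)
#       test["Product_Mean"] = test["Product_ID"].map(prod_mean)
#   added before the label-encoding loop above.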
params["eta"] = 0.05 47 | params["seed"] = 0 48 | plst = list(params.items()) 49 | xgtrain = xgb.DMatrix(train_X, label=train_y, missing = -999) 50 | xgtest = xgb.DMatrix(test_X,missing = -999) 51 | num_rounds = 5667 52 | model = xgb.train(plst, xgtrain, num_rounds) 53 | pred_test_y_xgb1 = model.predict(xgtest) 54 | 55 | 56 | #submission 57 | test['Purchase']=pred_test_y_xgb1 58 | test.to_csv(data_path+'Solution.csv',columns = ['User_ID','Product_ID','Purchase'],index = False) 59 | -------------------------------------------------------------------------------- /AV-Black-Friday/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon - Black Friday 2 | http://datahack.analyticsvidhya.com/contest/black-friday-data-hack 3 | 4 | ##### Problem Statement 5 | The challenge was to predict purchase prices of various products purchased by customers based on historical purchase patterns. The data contained features like age, gender, marital status, categories of products purchased, city demographics etc. 6 | 7 | ##### My approach for the hackathon is as follows: 8 | 1. Looked into levels of data and ran a basic random forest to understand Feature importance, realized Product ID was the most important feature 9 | 2. Added a new variable in Excel with mean of each product 10 | 3. Converted all categorical variable in one hot encoded categories 11 | 4. Built an XGB over it and optimized parameters 12 | 5. Got a RMSE of 2465, Public Leader Board Ranking 7 , Private Leaderboard Ranking 5 13 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/Data_Preprocessing.R: -------------------------------------------------------------------------------- 1 | ########## AV Hackathon 3.X ######## 2 | 3 | #setting working library 4 | setwd("E:/DS/AV Hack 3.x") 5 | 6 | #loading libraries 7 | library(caret) 8 | library(randomForest) 9 | library(rpart) 10 | 11 | #reading the files 12 | train=read.csv("train.csv") 13 | test=read.csv("test.csv") 14 | 15 | str(train) 16 | 17 | #just converting some levels 18 | train$Var4 = as.factor(train$Var4) 19 | train$Var5 = as.factor(train$Var5) 20 | train$Disbursed = as.factor(train$Disbursed) 21 | train$DOB = as.Date(train$DOB, format = "%d-%m-%Y") 22 | train$Lead_Creation_Date = as.Date(train$Lead_Creation_Date, format = "%d-%m-%Y") 23 | 24 | 25 | 26 | 27 | test$Var4 = as.factor(test$Var4) 28 | test$Var5 = as.factor(test$Var5) 29 | test$DOB = as.Date(test$DOB, format = "%d-%m-%Y") 30 | test$Lead_Creation_Date = as.Date(test$Lead_Creation_Date, format = "%d-%m-%Y") 31 | 32 | 33 | #treating some of the variables 34 | 35 | #0) Creating Dummy Variable of class factors Gender 36 | #0.i) is_male 37 | is_male <- function(x) { 38 | if(x == "Male") { 39 | y <- 1 40 | } else { 41 | y <- 0 42 | } 43 | return(y) 44 | } 45 | 46 | train_1 <- cbind(train,is_male = as.factor(mapply(is_male,train$Gender))) 47 | test_1 <- cbind(test,is_male = as.factor(mapply(is_male,test$Gender))) 48 | 49 | 50 | 51 | ############################################################################## 52 | #1) Adding age by using DOB and Lead_Creation_Date column 53 | train_1 <- cbind(train_1,age = as.integer(round((train_1$Lead_Creation_Date - train$DOB)/365,digits =0))) 54 | test_1 <- cbind(test_1,age = as.integer(round((test_1$Lead_Creation_Date - test$DOB)/365,digits = 0))) 55 | 56 | 57 | ############################################################################## 58 | #2) 
Extraction and Addition of DOB month and year 59 | train_1 <- cbind(train_1,DOB_month = as.factor(format(train_1$DOB,'%m'))) 60 | train_1 <- cbind(train_1,DOB_year = as.factor(format(train_1$DOB,'%Y'))) 61 | 62 | test_1 <- cbind(test_1,DOB_month = as.factor(format(test_1$DOB,'%m'))) 63 | test_1 <- cbind(test_1,DOB_year = as.factor(format(test_1$DOB,'%Y'))) 64 | 65 | ############################################################################## 66 | #3) Extraction and Addition of Lead_Creation_Date month 67 | train_1 <- cbind(train_1,Lead_Creation_day = as.factor(format(train_1$Lead_Creation_Date,'%d'))) 68 | train_1 <- cbind(train_1,Lead_Creation_month = as.factor(format(train_1$Lead_Creation_Date,'%m'))) 69 | train_1 <- cbind(train_1,Lead_Creation_year = as.factor(format(train_1$Lead_Creation_Date,'%Y'))) 70 | 71 | test_1 <- cbind(test_1,Lead_Creation_day = as.factor(format(test_1$Lead_Creation_Date,'%d'))) 72 | test_1 <- cbind(test_1,Lead_Creation_month = as.factor(format(test_1$Lead_Creation_Date,'%m'))) 73 | test_1 <- cbind(test_1,Lead_Creation_year = as.factor(format(test_1$Lead_Creation_Date,'%Y'))) 74 | 75 | 76 | ############################################################################## 77 | #4) Treating Loan_Amount_Submitted by adding zero 78 | train_1$Loan_Amount_Submitted[is.na(train_1$Loan_Amount_Submitted)] <- train_1$Loan_Amount_Applied[is.na(train_1$Loan_Amount_Submitted)] 79 | test_1$Loan_Amount_Submitted[is.na(test_1$Loan_Amount_Submitted)] <- test_1$Loan_Amount_Applied[is.na(test_1$Loan_Amount_Submitted)] 80 | 81 | ############################################################################## 82 | #5) Treating Loan_Tenure_Submitted by adding zero 83 | train_1$Loan_Tenure_Submitted[is.na(train_1$Loan_Tenure_Submitted)] <- train_1$Loan_Tenure_Applied[is.na(train_1$Loan_Tenure_Submitted)] 84 | test_1$Loan_Tenure_Submitted[is.na(test_1$Loan_Tenure_Submitted)] <- test_1$Loan_Tenure_Applied[is.na(test_1$Loan_Tenure_Submitted)] 85 | 86 | ############################################################################## 87 | #6) Treating Processing_Fee and EMI_Loan_Submitted 88 | Processing_Fee_null_train <- is.na(train_1$Processing_Fee) 89 | Processing_Fee_null_test <- is.na(test_1$Processing_Fee) 90 | 91 | train_1$Processing_Fee[is.na(train_1$Processing_Fee)] <- 0 92 | test_1$Processing_Fee[is.na(test_1$Processing_Fee)] <- 0 93 | 94 | 95 | EMI_Loan_Submitted_null_train <- is.na(train_1$EMI_Loan_Submitted) 96 | EMI_Loan_Submitted_null_test <- is.na(test_1$EMI_Loan_Submitted) 97 | 98 | train_1$EMI_Loan_Submitted[is.na(train_1$EMI_Loan_Submitted)] <- 0 99 | test_1$EMI_Loan_Submitted[is.na(test_1$EMI_Loan_Submitted)] <- 0 100 | 101 | ############################################################################## 102 | #7) Creating Counter for Existing EMI and Interest Rate 103 | Existing_EMI_null_train <- is.na(train_1$Existing_EMI) 104 | Interest_Rate_null_train <- is.na(train_1$Interest_Rate) 105 | 106 | Existing_EMI_null_test <- is.na(test_1$Existing_EMI) 107 | Interest_Rate_null_test <- is.na(test_1$Interest_Rate) 108 | 109 | 110 | ############################################################################## 111 | #8) Missing value Imputation of columns 112 | 113 | 114 | numeric_columns <- NULL 115 | for (i in 1:ncol(train_1)){ 116 | if(class(train_1[,i]) == "integer" | class(train_1[,i]) == "numeric") { 117 | numeric_columns <- rbind(numeric_columns,i) 118 | } 119 | } 120 | 121 | preproc <- preProcess(method = "bagImpute", train_1[,numeric_columns[-10]]) 122 | 123 | 
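# preProcess(method = "bagImpute") fits a bagged-tree model for each numeric
# column as a function of the other numeric columns; the predict() calls that
# follow apply that single fit to both train_1 and test_1, filling whatever NAs
# remain in the numeric fields (per the README: Interest_Rate plus any loan
# amount/tenure values not already back-filled from the *_Applied columns).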
train_1_imputed <- predict(preproc, train_1[,numeric_columns]) 124 | numeric_columns_1 <- NULL 125 | for (i in 1:ncol(train_1_imputed)){ 126 | if(class(train_1_imputed[,i]) == "integer" | class(train_1_imputed[,i]) == "numeric") { 127 | numeric_columns_1 <- rbind(numeric_columns_1,i) 128 | } 129 | } 130 | train_1[,numeric_columns] <- train_1_imputed[,numeric_columns_1] 131 | 132 | #train_1$Loan_Tenure_Submitted <- train_1_imputed$Loan_Tenure_Submitted 133 | 134 | numeric_columns <- NULL 135 | for (i in 1:ncol(test_1)){ 136 | if(class(test_1[,i]) == "integer" | class(test_1[,i]) == "numeric") { 137 | numeric_columns <- rbind(numeric_columns,i) 138 | } 139 | } 140 | 141 | 142 | test_1_imputed <- predict(preproc, test_1[,numeric_columns]) 143 | numeric_columns_1 <- NULL 144 | for (i in 1:ncol(test_1_imputed)){ 145 | if(class(test_1_imputed[,i]) == "integer" | class(test_1_imputed[,i]) == "numeric") { 146 | numeric_columns_1 <- rbind(numeric_columns_1,i) 147 | } 148 | } 149 | 150 | 151 | test_1[,numeric_columns] <- test_1_imputed[,numeric_columns_1] 152 | 153 | 154 | ####################################################################### 155 | #9) New Variable Creation : EMI_calculated 156 | 157 | EMI <- function(x,y,z) { 158 | if(y == 0 | z == 0) { 159 | a <- 0 160 | } else { 161 | b <- y/1200 162 | c <- z*12 163 | a <- (x*b*((1+b)^c)) / (((1+b)^c) - 1) 164 | } 165 | return(a) 166 | } 167 | 168 | train_1 <- cbind(train_1,EMI_calculated= as.numeric(mapply(EMI,x = train_1$Loan_Amount_Submitted , y = train_1$Interest_Rate,z = train_1$Loan_Tenure_Submitted))) 169 | test_1 <- cbind(test_1,EMI_calculated = as.numeric(mapply(EMI,x = test_1$Loan_Amount_Submitted , y = test_1$Interest_Rate,z = test_1$Loan_Tenure_Submitted))) 170 | 171 | 172 | ####################################################################### 173 | #10) New Variable Creation : Future_EMI_perincome index 174 | 175 | train_1 <- cbind(train_1,Future_EMI_perincome = as.numeric((train_1$Existing_EMI + train_1$EMI_calculated) / (train_1$Monthly_Income+1))) 176 | test_1 <- cbind(test_1,Future_EMI_perincome = as.numeric((test_1$Existing_EMI + test_1$EMI_calculated) / (test_1$Monthly_Income+1))) 177 | 178 | train_1$Future_EMI_perincome[train_1$Future_EMI_perincome > 2] = 2 179 | test_1$Future_EMI_perincome[test_1$Future_EMI_perincome > 2] = 2 180 | 181 | ## Creating is_zero function 182 | is_zero <- function(x) { 183 | if(x == 0) { 184 | a <- 1 185 | } else { 186 | a <- 0 187 | } 188 | return(a) 189 | } 190 | 191 | 192 | 193 | ####################################################################### 194 | #11) Changing monthly income outliers 195 | 196 | train_1$Monthly_Income[train_1$Monthly_Income > 1000000] = 1000000 197 | test_1$Monthly_Income[test_1$Monthly_Income > 1000000] = 1000000 198 | 199 | ####################################################################### 200 | #12) New Variable Creation : Process_percent 201 | 202 | train_1 <- cbind(train_1,Proces_perct = as.numeric((train_1$Processing_Fee*100) / (train_1$Monthly_Income+1))) 203 | test_1 <- cbind(test_1,Proces_perct = as.numeric((test_1$Processing_Fee*100) / (test_1$Monthly_Income+1))) 204 | 205 | train_1$Proces_perct[train_1$Proces_perct > 40] = 40 206 | test_1$Proces_perct[test_1$Proces_perct > 40] = 40 207 | 208 | ####################################################################### 209 | #13) New Variable Creation : exist_EMI_perincome index 210 | 211 | train_1 <- cbind(train_1,exist_EMI_perincome = as.numeric((train_1$Existing_EMI) / (train_1$Monthly_Income+1))) 212 
| test_1 <- cbind(test_1,exist_EMI_perincome = as.numeric((test_1$Existing_EMI) / (test_1$Monthly_Income+1))) 213 | 214 | train_1$exist_EMI_perincome[train_1$exist_EMI_perincome > 1.5] = 1.5 215 | test_1$exist_EMI_perincome[test_1$exist_EMI_perincome > 1.5] = 1.5 216 | 217 | #14) New Variable Creation : exx_EMI_perincome index 218 | 219 | train_1 <- cbind(train_1,exx_EMI_perincome = as.numeric((train_1$EMI_calculated) / (train_1$Monthly_Income+1))) 220 | test_1 <- cbind(test_1,exx_EMI_perincome = as.numeric((test_1$EMI_calculated) / (test_1$Monthly_Income+1))) 221 | 222 | train_1$exx_EMI_perincome[train_1$exx_EMI_perincome > 2] = 2 223 | test_1$exx_EMI_perincome[test_1$exx_EMI_perincome > 2] = 2 224 | 225 | #15) Removal of some Columns 226 | remove_var <- c('Gender','LoggedIn','EMI_Loan_Submitted') 227 | train_1 <- train_1[ , -which(names(train_1) %in% remove_var)] 228 | test_1 <- test_1[ , -which(names(test_1) %in% remove_var)] 229 | 230 | #16) Final file for modelling 231 | write.csv(test_1,file="test_1.csv",row.names=FALSE) 232 | write.csv(train_1,file="train_1.csv",row.names=FALSE) 233 | 234 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/Hack_3x_Modelling.py: -------------------------------------------------------------------------------- 1 | ######### Python code for AV Hack 3.x , Author = Aayush Agrawal ########## 2 | 3 | # Step 1: Importing Libraries 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import preprocessing 9 | from sklearn.metrics import roc_curve, auc 10 | import pandas as pd 11 | from sklearn import ensemble 12 | import random 13 | import xgboost as xgb 14 | 15 | #Step 2 : Defining a 1/0 hard enccoder function 16 | number = preprocessing.LabelEncoder() 17 | 18 | #Step 3 : Importing Train and testing data after preprocessing from R code 19 | train=pd.read_csv('E:/DS/AV Hack 3.x/train_1.csv') 20 | test=pd.read_csv('E:/DS/AV Hack 3.x/test_1.csv') 21 | 22 | #Step 4 : Having a look at the data 23 | train.head() 24 | 25 | 26 | #Step 5 : Converting factor variables in 1/0 encoding and making any missing value -999 27 | def convert(data): 28 | number = preprocessing.LabelEncoder() 29 | data['Lead_Creation_Date'] = number.fit_transform(data.Lead_Creation_Date) 30 | data['is_male'] = number.fit_transform(data.is_male) 31 | data['City'] = number.fit_transform(data.City) 32 | data['Salary_Account'] = number.fit_transform(data.Salary_Account) 33 | data['Employer_Name'] = number.fit_transform(data.Employer_Name) 34 | data['Mobile_Verified'] = number.fit_transform(data.Mobile_Verified) 35 | data['Var1'] = number.fit_transform(data.Var1) 36 | data['Filled_Form'] = number.fit_transform(data.Filled_Form) 37 | data['Device_Type'] = number.fit_transform(data.Device_Type) 38 | data['Var2'] = number.fit_transform(data.Var2) 39 | data['Var5'] = number.fit_transform(data.Var5) 40 | data['Var4'] = number.fit_transform(data.Var4) 41 | data['DOB_month'] = number.fit_transform(data.DOB_month) 42 | data['DOB_year'] = number.fit_transform(data.DOB_year) 43 | data['Lead_Creation_day'] = number.fit_transform(data.Lead_Creation_day) 44 | data['Lead_Creation_month'] = number.fit_transform(data.Lead_Creation_month) 45 | data['Source'] = number.fit_transform(data.Source) 46 | data=data.fillna(-999) 47 | return data 48 | 49 | train=convert(train) 50 | test=convert(test) 51 | 52 | #Step 6 : Running my 1st Model XGB 53 | # Step 
6.i): Defining features for XGB 54 | features=['City', 55 | 'Monthly_Income', 56 | 'Lead_Creation_Date', 57 | 'Loan_Amount_Applied', 58 | 'Loan_Tenure_Applied', 59 | 'Existing_EMI', 60 | 'Employer_Name', 61 | 'Salary_Account', 62 | 'Mobile_Verified', 63 | 'Var5', 64 | 'Var1', 65 | 'Loan_Amount_Submitted', 66 | 'Loan_Tenure_Submitted', 67 | 'Interest_Rate', 68 | 'Processing_Fee', 69 | 'Filled_Form', 70 | 'Device_Type', 71 | 'Var2', 72 | 'Source', 73 | 'Var4', 74 | 'is_male', 75 | 'age', 76 | 'DOB_month', 77 | 'DOB_year', 78 | 'Lead_Creation_day', 79 | 'Lead_Creation_month', 80 | 'EMI_calculated', 81 | 'Future_EMI_perincome', 82 | 'Proces_perct', 83 | 'exist_EMI_perincome', 84 | 'exx_EMI_perincome' 85 | #'Profit_perc' 86 | #'EMI_Loan_Submitted' 87 | ] 88 | 89 | ## Step 6.ii) Preparing data from the features listed 90 | x_train = train[list(features)].values 91 | y_train = train['Disbursed'].values 92 | x_test=test[list(features)].values 93 | 94 | 95 | ## Step 6.iii) Defining Parameters 96 | params = {} 97 | params["objective"] = "binary:logistic" 98 | params["eta"] = 0.01 99 | params["min_child_weight"] = 7 100 | params["subsample"] = 0.7 101 | params["colsample_bytree"] = 0.7 102 | params["scale_pos_weight"] = 0.8 103 | params["silent"] = 0 104 | params["max_depth"] = 4 105 | params["seed"] = 0 106 | params["eval_metric"] = "auc" 107 | 108 | plst = list(params.items()) 109 | num_rounds = 1525 110 | 111 | xgtrain = xgb.DMatrix(x_train,label=y_train,missing=-999) 112 | xgtest = xgb.DMatrix(x_test,missing=-999) 113 | 114 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 115 | 116 | # Step 6.iv) Running the trained model on testing file 117 | pred_test_y_xgb1 = model_xgb.predict(xgtest) 118 | test['Disbursed']=pred_test_y_xgb1 119 | 120 | # Step 6.v) Getting the output 121 | test.to_csv('E:/DS/AV Hack 3.x/Solution_xgb.csv', columns=['ID','Disbursed'],index=False) 122 | 123 | ## Step 7) Running my 2nd Model Random Forest 124 | 125 | # Step 7.i): Defining features for Random Forest 126 | features=['City', 127 | 'Monthly_Income', 128 | 'Lead_Creation_Date', 129 | 'Loan_Amount_Applied', 130 | 'Loan_Tenure_Applied', 131 | 'Existing_EMI', 132 | 'Employer_Name', 133 | 'Salary_Account', 134 | #'Mobile_Verified', 135 | 'Var5', 136 | 'Var1', 137 | 'Loan_Amount_Submitted', 138 | 'Loan_Tenure_Submitted', 139 | 'Interest_Rate', 140 | 'Processing_Fee', 141 | 'Filled_Form', 142 | #'Device_Type', 143 | 'Var2', 144 | 'Source', 145 | 'Var4', 146 | #'is_male', 147 | 'age', 148 | 'DOB_month', 149 | 'DOB_year', 150 | 'Lead_Creation_day', 151 | #'Lead_Creation_month', 152 | 'EMI_calculated', 153 | 'Future_EMI_perincome', 154 | 'Proces_perct', 155 | 'exist_EMI_perincome', 156 | 'exx_EMI_perincome' 157 | #'Profit_perc' 158 | #'EMI_Loan_Submitted' 159 | ] 160 | 161 | ## Step 7.ii) Preparing data from the features listed 162 | x_train = train[list(features)].values 163 | y_train = train['Disbursed'].values 164 | x_test=test[list(features)].values 165 | 166 | 167 | ## Step 7.iii) Running Model : Random Forest , 1000 classifier 168 | rf = ensemble.RandomForestClassifier(n_estimators=1000,min_samples_leaf=50, max_features="auto", n_jobs=4, random_state=0) 169 | rf.fit(x_train, y_train) 170 | 171 | 172 | ## Step 7.iv) Looking at Feature Importance 173 | importances = rf.feature_importances_ 174 | indices = np.argsort(importances) 175 | 176 | ind=[] 177 | for i in indices: 178 | ind.append(features[i]) 179 | 180 | import matplotlib.pyplot as plt 181 | plt.figure(1) 182 | plt.title('Feature Importances') 183 | 
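# np.argsort orders the feature indices from least to most important and `ind`
# holds the matching feature names, so the horizontal bar chart drawn next lists
# features bottom-to-top with the most important one at the top. The plot is
# purely diagnostic; it does not feed into the predictions written out below.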
plt.barh(range(len(indices)), importances[indices], color='b', align='center') 184 | plt.yticks(range(len(indices)),ind) 185 | plt.xlabel('Relative Importance') 186 | plt.show() 187 | 188 | # Step 7.v) Running the trained model on testing file 189 | disbursed = rf.predict_proba(x_test) 190 | test['Disbursed']=disbursed[:,1] 191 | 192 | # Step 7.vi) Getting the output 193 | test.to_csv('E:/DS/AV Hack 3.x/Solution_rf.csv', columns=['ID','Disbursed'],index=False) 194 | 195 | 196 | ### After that Ensemble the model in excel using Rank.avg function and assignment of weight 0.66 to XGBoost model and 0.33 to RF model 197 | 198 | ####### END ####### Happy Learning #### 199 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.X - Predict-customer-worth-for-happy-customer-bank 2 | 3 | http://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802 4 | 5 | ##### My approach for the hackathon is as follows: 6 | 7 | 8 | ###### Data Preprocessing ( R Code) 9 | 1. I looked into levels of data and created a data dictionary by mentioning the level gaps, as I figured out that there is difference in level of data in training and testing data set (Like some cities are only in training dataset but are missing from testing and vice versa) 10 | 11 | 2. Treated City and Employee Name column by removing extra spaces and making proper font 12 | 13 | 3. Removed Extra levels from city by looking at count of cities finally reduced it to 15 levels by making other cities as "Others" 14 | 15 | 4. Removed some extra levels from Employee names, replaced all Employers below 30 cases to "Others" 16 | 17 | 5. Extracted Date, Month and Year from DOB column and then removed DOB because of many levels 18 | 19 | 6. Extracted Day and Month from Lead Creation Date, but kept Lead creation date 20 | 21 | 7. Replaced missing values of Loan Amount and Tenure submitted from Loan Amount and Tenure Applied 22 | 23 | 8. Replaced missing values of Processing Fee to zero 24 | 25 | 9. Imputed missing value of Interest Rate, Loan Amount Submitted and Loan tenure by using bagged imputation from R caret 26 | 27 | 10. Created a new variable of EMI_calculated : E = P×r×(1 + r)n/((1 + r)n - 1) 28 | 29 | 11. Created a new variable of Future_EMI_perincome ratio : (Existing EMI + EMI submitted)/ Monthly Income, restrited value till 2 30 | 31 | 12. Removed outlier from Monthly income by anything greater than 1,000,000 to 1,000,000 32 | 33 | 13. Created a new variable Process_percent : (Processing Fee/ Monthly Income) * 100, restricted it to 40 34 | 35 | 14. Created two variables exist_EMI_perincome(Existing EMI / Monthly income) and exx_EMI_perincome (EMI_calculated/ Monthly income) 36 | 37 | ###### Modelling (Python) 38 | 39 | 1. Used Extreme Gradient boosting and optimized the tuning parameterss based on local CV score, as many solutions on LB were proven to be overfitting in Weekender version 40 | 41 | 2. Final XGB model had a Local CV(4-Fold) score of 0.854141 +- 0.004308 and a LB rating of 0.85456 42 | 43 | 3. Used a Random forest classifier(1000 trees) and tuned it on a 75:25 approach 44 | 45 | 4. Final RF model was having local score of 0.84233 and a LB rating of 0.85213 46 | 47 | 5. Finally used Rank Average Ensembing for the final solution. 
Weights (2*XGB_score + Rf_score)/3 48 | 49 | -------------------------------------------------------------------------------- /AV-Hackathon-3/AV_wog.R: -------------------------------------------------------------------------------- 1 | #################### AV Hackathon 3#################### 2 | 3 | #setting working library 4 | setwd("E:/DS/AV wog") 5 | 6 | #loading libraries 7 | library(caret) 8 | library(randomForest) 9 | 10 | #reading the files 11 | train=read.csv("train.csv") 12 | test=read.csv("test.csv") 13 | 14 | str(train) 15 | str(test) 16 | 17 | 18 | # removing some unwanted variables 19 | remove_var <- c("institute_country") 20 | train <- train[ , -which(names(train) %in% remove_var)] 21 | test <- test[ , -which(names(test) %in% remove_var)] 22 | test <- test[,-26] 23 | 24 | #run feature engineering file now then come back 25 | 26 | remove_var <- c("Var15","institute_city","institute_state","subject_area","secondary_area") 27 | train_1 <- train_1[ , -which(names(train_1) %in% remove_var)] 28 | test_1 <- test_1[ , -which(names(test_1) %in% remove_var)] 29 | remove(test) 30 | remove(train) 31 | 32 | # R part split check 33 | library(rpart) 34 | r_part=rpart(Project_Valuation~project_subject,data=train_1) 35 | summary(r_part) 36 | r_part 37 | 38 | # R part split with RF 39 | train_2_1 <- train_1[train_1$Similar_Project_Valuation_other_institute <549 & train_1$Project_Valuation<5750,] 40 | train_2_2 <- train_1[train_1$Similar_Project_Valuation_other_institute >=549 & train_1$Project_Valuation < 5750,] 41 | test_2_1 <- test_1[test_1$Similar_Project_Valuation_other_institute <549,] 42 | test_2_2 <- test_1[test_1$Similar_Project_Valuation_other_institute >=549,] 43 | 44 | rf_2_1 <- randomForest(Project_Valuation ~.,data=train_2_1[,-c(1)], ntree=100, norm.votes=FALSE,importance = TRUE,do.trace = 1,nodesize = 50) 45 | rf_2_2 <- randomForest(Project_Valuation ~.,data=train_2_2[,-c(1)], ntree=100, norm.votes=FALSE,importance = TRUE,do.trace = 1,nodesize = 50) 46 | save(rf_2_1,file='rf_2_1.RData') 47 | save(rf_2_2,file='rf3_2_1.RData') 48 | 49 | load('rf_2_1.RData') 50 | load('rf3_2_1.RData') 51 | test_2_1$Project_Valuation=0 52 | test_2_2$Project_Valuation=0 53 | 54 | pred_2_1= predict(rf_2_1,test_2_1) 55 | pred_2_2= predict(rf_2_2,test_2_2) 56 | test_2_1$Project_Valuation=pred_2_1 57 | test_2_2$Project_Valuation=pred_2_2 58 | submit_2_1<-data.frame(ID=test_2_1$ID,Project_Valuation_rf=test_2_1$Project_Valuation) 59 | submit_2_2<-data.frame(ID=test_2_2$ID,Project_Valuation_rf=test_2_2$Project_Valuation) 60 | submit_rf <- rbind(submit_2_1,submit_2_2) 61 | 62 | write.csv(submit_rf,file="submit_rf_rpart.csv",row.names=FALSE) 63 | 64 | -------------------------------------------------------------------------------- /AV-Hackathon-3/Feature Engineering.R: -------------------------------------------------------------------------------- 1 | # Feature Engineering AV Hackathon 3 2 | 3 | #a) Creating Dummy Variable of class factors VAR15 4 | #a.i) is_HAXXF 5 | 6 | is_HAXXF <- function(x) { 7 | if(x == "HAXXF") { 8 | y <- 1 9 | } else { 10 | y <- 0 11 | } 12 | return(y) 13 | } 14 | train_1 <- cbind(train,is_HAXXF = as.factor(mapply(is_HAXXF,train$Var15))) 15 | test_1 <- cbind(test,is_HAXXF = as.factor(mapply(is_HAXXF,test$Var15))) 16 | 17 | #a.ii) is_HAXXC 18 | 19 | is_HAXXC <- function(x) { 20 | if(x == "HAXXC") { 21 | y <- 1 22 | } else { 23 | y <- 0 24 | } 25 | return(y) 26 | } 27 | train_1 <- cbind(train_1,is_HAXXC = as.factor(mapply(is_HAXXC,train_1$Var15))) 28 | test_1 <- cbind(test_1,is_HAXXC = 
as.factor(mapply(is_HAXXC,test_1$Var15))) 29 | 30 | #a.iii) is_HATEM 31 | 32 | is_HATEM <- function(x) { 33 | if(x == "HATEM") { 34 | y <- 1 35 | } else { 36 | y <- 0 37 | } 38 | return(y) 39 | } 40 | train_1 <- cbind(train_1,is_HATEM = as.factor(mapply(is_HATEM,train_1$Var15))) 41 | test_1 <- cbind(test_1,is_HATEM = as.factor(mapply(is_HATEM,test_1$Var15))) 42 | 43 | #a.ii) is_HATFD 44 | 45 | is_HATFD <- function(x) { 46 | if(x == "HATFD") { 47 | y <- 1 48 | } else { 49 | y <- 0 50 | } 51 | return(y) 52 | } 53 | train_1 <- cbind(train_1,is_HATFD = as.factor(mapply(is_HATFD,train_1$Var15))) 54 | test_1 <- cbind(test_1,is_HATFD = as.factor(mapply(is_HATFD,test_1$Var15))) 55 | 56 | 57 | 58 | 59 | 60 | ############################################################################## 61 | #b) Creating Dummy Variable of class factors institute_state 62 | #b.i) is_CT 63 | 64 | is_CT <- function(x) { 65 | if(x == "CT") { 66 | y <- 1 67 | } else { 68 | y <- 0 69 | } 70 | return(y) 71 | } 72 | train_1 <- cbind(train_1,is_CT = as.factor(mapply(is_CT,train_1$institute_state))) 73 | test_1 <- cbind(test_1,is_CT = as.factor(mapply(is_CT,test_1$institute_state))) 74 | 75 | #b.ii) is_DC 76 | 77 | is_DC <- function(x) { 78 | if(x == "DC") { 79 | y <- 1 80 | } else { 81 | y <- 0 82 | } 83 | return(y) 84 | } 85 | train_1 <- cbind(train_1,is_DC = as.factor(mapply(is_DC,train_1$institute_state))) 86 | test_1 <- cbind(test_1,is_DC = as.factor(mapply(is_DC,test_1$institute_state))) 87 | 88 | #b.iii) is_DE 89 | 90 | is_DE <- function(x) { 91 | if(x == "DE") { 92 | y <- 1 93 | } else { 94 | y <- 0 95 | } 96 | return(y) 97 | } 98 | train_1 <- cbind(train_1,is_DE = as.factor(mapply(is_DE,train_1$institute_state))) 99 | test_1 <- cbind(test_1,is_DE = as.factor(mapply(is_DE,test_1$institute_state))) 100 | 101 | #b.iv) is_FL 102 | 103 | is_FL <- function(x) { 104 | if(x == "FL") { 105 | y <- 1 106 | } else { 107 | y <- 0 108 | } 109 | return(y) 110 | } 111 | train_1 <- cbind(train_1,is_FL = as.factor(mapply(is_FL,train_1$institute_state))) 112 | test_1 <- cbind(test_1,is_FL = as.factor(mapply(is_FL,test_1$institute_state))) 113 | 114 | #b.v) is_GA 115 | 116 | is_GA <- function(x) { 117 | if(x == "GA") { 118 | y <- 1 119 | } else { 120 | y <- 0 121 | } 122 | return(y) 123 | } 124 | train_1 <- cbind(train_1,is_GA = as.factor(mapply(is_GA,train_1$institute_state))) 125 | test_1 <- cbind(test_1,is_GA = as.factor(mapply(is_GA,test_1$institute_state))) 126 | 127 | #b.vi) is_KS 128 | 129 | is_KS <- function(x) { 130 | if(x == "KS") { 131 | y <- 1 132 | } else { 133 | y <- 0 134 | } 135 | return(y) 136 | } 137 | train_1 <- cbind(train_1,is_KS = as.factor(mapply(is_KS,train_1$institute_state))) 138 | test_1 <- cbind(test_1,is_KS = as.factor(mapply(is_KS,test_1$institute_state))) 139 | 140 | #b.vi) is_KY 141 | 142 | is_KY <- function(x) { 143 | if(x == "KY") { 144 | y <- 1 145 | } else { 146 | y <- 0 147 | } 148 | return(y) 149 | } 150 | train_1 <- cbind(train_1,is_KY = as.factor(mapply(is_KY,train_1$institute_state))) 151 | test_1 <- cbind(test_1,is_KY = as.factor(mapply(is_KY,test_1$institute_state))) 152 | 153 | #b.vii) is_MA 154 | 155 | is_MA <- function(x) { 156 | if(x == "MA") { 157 | y <- 1 158 | } else { 159 | y <- 0 160 | } 161 | return(y) 162 | } 163 | train_1 <- cbind(train_1,is_MA = as.factor(mapply(is_MA,train_1$institute_state))) 164 | test_1 <- cbind(test_1,is_MA = as.factor(mapply(is_MA,test_1$institute_state))) 165 | 166 | #b.viii) is_MD 167 | 168 | is_MD <- function(x) { 169 | if(x == "MD") { 170 | y <- 1 171 | } 
else { 172 | y <- 0 173 | } 174 | return(y) 175 | } 176 | train_1 <- cbind(train_1,is_MD = as.factor(mapply(is_MD,train_1$institute_state))) 177 | test_1 <- cbind(test_1,is_MD = as.factor(mapply(is_MD,test_1$institute_state))) 178 | 179 | #b.ix) is_ME 180 | 181 | is_ME <- function(x) { 182 | if(x == "ME") { 183 | y <- 1 184 | } else { 185 | y <- 0 186 | } 187 | return(y) 188 | } 189 | train_1 <- cbind(train_1,is_ME = as.factor(mapply(is_ME,train_1$institute_state))) 190 | test_1 <- cbind(test_1,is_ME = as.factor(mapply(is_ME,test_1$institute_state))) 191 | 192 | #b.x) is_MI 193 | 194 | is_MI <- function(x) { 195 | if(x == "MI") { 196 | y <- 1 197 | } else { 198 | y <- 0 199 | } 200 | return(y) 201 | } 202 | train_1 <- cbind(train_1,is_MI = as.factor(mapply(is_MI,train_1$institute_state))) 203 | test_1 <- cbind(test_1,is_MI = as.factor(mapply(is_MI,test_1$institute_state))) 204 | 205 | #b.xi) is_MN 206 | 207 | is_MN <- function(x) { 208 | if(x == "MN") { 209 | y <- 1 210 | } else { 211 | y <- 0 212 | } 213 | return(y) 214 | } 215 | train_1 <- cbind(train_1,is_MN = as.factor(mapply(is_MN,train_1$institute_state))) 216 | test_1 <- cbind(test_1,is_MN = as.factor(mapply(is_MN,test_1$institute_state))) 217 | 218 | #b.xii) is_MS 219 | 220 | is_MS <- function(x) { 221 | if(x == "MS") { 222 | y <- 1 223 | } else { 224 | y <- 0 225 | } 226 | return(y) 227 | } 228 | train_1 <- cbind(train_1,is_MS = as.factor(mapply(is_MS,train_1$institute_state))) 229 | test_1 <- cbind(test_1,is_MS = as.factor(mapply(is_MS,test_1$institute_state))) 230 | 231 | #b.xiii) is_NH 232 | 233 | is_NH <- function(x) { 234 | if(x == "NH") { 235 | y <- 1 236 | } else { 237 | y <- 0 238 | } 239 | return(y) 240 | } 241 | train_1 <- cbind(train_1,is_NH = as.factor(mapply(is_NH,train_1$institute_state))) 242 | test_1 <- cbind(test_1,is_NH = as.factor(mapply(is_NH,test_1$institute_state))) 243 | 244 | #b.xiv) is_NJ 245 | 246 | is_NJ <- function(x) { 247 | if(x == "NJ") { 248 | y <- 1 249 | } else { 250 | y <- 0 251 | } 252 | return(y) 253 | } 254 | train_1 <- cbind(train_1,is_NJ = as.factor(mapply(is_NJ,train_1$institute_state))) 255 | test_1 <- cbind(test_1,is_NJ = as.factor(mapply(is_NJ,test_1$institute_state))) 256 | 257 | #b.xiv) is_NY 258 | 259 | is_NY <- function(x) { 260 | if(x == "NY") { 261 | y <- 1 262 | } else { 263 | y <- 0 264 | } 265 | return(y) 266 | } 267 | train_1 <- cbind(train_1,is_NY = as.factor(mapply(is_NY,train_1$institute_state))) 268 | test_1 <- cbind(test_1,is_NY = as.factor(mapply(is_NY,test_1$institute_state))) 269 | 270 | #b.xv) is_OH 271 | 272 | is_OH <- function(x) { 273 | if(x == "OH") { 274 | y <- 1 275 | } else { 276 | y <- 0 277 | } 278 | return(y) 279 | } 280 | train_1 <- cbind(train_1,is_OH = as.factor(mapply(is_OH,train_1$institute_state))) 281 | test_1 <- cbind(test_1,is_OH = as.factor(mapply(is_OH,test_1$institute_state))) 282 | 283 | #b.xv) is_PA 284 | 285 | is_PA <- function(x) { 286 | if(x == "PA") { 287 | y <- 1 288 | } else { 289 | y <- 0 290 | } 291 | return(y) 292 | } 293 | train_1 <- cbind(train_1,is_PA = as.factor(mapply(is_PA,train_1$institute_state))) 294 | test_1 <- cbind(test_1,is_PA = as.factor(mapply(is_PA,test_1$institute_state))) 295 | 296 | #b.xvi) is_RI 297 | 298 | is_RI <- function(x) { 299 | if(x == "RI") { 300 | y <- 1 301 | } else { 302 | y <- 0 303 | } 304 | return(y) 305 | } 306 | train_1 <- cbind(train_1,is_RI = as.factor(mapply(is_RI,train_1$institute_state))) 307 | test_1 <- cbind(test_1,is_RI = as.factor(mapply(is_RI,test_1$institute_state))) 308 | 309 | #b.xvii) 
is_TN 310 | 311 | is_TN <- function(x) { 312 | if(x == "TN") { 313 | y <- 1 314 | } else { 315 | y <- 0 316 | } 317 | return(y) 318 | } 319 | train_1 <- cbind(train_1,is_TN = as.factor(mapply(is_TN,train_1$institute_state))) 320 | test_1 <- cbind(test_1,is_TN = as.factor(mapply(is_TN,test_1$institute_state))) 321 | 322 | #b.xviii) is_VA 323 | 324 | is_VA <- function(x) { 325 | if(x == "VA") { 326 | y <- 1 327 | } else { 328 | y <- 0 329 | } 330 | return(y) 331 | } 332 | train_1 <- cbind(train_1,is_VA = as.factor(mapply(is_VA,train_1$institute_state))) 333 | test_1 <- cbind(test_1,is_VA = as.factor(mapply(is_VA,test_1$institute_state))) 334 | 335 | #b.xix) is_VT 336 | 337 | is_VT <- function(x) { 338 | if(x == "VT") { 339 | y <- 1 340 | } else { 341 | y <- 0 342 | } 343 | return(y) 344 | } 345 | train_1 <- cbind(train_1,is_VT = as.factor(mapply(is_VT,train_1$institute_state))) 346 | test_1 <- cbind(test_1,is_VT = as.factor(mapply(is_VT,test_1$institute_state))) 347 | 348 | #b.xx) is_WV 349 | 350 | is_WV <- function(x) { 351 | if(x == "WV") { 352 | y <- 1 353 | } else { 354 | y <- 0 355 | } 356 | return(y) 357 | } 358 | train_1 <- cbind(train_1,is_WV = as.factor(mapply(is_WV,train_1$institute_state))) 359 | test_1 <- cbind(test_1,is_WV = as.factor(mapply(is_WV,test_1$institute_state))) 360 | 361 | 362 | 363 | ############################################################################## 364 | #c) Creating Dummy Variable of class factors var8 365 | #c.i) is_rural 366 | is_rural <- function(x) { 367 | if(x == "HXYJ" | x == "HXYK" | x == "HXYL") { 368 | y <- 1 369 | } else { 370 | y <- 0 371 | } 372 | return(y) 373 | } 374 | train_1 <- cbind(train_1,is_rural = as.factor(mapply(is_rural,train_1$Var8))) 375 | test_1 <- cbind(test_1,is_rural = as.factor(mapply(is_rural,test_1$Var8))) 376 | 377 | #c.ii) is_urban 378 | is_urban <- function(x) { 379 | if(x == "HXYB" | x == "HXYC" | x == "HXYD" | x == "HXYE") { 380 | y <- 1 381 | } else { 382 | y <- 0 383 | } 384 | return(y) 385 | } 386 | train_1 <- cbind(train_1,is_urban = as.factor(mapply(is_urban,train_1$Var8))) 387 | test_1 <- cbind(test_1,is_urban = as.factor(mapply(is_urban,test_1$Var8))) 388 | 389 | #c.iii) is_suburban 390 | is_suburban <- function(x) { 391 | if(x == "HXYG" | x == "HXYF" | x == "HXYH" | x == "HXYI") { 392 | y <- 1 393 | } else { 394 | y <- 0 395 | } 396 | return(y) 397 | } 398 | train_1 <- cbind(train_1,is_suburban = as.factor(mapply(is_suburban,train_1$Var8))) 399 | test_1 <- cbind(test_1,is_suburban = as.factor(mapply(is_suburban,test_1$Var8))) 400 | 401 | #c.iv) is_other 402 | is_other <- function(x) { 403 | if(x == "HXYM" | x == "HXYN" | x == "HXYO") { 404 | y <- 1 405 | } else { 406 | y <- 0 407 | } 408 | return(y) 409 | } 410 | train_1 <- cbind(train_1,is_other = as.factor(mapply(is_other,train_1$Var8))) 411 | test_1 <- cbind(test_1,is_other = as.factor(mapply(is_other,test_1$Var8))) 412 | 413 | ############################################################################## 414 | #f) Creating Dummy Variable of class factors subject_area 415 | #f.i) is_AL 416 | is_Applearn <- function(x) { 417 | if(x == "Applied Learning") { 418 | y <- 1 419 | } else { 420 | y <- 0 421 | } 422 | return(y) 423 | } 424 | train_1 <- cbind(train_1,is_Applearn = as.factor(mapply(is_Applearn,train_1$subject_area))) 425 | test_1 <- cbind(test_1,is_Applearn = as.factor(mapply(is_Applearn,test_1$subject_area))) 426 | 427 | #f.ii) is_Heaspo 428 | is_Heaspo <- function(x) { 429 | if(x == "Health & Sports") { 430 | y <- 1 431 | } else { 432 | y <- 0 
433 | } 434 | return(y) 435 | } 436 | train_1 <- cbind(train_1,is_Heaspo = as.factor(mapply(is_Heaspo,train_1$subject_area))) 437 | test_1 <- cbind(test_1,is_Heaspo = as.factor(mapply(is_Heaspo,test_1$subject_area))) 438 | 439 | #f.iii) is_HeaCiv 440 | is_HeaCiv <- function(x) { 441 | if(x == "History & Civics") { 442 | y <- 1 443 | } else { 444 | y <- 0 445 | } 446 | return(y) 447 | } 448 | train_1 <- cbind(train_1,is_HeaCiv = as.factor(mapply(is_HeaCiv,train_1$subject_area))) 449 | test_1 <- cbind(test_1,is_HeaCiv = as.factor(mapply(is_HeaCiv,test_1$subject_area))) 450 | 451 | #f.iv) is_LitLan 452 | is_LitLan <- function(x) { 453 | if(x == "Literacy & Language") { 454 | y <- 1 455 | } else { 456 | y <- 0 457 | } 458 | return(y) 459 | } 460 | train_1 <- cbind(train_1,is_LitLan = as.factor(mapply(is_LitLan,train_1$subject_area))) 461 | test_1 <- cbind(test_1,is_LitLan = as.factor(mapply(is_LitLan,test_1$subject_area))) 462 | 463 | #f.v) is_MathSci 464 | is_MathSci <- function(x) { 465 | if(x == "Math & Science") { 466 | y <- 1 467 | } else { 468 | y <- 0 469 | } 470 | return(y) 471 | } 472 | train_1 <- cbind(train_1,is_MathSci = as.factor(mapply(is_MathSci,train_1$subject_area))) 473 | test_1 <- cbind(test_1,is_MathSci = as.factor(mapply(is_MathSci,test_1$subject_area))) 474 | 475 | #f.vi) is_MuArt 476 | is_MuArt <- function(x) { 477 | if(x == "Music & The Arts") { 478 | y <- 1 479 | } else { 480 | y <- 0 481 | } 482 | return(y) 483 | } 484 | train_1 <- cbind(train_1,is_MuArt = as.factor(mapply(is_MuArt,train_1$subject_area))) 485 | test_1 <- cbind(test_1,is_MuArt = as.factor(mapply(is_MuArt,test_1$subject_area))) 486 | 487 | #f.vii) is_SpeNee 488 | is_SpeNee <- function(x) { 489 | if(x == "Special Needs") { 490 | y <- 1 491 | } else { 492 | y <- 0 493 | } 494 | return(y) 495 | } 496 | train_1 <- cbind(train_1,is_SpeNee = as.factor(mapply(is_SpeNee,train_1$subject_area))) 497 | test_1 <- cbind(test_1,is_SpeNee = as.factor(mapply(is_SpeNee,test_1$subject_area))) 498 | 499 | ############################################################################## 500 | #g) Creating Dummy Variable of class factors secondary_subject 501 | #g.i) is_AL 502 | is_sApplearn <- function(x) { 503 | if(x == "Applied Learning") { 504 | y <- 1 505 | } else { 506 | y <- 0 507 | } 508 | return(y) 509 | } 510 | train_1 <- cbind(train_1,is_sApplearn = as.factor(mapply(is_sApplearn,train_1$secondary_area))) 511 | test_1 <- cbind(test_1,is_sApplearn = as.factor(mapply(is_sApplearn,test_1$secondary_area))) 512 | 513 | #g.ii) is_Heaspo 514 | is_sHeaspo <- function(x) { 515 | if(x == "Health & Sports") { 516 | y <- 1 517 | } else { 518 | y <- 0 519 | } 520 | return(y) 521 | } 522 | train_1 <- cbind(train_1,is_sHeaspo = as.factor(mapply(is_sHeaspo,train_1$secondary_area))) 523 | test_1 <- cbind(test_1,is_sHeaspo = as.factor(mapply(is_sHeaspo,test_1$secondary_area))) 524 | 525 | #g.iii) is_HeaCiv 526 | is_sHeaCiv <- function(x) { 527 | if(x == "History & Civics") { 528 | y <- 1 529 | } else { 530 | y <- 0 531 | } 532 | return(y) 533 | } 534 | train_1 <- cbind(train_1,is_sHeaCiv = as.factor(mapply(is_sHeaCiv,train_1$secondary_area))) 535 | test_1 <- cbind(test_1,is_sHeaCiv = as.factor(mapply(is_sHeaCiv,test_1$secondary_area))) 536 | 537 | #g.iv) is_sLitLan 538 | is_sLitLan <- function(x) { 539 | if(x == "Literacy & Language") { 540 | y <- 1 541 | } else { 542 | y <- 0 543 | } 544 | return(y) 545 | } 546 | train_1 <- cbind(train_1,is_sLitLan = as.factor(mapply(is_sLitLan,train_1$secondary_area))) 547 | test_1 <- 
cbind(test_1,is_sLitLan = as.factor(mapply(is_sLitLan,test_1$secondary_area))) 548 | 549 | #g.v) is_sMathSci 550 | is_sMathSci <- function(x) { 551 | if(x == "Math & Science") { 552 | y <- 1 553 | } else { 554 | y <- 0 555 | } 556 | return(y) 557 | } 558 | train_1 <- cbind(train_1,is_sMathSci = as.factor(mapply(is_sMathSci,train_1$secondary_area))) 559 | test_1 <- cbind(test_1,is_sMathSci = as.factor(mapply(is_sMathSci,test_1$secondary_area))) 560 | 561 | #f.vi) is_sMuArt 562 | is_sMuArt <- function(x) { 563 | if(x == "Music & The Arts") { 564 | y <- 1 565 | } else { 566 | y <- 0 567 | } 568 | return(y) 569 | } 570 | train_1 <- cbind(train_1,is_sMuArt = as.factor(mapply(is_sMuArt,train_1$secondary_area))) 571 | test_1 <- cbind(test_1,is_sMuArt = as.factor(mapply(is_sMuArt,test_1$secondary_area))) 572 | 573 | #f.vii) is_SpeNee 574 | is_sSpeNee <- function(x) { 575 | if(x == "Special Needs") { 576 | y <- 1 577 | } else { 578 | y <- 0 579 | } 580 | return(y) 581 | } 582 | train_1 <- cbind(train_1,is_sSpeNee = as.factor(mapply(is_sSpeNee,train_1$secondary_area))) 583 | test_1 <- cbind(test_1,is_sSpeNee = as.factor(mapply(is_sSpeNee,test_1$secondary_area))) 584 | 585 | #f.viii) is_SNull 586 | is_SNull <- function(x) { 587 | if(x == "") { 588 | y <- 1 589 | } else { 590 | y <- 0 591 | } 592 | return(y) 593 | } 594 | train_1 <- cbind(train_1,is_SNull = as.factor(mapply(is_SNull,train_1$secondary_area))) 595 | test_1 <- cbind(test_1,is_SNull = as.factor(mapply(is_SNull,test_1$secondary_area))) 596 | 597 | 598 | 599 | ############################################################################## 600 | # Level matching 601 | levels(test_1$Var4) <- levels(train_1$Var4) 602 | levels(test_1$Var10) <- levels(train_1$Var10) 603 | levels(test_1$Var8) <- levels(train_1$Var8) 604 | levels(test_1$Var11) <- levels(train_1$Var11) 605 | levels(test_1$Var12) <- levels(train_1$Var12) 606 | levels(test_1$Var13) <- levels(train_1$Var13) 607 | levels(test_1$Var14) <- levels(train_1$Var14) 608 | levels(test_1$Instructor_Past_Performance) <- levels(train_1$Instructor_Past_Performance) 609 | levels(test_1$Instructor_Association_Industry_Expert) <- levels(train_1$Instructor_Association_Industry_Expert) 610 | levels(test_1$project_subject) <- levels(train_1$project_subject) 611 | levels(test_1$subject_area) <- levels(train_1$subject_area) 612 | levels(test_1$secondary_subject) <- levels(train_1$secondary_subject) 613 | levels(test_1$secondary_area) <- levels(train_1$secondary_area) 614 | levels(test_1$Resource_Category) <- levels(train_1$Resource_Category) 615 | levels(test_1$Resource_Sub_Category) <- levels(train_1$Resource_Sub_Category) 616 | levels(test_1$Var23) <- levels(train_1$Var23) 617 | levels(test_1$Var24) <- levels(train_1$Var24) 618 | levels(test_1$is_NH) <- levels(train_1$is_NH) 619 | levels(test_1$is_rural) <- levels(train_1$is_rural) 620 | levels(test_1$is_urban) <- levels(train_1$is_urban) 621 | levels(test_1$is_suburban) <- levels(train_1$is_suburban) 622 | levels(test_1$is_other) <- levels(train_1$is_other) 623 | -------------------------------------------------------------------------------- /AV-Hackathon-3/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong ! 2 | 3 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838 4 | 5 | ###### My approach for the hackathon is as follows: 6 | 7 | 1. 
I looked into levels of data and created a data dictionary by mentioning the level gaps, as I figured out that there is difference in level of data in training and testing data set (Like some cities are only in training dataset but are missing from testing and vice versa) 8 | 9 | 2. Ran a simple linear model to see if some of the greater number of level categories are impacting the funding and found that state column have some impact on the valuation 10 | 11 | 3. Converted some of the categorical variables into 1/0 encoded variables 12 | 13 | 4. Ran R part over Similar project valuation to see it's impact on subsequent funding and found that there is significant shift in mean values with Similar project valuation >$549 and <$549 14 | 15 | 5. Made two Random forest models with Similar project valuation >$549 and <$549, simply merged there result for the final output 16 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/FactorVariables.R: -------------------------------------------------------------------------------- 1 | ######### 2 | ### All the factor variables(with >2 levels ) are finetuned using the following steps: 3 | ### 1. Select a factor variable and create an XGB model and rank the important features(levels) 4 | ### 2. keep the most important levelsand merge the rest in to a single level "Others" 5 | ### 3. Include a subset of this levels into the final model depending on their effect. 6 | ######### 7 | 8 | 9 | ########################################## 10 | ######## BANKS / SALARY ACCOUNT ################### 11 | ########################################## 12 | banks = as.data.frame(model.matrix(~0 + Salary_Account, loans)) 13 | bankstest = as.data.frame(model.matrix(~0 + Salary_Account, loanstest)) 14 | 15 | xgb2 <- xgboost(data = as.matrix(banks), 16 | label = dispnum, 17 | nrounds = 1930, max_depth = 4 ,eta = 0.01, 18 | objective = "binary:logistic", verbose=1) 19 | m = xgb.importance(feature_names = colnames(banks),model = xgb2) 20 | xgb.plot.importance(m) 21 | 22 | banksorder = m$Feature 23 | banksordertest = intersect(banksorder,colnames(bankstest)) 24 | rembanks = setdiff(colnames(banks) , banksorder) 25 | rembankstest = setdiff(colnames(bankstest) , banksordertest) 26 | uselessbanks = banks[rembanks] 27 | uselessbankstest = bankstest[rembankstest] 28 | banks = banks[banksorder] 29 | bankstest = bankstest[banksordertest] 30 | 31 | banks$Otherbanks = rowSums(cbind(banks[,12:ncol(banks)],uselessbanks)) 32 | bankstest$Otherbanks = rowSums(cbind(bankstest[,12:ncol(bankstest)],uselessbankstest)) 33 | banks = banks[,-(12:(ncol(banks)-1))] 34 | bankstest = bankstest[,-(12:(ncol(bankstest)-1))] 35 | write.csv(banks, "banks.csv", row.names=FALSE) 36 | write.csv(bankstest, "bankstest.csv", row.names=FALSE) 37 | rm(uselessbanks,uselessbankstest) 38 | 39 | ########################################## 40 | ######## DATE OF LEAD ################### 41 | ########################################## 42 | 43 | ###### EXTRACTING ONLY MONTHS AND ADDING TO TRAIN AND TEST DATA SET##### 44 | date = as.character(loans$Lead_Creation_Date) 45 | doj = strptime(date, format = "%d-%b-%Y") 46 | months = as.factor(format(doj,'%b')) 47 | days = as.numeric((doj - min(doj))/86400) 48 | ### 49 | datetest = as.character(loanstest$Lead_Creation_Date) 50 | dojtest = strptime(datetest, format = "%d-%b-%Y") 51 | monthstest = as.factor(format(dojtest,'%b')) 52 | daystest = as.numeric((dojtest - min(dojtest))/86400) 53 | 54 | Train = 
cbind(Train,month = months) 55 | Test = cbind(Test,month = monthstest) 56 | 57 | Train = cbind(Train,model.matrix(~0 + month, Train)) 58 | Test = cbind(Test,model.matrix(~0 + month, Test)) 59 | 60 | Train$month = NULL 61 | Test$month = NULL 62 | ############################# 63 | 64 | dates = as.data.frame(model.matrix(~0 + Lead_Creation_Date, loans)) 65 | datestest = as.data.frame(model.matrix(~0 + Lead_Creation_Date, loanstest)) 66 | 67 | xgb2 <- xgboost(data = as.matrix(dates), 68 | label = dispnum, 69 | nrounds = 727, max_depth = 4 ,eta = 0.01, 70 | objective = "binary:logistic", verbose=1) 71 | m = xgb.importance(feature_names = colnames(dates),model = xgb2) 72 | xgb.plot.importance(m) 73 | 74 | datesorder = m$Feature 75 | remdates = setdiff(colnames(dates) , datesorder) 76 | uselessdates = dates[remdates] 77 | uselessdatestest = datestest[remdates] 78 | dates = dates[datesorder] 79 | datestest = datestest[datesorder] 80 | 81 | dates$Otherdates = rowSums(cbind(dates[,28:35],uselessdates)) 82 | datestest$Otherdates = rowSums(cbind(datestest[,28:35],uselessdatestest)) 83 | dates = dates[,-(28:(ncol(dates)-1))] 84 | datestest = datestest[,-(28:(ncol(datestest)-1))] 85 | write.csv(dates, "dates.csv", row.names=FALSE) 86 | write.csv(datestest, "datestest.csv", row.names=FALSE) 87 | rm(uselessdates,uselessdatestest) 88 | 89 | ########################################## 90 | ######## CITY ################### 91 | ########################################## 92 | 93 | city = as.data.frame(model.matrix(~0 + City, loans)) 94 | citytest = as.data.frame(model.matrix(~0 + City, loanstest)) 95 | 96 | xgb2 <- xgboost(data = as.matrix(city), 97 | label = dispnum, 98 | nrounds = 588, max_depth = 4 ,eta = 0.01, 99 | objective = "binary:logistic", verbose=1) 100 | m = xgb.importance(feature_names = colnames(city),model = xgb2) 101 | xgb.plot.importance(m) 102 | 103 | cityorder = m$Feature 104 | cityordertest = intersect(cityorder,colnames(citytest)) 105 | remcity = setdiff(colnames(city) , cityorder) 106 | remcitytest = setdiff(colnames(citytest) , cityordertest) 107 | uselesscity = city[remcity] 108 | uselesscitytest = citytest[remcitytest] 109 | city = city[cityorder] 110 | citytest = citytest[cityordertest] 111 | 112 | city$Othercity = rowSums(cbind(city[,12:ncol(city)],uselesscity)) 113 | citytest$Othercity = rowSums(cbind(citytest[,12:ncol(citytest)],uselesscitytest)) 114 | city = city[,-(12:(ncol(city)-1))] 115 | citytest = citytest[,-(12:(ncol(citytest)-1))] 116 | write.csv(city, "city.csv", row.names=FALSE) 117 | write.csv(citytest, "citytest.csv", row.names=FALSE) 118 | rm(uselesscity,uselesscitytest) 119 | 120 | ########################################## 121 | ######## VAR1 ################### 122 | ########################################## 123 | var1 = as.data.frame(model.matrix(~0 + Var1, loans)) 124 | var1test = as.data.frame(model.matrix(~0 + Var1, loanstest)) 125 | 126 | xgb2 <- xgboost(data = as.matrix(var1), 127 | label = dispnum, 128 | nrounds = 740, max_depth = 4 ,eta = 0.01, 129 | objective = "binary:logistic", verbose=1) 130 | m = xgb.importance(feature_names = colnames(var1),model = xgb2) 131 | xgb.plot.importance(m) 132 | 133 | 134 | var1order = m$Feature 135 | remvar1 = setdiff(colnames(var1) , var1order) 136 | uselessvar1 = var1[remvar1] 137 | uselessvar1test = var1test[remvar1] 138 | var1 = var1[var1order] 139 | var1test = var1test[var1order] 140 | var1$Othervar1 = rowSums(cbind(var1[,7:16],uselessvar1)) 141 | var1test$Othervar1 = rowSums(cbind(var1test[,7:16],uselessvar1test)) 
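# Same three-step recipe as for Salary_Account and City above: one-hot encode the
# factor, rank the indicator columns with xgb.importance, keep only the most
# predictive levels (the top 6 of Var1 here) and collapse the remainder into a
# single "Othervar1" column before the redundant indicators are dropped below.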
142 | var1 = var1[,-(7:(ncol(var1)-1))] 143 | var1test = var1test[,-(7:(ncol(var1test)-1))] 144 | write.csv(var1, "var1.csv", row.names=FALSE) 145 | write.csv(var1test, "var1test.csv", row.names=FALSE) 146 | 147 | rm(uselessvar1,uselessvar1test) 148 | 149 | ########################################## 150 | ######## VAR2 ################### 151 | ########################################## 152 | 153 | var2 = as.data.frame(model.matrix(~0 + Var2, loans)) 154 | var2test = as.data.frame(model.matrix(~0 + Var2, loanstest)) 155 | 156 | ''' 157 | xgb2 <- xgboost(data = as.matrix(var2), 158 | label = dispnum, 159 | nrounds = 530, max_depth = 2 ,eta = 0.01, 160 | objective = "binary:logistic", verbose=1) 161 | m = xgb.importance(feature_names = colnames(var2),model = xgb2) 162 | xgb.plot.importance(m) 163 | 164 | var2order = m$Feature 165 | var2 = var2[var2order] 166 | var2test = var2test[var2order] 167 | 168 | var2$Var2AD = (var2$Var2A + var2$Var2D) 169 | var2test$Var2AD = (var2test$Var2A + var2test$Var2D) 170 | var2$Var2EF = (var2$Var2E + var2$Var2F) 171 | var2test$Var2EF = (var2test$Var2E + var2test$Var2F) 172 | var2 = var2[,-c(1,4:6)] 173 | var2test = var2test[,-c(1,4:6)] 174 | ''' 175 | ########################################## 176 | ######## VAR4 ################### 177 | ########################################## 178 | var4 = as.data.frame(model.matrix(~0 + Var4, loans)) 179 | var4test = as.data.frame(model.matrix(~0 + Var4, loanstest)) 180 | 181 | 182 | ########################################## 183 | ######## Source ################### 184 | ########################################## 185 | 186 | Source = as.data.frame(model.matrix(~0 + Source, loans)) 187 | Sourcetest = as.data.frame(model.matrix(~0 + Source, loanstest)) 188 | 189 | xgb2 <- xgboost(data = as.matrix(Source), 190 | label = dispnum, 191 | nrounds = 1230, max_depth = 4 ,eta = 0.01, 192 | objective = "binary:logistic", verbose=1) 193 | m = xgb.importance(feature_names = colnames(Source),model = xgb2) 194 | xgb.plot.importance(m) 195 | 196 | Sourceorder = m$Feature 197 | Sourceorder = Sourceorder 198 | Sourceordertest = intersect(Sourceorder,colnames(Sourcetest)) 199 | remSource = setdiff(colnames(Source) , Sourceorder) 200 | remSourcetest = setdiff(colnames(Sourcetest) , Sourceordertest) 201 | uselessSource = Source[remSource] 202 | uselessSourcetest = Sourcetest[remSourcetest] 203 | Source = Source[Sourceorder] 204 | Sourcetest = Sourcetest[Sourceordertest] 205 | 206 | Source$OtherSource = rowSums(cbind(Source[,13],uselessSource)) 207 | Sourcetest$OtherSource = rowSums(uselessSourcetest) 208 | Source = Source[,-13] 209 | 210 | rm(uselessSource,uselessSourcetest) 211 | write.csv(Source, "Source.csv", row.names=FALSE) 212 | write.csv(Sourcetest, "Sourcetest.csv", row.names=FALSE) 213 | 214 | ########################################## 215 | ######## EMPLOYERS ################### 216 | ########################################## 217 | ###################### 218 | 219 | t = as.data.frame(table(loans$Employer_Name)) 220 | empnames = as.character(tail(t[order(t$Freq),1],26)) ## selecting only 26 employers with max freq 221 | emp = as.character(loans$Employer_Name) 222 | emptest = as.character(loanstest$Employer_Name) 223 | 224 | emp[1:87020] = lapply(1:87020, function(x) ifelse(emp[x] %in% empnames, emp[x],"OtherEmployer" )) 225 | 226 | emptest[1:nrow(loanstest)] = lapply(1:nrow(loanstest), function(x) ifelse(emptest[x] %in% empnames, emptest[x],"OtherEmployer" )) 227 | 228 | emp = as.factor(c(do.call("cbind",emp))) 229 | emptest = 
as.factor(c(do.call("cbind",emptest))) 230 | 231 | Employer = as.data.frame(model.matrix(~0 + emp, loans)) 232 | Employertest = as.data.frame(model.matrix(~0 + emp, loanstest)) 233 | 234 | xgb2 <- xgboost(data = as.matrix(Employer), 235 | label = dispnum, 236 | nrounds = 920, max_depth = 4 ,eta = 0.01, 237 | objective = "binary:logistic", verbose=1) 238 | 239 | m = xgb.importance(feature_names = colnames(Employer),model = xgb2) 240 | xgb.plot.importance(m) 241 | 242 | Employerorder = m$Feature 243 | remEmployer = setdiff(colnames(Employer) , Employerorder) 244 | uselessEmployer = Employer[remEmployer] 245 | uselessEmployertest = Employertest[remEmployer] 246 | Employer = Employer[Employerorder] 247 | Employertest = Employertest[Employerorder] 248 | 249 | Employer$OtherEmployer2 = rowSums(cbind(Employer[,18:(ncol(Employer)-1)],uselessEmployer)) 250 | Employertest$OtherEmployer2 = rowSums(cbind(Employertest[,18:(ncol(Employer)-1)],uselessEmployertest)) 251 | Employer = Employer[,-(18:(ncol(Employer)-1))] 252 | Employertest = Employertest[,-(18:(ncol(Employertest)-1))] 253 | write.csv(Employer, "Employer.csv", row.names=FALSE) 254 | write.csv(Employertest, "Employertest.csv", row.names=FALSE) 255 | ################### 256 | ##################### 257 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/Final_Model.R: -------------------------------------------------------------------------------- 1 | ########################## 2 | ######## After including different features and CV finally obtained 3 | ####### the following features as important 4 | ######################### 5 | k = c(3,6:8,11,12,14:19,23:29) 6 | 7 | train = cbind(Train[,k],banks,city,var1,var4,Source,dates[,1:10],Employer[,1:3],var2[,4]) 8 | test = cbind(Test[,k],bankstest,citytest,var1test,var4test,Sourcetest,datestest[,1:10],Employertest[,1:3],var2test[,4]) 9 | 10 | ####################### 11 | ###### CV ####### 12 | ######################## 13 | 14 | nrounds = 5000 15 | nfolds = 4 16 | ps = list( max_depth = 5 ,eta = 0.01,objective = "binary:logistic") 17 | ms = list( 'auc','rmse') 18 | 19 | cvXgb = xgb.cv(params = ps, data = as.matrix(train) , 20 | label = dispnum, nrounds = nrounds ,nfold = nfolds,showsd = T,metrics = ms,stratified = T, verbose = T,subsample = 0.7 21 | ) 22 | 23 | ####################### 24 | ###### FINAL MODEL ####### 25 | ######################## 26 | xgb2 <- xgboost(data = as.matrix(train), 27 | label = dispnum, 28 | nrounds = 1426, max_depth = 5 ,eta = 0.01, 29 | objective = "binary:logistic", verbose=1,subsample = 0.7) 30 | 31 | 32 | #n = xgb.importance(feature_names = colnames(banks),model = xgb2,data = dates,label = #dispnum) 33 | pred <- predict(xgb2, as.matrix(test,type = 'response')) 34 | 35 | pp2 = data.frame(ID = Test[,1] ,Disbursed = pred) 36 | write.csv(pp2, "samplesub.csv", row.names=FALSE) 37 | 38 | ################ 39 | ################ 40 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/LoadData.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(gplots) 3 | library(caTools) 4 | library(lattice) 5 | library(caret) 6 | library(foreach) 7 | library(Matrix) 8 | library(pROC) 9 | library(ROCR) 10 | library(Rcpp) 11 | library(mice) 12 | library(xgboost) 13 | library(survival) 14 | library(gbm) 15 | library(randomForest) 16 | 17 | ###################### 18 | 
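# LoadData.R: reads Train.csv/Test.csv, converts selected categorical columns to
# factors and the Y/N flags to 0/1, imputes the remaining NAs in two mice()
# passes (loan amount/tenure first, then interest rate, processing fee and EMI),
# and derives an age variable from DOB; the one-hot features used by
# FactorVariables.R and Final_Model.R are built on top of this cleaned data.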
########## Load data 19 | loans = read.csv('Train.csv',stringsAsFactors = F) 20 | loanstest = read.csv('Test.csv',stringsAsFactors = F) 21 | 22 | Train = loans 23 | Train[,c(3,10,11,14,22:26)] = lapply(c(3,10,11,14,22:26), function(x) as.factor(Train[,x])) 24 | 25 | Train$Mobile_Verified = ifelse (Train$Mobile_Verified == "Y",1,0) 26 | Train$Filled_Form = ifelse (Train$Filled_Form == "Y",1,0) 27 | Train$IsMobile = ifelse (Train$Device_Type == "Mobile",1,0) 28 | Train$IsMale = ifelse (Train$Gender == "Male",1,0) 29 | 30 | Train$Gender = NULL 31 | Train$Device_Type = NULL 32 | ########################### 33 | ###### Handling NA Values###################### 34 | ############################ 35 | #### keeping track of Rows with NA 36 | Train$issubmitNA = ifelse (is.na(Train$Loan_Amount_Submitted),1,0) 37 | table(Train$issubmitNA) 38 | 39 | ###### Filling very few NA values in Loan_Amount_Applied && Loan_Tenure_Applied 40 | ###### using mean(remaing data) or Loan_Amount_submitted && Loan_Tenure_Submitted (If available) 41 | Train$Loan_Amount_Applied = ifelse(is.na(Train$Loan_Amount_Applied),ifelse(is.na(Train$Loan_Amount_Submitted),230300,Train$Loan_Amount_Submitted),Train$Loan_Amount_Applied ) 42 | 43 | Train$Loan_Tenure_Applied = ifelse(is.na(Train$Loan_Tenure_Applied),ifelse(is.na(Train$Loan_Tenure_Submitted),2,Train$Loan_Tenure_Submitted),Train$Loan_Tenure_Applied ) 44 | 45 | ####### Multiple Imputation 46 | ####### First imputing Loan_Amount_submitted && Loan_Tenure_Submitted using Loan_Amount_Applied ####### && Loan_Tenure_Applied 47 | 48 | temp = Train[,c(6,7,14,15)] 49 | 50 | set.seed(123) 51 | imputed = complete(mice(temp)) 52 | 53 | Train$Loan_Amount_Submitted = imputed$Loan_Amount_Submitted 54 | Train$Loan_Tenure_Submitted = imputed$Loan_Tenure_Submitted 55 | Train$Existing_EMI = ifelse(is.na(Train$Existing_EMI),0,Train$Existing_EMI) 56 | 57 | 58 | ####### Imputation of Int.Rate, Proc.fee and EMI_Loan_Submitted using already imputed 59 | ####### Loan_Amount_submitted && Loan_Tenure_Submitted 60 | 61 | temp = Train[,c(14,15,16,17,18)] 62 | 63 | set.seed(123) 64 | 65 | imputed = complete(mice(temp)) ### This will takes several minutes 66 | 67 | Train$Interest_Rate = imputed$Interest_Rate 68 | Train$Processing_Fee = imputed$Processing_Fee 69 | Train$EMI_Loan_Submitted = imputed$EMI_Loan_Submitted 70 | 71 | ###### OUTCOME variables 72 | disb = Train$Disbursed 73 | dispnum = as.numeric(as.character(disb)) 74 | Train$Disbursed = NULL 75 | Lin = Train$LoggedIn 76 | Train$LoggedIn = NULL 77 | 78 | ##################################### age variable 79 | dob = strptime(Train$DOB, format = "%d-%b-%Y") 80 | year = format(dob,"%Y") 81 | year = as.numeric(year) 82 | Train$age = 115 - year 83 | ## assuming people with year 0015 have wrongly mentioned their yob as 2015 84 | ## assigning avg value of 30 to 17 such cases in the data 85 | Train$age = ifelse(Train$age == 100, 30, Train$age) 86 | ############################# 87 | 88 | ############# 89 | ############# similarly cleaning test data 90 | ############ 91 | Test = loanstest 92 | Test[,c(3,10,11,14,22:24)] = lapply(c(3,10,11,14,22:24), function(x) as.factor(Test[,x])) 93 | 94 | Test$Mobile_Verified = ifelse (Test$Mobile_Verified == "Y",1,0) 95 | Test$Filled_Form = ifelse (Test$Filled_Form == "Y",1,0) 96 | Test$IsMobile = ifelse (Test$Device_Type == "Mobile",1,0) 97 | Test$IsMale = ifelse (Test$Gender == "Male",1,0) 98 | 99 | Test$Gender = NULL 100 | Test$Device_Type = NULL 101 | 102 | ########## Dealing with NA 103 | ####### keeping track of 
NA rows 104 | Test$issubmitNA = ifelse (is.na(Test$Loan_Amount_Submitted),1,0) 105 | table(Test$issubmitNA) 106 | 107 | ##### Imputation 108 | temp = Test[,c(6,7,14,15)] 109 | set.seed(123) 110 | imputed = complete(mice(temp)) 111 | Test$Loan_Amount_Submitted = imputed$Loan_Amount_Submitted 112 | Test$Loan_Tenure_Submitted = imputed$Loan_Tenure_Submitted 113 | Test$Loan_Amount_Applied = imputed$Loan_Amount_Applied 114 | Test$Loan_Tenure_Applied = imputed$Loan_Tenure_Applied 115 | Test$Existing_EMI = ifelse(is.na(Test$Existing_EMI),0,Test$Existing_EMI) 116 | ##write.csv(Test,"Testpartialimp.csv",row.names = F) 117 | ###### second imputation 118 | temp = Test[,c(14,15,16,17,18)] 119 | set.seed(123) 120 | imputed = complete(mice(temp)) 121 | Test$Interest_Rate = imputed$Interest_Rate 122 | Test$Processing_Fee = imputed$Processing_Fee 123 | Test$EMI_Loan_Submitted = imputed$EMI_Loan_Submitted 124 | ##write.csv(Test,"TestFullimp.csv",row.names = F) 125 | ################## 126 | ################ age variable 127 | dob = strptime(Test$DOB, format = "%d-%b-%Y") 128 | year = format(dob,"%Y") 129 | year = as.numeric(year) 130 | Test$age = 115 - year 131 | Test$age = ifelse(Test$age == 100, 30, Test$age) 132 | ############################# 133 | 134 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/README.md: -------------------------------------------------------------------------------- 1 | # Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- 2 | # Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- 3 | The code corresponds to a single XGB model which gave me a public score of ~0.866 and a private score of ~0.84 4 | for the problem: 5 | http://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802 6 | 7 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/README.md: -------------------------------------------------------------------------------- 1 | # Analytics_Vidhya_3.X_Hackathon 2 | Codes for the Analytics Vidhya Hackathon 3.X 3 | 4 | Both the versions (weekend and weeklong) have shell scripts. Just run them and the solution is generated. 5 | 6 | - The weekend codes have a public LB score: 0.8612 and private LB score: 0.8413. Runtimes ~6 min (4-core machine) - 2nd place 7 | - The weekday codes have a public LB score: 0.8620 and private LB score: 0.8410. 
Runtimes ~ 32 min (4-core machine) - 1st place 8 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_1_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | train = pd.read_csv("Train.csv") 5 | test = pd.read_csv("Test.csv") 6 | submission = pd.read_csv("sample_submission.csv") 7 | print "Train dataset dimensions:", train.shape 8 | print "Test dataset dimensions:", test.shape 9 | 10 | salary_acc = train.Salary_Account.value_counts(dropna=False) 11 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 12 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 13 | 14 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 15 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 16 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 17 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 18 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 19 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 20 | 21 | train['Lifetime'] = pd.to_datetime(train['Lead_Creation_Date']) - pd.to_datetime(train['DOB']) 22 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 23 | 24 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 25 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 26 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 27 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 28 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 29 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 30 | 31 | city = pd.DataFrame(train['City'].value_counts()) 32 | city_rare = list(city[city[0] < 100].index) 33 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 34 | train['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 35 | train['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 36 | 37 | from sklearn.preprocessing import LabelEncoder 38 | le = LabelEncoder() 39 | train['City_encoded'] = le.fit_transform(train['City']) 40 | 41 | empnames = pd.DataFrame(train['Employer_Name'].value_counts()) 42 | empnames_rare = list(empnames[empnames[0]<30].index) 43 | train.ix[train['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 44 | 45 | # # Preprocessing 46 | train2 = train.copy() 47 | 48 | id_train = train['ID'] 49 | label = train2['Disbursed'] 50 | 51 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB', 'Lead_Creation_Date'] 52 | train2.drop(dropCols, axis=1, inplace = True) 53 | 54 | y_train = label 55 | X_train = pd.get_dummies(train2) 56 | 57 | # # Test set preparation 58 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 59 | 60 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 61 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 62 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 63 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 64 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 65 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 66 | 67 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 68 | test['dob_dayofweek'] = 
pd.to_datetime(test['DOB']).dt.dayofweek 69 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 70 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 71 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 72 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 73 | 74 | test['Lifetime'] = pd.to_datetime(test['Lead_Creation_Date']) - pd.to_datetime(test['DOB']) 75 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 76 | 77 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 78 | newcities = list(set(test['City']) - set(train['City'])) 79 | test.ix[test['City'].isin(newcities), 'City'] = np.nan 80 | test['City_encoded'] = le.transform(test['City']) 81 | 82 | test['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 83 | test['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 84 | 85 | test.ix[test['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 86 | 87 | newempnames = list(set(test['Employer_Name']) - set(train['Employer_Name'])) 88 | test.ix[test['Employer_Name'].isin(newempnames), "Employer_Name"] = "Others" 89 | 90 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 91 | test2 = test.drop(testdropcols, axis=1) 92 | 93 | X_test = pd.get_dummies(test2) 94 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 95 | for col in missingCols: 96 | X_test[col] = 0 97 | X_test = X_test[X_train.columns] 98 | assert X_train.columns.equals(X_test.columns) 99 | 100 | X_train.to_csv("train_preprocessed.csv", index = False) 101 | X_test.to_csv("test_preprocessed.csv", index = False) 102 | y_train.to_csv("train_labels.csv", index = False) 103 | test['ID'].to_csv("test_ids.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_2_train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | 6 | train = pd.read_csv("train_preprocessed.csv") 7 | test = pd.read_csv("test_preprocessed.csv") 8 | labels = pd.read_csv("train_labels.csv", header = None) 9 | test_ids = pd.read_csv("test_ids.csv", header = None) 10 | 11 | labels = list(labels.iloc[:,0]) 12 | test_ids = list(test_ids.iloc[:,0]) 13 | 14 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'logloss', 15 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 16 | 'min_child_weight':3, 'max_delta_step':3} 17 | num_rounds = 400 18 | 19 | params['seed'] = 523264626346 # 0.85533 20 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 21 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 22 | # exit() 23 | # [395] cv-test-logloss:0.062599+0.001852 cv-train-logloss:0.042591+0.001435 24 | # [396] cv-test-logloss:0.062594+0.001854 cv-train-logloss:0.042548+0.001437 25 | # [397] cv-test-logloss:0.062595+0.001854 cv-train-logloss:0.042507+0.001445 26 | # [398] cv-test-logloss:0.062601+0.001851 cv-train-logloss:0.042446+0.001435 27 | # [399] cv-test-logloss:0.062603+0.001852 cv-train-logloss:0.042390+0.001416 28 | 29 | 30 | clf = xgb.train(params, dtrain, num_rounds) 31 | dtest = xgb.DMatrix(test, missing = np.nan) 32 | test_preds = clf.predict(dtest) 33 | 34 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds}) 35 | submission = submission[['ID', 'Disbursed']] 36 | submission.to_csv("xgb_final.csv", index = False) 37 | 38 | 
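# A minimal sketch (kept commented out, like the xgb.cv call above) of how a fixed round count
# such as num_rounds = 400 can be re-derived: run xgb.cv and take the round with the lowest mean
# test logloss. This assumes a recent xgboost where xgb.cv returns a pandas DataFrame with a
# 'test-logloss-mean' column; older releases returned the per-round results as strings.
# cv_results = xgb.cv(params, dtrain, num_boost_round=1000, nfold=4, seed=0)
# best_round = int(cv_results['test-logloss-mean'].idxmin()) + 1
# print "Best round:", best_round, "mean test logloss:", cv_results['test-logloss-mean'].min()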
-------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_3_preprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | train = pd.read_csv("train_preprocessed.csv") 4 | labels = pd.read_csv("train_labels.csv", header = None) 5 | 6 | labels = list(labels.iloc[:,0]) 7 | 8 | train['Disbursed'] = labels 9 | 10 | train.to_csv("train_preprocessed_full.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_4_train_ftrl.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | from random import random 13 | import pickle 14 | 15 | # TL; DR, the main training process starts on line: 250, 16 | # you may want to start reading the code from there 17 | 18 | 19 | ############################################################################## 20 | # parameters ################################################################# 21 | ############################################################################## 22 | 23 | # A, paths 24 | train='train_preprocessed_full.csv' 25 | test='test_preprocessed.csv'#'vali_100.tsv' 26 | submission = 'ftrl_final.csv' # path of to be outputted submission file 27 | 28 | # B, model 29 | alpha = .05 # learning rate 30 | beta = 1. # smoothing parameter for adaptive learning rate 31 | L1 = 0. # L1 regularization, larger value means more regularized 32 | L2 = 1. # L2 regularization, larger value means more regularized 33 | 34 | # C, feature/hash trick 35 | D = 2 ** 24 # number of weights to use 36 | interaction = False # whether to enable poly2 feature interactions 37 | 38 | # D, training/validation 39 | epoch = 4 # learn training data for N passes 40 | holdafter = 9 # data after date N (exclusive) are used as validation 41 | holdout = 200 # use every N training instance for holdout validation 42 | 43 | 44 | ############################################################################## 45 | # class, function, generator definitions ##################################### 46 | ############################################################################## 47 | 48 | class ftrl_proximal(object): 49 | ''' Our main algorithm: Follow the regularized leader - proximal 50 | 51 | In short, 52 | this is an adaptive-learning-rate sparse logistic-regression with 53 | efficient L1-L2-regularization 54 | 55 | Reference: 56 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 57 | ''' 58 | 59 | def __init__(self, alpha, beta, L1, L2, D, interaction): 60 | # parameters 61 | self.alpha = alpha 62 | self.beta = beta 63 | self.L1 = L1 64 | self.L2 = L2 65 | 66 | # feature related parameters 67 | self.D = D 68 | self.interaction = interaction 69 | 70 | # model 71 | # n: squared sum of past gradients 72 | # z: weights 73 | # w: lazy weights 74 | self.n = [0.] 
* D 75 | self.z = [random() for k in range(D)]#[0.] * D 76 | self.w = {} 77 | 78 | def _indices(self, x): 79 | ''' A helper generator that yields the indices in x 80 | 81 | The purpose of this generator is to make the following 82 | code a bit cleaner when doing feature interaction. 83 | ''' 84 | 85 | # first yield index of the bias term 86 | yield 0 87 | 88 | # then yield the normal indices 89 | for index in x: 90 | yield index 91 | 92 | # now yield interactions (if applicable) 93 | if self.interaction: 94 | D = self.D 95 | L = len(x) 96 | 97 | x = sorted(x) 98 | for i in xrange(L): 99 | for j in xrange(i+1, L): 100 | # one-hot encode interactions with hash trick 101 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 102 | 103 | def predict(self, x): 104 | ''' Get probability estimation on x 105 | 106 | INPUT: 107 | x: features 108 | 109 | OUTPUT: 110 | probability of p(y = 1 | x; w) 111 | ''' 112 | 113 | # parameters 114 | alpha = self.alpha 115 | beta = self.beta 116 | L1 = self.L1 117 | L2 = self.L2 118 | 119 | # model 120 | n = self.n 121 | z = self.z 122 | w = {} 123 | 124 | # wTx is the inner product of w and x 125 | wTx = 0. 126 | for i in self._indices(x): 127 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 128 | 129 | # build w on the fly using z and n, hence the name - lazy weights 130 | # we are doing this at prediction instead of update time is because 131 | # this allows us for not storing the complete w 132 | if sign * z[i] <= L1: 133 | # w[i] vanishes due to L1 regularization 134 | w[i] = 0. 135 | else: 136 | # apply prediction time L1, L2 regularization to z and get w 137 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 138 | 139 | wTx += w[i] 140 | 141 | # cache the current w for update stage 142 | self.w = w 143 | 144 | # bounded sigmoid function, this is the probability estimation 145 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 146 | 147 | def update(self, x, p, y): 148 | ''' Update model using x, p, y 149 | 150 | INPUT: 151 | x: feature, a list of indices 152 | p: click probability prediction of our model 153 | y: answer 154 | 155 | MODIFIES: 156 | self.n: increase by squared gradient 157 | self.z: weights 158 | ''' 159 | 160 | # parameter 161 | alpha = self.alpha 162 | 163 | # model 164 | n = self.n 165 | z = self.z 166 | w = self.w 167 | 168 | # gradient under logloss 169 | g = p - y 170 | 171 | # update z and n 172 | for i in self._indices(x): 173 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 174 | z[i] += g - sigma * w[i] 175 | n[i] += g * g 176 | 177 | 178 | def logloss(p, y): 179 | ''' FUNCTION: Bounded logloss 180 | 181 | INPUT: 182 | p: our prediction 183 | y: real answer 184 | 185 | OUTPUT: 186 | logarithmic loss of p given y 187 | ''' 188 | 189 | p = max(min(p, 1. - 10e-15), 10e-15) 190 | return -log(p) if y == 1. else -log(1. 
- p) 191 | 192 | 193 | def data(path, D): 194 | ''' GENERATOR: Apply hash-trick to the original csv row 195 | and for simplicity, we one-hot-encode everything 196 | 197 | INPUT: 198 | path: path to training or testing file 199 | D: the max index that we can hash to 200 | 201 | YIELDS: 202 | ID: id of the instance, mainly useless 203 | x: a list of hashed and one-hot-encoded 'indices' 204 | we only need the index since all values are either 0 or 1 205 | y: y = 1 if we have a click, else we have y = 0 206 | ''' 207 | 208 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 209 | 210 | try: 211 | ID= row['ID'] 212 | del row['ID'] 213 | except: 214 | ID = 0 215 | pass 216 | 217 | # process target. 218 | y = 0. 219 | target='Disbursed' 220 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 221 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 222 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 223 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 224 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 225 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 226 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 227 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 228 | row['I8'] = str(row['Var5']) + str(row['Var4']) 229 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 230 | #lcd_weekofyear 231 | 232 | 233 | if target in row: 234 | if row[target] == '1': 235 | y = 1. 236 | del row[target] 237 | 238 | # extract date 239 | 240 | # turn hour really into hour, it was originally YYMMDDHH 241 | 242 | 243 | # build x 244 | x = [] 245 | for key in row: 246 | value = row[key] 247 | 248 | # one-hot encode everything with hash trick 249 | index = abs(hash(key + '_' + value)) % D 250 | x.append(index) 251 | 252 | yield t, ID, x, y 253 | 254 | 255 | ############################################################################## 256 | # start training ############################################################# 257 | ############################################################################## 258 | 259 | start = datetime.now() 260 | print("started at: %s" % datetime.now()) 261 | 262 | # initialize ourselves a learner 263 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 264 | 265 | # start training 266 | for e in range(epoch): 267 | loss = 0. 268 | count = 0 269 | for t, ID, x, y in data(train, D): # data is a generator 270 | 271 | p = learner.predict(x) 272 | 273 | # if (holdout and t % holdout == 0): 274 | # # # Estimate progressive validation loss 275 | # loss += logloss(p, y) 276 | # count += 1 277 | # else: 278 | # # # Use other samples to train the model 279 | # learner.update(x, p, y) 280 | 281 | learner.update(x, p, y) 282 | # if t % 1000000 == 0: 283 | # continue 284 | 285 | #print('epoch: %s\tval. 
logloss: %0.5f\telapsed time: %s' % (e + 1, loss/count, str(datetime.now() - start))) 286 | 287 | #import pickle 288 | #pickle.dump(learner,open('ftrl3.p','w')) 289 | 290 | ############################################################################## 291 | # start testing, and build Kaggle's submission file ########################## 292 | ############################################################################## 293 | print ('creating submission file') 294 | with open(submission, 'w') as outfile: 295 | outfile.write('ID,Disbursed\n') 296 | for t, ID, x, y in data(test, D): 297 | p = learner.predict(x) 298 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_5_postprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | test = pd.read_csv("test.csv", usecols = ["ID"]) 4 | 5 | preds = pd.read_csv("ftrl_final.csv") 6 | preds['ID'] = test['ID'] 7 | preds.to_csv("ftrl_final2.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_6_ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | 5 | xgb_pred = pd.read_csv("xgb_final.csv") #XGB 6 | ftrl_pred = pd.read_csv("ftrl_final.csv") #FTRL 7 | 8 | ens = xgb_pred.copy() 9 | ens.rename(columns={'Disbursed':'XGB'}, inplace = True) 10 | ens['FTRL'] = ftrl_pred['Disbursed'] 11 | 12 | ens['XGB_Rank'] = rankdata(ens['XGB'], method='min') 13 | ens['FTRL_Rank'] = rankdata(ens['FTRL'], method='min') 14 | ens['Final'] = 0.8*ens['XGB_Rank'] + 0.2*ens['FTRL_Rank'] 15 | 16 | ens = ens[['ID', 'Final']] 17 | ens.rename(columns={'Final':'Disbursed'}, inplace = True) 18 | ens.sort_index(inplace = True) 19 | ens.head() 20 | 21 | ens.to_csv("weekend_solution.csv", index = False) # 0.86116 public LB -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/av_script.sh: -------------------------------------------------------------------------------- 1 | python _1_preprocessing.py 2 | python _2_train_xgb.py 3 | python _3_preprocessing_ftrl.py 4 | pypy _4_train_ftrl.py 5 | python _5_postprocessing_ftrl.py 6 | python _6_ensemble.py -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/av_final.sh: -------------------------------------------------------------------------------- 1 | mkdir temp_data 2 | mkdir temp_submission 3 | 4 | python preprocessing.py 5 | python preprocessing2.py 6 | 7 | echo "====> Lets train 5 XGBs for same type of configuration and average for seed stability and control overfitting" 8 | python train_xgb.py 9 | python train_xgb2.py 10 | python train_xgb3.py 11 | python train_xgb4.py 12 | python train_xgb5.py 13 | python postprocessing_XGB_1.py 14 | 15 | echo "====> Train one more 5-set XGB with slightly different feature set(resulted in higher CV). Rank average." 
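# Note: postprocessing_XGB_2.py further down rank-averages five prediction files
# (temp_submission/Sub251.csv ... Sub255.csv, presumably written by the train_2xgb*.py runs below)
# into temp_submission/XGB2_Ens.csv with equal weights.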
16 | python train_2xgb1.py 17 | python train_2xgb2.py 18 | python train_2xgb3.py 19 | python train_2xgb4.py 20 | python train_2xgb5.py 21 | python postprocessing_XGB_2.py 22 | 23 | python preprocessing_ftrl.py 24 | 25 | echo "====> Shuffle the input data to train linear models with FTRL (Logistic Regression)" 26 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train1.csv 1 100000 1234 27 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train2.csv 1 100000 3456 28 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train3.csv 1 100000 6789 29 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train4.csv 1 100000 6543 30 | 31 | echo "====> Train them in an online manner" 32 | pypy script_ftrl.py 33 | pypy script_ftrl2.py 34 | pypy script_ftrl3.py 35 | pypy script_ftrl4.py 36 | pypy script_ftrl5.py 37 | 38 | echo "====> Rank average linear models for stability" 39 | python postprocessing_ftrl.py 40 | 41 | echo "Let's train Random Forests on original data without city and employer name features" 42 | python train_rf.py 43 | 44 | python postprocessing_rf.py 45 | 46 | echo "Final Rank ensemble!" 47 | python ensemble_rank_final.py -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/ensemble_rank_final.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | 5 | xgb1_pred = pd.read_csv("temp_submission/XGB1_Ens.csv") #XGB 6 | xgb2_pred = pd.read_csv("temp_submission/XGB2_Ens.csv") #XGB 7 | rf_pred = pd.read_csv("temp_submission/RF_Ens.csv") #RF 8 | ftrl_pred = pd.read_csv("temp_submission/FTRL_Ens.csv") # FTRL 9 | 10 | ens = xgb1_pred.copy() 11 | ens.rename(columns={'Disbursed':'XGB1'}, inplace = True) 12 | ens['XGB2'] = xgb2_pred['Disbursed'] 13 | 14 | ens['RF'] = rf_pred['Disbursed'] 15 | ens['FTRL'] = ftrl_pred['Disbursed'] 16 | 17 | 18 | ens['XGB1_Rank'] = rankdata(ens['XGB1'], method='min') 19 | ens['XGB2_Rank'] = rankdata(ens['XGB2'], method='min') 20 | 21 | ens['XGB_Rank'] = 0.5 * ens['XGB1_Rank'] + 0.5 * ens['XGB2_Rank'] 22 | ens['RF_Rank'] = rankdata(ens['RF'], method='min') 23 | ens['FTRL_Rank'] = rankdata(ens['FTRL'], method='min') 24 | 25 | ens['Final'] = (0.75*ens['XGB_Rank'] + 0.25*ens['RF_Rank']) * 0.75 + 0.25 * ens['FTRL'] 26 | 27 | ens = ens[['ID', 'Final']] 28 | ens.rename(columns={'Final':'Disbursed'}, inplace = True) 29 | ens.sort_index(inplace = True) 30 | ens.head() 31 | 32 | ens.to_csv("FinalSolution.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_RF.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub151.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub152.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub153.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub154.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub155.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], 
method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/RF_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_XGB_1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub241.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub242.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub243.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub244.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub245.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/XGB1_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_XGB_2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub251.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub252.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub253.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub254.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub255.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/XGB2_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub701.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub702.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub703.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub704.csv") 
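# Sub701.csv ... Sub705.csv are the FTRL submissions (written by the script_ftrl*.py runs);
# their ordinal ranks are averaged below with equal 0.2 weights into FTRL_Ens.csv.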
12 | preds5 = pd.read_csv("temp_submission/Sub705.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/FTRL_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 7 | import xgboost as xgb 8 | from sklearn.cross_validation import cross_val_score, cross_val_predict 9 | 10 | 11 | train = pd.read_csv("Train.csv") 12 | test = pd.read_csv("Test.csv") 13 | submission = pd.read_csv("sample_submission.csv") 14 | print "Train dataset dimensions:", train.shape 15 | print "Test dataset dimensions:", test.shape 16 | 17 | salary_acc = train.Salary_Account.value_counts(dropna=False) 18 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 19 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 20 | 21 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 22 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 23 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 24 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 25 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 26 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 27 | 28 | train['Lifetime'] = pd.to_datetime(train['Lead_Creation_Date']) - pd.to_datetime(train['DOB']) 29 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 30 | 31 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 32 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 33 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 34 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 35 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 36 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 37 | 38 | city = pd.DataFrame(train['City'].value_counts()) 39 | city_rare = list(city[city[0] < 100].index) 40 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 41 | train['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 42 | train['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 43 | 44 | from sklearn.preprocessing import LabelEncoder 45 | le = LabelEncoder() 46 | train['City_encoded'] = le.fit_transform(train['City']) 47 | 48 | empnames = pd.DataFrame(train['Employer_Name'].value_counts()) 49 | empnames_rare = list(empnames[empnames[0]<30].index) 50 | train.ix[train['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 51 | 52 | # # Preprocessing 53 | train2 = train.copy() 54 | 55 | id_train = train['ID'] 56 | label = 
train2['Disbursed'] 57 | 58 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB']#, 'Lead_Creation_Date'] 59 | train2.drop(dropCols, axis=1, inplace = True) 60 | 61 | y_train = label 62 | X_train = pd.get_dummies(train2) 63 | 64 | # # Test set preparation 65 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 66 | 67 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 68 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 69 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 70 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 71 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 72 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 73 | 74 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 75 | test['dob_dayofweek'] = pd.to_datetime(test['DOB']).dt.dayofweek 76 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 77 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 78 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 79 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 80 | 81 | test['Lifetime'] = pd.to_datetime(test['Lead_Creation_Date']) - pd.to_datetime(test['DOB']) 82 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 83 | 84 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 85 | newcities = list(set(test['City']) - set(train['City'])) 86 | test.ix[test['City'].isin(newcities), 'City'] = np.nan 87 | test['City_encoded'] = le.transform(test['City']) 88 | 89 | test['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 90 | test['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 91 | 92 | test.ix[test['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 93 | 94 | newempnames = list(set(test['Employer_Name']) - set(train['Employer_Name'])) 95 | test.ix[test['Employer_Name'].isin(newempnames), "Employer_Name"] = "Others" 96 | 97 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 98 | test2 = test.drop(testdropcols, axis=1) 99 | 100 | X_test = pd.get_dummies(test2) 101 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 102 | for col in missingCols: 103 | X_test[col] = 0 104 | X_test = X_test[X_train.columns] 105 | assert X_train.columns.equals(X_test.columns) 106 | 107 | X_train.to_csv("temp_data/train_preprocessed.csv", index = False) 108 | X_test.to_csv("temp_data/test_preprocessed.csv", index = False) 109 | y_train.to_csv("temp_data/train_labels.csv", index = False) 110 | test['ID'].to_csv("temp_data/test_ids.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 7 | import xgboost as xgb 8 | from sklearn.cross_validation import cross_val_score, cross_val_predict 9 | 10 | train = pd.read_csv("Train.csv") 11 | test = pd.read_csv("Test.csv") 12 | submission = pd.read_csv("sample_submission.csv") 13 | print "Train dataset dimensions:", train.shape 14 | print "Test dataset dimensions:", test.shape 15 | 16 | salary_acc = 
train.Salary_Account.value_counts(dropna=False) 17 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 18 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 19 | 20 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 21 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 22 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 23 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 24 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 25 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 26 | 27 | train['Lifetime'] = pd.to_datetime("2015-10-01") - pd.to_datetime(train['DOB']) 28 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 29 | 30 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 31 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 32 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 33 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 34 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 35 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 36 | 37 | city = pd.DataFrame(train['City'].value_counts()) 38 | city_rare = list(city[city[0] < 100].index) 39 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 40 | 41 | train.ix[pd.isnull(train['City']), 'City'] = "-3.14" 42 | 43 | from sklearn.preprocessing import LabelEncoder 44 | le = LabelEncoder() 45 | train['City_encoded'] = le.fit_transform(train['City']) 46 | 47 | # # Preprocessing 48 | train2 = train.copy() 49 | 50 | id_train = train['ID'] 51 | label = train2['Disbursed'] 52 | 53 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'Employer_Name', 'DOB', 'Lead_Creation_Date'] 54 | train2.drop(dropCols, axis=1, inplace = True) 55 | 56 | y_train = label 57 | X_train = pd.get_dummies(train2) 58 | 59 | 60 | # # Test set preparation 61 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 62 | 63 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 64 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 65 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 66 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 67 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 68 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 69 | 70 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 71 | test['dob_dayofweek'] = pd.to_datetime(test['DOB']).dt.dayofweek 72 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 73 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 74 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 75 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 76 | 77 | test['Lifetime'] = pd.to_datetime("2015-10-01") - pd.to_datetime(test['DOB']) 78 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 79 | 80 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 81 | newcities = list(set(test['City']) - set(train['City'])) 82 | test.ix[test['City'].isin(newcities), 'City'] = "-3.14" 83 | test['City_encoded'] = le.transform(test['City']) 84 | 85 | test.ix[pd.isnull(test['City']), 'City'] = "-3.14" 86 | 87 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 88 | test2 = test.drop(testdropcols, axis=1) 89 | 90 
| X_test = pd.get_dummies(test2) 91 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 92 | for col in missingCols: 93 | X_test[col] = 0 94 | X_test = X_test[X_train.columns] 95 | assert X_train.columns.equals(X_test.columns) 96 | 97 | X_train.to_csv("temp_data/train_preprocessed2.csv", index = False) 98 | X_test.to_csv("temp_data/test_preprocessed2.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | train = pd.read_csv("temp_data/train_preprocessed.csv") 4 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 5 | 6 | labels = list(labels.iloc[:,0]) 7 | 8 | train['Disbursed'] = labels 9 | 10 | train.to_csv("temp_data/train_preprocessed_full.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/train_preprocessed_full.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub701.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
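        # The I2..I8 fields built just below are hand-crafted two-way interactions: two raw column
        # values are concatenated as strings and hashed as a single feature, which lets the linear
        # FTRL model pick up pairwise effects without enabling the generic `interaction` flag.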
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl2.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train1.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub702.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl3.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train2.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub703.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl4.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | 79 | train='temp_data/shuffled_train3.csv' 80 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 81 | submission = 'temp_submission/Sub704.csv' # path of to be outputted submission file 82 | 83 | # B, model 84 | alpha = .05 # learning rate 85 | beta = 1. # smoothing parameter for adaptive learning rate 86 | L1 = 0. # L1 regularization, larger value means more regularized 87 | L2 = 1. # L2 regularization, larger value means more regularized 88 | 89 | # C, feature/hash trick 90 | D = 2 ** 24 # number of weights to use 91 | interaction = False # whether to enable poly2 feature interactions 92 | 93 | # D, training/validation 94 | epoch = 4 # learn training data for N passes 95 | holdafter = 9 # data after date N (exclusive) are used as validation 96 | holdout = 200 # use every N training instance for holdout validation 97 | 98 | 99 | ############################################################################## 100 | # class, function, generator definitions ##################################### 101 | ############################################################################## 102 | 103 | class ftrl_proximal(object): 104 | ''' Our main algorithm: Follow the regularized leader - proximal 105 | 106 | In short, 107 | this is an adaptive-learning-rate sparse logistic-regression with 108 | efficient L1-L2-regularization 109 | 110 | Reference: 111 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 112 | ''' 113 | 114 | def __init__(self, alpha, beta, L1, L2, D, interaction): 115 | # parameters 116 | self.alpha = alpha 117 | self.beta = beta 118 | self.L1 = L1 119 | self.L2 = L2 120 | 121 | # feature related parameters 122 | self.D = D 123 | self.interaction = interaction 124 | 125 | # model 126 | # n: squared sum of past gradients 127 | # z: weights 128 | # w: lazy weights 129 | self.n = [0.] * D 130 | self.z = [random.random() for k in range(D)]#[0.] * D 131 | self.w = {} 132 | 133 | def _indices(self, x): 134 | ''' A helper generator that yields the indices in x 135 | 136 | The purpose of this generator is to make the following 137 | code a bit cleaner when doing feature interaction. 
138 | ''' 139 | 140 | # first yield index of the bias term 141 | yield 0 142 | 143 | # then yield the normal indices 144 | for index in x: 145 | yield index 146 | 147 | # now yield interactions (if applicable) 148 | if self.interaction: 149 | D = self.D 150 | L = len(x) 151 | 152 | x = sorted(x) 153 | for i in xrange(L): 154 | for j in xrange(i+1, L): 155 | # one-hot encode interactions with hash trick 156 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 157 | 158 | def predict(self, x): 159 | ''' Get probability estimation on x 160 | 161 | INPUT: 162 | x: features 163 | 164 | OUTPUT: 165 | probability of p(y = 1 | x; w) 166 | ''' 167 | 168 | # parameters 169 | alpha = self.alpha 170 | beta = self.beta 171 | L1 = self.L1 172 | L2 = self.L2 173 | 174 | # model 175 | n = self.n 176 | z = self.z 177 | w = {} 178 | 179 | # wTx is the inner product of w and x 180 | wTx = 0. 181 | for i in self._indices(x): 182 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 183 | 184 | # build w on the fly using z and n, hence the name - lazy weights 185 | # we are doing this at prediction instead of update time is because 186 | # this allows us for not storing the complete w 187 | if sign * z[i] <= L1: 188 | # w[i] vanishes due to L1 regularization 189 | w[i] = 0. 190 | else: 191 | # apply prediction time L1, L2 regularization to z and get w 192 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 193 | 194 | wTx += w[i] 195 | 196 | # cache the current w for update stage 197 | self.w = w 198 | 199 | # bounded sigmoid function, this is the probability estimation 200 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 201 | 202 | def update(self, x, p, y): 203 | ''' Update model using x, p, y 204 | 205 | INPUT: 206 | x: feature, a list of indices 207 | p: click probability prediction of our model 208 | y: answer 209 | 210 | MODIFIES: 211 | self.n: increase by squared gradient 212 | self.z: weights 213 | ''' 214 | 215 | # parameter 216 | alpha = self.alpha 217 | 218 | # model 219 | n = self.n 220 | z = self.z 221 | w = self.w 222 | 223 | # gradient under logloss 224 | g = p - y 225 | 226 | # update z and n 227 | for i in self._indices(x): 228 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 229 | z[i] += g - sigma * w[i] 230 | n[i] += g * g 231 | 232 | 233 | def logloss(p, y): 234 | ''' FUNCTION: Bounded logloss 235 | 236 | INPUT: 237 | p: our prediction 238 | y: real answer 239 | 240 | OUTPUT: 241 | logarithmic loss of p given y 242 | ''' 243 | 244 | p = max(min(p, 1. - 10e-15), 10e-15) 245 | return -log(p) if y == 1. else -log(1. - p) 246 | 247 | 248 | def data(path, D): 249 | ''' GENERATOR: Apply hash-trick to the original csv row 250 | and for simplicity, we one-hot-encode everything 251 | 252 | INPUT: 253 | path: path to training or testing file 254 | D: the max index that we can hash to 255 | 256 | YIELDS: 257 | ID: id of the instance, mainly useless 258 | x: a list of hashed and one-hot-encoded 'indices' 259 | we only need the index since all values are either 0 or 1 260 | y: y = 1 if we have a click, else we have y = 0 261 | ''' 262 | 263 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 264 | 265 | try: 266 | ID= row['ID'] 267 | del row['ID'] 268 | except: 269 | ID = 0 270 | pass 271 | 272 | # process target. 273 | y = 0. 
274 | target='Disbursed' 275 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 276 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 277 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 278 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 279 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 280 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 281 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 282 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 283 | row['I8'] = str(row['Var5']) + str(row['Var4']) 284 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 285 | #lcd_weekofyear 286 | 287 | 288 | if target in row: 289 | if row[target] == '1': 290 | y = 1. 291 | del row[target] 292 | 293 | # extract date 294 | 295 | # turn hour really into hour, it was originally YYMMDDHH 296 | 297 | 298 | # build x 299 | x = [] 300 | for key in row: 301 | value = row[key] 302 | 303 | # one-hot encode everything with hash trick 304 | index = abs(hash(key + '_' + value)) % D 305 | x.append(index) 306 | 307 | yield t, ID, x, y 308 | 309 | 310 | ############################################################################## 311 | # start training ############################################################# 312 | ############################################################################## 313 | 314 | start = datetime.now() 315 | #print("started at: %s" % datetime.now()) 316 | 317 | # initialize ourselves a learner 318 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 319 | 320 | # start training 321 | for e in range(epoch): 322 | random.seed(1234) 323 | loss = 0. 324 | count = 0 325 | predlist=[] 326 | targetlist=[] 327 | for t, ID, x, y in data(train, D): # data is a generator 328 | 329 | p = learner.predict(x) 330 | 331 | # if random.random() < 0.3: 332 | # # Estimate progressive validation loss 333 | # loss += logloss(p, y) 334 | # count += 1 335 | # predlist.append(p) 336 | # targetlist.append(y) 337 | # else: 338 | # # Use other samples to train the model 339 | # learner.update(x, p, y) 340 | 341 | learner.update(x, p, y) 342 | # if t % 1000000 == 0: 343 | # continue 344 | 345 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 346 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 347 | 348 | #import pickle 349 | #pickle.dump(learner,open('ftrl3.p','w')) 350 | 351 | ############################################################################## 352 | # start testing, and build Kaggle's submission file ########################## 353 | ############################################################################## 354 | #print ('creating submission file') 355 | with open(submission, 'w') as outfile: 356 | outfile.write('ID,Disbursed\n') 357 | for t, ID, x, y in data(test, D): 358 | p = learner.predict(x) 359 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl5.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train4.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub705.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/shuffle.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shuffle lines in a [big] file 3 | Usage: shuffle.py <input file> <output file> [<preserve headers>] [<lines in memory>] [<random seed>] 4 | """ 5 | 6 | import sys 7 | import random 8 | 9 | input_file = sys.argv[1] 10 | output_file = sys.argv[2] 11 | 12 | try: 13 | preserve_headers = int( sys.argv[3] ) 14 | except IndexError: 15 | preserve_headers = 0 16 | 17 | try: 18 | lines_in_memory = int( sys.argv[4] ) 19 | except IndexError: 20 | lines_in_memory = 25000 21 | 22 | print "caching %s lines at a time..." % ( lines_in_memory ) 23 | 24 | try: 25 | random_seed = sys.argv[5] 26 | random.seed( random_seed ) 27 | print "random seed: %s" % ( random_seed ) 28 | except IndexError: 29 | pass 30 | 31 | # first count 32 | 33 | print "counting lines..." 34 | 35 | i_f = open( input_file ) 36 | o_f = open( output_file, 'wb' ) 37 | 38 | if preserve_headers: 39 | headers = i_f.readline() 40 | o_f.write( headers ) 41 | 42 | counter = 0 43 | for line in i_f: 44 | counter += 1 45 | 46 | if counter % 100000 == 0: 47 | print counter 48 | 49 | print counter 50 | 51 | print "shuffling..." 52 | 53 | order = range( counter ) 54 | random.shuffle( order ) 55 | 56 | epoch = 0 57 | 58 | while order: 59 | 60 | current_lines = {} 61 | current_lines_count = 0 62 | 63 | current_chunk = order[:lines_in_memory] 64 | current_chunk_dict = { x: 1 for x in current_chunk } # faster "in" 65 | current_chunk_length = len( current_chunk ) 66 | 67 | order = order[lines_in_memory:] 68 | 69 | i_f.seek( 0 ) 70 | if preserve_headers: 71 | i_f.readline() 72 | 73 | count = 0 74 | 75 | for line in i_f: 76 | if count in current_chunk_dict: 77 | current_lines[count] = line 78 | current_lines_count += 1 79 | if current_lines_count == current_chunk_length: 80 | break 81 | count += 1 82 | if count % 100000 == 0: 83 | print count 84 | 85 | print "writing..."
86 | 87 | for l in current_chunk: 88 | o_f.write( current_lines[l] ) 89 | 90 | lines_saved = current_chunk_length + epoch * lines_in_memory 91 | epoch += 1 92 | print "pass %s complete (%s lines saved)" % ( epoch, lines_saved ) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 800 21 | 22 | params['seed'] = 523264626346 # 0.85533 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | #exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub251.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 64378683511 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub252.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb3.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 132323786373 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub253.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 548563448943 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub254.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 
| labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 14357846377 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub255.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_rf.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.cross_validation import cross_val_score, cross_val_predict 7 | 8 | train = pd.read_csv("Train.csv") 9 | test = pd.read_csv("Test.csv") 10 | submission = pd.read_csv("sample_submission.csv") 11 | 12 | salary_acc = train.Salary_Account.value_counts(dropna=False) 13 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 14 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 15 | 16 | train2 = train.copy()#[~pd.isnull(train['Loan_Amount_Applied'])] 17 | 18 | id_train = train['ID'] 19 | label = train2['Disbursed'] 20 | 21 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB', 'Lead_Creation_Date', 'City', 'Employer_Name'] 22 | train2.drop(dropCols, axis=1, inplace = True) 23 | 24 | y_train = label 25 | X_train = pd.get_dummies(train2) 26 | 27 | # # Test set preparation 28 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 29 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 30 | test2 = test.drop(testdropcols, axis=1) 31 | 32 | X_test = pd.get_dummies(test2) 33 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 34 | for col in missingCols: 35 | X_test[col] = 0 36 | X_test = X_test[X_train.columns] 37 | assert X_train.columns.equals(X_test.columns) 38 | 39 | # # Modeling 40 | X_train_2 = X_train.fillna(-999) 41 | X_test_2 = X_test.fillna(-999) 42 | 43 | # from sklearn.cross_validation import KFold 44 | # kf = KFold(len(X_train_2), n_folds=4) 45 | # scores = cross_val_score(clf, X_train_2, y_train, scoring='roc_auc', cv=kf) 46 | # print "CV:", np.mean(scores), "+/-", np.std(scores), "All:", scores 47 | # CV: 0.831889207925 +/- 0.0109754348042 All: [ 0.82381549 0.82907869 0.85055107 0.82411158] 48 | seeds = [31121421,53153,5245326,6536,75] 49 | numbers = [151,152,153,154,155] 50 | 51 | for i in range(5): 52 | clf = RandomForestClassifier(n_estimators=360, max_depth=9, criterion = 'entropy', min_samples_split=2, bootstrap = False, n_jobs=-1, random_state=seeds[i]) 53 | clf.fit(X_train_2, y_train) 54 | test_preds = clf.predict_proba(X_test_2)[:,1] 55 | print("RF %s done" % i) 56 | 57 | submission = pd.DataFrame({'ID':test['ID'], 'Disbursed':test_preds}) 58 | submission = submission[['ID', 'Disbursed']] 59 | submission.to_csv("temp_submission/Sub%s.csv" % str(numbers[i]), index = False) 
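Note: a minimal sketch of how the five seed-bagged random-forest submissions written above (temp_submission/Sub151.csv to Sub155.csv) could be blended with a percentile-rank average. Since the metric is AUC, only the ordering of the scores matters, so averaging ranks is a common, scale-free way to combine seed variants. The function name rank_average and the output path are illustrative assumptions only, not the repository's actual ensembling step, which may differ.

import pandas as pd

def rank_average(paths, id_col='ID', pred_col='Disbursed'):
    # Convert each submission's scores to percentile ranks, then average the ranks.
    # AUC depends only on prediction order, so this ignores differences in score scale.
    merged = None
    for i, path in enumerate(paths):
        sub = pd.read_csv(path)[[id_col, pred_col]]
        sub[pred_col] = sub[pred_col].rank(pct=True)
        sub = sub.rename(columns={pred_col: 'rank_%d' % i})
        merged = sub if merged is None else merged.merge(sub, on=id_col)
    merged[pred_col] = merged.drop(id_col, axis=1).mean(axis=1)
    return merged[[id_col, pred_col]]

# Hypothetical usage with the outputs of train_rf.py above:
# blend = rank_average(['temp_submission/Sub%d.csv' % n for n in range(151, 156)])
# blend.to_csv('temp_submission/rf_rank_blend.csv', index=False)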
-------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 523264626346 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub241.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 64378683511 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub242.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = 
pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 132323786373 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub243.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 548563448943 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub244.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 
1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 14357846377 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub245.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/XGB Tuning guide.md: -------------------------------------------------------------------------------- 1 | Tuning and CV strategy for XGB: 2 | ============================== 3 | 4 | Typically, people use 5 folds; you can make your own choice. To check the reliability of the CV estimate, some people use 10-fold as well. 5 | 6 | Steps: 7 | ----- 8 | 1. Decide 'n' in n-fold. Stick to it for the complete analysis. 9 | 2. Create a baseline score using a simple model. 10 | 3. Now use the XGBoost default settings and establish another XGB baseline score. 11 | 4. Set the number of boosting rounds to 10000 and use a tiny learning rate (eta) of 0.01. 12 | 5. Try step (4) for various values of max_depth. 13 | 6. While doing step (4), monitor the progress and note at what tree number the model starts overfitting. 14 | 7. After you're done with steps 1-6, you will have reached a saturation score. 15 | 8. Now comes some magic! Start using subsample and, tada, your score improves. 16 | 9. Use colsample_bytree, then scale_pos_weight, to improve your score further. 17 | 10. Try using max_delta_step and gamma too (both a little tricky to tune). 18 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/requirements.md: -------------------------------------------------------------------------------- 1 | Requirements: 2 | ============= 3 | 4 | - Python 5 | - Pandas, NumPy, SciPy, scikit-learn (latest versions) 6 | - XGBoost - https://github.com/dmlc/xgboost 7 | - PyPy (to run the FTRL code faster) 8 | -------------------------------------------------------------------------------- /D-hack/Code.py: -------------------------------------------------------------------------------- 1 | #Code for D-Hack Weeklong version 2 | 3 | #importing libraries 4 | from sklearn.ensemble import RandomForestClassifier 5 | #from sklearn.ensemble import AdaBoostClassifier 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn.metrics import confusion_matrix 8 | import xgboost as xgb 9 | import pandas as pd 10 | import numpy as np 11 | from sklearn import preprocessing 12 | 13 | from sklearn import ensemble 14 | import random 15 | 16 | #Importing i/p files 17 | train=pd.read_csv('E:/DS/DHack/train_FBFog7d.csv') 18 | test=pd.read_csv('E:/DS/DHack/Test_L4P23N3.csv') 19 | train.head() 20 | 21 | #Pre-processing 22 | def convert(data): 23 | number = preprocessing.LabelEncoder() 24 | data['Var1'] = number.fit_transform(data.Var1) 25 | data['WorkStatus'] = number.fit_transform(data.WorkStatus) 26 | data['Divorce'] = number.fit_transform(data.Divorce) 27 | data['Widowed'] = number.fit_transform(data.Widowed) 28 | data['Education'] = number.fit_transform(data.Education) 29 | data['Residence_Region'] = number.fit_transform(data.Residence_Region) 30 | data['babies'] = number.fit_transform(data.babies) 31 | data['preteen'] = number.fit_transform(data.preteen)
32 | data['teens'] = number.fit_transform(data.teens) 33 | data['income'] = number.fit_transform(data.income) 34 | data['Engagement_Religion'] = number.fit_transform(data.Engagement_Religion) 35 | data['Var2'] = number.fit_transform(data.Var2) 36 | data['TVhours'] = number.fit_transform(data.TVhours) 37 | data['Gender'] = number.fit_transform(data.Gender) 38 | data['Unemployed10'] = number.fit_transform(data.Unemployed10) 39 | data['Alcohol_Consumption'] = number.fit_transform(data.Alcohol_Consumption) 40 | data=data.fillna(-999) 41 | return data 42 | new = train.append(test) 43 | new = convert(new) 44 | train = new[0:10357] 45 | test = new[10357:] 46 | 47 | #Features 48 | Columns_names = train.columns.values 49 | features = Columns_names[0:np.size(Columns_names)] 50 | features = np.delete(features,[5,6]) 51 | features 52 | 53 | #Creating Data set for training 54 | x_train = train[list(features)].values 55 | y_train = train['Happy'].values 56 | x_test=test[features].values 57 | 58 | 59 | ############## RF Models ############ 60 | #0 Rf_model - 900 61 | rf = ensemble.RandomForestClassifier(n_estimators=900,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 62 | rf.fit(x_train, y_train) 63 | Happy = rf.predict(x_test) 64 | test['Happy_Rf_900']=Happy[:] 65 | 66 | #1 Rf_model - 850 67 | rf = ensemble.RandomForestClassifier(n_estimators=850,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 68 | rf.fit(x_train, y_train) 69 | Happy = rf.predict(x_test) 70 | test['Happy_Rf_850']=Happy[:] 71 | 72 | #2 Rf_model - 800 73 | rf = ensemble.RandomForestClassifier(n_estimators=800,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 74 | rf.fit(x_train, y_train) 75 | Happy = rf.predict(x_test) 76 | test['Happy_Rf_800']=Happy[:] 77 | 78 | #3 Rf_model - 750 79 | rf = ensemble.RandomForestClassifier(n_estimators=750,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 80 | rf.fit(x_train, y_train) 81 | Happy = rf.predict(x_test) 82 | test['Happy_Rf_750']=Happy[:] 83 | 84 | # Mapping function for XGB predictions 85 | def happy_to_scores2(x): 86 | if x == 2: 87 | return 'Very Happy' 88 | elif x == 1: 89 | return 'Pretty Happy' 90 | elif x == 0: 91 | return 'Not Happy' 92 | 93 | ########### XG boost Models ############# 94 | number = preprocessing.LabelEncoder() #label encoder for the target variable 95 | xgtrain = xgb.DMatrix(x_train,label=number.fit_transform(y_train),missing=-999) 96 | xgtest = xgb.DMatrix(x_test,missing=-999) 97 | 98 | # Defining parameters 99 | params = {} 100 | params["objective"] = "multi:softmax" 101 | params["num_class"] = 3 102 | params["eta"] = 0.01 103 | params["min_child_weight"] = 15 104 | params["subsample"] = 0.7 105 | params["colsample_bytree"] = 0.7 106 | params["max_depth"] = 6 107 | params["seed"] = 0 108 | 109 | plst = list(params.items()) 110 | 111 | #4 XGB model : num_round - 390 112 | num_rounds = 390 113 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 114 | label = pd.DataFrame(model_xgb.predict(xgtest)) 115 | label = label[0].apply(lambda x: happy_to_scores2(x)) 116 | test['Happy_XGB_390']=label[:] 117 | 118 | #5 XGB model : num_round - 340 119 | num_rounds = 340 120 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 121 | label = pd.DataFrame(model_xgb.predict(xgtest)) 122 | label = label[0].apply(lambda x: happy_to_scores2(x)) 123 | test['Happy_XGB_340']=label[:] 124 | 125 | #6 XGB model : num_round - 290 126 | num_rounds = 290
127 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 128 | label = pd.DataFrame(model_xgb.predict(xgtest)) 129 | label = label[0].apply(lambda x: happy_to_scores2(x)) 130 | test['Happy_XGB_290']=label[:] 131 | 132 | #7 XGB model : num_round - 240 133 | num_rounds = 240 134 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 135 | label = pd.DataFrame(model_xgb.predict(xgtest)) 136 | label = label[0].apply(lambda x: happy_to_scores2(x)) 137 | test['Happy_XGB_240']=label[:] 138 | 139 | #8 XGB model : num_round - 190 140 | num_rounds = 190 141 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 142 | label = pd.DataFrame(model_xgb.predict(xgtest)) 143 | label = label[0].apply(lambda x: happy_to_scores2(x)) 144 | test['Happy_XGB_190']=label[:] 145 | 146 | #9 XGB model : num_round - 140 147 | num_rounds = 140 148 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 149 | label = pd.DataFrame(model_xgb.predict(xgtest)) 150 | label = label[0].apply(lambda x: happy_to_scores2(x)) 151 | test['Happy_XGB_140']=label[:] 152 | 153 | #10 XGB model : num_round - 90 154 | num_rounds = 90 155 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 156 | label = pd.DataFrame(model_xgb.predict(xgtest)) 157 | label = label[0].apply(lambda x: happy_to_scores2(x)) 158 | test['Happy_XGB_90']=label[:] 159 | 160 | ########### Gradient Boosting Models ################## 161 | 162 | #11 GB model - 1100 163 | clf = GradientBoostingClassifier(n_estimators=1100, learning_rate=0.01) 164 | clf.fit(x_train, y_train) 165 | Happy = clf.predict(x_test) 166 | test['Happy_GB_1100']=Happy[:] 167 | 168 | #12 GB model - 1200 169 | clf = GradientBoostingClassifier(n_estimators=1200, learning_rate=0.01) 170 | clf.fit(x_train, y_train) 171 | Happy = clf.predict(x_test) 172 | test['Happy_GB_1200']=Happy[:] 173 | 174 | #13 GB model - 1300 175 | clf = GradientBoostingClassifier(n_estimators=1300, learning_rate=0.01) 176 | clf.fit(x_train, y_train) 177 | Happy = clf.predict(x_test) 178 | test['Happy_GB_1300']=Happy[:] 179 | 180 | #14 GB model - 1400 181 | clf = GradientBoostingClassifier(n_estimators=1400, learning_rate=0.01) 182 | clf.fit(x_train, y_train) 183 | Happy = clf.predict(x_test) 184 | test['Happy_GB_1400']=Happy[:] 185 | 186 | Test_final = test[['ID','Happy_Rf_900','Happy_Rf_850','Happy_Rf_800','Happy_Rf_750','Happy_XGB_390','Happy_XGB_340','Happy_XGB_290','Happy_XGB_240','Happy_XGB_190','Happy_XGB_140','Happy_XGB_90','Happy_GB_1100','Happy_GB_1200','Happy_GB_1300','Happy_GB_1400']].copy() 187 | Test_final.to_csv('E:/DS/DHack/Solution_ensemble_15.csv',index=False) 188 | 189 | # After this did a maximum vote ensemble in excel, as I am not so good with Python :P Happy Hacking! 190 | -------------------------------------------------------------------------------- /D-hack/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon D Hack, 24th and 25th October, 2015 - Decode D Dalai Lama! 2 | 3 | http://datahack.analyticsvidhya.com/contest/the-d-hack 4 | 5 | ###### My approach for the hackathon is as follows: 6 | 7 | 1. Creating a data dictionary by understanding levels of data and gaps in the data 8 | 9 | 2. Converting all the categorical variables into 1/0 encoded variables 10 | 11 | 3. Treating missing values as a separate class by imputing them with -999 12 | 13 | 4. The evaluation metric used in the hackathon was very unconventional: it penalizes misclassification using several rules and cannot be directly optimized by any conventional machine learning algorithm. So, given the lack of time, the best bet was to make a robust model that doesn't deviate from the Public to the Private Leaderboard 14 | 15 |
5. Made 15 models: 4 - Random Forest, 7 - XGB, 4 - GB 16 | 17 | 6. Did a maximum vote ensemble for the final solution 18 | 19 | ###### Extras: 20 | 21 | 1. My single model was giving me a 0.71 score on the Public LB, but I knew it was overfitting; my ensemble model was giving around 0.706 on the Public LB, but I considered it to be more robust 22 | 23 | 2. The 0.71 single model scored around 0.68329 on the Private LB, while the ensemble model scored around 0.69304 on the Private LB, so the assumption that the ensemble model would be more robust proved to be right 24 | 25 | 3. I tried one more thing just for fun: I built the eval metric in Excel and extracted the probability of each class from the model. After that I optimized the weight of each class probability to maximize the eval metric. I didn't use it in the final solution, but it would have been a fun thing to try :) 26 | -------------------------------------------------------------------------------- /Hacker-Earth---Will-Bill-Solve-it-/README.md: -------------------------------------------------------------------------------- 1 | # Approach and Codes for Hacker-Earth Will-Bill-Solve-it? 2 | 3 | https://www.hackerearth.com/machine-learning-india-hacks-2016/machine-learning/will-bill-solve-it/ 4 | 5 | Finished 4th on the Public LB (AUC 0.833; winners 0.834) 6 | 7 | ## Problem Statement: 8 | HackerEarth is a community of programmers. Thousands of hackers solve problems on HackerEarth every day to improve their programming skills or win prizes. These hackers can be beginners who are new to programming, or experts who know the solution in a blink. There is a pattern to everything, and this problem is about finding those patterns and the problem-solving behaviour of the users. 9 | 10 | Finding these patterns will be of immense help to the problem solvers, as it will allow us to suggest relevant problems to solve and offer solutions when they seem to be stuck. The opportunities are diverse, and you are entrusted with the task of predicting them. 11 | 12 | ## Data Sets: 13 | Both the training and testing datasets consist of 3 files: 14 | 15 | ### 1) User File: 16 | Attributes of a user:
17 |
18 | user_id - the user id
19 | skills - all his skills separated by the delimiter '|'
20 | solved_count - number of problems solved by the user
21 | attempts - total number of incorrect submissions done by the user
22 | user_type : type of user (S - Student, W - Working, NA - No Information Available)
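Since `skills` is pipe-delimited, the per-skill binary flags and success-rate feature described under Approach below could be derived roughly as follows (a sketch only; `users_train.csv` and the exact column spellings are assumptions, not files from this repo):

```python
import pandas as pd

# Assumed raw user file; not included in this repository.
users = pd.read_csv("users_train.csv")

# One 1/0 column per skill, splitting on the '|' delimiter described above.
skill_flags = users["skills"].str.get_dummies(sep="|")
users = pd.concat([users, skill_flags], axis=1)

# Count of skills per user and the success-rate percentage from the Approach section.
users["skill_count"] = skill_flags.sum(axis=1)
users["success_rate_pct"] = users["solved_count"] * 100 / (users["attempts"] + users["solved_count"])
# Note: users with zero attempts and zero solves would need special handling here.
```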
23 | 24 | ### 2) Problem File: 25 | Attribute related to a Problem :
26 |
27 | problem_id - the id of the problem
28 | level - difficulty of the problem (Very-Easy, Easy, Easy-Medium, Medium, Medium-Hard, Hard)
29 | accuracy - the accuracy score for the problem
30 | solved_count - number of people who have solved it
31 | error_count - number of people who have solved it incorrectly
32 | rating - star (quality) rating of the problem on scale of 0-5
33 | tag1 - tag of the problem representing the type e.g. Data Structures
34 | tag2 - tag of the problem
35 | tag3 - tag of the problem
36 | tag4 - tag of the problem
37 | tag5 - tag of the problem
38 | 39 | ### 3) Submissions File: 40 | Problem-user interaction and final results for each attempt a user made to solve a particular problem.
41 |
42 | user_id - the id of the user who made a submission
43 | problem_id - the id of the problem that was attempted
44 | solved_status - indicates whether the submission was correct (SO : Solved or Correct solution, AT : Attempted or Incorrect solution )
45 | result - result of the code execution (PAC: Partially Accepted, AC : Accepted, TLE : Time limit exceeded, CE : Compilation Error, RE : Runtime Error, WA : Wrong Answer)
46 | language_used - the language used by the user to code the solution
47 | execution_time - the execution time of the solution
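As a rough illustration (a sketch, not code shipped in this repo) of how these fields feed the target variable described under Approach below; the file name `submissions_train.csv` and the exact threshold are assumptions:

```python
import pandas as pd

# Assumed raw training submissions file; not included in this repository.
subs = pd.read_csv("submissions_train.csv")

# Drop attempts with unknown solved status (UK), as described under Approach.
subs = subs[subs["solved_status"] != "UK"]

# Flag each attempt as solved (SO) or not, then roll up to one row per (user_id, problem_id).
subs["solved"] = (subs["solved_status"] == "SO").astype(int)
rolled = subs.groupby(["user_id", "problem_id"], as_index=False)["solved"].sum()

# Final binary target; the Approach text says '> 1', though '>= 1' may be what was intended.
rolled["target"] = (rolled["solved"] > 1).astype(int)
```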
48 | 49 | ## Approach: 50 | ### Preprocessing 51 | #### User File: 52 | 1. Created features from user skills: there are a total of 24 skills in the skills column, so I created a binary flag for each skill 53 | 2. Counted the total number of skills a user has 54 | 3. User Success Rate Percentage: Solved count * 100 / (Attempts + Solved count) 55 | 56 | #### Problem File: 57 | 1. Count of tags: counted the number of tags present for a problem (i.e., 5 minus the number of NA tags) 58 | 2. Created a dictionary file for the 81 unique tags present in Tag1 to Tag5, and bucketed them into 17 super categories based on business understanding of the tags 59 | 3. Made binary features (1/0) for each of the 17 super categories, set to 1 if the respective tag is present for the problem 60 | 4. Imputed missing values with zero 61 | 5. Treated text variables as categorical variables and encoded each with numeric values 62 | 6. Accuracy Measure: (Solved count * 100) / (Solved count + Error count) 63 | 64 | #### Submission File (only for the training submission file): 65 | 1. Removed entries which have solved status UK (Unknown) 66 | 2. Created a 1/0 target variable: 1 if solved status == "SO", else 0 67 | 3. Rolled up the data to the User ID, Problem ID level and summed the solved status (see the sketch after the Submissions File fields above) 68 | 4. Created the final target variable (1/0) by checking whether the summed solved status is > 1 69 | 70 | Merged all 3 files to get the final training and testing sets 71 | 72 | ### Modelling 73 | 1. I trained 3 XGBoost models with different numbers of rounds, but the same probability cutoffs 74 | 2. Did a vote ensemble of the three models 75 | 76 | 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Repository of Codes 2 | 3 | This is a compiled repository of codes I wrote in various competitions.
4 | 5 | -------------------------------------------------------------------------------- /minnemudac/AvgWaterQualityByLake&Season.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LAKE_NAME 3 | , YEAR(START_DATE) AS Year 4 | , CASE 5 | WHEN MONTH(START_DATE) IN (12, 1, 2) THEN 'Winter' 6 | WHEN MONTH(START_DATE) BETWEEN 3 AND 5 THEN 'Spring' 7 | WHEN MONTH(START_DATE) BETWEEN 6 AND 9 THEN 'Summer' 8 | ELSE 'Fall' 9 | END AS Season 10 | 11 | -- For ordering 12 | , CASE 13 | WHEN MONTH(START_DATE) IN (12, 1, 2) THEN 4 14 | WHEN MONTH(START_DATE) BETWEEN 3 AND 5 THEN 1 15 | WHEN MONTH(START_DATE) BETWEEN 6 AND 9 THEN 2 16 | ELSE 3 17 | END AS SeasonNum 18 | 19 | , AVG(RECREATIONAL_SUITABILITY_RESULT) AS RECREATIONAL_SUITABILITY_RESULT 20 | , AVG(PHYSICAL_CONDITION_RESULT) AS PHYSICAL_CONDITION_RESULT 21 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 22 | , AVG(TOTAL_PHOSPHORUS_RESULT) AS TOTAL_PHOSPHORUS_RESULT 23 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 24 | WHERE 25 | 26 | -- Worst lakes: Top 10 lakes with lowest Secchi depths 27 | LAKE_NAME IN ('Benton Lake','Hazeltine Lake','Cobblecrest Lake','Downs Lake','Penn Lake' 28 | ,'Winkler Lake','Meadow Lake','Cornelia Lake','Cedar Island Lake','Gaystock Lake') 29 | 30 | -- Best lakes: Top 10 lakes with highest Secchi depths 31 | -- LAKE_NAME IN ('West Boot Lake','Brickyard Clayhole Lake','Big Carnelian Lake','Jane Lake' 32 | -- ,'Halfbreed Lake' /*What the hell kind of name is this?!*/,'Little Long Lake','Mays Lake' 33 | -- ,'Christmas Lake','Little Carnelian Lake','Square Lake') 34 | 35 | 36 | AND SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 37 | 38 | GROUP BY LAKE_NAME, Year, Season, SeasonNum 39 | ORDER BY LAKE_NAME, Year, SeasonNum 40 | -------------------------------------------------------------------------------- /minnemudac/AvgWaterQualityByLake.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LAKE_NAME 3 | 4 | , AVG(RECREATIONAL_SUITABILITY_RESULT) AS RECREATIONAL_SUITABILITY_RESULT 5 | , AVG(PHYSICAL_CONDITION_RESULT) AS PHYSICAL_CONDITION_RESULT 6 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 7 | , AVG(TOTAL_PHOSPHORUS_RESULT) AS TOTAL_PHOSPHORUS_RESULT 8 | , COUNT(*) AS NumberRecords 9 | 10 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 11 | WHERE 12 | SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 13 | GROUP BY LAKE_NAME 14 | HAVING COUNT(*) > 50 -- Removes ~1/3 of the data, but also removes unreliable lakes 15 | 16 | ORDER BY SECCHI_DEPTH_RESULT ASC -- Using Secchi depth since physical condition/recreational condition isn't available for all lakes 17 | -------------------------------------------------------------------------------- /minnemudac/DuplicatePropertyCheck.sql: -------------------------------------------------------------------------------- 1 | WITH test AS ( 2 | SELECT 3 | CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) = 1 THEN 1 ELSE NULL END AS Original 4 | , CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) = 2 THEN 1 ELSE NULL END AS Duplicate 5 | , CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) > 2 THEN 1 ELSE NULL END AS MoreThanTwo 6 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 7 | ) 8 | 9 | SELECT 10 | SUM(Original) AS Original 11 | , SUM(Duplicate) AS Duplicate 12 | , SUM(MoreThanTwo) AS MoreThanTwo 13 | FROM test 14 | 
-------------------------------------------------------------------------------- /minnemudac/NumberPropertyTypesPerLake.sql: -------------------------------------------------------------------------------- 1 | -- Disable "Use Legacy SQL" under Google BigQuery to support CTEs 2 | 3 | WITH lake AS ( 4 | SELECT 5 | LAKE_NAME AS LakeName 6 | , DNR_ID_SITE_NUMBER 7 | FROM `datadive-142319.mces_lakes.1999_2014_monitoring_data` 8 | GROUP BY LAKE_NAME, DNR_ID_SITE_NUMBER 9 | ) 10 | 11 | , residential AS ( 12 | SELECT 13 | USE1_DESC AS PropertyType 14 | , centroid_long 15 | , centroid_lat 16 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 17 | WHERE LTRIM(LOWER(USE1_DESC)) LIKE '1__%' 18 | OR LOWER(USE1_DESC) LIKE '%residential%' 19 | OR LOWER(USE1_DESC) LIKE '%res%' 20 | OR LOWER(USE1_DESC) LIKE '%house%' 21 | OR LOWER(USE1_DESC) LIKE '%condo%' 22 | OR LOWER(USE1_DESC) LIKE '%apartment%' 23 | OR LOWER(USE1_DESC) LIKE '%apt%' 24 | OR LOWER(USE1_DESC) LIKE '%plex%' 25 | OR LOWER(USE1_DESC) LIKE '%bungalo%' 26 | OR LOWER(USE1_DESC) LIKE '%housing%' 27 | OR LOWER(USE1_DESC) LIKE '%home%' 28 | OR LOWER(USE1_DESC) LIKE '%family%' 29 | GROUP BY USE1_DESC, centroid_long, centroid_lat 30 | ) 31 | 32 | , agriculture AS ( 33 | SELECT 34 | USE1_DESC AS PropertyType 35 | , centroid_long 36 | , centroid_lat 37 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 38 | WHERE LOWER(USE1_DESC) LIKE '2__%' 39 | OR LOWER(USE1_DESC) LIKE '%ag%' 40 | OR LOWER(USE1_DESC) LIKE '%farm%' 41 | OR LOWER(USE1_DESC) LIKE '%rural%' 42 | ) 43 | 44 | , commercial AS ( 45 | SELECT 46 | USE1_DESC AS PropertyType 47 | , centroid_long 48 | , centroid_lat 49 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 50 | WHERE LOWER(USE1_DESC) LIKE '3__%' 51 | OR LOWER(USE1_DESC) LIKE '%commercial%' 52 | OR LOWER(USE1_DESC) LIKE '%machinery%' 53 | OR LOWER(USE1_DESC) LIKE '%recreational%' 54 | OR LOWER(USE1_DESC) LIKE '%golf%' 55 | OR LOWER(USE1_DESC) LIKE '%coop%' 56 | ) 57 | 58 | , industrial AS ( 59 | SELECT 60 | USE1_DESC AS PropertyType 61 | , centroid_long 62 | , centroid_lat 63 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 64 | WHERE LOWER(USE1_DESC) LIKE '%ind%' 65 | OR LOWER(USE1_DESC) = '305 industrial' 66 | ) 67 | 68 | , public AS ( 69 | SELECT 70 | USE1_DESC AS PropertyType 71 | , centroid_long 72 | , centroid_lat 73 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 74 | WHERE LOWER(USE1_DESC) LIKE '9__%' 75 | OR LOWER(USE1_DESC) LIKE '%public%' 76 | OR LOWER(USE1_DESC) LIKE '%muni%' 77 | OR LOWER(USE1_DESC) LIKE '%rail%' 78 | OR LOWER(USE1_DESC) LIKE '%church%' 79 | OR LOWER(USE1_DESC) LIKE '%school%' 80 | OR LOWER(USE1_DESC) LIKE '%forest%' 81 | OR LOWER(USE1_DESC) LIKE '%state%' 82 | OR LOWER(USE1_DESC) LIKE '%county%' 83 | OR LOWER(USE1_DESC) LIKE '%util%' 84 | OR LOWER(USE1_DESC) LIKE '%college%' 85 | OR LOWER(USE1_DESC) LIKE '%cem%' 86 | OR LOWER(USE1_DESC) LIKE '%common%' 87 | OR LOWER(USE1_DESC) LIKE '%road%' 88 | OR LOWER(USE1_DESC) LIKE '%fed%' 89 | OR LOWER(USE1_DESC) LIKE '%tax%' 90 | OR LOWER(USE1_DESC) LIKE '%dnr%' 91 | OR LOWER(USE1_DESC) LIKE '%charit%' 92 | OR LOWER(USE1_DESC) LIKE '%serv%' 93 | OR LOWER(USE1_DESC) LIKE '%hosp%' 94 | OR LOWER(USE1_DESC) LIKE '%park%' 95 | ) 96 | 97 | SELECT 98 | ROW_NUMBER() OVER (ORDER BY lake.LakeName) AS ID 99 | , lake.LakeName 100 | , COUNT(residential.PropertyType) AS ResidentialCount_2015 101 | , COUNT(agriculture.PropertyType) AS AgriculturalCount_2015 102 | , COUNT(commercial.PropertyType) AS 
CommercialCount_2015 103 | , COUNT(industrial.PropertyType) AS IndustrialCount_2015 104 | , COUNT(public.PropertyType) AS PublicCount_2015 105 | FROM lake 106 | JOIN `datadive-142319.sds_xref.parcel_to_water` AS intersection ON lake.DNR_ID_SITE_NUMBER = intersection.MCES_Map_Code1 107 | 108 | LEFT JOIN residential ON intersection.parcel_centroid_long = residential.centroid_long 109 | AND intersection.parcel_centroid_lat = residential.centroid_lat 110 | 111 | LEFT JOIN agriculture ON intersection.parcel_centroid_long = agriculture.centroid_long 112 | AND intersection.parcel_centroid_lat = agriculture.centroid_lat 113 | 114 | LEFT JOIN commercial ON intersection.parcel_centroid_long = commercial.centroid_long 115 | AND intersection.parcel_centroid_lat = commercial.centroid_lat 116 | 117 | LEFT JOIN industrial ON intersection.parcel_centroid_long = industrial.centroid_long 118 | AND intersection.parcel_centroid_lat = industrial.centroid_lat 119 | 120 | LEFT JOIN public ON intersection.parcel_centroid_long = public.centroid_long 121 | AND intersection.parcel_centroid_lat = public.centroid_lat 122 | 123 | GROUP BY lake.LakeName 124 | ORDER BY lake.LakeName ASC 125 | -------------------------------------------------------------------------------- /minnemudac/README.md: -------------------------------------------------------------------------------- 1 | # WELCOME TO SHERLOCK 2 | 3 | 4 | 5 | ### Density by Property Types Over Time 6 | 7 | Using properties joined to lakes in the intersection table 8 | 9 | 1) **PropertiesByYear**: Residential, Industrial, Commercial, Agricultural, and Public properties by lake from 2003 - 2015 using the "Number of Properties per Lake" query 10 | 11 | 2) **PropertiesPctChangeByYear**: YoY change by property type by lake. Percentages in decimal format (Ex. 2.5 indicates a 250% increase) 12 | 13 | 3) **PropertiesPctOfTotalByYear**: Percentage of total by lake by year by type for 2003 - 2015. -1 indicates a lack of the property type for that lake. 14 | -------------------------------------------------------------------------------- /minnemudac/Top&Bottom10LakesPerYear.sql: -------------------------------------------------------------------------------- 1 | /* Adjust line 8 to ASC for the top 10 lakes per year with the worst quality */ 2 | SELECT * 3 | FROM ( 4 | SELECT 5 | LAKE_NAME 6 | , YEAR(START_DATE) as Year 7 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 8 | , RANK(SECCHI_DEPTH_RESULT) OVER (PARTITION BY Year ORDER BY SECCHI_DEPTH_RESULT DESC) AS Rank 9 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 10 | WHERE 11 | SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 12 | AND SECCHI_DEPTH_RESULT IS NOT NULL 13 | AND YEAR(START_DATE) >= 1995 14 | GROUP BY LAKE_NAME, Year 15 | HAVING COUNT(*) > 5 16 | ) 17 | WHERE Rank <= 10 18 | ORDER BY Year DESC, Rank ASC -- Using Secchi depth since physical condition/recreational condition isn't available for all lakes 19 | --------------------------------------------------------------------------------