├── AV-Black-Friday ├── Code.py └── README.md ├── AV-Hackathon-3.X ├── Data_Preprocessing.R ├── Hack_3x_Modelling.py └── README.md ├── AV-Hackathon-3 ├── AV_wog.R ├── Feature Engineering.R └── README.md ├── Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- ├── FactorVariables.R ├── Final_Model.R ├── LoadData.R └── README.md ├── Analytics_Vidhya_3.X_Hackathon ├── README.md ├── Weekend │ ├── _1_preprocessing.py │ ├── _2_train_xgb.py │ ├── _3_preprocessing_ftrl.py │ ├── _4_train_ftrl.py │ ├── _5_postprocessing_ftrl.py │ ├── _6_ensemble.py │ └── av_script.sh ├── Weeklong │ ├── av_final.sh │ ├── ensemble_rank_final.py │ ├── postprocessing_RF.py │ ├── postprocessing_XGB_1.py │ ├── postprocessing_XGB_2.py │ ├── postprocessing_ftrl.py │ ├── preprocessing.py │ ├── preprocessing2.py │ ├── preprocessing_ftrl.py │ ├── script_ftrl.py │ ├── script_ftrl2.py │ ├── script_ftrl3.py │ ├── script_ftrl4.py │ ├── script_ftrl5.py │ ├── shuffle.py │ ├── train_2xgb1.py │ ├── train_2xgb2.py │ ├── train_2xgb3.py │ ├── train_2xgb4.py │ ├── train_2xgb5.py │ ├── train_rf.py │ ├── train_xgb.py │ ├── train_xgb2.py │ ├── train_xgb3.py │ ├── train_xgb4.py │ └── train_xgb5.py ├── XGB Tuning guide.md └── requirements.md ├── D-hack ├── Code.py └── README.md ├── Hacker-Earth---Will-Bill-Solve-it- ├── Final-Code.R └── README.md ├── README.md └── minnemudac ├── 1999_2014_monitoring_data.csv ├── AvgWaterQualityByLake&Season.sql ├── AvgWaterQualityByLake.sql ├── DuplicatePropertyCheck.sql ├── NumberPropertyTypesPerLake.sql ├── PropertiesByYear.csv ├── PropertiesPctChangeByYear.csv ├── PropertiesPctOfTotalByYear.csv ├── README.md ├── Top&Bottom10LakesPerYear.sql └── seasonal.csv /AV-Black-Friday/Code.py: -------------------------------------------------------------------------------- 1 | #importing libraries 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.ensemble import RandomForestRegressor 5 | import xgboost as xgb 6 | from sklearn.ensemble import GradientBoostingRegressor 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn import preprocessing 9 | from sklearn import ensemble 10 | 11 | # setting the input path and reading the data into dataframe # 12 | data_path = "E:/DS/AV Black Friday/" 13 | train = pd.read_csv(data_path+"Train.csv") 14 | test = pd.read_csv(data_path+"Test.csv") 15 | 16 | ## categical column name list ## 17 | categorical_columns = ["Product_ID","Gender","Age","Occupation","City_Category","Stay_In_Current_City_Years","Marital_Status","Product_Category_1","Product_Category_2","Product_Category_3"] 18 | 19 | ## Getting the ID and DV from the data frame ## 20 | train_y = np.array(train["Purchase"]) 21 | 22 | ## Creating the IDVs from the train and test dataframe ## 23 | train_X = train.copy() 24 | test_X = test.copy() 25 | 26 | ## Fill up the na values with -999 ## 27 | train_X = train_X.fillna(-999) 28 | test_X = test_X.fillna(-999) 29 | 30 | #encoding categorical variable 31 | for var in categorical_columns: 32 | lb = preprocessing.LabelEncoder() 33 | full_var_data = pd.concat((train_X[var],test_X[var]),axis=0).astype('str') 34 | lb.fit( full_var_data ) 35 | train_X[var] = lb.transform(train_X[var].astype('str')) 36 | test_X[var] = lb.transform(test_X[var].astype('str')) 37 | 38 | ## Dropping the unnecessary columns from IDVs ## 39 | train_X = np.array( train_X.drop(['Purchase'],axis=1) ) 40 | print "Train shape is : ",train_X.shape 41 | print "Test shape is : ",test_X.shape 42 | 43 | print "Building XGB1" 44 | params = {} 45 | params["objective"] = "reg:linear" 46 | 
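# (Illustrative notes; the sketches in this comment block are not part of the original script.)
# - Only objective, eta and seed are set, so every other XGBoost parameter
#   (max_depth, subsample, colsample_bytree, ...) keeps its default value; newer
#   xgboost releases name this objective "reg:squarederror".
# - With eta = 0.05, the hard-coded num_rounds = 5667 further down is the kind of
#   value usually picked by cross-validation once xgtrain exists, e.g.:
#       cv = xgb.cv(params, xgtrain, num_boost_round=10000, nfold=4,
#                   metrics="rmse", early_stopping_rounds=50, seed=0)
#       num_rounds = cv.shape[0]
# - The README below mentions a per-product mean-purchase feature built in Excel;
#   a pandas equivalent (not in this script) would be
#       prod_mean = train.groupby("Product_ID")["Purchase"].mean()
#       train["Product_Mean"] = train["Product_ID"].map(prod_mean)
#       test["Product_Mean"] = test["Product_ID"].map(prod_mean)
#   added before the label-encoding loop above.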
params["eta"] = 0.05 47 | params["seed"] = 0 48 | plst = list(params.items()) 49 | xgtrain = xgb.DMatrix(train_X, label=train_y, missing = -999) 50 | xgtest = xgb.DMatrix(test_X,missing = -999) 51 | num_rounds = 5667 52 | model = xgb.train(plst, xgtrain, num_rounds) 53 | pred_test_y_xgb1 = model.predict(xgtest) 54 | 55 | 56 | #submission 57 | test['Purchase']=pred_test_y_xgb1 58 | test.to_csv(data_path+'Solution.csv',columns = ['User_ID','Product_ID','Purchase'],index = False) 59 | -------------------------------------------------------------------------------- /AV-Black-Friday/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon - Black Friday 2 | http://datahack.analyticsvidhya.com/contest/black-friday-data-hack 3 | 4 | ##### Problem Statement 5 | The challenge was to predict purchase prices of various products purchased by customers based on historical purchase patterns. The data contained features like age, gender, marital status, categories of products purchased, city demographics etc. 6 | 7 | ##### My approach for the hackathon is as follows: 8 | 1. Looked into levels of data and ran a basic random forest to understand Feature importance, realized Product ID was the most important feature 9 | 2. Added a new variable in Excel with mean of each product 10 | 3. Converted all categorical variable in one hot encoded categories 11 | 4. Built an XGB over it and optimized parameters 12 | 5. Got a RMSE of 2465, Public Leader Board Ranking 7 , Private Leaderboard Ranking 5 13 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/Data_Preprocessing.R: -------------------------------------------------------------------------------- 1 | ########## AV Hackathon 3.X ######## 2 | 3 | #setting working library 4 | setwd("E:/DS/AV Hack 3.x") 5 | 6 | #loading libraries 7 | library(caret) 8 | library(randomForest) 9 | library(rpart) 10 | 11 | #reading the files 12 | train=read.csv("train.csv") 13 | test=read.csv("test.csv") 14 | 15 | str(train) 16 | 17 | #just converting some levels 18 | train$Var4 = as.factor(train$Var4) 19 | train$Var5 = as.factor(train$Var5) 20 | train$Disbursed = as.factor(train$Disbursed) 21 | train$DOB = as.Date(train$DOB, format = "%d-%m-%Y") 22 | train$Lead_Creation_Date = as.Date(train$Lead_Creation_Date, format = "%d-%m-%Y") 23 | 24 | 25 | 26 | 27 | test$Var4 = as.factor(test$Var4) 28 | test$Var5 = as.factor(test$Var5) 29 | test$DOB = as.Date(test$DOB, format = "%d-%m-%Y") 30 | test$Lead_Creation_Date = as.Date(test$Lead_Creation_Date, format = "%d-%m-%Y") 31 | 32 | 33 | #treating some of the variables 34 | 35 | #0) Creating Dummy Variable of class factors Gender 36 | #0.i) is_male 37 | is_male <- function(x) { 38 | if(x == "Male") { 39 | y <- 1 40 | } else { 41 | y <- 0 42 | } 43 | return(y) 44 | } 45 | 46 | train_1 <- cbind(train,is_male = as.factor(mapply(is_male,train$Gender))) 47 | test_1 <- cbind(test,is_male = as.factor(mapply(is_male,test$Gender))) 48 | 49 | 50 | 51 | ############################################################################## 52 | #1) Adding age by using DOB and Lead_Creation_Date column 53 | train_1 <- cbind(train_1,age = as.integer(round((train_1$Lead_Creation_Date - train$DOB)/365,digits =0))) 54 | test_1 <- cbind(test_1,age = as.integer(round((test_1$Lead_Creation_Date - test$DOB)/365,digits = 0))) 55 | 56 | 57 | ############################################################################## 58 | #2) 
Extraction and Addition of DOB month and year 59 | train_1 <- cbind(train_1,DOB_month = as.factor(format(train_1$DOB,'%m'))) 60 | train_1 <- cbind(train_1,DOB_year = as.factor(format(train_1$DOB,'%Y'))) 61 | 62 | test_1 <- cbind(test_1,DOB_month = as.factor(format(test_1$DOB,'%m'))) 63 | test_1 <- cbind(test_1,DOB_year = as.factor(format(test_1$DOB,'%Y'))) 64 | 65 | ############################################################################## 66 | #3) Extraction and Addition of Lead_Creation_Date month 67 | train_1 <- cbind(train_1,Lead_Creation_day = as.factor(format(train_1$Lead_Creation_Date,'%d'))) 68 | train_1 <- cbind(train_1,Lead_Creation_month = as.factor(format(train_1$Lead_Creation_Date,'%m'))) 69 | train_1 <- cbind(train_1,Lead_Creation_year = as.factor(format(train_1$Lead_Creation_Date,'%Y'))) 70 | 71 | test_1 <- cbind(test_1,Lead_Creation_day = as.factor(format(test_1$Lead_Creation_Date,'%d'))) 72 | test_1 <- cbind(test_1,Lead_Creation_month = as.factor(format(test_1$Lead_Creation_Date,'%m'))) 73 | test_1 <- cbind(test_1,Lead_Creation_year = as.factor(format(test_1$Lead_Creation_Date,'%Y'))) 74 | 75 | 76 | ############################################################################## 77 | #4) Treating Loan_Amount_Submitted by adding zero 78 | train_1$Loan_Amount_Submitted[is.na(train_1$Loan_Amount_Submitted)] <- train_1$Loan_Amount_Applied[is.na(train_1$Loan_Amount_Submitted)] 79 | test_1$Loan_Amount_Submitted[is.na(test_1$Loan_Amount_Submitted)] <- test_1$Loan_Amount_Applied[is.na(test_1$Loan_Amount_Submitted)] 80 | 81 | ############################################################################## 82 | #5) Treating Loan_Tenure_Submitted by adding zero 83 | train_1$Loan_Tenure_Submitted[is.na(train_1$Loan_Tenure_Submitted)] <- train_1$Loan_Tenure_Applied[is.na(train_1$Loan_Tenure_Submitted)] 84 | test_1$Loan_Tenure_Submitted[is.na(test_1$Loan_Tenure_Submitted)] <- test_1$Loan_Tenure_Applied[is.na(test_1$Loan_Tenure_Submitted)] 85 | 86 | ############################################################################## 87 | #6) Treating Processing_Fee and EMI_Loan_Submitted 88 | Processing_Fee_null_train <- is.na(train_1$Processing_Fee) 89 | Processing_Fee_null_test <- is.na(test_1$Processing_Fee) 90 | 91 | train_1$Processing_Fee[is.na(train_1$Processing_Fee)] <- 0 92 | test_1$Processing_Fee[is.na(test_1$Processing_Fee)] <- 0 93 | 94 | 95 | EMI_Loan_Submitted_null_train <- is.na(train_1$EMI_Loan_Submitted) 96 | EMI_Loan_Submitted_null_test <- is.na(test_1$EMI_Loan_Submitted) 97 | 98 | train_1$EMI_Loan_Submitted[is.na(train_1$EMI_Loan_Submitted)] <- 0 99 | test_1$EMI_Loan_Submitted[is.na(test_1$EMI_Loan_Submitted)] <- 0 100 | 101 | ############################################################################## 102 | #7) Creating Counter for Existing EMI and Interest Rate 103 | Existing_EMI_null_train <- is.na(train_1$Existing_EMI) 104 | Interest_Rate_null_train <- is.na(train_1$Interest_Rate) 105 | 106 | Existing_EMI_null_test <- is.na(test_1$Existing_EMI) 107 | Interest_Rate_null_test <- is.na(test_1$Interest_Rate) 108 | 109 | 110 | ############################################################################## 111 | #8) Missing value Imputation of columns 112 | 113 | 114 | numeric_columns <- NULL 115 | for (i in 1:ncol(train_1)){ 116 | if(class(train_1[,i]) == "integer" | class(train_1[,i]) == "numeric") { 117 | numeric_columns <- rbind(numeric_columns,i) 118 | } 119 | } 120 | 121 | preproc <- preProcess(method = "bagImpute", train_1[,numeric_columns[-10]]) 122 | 123 | 
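# preProcess(method = "bagImpute") fits a bagged-tree model for each numeric
# column as a function of the other numeric columns; the predict() calls that
# follow apply that single fit to both train_1 and test_1, filling whatever NAs
# remain in the numeric fields (per the README: Interest_Rate plus any loan
# amount/tenure values not already back-filled from the *_Applied columns).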
train_1_imputed <- predict(preproc, train_1[,numeric_columns]) 124 | numeric_columns_1 <- NULL 125 | for (i in 1:ncol(train_1_imputed)){ 126 | if(class(train_1_imputed[,i]) == "integer" | class(train_1_imputed[,i]) == "numeric") { 127 | numeric_columns_1 <- rbind(numeric_columns_1,i) 128 | } 129 | } 130 | train_1[,numeric_columns] <- train_1_imputed[,numeric_columns_1] 131 | 132 | #train_1$Loan_Tenure_Submitted <- train_1_imputed$Loan_Tenure_Submitted 133 | 134 | numeric_columns <- NULL 135 | for (i in 1:ncol(test_1)){ 136 | if(class(test_1[,i]) == "integer" | class(test_1[,i]) == "numeric") { 137 | numeric_columns <- rbind(numeric_columns,i) 138 | } 139 | } 140 | 141 | 142 | test_1_imputed <- predict(preproc, test_1[,numeric_columns]) 143 | numeric_columns_1 <- NULL 144 | for (i in 1:ncol(test_1_imputed)){ 145 | if(class(test_1_imputed[,i]) == "integer" | class(test_1_imputed[,i]) == "numeric") { 146 | numeric_columns_1 <- rbind(numeric_columns_1,i) 147 | } 148 | } 149 | 150 | 151 | test_1[,numeric_columns] <- test_1_imputed[,numeric_columns_1] 152 | 153 | 154 | ####################################################################### 155 | #9) New Variable Creation : EMI_calculated 156 | 157 | EMI <- function(x,y,z) { 158 | if(y == 0 | z == 0) { 159 | a <- 0 160 | } else { 161 | b <- y/1200 162 | c <- z*12 163 | a <- (x*b*((1+b)^c)) / (((1+b)^c) - 1) 164 | } 165 | return(a) 166 | } 167 | 168 | train_1 <- cbind(train_1,EMI_calculated= as.numeric(mapply(EMI,x = train_1$Loan_Amount_Submitted , y = train_1$Interest_Rate,z = train_1$Loan_Tenure_Submitted))) 169 | test_1 <- cbind(test_1,EMI_calculated = as.numeric(mapply(EMI,x = test_1$Loan_Amount_Submitted , y = test_1$Interest_Rate,z = test_1$Loan_Tenure_Submitted))) 170 | 171 | 172 | ####################################################################### 173 | #10) New Variable Creation : Future_EMI_perincome index 174 | 175 | train_1 <- cbind(train_1,Future_EMI_perincome = as.numeric((train_1$Existing_EMI + train_1$EMI_calculated) / (train_1$Monthly_Income+1))) 176 | test_1 <- cbind(test_1,Future_EMI_perincome = as.numeric((test_1$Existing_EMI + test_1$EMI_calculated) / (test_1$Monthly_Income+1))) 177 | 178 | train_1$Future_EMI_perincome[train_1$Future_EMI_perincome > 2] = 2 179 | test_1$Future_EMI_perincome[test_1$Future_EMI_perincome > 2] = 2 180 | 181 | ## Creating is_zero function 182 | is_zero <- function(x) { 183 | if(x == 0) { 184 | a <- 1 185 | } else { 186 | a <- 0 187 | } 188 | return(a) 189 | } 190 | 191 | 192 | 193 | ####################################################################### 194 | #11) Changing monthly income outliers 195 | 196 | train_1$Monthly_Income[train_1$Monthly_Income > 1000000] = 1000000 197 | test_1$Monthly_Income[test_1$Monthly_Income > 1000000] = 1000000 198 | 199 | ####################################################################### 200 | #12) New Variable Creation : Process_percent 201 | 202 | train_1 <- cbind(train_1,Proces_perct = as.numeric((train_1$Processing_Fee*100) / (train_1$Monthly_Income+1))) 203 | test_1 <- cbind(test_1,Proces_perct = as.numeric((test_1$Processing_Fee*100) / (test_1$Monthly_Income+1))) 204 | 205 | train_1$Proces_perct[train_1$Proces_perct > 40] = 40 206 | test_1$Proces_perct[test_1$Proces_perct > 40] = 40 207 | 208 | ####################################################################### 209 | #13) New Variable Creation : exist_EMI_perincome index 210 | 211 | train_1 <- cbind(train_1,exist_EMI_perincome = as.numeric((train_1$Existing_EMI) / (train_1$Monthly_Income+1))) 212 
| test_1 <- cbind(test_1,exist_EMI_perincome = as.numeric((test_1$Existing_EMI) / (test_1$Monthly_Income+1))) 213 | 214 | train_1$exist_EMI_perincome[train_1$exist_EMI_perincome > 1.5] = 1.5 215 | test_1$exist_EMI_perincome[test_1$exist_EMI_perincome > 1.5] = 1.5 216 | 217 | #14) New Variable Creation : exx_EMI_perincome index 218 | 219 | train_1 <- cbind(train_1,exx_EMI_perincome = as.numeric((train_1$EMI_calculated) / (train_1$Monthly_Income+1))) 220 | test_1 <- cbind(test_1,exx_EMI_perincome = as.numeric((test_1$EMI_calculated) / (test_1$Monthly_Income+1))) 221 | 222 | train_1$exx_EMI_perincome[train_1$exx_EMI_perincome > 2] = 2 223 | test_1$exx_EMI_perincome[test_1$exx_EMI_perincome > 2] = 2 224 | 225 | #15) Removal of some Columns 226 | remove_var <- c('Gender','LoggedIn','EMI_Loan_Submitted') 227 | train_1 <- train_1[ , -which(names(train_1) %in% remove_var)] 228 | test_1 <- test_1[ , -which(names(test_1) %in% remove_var)] 229 | 230 | #16) Final file for modelling 231 | write.csv(test_1,file="test_1.csv",row.names=FALSE) 232 | write.csv(train_1,file="train_1.csv",row.names=FALSE) 233 | 234 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/Hack_3x_Modelling.py: -------------------------------------------------------------------------------- 1 | ######### Python code for AV Hack 3.x , Author = Aayush Agrawal ########## 2 | 3 | # Step 1: Importing Libraries 4 | from sklearn.ensemble import RandomForestClassifier 5 | from sklearn.linear_model import LogisticRegression 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import preprocessing 9 | from sklearn.metrics import roc_curve, auc 10 | import pandas as pd 11 | from sklearn import ensemble 12 | import random 13 | import xgboost as xgb 14 | 15 | #Step 2 : Defining a 1/0 hard enccoder function 16 | number = preprocessing.LabelEncoder() 17 | 18 | #Step 3 : Importing Train and testing data after preprocessing from R code 19 | train=pd.read_csv('E:/DS/AV Hack 3.x/train_1.csv') 20 | test=pd.read_csv('E:/DS/AV Hack 3.x/test_1.csv') 21 | 22 | #Step 4 : Having a look at the data 23 | train.head() 24 | 25 | 26 | #Step 5 : Converting factor variables in 1/0 encoding and making any missing value -999 27 | def convert(data): 28 | number = preprocessing.LabelEncoder() 29 | data['Lead_Creation_Date'] = number.fit_transform(data.Lead_Creation_Date) 30 | data['is_male'] = number.fit_transform(data.is_male) 31 | data['City'] = number.fit_transform(data.City) 32 | data['Salary_Account'] = number.fit_transform(data.Salary_Account) 33 | data['Employer_Name'] = number.fit_transform(data.Employer_Name) 34 | data['Mobile_Verified'] = number.fit_transform(data.Mobile_Verified) 35 | data['Var1'] = number.fit_transform(data.Var1) 36 | data['Filled_Form'] = number.fit_transform(data.Filled_Form) 37 | data['Device_Type'] = number.fit_transform(data.Device_Type) 38 | data['Var2'] = number.fit_transform(data.Var2) 39 | data['Var5'] = number.fit_transform(data.Var5) 40 | data['Var4'] = number.fit_transform(data.Var4) 41 | data['DOB_month'] = number.fit_transform(data.DOB_month) 42 | data['DOB_year'] = number.fit_transform(data.DOB_year) 43 | data['Lead_Creation_day'] = number.fit_transform(data.Lead_Creation_day) 44 | data['Lead_Creation_month'] = number.fit_transform(data.Lead_Creation_month) 45 | data['Source'] = number.fit_transform(data.Source) 46 | data=data.fillna(-999) 47 | return data 48 | 49 | train=convert(train) 50 | test=convert(test) 51 | 52 | #Step 6 : Running my 1st Model XGB 53 | # Step 
6.i): Defining features for XGB 54 | features=['City', 55 | 'Monthly_Income', 56 | 'Lead_Creation_Date', 57 | 'Loan_Amount_Applied', 58 | 'Loan_Tenure_Applied', 59 | 'Existing_EMI', 60 | 'Employer_Name', 61 | 'Salary_Account', 62 | 'Mobile_Verified', 63 | 'Var5', 64 | 'Var1', 65 | 'Loan_Amount_Submitted', 66 | 'Loan_Tenure_Submitted', 67 | 'Interest_Rate', 68 | 'Processing_Fee', 69 | 'Filled_Form', 70 | 'Device_Type', 71 | 'Var2', 72 | 'Source', 73 | 'Var4', 74 | 'is_male', 75 | 'age', 76 | 'DOB_month', 77 | 'DOB_year', 78 | 'Lead_Creation_day', 79 | 'Lead_Creation_month', 80 | 'EMI_calculated', 81 | 'Future_EMI_perincome', 82 | 'Proces_perct', 83 | 'exist_EMI_perincome', 84 | 'exx_EMI_perincome' 85 | #'Profit_perc' 86 | #'EMI_Loan_Submitted' 87 | ] 88 | 89 | ## Step 6.ii) Preparing data from the features listed 90 | x_train = train[list(features)].values 91 | y_train = train['Disbursed'].values 92 | x_test=test[list(features)].values 93 | 94 | 95 | ## Step 6.iii) Defining Parameters 96 | params = {} 97 | params["objective"] = "binary:logistic" 98 | params["eta"] = 0.01 99 | params["min_child_weight"] = 7 100 | params["subsample"] = 0.7 101 | params["colsample_bytree"] = 0.7 102 | params["scale_pos_weight"] = 0.8 103 | params["silent"] = 0 104 | params["max_depth"] = 4 105 | params["seed"] = 0 106 | params["eval_metric"] = "auc" 107 | 108 | plst = list(params.items()) 109 | num_rounds = 1525 110 | 111 | xgtrain = xgb.DMatrix(x_train,label=y_train,missing=-999) 112 | xgtest = xgb.DMatrix(x_test,missing=-999) 113 | 114 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 115 | 116 | # Step 6.iv) Running the trained model on testing file 117 | pred_test_y_xgb1 = model_xgb.predict(xgtest) 118 | test['Disbursed']=pred_test_y_xgb1 119 | 120 | # Step 6.v) Getting the output 121 | test.to_csv('E:/DS/AV Hack 3.x/Solution_xgb.csv', columns=['ID','Disbursed'],index=False) 122 | 123 | ## Step 7) Running my 2nd Model Random Forest 124 | 125 | # Step 7.i): Defining features for Random Forest 126 | features=['City', 127 | 'Monthly_Income', 128 | 'Lead_Creation_Date', 129 | 'Loan_Amount_Applied', 130 | 'Loan_Tenure_Applied', 131 | 'Existing_EMI', 132 | 'Employer_Name', 133 | 'Salary_Account', 134 | #'Mobile_Verified', 135 | 'Var5', 136 | 'Var1', 137 | 'Loan_Amount_Submitted', 138 | 'Loan_Tenure_Submitted', 139 | 'Interest_Rate', 140 | 'Processing_Fee', 141 | 'Filled_Form', 142 | #'Device_Type', 143 | 'Var2', 144 | 'Source', 145 | 'Var4', 146 | #'is_male', 147 | 'age', 148 | 'DOB_month', 149 | 'DOB_year', 150 | 'Lead_Creation_day', 151 | #'Lead_Creation_month', 152 | 'EMI_calculated', 153 | 'Future_EMI_perincome', 154 | 'Proces_perct', 155 | 'exist_EMI_perincome', 156 | 'exx_EMI_perincome' 157 | #'Profit_perc' 158 | #'EMI_Loan_Submitted' 159 | ] 160 | 161 | ## Step 7.ii) Preparing data from the features listed 162 | x_train = train[list(features)].values 163 | y_train = train['Disbursed'].values 164 | x_test=test[list(features)].values 165 | 166 | 167 | ## Step 7.iii) Running Model : Random Forest , 1000 classifier 168 | rf = ensemble.RandomForestClassifier(n_estimators=1000,min_samples_leaf=50, max_features="auto", n_jobs=4, random_state=0) 169 | rf.fit(x_train, y_train) 170 | 171 | 172 | ## Step 7.iv) Looking at Feature Importance 173 | importances = rf.feature_importances_ 174 | indices = np.argsort(importances) 175 | 176 | ind=[] 177 | for i in indices: 178 | ind.append(features[i]) 179 | 180 | import matplotlib.pyplot as plt 181 | plt.figure(1) 182 | plt.title('Feature Importances') 183 | 
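# np.argsort orders the feature indices from least to most important and `ind`
# holds the matching feature names, so the horizontal bar chart drawn next lists
# features bottom-to-top with the most important one at the top. The plot is
# purely diagnostic; it does not feed into the predictions written out below.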
plt.barh(range(len(indices)), importances[indices], color='b', align='center') 184 | plt.yticks(range(len(indices)),ind) 185 | plt.xlabel('Relative Importance') 186 | plt.show() 187 | 188 | # Step 7.v) Running the trained model on testing file 189 | disbursed = rf.predict_proba(x_test) 190 | test['Disbursed']=disbursed[:,1] 191 | 192 | # Step 7.vi) Getting the output 193 | test.to_csv('E:/DS/AV Hack 3.x/Solution_rf.csv', columns=['ID','Disbursed'],index=False) 194 | 195 | 196 | ### After that Ensemble the model in excel using Rank.avg function and assignment of weight 0.66 to XGBoost model and 0.33 to RF model 197 | 198 | ####### END ####### Happy Learning #### 199 | -------------------------------------------------------------------------------- /AV-Hackathon-3.X/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.X - Predict-customer-worth-for-happy-customer-bank 2 | 3 | http://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802 4 | 5 | ##### My approach for the hackathon is as follows: 6 | 7 | 8 | ###### Data Preprocessing ( R Code) 9 | 1. I looked into levels of data and created a data dictionary by mentioning the level gaps, as I figured out that there is difference in level of data in training and testing data set (Like some cities are only in training dataset but are missing from testing and vice versa) 10 | 11 | 2. Treated City and Employee Name column by removing extra spaces and making proper font 12 | 13 | 3. Removed Extra levels from city by looking at count of cities finally reduced it to 15 levels by making other cities as "Others" 14 | 15 | 4. Removed some extra levels from Employee names, replaced all Employers below 30 cases to "Others" 16 | 17 | 5. Extracted Date, Month and Year from DOB column and then removed DOB because of many levels 18 | 19 | 6. Extracted Day and Month from Lead Creation Date, but kept Lead creation date 20 | 21 | 7. Replaced missing values of Loan Amount and Tenure submitted from Loan Amount and Tenure Applied 22 | 23 | 8. Replaced missing values of Processing Fee to zero 24 | 25 | 9. Imputed missing value of Interest Rate, Loan Amount Submitted and Loan tenure by using bagged imputation from R caret 26 | 27 | 10. Created a new variable of EMI_calculated : E = P×r×(1 + r)n/((1 + r)n - 1) 28 | 29 | 11. Created a new variable of Future_EMI_perincome ratio : (Existing EMI + EMI submitted)/ Monthly Income, restrited value till 2 30 | 31 | 12. Removed outlier from Monthly income by anything greater than 1,000,000 to 1,000,000 32 | 33 | 13. Created a new variable Process_percent : (Processing Fee/ Monthly Income) * 100, restricted it to 40 34 | 35 | 14. Created two variables exist_EMI_perincome(Existing EMI / Monthly income) and exx_EMI_perincome (EMI_calculated/ Monthly income) 36 | 37 | ###### Modelling (Python) 38 | 39 | 1. Used Extreme Gradient boosting and optimized the tuning parameterss based on local CV score, as many solutions on LB were proven to be overfitting in Weekender version 40 | 41 | 2. Final XGB model had a Local CV(4-Fold) score of 0.854141 +- 0.004308 and a LB rating of 0.85456 42 | 43 | 3. Used a Random forest classifier(1000 trees) and tuned it on a 75:25 approach 44 | 45 | 4. Final RF model was having local score of 0.84233 and a LB rating of 0.85213 46 | 47 | 5. Finally used Rank Average Ensembing for the final solution. 
Weights (2*XGB_score + Rf_score)/3 48 | 49 | -------------------------------------------------------------------------------- /AV-Hackathon-3/AV_wog.R: -------------------------------------------------------------------------------- 1 | #################### AV Hackathon 3#################### 2 | 3 | #setting working library 4 | setwd("E:/DS/AV wog") 5 | 6 | #loading libraries 7 | library(caret) 8 | library(randomForest) 9 | 10 | #reading the files 11 | train=read.csv("train.csv") 12 | test=read.csv("test.csv") 13 | 14 | str(train) 15 | str(test) 16 | 17 | 18 | # removing some unwanted variables 19 | remove_var <- c("institute_country") 20 | train <- train[ , -which(names(train) %in% remove_var)] 21 | test <- test[ , -which(names(test) %in% remove_var)] 22 | test <- test[,-26] 23 | 24 | #run feature engineering file now then come back 25 | 26 | remove_var <- c("Var15","institute_city","institute_state","subject_area","secondary_area") 27 | train_1 <- train_1[ , -which(names(train_1) %in% remove_var)] 28 | test_1 <- test_1[ , -which(names(test_1) %in% remove_var)] 29 | remove(test) 30 | remove(train) 31 | 32 | # R part split check 33 | library(rpart) 34 | r_part=rpart(Project_Valuation~project_subject,data=train_1) 35 | summary(r_part) 36 | r_part 37 | 38 | # R part split with RF 39 | train_2_1 <- train_1[train_1$Similar_Project_Valuation_other_institute <549 & train_1$Project_Valuation<5750,] 40 | train_2_2 <- train_1[train_1$Similar_Project_Valuation_other_institute >=549 & train_1$Project_Valuation < 5750,] 41 | test_2_1 <- test_1[test_1$Similar_Project_Valuation_other_institute <549,] 42 | test_2_2 <- test_1[test_1$Similar_Project_Valuation_other_institute >=549,] 43 | 44 | rf_2_1 <- randomForest(Project_Valuation ~.,data=train_2_1[,-c(1)], ntree=100, norm.votes=FALSE,importance = TRUE,do.trace = 1,nodesize = 50) 45 | rf_2_2 <- randomForest(Project_Valuation ~.,data=train_2_2[,-c(1)], ntree=100, norm.votes=FALSE,importance = TRUE,do.trace = 1,nodesize = 50) 46 | save(rf_2_1,file='rf_2_1.RData') 47 | save(rf_2_2,file='rf3_2_1.RData') 48 | 49 | load('rf_2_1.RData') 50 | load('rf3_2_1.RData') 51 | test_2_1$Project_Valuation=0 52 | test_2_2$Project_Valuation=0 53 | 54 | pred_2_1= predict(rf_2_1,test_2_1) 55 | pred_2_2= predict(rf_2_2,test_2_2) 56 | test_2_1$Project_Valuation=pred_2_1 57 | test_2_2$Project_Valuation=pred_2_2 58 | submit_2_1<-data.frame(ID=test_2_1$ID,Project_Valuation_rf=test_2_1$Project_Valuation) 59 | submit_2_2<-data.frame(ID=test_2_2$ID,Project_Valuation_rf=test_2_2$Project_Valuation) 60 | submit_rf <- rbind(submit_2_1,submit_2_2) 61 | 62 | write.csv(submit_rf,file="submit_rf_rpart.csv",row.names=FALSE) 63 | 64 | -------------------------------------------------------------------------------- /AV-Hackathon-3/Feature Engineering.R: -------------------------------------------------------------------------------- 1 | # Feature Engineering AV Hackathon 3 2 | 3 | #a) Creating Dummy Variable of class factors VAR15 4 | #a.i) is_HAXXF 5 | 6 | is_HAXXF <- function(x) { 7 | if(x == "HAXXF") { 8 | y <- 1 9 | } else { 10 | y <- 0 11 | } 12 | return(y) 13 | } 14 | train_1 <- cbind(train,is_HAXXF = as.factor(mapply(is_HAXXF,train$Var15))) 15 | test_1 <- cbind(test,is_HAXXF = as.factor(mapply(is_HAXXF,test$Var15))) 16 | 17 | #a.ii) is_HAXXC 18 | 19 | is_HAXXC <- function(x) { 20 | if(x == "HAXXC") { 21 | y <- 1 22 | } else { 23 | y <- 0 24 | } 25 | return(y) 26 | } 27 | train_1 <- cbind(train_1,is_HAXXC = as.factor(mapply(is_HAXXC,train_1$Var15))) 28 | test_1 <- cbind(test_1,is_HAXXC = 
as.factor(mapply(is_HAXXC,test_1$Var15))) 29 | 30 | #a.iii) is_HATEM 31 | 32 | is_HATEM <- function(x) { 33 | if(x == "HATEM") { 34 | y <- 1 35 | } else { 36 | y <- 0 37 | } 38 | return(y) 39 | } 40 | train_1 <- cbind(train_1,is_HATEM = as.factor(mapply(is_HATEM,train_1$Var15))) 41 | test_1 <- cbind(test_1,is_HATEM = as.factor(mapply(is_HATEM,test_1$Var15))) 42 | 43 | #a.ii) is_HATFD 44 | 45 | is_HATFD <- function(x) { 46 | if(x == "HATFD") { 47 | y <- 1 48 | } else { 49 | y <- 0 50 | } 51 | return(y) 52 | } 53 | train_1 <- cbind(train_1,is_HATFD = as.factor(mapply(is_HATFD,train_1$Var15))) 54 | test_1 <- cbind(test_1,is_HATFD = as.factor(mapply(is_HATFD,test_1$Var15))) 55 | 56 | 57 | 58 | 59 | 60 | ############################################################################## 61 | #b) Creating Dummy Variable of class factors institute_state 62 | #b.i) is_CT 63 | 64 | is_CT <- function(x) { 65 | if(x == "CT") { 66 | y <- 1 67 | } else { 68 | y <- 0 69 | } 70 | return(y) 71 | } 72 | train_1 <- cbind(train_1,is_CT = as.factor(mapply(is_CT,train_1$institute_state))) 73 | test_1 <- cbind(test_1,is_CT = as.factor(mapply(is_CT,test_1$institute_state))) 74 | 75 | #b.ii) is_DC 76 | 77 | is_DC <- function(x) { 78 | if(x == "DC") { 79 | y <- 1 80 | } else { 81 | y <- 0 82 | } 83 | return(y) 84 | } 85 | train_1 <- cbind(train_1,is_DC = as.factor(mapply(is_DC,train_1$institute_state))) 86 | test_1 <- cbind(test_1,is_DC = as.factor(mapply(is_DC,test_1$institute_state))) 87 | 88 | #b.iii) is_DE 89 | 90 | is_DE <- function(x) { 91 | if(x == "DE") { 92 | y <- 1 93 | } else { 94 | y <- 0 95 | } 96 | return(y) 97 | } 98 | train_1 <- cbind(train_1,is_DE = as.factor(mapply(is_DE,train_1$institute_state))) 99 | test_1 <- cbind(test_1,is_DE = as.factor(mapply(is_DE,test_1$institute_state))) 100 | 101 | #b.iv) is_FL 102 | 103 | is_FL <- function(x) { 104 | if(x == "FL") { 105 | y <- 1 106 | } else { 107 | y <- 0 108 | } 109 | return(y) 110 | } 111 | train_1 <- cbind(train_1,is_FL = as.factor(mapply(is_FL,train_1$institute_state))) 112 | test_1 <- cbind(test_1,is_FL = as.factor(mapply(is_FL,test_1$institute_state))) 113 | 114 | #b.v) is_GA 115 | 116 | is_GA <- function(x) { 117 | if(x == "GA") { 118 | y <- 1 119 | } else { 120 | y <- 0 121 | } 122 | return(y) 123 | } 124 | train_1 <- cbind(train_1,is_GA = as.factor(mapply(is_GA,train_1$institute_state))) 125 | test_1 <- cbind(test_1,is_GA = as.factor(mapply(is_GA,test_1$institute_state))) 126 | 127 | #b.vi) is_KS 128 | 129 | is_KS <- function(x) { 130 | if(x == "KS") { 131 | y <- 1 132 | } else { 133 | y <- 0 134 | } 135 | return(y) 136 | } 137 | train_1 <- cbind(train_1,is_KS = as.factor(mapply(is_KS,train_1$institute_state))) 138 | test_1 <- cbind(test_1,is_KS = as.factor(mapply(is_KS,test_1$institute_state))) 139 | 140 | #b.vi) is_KY 141 | 142 | is_KY <- function(x) { 143 | if(x == "KY") { 144 | y <- 1 145 | } else { 146 | y <- 0 147 | } 148 | return(y) 149 | } 150 | train_1 <- cbind(train_1,is_KY = as.factor(mapply(is_KY,train_1$institute_state))) 151 | test_1 <- cbind(test_1,is_KY = as.factor(mapply(is_KY,test_1$institute_state))) 152 | 153 | #b.vii) is_MA 154 | 155 | is_MA <- function(x) { 156 | if(x == "MA") { 157 | y <- 1 158 | } else { 159 | y <- 0 160 | } 161 | return(y) 162 | } 163 | train_1 <- cbind(train_1,is_MA = as.factor(mapply(is_MA,train_1$institute_state))) 164 | test_1 <- cbind(test_1,is_MA = as.factor(mapply(is_MA,test_1$institute_state))) 165 | 166 | #b.viii) is_MD 167 | 168 | is_MD <- function(x) { 169 | if(x == "MD") { 170 | y <- 1 171 | } 
else { 172 | y <- 0 173 | } 174 | return(y) 175 | } 176 | train_1 <- cbind(train_1,is_MD = as.factor(mapply(is_MD,train_1$institute_state))) 177 | test_1 <- cbind(test_1,is_MD = as.factor(mapply(is_MD,test_1$institute_state))) 178 | 179 | #b.ix) is_ME 180 | 181 | is_ME <- function(x) { 182 | if(x == "ME") { 183 | y <- 1 184 | } else { 185 | y <- 0 186 | } 187 | return(y) 188 | } 189 | train_1 <- cbind(train_1,is_ME = as.factor(mapply(is_ME,train_1$institute_state))) 190 | test_1 <- cbind(test_1,is_ME = as.factor(mapply(is_ME,test_1$institute_state))) 191 | 192 | #b.x) is_MI 193 | 194 | is_MI <- function(x) { 195 | if(x == "MI") { 196 | y <- 1 197 | } else { 198 | y <- 0 199 | } 200 | return(y) 201 | } 202 | train_1 <- cbind(train_1,is_MI = as.factor(mapply(is_MI,train_1$institute_state))) 203 | test_1 <- cbind(test_1,is_MI = as.factor(mapply(is_MI,test_1$institute_state))) 204 | 205 | #b.xi) is_MN 206 | 207 | is_MN <- function(x) { 208 | if(x == "MN") { 209 | y <- 1 210 | } else { 211 | y <- 0 212 | } 213 | return(y) 214 | } 215 | train_1 <- cbind(train_1,is_MN = as.factor(mapply(is_MN,train_1$institute_state))) 216 | test_1 <- cbind(test_1,is_MN = as.factor(mapply(is_MN,test_1$institute_state))) 217 | 218 | #b.xii) is_MS 219 | 220 | is_MS <- function(x) { 221 | if(x == "MS") { 222 | y <- 1 223 | } else { 224 | y <- 0 225 | } 226 | return(y) 227 | } 228 | train_1 <- cbind(train_1,is_MS = as.factor(mapply(is_MS,train_1$institute_state))) 229 | test_1 <- cbind(test_1,is_MS = as.factor(mapply(is_MS,test_1$institute_state))) 230 | 231 | #b.xiii) is_NH 232 | 233 | is_NH <- function(x) { 234 | if(x == "NH") { 235 | y <- 1 236 | } else { 237 | y <- 0 238 | } 239 | return(y) 240 | } 241 | train_1 <- cbind(train_1,is_NH = as.factor(mapply(is_NH,train_1$institute_state))) 242 | test_1 <- cbind(test_1,is_NH = as.factor(mapply(is_NH,test_1$institute_state))) 243 | 244 | #b.xiv) is_NJ 245 | 246 | is_NJ <- function(x) { 247 | if(x == "NJ") { 248 | y <- 1 249 | } else { 250 | y <- 0 251 | } 252 | return(y) 253 | } 254 | train_1 <- cbind(train_1,is_NJ = as.factor(mapply(is_NJ,train_1$institute_state))) 255 | test_1 <- cbind(test_1,is_NJ = as.factor(mapply(is_NJ,test_1$institute_state))) 256 | 257 | #b.xiv) is_NY 258 | 259 | is_NY <- function(x) { 260 | if(x == "NY") { 261 | y <- 1 262 | } else { 263 | y <- 0 264 | } 265 | return(y) 266 | } 267 | train_1 <- cbind(train_1,is_NY = as.factor(mapply(is_NY,train_1$institute_state))) 268 | test_1 <- cbind(test_1,is_NY = as.factor(mapply(is_NY,test_1$institute_state))) 269 | 270 | #b.xv) is_OH 271 | 272 | is_OH <- function(x) { 273 | if(x == "OH") { 274 | y <- 1 275 | } else { 276 | y <- 0 277 | } 278 | return(y) 279 | } 280 | train_1 <- cbind(train_1,is_OH = as.factor(mapply(is_OH,train_1$institute_state))) 281 | test_1 <- cbind(test_1,is_OH = as.factor(mapply(is_OH,test_1$institute_state))) 282 | 283 | #b.xv) is_PA 284 | 285 | is_PA <- function(x) { 286 | if(x == "PA") { 287 | y <- 1 288 | } else { 289 | y <- 0 290 | } 291 | return(y) 292 | } 293 | train_1 <- cbind(train_1,is_PA = as.factor(mapply(is_PA,train_1$institute_state))) 294 | test_1 <- cbind(test_1,is_PA = as.factor(mapply(is_PA,test_1$institute_state))) 295 | 296 | #b.xvi) is_RI 297 | 298 | is_RI <- function(x) { 299 | if(x == "RI") { 300 | y <- 1 301 | } else { 302 | y <- 0 303 | } 304 | return(y) 305 | } 306 | train_1 <- cbind(train_1,is_RI = as.factor(mapply(is_RI,train_1$institute_state))) 307 | test_1 <- cbind(test_1,is_RI = as.factor(mapply(is_RI,test_1$institute_state))) 308 | 309 | #b.xvii) 
is_TN 310 | 311 | is_TN <- function(x) { 312 | if(x == "TN") { 313 | y <- 1 314 | } else { 315 | y <- 0 316 | } 317 | return(y) 318 | } 319 | train_1 <- cbind(train_1,is_TN = as.factor(mapply(is_TN,train_1$institute_state))) 320 | test_1 <- cbind(test_1,is_TN = as.factor(mapply(is_TN,test_1$institute_state))) 321 | 322 | #b.xviii) is_VA 323 | 324 | is_VA <- function(x) { 325 | if(x == "VA") { 326 | y <- 1 327 | } else { 328 | y <- 0 329 | } 330 | return(y) 331 | } 332 | train_1 <- cbind(train_1,is_VA = as.factor(mapply(is_VA,train_1$institute_state))) 333 | test_1 <- cbind(test_1,is_VA = as.factor(mapply(is_VA,test_1$institute_state))) 334 | 335 | #b.xix) is_VT 336 | 337 | is_VT <- function(x) { 338 | if(x == "VT") { 339 | y <- 1 340 | } else { 341 | y <- 0 342 | } 343 | return(y) 344 | } 345 | train_1 <- cbind(train_1,is_VT = as.factor(mapply(is_VT,train_1$institute_state))) 346 | test_1 <- cbind(test_1,is_VT = as.factor(mapply(is_VT,test_1$institute_state))) 347 | 348 | #b.xx) is_WV 349 | 350 | is_WV <- function(x) { 351 | if(x == "WV") { 352 | y <- 1 353 | } else { 354 | y <- 0 355 | } 356 | return(y) 357 | } 358 | train_1 <- cbind(train_1,is_WV = as.factor(mapply(is_WV,train_1$institute_state))) 359 | test_1 <- cbind(test_1,is_WV = as.factor(mapply(is_WV,test_1$institute_state))) 360 | 361 | 362 | 363 | ############################################################################## 364 | #c) Creating Dummy Variable of class factors var8 365 | #c.i) is_rural 366 | is_rural <- function(x) { 367 | if(x == "HXYJ" | x == "HXYK" | x == "HXYL") { 368 | y <- 1 369 | } else { 370 | y <- 0 371 | } 372 | return(y) 373 | } 374 | train_1 <- cbind(train_1,is_rural = as.factor(mapply(is_rural,train_1$Var8))) 375 | test_1 <- cbind(test_1,is_rural = as.factor(mapply(is_rural,test_1$Var8))) 376 | 377 | #c.ii) is_urban 378 | is_urban <- function(x) { 379 | if(x == "HXYB" | x == "HXYC" | x == "HXYD" | x == "HXYE") { 380 | y <- 1 381 | } else { 382 | y <- 0 383 | } 384 | return(y) 385 | } 386 | train_1 <- cbind(train_1,is_urban = as.factor(mapply(is_urban,train_1$Var8))) 387 | test_1 <- cbind(test_1,is_urban = as.factor(mapply(is_urban,test_1$Var8))) 388 | 389 | #c.iii) is_suburban 390 | is_suburban <- function(x) { 391 | if(x == "HXYG" | x == "HXYF" | x == "HXYH" | x == "HXYI") { 392 | y <- 1 393 | } else { 394 | y <- 0 395 | } 396 | return(y) 397 | } 398 | train_1 <- cbind(train_1,is_suburban = as.factor(mapply(is_suburban,train_1$Var8))) 399 | test_1 <- cbind(test_1,is_suburban = as.factor(mapply(is_suburban,test_1$Var8))) 400 | 401 | #c.iv) is_other 402 | is_other <- function(x) { 403 | if(x == "HXYM" | x == "HXYN" | x == "HXYO") { 404 | y <- 1 405 | } else { 406 | y <- 0 407 | } 408 | return(y) 409 | } 410 | train_1 <- cbind(train_1,is_other = as.factor(mapply(is_other,train_1$Var8))) 411 | test_1 <- cbind(test_1,is_other = as.factor(mapply(is_other,test_1$Var8))) 412 | 413 | ############################################################################## 414 | #f) Creating Dummy Variable of class factors subject_area 415 | #f.i) is_AL 416 | is_Applearn <- function(x) { 417 | if(x == "Applied Learning") { 418 | y <- 1 419 | } else { 420 | y <- 0 421 | } 422 | return(y) 423 | } 424 | train_1 <- cbind(train_1,is_Applearn = as.factor(mapply(is_Applearn,train_1$subject_area))) 425 | test_1 <- cbind(test_1,is_Applearn = as.factor(mapply(is_Applearn,test_1$subject_area))) 426 | 427 | #f.ii) is_Heaspo 428 | is_Heaspo <- function(x) { 429 | if(x == "Health & Sports") { 430 | y <- 1 431 | } else { 432 | y <- 0 
433 | } 434 | return(y) 435 | } 436 | train_1 <- cbind(train_1,is_Heaspo = as.factor(mapply(is_Heaspo,train_1$subject_area))) 437 | test_1 <- cbind(test_1,is_Heaspo = as.factor(mapply(is_Heaspo,test_1$subject_area))) 438 | 439 | #f.iii) is_HeaCiv 440 | is_HeaCiv <- function(x) { 441 | if(x == "History & Civics") { 442 | y <- 1 443 | } else { 444 | y <- 0 445 | } 446 | return(y) 447 | } 448 | train_1 <- cbind(train_1,is_HeaCiv = as.factor(mapply(is_HeaCiv,train_1$subject_area))) 449 | test_1 <- cbind(test_1,is_HeaCiv = as.factor(mapply(is_HeaCiv,test_1$subject_area))) 450 | 451 | #f.iv) is_LitLan 452 | is_LitLan <- function(x) { 453 | if(x == "Literacy & Language") { 454 | y <- 1 455 | } else { 456 | y <- 0 457 | } 458 | return(y) 459 | } 460 | train_1 <- cbind(train_1,is_LitLan = as.factor(mapply(is_LitLan,train_1$subject_area))) 461 | test_1 <- cbind(test_1,is_LitLan = as.factor(mapply(is_LitLan,test_1$subject_area))) 462 | 463 | #f.v) is_MathSci 464 | is_MathSci <- function(x) { 465 | if(x == "Math & Science") { 466 | y <- 1 467 | } else { 468 | y <- 0 469 | } 470 | return(y) 471 | } 472 | train_1 <- cbind(train_1,is_MathSci = as.factor(mapply(is_MathSci,train_1$subject_area))) 473 | test_1 <- cbind(test_1,is_MathSci = as.factor(mapply(is_MathSci,test_1$subject_area))) 474 | 475 | #f.vi) is_MuArt 476 | is_MuArt <- function(x) { 477 | if(x == "Music & The Arts") { 478 | y <- 1 479 | } else { 480 | y <- 0 481 | } 482 | return(y) 483 | } 484 | train_1 <- cbind(train_1,is_MuArt = as.factor(mapply(is_MuArt,train_1$subject_area))) 485 | test_1 <- cbind(test_1,is_MuArt = as.factor(mapply(is_MuArt,test_1$subject_area))) 486 | 487 | #f.vii) is_SpeNee 488 | is_SpeNee <- function(x) { 489 | if(x == "Special Needs") { 490 | y <- 1 491 | } else { 492 | y <- 0 493 | } 494 | return(y) 495 | } 496 | train_1 <- cbind(train_1,is_SpeNee = as.factor(mapply(is_SpeNee,train_1$subject_area))) 497 | test_1 <- cbind(test_1,is_SpeNee = as.factor(mapply(is_SpeNee,test_1$subject_area))) 498 | 499 | ############################################################################## 500 | #g) Creating Dummy Variable of class factors secondary_subject 501 | #g.i) is_AL 502 | is_sApplearn <- function(x) { 503 | if(x == "Applied Learning") { 504 | y <- 1 505 | } else { 506 | y <- 0 507 | } 508 | return(y) 509 | } 510 | train_1 <- cbind(train_1,is_sApplearn = as.factor(mapply(is_sApplearn,train_1$secondary_area))) 511 | test_1 <- cbind(test_1,is_sApplearn = as.factor(mapply(is_sApplearn,test_1$secondary_area))) 512 | 513 | #g.ii) is_Heaspo 514 | is_sHeaspo <- function(x) { 515 | if(x == "Health & Sports") { 516 | y <- 1 517 | } else { 518 | y <- 0 519 | } 520 | return(y) 521 | } 522 | train_1 <- cbind(train_1,is_sHeaspo = as.factor(mapply(is_sHeaspo,train_1$secondary_area))) 523 | test_1 <- cbind(test_1,is_sHeaspo = as.factor(mapply(is_sHeaspo,test_1$secondary_area))) 524 | 525 | #g.iii) is_HeaCiv 526 | is_sHeaCiv <- function(x) { 527 | if(x == "History & Civics") { 528 | y <- 1 529 | } else { 530 | y <- 0 531 | } 532 | return(y) 533 | } 534 | train_1 <- cbind(train_1,is_sHeaCiv = as.factor(mapply(is_sHeaCiv,train_1$secondary_area))) 535 | test_1 <- cbind(test_1,is_sHeaCiv = as.factor(mapply(is_sHeaCiv,test_1$secondary_area))) 536 | 537 | #g.iv) is_sLitLan 538 | is_sLitLan <- function(x) { 539 | if(x == "Literacy & Language") { 540 | y <- 1 541 | } else { 542 | y <- 0 543 | } 544 | return(y) 545 | } 546 | train_1 <- cbind(train_1,is_sLitLan = as.factor(mapply(is_sLitLan,train_1$secondary_area))) 547 | test_1 <- 
cbind(test_1,is_sLitLan = as.factor(mapply(is_sLitLan,test_1$secondary_area))) 548 | 549 | #g.v) is_sMathSci 550 | is_sMathSci <- function(x) { 551 | if(x == "Math & Science") { 552 | y <- 1 553 | } else { 554 | y <- 0 555 | } 556 | return(y) 557 | } 558 | train_1 <- cbind(train_1,is_sMathSci = as.factor(mapply(is_sMathSci,train_1$secondary_area))) 559 | test_1 <- cbind(test_1,is_sMathSci = as.factor(mapply(is_sMathSci,test_1$secondary_area))) 560 | 561 | #f.vi) is_sMuArt 562 | is_sMuArt <- function(x) { 563 | if(x == "Music & The Arts") { 564 | y <- 1 565 | } else { 566 | y <- 0 567 | } 568 | return(y) 569 | } 570 | train_1 <- cbind(train_1,is_sMuArt = as.factor(mapply(is_sMuArt,train_1$secondary_area))) 571 | test_1 <- cbind(test_1,is_sMuArt = as.factor(mapply(is_sMuArt,test_1$secondary_area))) 572 | 573 | #f.vii) is_SpeNee 574 | is_sSpeNee <- function(x) { 575 | if(x == "Special Needs") { 576 | y <- 1 577 | } else { 578 | y <- 0 579 | } 580 | return(y) 581 | } 582 | train_1 <- cbind(train_1,is_sSpeNee = as.factor(mapply(is_sSpeNee,train_1$secondary_area))) 583 | test_1 <- cbind(test_1,is_sSpeNee = as.factor(mapply(is_sSpeNee,test_1$secondary_area))) 584 | 585 | #f.viii) is_SNull 586 | is_SNull <- function(x) { 587 | if(x == "") { 588 | y <- 1 589 | } else { 590 | y <- 0 591 | } 592 | return(y) 593 | } 594 | train_1 <- cbind(train_1,is_SNull = as.factor(mapply(is_SNull,train_1$secondary_area))) 595 | test_1 <- cbind(test_1,is_SNull = as.factor(mapply(is_SNull,test_1$secondary_area))) 596 | 597 | 598 | 599 | ############################################################################## 600 | # Level matching 601 | levels(test_1$Var4) <- levels(train_1$Var4) 602 | levels(test_1$Var10) <- levels(train_1$Var10) 603 | levels(test_1$Var8) <- levels(train_1$Var8) 604 | levels(test_1$Var11) <- levels(train_1$Var11) 605 | levels(test_1$Var12) <- levels(train_1$Var12) 606 | levels(test_1$Var13) <- levels(train_1$Var13) 607 | levels(test_1$Var14) <- levels(train_1$Var14) 608 | levels(test_1$Instructor_Past_Performance) <- levels(train_1$Instructor_Past_Performance) 609 | levels(test_1$Instructor_Association_Industry_Expert) <- levels(train_1$Instructor_Association_Industry_Expert) 610 | levels(test_1$project_subject) <- levels(train_1$project_subject) 611 | levels(test_1$subject_area) <- levels(train_1$subject_area) 612 | levels(test_1$secondary_subject) <- levels(train_1$secondary_subject) 613 | levels(test_1$secondary_area) <- levels(train_1$secondary_area) 614 | levels(test_1$Resource_Category) <- levels(train_1$Resource_Category) 615 | levels(test_1$Resource_Sub_Category) <- levels(train_1$Resource_Sub_Category) 616 | levels(test_1$Var23) <- levels(train_1$Var23) 617 | levels(test_1$Var24) <- levels(train_1$Var24) 618 | levels(test_1$is_NH) <- levels(train_1$is_NH) 619 | levels(test_1$is_rural) <- levels(train_1$is_rural) 620 | levels(test_1$is_urban) <- levels(train_1$is_urban) 621 | levels(test_1$is_suburban) <- levels(train_1$is_suburban) 622 | levels(test_1$is_other) <- levels(train_1$is_other) 623 | -------------------------------------------------------------------------------- /AV-Hackathon-3/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong ! 2 | 3 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838 4 | 5 | ###### My approach for the hackathon is as follows: 6 | 7 | 1. 
I looked into levels of data and created a data dictionary by mentioning the level gaps, as I figured out that there is difference in level of data in training and testing data set (Like some cities are only in training dataset but are missing from testing and vice versa) 8 | 9 | 2. Ran a simple linear model to see if some of the greater number of level categories are impacting the funding and found that state column have some impact on the valuation 10 | 11 | 3. Converted some of the categorical variables into 1/0 encoded variables 12 | 13 | 4. Ran R part over Similar project valuation to see it's impact on subsequent funding and found that there is significant shift in mean values with Similar project valuation >$549 and <$549 14 | 15 | 5. Made two Random forest models with Similar project valuation >$549 and <$549, simply merged there result for the final output 16 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/FactorVariables.R: -------------------------------------------------------------------------------- 1 | ######### 2 | ### All the factor variables(with >2 levels ) are finetuned using the following steps: 3 | ### 1. Select a factor variable and create an XGB model and rank the important features(levels) 4 | ### 2. keep the most important levelsand merge the rest in to a single level "Others" 5 | ### 3. Include a subset of this levels into the final model depending on their effect. 6 | ######### 7 | 8 | 9 | ########################################## 10 | ######## BANKS / SALARY ACCOUNT ################### 11 | ########################################## 12 | banks = as.data.frame(model.matrix(~0 + Salary_Account, loans)) 13 | bankstest = as.data.frame(model.matrix(~0 + Salary_Account, loanstest)) 14 | 15 | xgb2 <- xgboost(data = as.matrix(banks), 16 | label = dispnum, 17 | nrounds = 1930, max_depth = 4 ,eta = 0.01, 18 | objective = "binary:logistic", verbose=1) 19 | m = xgb.importance(feature_names = colnames(banks),model = xgb2) 20 | xgb.plot.importance(m) 21 | 22 | banksorder = m$Feature 23 | banksordertest = intersect(banksorder,colnames(bankstest)) 24 | rembanks = setdiff(colnames(banks) , banksorder) 25 | rembankstest = setdiff(colnames(bankstest) , banksordertest) 26 | uselessbanks = banks[rembanks] 27 | uselessbankstest = bankstest[rembankstest] 28 | banks = banks[banksorder] 29 | bankstest = bankstest[banksordertest] 30 | 31 | banks$Otherbanks = rowSums(cbind(banks[,12:ncol(banks)],uselessbanks)) 32 | bankstest$Otherbanks = rowSums(cbind(bankstest[,12:ncol(bankstest)],uselessbankstest)) 33 | banks = banks[,-(12:(ncol(banks)-1))] 34 | bankstest = bankstest[,-(12:(ncol(bankstest)-1))] 35 | write.csv(banks, "banks.csv", row.names=FALSE) 36 | write.csv(bankstest, "bankstest.csv", row.names=FALSE) 37 | rm(uselessbanks,uselessbankstest) 38 | 39 | ########################################## 40 | ######## DATE OF LEAD ################### 41 | ########################################## 42 | 43 | ###### EXTRACTING ONLY MONTHS AND ADDING TO TRAIN AND TEST DATA SET##### 44 | date = as.character(loans$Lead_Creation_Date) 45 | doj = strptime(date, format = "%d-%b-%Y") 46 | months = as.factor(format(doj,'%b')) 47 | days = as.numeric((doj - min(doj))/86400) 48 | ### 49 | datetest = as.character(loanstest$Lead_Creation_Date) 50 | dojtest = strptime(datetest, format = "%d-%b-%Y") 51 | monthstest = as.factor(format(dojtest,'%b')) 52 | daystest = as.numeric((dojtest - min(dojtest))/86400) 53 | 54 | Train = 
cbind(Train,month = months) 55 | Test = cbind(Test,month = monthstest) 56 | 57 | Train = cbind(Train,model.matrix(~0 + month, Train)) 58 | Test = cbind(Test,model.matrix(~0 + month, Test)) 59 | 60 | Train$month = NULL 61 | Test$month = NULL 62 | ############################# 63 | 64 | dates = as.data.frame(model.matrix(~0 + Lead_Creation_Date, loans)) 65 | datestest = as.data.frame(model.matrix(~0 + Lead_Creation_Date, loanstest)) 66 | 67 | xgb2 <- xgboost(data = as.matrix(dates), 68 | label = dispnum, 69 | nrounds = 727, max_depth = 4 ,eta = 0.01, 70 | objective = "binary:logistic", verbose=1) 71 | m = xgb.importance(feature_names = colnames(dates),model = xgb2) 72 | xgb.plot.importance(m) 73 | 74 | datesorder = m$Feature 75 | remdates = setdiff(colnames(dates) , datesorder) 76 | uselessdates = dates[remdates] 77 | uselessdatestest = datestest[remdates] 78 | dates = dates[datesorder] 79 | datestest = datestest[datesorder] 80 | 81 | dates$Otherdates = rowSums(cbind(dates[,28:35],uselessdates)) 82 | datestest$Otherdates = rowSums(cbind(datestest[,28:35],uselessdatestest)) 83 | dates = dates[,-(28:(ncol(dates)-1))] 84 | datestest = datestest[,-(28:(ncol(datestest)-1))] 85 | write.csv(dates, "dates.csv", row.names=FALSE) 86 | write.csv(datestest, "datestest.csv", row.names=FALSE) 87 | rm(uselessdates,uselessdatestest) 88 | 89 | ########################################## 90 | ######## CITY ################### 91 | ########################################## 92 | 93 | city = as.data.frame(model.matrix(~0 + City, loans)) 94 | citytest = as.data.frame(model.matrix(~0 + City, loanstest)) 95 | 96 | xgb2 <- xgboost(data = as.matrix(city), 97 | label = dispnum, 98 | nrounds = 588, max_depth = 4 ,eta = 0.01, 99 | objective = "binary:logistic", verbose=1) 100 | m = xgb.importance(feature_names = colnames(city),model = xgb2) 101 | xgb.plot.importance(m) 102 | 103 | cityorder = m$Feature 104 | cityordertest = intersect(cityorder,colnames(citytest)) 105 | remcity = setdiff(colnames(city) , cityorder) 106 | remcitytest = setdiff(colnames(citytest) , cityordertest) 107 | uselesscity = city[remcity] 108 | uselesscitytest = citytest[remcitytest] 109 | city = city[cityorder] 110 | citytest = citytest[cityordertest] 111 | 112 | city$Othercity = rowSums(cbind(city[,12:ncol(city)],uselesscity)) 113 | citytest$Othercity = rowSums(cbind(citytest[,12:ncol(citytest)],uselesscitytest)) 114 | city = city[,-(12:(ncol(city)-1))] 115 | citytest = citytest[,-(12:(ncol(citytest)-1))] 116 | write.csv(city, "city.csv", row.names=FALSE) 117 | write.csv(citytest, "citytest.csv", row.names=FALSE) 118 | rm(uselesscity,uselesscitytest) 119 | 120 | ########################################## 121 | ######## VAR1 ################### 122 | ########################################## 123 | var1 = as.data.frame(model.matrix(~0 + Var1, loans)) 124 | var1test = as.data.frame(model.matrix(~0 + Var1, loanstest)) 125 | 126 | xgb2 <- xgboost(data = as.matrix(var1), 127 | label = dispnum, 128 | nrounds = 740, max_depth = 4 ,eta = 0.01, 129 | objective = "binary:logistic", verbose=1) 130 | m = xgb.importance(feature_names = colnames(var1),model = xgb2) 131 | xgb.plot.importance(m) 132 | 133 | 134 | var1order = m$Feature 135 | remvar1 = setdiff(colnames(var1) , var1order) 136 | uselessvar1 = var1[remvar1] 137 | uselessvar1test = var1test[remvar1] 138 | var1 = var1[var1order] 139 | var1test = var1test[var1order] 140 | var1$Othervar1 = rowSums(cbind(var1[,7:16],uselessvar1)) 141 | var1test$Othervar1 = rowSums(cbind(var1test[,7:16],uselessvar1test)) 
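# Same three-step recipe as for Salary_Account and City above: one-hot encode the
# factor, rank the indicator columns with xgb.importance, keep only the most
# predictive levels (the top 6 of Var1 here) and collapse the remainder into a
# single "Othervar1" column before the redundant indicators are dropped below.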
142 | var1 = var1[,-(7:(ncol(var1)-1))] 143 | var1test = var1test[,-(7:(ncol(var1test)-1))] 144 | write.csv(var1, "var1.csv", row.names=FALSE) 145 | write.csv(var1test, "var1test.csv", row.names=FALSE) 146 | 147 | rm(uselessvar1,uselessvar1test) 148 | 149 | ########################################## 150 | ######## VAR2 ################### 151 | ########################################## 152 | 153 | var2 = as.data.frame(model.matrix(~0 + Var2, loans)) 154 | var2test = as.data.frame(model.matrix(~0 + Var2, loanstest)) 155 | 156 | ''' 157 | xgb2 <- xgboost(data = as.matrix(var2), 158 | label = dispnum, 159 | nrounds = 530, max_depth = 2 ,eta = 0.01, 160 | objective = "binary:logistic", verbose=1) 161 | m = xgb.importance(feature_names = colnames(var2),model = xgb2) 162 | xgb.plot.importance(m) 163 | 164 | var2order = m$Feature 165 | var2 = var2[var2order] 166 | var2test = var2test[var2order] 167 | 168 | var2$Var2AD = (var2$Var2A + var2$Var2D) 169 | var2test$Var2AD = (var2test$Var2A + var2test$Var2D) 170 | var2$Var2EF = (var2$Var2E + var2$Var2F) 171 | var2test$Var2EF = (var2test$Var2E + var2test$Var2F) 172 | var2 = var2[,-c(1,4:6)] 173 | var2test = var2test[,-c(1,4:6)] 174 | ''' 175 | ########################################## 176 | ######## VAR4 ################### 177 | ########################################## 178 | var4 = as.data.frame(model.matrix(~0 + Var4, loans)) 179 | var4test = as.data.frame(model.matrix(~0 + Var4, loanstest)) 180 | 181 | 182 | ########################################## 183 | ######## Source ################### 184 | ########################################## 185 | 186 | Source = as.data.frame(model.matrix(~0 + Source, loans)) 187 | Sourcetest = as.data.frame(model.matrix(~0 + Source, loanstest)) 188 | 189 | xgb2 <- xgboost(data = as.matrix(Source), 190 | label = dispnum, 191 | nrounds = 1230, max_depth = 4 ,eta = 0.01, 192 | objective = "binary:logistic", verbose=1) 193 | m = xgb.importance(feature_names = colnames(Source),model = xgb2) 194 | xgb.plot.importance(m) 195 | 196 | Sourceorder = m$Feature 197 | Sourceorder = Sourceorder 198 | Sourceordertest = intersect(Sourceorder,colnames(Sourcetest)) 199 | remSource = setdiff(colnames(Source) , Sourceorder) 200 | remSourcetest = setdiff(colnames(Sourcetest) , Sourceordertest) 201 | uselessSource = Source[remSource] 202 | uselessSourcetest = Sourcetest[remSourcetest] 203 | Source = Source[Sourceorder] 204 | Sourcetest = Sourcetest[Sourceordertest] 205 | 206 | Source$OtherSource = rowSums(cbind(Source[,13],uselessSource)) 207 | Sourcetest$OtherSource = rowSums(uselessSourcetest) 208 | Source = Source[,-13] 209 | 210 | rm(uselessSource,uselessSourcetest) 211 | write.csv(Source, "Source.csv", row.names=FALSE) 212 | write.csv(Sourcetest, "Sourcetest.csv", row.names=FALSE) 213 | 214 | ########################################## 215 | ######## EMPLOYERS ################### 216 | ########################################## 217 | ###################### 218 | 219 | t = as.data.frame(table(loans$Employer_Name)) 220 | empnames = as.character(tail(t[order(t$Freq),1],26)) ## selecting only 26 employers with max freq 221 | emp = as.character(loans$Employer_Name) 222 | emptest = as.character(loanstest$Employer_Name) 223 | 224 | emp[1:87020] = lapply(1:87020, function(x) ifelse(emp[x] %in% empnames, emp[x],"OtherEmployer" )) 225 | 226 | emptest[1:nrow(loanstest)] = lapply(1:nrow(loanstest), function(x) ifelse(emptest[x] %in% empnames, emptest[x],"OtherEmployer" )) 227 | 228 | emp = as.factor(c(do.call("cbind",emp))) 229 | emptest = 
as.factor(c(do.call("cbind",emptest))) 230 | 231 | Employer = as.data.frame(model.matrix(~0 + emp, loans)) 232 | Employertest = as.data.frame(model.matrix(~0 + emp, loanstest)) 233 | 234 | xgb2 <- xgboost(data = as.matrix(Employer), 235 | label = dispnum, 236 | nrounds = 920, max_depth = 4 ,eta = 0.01, 237 | objective = "binary:logistic", verbose=1) 238 | 239 | m = xgb.importance(feature_names = colnames(Employer),model = xgb2) 240 | xgb.plot.importance(m) 241 | 242 | Employerorder = m$Feature 243 | remEmployer = setdiff(colnames(Employer) , Employerorder) 244 | uselessEmployer = Employer[remEmployer] 245 | uselessEmployertest = Employertest[remEmployer] 246 | Employer = Employer[Employerorder] 247 | Employertest = Employertest[Employerorder] 248 | 249 | Employer$OtherEmployer2 = rowSums(cbind(Employer[,18:(ncol(Employer)-1)],uselessEmployer)) 250 | Employertest$OtherEmployer2 = rowSums(cbind(Employertest[,18:(ncol(Employer)-1)],uselessEmployertest)) 251 | Employer = Employer[,-(18:(ncol(Employer)-1))] 252 | Employertest = Employertest[,-(18:(ncol(Employertest)-1))] 253 | write.csv(Employer, "Employer.csv", row.names=FALSE) 254 | write.csv(Employertest, "Employertest.csv", row.names=FALSE) 255 | ################### 256 | ##################### 257 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/Final_Model.R: -------------------------------------------------------------------------------- 1 | ########################## 2 | ######## After including different features and CV finally obtained 3 | ####### the following features as important 4 | ######################### 5 | k = c(3,6:8,11,12,14:19,23:29) 6 | 7 | train = cbind(Train[,k],banks,city,var1,var4,Source,dates[,1:10],Employer[,1:3],var2[,4]) 8 | test = cbind(Test[,k],bankstest,citytest,var1test,var4test,Sourcetest,datestest[,1:10],Employertest[,1:3],var2test[,4]) 9 | 10 | ####################### 11 | ###### CV ####### 12 | ######################## 13 | 14 | nrounds = 5000 15 | nfolds = 4 16 | ps = list( max_depth = 5 ,eta = 0.01,objective = "binary:logistic") 17 | ms = list( 'auc','rmse') 18 | 19 | cvXgb = xgb.cv(params = ps, data = as.matrix(train) , 20 | label = dispnum, nrounds = nrounds ,nfold = nfolds,showsd = T,metrics = ms,stratified = T, verbose = T,subsample = 0.7 21 | ) 22 | 23 | ####################### 24 | ###### FINAL MODEL ####### 25 | ######################## 26 | xgb2 <- xgboost(data = as.matrix(train), 27 | label = dispnum, 28 | nrounds = 1426, max_depth = 5 ,eta = 0.01, 29 | objective = "binary:logistic", verbose=1,subsample = 0.7) 30 | 31 | 32 | #n = xgb.importance(feature_names = colnames(banks),model = xgb2,data = dates,label = #dispnum) 33 | pred <- predict(xgb2, as.matrix(test,type = 'response')) 34 | 35 | pp2 = data.frame(ID = Test[,1] ,Disbursed = pred) 36 | write.csv(pp2, "samplesub.csv", row.names=FALSE) 37 | 38 | ################ 39 | ################ 40 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/LoadData.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(gplots) 3 | library(caTools) 4 | library(lattice) 5 | library(caret) 6 | library(foreach) 7 | library(Matrix) 8 | library(pROC) 9 | library(ROCR) 10 | library(Rcpp) 11 | library(mice) 12 | library(xgboost) 13 | library(survival) 14 | library(gbm) 15 | library(randomForest) 16 | 17 | ###################### 18 | 
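# LoadData.R: reads Train.csv/Test.csv, converts selected categorical columns to
# factors and the Y/N flags to 0/1, imputes the remaining NAs in two mice()
# passes (loan amount/tenure first, then interest rate, processing fee and EMI),
# and derives an age variable from DOB; the one-hot features used by
# FactorVariables.R and Final_Model.R are built on top of this cleaned data.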
########## Load data 19 | loans = read.csv('Train.csv',stringsAsFactors = F) 20 | loanstest = read.csv('Test.csv',stringsAsFactors = F) 21 | 22 | Train = loans 23 | Train[,c(3,10,11,14,22:26)] = lapply(c(3,10,11,14,22:26), function(x) as.factor(Train[,x])) 24 | 25 | Train$Mobile_Verified = ifelse (Train$Mobile_Verified == "Y",1,0) 26 | Train$Filled_Form = ifelse (Train$Filled_Form == "Y",1,0) 27 | Train$IsMobile = ifelse (Train$Device_Type == "Mobile",1,0) 28 | Train$IsMale = ifelse (Train$Gender == "Male",1,0) 29 | 30 | Train$Gender = NULL 31 | Train$Device_Type = NULL 32 | ########################### 33 | ###### Handling NA Values###################### 34 | ############################ 35 | #### keeping track of Rows with NA 36 | Train$issubmitNA = ifelse (is.na(Train$Loan_Amount_Submitted),1,0) 37 | table(Train$issubmitNA) 38 | 39 | ###### Filling very few NA values in Loan_Amount_Applied && Loan_Tenure_Applied 40 | ###### using mean(remaing data) or Loan_Amount_submitted && Loan_Tenure_Submitted (If available) 41 | Train$Loan_Amount_Applied = ifelse(is.na(Train$Loan_Amount_Applied),ifelse(is.na(Train$Loan_Amount_Submitted),230300,Train$Loan_Amount_Submitted),Train$Loan_Amount_Applied ) 42 | 43 | Train$Loan_Tenure_Applied = ifelse(is.na(Train$Loan_Tenure_Applied),ifelse(is.na(Train$Loan_Tenure_Submitted),2,Train$Loan_Tenure_Submitted),Train$Loan_Tenure_Applied ) 44 | 45 | ####### Multiple Imputation 46 | ####### First imputing Loan_Amount_submitted && Loan_Tenure_Submitted using Loan_Amount_Applied ####### && Loan_Tenure_Applied 47 | 48 | temp = Train[,c(6,7,14,15)] 49 | 50 | set.seed(123) 51 | imputed = complete(mice(temp)) 52 | 53 | Train$Loan_Amount_Submitted = imputed$Loan_Amount_Submitted 54 | Train$Loan_Tenure_Submitted = imputed$Loan_Tenure_Submitted 55 | Train$Existing_EMI = ifelse(is.na(Train$Existing_EMI),0,Train$Existing_EMI) 56 | 57 | 58 | ####### Imputation of Int.Rate, Proc.fee and EMI_Loan_Submitted using already imputed 59 | ####### Loan_Amount_submitted && Loan_Tenure_Submitted 60 | 61 | temp = Train[,c(14,15,16,17,18)] 62 | 63 | set.seed(123) 64 | 65 | imputed = complete(mice(temp)) ### This will takes several minutes 66 | 67 | Train$Interest_Rate = imputed$Interest_Rate 68 | Train$Processing_Fee = imputed$Processing_Fee 69 | Train$EMI_Loan_Submitted = imputed$EMI_Loan_Submitted 70 | 71 | ###### OUTCOME variables 72 | disb = Train$Disbursed 73 | dispnum = as.numeric(as.character(disb)) 74 | Train$Disbursed = NULL 75 | Lin = Train$LoggedIn 76 | Train$LoggedIn = NULL 77 | 78 | ##################################### age variable 79 | dob = strptime(Train$DOB, format = "%d-%b-%Y") 80 | year = format(dob,"%Y") 81 | year = as.numeric(year) 82 | Train$age = 115 - year 83 | ## assuming people with year 0015 have wrongly mentioned their yob as 2015 84 | ## assigning avg value of 30 to 17 such cases in the data 85 | Train$age = ifelse(Train$age == 100, 30, Train$age) 86 | ############################# 87 | 88 | ############# 89 | ############# similarly cleaning test data 90 | ############ 91 | Test = loanstest 92 | Test[,c(3,10,11,14,22:24)] = lapply(c(3,10,11,14,22:24), function(x) as.factor(Test[,x])) 93 | 94 | Test$Mobile_Verified = ifelse (Test$Mobile_Verified == "Y",1,0) 95 | Test$Filled_Form = ifelse (Test$Filled_Form == "Y",1,0) 96 | Test$IsMobile = ifelse (Test$Device_Type == "Mobile",1,0) 97 | Test$IsMale = ifelse (Test$Gender == "Male",1,0) 98 | 99 | Test$Gender = NULL 100 | Test$Device_Type = NULL 101 | 102 | ########## Dealing with NA 103 | ####### keeping track of 
NA rows 104 | Test$issubmitNA = ifelse (is.na(Test$Loan_Amount_Submitted),1,0) 105 | table(Test$issubmitNA) 106 | 107 | ##### Imputation 108 | temp = Test[,c(6,7,14,15)] 109 | set.seed(123) 110 | imputed = complete(mice(temp)) 111 | Test$Loan_Amount_Submitted = imputed$Loan_Amount_Submitted 112 | Test$Loan_Tenure_Submitted = imputed$Loan_Tenure_Submitted 113 | Test$Loan_Amount_Applied = imputed$Loan_Amount_Applied 114 | Test$Loan_Tenure_Applied = imputed$Loan_Tenure_Applied 115 | Test$Existing_EMI = ifelse(is.na(Test$Existing_EMI),0,Test$Existing_EMI) 116 | ##write.csv(Test,"Testpartialimp.csv",row.names = F) 117 | ###### second imputation 118 | temp = Test[,c(14,15,16,17,18)] 119 | set.seed(123) 120 | imputed = complete(mice(temp)) 121 | Test$Interest_Rate = imputed$Interest_Rate 122 | Test$Processing_Fee = imputed$Processing_Fee 123 | Test$EMI_Loan_Submitted = imputed$EMI_Loan_Submitted 124 | ##write.csv(Test,"TestFullimp.csv",row.names = F) 125 | ################## 126 | ################ age variable 127 | dob = strptime(Test$DOB, format = "%d-%b-%Y") 128 | year = format(dob,"%Y") 129 | year = as.numeric(year) 130 | Test$age = 115 - year 131 | Test$age = ifelse(Test$age == 100, 30, Test$age) 132 | ############################# 133 | 134 | -------------------------------------------------------------------------------- /Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank-/README.md: -------------------------------------------------------------------------------- 1 | # Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- 2 | # Analytics-Vidhya-Hackathon-Customer-worth-to-a-bank- 3 | The code corresponds to a single XGB model which gave me a public score of ~0.866 and a private score of ~0.84 4 | for the problem: 5 | http://discuss.analyticsvidhya.com/t/hackathon-3-x-predict-customer-worth-for-happy-customer-bank/3802 6 | 7 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/README.md: -------------------------------------------------------------------------------- 1 | # Analytics_Vidhya_3.X_Hackathon 2 | Codes for the Analytics Vidhya Hackathon 3.X 3 | 4 | Both the versions (weekend and weeklong) have shell scripts. Just run them and the solution is generated. 5 | 6 | - The weekend codes have a public LB score: 0.8612 and private LB score: 0.8413. Runtimes ~6 min (4-core machine) - 2nd place 7 | - The weekday codes have a public LB score: 0.8620 and private LB score: 0.8410. 
Runtimes ~ 32 min (4-core machine) - 1st place 8 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_1_preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | train = pd.read_csv("Train.csv") 5 | test = pd.read_csv("Test.csv") 6 | submission = pd.read_csv("sample_submission.csv") 7 | print "Train dataset dimensions:", train.shape 8 | print "Test dataset dimensions:", test.shape 9 | 10 | salary_acc = train.Salary_Account.value_counts(dropna=False) 11 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 12 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 13 | 14 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 15 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 16 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 17 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 18 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 19 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 20 | 21 | train['Lifetime'] = pd.to_datetime(train['Lead_Creation_Date']) - pd.to_datetime(train['DOB']) 22 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 23 | 24 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 25 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 26 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 27 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 28 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 29 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 30 | 31 | city = pd.DataFrame(train['City'].value_counts()) 32 | city_rare = list(city[city[0] < 100].index) 33 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 34 | train['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 35 | train['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 36 | 37 | from sklearn.preprocessing import LabelEncoder 38 | le = LabelEncoder() 39 | train['City_encoded'] = le.fit_transform(train['City']) 40 | 41 | empnames = pd.DataFrame(train['Employer_Name'].value_counts()) 42 | empnames_rare = list(empnames[empnames[0]<30].index) 43 | train.ix[train['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 44 | 45 | # # Preprocessing 46 | train2 = train.copy() 47 | 48 | id_train = train['ID'] 49 | label = train2['Disbursed'] 50 | 51 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB', 'Lead_Creation_Date'] 52 | train2.drop(dropCols, axis=1, inplace = True) 53 | 54 | y_train = label 55 | X_train = pd.get_dummies(train2) 56 | 57 | # # Test set preparation 58 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 59 | 60 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 61 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 62 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 63 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 64 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 65 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 66 | 67 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 68 | test['dob_dayofweek'] = 
pd.to_datetime(test['DOB']).dt.dayofweek 69 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 70 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 71 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 72 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 73 | 74 | test['Lifetime'] = pd.to_datetime(test['Lead_Creation_Date']) - pd.to_datetime(test['DOB']) 75 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 76 | 77 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 78 | newcities = list(set(test['City']) - set(train['City'])) 79 | test.ix[test['City'].isin(newcities), 'City'] = np.nan 80 | test['City_encoded'] = le.transform(test['City']) 81 | 82 | test['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 83 | test['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 84 | 85 | test.ix[test['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 86 | 87 | newempnames = list(set(test['Employer_Name']) - set(train['Employer_Name'])) 88 | test.ix[test['Employer_Name'].isin(newempnames), "Employer_Name"] = "Others" 89 | 90 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 91 | test2 = test.drop(testdropcols, axis=1) 92 | 93 | X_test = pd.get_dummies(test2) 94 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 95 | for col in missingCols: 96 | X_test[col] = 0 97 | X_test = X_test[X_train.columns] 98 | assert X_train.columns.equals(X_test.columns) 99 | 100 | X_train.to_csv("train_preprocessed.csv", index = False) 101 | X_test.to_csv("test_preprocessed.csv", index = False) 102 | y_train.to_csv("train_labels.csv", index = False) 103 | test['ID'].to_csv("test_ids.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_2_train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | 6 | train = pd.read_csv("train_preprocessed.csv") 7 | test = pd.read_csv("test_preprocessed.csv") 8 | labels = pd.read_csv("train_labels.csv", header = None) 9 | test_ids = pd.read_csv("test_ids.csv", header = None) 10 | 11 | labels = list(labels.iloc[:,0]) 12 | test_ids = list(test_ids.iloc[:,0]) 13 | 14 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'logloss', 15 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 16 | 'min_child_weight':3, 'max_delta_step':3} 17 | num_rounds = 400 18 | 19 | params['seed'] = 523264626346 # 0.85533 20 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 21 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 22 | # exit() 23 | # [395] cv-test-logloss:0.062599+0.001852 cv-train-logloss:0.042591+0.001435 24 | # [396] cv-test-logloss:0.062594+0.001854 cv-train-logloss:0.042548+0.001437 25 | # [397] cv-test-logloss:0.062595+0.001854 cv-train-logloss:0.042507+0.001445 26 | # [398] cv-test-logloss:0.062601+0.001851 cv-train-logloss:0.042446+0.001435 27 | # [399] cv-test-logloss:0.062603+0.001852 cv-train-logloss:0.042390+0.001416 28 | 29 | 30 | clf = xgb.train(params, dtrain, num_rounds) 31 | dtest = xgb.DMatrix(test, missing = np.nan) 32 | test_preds = clf.predict(dtest) 33 | 34 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds}) 35 | submission = submission[['ID', 'Disbursed']] 36 | submission.to_csv("xgb_final.csv", index = False) 37 | 38 | 
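# A minimal sketch (kept commented out, like the xgb.cv call above) of how a fixed round count
# such as num_rounds = 400 can be re-derived: run xgb.cv and take the round with the lowest mean
# test logloss. This assumes a recent xgboost where xgb.cv returns a pandas DataFrame with a
# 'test-logloss-mean' column; older releases returned the per-round results as strings.
# cv_results = xgb.cv(params, dtrain, num_boost_round=1000, nfold=4, seed=0)
# best_round = int(cv_results['test-logloss-mean'].idxmin()) + 1
# print "Best round:", best_round, "mean test logloss:", cv_results['test-logloss-mean'].min()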
-------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_3_preprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | train = pd.read_csv("train_preprocessed.csv") 4 | labels = pd.read_csv("train_labels.csv", header = None) 5 | 6 | labels = list(labels.iloc[:,0]) 7 | 8 | train['Disbursed'] = labels 9 | 10 | train.to_csv("train_preprocessed_full.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_4_train_ftrl.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | from random import random 13 | import pickle 14 | 15 | # TL; DR, the main training process starts on line: 250, 16 | # you may want to start reading the code from there 17 | 18 | 19 | ############################################################################## 20 | # parameters ################################################################# 21 | ############################################################################## 22 | 23 | # A, paths 24 | train='train_preprocessed_full.csv' 25 | test='test_preprocessed.csv'#'vali_100.tsv' 26 | submission = 'ftrl_final.csv' # path of to be outputted submission file 27 | 28 | # B, model 29 | alpha = .05 # learning rate 30 | beta = 1. # smoothing parameter for adaptive learning rate 31 | L1 = 0. # L1 regularization, larger value means more regularized 32 | L2 = 1. # L2 regularization, larger value means more regularized 33 | 34 | # C, feature/hash trick 35 | D = 2 ** 24 # number of weights to use 36 | interaction = False # whether to enable poly2 feature interactions 37 | 38 | # D, training/validation 39 | epoch = 4 # learn training data for N passes 40 | holdafter = 9 # data after date N (exclusive) are used as validation 41 | holdout = 200 # use every N training instance for holdout validation 42 | 43 | 44 | ############################################################################## 45 | # class, function, generator definitions ##################################### 46 | ############################################################################## 47 | 48 | class ftrl_proximal(object): 49 | ''' Our main algorithm: Follow the regularized leader - proximal 50 | 51 | In short, 52 | this is an adaptive-learning-rate sparse logistic-regression with 53 | efficient L1-L2-regularization 54 | 55 | Reference: 56 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 57 | ''' 58 | 59 | def __init__(self, alpha, beta, L1, L2, D, interaction): 60 | # parameters 61 | self.alpha = alpha 62 | self.beta = beta 63 | self.L1 = L1 64 | self.L2 = L2 65 | 66 | # feature related parameters 67 | self.D = D 68 | self.interaction = interaction 69 | 70 | # model 71 | # n: squared sum of past gradients 72 | # z: weights 73 | # w: lazy weights 74 | self.n = [0.] 
* D 75 | self.z = [random() for k in range(D)]#[0.] * D 76 | self.w = {} 77 | 78 | def _indices(self, x): 79 | ''' A helper generator that yields the indices in x 80 | 81 | The purpose of this generator is to make the following 82 | code a bit cleaner when doing feature interaction. 83 | ''' 84 | 85 | # first yield index of the bias term 86 | yield 0 87 | 88 | # then yield the normal indices 89 | for index in x: 90 | yield index 91 | 92 | # now yield interactions (if applicable) 93 | if self.interaction: 94 | D = self.D 95 | L = len(x) 96 | 97 | x = sorted(x) 98 | for i in xrange(L): 99 | for j in xrange(i+1, L): 100 | # one-hot encode interactions with hash trick 101 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 102 | 103 | def predict(self, x): 104 | ''' Get probability estimation on x 105 | 106 | INPUT: 107 | x: features 108 | 109 | OUTPUT: 110 | probability of p(y = 1 | x; w) 111 | ''' 112 | 113 | # parameters 114 | alpha = self.alpha 115 | beta = self.beta 116 | L1 = self.L1 117 | L2 = self.L2 118 | 119 | # model 120 | n = self.n 121 | z = self.z 122 | w = {} 123 | 124 | # wTx is the inner product of w and x 125 | wTx = 0. 126 | for i in self._indices(x): 127 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 128 | 129 | # build w on the fly using z and n, hence the name - lazy weights 130 | # we are doing this at prediction instead of update time is because 131 | # this allows us for not storing the complete w 132 | if sign * z[i] <= L1: 133 | # w[i] vanishes due to L1 regularization 134 | w[i] = 0. 135 | else: 136 | # apply prediction time L1, L2 regularization to z and get w 137 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 138 | 139 | wTx += w[i] 140 | 141 | # cache the current w for update stage 142 | self.w = w 143 | 144 | # bounded sigmoid function, this is the probability estimation 145 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 146 | 147 | def update(self, x, p, y): 148 | ''' Update model using x, p, y 149 | 150 | INPUT: 151 | x: feature, a list of indices 152 | p: click probability prediction of our model 153 | y: answer 154 | 155 | MODIFIES: 156 | self.n: increase by squared gradient 157 | self.z: weights 158 | ''' 159 | 160 | # parameter 161 | alpha = self.alpha 162 | 163 | # model 164 | n = self.n 165 | z = self.z 166 | w = self.w 167 | 168 | # gradient under logloss 169 | g = p - y 170 | 171 | # update z and n 172 | for i in self._indices(x): 173 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 174 | z[i] += g - sigma * w[i] 175 | n[i] += g * g 176 | 177 | 178 | def logloss(p, y): 179 | ''' FUNCTION: Bounded logloss 180 | 181 | INPUT: 182 | p: our prediction 183 | y: real answer 184 | 185 | OUTPUT: 186 | logarithmic loss of p given y 187 | ''' 188 | 189 | p = max(min(p, 1. - 10e-15), 10e-15) 190 | return -log(p) if y == 1. else -log(1. 
- p) 191 | 192 | 193 | def data(path, D): 194 | ''' GENERATOR: Apply hash-trick to the original csv row 195 | and for simplicity, we one-hot-encode everything 196 | 197 | INPUT: 198 | path: path to training or testing file 199 | D: the max index that we can hash to 200 | 201 | YIELDS: 202 | ID: id of the instance, mainly useless 203 | x: a list of hashed and one-hot-encoded 'indices' 204 | we only need the index since all values are either 0 or 1 205 | y: y = 1 if we have a click, else we have y = 0 206 | ''' 207 | 208 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 209 | 210 | try: 211 | ID= row['ID'] 212 | del row['ID'] 213 | except: 214 | ID = 0 215 | pass 216 | 217 | # process target. 218 | y = 0. 219 | target='Disbursed' 220 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 221 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 222 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 223 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 224 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 225 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 226 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 227 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 228 | row['I8'] = str(row['Var5']) + str(row['Var4']) 229 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 230 | #lcd_weekofyear 231 | 232 | 233 | if target in row: 234 | if row[target] == '1': 235 | y = 1. 236 | del row[target] 237 | 238 | # extract date 239 | 240 | # turn hour really into hour, it was originally YYMMDDHH 241 | 242 | 243 | # build x 244 | x = [] 245 | for key in row: 246 | value = row[key] 247 | 248 | # one-hot encode everything with hash trick 249 | index = abs(hash(key + '_' + value)) % D 250 | x.append(index) 251 | 252 | yield t, ID, x, y 253 | 254 | 255 | ############################################################################## 256 | # start training ############################################################# 257 | ############################################################################## 258 | 259 | start = datetime.now() 260 | print("started at: %s" % datetime.now()) 261 | 262 | # initialize ourselves a learner 263 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 264 | 265 | # start training 266 | for e in range(epoch): 267 | loss = 0. 268 | count = 0 269 | for t, ID, x, y in data(train, D): # data is a generator 270 | 271 | p = learner.predict(x) 272 | 273 | # if (holdout and t % holdout == 0): 274 | # # # Estimate progressive validation loss 275 | # loss += logloss(p, y) 276 | # count += 1 277 | # else: 278 | # # # Use other samples to train the model 279 | # learner.update(x, p, y) 280 | 281 | learner.update(x, p, y) 282 | # if t % 1000000 == 0: 283 | # continue 284 | 285 | #print('epoch: %s\tval. 
logloss: %0.5f\telapsed time: %s' % (e + 1, loss/count, str(datetime.now() - start))) 286 | 287 | #import pickle 288 | #pickle.dump(learner,open('ftrl3.p','w')) 289 | 290 | ############################################################################## 291 | # start testing, and build Kaggle's submission file ########################## 292 | ############################################################################## 293 | print ('creating submission file') 294 | with open(submission, 'w') as outfile: 295 | outfile.write('ID,Disbursed\n') 296 | for t, ID, x, y in data(test, D): 297 | p = learner.predict(x) 298 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_5_postprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | test = pd.read_csv("test.csv", usecols = ["ID"]) 4 | 5 | preds = pd.read_csv("ftrl_final.csv") 6 | preds['ID'] = test['ID'] 7 | preds.to_csv("ftrl_final2.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/_6_ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | 5 | xgb_pred = pd.read_csv("xgb_final.csv") #XGB 6 | ftrl_pred = pd.read_csv("ftrl_final.csv") #FTRL 7 | 8 | ens = xgb_pred.copy() 9 | ens.rename(columns={'Disbursed':'XGB'}, inplace = True) 10 | ens['FTRL'] = ftrl_pred['Disbursed'] 11 | 12 | ens['XGB_Rank'] = rankdata(ens['XGB'], method='min') 13 | ens['FTRL_Rank'] = rankdata(ens['FTRL'], method='min') 14 | ens['Final'] = 0.8*ens['XGB_Rank'] + 0.2*ens['FTRL_Rank'] 15 | 16 | ens = ens[['ID', 'Final']] 17 | ens.rename(columns={'Final':'Disbursed'}, inplace = True) 18 | ens.sort_index(inplace = True) 19 | ens.head() 20 | 21 | ens.to_csv("weekend_solution.csv", index = False) # 0.86116 public LB -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weekend/av_script.sh: -------------------------------------------------------------------------------- 1 | python _1_preprocessing.py 2 | python _2_train_xgb.py 3 | python _3_preprocessing_ftrl.py 4 | pypy _4_train_ftrl.py 5 | python _5_postprocessing_ftrl.py 6 | python _6_ensemble.py -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/av_final.sh: -------------------------------------------------------------------------------- 1 | mkdir temp_data 2 | mkdir temp_submission 3 | 4 | python preprocessing.py 5 | python preprocessing2.py 6 | 7 | echo "====> Lets train 5 XGBs for same type of configuration and average for seed stability and control overfitting" 8 | python train_xgb.py 9 | python train_xgb2.py 10 | python train_xgb3.py 11 | python train_xgb4.py 12 | python train_xgb5.py 13 | python postprocessing_XGB_1.py 14 | 15 | echo "====> Train one more 5-set XGB with slightly different feature set(resulted in higher CV). Rank average." 
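# Note: postprocessing_XGB_2.py further down rank-averages five prediction files
# (temp_submission/Sub251.csv ... Sub255.csv, presumably written by the train_2xgb*.py runs below)
# into temp_submission/XGB2_Ens.csv with equal weights.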
16 | python train_2xgb1.py 17 | python train_2xgb2.py 18 | python train_2xgb3.py 19 | python train_2xgb4.py 20 | python train_2xgb5.py 21 | python postprocessing_XGB_2.py 22 | 23 | python preprocessing_ftrl.py 24 | 25 | echo "====> Shuffle the input data to train linear models with FTRL (Logistic Regression)" 26 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train1.csv 1 100000 1234 27 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train2.csv 1 100000 3456 28 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train3.csv 1 100000 6789 29 | python shuffle.py temp_data/train_preprocessed_full.csv temp_data/shuffled_train4.csv 1 100000 6543 30 | 31 | echo "====> Train them in an online manner" 32 | pypy script_ftrl.py 33 | pypy script_ftrl2.py 34 | pypy script_ftrl3.py 35 | pypy script_ftrl4.py 36 | pypy script_ftrl5.py 37 | 38 | echo "====> Rank average linear models for stability" 39 | python postprocessing_ftrl.py 40 | 41 | echo "Let's train Random Forests on original data without city and employer name features" 42 | python train_rf.py 43 | 44 | python postprocessing_rf.py 45 | 46 | echo "Final Rank ensemble!" 47 | python ensemble_rank_final.py -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/ensemble_rank_final.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from scipy.stats import rankdata 4 | 5 | xgb1_pred = pd.read_csv("temp_submission/XGB1_Ens.csv") #XGB 6 | xgb2_pred = pd.read_csv("temp_submission/XGB2_Ens.csv") #XGB 7 | rf_pred = pd.read_csv("temp_submission/RF_Ens.csv") #RF 8 | ftrl_pred = pd.read_csv("temp_submission/FTRL_Ens.csv") # FTRL 9 | 10 | ens = xgb1_pred.copy() 11 | ens.rename(columns={'Disbursed':'XGB1'}, inplace = True) 12 | ens['XGB2'] = xgb2_pred['Disbursed'] 13 | 14 | ens['RF'] = rf_pred['Disbursed'] 15 | ens['FTRL'] = ftrl_pred['Disbursed'] 16 | 17 | 18 | ens['XGB1_Rank'] = rankdata(ens['XGB1'], method='min') 19 | ens['XGB2_Rank'] = rankdata(ens['XGB2'], method='min') 20 | 21 | ens['XGB_Rank'] = 0.5 * ens['XGB1_Rank'] + 0.5 * ens['XGB2_Rank'] 22 | ens['RF_Rank'] = rankdata(ens['RF'], method='min') 23 | ens['FTRL_Rank'] = rankdata(ens['FTRL'], method='min') 24 | 25 | ens['Final'] = (0.75*ens['XGB_Rank'] + 0.25*ens['RF_Rank']) * 0.75 + 0.25 * ens['FTRL'] 26 | 27 | ens = ens[['ID', 'Final']] 28 | ens.rename(columns={'Final':'Disbursed'}, inplace = True) 29 | ens.sort_index(inplace = True) 30 | ens.head() 31 | 32 | ens.to_csv("FinalSolution.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_RF.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub151.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub152.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub153.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub154.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub155.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], 
method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/RF_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_XGB_1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub241.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub242.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub243.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub244.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub245.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/XGB1_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_XGB_2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub251.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub252.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub253.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub254.csv") 12 | preds5 = pd.read_csv("temp_submission/Sub255.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/XGB2_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/postprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from scipy.stats import rankdata 3 | 4 | test = pd.read_csv("test.csv", usecols = ["ID"]) 5 | 6 | preds = pd.read_csv("temp_submission/Sub701.csv") 7 | preds['ID'] = test['ID'] 8 | 9 | preds2 = pd.read_csv("temp_submission/Sub702.csv") 10 | preds3 = pd.read_csv("temp_submission/Sub703.csv") 11 | preds4 = pd.read_csv("temp_submission/Sub704.csv") 
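# Sub701.csv ... Sub705.csv are the FTRL submissions (written by the script_ftrl*.py runs);
# their ordinal ranks are averaged below with equal 0.2 weights into FTRL_Ens.csv.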
12 | preds5 = pd.read_csv("temp_submission/Sub705.csv") 13 | 14 | preds['Disbursed'] = rankdata(preds['Disbursed'], method='ordinal') 15 | preds2['Disbursed'] = rankdata(preds2['Disbursed'], method='ordinal') 16 | preds3['Disbursed'] = rankdata(preds3['Disbursed'], method='ordinal') 17 | preds4['Disbursed'] = rankdata(preds4['Disbursed'], method='ordinal') 18 | preds5['Disbursed'] = rankdata(preds5['Disbursed'], method='ordinal') 19 | 20 | preds['Disbursed'] = 0.2 * (preds['Disbursed'] + 21 | preds2['Disbursed'] + 22 | preds3['Disbursed'] + 23 | preds4['Disbursed'] + 24 | preds5['Disbursed']) 25 | 26 | preds.to_csv("temp_submission/FTRL_Ens.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 7 | import xgboost as xgb 8 | from sklearn.cross_validation import cross_val_score, cross_val_predict 9 | 10 | 11 | train = pd.read_csv("Train.csv") 12 | test = pd.read_csv("Test.csv") 13 | submission = pd.read_csv("sample_submission.csv") 14 | print "Train dataset dimensions:", train.shape 15 | print "Test dataset dimensions:", test.shape 16 | 17 | salary_acc = train.Salary_Account.value_counts(dropna=False) 18 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 19 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 20 | 21 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 22 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 23 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 24 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 25 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 26 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 27 | 28 | train['Lifetime'] = pd.to_datetime(train['Lead_Creation_Date']) - pd.to_datetime(train['DOB']) 29 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 30 | 31 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 32 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 33 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 34 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 35 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 36 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 37 | 38 | city = pd.DataFrame(train['City'].value_counts()) 39 | city_rare = list(city[city[0] < 100].index) 40 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 41 | train['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 42 | train['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 43 | 44 | from sklearn.preprocessing import LabelEncoder 45 | le = LabelEncoder() 46 | train['City_encoded'] = le.fit_transform(train['City']) 47 | 48 | empnames = pd.DataFrame(train['Employer_Name'].value_counts()) 49 | empnames_rare = list(empnames[empnames[0]<30].index) 50 | train.ix[train['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 51 | 52 | # # Preprocessing 53 | train2 = train.copy() 54 | 55 | id_train = train['ID'] 56 | label = 
train2['Disbursed'] 57 | 58 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB']#, 'Lead_Creation_Date'] 59 | train2.drop(dropCols, axis=1, inplace = True) 60 | 61 | y_train = label 62 | X_train = pd.get_dummies(train2) 63 | 64 | # # Test set preparation 65 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 66 | 67 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 68 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 69 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 70 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 71 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 72 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 73 | 74 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 75 | test['dob_dayofweek'] = pd.to_datetime(test['DOB']).dt.dayofweek 76 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 77 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 78 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 79 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 80 | 81 | test['Lifetime'] = pd.to_datetime(test['Lead_Creation_Date']) - pd.to_datetime(test['DOB']) 82 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 83 | 84 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 85 | newcities = list(set(test['City']) - set(train['City'])) 86 | test.ix[test['City'].isin(newcities), 'City'] = np.nan 87 | test['City_encoded'] = le.transform(test['City']) 88 | 89 | test['Loan_Tenure_Applied'].replace([10,6,7,8,9],value = np.nan, inplace = True) 90 | test['Loan_Tenure_Submitted'].replace(6, np.nan, inplace = True) 91 | 92 | test.ix[test['Employer_Name'].isin(empnames_rare), 'Employer_Name'] = "Others" 93 | 94 | newempnames = list(set(test['Employer_Name']) - set(train['Employer_Name'])) 95 | test.ix[test['Employer_Name'].isin(newempnames), "Employer_Name"] = "Others" 96 | 97 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 98 | test2 = test.drop(testdropcols, axis=1) 99 | 100 | X_test = pd.get_dummies(test2) 101 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 102 | for col in missingCols: 103 | X_test[col] = 0 104 | X_test = X_test[X_train.columns] 105 | assert X_train.columns.equals(X_test.columns) 106 | 107 | X_train.to_csv("temp_data/train_preprocessed.csv", index = False) 108 | X_test.to_csv("temp_data/test_preprocessed.csv", index = False) 109 | y_train.to_csv("temp_data/train_labels.csv", index = False) 110 | test['ID'].to_csv("temp_data/test_ids.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.tree import DecisionTreeClassifier 6 | from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier 7 | import xgboost as xgb 8 | from sklearn.cross_validation import cross_val_score, cross_val_predict 9 | 10 | train = pd.read_csv("Train.csv") 11 | test = pd.read_csv("Test.csv") 12 | submission = pd.read_csv("sample_submission.csv") 13 | print "Train dataset dimensions:", train.shape 14 | print "Test dataset dimensions:", test.shape 15 | 16 | salary_acc = 
train.Salary_Account.value_counts(dropna=False) 17 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 18 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 19 | 20 | train['dob_day'] = pd.to_datetime(train['DOB']).dt.day 21 | train['dob_dayofweek'] = pd.to_datetime(train['DOB']).dt.dayofweek 22 | train['dob_weekofyear'] = pd.to_datetime(train['DOB']).dt.weekofyear 23 | train['dob_quarter'] = pd.to_datetime(train['DOB']).dt.quarter 24 | train['dob_month'] = pd.to_datetime(train['DOB']).dt.month 25 | train['dob_year'] = pd.to_datetime(train['DOB']).dt.year 26 | 27 | train['Lifetime'] = pd.to_datetime("2015-10-01") - pd.to_datetime(train['DOB']) 28 | train['Lifetime'] = train['Lifetime'].dt.days.astype(int) 29 | 30 | train['lcd_day'] = pd.to_datetime(train['Lead_Creation_Date']).dt.day 31 | train['lcd_dayofweek'] = pd.to_datetime(train['Lead_Creation_Date']).dt.dayofweek 32 | train['lcd_weekofyear'] = pd.to_datetime(train['Lead_Creation_Date']).dt.weekofyear 33 | train['lcd_quarter'] = pd.to_datetime(train['Lead_Creation_Date']).dt.quarter 34 | train['lcd_month'] = pd.to_datetime(train['Lead_Creation_Date']).dt.month 35 | train['lcd_year'] = pd.to_datetime(train['Lead_Creation_Date']).dt.year 36 | 37 | city = pd.DataFrame(train['City'].value_counts()) 38 | city_rare = list(city[city[0] < 100].index) 39 | train.ix[train['City'].isin(city_rare), 'City'] = "Others" 40 | 41 | train.ix[pd.isnull(train['City']), 'City'] = "-3.14" 42 | 43 | from sklearn.preprocessing import LabelEncoder 44 | le = LabelEncoder() 45 | train['City_encoded'] = le.fit_transform(train['City']) 46 | 47 | # # Preprocessing 48 | train2 = train.copy() 49 | 50 | id_train = train['ID'] 51 | label = train2['Disbursed'] 52 | 53 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'Employer_Name', 'DOB', 'Lead_Creation_Date'] 54 | train2.drop(dropCols, axis=1, inplace = True) 55 | 56 | y_train = label 57 | X_train = pd.get_dummies(train2) 58 | 59 | 60 | # # Test set preparation 61 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 62 | 63 | test['lcd_day'] = pd.to_datetime(test['Lead_Creation_Date']).dt.day 64 | test['lcd_dayofweek'] = pd.to_datetime(test['Lead_Creation_Date']).dt.dayofweek 65 | test['lcd_weekofyear'] = pd.to_datetime(test['Lead_Creation_Date']).dt.weekofyear 66 | test['lcd_quarter'] = pd.to_datetime(test['Lead_Creation_Date']).dt.quarter 67 | test['lcd_month'] = pd.to_datetime(test['Lead_Creation_Date']).dt.month 68 | test['lcd_year'] = pd.to_datetime(test['Lead_Creation_Date']).dt.year 69 | 70 | test['dob_day'] = pd.to_datetime(test['DOB']).dt.day 71 | test['dob_dayofweek'] = pd.to_datetime(test['DOB']).dt.dayofweek 72 | test['dob_weekofyear'] = pd.to_datetime(test['DOB']).dt.weekofyear 73 | test['dob_quarter'] = pd.to_datetime(test['DOB']).dt.quarter 74 | test['dob_month'] = pd.to_datetime(test['DOB']).dt.month 75 | test['dob_year'] = pd.to_datetime(test['DOB']).dt.year 76 | 77 | test['Lifetime'] = pd.to_datetime("2015-10-01") - pd.to_datetime(test['DOB']) 78 | test['Lifetime'] = test['Lifetime'].dt.days.astype(int) 79 | 80 | test.ix[test['City'].isin(city_rare), 'City'] = "Others" 81 | newcities = list(set(test['City']) - set(train['City'])) 82 | test.ix[test['City'].isin(newcities), 'City'] = "-3.14" 83 | test['City_encoded'] = le.transform(test['City']) 84 | 85 | test.ix[pd.isnull(test['City']), 'City'] = "-3.14" 86 | 87 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 88 | test2 = test.drop(testdropcols, axis=1) 89 | 90 
| X_test = pd.get_dummies(test2) 91 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 92 | for col in missingCols: 93 | X_test[col] = 0 94 | X_test = X_test[X_train.columns] 95 | assert X_train.columns.equals(X_test.columns) 96 | 97 | X_train.to_csv("temp_data/train_preprocessed2.csv", index = False) 98 | X_test.to_csv("temp_data/test_preprocessed2.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/preprocessing_ftrl.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | train = pd.read_csv("temp_data/train_preprocessed.csv") 4 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 5 | 6 | labels = list(labels.iloc[:,0]) 7 | 8 | train['Disbursed'] = labels 9 | 10 | train.to_csv("temp_data/train_preprocessed_full.csv", index = False) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/train_preprocessed_full.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub701.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
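        # The I2..I8 fields built just below are hand-crafted two-way interactions: two raw column
        # values are concatenated as strings and hashed as a single feature, which lets the linear
        # FTRL model pick up pairwise effects without enabling the generic `interaction` flag.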
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl2.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train1.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub702.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl3.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train2.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub703.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl4.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | 79 | train='temp_data/shuffled_train3.csv' 80 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 81 | submission = 'temp_submission/Sub704.csv' # path of to be outputted submission file 82 | 83 | # B, model 84 | alpha = .05 # learning rate 85 | beta = 1. # smoothing parameter for adaptive learning rate 86 | L1 = 0. # L1 regularization, larger value means more regularized 87 | L2 = 1. # L2 regularization, larger value means more regularized 88 | 89 | # C, feature/hash trick 90 | D = 2 ** 24 # number of weights to use 91 | interaction = False # whether to enable poly2 feature interactions 92 | 93 | # D, training/validation 94 | epoch = 4 # learn training data for N passes 95 | holdafter = 9 # data after date N (exclusive) are used as validation 96 | holdout = 200 # use every N training instance for holdout validation 97 | 98 | 99 | ############################################################################## 100 | # class, function, generator definitions ##################################### 101 | ############################################################################## 102 | 103 | class ftrl_proximal(object): 104 | ''' Our main algorithm: Follow the regularized leader - proximal 105 | 106 | In short, 107 | this is an adaptive-learning-rate sparse logistic-regression with 108 | efficient L1-L2-regularization 109 | 110 | Reference: 111 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 112 | ''' 113 | 114 | def __init__(self, alpha, beta, L1, L2, D, interaction): 115 | # parameters 116 | self.alpha = alpha 117 | self.beta = beta 118 | self.L1 = L1 119 | self.L2 = L2 120 | 121 | # feature related parameters 122 | self.D = D 123 | self.interaction = interaction 124 | 125 | # model 126 | # n: squared sum of past gradients 127 | # z: weights 128 | # w: lazy weights 129 | self.n = [0.] * D 130 | self.z = [random.random() for k in range(D)]#[0.] * D 131 | self.w = {} 132 | 133 | def _indices(self, x): 134 | ''' A helper generator that yields the indices in x 135 | 136 | The purpose of this generator is to make the following 137 | code a bit cleaner when doing feature interaction. 
138 | ''' 139 | 140 | # first yield index of the bias term 141 | yield 0 142 | 143 | # then yield the normal indices 144 | for index in x: 145 | yield index 146 | 147 | # now yield interactions (if applicable) 148 | if self.interaction: 149 | D = self.D 150 | L = len(x) 151 | 152 | x = sorted(x) 153 | for i in xrange(L): 154 | for j in xrange(i+1, L): 155 | # one-hot encode interactions with hash trick 156 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 157 | 158 | def predict(self, x): 159 | ''' Get probability estimation on x 160 | 161 | INPUT: 162 | x: features 163 | 164 | OUTPUT: 165 | probability of p(y = 1 | x; w) 166 | ''' 167 | 168 | # parameters 169 | alpha = self.alpha 170 | beta = self.beta 171 | L1 = self.L1 172 | L2 = self.L2 173 | 174 | # model 175 | n = self.n 176 | z = self.z 177 | w = {} 178 | 179 | # wTx is the inner product of w and x 180 | wTx = 0. 181 | for i in self._indices(x): 182 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 183 | 184 | # build w on the fly using z and n, hence the name - lazy weights 185 | # we are doing this at prediction instead of update time is because 186 | # this allows us for not storing the complete w 187 | if sign * z[i] <= L1: 188 | # w[i] vanishes due to L1 regularization 189 | w[i] = 0. 190 | else: 191 | # apply prediction time L1, L2 regularization to z and get w 192 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 193 | 194 | wTx += w[i] 195 | 196 | # cache the current w for update stage 197 | self.w = w 198 | 199 | # bounded sigmoid function, this is the probability estimation 200 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 201 | 202 | def update(self, x, p, y): 203 | ''' Update model using x, p, y 204 | 205 | INPUT: 206 | x: feature, a list of indices 207 | p: click probability prediction of our model 208 | y: answer 209 | 210 | MODIFIES: 211 | self.n: increase by squared gradient 212 | self.z: weights 213 | ''' 214 | 215 | # parameter 216 | alpha = self.alpha 217 | 218 | # model 219 | n = self.n 220 | z = self.z 221 | w = self.w 222 | 223 | # gradient under logloss 224 | g = p - y 225 | 226 | # update z and n 227 | for i in self._indices(x): 228 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 229 | z[i] += g - sigma * w[i] 230 | n[i] += g * g 231 | 232 | 233 | def logloss(p, y): 234 | ''' FUNCTION: Bounded logloss 235 | 236 | INPUT: 237 | p: our prediction 238 | y: real answer 239 | 240 | OUTPUT: 241 | logarithmic loss of p given y 242 | ''' 243 | 244 | p = max(min(p, 1. - 10e-15), 10e-15) 245 | return -log(p) if y == 1. else -log(1. - p) 246 | 247 | 248 | def data(path, D): 249 | ''' GENERATOR: Apply hash-trick to the original csv row 250 | and for simplicity, we one-hot-encode everything 251 | 252 | INPUT: 253 | path: path to training or testing file 254 | D: the max index that we can hash to 255 | 256 | YIELDS: 257 | ID: id of the instance, mainly useless 258 | x: a list of hashed and one-hot-encoded 'indices' 259 | we only need the index since all values are either 0 or 1 260 | y: y = 1 if we have a click, else we have y = 0 261 | ''' 262 | 263 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 264 | 265 | try: 266 | ID= row['ID'] 267 | del row['ID'] 268 | except: 269 | ID = 0 270 | pass 271 | 272 | # process target. 273 | y = 0. 
274 | target='Disbursed' 275 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 276 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 277 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 278 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 279 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 280 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 281 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 282 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 283 | row['I8'] = str(row['Var5']) + str(row['Var4']) 284 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 285 | #lcd_weekofyear 286 | 287 | 288 | if target in row: 289 | if row[target] == '1': 290 | y = 1. 291 | del row[target] 292 | 293 | # extract date 294 | 295 | # turn hour really into hour, it was originally YYMMDDHH 296 | 297 | 298 | # build x 299 | x = [] 300 | for key in row: 301 | value = row[key] 302 | 303 | # one-hot encode everything with hash trick 304 | index = abs(hash(key + '_' + value)) % D 305 | x.append(index) 306 | 307 | yield t, ID, x, y 308 | 309 | 310 | ############################################################################## 311 | # start training ############################################################# 312 | ############################################################################## 313 | 314 | start = datetime.now() 315 | #print("started at: %s" % datetime.now()) 316 | 317 | # initialize ourselves a learner 318 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 319 | 320 | # start training 321 | for e in range(epoch): 322 | random.seed(1234) 323 | loss = 0. 324 | count = 0 325 | predlist=[] 326 | targetlist=[] 327 | for t, ID, x, y in data(train, D): # data is a generator 328 | 329 | p = learner.predict(x) 330 | 331 | # if random.random() < 0.3: 332 | # # Estimate progressive validation loss 333 | # loss += logloss(p, y) 334 | # count += 1 335 | # predlist.append(p) 336 | # targetlist.append(y) 337 | # else: 338 | # # Use other samples to train the model 339 | # learner.update(x, p, y) 340 | 341 | learner.update(x, p, y) 342 | # if t % 1000000 == 0: 343 | # continue 344 | 345 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 346 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 347 | 348 | #import pickle 349 | #pickle.dump(learner,open('ftrl3.p','w')) 350 | 351 | ############################################################################## 352 | # start testing, and build Kaggle's submission file ########################## 353 | ############################################################################## 354 | #print ('creating submission file') 355 | with open(submission, 'w') as outfile: 356 | outfile.write('ID,Disbursed\n') 357 | for t, ID, x, y in data(test, D): 358 | p = learner.predict(x) 359 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/script_ftrl5.py: -------------------------------------------------------------------------------- 1 | ############################################################################################################# 2 | #classic tinrtgu's code 3 | #https://www.kaggle.com/c/avazu-ctr-prediction/forums/t/10927/beat-the-benchmark-with-less-than-1mb-of-memory 4 | #modified by rcarson 5 | #https://www.kaggle.com/jiweiliu 6 | ############################################################################################################# 7 | 8 | 9 | from datetime import datetime 10 | from csv import DictReader 11 | from math import exp, log, sqrt 12 | import random 13 | import pickle 14 | 15 | ############################################################################## 16 | # auc calculator. Author: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/auc.py 17 | def tied_rank(x): 18 | """ 19 | Computes the tied rank of elements in x. 20 | This function computes the tied rank of elements in x. 21 | Parameters 22 | ---------- 23 | x : list of numbers, numpy array 24 | Returns 25 | ------- 26 | score : list of numbers 27 | The tied rank f each element in x 28 | """ 29 | sorted_x = sorted(zip(x,range(len(x)))) 30 | r = [0 for k in x] 31 | cur_val = sorted_x[0][0] 32 | last_rank = 0 33 | for i in range(len(sorted_x)): 34 | if cur_val != sorted_x[i][0]: 35 | cur_val = sorted_x[i][0] 36 | for j in range(last_rank, i): 37 | r[sorted_x[j][1]] = float(last_rank+1+i)/2.0 38 | last_rank = i 39 | if i==len(sorted_x)-1: 40 | for j in range(last_rank, i+1): 41 | r[sorted_x[j][1]] = float(last_rank+i+2)/2.0 42 | return r 43 | 44 | def auc(actual, posterior): 45 | """ 46 | Computes the area under the receiver-operater characteristic (AUC) 47 | This function computes the AUC error metric for binary classification. 48 | Parameters 49 | ---------- 50 | actual : list of binary numbers, numpy array 51 | The ground truth value 52 | posterior : same type as actual 53 | Defines a ranking on the binary numbers, from most likely to 54 | be positive to least likely to be positive. 
55 | Returns 56 | ------- 57 | score : double 58 | The mean squared error between actual and posterior 59 | """ 60 | r = tied_rank(posterior) 61 | num_positive = len([0 for x in actual if x==1]) 62 | num_negative = len(actual)-num_positive 63 | sum_positive = sum([r[i] for i in range(len(r)) if actual[i]==1]) 64 | auc = ((sum_positive - num_positive*(num_positive+1)/2.0) / 65 | (num_negative*num_positive)) 66 | return auc 67 | ############################################################################## 68 | 69 | # TL; DR, the main training process starts on line: 250, 70 | # you may want to start reading the code from there 71 | 72 | 73 | ############################################################################## 74 | # parameters ################################################################# 75 | ############################################################################## 76 | 77 | # A, paths 78 | train='temp_data/shuffled_train4.csv' 79 | test='temp_data/test_preprocessed.csv'#'vali_100.tsv' 80 | submission = 'temp_submission/Sub705.csv' # path of to be outputted submission file 81 | 82 | # B, model 83 | alpha = .05 # learning rate 84 | beta = 1. # smoothing parameter for adaptive learning rate 85 | L1 = 0. # L1 regularization, larger value means more regularized 86 | L2 = 1. # L2 regularization, larger value means more regularized 87 | 88 | # C, feature/hash trick 89 | D = 2 ** 24 # number of weights to use 90 | interaction = False # whether to enable poly2 feature interactions 91 | 92 | # D, training/validation 93 | epoch = 4 # learn training data for N passes 94 | holdafter = 9 # data after date N (exclusive) are used as validation 95 | holdout = 200 # use every N training instance for holdout validation 96 | 97 | 98 | ############################################################################## 99 | # class, function, generator definitions ##################################### 100 | ############################################################################## 101 | 102 | class ftrl_proximal(object): 103 | ''' Our main algorithm: Follow the regularized leader - proximal 104 | 105 | In short, 106 | this is an adaptive-learning-rate sparse logistic-regression with 107 | efficient L1-L2-regularization 108 | 109 | Reference: 110 | http://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf 111 | ''' 112 | 113 | def __init__(self, alpha, beta, L1, L2, D, interaction): 114 | # parameters 115 | self.alpha = alpha 116 | self.beta = beta 117 | self.L1 = L1 118 | self.L2 = L2 119 | 120 | # feature related parameters 121 | self.D = D 122 | self.interaction = interaction 123 | 124 | # model 125 | # n: squared sum of past gradients 126 | # z: weights 127 | # w: lazy weights 128 | self.n = [0.] * D 129 | self.z = [random.random() for k in range(D)]#[0.] * D 130 | self.w = {} 131 | 132 | def _indices(self, x): 133 | ''' A helper generator that yields the indices in x 134 | 135 | The purpose of this generator is to make the following 136 | code a bit cleaner when doing feature interaction. 
137 | ''' 138 | 139 | # first yield index of the bias term 140 | yield 0 141 | 142 | # then yield the normal indices 143 | for index in x: 144 | yield index 145 | 146 | # now yield interactions (if applicable) 147 | if self.interaction: 148 | D = self.D 149 | L = len(x) 150 | 151 | x = sorted(x) 152 | for i in xrange(L): 153 | for j in xrange(i+1, L): 154 | # one-hot encode interactions with hash trick 155 | yield abs(hash(str(x[i]) + '_' + str(x[j]))) % D 156 | 157 | def predict(self, x): 158 | ''' Get probability estimation on x 159 | 160 | INPUT: 161 | x: features 162 | 163 | OUTPUT: 164 | probability of p(y = 1 | x; w) 165 | ''' 166 | 167 | # parameters 168 | alpha = self.alpha 169 | beta = self.beta 170 | L1 = self.L1 171 | L2 = self.L2 172 | 173 | # model 174 | n = self.n 175 | z = self.z 176 | w = {} 177 | 178 | # wTx is the inner product of w and x 179 | wTx = 0. 180 | for i in self._indices(x): 181 | sign = -1. if z[i] < 0 else 1. # get sign of z[i] 182 | 183 | # build w on the fly using z and n, hence the name - lazy weights 184 | # we are doing this at prediction instead of update time is because 185 | # this allows us for not storing the complete w 186 | if sign * z[i] <= L1: 187 | # w[i] vanishes due to L1 regularization 188 | w[i] = 0. 189 | else: 190 | # apply prediction time L1, L2 regularization to z and get w 191 | w[i] = (sign * L1 - z[i]) / ((beta + sqrt(n[i])) / alpha + L2) 192 | 193 | wTx += w[i] 194 | 195 | # cache the current w for update stage 196 | self.w = w 197 | 198 | # bounded sigmoid function, this is the probability estimation 199 | return 1. / (1. + exp(-max(min(wTx, 35.), -35.))) 200 | 201 | def update(self, x, p, y): 202 | ''' Update model using x, p, y 203 | 204 | INPUT: 205 | x: feature, a list of indices 206 | p: click probability prediction of our model 207 | y: answer 208 | 209 | MODIFIES: 210 | self.n: increase by squared gradient 211 | self.z: weights 212 | ''' 213 | 214 | # parameter 215 | alpha = self.alpha 216 | 217 | # model 218 | n = self.n 219 | z = self.z 220 | w = self.w 221 | 222 | # gradient under logloss 223 | g = p - y 224 | 225 | # update z and n 226 | for i in self._indices(x): 227 | sigma = (sqrt(n[i] + g * g) - sqrt(n[i])) / alpha 228 | z[i] += g - sigma * w[i] 229 | n[i] += g * g 230 | 231 | 232 | def logloss(p, y): 233 | ''' FUNCTION: Bounded logloss 234 | 235 | INPUT: 236 | p: our prediction 237 | y: real answer 238 | 239 | OUTPUT: 240 | logarithmic loss of p given y 241 | ''' 242 | 243 | p = max(min(p, 1. - 10e-15), 10e-15) 244 | return -log(p) if y == 1. else -log(1. - p) 245 | 246 | 247 | def data(path, D): 248 | ''' GENERATOR: Apply hash-trick to the original csv row 249 | and for simplicity, we one-hot-encode everything 250 | 251 | INPUT: 252 | path: path to training or testing file 253 | D: the max index that we can hash to 254 | 255 | YIELDS: 256 | ID: id of the instance, mainly useless 257 | x: a list of hashed and one-hot-encoded 'indices' 258 | we only need the index since all values are either 0 or 1 259 | y: y = 1 if we have a click, else we have y = 0 260 | ''' 261 | 262 | for t, row in enumerate(DictReader(open(path), delimiter=',')): 263 | 264 | try: 265 | ID= row['ID'] 266 | del row['ID'] 267 | except: 268 | ID = 0 269 | pass 270 | 271 | # process target. 272 | y = 0. 
273 | target='Disbursed' 274 | #row['I1'] = str(row['Monthly_Income']) + str(row['Var5']) 275 | row['I2'] = str(row['Monthly_Income']) + str(row['Existing_EMI']) 276 | row['I3'] = str(row['Var5']) + str(row['Existing_EMI']) 277 | row['I4'] = str(row['Var5']) + str(row['Lifetime']) 278 | row['I5'] = str(row['Var5']) + str(row['Loan_Amount_Submitted']) 279 | row['I6'] = str(row['Interest_Rate']) + str(row['dob_year']) 280 | #row['I7'] = str(row['dob_weekofyear']) + str(row['dob_day']) 281 | row['I7'] = str(row['Loan_Amount_Applied']) + str(row['Processing_Fee']) 282 | row['I8'] = str(row['Var5']) + str(row['Var4']) 283 | #row['I9'] = str(row['dob_month']) + str(row['dob_dayofweek']) 284 | #lcd_weekofyear 285 | 286 | 287 | if target in row: 288 | if row[target] == '1': 289 | y = 1. 290 | del row[target] 291 | 292 | # extract date 293 | 294 | # turn hour really into hour, it was originally YYMMDDHH 295 | 296 | 297 | # build x 298 | x = [] 299 | for key in row: 300 | value = row[key] 301 | 302 | # one-hot encode everything with hash trick 303 | index = abs(hash(key + '_' + value)) % D 304 | x.append(index) 305 | 306 | yield t, ID, x, y 307 | 308 | 309 | ############################################################################## 310 | # start training ############################################################# 311 | ############################################################################## 312 | 313 | start = datetime.now() 314 | #print("started at: %s" % datetime.now()) 315 | 316 | # initialize ourselves a learner 317 | learner = ftrl_proximal(alpha, beta, L1, L2, D, interaction) 318 | 319 | # start training 320 | for e in range(epoch): 321 | random.seed(1234) 322 | loss = 0. 323 | count = 0 324 | predlist=[] 325 | targetlist=[] 326 | for t, ID, x, y in data(train, D): # data is a generator 327 | 328 | p = learner.predict(x) 329 | 330 | # if random.random() < 0.3: 331 | # # Estimate progressive validation loss 332 | # loss += logloss(p, y) 333 | # count += 1 334 | # predlist.append(p) 335 | # targetlist.append(y) 336 | # else: 337 | # # Use other samples to train the model 338 | # learner.update(x, p, y) 339 | 340 | learner.update(x, p, y) 341 | # if t % 1000000 == 0: 342 | # continue 343 | 344 | # print('epoch: %s\tval. logloss: %0.5f\tval. 
AUC: %0.5f\telapsed time: %s' % 345 | # (e + 1, loss/count, auc(targetlist, predlist), str(datetime.now() - start))) 346 | 347 | #import pickle 348 | #pickle.dump(learner,open('ftrl3.p','w')) 349 | 350 | ############################################################################## 351 | # start testing, and build Kaggle's submission file ########################## 352 | ############################################################################## 353 | #print ('creating submission file') 354 | with open(submission, 'w') as outfile: 355 | outfile.write('ID,Disbursed\n') 356 | for t, ID, x, y in data(test, D): 357 | p = learner.predict(x) 358 | outfile.write('%s,%s\n' % (ID, str(p))) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/shuffle.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shuffle lines in a [big] file 3 | Usage: shuffle.py <input file> <output file> [<preserve headers>] [<lines in memory>] [<random seed>] 4 | """ 5 | 6 | import sys 7 | import random 8 | 9 | input_file = sys.argv[1] 10 | output_file = sys.argv[2] 11 | 12 | try: 13 | preserve_headers = int( sys.argv[3] ) 14 | except IndexError: 15 | preserve_headers = 0 16 | 17 | try: 18 | lines_in_memory = int( sys.argv[4] ) 19 | except IndexError: 20 | lines_in_memory = 25000 21 | 22 | print "caching %s lines at a time..." % ( lines_in_memory ) 23 | 24 | try: 25 | random_seed = sys.argv[5] 26 | random.seed( random_seed ) 27 | print "random seed: %s" % ( random_seed ) 28 | except IndexError: 29 | pass 30 | 31 | # first count 32 | 33 | print "counting lines..." 34 | 35 | i_f = open( input_file ) 36 | o_f = open( output_file, 'wb' ) 37 | 38 | if preserve_headers: 39 | headers = i_f.readline() 40 | o_f.write( headers ) 41 | 42 | counter = 0 43 | for line in i_f: 44 | counter += 1 45 | 46 | if counter % 100000 == 0: 47 | print counter 48 | 49 | print counter 50 | 51 | print "shuffling..." 52 | 53 | order = range( counter ) 54 | random.shuffle( order ) 55 | 56 | epoch = 0 57 | 58 | while order: 59 | 60 | current_lines = {} 61 | current_lines_count = 0 62 | 63 | current_chunk = order[:lines_in_memory] 64 | current_chunk_dict = { x: 1 for x in current_chunk } # faster "in" 65 | current_chunk_length = len( current_chunk ) 66 | 67 | order = order[lines_in_memory:] 68 | 69 | i_f.seek( 0 ) 70 | if preserve_headers: 71 | i_f.readline() 72 | 73 | count = 0 74 | 75 | for line in i_f: 76 | if count in current_chunk_dict: 77 | current_lines[count] = line 78 | current_lines_count += 1 79 | if current_lines_count == current_chunk_length: 80 | break 81 | count += 1 82 | if count % 100000 == 0: 83 | print count 84 | 85 | print "writing..."
86 | 87 | for l in current_chunk: 88 | o_f.write( current_lines[l] ) 89 | 90 | lines_saved = current_chunk_length + epoch * lines_in_memory 91 | epoch += 1 92 | print "pass %s complete (%s lines saved)" % ( epoch, lines_saved ) -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 800 21 | 22 | params['seed'] = 523264626346 # 0.85533 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | #exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub251.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 64378683511 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub252.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb3.py: -------------------------------------------------------------------------------- 1 | 
import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 132323786373 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub253.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 | labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 548563448943 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub254.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_2xgb5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed2.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed2.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | 15 
| labels = list(labels.iloc[:,0]) 16 | test_ids = list(test_ids.iloc[:,0]) 17 | 18 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 19 | 'eta':0.01, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 20 | 'min_child_weight':3, 'max_delta_step':3} 21 | num_rounds = 800 22 | 23 | params['seed'] = 14357846377 # 0.85533 24 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 25 | #xgb.cv(params, dtrain, num_rounds, nfold=4) 26 | #exit() 27 | 28 | clf = xgb.train(params, dtrain, num_rounds) 29 | dtest = xgb.DMatrix(test, missing = np.nan) 30 | test_preds_xgb = clf.predict(dtest) 31 | 32 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 33 | submission = submission[['ID', 'Disbursed']] 34 | submission.to_csv("temp_submission/Sub255.csv", index = False) 35 | 36 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_rf.py: -------------------------------------------------------------------------------- 1 | 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from sklearn.ensemble import RandomForestClassifier 6 | from sklearn.cross_validation import cross_val_score, cross_val_predict 7 | 8 | train = pd.read_csv("Train.csv") 9 | test = pd.read_csv("Test.csv") 10 | submission = pd.read_csv("sample_submission.csv") 11 | 12 | salary_acc = train.Salary_Account.value_counts(dropna=False) 13 | salary_acc_rare = list(salary_acc[salary_acc<40].index) 14 | train.ix[train['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 15 | 16 | train2 = train.copy()#[~pd.isnull(train['Loan_Amount_Applied'])] 17 | 18 | id_train = train['ID'] 19 | label = train2['Disbursed'] 20 | 21 | dropCols = ['ID', 'LoggedIn', 'Disbursed', 'DOB', 'Lead_Creation_Date', 'City', 'Employer_Name'] 22 | train2.drop(dropCols, axis=1, inplace = True) 23 | 24 | y_train = label 25 | X_train = pd.get_dummies(train2) 26 | 27 | # # Test set preparation 28 | test.ix[test['Salary_Account'].isin(salary_acc_rare), "Salary_Account"] = "Others" 29 | testdropcols = list(set(dropCols)-set(['LoggedIn', 'Disbursed'])) 30 | test2 = test.drop(testdropcols, axis=1) 31 | 32 | X_test = pd.get_dummies(test2) 33 | missingCols = list(set(X_train.columns)-set(X_test.columns)) 34 | for col in missingCols: 35 | X_test[col] = 0 36 | X_test = X_test[X_train.columns] 37 | assert X_train.columns.equals(X_test.columns) 38 | 39 | # # Modeling 40 | X_train_2 = X_train.fillna(-999) 41 | X_test_2 = X_test.fillna(-999) 42 | 43 | # from sklearn.cross_validation import KFold 44 | # kf = KFold(len(X_train_2), n_folds=4) 45 | # scores = cross_val_score(clf, X_train_2, y_train, scoring='roc_auc', cv=kf) 46 | # print "CV:", np.mean(scores), "+/-", np.std(scores), "All:", scores 47 | # CV: 0.831889207925 +/- 0.0109754348042 All: [ 0.82381549 0.82907869 0.85055107 0.82411158] 48 | seeds = [31121421,53153,5245326,6536,75] 49 | numbers = [151,152,153,154,155] 50 | 51 | for i in range(5): 52 | clf = RandomForestClassifier(n_estimators=360, max_depth=9, criterion = 'entropy', min_samples_split=2, bootstrap = False, n_jobs=-1, random_state=seeds[i]) 53 | clf.fit(X_train_2, y_train) 54 | test_preds = clf.predict_proba(X_test_2)[:,1] 55 | print("RF %s done" % i) 56 | 57 | submission = pd.DataFrame({'ID':test['ID'], 'Disbursed':test_preds}) 58 | submission = submission[['ID', 'Disbursed']] 59 | submission.to_csv("temp_submission/Sub%s.csv" % str(numbers[i]), index = False) 
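Note: a minimal sketch of how the five seed-bagged random-forest submissions written above (temp_submission/Sub151.csv to Sub155.csv) could be blended with a percentile-rank average. Since the metric is AUC, only the ordering of the scores matters, so averaging ranks is a common, scale-free way to combine seed variants. The function name rank_average and the output path are illustrative assumptions only, not the repository's actual ensembling step, which may differ.

import pandas as pd

def rank_average(paths, id_col='ID', pred_col='Disbursed'):
    # Convert each submission's scores to percentile ranks, then average the ranks.
    # AUC depends only on prediction order, so this ignores differences in score scale.
    merged = None
    for i, path in enumerate(paths):
        sub = pd.read_csv(path)[[id_col, pred_col]]
        sub[pred_col] = sub[pred_col].rank(pct=True)
        sub = sub.rename(columns={pred_col: 'rank_%d' % i})
        merged = sub if merged is None else merged.merge(sub, on=id_col)
    merged[pred_col] = merged.drop(id_col, axis=1).mean(axis=1)
    return merged[[id_col, pred_col]]

# Hypothetical usage with the outputs of train_rf.py above:
# blend = rank_average(['temp_submission/Sub%d.csv' % n for n in range(151, 156)])
# blend.to_csv('temp_submission/rf_rank_blend.csv', index=False)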
-------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 523264626346 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub241.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 64378683511 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub242.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb3.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = 
pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 132323786373 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub243.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb4.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 548563448943 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub244.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/Weeklong/train_xgb5.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | import xgboost as xgb 5 | from sklearn.cross_validation import cross_val_score, cross_val_predict 6 | 7 | # # Modeling 8 | 9 | train = pd.read_csv("temp_data/train_preprocessed.csv") 10 | test = pd.read_csv("temp_data/test_preprocessed.csv") 11 | labels = pd.read_csv("temp_data/train_labels.csv", header = None) 12 | test_ids = pd.read_csv("temp_data/test_ids.csv", header = None) 13 | 14 | labels = list(labels.iloc[:,0]) 15 | test_ids = list(test_ids.iloc[:,0]) 16 | 17 | params = {'booster':'gbtree', 'objective':'binary:logistic', 'max_depth':9, 'eval_metric':'auc', 18 | 'eta':0.02, 'silent':1, 'nthread':4, 'subsample': 0.9, 'colsample_bytree':0.9, 'scale_pos_weight': 
1, 19 | 'min_child_weight':3, 'max_delta_step':3} 20 | num_rounds = 400 21 | 22 | params['seed'] = 14357846377 23 | dtrain = xgb.DMatrix(train, labels, missing=np.nan) 24 | # xgb.cv(params, dtrain, num_rounds, nfold=4) 25 | # exit() 26 | 27 | clf = xgb.train(params, dtrain, num_rounds) 28 | dtest = xgb.DMatrix(test, missing = np.nan) 29 | test_preds_xgb = clf.predict(dtest) 30 | 31 | submission = pd.DataFrame({ 'ID':test_ids, 'Disbursed':test_preds_xgb}) 32 | submission = submission[['ID', 'Disbursed']] 33 | submission.to_csv("temp_submission/Sub245.csv", index = False) 34 | 35 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/XGB Tuning guide.md: -------------------------------------------------------------------------------- 1 | Tuning and CV strategy for XGB: 2 | ============================== 3 | 4 | Typically, people use 5 folds; you can make your own choice. To check the reliability of the CV estimate, some people use 10-fold as well. 5 | 6 | Steps: 7 | ----- 8 | 1. Decide 'n' in n-fold. Stick to it for the complete analysis. 9 | 2. Create a baseline score using a simple model. 10 | 3. Now use the XGBoost default settings and establish another XGB baseline score. 11 | 4. Set the number of boosting rounds to 10000 and use a tiny learning rate (eta) of 0.01. 12 | 5. Try step (4) for various values of max_depth. 13 | 6. While doing step (4), monitor the progress and note at what tree number the model starts overfitting. 14 | 7. After you're done with steps 1-6, you will have reached a saturation score. 15 | 8. Now comes some magic! Start using subsample and, tada, your score improves. 16 | 9. Use colsample_bytree, then scale_pos_weight, to improve your score further. 17 | 10. Try using max_delta_step and gamma too (both a little tricky to tune). 18 | -------------------------------------------------------------------------------- /Analytics_Vidhya_3.X_Hackathon/requirements.md: -------------------------------------------------------------------------------- 1 | Requirements: 2 | ============= 3 | 4 | - Python 5 | - Pandas, NumPy, SciPy, scikit-learn (latest versions) 6 | - XGBoost - https://github.com/dmlc/xgboost 7 | - PyPy (to run the FTRL code faster) 8 | -------------------------------------------------------------------------------- /D-hack/Code.py: -------------------------------------------------------------------------------- 1 | #Code for D-Hack Weeklong version 2 | 3 | #importing libraries 4 | from sklearn.ensemble import RandomForestClassifier 5 | #from sklearn.ensemble import AdaBoostClassifier 6 | from sklearn.ensemble import GradientBoostingClassifier 7 | from sklearn.metrics import confusion_matrix 8 | import xgboost as xgb 9 | import pandas as pd 10 | import numpy as np 11 | from sklearn import preprocessing 12 | 13 | from sklearn import ensemble 14 | import random 15 | 16 | #Importing i/p files 17 | train=pd.read_csv('E:/DS/DHack/train_FBFog7d.csv') 18 | test=pd.read_csv('E:/DS/DHack/Test_L4P23N3.csv') 19 | train.head() 20 | 21 | #Pre-processing 22 | def convert(data): 23 | number = preprocessing.LabelEncoder() 24 | data['Var1'] = number.fit_transform(data.Var1) 25 | data['WorkStatus'] = number.fit_transform(data.WorkStatus) 26 | data['Divorce'] = number.fit_transform(data.Divorce) 27 | data['Widowed'] = number.fit_transform(data.Widowed) 28 | data['Education'] = number.fit_transform(data.Education) 29 | data['Residence_Region'] = number.fit_transform(data.Residence_Region) 30 | data['babies'] = number.fit_transform(data.babies) 31 | data['preteen'] = number.fit_transform(data.preteen)
32 | data['teens'] = number.fit_transform(data.teens) 33 | data['income'] = number.fit_transform(data.income) 34 | data['Engagement_Religion'] = number.fit_transform(data.Engagement_Religion) 35 | data['Var2'] = number.fit_transform(data.Var2) 36 | data['TVhours'] = number.fit_transform(data.TVhours) 37 | data['Gender'] = number.fit_transform(data.Gender) 38 | data['Unemployed10'] = number.fit_transform(data.Unemployed10) 39 | data['Alcohol_Consumption'] = number.fit_transform(data.Alcohol_Consumption) 40 | data=data.fillna(-999) 41 | return data 42 | new = train.append(test) 43 | new = convert(new) 44 | train = new[0:10357] 45 | test = new[10357:] 46 | 47 | #Features 48 | Columns_names = train.columns.values 49 | features = Columns_names[0:np.size(Columns_names)] 50 | features = np.delete(features,[5,6]) 51 | features 52 | 53 | #Creating Data set for training 54 | x_train = train[list(features)].values 55 | y_train = train['Happy'].values 56 | x_test=test[features].values 57 | 58 | 59 | ############## RF Models ############ 60 | #0 Rf_model - 900 61 | rf = ensemble.RandomForestClassifier(n_estimators=900,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 62 | rf.fit(x_train, y_train) 63 | Happy = rf.predict(x_test) 64 | test['Happy_Rf_900']=Happy[:] 65 | 66 | #1 Rf_model - 850 67 | rf = ensemble.RandomForestClassifier(n_estimators=850,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 68 | rf.fit(x_train, y_train) 69 | Happy = rf.predict(x_test) 70 | test['Happy_Rf_850']=Happy[:] 71 | 72 | #2 Rf_model - 800 73 | rf = ensemble.RandomForestClassifier(n_estimators=800,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 74 | rf.fit(x_train, y_train) 75 | Happy = rf.predict(x_test) 76 | test['Happy_Rf_800']=Happy[:] 77 | 78 | #3 Rf_model - 750 79 | rf = ensemble.RandomForestClassifier(n_estimators=750,max_depth=16,criterion='entropy',max_features=6, min_samples_leaf=35, n_jobs=4, random_state=0) 80 | rf.fit(x_train, y_train) 81 | Happy = rf.predict(x_test) 82 | test['Happy_Rf_750']=Happy[:] 83 | 84 | # Mapping function for XGB predictions 85 | def happy_to_scores2(x): 86 | if x == 2: 87 | return 'Very Happy' 88 | elif x == 1: 89 | return 'Pretty Happy' 90 | elif x == 0: 91 | return 'Not Happy' 92 | 93 | ########### XG boost Models ############# 94 | number = preprocessing.LabelEncoder() #label encoder for the target variable 95 | xgtrain = xgb.DMatrix(x_train,label=number.fit_transform(y_train),missing=-999) 96 | xgtest = xgb.DMatrix(x_test,missing=-999) 97 | 98 | # Defining parameters 99 | params = {} 100 | params["objective"] = "multi:softmax" 101 | params["num_class"] = 3 102 | params["eta"] = 0.01 103 | params["min_child_weight"] = 15 104 | params["subsample"] = 0.7 105 | params["colsample_bytree"] = 0.7 106 | params["max_depth"] = 6 107 | params["seed"] = 0 108 | 109 | plst = list(params.items()) 110 | 111 | #4 XGB model : num_round - 390 112 | num_rounds = 390 113 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 114 | label = pd.DataFrame(model_xgb.predict(xgtest)) 115 | label = label[0].apply(lambda x: happy_to_scores2(x)) 116 | test['Happy_XGB_390']=label[:] 117 | 118 | #5 XGB model : num_round - 340 119 | num_rounds = 340 120 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 121 | label = pd.DataFrame(model_xgb.predict(xgtest)) 122 | label = label[0].apply(lambda x: happy_to_scores2(x)) 123 | test['Happy_XGB_340']=label[:] 124 | 125 | #6 XGB model : num_round - 290 126 | num_rounds = 290
127 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 128 | label = pd.DataFrame(model_xgb.predict(xgtest)) 129 | label = label[0].apply(lambda x: happy_to_scores2(x)) 130 | test['Happy_XGB_290']=label[:] 131 | 132 | #7 XGB model : num_round - 240 133 | num_rounds = 240 134 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 135 | label = pd.DataFrame(model_xgb.predict(xgtest)) 136 | label = label[0].apply(lambda x: happy_to_scores2(x)) 137 | test['Happy_XGB_240']=label[:] 138 | 139 | #8 XGB model : num_round - 190 140 | num_rounds = 190 141 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 142 | label = pd.DataFrame(model_xgb.predict(xgtest)) 143 | label = label[0].apply(lambda x: happy_to_scores2(x)) 144 | test['Happy_XGB_190']=label[:] 145 | 146 | #9 XGB model : num_round - 140 147 | num_rounds = 140 148 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 149 | label = pd.DataFrame(model_xgb.predict(xgtest)) 150 | label = label[0].apply(lambda x: happy_to_scores2(x)) 151 | test['Happy_XGB_140']=label[:] 152 | 153 | #10 XGB model : num_round - 90 154 | num_rounds = 90 155 | model_xgb = xgb.train(plst, xgtrain, num_rounds) 156 | label = pd.DataFrame(model_xgb.predict(xgtest)) 157 | label = label[0].apply(lambda x: happy_to_scores2(x)) 158 | test['Happy_XGB_90']=label[:] 159 | 160 | ########### Gradient Boosting Models ################## 161 | 162 | #11 GB model - 1100 163 | clf = GradientBoostingClassifier(n_estimators=1100, learning_rate=0.01) 164 | clf.fit(x_train, y_train) 165 | Happy = clf.predict(x_test) 166 | test['Happy_GB_1100']=Happy[:] 167 | 168 | #12 GB model - 1200 169 | clf = GradientBoostingClassifier(n_estimators=1200, learning_rate=0.01) 170 | clf.fit(x_train, y_train) 171 | Happy = clf.predict(x_test) 172 | test['Happy_GB_1200']=Happy[:] 173 | 174 | #13 GB model - 1300 175 | clf = GradientBoostingClassifier(n_estimators=1300, learning_rate=0.01) 176 | clf.fit(x_train, y_train) 177 | Happy = clf.predict(x_test) 178 | test['Happy_GB_1300']=Happy[:] 179 | 180 | #14 GB model - 1400 181 | clf = GradientBoostingClassifier(n_estimators=1400, learning_rate=0.01) 182 | clf.fit(x_train, y_train) 183 | Happy = clf.predict(x_test) 184 | test['Happy_GB_1400']=Happy[:] 185 | 186 | Test_final = test[['ID','Happy_Rf_900','Happy_Rf_850','Happy_Rf_800','Happy_Rf_750','Happy_XGB_390','Happy_XGB_340','Happy_XGB_290','Happy_XGB_240','Happy_XGB_190','Happy_XGB_140','Happy_XGB_90','Happy_GB_1100','Happy_GB_1200','Happy_GB_1300','Happy_GB_1400']].copy() 187 | Test_final.to_csv('E:/DS/DHack/Solution_ensemble_15.csv',index=False) 188 | 189 | # After this did a maximum vote ensemble in excel, as I am not so good with Python :P Happy Hacking! 190 | -------------------------------------------------------------------------------- /D-hack/README.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon D Hack, 24th and 25th October, 2015 - Decode D Dalai Lama! 2 | 3 | http://datahack.analyticsvidhya.com/contest/the-d-hack 4 | 5 | ###### My approach for the hackathon is as follows: 6 | 7 | 1. Creating a data dictionary by understanding levels of data and gaps in the data 8 | 9 | 2. Converting all the categorical variables into 1/0 encoded variables 10 | 11 | 3. Treating missing values as a separate class by imputing them with -999 12 | 13 | 4. The evaluation metric used in the hackathon was very unconventional: it penalizes misclassification using several rules and cannot be directly optimized by any conventional machine learning algorithm. So, given the lack of time, the best bet was to make a robust model that doesn't deviate from the Public to the Private Leaderboard 14 | 15 |
5. Made 15 models: 4 - Random Forest, 7 - XGB, 4 - GB 16 | 17 | 6. Did a maximum vote ensemble for the final solution 18 | 19 | ###### Extras: 20 | 21 | 1. My single model was giving me a 0.71 score on the Public LB, but I knew it was overfitting; my ensemble model was giving around 0.706 on the Public LB, but I considered it to be more robust 22 | 23 | 2. The 0.71 single model scored around 0.68329 on the Private LB, while the ensemble model scored around 0.69304 on the Private LB, so the assumption that the ensemble model would be more robust proved to be right 24 | 25 | 3. I tried one more thing just for fun: I built the eval metric in Excel and extracted the probability of each class from the model. After that I optimized the weight of each class probability to maximize the eval metric. I didn't use it in the final solution, but it would have been a fun thing to try :) 26 | -------------------------------------------------------------------------------- /Hacker-Earth---Will-Bill-Solve-it-/README.md: -------------------------------------------------------------------------------- 1 | # Approach and Codes for Hacker-Earth Will-Bill-Solve-it? 2 | 3 | https://www.hackerearth.com/machine-learning-india-hacks-2016/machine-learning/will-bill-solve-it/ 4 | 5 | Finished 4th on the Public LB (AUC 0.833; winners 0.834) 6 | 7 | ## Problem Statement: 8 | HackerEarth is a community of programmers. Thousands of hackers solve problems on HackerEarth every day to improve their programming skills or win prizes. These hackers can be beginners who are new to programming, or experts who know the solution in a blink. There is a pattern to everything, and this problem is about finding those patterns and the problem-solving behaviour of the users. 9 | 10 | Finding these patterns will be of immense help to the problem solvers, as it will allow us to suggest relevant problems to solve and offer solutions when they seem to be stuck. The opportunities are diverse, and you are entrusted with the task of predicting them. 11 | 12 | ## Data Sets: 13 | Both the training and testing datasets consist of 3 files: 14 | 15 | ### 1) User File: 16 | Attributes of a user:
17 |
18 | user_id - the user id
19 | skills - all his skills separated by the delimiter '|'
20 | solved_count - number of problems solved by the user
21 | attempts - total number of incorrect submissions done by the user
22 | user_type : type of user (S - Student, W - Working, NA - No Information Available)
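Since `skills` is pipe-delimited, the per-skill binary flags and success-rate feature described under Approach below could be derived roughly as follows (a sketch only; `users_train.csv` and the exact column spellings are assumptions, not files from this repo):

```python
import pandas as pd

# Assumed raw user file; not included in this repository.
users = pd.read_csv("users_train.csv")

# One 1/0 column per skill, splitting on the '|' delimiter described above.
skill_flags = users["skills"].str.get_dummies(sep="|")
users = pd.concat([users, skill_flags], axis=1)

# Count of skills per user and the success-rate percentage from the Approach section.
users["skill_count"] = skill_flags.sum(axis=1)
users["success_rate_pct"] = users["solved_count"] * 100 / (users["attempts"] + users["solved_count"])
# Note: users with zero attempts and zero solves would need special handling here.
```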
23 | 24 | ### 2) Problem File: 25 | Attribute related to a Problem :
26 |
27 | problem_id - the id of the problem
28 | level - difficulty of the problem (Very-Easy, Easy, Easy-Medium, Medium, Medium-Hard, Hard)
29 | accuracy - the accuracy score for the problem
30 | solved_count - number of people who have solved it
31 | error_count - number of people who have solved it incorrectly
32 | rating - star (quality) rating of the problem on scale of 0-5
33 | tag1 - tag of the problem representing the type e.g. Data Structures
34 | tag2 - tag of the problem
35 | tag3 - tag of the problem
36 | tag4 - tag of the problem
37 | tag5 - tag of the problem
38 | 39 | ### 3) Submissions File: 40 | Problem-user interaction and final results for each attempt a user made to solve a particular problem.
41 |
42 | user_id - the id of the user who made a submission
43 | problem_id - the id of the problem that was attempted
44 | solved_status - indicates whether the submission was correct (SO : Solved or Correct solution, AT : Attempted or Incorrect solution )
45 | result - result of the code execution (PAC: Partially Accepted, AC : Accepted, TLE : Time limit exceeded, CE : Compilation Error, RE : Runtime Error, WA : Wrong Answer)
46 | language_used - the language used by the user to code the solution
47 | execution_time - the execution time of the solution
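As a rough illustration (a sketch, not code shipped in this repo) of how these fields feed the target variable described under Approach below; the file name `submissions_train.csv` and the exact threshold are assumptions:

```python
import pandas as pd

# Assumed raw training submissions file; not included in this repository.
subs = pd.read_csv("submissions_train.csv")

# Drop attempts with unknown solved status (UK), as described under Approach.
subs = subs[subs["solved_status"] != "UK"]

# Flag each attempt as solved (SO) or not, then roll up to one row per (user_id, problem_id).
subs["solved"] = (subs["solved_status"] == "SO").astype(int)
rolled = subs.groupby(["user_id", "problem_id"], as_index=False)["solved"].sum()

# Final binary target; the Approach text says '> 1', though '>= 1' may be what was intended.
rolled["target"] = (rolled["solved"] > 1).astype(int)
```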
48 | 49 | ## Approach: 50 | ### Preprocessing 51 | #### User File: 52 | 1. Created features from user skills: there are a total of 24 skills in the skills column, so I created a binary flag for each skill 53 | 2. Counted the total number of skills a user has 54 | 3. User Success Rate Percentage: Solved count * 100 / (Attempts + Solved count) 55 | 56 | #### Problem File: 57 | 1. Count of tags: counted the number of tags present for a problem (i.e., 5 minus the number of NA tags) 58 | 2. Created a dictionary file for the 81 unique tags present in Tag1 to Tag5, and bucketed them into 17 super categories based on business understanding of the tags 59 | 3. Made binary features (1/0) for each of the 17 super categories, set to 1 if the respective tag is present for the problem 60 | 4. Imputed missing values with zero 61 | 5. Treated text variables as categorical variables and encoded each with numeric values 62 | 6. Accuracy Measure: (Solved count * 100) / (Solved count + Error count) 63 | 64 | #### Submission File (only for the training submission file): 65 | 1. Removed entries which have solved status UK (Unknown) 66 | 2. Created a 1/0 target variable: 1 if solved status == "SO", else 0 67 | 3. Rolled up the data to the User ID, Problem ID level and summed the solved status (see the sketch after the Submissions File fields above) 68 | 4. Created the final target variable (1/0) by checking whether the summed solved status is > 1 69 | 70 | Merged all 3 files to get the final training and testing sets 71 | 72 | ### Modelling 73 | 1. I trained 3 XGBoost models with different numbers of rounds, but the same probability cutoffs 74 | 2. Did a vote ensemble of the three models 75 | 76 | 77 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Repository of Codes 2 | 3 | This is a compiled repository of codes I wrote in various competitions.
4 | 5 | -------------------------------------------------------------------------------- /minnemudac/AvgWaterQualityByLake&Season.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LAKE_NAME 3 | , YEAR(START_DATE) AS Year 4 | , CASE 5 | WHEN MONTH(START_DATE) IN (12, 1, 2) THEN 'Winter' 6 | WHEN MONTH(START_DATE) BETWEEN 3 AND 5 THEN 'Spring' 7 | WHEN MONTH(START_DATE) BETWEEN 6 AND 9 THEN 'Summer' 8 | ELSE 'Fall' 9 | END AS Season 10 | 11 | -- For ordering 12 | , CASE 13 | WHEN MONTH(START_DATE) IN (12, 1, 2) THEN 4 14 | WHEN MONTH(START_DATE) BETWEEN 3 AND 5 THEN 1 15 | WHEN MONTH(START_DATE) BETWEEN 6 AND 9 THEN 2 16 | ELSE 3 17 | END AS SeasonNum 18 | 19 | , AVG(RECREATIONAL_SUITABILITY_RESULT) AS RECREATIONAL_SUITABILITY_RESULT 20 | , AVG(PHYSICAL_CONDITION_RESULT) AS PHYSICAL_CONDITION_RESULT 21 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 22 | , AVG(TOTAL_PHOSPHORUS_RESULT) AS TOTAL_PHOSPHORUS_RESULT 23 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 24 | WHERE 25 | 26 | -- Worst lakes: Top 10 lakes with lowest Secchi depths 27 | LAKE_NAME IN ('Benton Lake','Hazeltine Lake','Cobblecrest Lake','Downs Lake','Penn Lake' 28 | ,'Winkler Lake','Meadow Lake','Cornelia Lake','Cedar Island Lake','Gaystock Lake') 29 | 30 | -- Best lakes: Top 10 lakes with highest Secchi depths 31 | -- LAKE_NAME IN ('West Boot Lake','Brickyard Clayhole Lake','Big Carnelian Lake','Jane Lake' 32 | -- ,'Halfbreed Lake' /*What the hell kind of name is this?!*/,'Little Long Lake','Mays Lake' 33 | -- ,'Christmas Lake','Little Carnelian Lake','Square Lake') 34 | 35 | 36 | AND SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 37 | 38 | GROUP BY LAKE_NAME, Year, Season, SeasonNum 39 | ORDER BY LAKE_NAME, Year, SeasonNum 40 | -------------------------------------------------------------------------------- /minnemudac/AvgWaterQualityByLake.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | LAKE_NAME 3 | 4 | , AVG(RECREATIONAL_SUITABILITY_RESULT) AS RECREATIONAL_SUITABILITY_RESULT 5 | , AVG(PHYSICAL_CONDITION_RESULT) AS PHYSICAL_CONDITION_RESULT 6 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 7 | , AVG(TOTAL_PHOSPHORUS_RESULT) AS TOTAL_PHOSPHORUS_RESULT 8 | , COUNT(*) AS NumberRecords 9 | 10 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 11 | WHERE 12 | SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 13 | GROUP BY LAKE_NAME 14 | HAVING COUNT(*) > 50 -- Removes ~1/3 of the data, but also removes unreliable lakes 15 | 16 | ORDER BY SECCHI_DEPTH_RESULT ASC -- Using Secchi depth since physical condition/recreational condition isn't available for all lakes 17 | -------------------------------------------------------------------------------- /minnemudac/DuplicatePropertyCheck.sql: -------------------------------------------------------------------------------- 1 | WITH test AS ( 2 | SELECT 3 | CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) = 1 THEN 1 ELSE NULL END AS Original 4 | , CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) = 2 THEN 1 ELSE NULL END AS Duplicate 5 | , CASE WHEN ROW_NUMBER() OVER (PARTITION BY centroid_long, centroid_lat) > 2 THEN 1 ELSE NULL END AS MoreThanTwo 6 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 7 | ) 8 | 9 | SELECT 10 | SUM(Original) AS Original 11 | , SUM(Duplicate) AS Duplicate 12 | , SUM(MoreThanTwo) AS MoreThanTwo 13 | FROM test 14 | 
-------------------------------------------------------------------------------- /minnemudac/NumberPropertyTypesPerLake.sql: -------------------------------------------------------------------------------- 1 | -- Disable "Use Legacy SQL" under Google BigQuery to support CTEs 2 | 3 | WITH lake AS ( 4 | SELECT 5 | LAKE_NAME AS LakeName 6 | , DNR_ID_SITE_NUMBER 7 | FROM `datadive-142319.mces_lakes.1999_2014_monitoring_data` 8 | GROUP BY LAKE_NAME, DNR_ID_SITE_NUMBER 9 | ) 10 | 11 | , residential AS ( 12 | SELECT 13 | USE1_DESC AS PropertyType 14 | , centroid_long 15 | , centroid_lat 16 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 17 | WHERE LTRIM(LOWER(USE1_DESC)) LIKE '1__%' 18 | OR LOWER(USE1_DESC) LIKE '%residential%' 19 | OR LOWER(USE1_DESC) LIKE '%res%' 20 | OR LOWER(USE1_DESC) LIKE '%house%' 21 | OR LOWER(USE1_DESC) LIKE '%condo%' 22 | OR LOWER(USE1_DESC) LIKE '%apartment%' 23 | OR LOWER(USE1_DESC) LIKE '%apt%' 24 | OR LOWER(USE1_DESC) LIKE '%plex%' 25 | OR LOWER(USE1_DESC) LIKE '%bungalo%' 26 | OR LOWER(USE1_DESC) LIKE '%housing%' 27 | OR LOWER(USE1_DESC) LIKE '%home%' 28 | OR LOWER(USE1_DESC) LIKE '%family%' 29 | GROUP BY USE1_DESC, centroid_long, centroid_lat 30 | ) 31 | 32 | , agriculture AS ( 33 | SELECT 34 | USE1_DESC AS PropertyType 35 | , centroid_long 36 | , centroid_lat 37 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 38 | WHERE LOWER(USE1_DESC) LIKE '2__%' 39 | OR LOWER(USE1_DESC) LIKE '%ag%' 40 | OR LOWER(USE1_DESC) LIKE '%farm%' 41 | OR LOWER(USE1_DESC) LIKE '%rural%' 42 | ) 43 | 44 | , commercial AS ( 45 | SELECT 46 | USE1_DESC AS PropertyType 47 | , centroid_long 48 | , centroid_lat 49 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 50 | WHERE LOWER(USE1_DESC) LIKE '3__%' 51 | OR LOWER(USE1_DESC) LIKE '%commercial%' 52 | OR LOWER(USE1_DESC) LIKE '%machinery%' 53 | OR LOWER(USE1_DESC) LIKE '%recreational%' 54 | OR LOWER(USE1_DESC) LIKE '%golf%' 55 | OR LOWER(USE1_DESC) LIKE '%coop%' 56 | ) 57 | 58 | , industrial AS ( 59 | SELECT 60 | USE1_DESC AS PropertyType 61 | , centroid_long 62 | , centroid_lat 63 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 64 | WHERE LOWER(USE1_DESC) LIKE '%ind%' 65 | OR LOWER(USE1_DESC) = '305 industrial' 66 | ) 67 | 68 | , public AS ( 69 | SELECT 70 | USE1_DESC AS PropertyType 71 | , centroid_long 72 | , centroid_lat 73 | FROM `datadive-142319.metrogis_parcels.2015_tax_parcel_data` 74 | WHERE LOWER(USE1_DESC) LIKE '9__%' 75 | OR LOWER(USE1_DESC) LIKE '%public%' 76 | OR LOWER(USE1_DESC) LIKE '%muni%' 77 | OR LOWER(USE1_DESC) LIKE '%rail%' 78 | OR LOWER(USE1_DESC) LIKE '%church%' 79 | OR LOWER(USE1_DESC) LIKE '%school%' 80 | OR LOWER(USE1_DESC) LIKE '%forest%' 81 | OR LOWER(USE1_DESC) LIKE '%state%' 82 | OR LOWER(USE1_DESC) LIKE '%county%' 83 | OR LOWER(USE1_DESC) LIKE '%util%' 84 | OR LOWER(USE1_DESC) LIKE '%college%' 85 | OR LOWER(USE1_DESC) LIKE '%cem%' 86 | OR LOWER(USE1_DESC) LIKE '%common%' 87 | OR LOWER(USE1_DESC) LIKE '%road%' 88 | OR LOWER(USE1_DESC) LIKE '%fed%' 89 | OR LOWER(USE1_DESC) LIKE '%tax%' 90 | OR LOWER(USE1_DESC) LIKE '%dnr%' 91 | OR LOWER(USE1_DESC) LIKE '%charit%' 92 | OR LOWER(USE1_DESC) LIKE '%serv%' 93 | OR LOWER(USE1_DESC) LIKE '%hosp%' 94 | OR LOWER(USE1_DESC) LIKE '%park%' 95 | ) 96 | 97 | SELECT 98 | ROW_NUMBER() OVER (ORDER BY lake.LakeName) AS ID 99 | , lake.LakeName 100 | , COUNT(residential.PropertyType) AS ResidentialCount_2015 101 | , COUNT(agriculture.PropertyType) AS AgriculturalCount_2015 102 | , COUNT(commercial.PropertyType) AS 
CommercialCount_2015 103 | , COUNT(industrial.PropertyType) AS IndustrialCount_2015 104 | , COUNT(public.PropertyType) AS PublicCount_2015 105 | FROM lake 106 | JOIN `datadive-142319.sds_xref.parcel_to_water` AS intersection ON lake.DNR_ID_SITE_NUMBER = intersection.MCES_Map_Code1 107 | 108 | LEFT JOIN residential ON intersection.parcel_centroid_long = residential.centroid_long 109 | AND intersection.parcel_centroid_lat = residential.centroid_lat 110 | 111 | LEFT JOIN agriculture ON intersection.parcel_centroid_long = agriculture.centroid_long 112 | AND intersection.parcel_centroid_lat = agriculture.centroid_lat 113 | 114 | LEFT JOIN commercial ON intersection.parcel_centroid_long = commercial.centroid_long 115 | AND intersection.parcel_centroid_lat = commercial.centroid_lat 116 | 117 | LEFT JOIN industrial ON intersection.parcel_centroid_long = industrial.centroid_long 118 | AND intersection.parcel_centroid_lat = industrial.centroid_lat 119 | 120 | LEFT JOIN public ON intersection.parcel_centroid_long = public.centroid_long 121 | AND intersection.parcel_centroid_lat = public.centroid_lat 122 | 123 | GROUP BY lake.LakeName 124 | ORDER BY lake.LakeName ASC 125 | -------------------------------------------------------------------------------- /minnemudac/README.md: -------------------------------------------------------------------------------- 1 | # WELCOME TO SHERLOCK 2 | 3 | 4 | 5 | ### Density by Property Types Over Time 6 | 7 | Using properties joined to lakes in the intersection table 8 | 9 | 1) **PropertiesByYear**: Residential, Industrial, Commercial, Agricultural, and Public properties by lake from 2003 - 2015 using the "Number of Properties per Lake" query 10 | 11 | 2) **PropertiesPctChangeByYear**: YoY change by property type by lake. Percentages in decimal format (Ex. 2.5 indicates a 250% increase) 12 | 13 | 3) **PropertiesPctOfTotalByYear**: Percentage of total by lake by year by type for 2003 - 2015. -1 indicates a lack of the property type for that lake. 14 | -------------------------------------------------------------------------------- /minnemudac/Top&Bottom10LakesPerYear.sql: -------------------------------------------------------------------------------- 1 | /* Adjust line 8 to ASC for the top 10 lakes per year with the worst quality */ 2 | SELECT * 3 | FROM ( 4 | SELECT 5 | LAKE_NAME 6 | , YEAR(START_DATE) as Year 7 | , AVG(SECCHI_DEPTH_RESULT) AS SECCHI_DEPTH_RESULT 8 | , RANK(SECCHI_DEPTH_RESULT) OVER (PARTITION BY Year ORDER BY SECCHI_DEPTH_RESULT DESC) AS Rank 9 | FROM [datadive-142319:mces_lakes.1999_2014_monitoring_data] 10 | WHERE 11 | SEASONAL_LAKE_GRADE_RESULT IS NULL -- Ensures seasonal records are avoided 12 | AND SECCHI_DEPTH_RESULT IS NOT NULL 13 | AND YEAR(START_DATE) >= 1995 14 | GROUP BY LAKE_NAME, Year 15 | HAVING COUNT(*) > 5 16 | ) 17 | WHERE Rank <= 10 18 | ORDER BY Year DESC, Rank ASC -- Using Secchi depth since physical condition/recreational condition isn't available for all lakes 19 | --------------------------------------------------------------------------------