├── Loan Prediction III ├── AV Loan Prediction III - EDA.ipynb ├── EDA.py └── README.md ├── Lord Of The Machines ├── CType_vs_Prob.png ├── FinalSubmission.csv ├── LordOfTheMachines.py ├── README.md ├── SendHour_vs_Prob.png └── WeekdayType_vs_Prob.png ├── Mckinskey Healthcare Hackathon ├── EDA_MckinskeyHealthcareHackathon_AV.ipynb ├── README.md └── script.py └── README.md /Loan Prediction III/EDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib as mlt 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | train = pd.read_csv('train.csv') 8 | test = pd.read_csv('test.csv') 9 | 10 | print(train.columns) 11 | 12 | print('Missing values per attribute in train:') 13 | print(train.apply(lambda x: x.isnull().sum(), axis=0) * 100 / len(train)) 14 | 15 | print('Rows with missing values in train:') 16 | print(train.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(train)) 17 | 18 | 19 | print('Missing values per attribute in test:') 20 | print(test.apply(lambda x: x.isnull().sum(), axis=0) * 100 / len(test)) 21 | 22 | print('Rows with missing values in test:') 23 | print(test.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(train)) 24 | 25 | # Let's how each variable affects the target variable Loan_Status' individually 26 | # Map Y to 1 and N to 0 27 | train.Loan_Status = train.apply(lambda x: (1 if x['Loan_Status'] == 'Y' else 0),axis=1) 28 | # See percentage of loans apporved 29 | print(train.Loan_Status.mean() * 100) 30 | 31 | # Continuous 32 | field = 'ApplicantIncome' 33 | sns.distplot(train[train.Loan_Status == 1][field],color='g') 34 | sns.distplot(train[train.Loan_Status == 0][field],color='r') 35 | plt.legend(['Approve','Rejected']) 36 | plt.show() 37 | 38 | field = 'CoapplicantIncome' 39 | sns.distplot(train[train.Loan_Status == 1][field],color='g') 40 | sns.distplot(train[train.Loan_Status == 0][field],color='r') 41 | plt.legend(['Approve','Rejected']) 42 | plt.show() 43 | 44 | field = 'LoanAmount' 45 | sns.distplot(train[(~train.LoanAmount.isnull()) & (train.Loan_Status == 1)][field],color='g') 46 | sns.distplot(train[(~train.LoanAmount.isnull()) & (train.Loan_Status == 0)][field],color='r') 47 | plt.legend(['Approve','Rejected']) 48 | plt.show() 49 | 50 | field = 'Loan_Amount_Term' 51 | sns.distplot(train[(~train.Loan_Amount_Term.isnull()) & (train.Loan_Status == 1)][field],color='g') 52 | sns.distplot(train[(~train.Loan_Amount_Term.isnull()) & (train.Loan_Status == 0)][field],color='r') 53 | plt.legend(['Approve','Rejected']) 54 | plt.show() 55 | 56 | print( train.pivot_table(index='Gender',values='Loan_Status',aggfunc=('mean','count')) ) 57 | 58 | print( train.pivot_table(index='Married',values='Loan_Status',aggfunc=('mean','count')) ) 59 | 60 | print( train.pivot_table(index='Dependents',values='Loan_Status',aggfunc=('mean','count')) ) 61 | 62 | print( train.pivot_table(index='Education',values='Loan_Status',aggfunc=('mean','count')) ) 63 | 64 | print( train.pivot_table(index='Self_Employed',values='Loan_Status',aggfunc=('mean','count')) ) 65 | 66 | print( train.pivot_table(index='Credit_History',values='Loan_Status',aggfunc=('mean','count')) ) 67 | 68 | print( train.pivot_table(index='Property_Area',values='Loan_Status',aggfunc=('mean','count')) ) 69 | 70 | print( train.pivot_table(index='Loan_Amount_Term',values='Loan_Status',aggfunc=('mean','count')) ) 71 | 72 | # On first observation: 73 | # Credit_History, Education, Married, Gender, Dependents and Property_Area seem to affect the chance of getting a loan approved 74 | # Credit_History seems to be the most significant variable and also has the most % of missing values which need to be imputed 75 | 76 | # Imputing Missing Values 77 | # Gender 78 | # Marreid 79 | # Dependents 80 | # Self_Employed 81 | # LoanAmount 82 | # Loan_Amount_Term 83 | # Credit_History 84 | 85 | # Credit_History seems to be the most important variable. 0 -> 0 and 1-> 1 hence we can use that to impute missing values in the train set 86 | train.loc[(train.Credit_History.isnull()) & (train.Loan_Status == 1),'Credit_History'] = 1 87 | train.loc[(train.Credit_History.isnull()) & (train.Loan_Status == 0),'Credit_History'] = 0 88 | # This takes care of missing values in the train set 89 | # Now let's look at the combined train and test set to impute test set 90 | combined = pd.concat([train,test]) 91 | 92 | # Let's see if credit_history is determined by any other variable 93 | cols = ['Self_Employed', 'Gender', 'Married' ,'Education', 'Dependents', 'Property_Area'] 94 | pivots = [] 95 | for col in cols: 96 | pivots.append( ( combined.pivot_table(index=col,values='Credit_History',aggfunc=('mean')) ) ) 97 | 98 | for pivot in pivots: 99 | pivot.plot(kind='bar') 100 | plt.show() 101 | 102 | field = 'ApplicantIncome' 103 | sns.distplot(combined[(~combined.ApplicantIncome.isnull()) & (combined.Credit_History == 1)][field],color='g') 104 | sns.distplot(combined[(~combined.ApplicantIncome.isnull()) & (combined.Credit_History == 0)][field],color='r') 105 | plt.legend(['Credit_History 1','Credit_History 2']) 106 | plt.show() 107 | 108 | # Can't see any clear distinction 109 | # Let's replace missing values with groupwise mode 110 | pivot = train.pivot_table( 111 | index=['Self_Employed', 'Property_Area', 'Dependents', 'Gender', 'Education', 'Married'], 112 | values='Credit_History', aggfunc=('mean', 'count')) 113 | 114 | missed = 0 115 | chPredictions = [] 116 | for i in range(len(test)): 117 | if np.isnan(test.loc[i,].Credit_History):#.isnull(): 118 | #print(i) 119 | chance = 0.0 120 | try: 121 | chance = pivot.loc[test.loc[i,].Self_Employed].loc[test.loc[i,].Property_Area].loc[test.loc[i,].Dependents].loc[test.loc[i,].Gender].loc[test.loc[i,].Education].loc[test.loc[i,].Married]['mean'] 122 | except: 123 | chance = 1.0 124 | missed = missed + 1 125 | pred = 1.0 if chance > 0.75 else 0.0 126 | chPredictions.append(pred) 127 | test.loc[i,].Credit_History = pred 128 | #print(missed) 129 | 130 | test.loc[np.isnan(test.Credit_History),"Credit_History"] = chPredictions 131 | 132 | # Let's look at other pair-plots 133 | # Let's look at other missing variables 134 | missingColsCat = ['Gender', 'Married', 'Dependents', 'Self_Employed','Credit_History'] 135 | for i in range(len(missingColsCat)): 136 | for j in range(len(missingColsCat)): 137 | if i != j: 138 | pd.crosstab(train[missingColsCat[i]],train[missingColsCat[j]]).plot(kind='bar') 139 | plt.show() 140 | # We will look for variables such that the distribution of the other variable grouped by this variable is different. 141 | # # Ow it means that the first variable doesn't affect the second variable 142 | combined = pd.concat([train,test]) 143 | # Given gender we can talk about marriage distinctly. Male -> Married and Female -> Not Married 144 | combined.loc[(combined.Gender.isnull()) & (combined.Gender == 'Male'),'Married'] = 'Yes' 145 | combined.loc[(combined.Gender.isnull()) & (combined.Gender == 'Female'),'Married'] = 'No' 146 | # Let's impute other missung values with the mode 147 | combined = pd.concat([train,test]) 148 | combined.Gender.fillna('Male',inplace=True) 149 | combined.Married.fillna('Yes',inplace=True) 150 | combined.Dependents.fillna('0',inplace=True) 151 | combined.Self_Employed.fillna('No',inplace=True) 152 | combined.LoanAmount.fillna(np.mean(combined.LoanAmount),inplace=True) 153 | combined.Loan_Amount_Term.fillna(360.0,inplace=True) 154 | 155 | train,test = combined[:len(train)],combined[len(train):] 156 | test = test.drop(['Loan_Status'],axis=1) 157 | 158 | # Check if missing values have been removed 159 | print('Rows with missing values in train:') 160 | print(train.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(train)) 161 | print('Rows with missing values in test:') 162 | print(test.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(test)) 163 | 164 | # Let's try to engineer some features 165 | # Loan_Amount and Application/Coapplicant Income don't seem to make a difference which is surprising. Maybe the ratio of Loan_Amount to the income might be important 166 | train.loc[:,'TotalIncome'] = train.ApplicantIncome + train.CoapplicantIncome 167 | train.loc[:,'ILR'] = train.TotalIncome / train.LoanAmount 168 | 169 | field = 'ILR' 170 | sns.distplot(train[(~train.ILR.isnull()) & (train.Loan_Status == 1)][field],color='g') 171 | sns.distplot(train[(~train.ILR.isnull()) & (train.Loan_Status == 0)][field],color='r') 172 | plt.legend(['Approve','Rejected']) 173 | plt.show() 174 | 175 | # Even this doesn't seem to have an effect! 176 | # Since the loan is paid over several years in monthly installments, the ratio of the income to the EMI might be more significant. 177 | # Also, rate of interest for the loan should also be considered, let's assume it to be 7% 178 | interestRate = 0.07 179 | # Get the number of years for which the loan is sanctioned 180 | train.Loan_Amount_Term = train.Loan_Amount_Term / 12 181 | # Even this doesn't seem to have an effect! 182 | # Since the loan is paid over several years in monthly installments, the ratio of the income to the EMI might be more significant. 183 | # Also, rate of interest for the loan should also be considered, let's assume it to be 7% 184 | interestRate = 0.07 185 | # Get the number of years for which the loan is sanctioned 186 | train.Loan_Amount_Term = train.Loan_Amount_Term / 12 187 | train.loc[:,'LoanRepayAmount'] = train.apply(lambda x: x['LoanAmount']*((1 + interestRate)**x['Loan_Amount_Term']),axis=1) 188 | train.loc[:,'LoanEMI'] = train.apply(lambda x: x['LoanAmount']*interestRate*((1 + interestRate)**x['Loan_Amount_Term']/((1+interestRate)**(x['Loan_Amount_Term']-1))),axis=1) 189 | train.loc[:,'IEMIR'] = train.TotalIncome / train.LoanEMI 190 | 191 | field = 'IEMIR' 192 | sns.distplot(train[(~train.IEMIR.isnull()) & (train.Loan_Status == 1)][field],color='g') 193 | sns.distplot(train[(~train.IEMIR.isnull()) & (train.Loan_Status == 0)][field],color='r') 194 | plt.legend(['Approve','Rejected']) 195 | plt.show() 196 | 197 | # Still doesn't seem to be very significant, but intuitively this IEMIR variable should matter. We'll explore it further later. 198 | -------------------------------------------------------------------------------- /Loan Prediction III/README.md: -------------------------------------------------------------------------------- 1 | # LoanPredictionIII - Analytics_Vidhya 2 | 3 | Solution to Loan Prediction III Practice Problem on Analytics Vidhya 4 | 5 | The problem involves predicting whether a given loan application will be approved or not given information such as the applicant's income, gender, marital status, employment status, loan amount, loan tenure etc. 6 | 7 | My approach: 8 | 9 | - EDA (described in detail as a ipynb in this repository itself) 10 | 11 | 1) Looked at individual effect of each variable on the target variable to gain some initial hypothesis of important variables. This included crosstabs for categorical variables and a distribution plot for continuous variables grouped by the target variable value 12 | 13 | 2) Imputed missing values using group-wise mean/mode and other evidence from pivot tables generated from the data 14 | 15 | 3) Engineered features such as EMI to Income ratio and total income (applicant + co-applicant income) 16 | 17 | 4) Used random forests and xgboost linear boosting models on the data. Tuned the parameters using randomized search based on 5-fold cross-validation. Picked the best model out of the two 18 | 19 | Results: 20 | 21 | Accuracy: 22 | 23 | 0.79166 (XGBoost) 24 | 0.784722 (RF) 25 | 26 | Will try genetic algorithm to select the optimal set of features for the model. 27 | 28 | -------------------------------------------------------------------------------- /Lord Of The Machines/CType_vs_Prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/CType_vs_Prob.png -------------------------------------------------------------------------------- /Lord Of The Machines/LordOfTheMachines.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib as mlt 4 | import matplotlib.pyplot as plt 5 | from sklearn.preprocessing import LabelEncoder 6 | from sklearn.linear_model import LogisticRegression 7 | from sklearn import metrics 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 10 | import seaborn as sns 11 | from sklearn.metrics import roc_curve, auc 12 | import collections 13 | import datetime 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | from sklearn.model_selection import train_test_split 16 | 17 | 18 | def get_new_features(df): 19 | df['send_hour'] = df['send_date'].apply(lambda x: int(x[-5:-3])) 20 | df['send_hour_category'] = pd.cut(df['send_hour'], range(0, 30, 6), labels=['EM', 'M', 'AN', 'N']) 21 | df['weekday_type'] = df['send_date'].apply( 22 | lambda x: datetime.datetime.strptime(x[:10], '%d-%m-%Y').date().isoweekday()) 23 | df['has_hackathon'] = df['subject'].apply(lambda x: ('Hack' in x) or ('hack' in x)) 24 | return df 25 | 26 | # Pre-processing the data 27 | # Extract hour and weekday on which email was sent as category 28 | # Transforming categorical variables to OHE 29 | def pre_process(df): 30 | df['send_hour'] = LabelEncoder().fit_transform(df['send_hour']) 31 | df['send_hour_category'] = LabelEncoder().fit_transform(df['send_hour_category']) 32 | df['communication_type'] = LabelEncoder().fit_transform(df['communication_type']) 33 | df['weekday_type'] = LabelEncoder().fit_transform(df['weekday_type']) 34 | return df 35 | 36 | 37 | 38 | # Loading the data 39 | train = pd.read_csv('train.csv') 40 | dfCampaign = pd.read_csv('campaign_data.csv') 41 | test = pd.read_csv('test.csv') 42 | 43 | # Join data frames to get campaign information along with the open/click value 44 | dfTrain = train.join(dfCampaign.set_index('campaign_id'),on='campaign_id') 45 | dfTest = test.join(dfCampaign.set_index('campaign_id'),on='campaign_id') 46 | 47 | 48 | print(dfTrain.columns) 49 | # Look at all columns 50 | ''' 51 | Index(['id', 'user_id', 'campaign_id', 'send_date', 'is_open', 'is_click', 52 | 'communication_type', 'total_links', 'no_of_internal_links', 53 | 'no_of_images', 'no_of_sections', 'email_body', 'subject', 'email_url'], 54 | dtype='object') 55 | ''' 56 | 57 | 58 | # Engineer features 59 | dfTrain = get_new_features(dfTrain) 60 | dfTest = get_new_features(dfTest) 61 | 62 | 63 | # Viewing open/click probabilities based on communication type 64 | barwidth = 0.35 65 | ctype_click_pivot = dfTrain.pivot_table(values='is_click',index=['communication_type'],aggfunc=('mean','count','sum')).head() 66 | ctype_open_pivot = dfTrain.pivot_table(values='is_open',index=['communication_type'],aggfunc=('mean','count','sum')).head() 67 | index = np.arange(len(ctype_click_pivot.index)) 68 | 69 | plt.bar(index+barwidth,ctype_open_pivot['mean'],barwidth,color='r',label='Open') 70 | plt.bar(index,ctype_click_pivot['mean'],barwidth,color='b',label='Click') 71 | 72 | plt.ylabel('Click/Open Probability Per Communication Type') 73 | plt.xlabel('Communication Type') 74 | plt.title('Probability Of Clicking/Opening For Each Communication Type') 75 | plt.xticks(index+barwidth,ctype_click_pivot.index) 76 | plt.legend() 77 | plt.tight_layout() 78 | plt.savefig('CType_vs_Prob.png') 79 | 80 | # Viewing open/click probabilities based on send hour 81 | barwidth = 0.35 82 | hour_click_pivot = dfTrain.pivot_table(values='is_click',index=['send_hour'],aggfunc=('mean','count','sum')) 83 | hour_open_pivot = dfTrain.pivot_table(values='is_open',index=['send_hour'],aggfunc=('mean','count','sum')) 84 | index = np.arange(len(hour_click_pivot.index)) 85 | 86 | plt.bar(index+barwidth,hour_open_pivot['mean'],barwidth,color='r',label='Open') 87 | plt.bar(index,hour_click_pivot['mean'],barwidth,color='b',label='Click') 88 | 89 | plt.ylabel('Click/Open Probability Per Send Hour') 90 | plt.xlabel('Send Hour') 91 | plt.title('Probability Of Clicking/Opening For Each Send Hour') 92 | plt.xticks(index+barwidth,hour_click_pivot.index) 93 | plt.legend() 94 | plt.tight_layout() 95 | plt.savefig('SendHour_vs_Prob.png') 96 | 97 | 98 | # Viewing open/click probabilities based on day of the week 99 | barwidth = 0.35 100 | wd_click_pivot = dfTrain.pivot_table(values='is_click',index=['weekday_type'],aggfunc=('mean','count','sum')) 101 | wd_open_pivot = dfTrain.pivot_table(values='is_open',index=['weekday_type'],aggfunc=('mean','count','sum')) 102 | index = np.arange(len(wd_click_pivot.index)) 103 | 104 | 105 | plt.bar(index+barwidth,wd_open_pivot['mean'],barwidth,color='r',label='Open') 106 | plt.bar(index,wd_click_pivot['mean'],barwidth,color='b',label='Click') 107 | 108 | plt.ylabel('Click/Open Probability Per Weekday') 109 | plt.xlabel('Weekday') 110 | plt.title('Probability Of Clicking/Opening For Each Weekday') 111 | plt.xticks(index+barwidth,wd_click_pivot.index) 112 | plt.legend() 113 | plt.tight_layout() 114 | plt.savefig('WeekdayType_vs_Prob.png') 115 | 116 | 117 | 118 | # Pre-process Train and Test data by converting to OHE 119 | dfTrain = pre_process(dfTrain) 120 | dfTest = pre_process(dfTest) 121 | 122 | 123 | 124 | 125 | # Add fake values to dfTest to get a combined data frame of Train and Test 126 | dfTest['is_click'] = 0 127 | dfTest['is_open'] = 0 128 | dfCombined = pd.concat([dfTrain,dfTest]) 129 | 130 | # Splitting for CV 131 | trainData,valData,trainLabels,valLabels = train_test_split(dfTrain,dfTrain[['is_click','is_open']]) 132 | # Used while doing cross-validation 133 | 134 | # Extract click probability and open probability for each UserID 135 | clickProb = dfTrain.pivot_table(values='is_click',index=['user_id'],aggfunc=('mean','sum','count')) 136 | openProb = dfTrain.pivot_table(values='is_open',index=['user_id'],aggfunc=('mean','sum','count')) 137 | 138 | # Extract click probability and open probability for each UserID indexed by Communucation Type 139 | typeOpenProb = dfTrain.pivot_table(values='is_open',index=['communication_type'],aggfunc=('mean','sum','count')) 140 | typeClickProb = dfTrain.pivot_table(values='is_click',index=['communication_type'],aggfunc=('mean','sum','count')) 141 | 142 | # Get probability of each campaign of being sent using the combined train and test data 143 | campaignTrainProb = dfCombined['campaign_id'].value_counts() / len(dfCombined.index) 144 | 145 | # Get probability of each campaign given that user has clicked from Train data 146 | campaignTrainProbGivenClick = dfTrain[dfTrain['is_click'] == 1]['campaign_id'].value_counts() / len(dfTrain[dfTrain['is_click'] == 1].index) 147 | 148 | # Get probability of campaign indexed by Communication Type 149 | campaignTrainProbType = dfCombined['communication_type'].value_counts() / len(dfTrain.index) 150 | 151 | # Get probability of each campaign given that user has clicked from Train data per UserID 152 | campaignGivenClickPerUser = dfTrain.pivot_table(values='is_click',index='user_id',columns=['campaign_id'],aggfunc='mean') 153 | 154 | # Get probability of Campaign features from train data given that user has clicked 155 | totalClicks = len(dfTrain[dfTrain['is_click'] == 1]) 156 | ccIL = dfTrain[dfTrain['is_click'] == 1]['no_of_internal_links'].value_counts() / totalClicks 157 | ccNS = dfTrain[dfTrain['is_click'] == 1]['no_of_sections'].value_counts() / totalClicks 158 | ccNI = dfTrain[dfTrain['is_click'] == 1]['no_of_images'].value_counts() / totalClicks 159 | ccTL = dfTrain[dfTrain['is_click'] == 1]['total_links'].value_counts() / totalClicks 160 | ccCT = dfTrain[dfTrain['is_click']==1]['communication_type'].value_counts() / totalClicks 161 | 162 | 163 | # Naive Bayes Estimates 164 | # Calculate probability from above calculated data frames if available else use communication type based or overall mean 165 | # as a proxy 166 | 167 | def NBEstimate_1(x): 168 | global campaignTrainProb 169 | global trainData 170 | global campaignTrainProbType 171 | global campaignGivenClickPerUser 172 | global campaignTrainProbGivenClick 173 | global clickProb 174 | global typeClickProb 175 | 176 | pCampaign = campaignTrainProb[x['campaign_id']] 177 | pClick = 0.0 178 | if x['user_id'] in clickProb.index: 179 | pClick = clickProb.get_value(x['user_id'],col='mean') 180 | elif x['communication_type'] in typeClickProb.index: 181 | pClick = typeClickProb.get_value(x['communication_type'],col='mean') 182 | else: pClick = 0.0 183 | 184 | pCC = 1.0 185 | if x['user_id'] not in clickProb.index: 186 | pCC = campaignTrainProbGivenClick[x['campaign_id']] 187 | else: 188 | temp = campaignGivenClickPerUser.get_value(x['user_id'],col=x['campaign_id']) 189 | if np.isnan(temp): 190 | pCC = campaignTrainProbGivenClick[x['campaign_id']] 191 | else: 192 | pCC = temp 193 | return (pCC*pClick)/pCampaign 194 | 195 | 196 | def NBEstimate_2(x): 197 | global campaignTrainProb 198 | global trainData 199 | global campaignTrainProbType 200 | global campaignGivenClickPerUser 201 | global campaignTrainProbGivenClick 202 | global clickProb 203 | global typeClickProb 204 | global ccIL, ccNS, ccNI, ccTL, ccCT 205 | 206 | pCampaign = campaignTrainProb[x['campaign_id']] 207 | pClick = 0.0 208 | if x['user_id'] in clickProb.index: 209 | pClick = clickProb.get_value(x['user_id'],col='mean') 210 | elif x['communication_type'] in typeClickProb.index: 211 | pClick = typeClickProb.get_value(x['communication_type'],col='mean') 212 | else: pClick = 0.0 213 | 214 | pCC = 1.0 215 | pCC = pCC * (ccIL.get_value(x['no_of_internal_links']) if x['no_of_internal_links'] in ccIL.index else ccIL.mean()) 216 | pCC = pCC * (ccNI.get_value(x['no_of_images']) if x['no_of_images'] in ccNI.index else ccNI.mean()) 217 | pCC = pCC * (ccNS.get_value(x['no_of_sections']) if x['no_of_sections'] in ccNS.index else ccNS.mean()) 218 | pCC = pCC * (ccTL.get_value(x['total_links']) if x['total_links'] in ccTL.index else ccTL.mean()) 219 | pCC = pCC * (ccCT.get_value(x['communication_type']) if x['communication_type'] in ccCT.index else ccCT.mean()) 220 | return (pCC*pClick)/pCampaign 221 | 222 | 223 | # Click Probability for each UserID 224 | # Use mean click probability for a particular communication type in case of new UserID 225 | def cpEstimate(x): 226 | global clickProb 227 | global typeClickProb 228 | if x['user_id'] in clickProb.index: 229 | return clickProb.get_value(x['user_id'],col='mean') 230 | if x['communication_type'] in typeClickProb.index: 231 | return typeClickProb.get_value(x['communication_type'],col='mean') 232 | return 0.0 233 | 234 | # Use mean open probability for a particular communication type in case of new UserID 235 | def opEstimate(x): 236 | global openProb 237 | global typeOpenProb 238 | if x['user_id'] in openProb.index: 239 | return openProb.get_value(x['user_id'],col='mean') 240 | if x['communication_type'] in typeOpenProb.index: 241 | return typeOpenProb.get_value(x['communication_type'],col='mean') 242 | return 0.0 243 | 244 | # Calculate open and click probabilities on train and test data 245 | dfTrain['cp'] = dfTrain.apply(cpEstimate,axis=1) 246 | dfTrain['op'] = dfTrain.apply(opEstimate,axis=1) 247 | dfTest['cp'] = dfTest.apply(cpEstimate,axis=1) 248 | dfTest['op'] = dfTest.apply(opEstimate,axis=1) 249 | 250 | # Naive Bayes Predictions 251 | dfTest['predictionNB'] = dfTest.apply(NBEstimate_2,axis=1) 252 | dfTrain['predictionNB'] = dfTrain.apply(NBEstimate_2,axis=1) 253 | 254 | 255 | # Logistic Regression Based on Open and Click Probability values and the NB estimate (Stacking) 256 | model = LogisticRegression(class_weight={0:1,1:100},max_iter=100) 257 | outcome = 'is_click' 258 | predictors = ['op','cp','predictionNB'] 259 | modelFit = model.fit(dfTrain[predictors],dfTrain[outcome]) 260 | prediction = model.predict_proba(dfTest[predictors]) 261 | dfTest['predictionLR1'] = prediction[:,1] 262 | 263 | predictors = ['op','cp'] 264 | modelFit = model.fit(dfTrain[predictors],dfTrain[outcome]) 265 | prediction = model.predict_proba(dfTest[predictors]) 266 | dfTest['predictionLR2'] = prediction[:,1] 267 | 268 | predictors = ['predictionNB','cp'] 269 | modelFit = model.fit(dfTrain[predictors],dfTrain[outcome]) 270 | prediction = model.predict_proba(dfTest[predictors]) 271 | dfTest['predictionLR3'] = prediction[:,1] 272 | 273 | 274 | # fpr,tpr,_ = roc_curve(valLabels['is_click'],valData['predictionNB']) 275 | # print(auc(fpr,tpr)) 276 | 277 | # Weight average ensemble of Naive Bayes and Logistic Regression methods 278 | dfTest['is_click'] = (0.4*dfTest['predictionLR1'] + 0.4*dfTest['predictionLR2'] + 0.1*dfTest['predictionLR3'] 279 | + 0.1*dfTest['predictionNB']) 280 | #(0.3*dfTest['predictionNB']+0.7*dfTest['predictionLR']) 281 | dfTest[['id','is_click']].to_csv('FinalSubmission.csv',index=False) 282 | 283 | 284 | 285 | # Attempt at using Field-Aware Factorization Machine using xlearn package 286 | # 287 | # 288 | # dfCombined = pd.concat([dfTrain,dfTest]) 289 | # x = dfCombined 290 | # unwanted = ['campaign_id','email_body','email_url','has_hackathon','id','predictionNB','send_date','send_hour_category', 291 | # 'subject','user_id','is_open'] 292 | # x.drop(unwanted,inplace=True,axis=1) 293 | # 294 | # features = list(x.columns) 295 | # features.remove('is_click') 296 | # x.loc[:,'no_of_images'] = pd.cut(x['no_of_images'],6,labels=False) 297 | # x.loc[:,'no_of_internal_links'] = pd.cut(x['no_of_internal_links'],6,labels=False) 298 | # x.loc[:,'no_of_sections'] = pd.cut(x['no_of_sections'],6,labels=False) 299 | # x.loc[:,'total_links'] = pd.cut(x['total_links'],6,labels=False) 300 | # x.loc[:,'op'] = pd.cut(x['op'],6,labels=False) 301 | # x.loc[:,'cp'] = pd.cut(x['cp'],6,labels=False) 302 | # 303 | # test = x[dfTrain.shape[0]:].copy() 304 | # train = x[:dfTrain.shape[0]].copy() 305 | # train = train.sample(frac=1).reset_index(drop=True) 306 | # categories = features.copy() 307 | # numerics = [] 308 | # # FFM TRY 309 | # def convert_to_ffm(df, type, numerics, categories, features): 310 | # currentcode = len(numerics) 311 | # catdict = {} 312 | # catcodes = {} 313 | # # Flagging categorical and numerical fields 314 | # for x in numerics: 315 | # catdict[x] = 0 316 | # for x in categories: 317 | # catdict[x] = 1 318 | # 319 | # nrows = df.shape[0] 320 | # ncolumns = len(features) 321 | # with open(str(type) + "_ffm.txt", "w") as text_file: 322 | # 323 | # # Looping over rows to convert each row to libffm format 324 | # for n, r in enumerate(range(nrows)): 325 | # datastring = "" 326 | # datarow = df.iloc[r].to_dict() 327 | # datastring += str(int(datarow['is_click'])) 328 | # # For numerical fields, we are creating a dummy field here 329 | # for i, x in enumerate(catdict.keys()): 330 | # if (catdict[x] == 0): 331 | # datastring = datastring + " " + str(i) + ":" + str(i) + ":" + str(datarow[x]) 332 | # else: 333 | # # For a new field appearing in a training example 334 | # if (x not in catcodes): 335 | # catcodes[x] = {} 336 | # currentcode += 1 337 | # catcodes[x][datarow[x]] = currentcode # encoding the feature 338 | # # For already encoded fields 339 | # elif (datarow[x] not in catcodes[x]): 340 | # currentcode += 1 341 | # catcodes[x][datarow[x]] = currentcode # encoding the feature 342 | # code = catcodes[x][datarow[x]] 343 | # datastring = datastring + " " + str(i) + ":" + str(int(code)) + ":1" 344 | # 345 | # datastring += '\n' 346 | # text_file.write(datastring) 347 | # 348 | # import xlearn as xl 349 | # convert_to_ffm(train,'train',numerics,categories,features) 350 | # convert_to_ffm(test,'test',numerics,categories,features) -------------------------------------------------------------------------------- /Lord Of The Machines/README.md: -------------------------------------------------------------------------------- 1 | # Lord Of The Machines 2 | ## Data Science Hackathon conducted by Analytics Vidhya 3 | 4 | (Contest Link: https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/) 5 | 6 | Team: alpha1995 7 | 8 | Public LB Score: 0.6837 9 | 10 | Private LB Score: 0.6786 11 | 12 | Overall Rank: 19/153 13 | 14 | Sharing our solution to the Lord Of The Machines data science hackathon conducted by Analytics Vidhya 15 | The problem involved predicting the click probability of links inside a mailer for email campaigns from January 2018 to March 2018. 16 | 17 | ### Dataset 18 | Details can be found in the contest link mentioned above. We are given details about when a campaign was sent to a particular user, some information about the campaign such as number of links, images, subject, body etc. and whether the user opened and/or clicked the link. 19 | We are supposed to predict the probability of user clicking on a link. 20 | 21 | ### Data Processing & Exploring 22 | We started by joining the train and test dataset with the campaign data and then engineered the following features: 23 | 24 | (1) Hour when email was sent (send_hour) 25 | 26 | (2) Day of the week when email was sent (weekday_type) 27 | 28 | This was followed by visualizing the probability of opening/clicking of links based on: 29 | (1) Communication Type 30 | 31 | ![CType](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/CType_vs_Prob.png) 32 | 33 | (2) Send Hour 34 | 35 | ![SendHour](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/SendHour_vs_Prob.png) 36 | 37 | (3) Day Of The Week 38 | 39 | ![WeekdayType](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/WeekdayType_vs_Prob.png) 40 | 41 | 42 | Our first attempt was to calculate the probability that a user will open or click a link based on UserID for each user. For UserID's present in test set that are not present in the training set, I used the mean probability of opening/clicking across all campaigns having the same communication_type. This gave a public leaderboard score of 0.5714. 43 | 44 | We then created a basic logistic regression model based on the open and click probabilities(OP and CP) so calculated. This gave a public leaderboard score of 0.6821. 45 | 46 | We then generated estimates based on Naive Bayes(NB). 47 | P(Click|Campaign) = P(Campaign|Click) * P(Click) / P(Campaign) 48 | The above were calculated as follows: 49 | 50 | 51 | Probabilities were calculated and replaced by aggregated proxies when certain values were missing (for example new UserID's in test set). 52 | 53 | 54 | P(Click): Number of clicks / Number of emails sent to a particular UserID. In case of a new UserID, the mean probability for the email's Communication Type was used as a proxy 55 | 56 | P(Campaign): Number of emails for given Campaign_ID / Total number of emails sent (calculate over both train and test) 57 | 58 | P(Click|Campaign): This is the most interesting part. We represent a campaign by its features (number of links, images, sections, communication_type etc. Numberical features are convereted to categorical using cut. Let's denote these features as k1,k2 etc. 59 | Therefore, each campaign is [k1, k2, ... kn]. 60 | 61 | Assuming independence of features, 62 | 63 | P([k1, k2, k3 ... kn] | Click) = P([k1] | Click) * P([k2] | Click) ... P([kn] | Click) 64 | 65 | P([ki]|Click) is simply the number of times a campaign had feature ki across all campaigns sent to the user. This way we were able to incorporate the features of a campaign into the naive bayes estimate. 66 | 67 | 68 | The Naive Bayes method gave a public leaderboard score of 0.644. 69 | 70 | We finally used the Naive Based estimates as another feature fore the Logistic Regression model and created an ensemble out of the following models: 71 | 72 | 1) Naive Bayes 73 | 74 | 2) Logistic Regression (OP , CP , NB) 75 | 76 | 3) Logistic Regression (OP , CP) 77 | 78 | 4) Logistic Regression (CP , NB) 79 | 80 | A weighted average of the above produced our final output having a public leaderboard score of 0.6837 and a private leaderboard score of 0.6786. 81 | 82 | Adding communication type, number of links, images etc. directly as features did not seem to increase our score hence they were not used as features in the logistic regression model. Instead, the NB estimate calculated using these features were incorporated. 83 | 84 | ### Techniques Not Tried 85 | - Target encoding of categorical variables 86 | - Aggregating text features of email to create a user profile feature 87 | - Features such as time between emails 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /Lord Of The Machines/SendHour_vs_Prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/SendHour_vs_Prob.png -------------------------------------------------------------------------------- /Lord Of The Machines/WeekdayType_vs_Prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/WeekdayType_vs_Prob.png -------------------------------------------------------------------------------- /Mckinskey Healthcare Hackathon/README.md: -------------------------------------------------------------------------------- 1 | # Mckinskey Healthcare Hackathon by Analytics Vidhya 2 | ## Stroke Prediction - Data Science Hackathon 3 | 4 | (Contest Link: https://datahack.analyticsvidhya.com/contest/mckinsey-analytics-online-hackathon/) 5 | 6 | Sharing my solution to the healthcare hackathon organized by Mckinskey on Analytics Vidhya. We were given several health, demographic and lifestyle details about patients including details such as age and gender, along with several health parameters (e.g. hypertension, body mass index) and lifestyle related variables (e.g. smoking status, occupation type). 7 | We were supposed to predict the probability of stroke happening to a patients. This would help doctors take proactive health measures for these patients. 8 | 9 | Evaluation metric was AUC ROC 10 | 11 | Public LB Score: 0.8278 12 | 13 | Private LB Score: 0.8577 14 | 15 | Overall Rank: 28/584 16 | 17 | ### Dataset 18 | Details regarding the dataset can be found on the contest site. We were given details of a patient such as smoking status, gender, health conditions, lifestyle conditions etc. and were supposed to predict the probability of suffering a strokg for that patient. 19 | 20 | ### Pre-processing & EDA 21 | Exploratory data analysis has been presented in a ipython notebook here https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Mckinskey%20Healthcare%20Hackathon/EDA_MckinskeyHealthcareHackathon_AV.ipynb 22 | I looked at the distribution of all variables, the possible values they can take and how they affect the target variable. Also, went through some studies that indicate which factors are responsible for strokes. Based on that narrowed down a few features that seemed to be important. 23 | 24 | Next, I imputed missing values for smoking_status and bmi using a group based mode and median respectively. I created a new feature HTpHD which is the sum of heart_disease and hypertension. The intuition behind this was to create a variable denoting the severeness of a disease affecting the patient. 25 | 26 | ### Modelling 27 | 28 | I have used logistic regression, xgboost trees and catboost models. I tuned the parameters for each using randomized search with 5-old cross-validation. Features were selected based on how they affected the cross-validation accuracy. 29 | 30 | To handle the class imbalance I have used class weights inversely proportional to the frequency of that class. 31 | 32 | Final features selected were: ['smoking_status, 'age', 'heart_disease', 'hypertension', 'HTpHD', 'avg_glucose_level' ,'gender'] 33 | 34 | A simple average of the best performing models of each classifier was taken as the final submission. 35 | 36 | NOTE: I think the cross-validation strategy helped quite a lot. Eventhough it gave an okayish public leaderboard score, the private leaderboard score went up drastically. 37 | 38 | #### Techniques Not Tried 39 | - Thought about using pair-wise features such as Gender+BMI and Gender+Hypertension but didn't implement it. 40 | - Also, categorical variables such (obese, underweight, overweight) or (glucose levels normal, abnormal) could have been used 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /Mckinskey Healthcare Hackathon/script.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib as mlt 4 | import matplotlib.pyplot as plt 5 | from sklearn.preprocessing import LabelEncoder 6 | from sklearn.linear_model import LogisticRegression,LinearRegression 7 | from sklearn import metrics 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.tree import DecisionTreeClassifier, export_graphviz 10 | import seaborn as sns 11 | from sklearn.metrics import roc_curve, auc,accuracy_score,confusion_matrix,make_scorer 12 | import collections 13 | import datetime 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | from sklearn.neural_network import MLPClassifier 16 | from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold,RandomizedSearchCV 17 | from xgboost import XGBClassifier,plot_importance 18 | from matplotlib import pyplot 19 | from catboost import CatBoostClassifier 20 | 21 | def LR(train,test,params): 22 | X_Train,X_Val,Y_Train,Y_Val = train_test_split(train,train.stroke,random_state=101) 23 | model = LogisticRegression(class_weight='balanced',random_state=101) 24 | model.fit(X_Train[params],Y_Train) 25 | 26 | prediction = model.predict_proba(X_Val[params]) 27 | prediction = prediction[:,1] 28 | print('Validation Accuracy: ') 29 | fpr,tpr,_ = roc_curve(Y_Val, prediction) 30 | print(auc(fpr,tpr)) 31 | 32 | prediction = model.predict_proba(X_Train[params]) 33 | prediction = prediction[:, 1] 34 | print('Train Accuracy: ') 35 | fpr, tpr, _ = roc_curve(Y_Train, prediction) 36 | print(auc(fpr, tpr)) 37 | 38 | model = LogisticRegression(class_weight='balanced',random_state=101) 39 | model.fit(train[params], train.stroke) 40 | prediction = model.predict_proba(test[params]) 41 | prediction = prediction[:,1] 42 | test.loc[:,'stroke'] = prediction 43 | test[['id','stroke']].to_csv('output_lr.csv',index=False) 44 | 45 | def XGB(train,test,params): 46 | parameters = { 47 | 'min_child_weight': [5, 10], 48 | 'gamma': [0.5, 2, 5], 49 | 'subsample': [0.4, 0.6, 0.8, 1.0], 50 | 'colsample_bytree': [0.6, 0.8, 1.0], 51 | 'max_depth': [3, 5] 52 | } 53 | folds = 5 54 | skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) 55 | model = XGBClassifier(n_estimators=100, learning_rate=0.02, objective='binary:logistic', 56 | silent=True, nthread=1) 57 | 58 | # define scoring function 59 | def custom_auc(ground_truth, predictions): 60 | # I need only one column of predictions["0" and "1"]. You can get an error here 61 | # while trying to return both columns at once 62 | fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1) 63 | return auc(fpr, tpr) 64 | 65 | # to be standart sklearn's scorer 66 | 67 | my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True) 68 | 69 | search = RandomizedSearchCV(model, parameters, 70 | cv=skf.split(train[params], train.stroke),scoring=my_auc,n_iter=15) 71 | search.fit(train[params],train.stroke) 72 | 73 | print('Best Score: ' + str(search.best_score_)) 74 | print('Best Params: ') 75 | print(search.best_params_) 76 | 77 | prediction = search.predict_proba(test[params]) 78 | prediction = prediction[:, 1] 79 | test.loc[:,'stroke'] = prediction 80 | test[['id','stroke']].to_csv('output_xgb.csv',index=False) 81 | 82 | def CB(train,test,params,idx): 83 | 84 | parameters = { 85 | 'iterations': [20,50,100], 86 | 'learning_rate': [0.1, 0.01, 0.001], 87 | 'depth': [3, 5, 7], 88 | } 89 | folds = 5 90 | skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) 91 | model = CatBoostClassifier(loss_function='Logloss') 92 | 93 | # define scoring function 94 | def custom_auc(ground_truth, predictions): 95 | # I need only one column of predictions["0" and "1"]. You can get an error here 96 | # while trying to return both columns at once 97 | fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1) 98 | return auc(fpr, tpr) 99 | 100 | # to be standart sklearn's scorer 101 | 102 | my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True) 103 | 104 | search = RandomizedSearchCV(model, parameters, 105 | cv=skf.split(train[params], train.stroke),scoring=my_auc,n_iter=15) 106 | search.fit(train[params],train.stroke,cat_features=idx) 107 | 108 | print('Best Score: ' + str(search.best_score_)) 109 | print('Best Params: ') 110 | print(search.best_params_) 111 | 112 | prediction = search.predict_proba(test[params]) 113 | prediction = prediction[:, 1] 114 | test.loc[:,'stroke'] = prediction 115 | test[['id','stroke']].to_csv('output_cb.csv',index=False) 116 | 117 | 118 | def preprocess_data(train,test): 119 | 120 | combined = pd.concat([train,test]) 121 | combined.loc[:, 'age2'] = combined.age * combined.age 122 | combined.loc[:, 'age3'] = combined.age * combined.age * combined.age 123 | combined.loc[:, 'HTHD'] = combined.heart_disease * combined.hypertension 124 | combined.loc[:, 'HTpHD'] = combined.heart_disease + combined.hypertension 125 | 126 | combined.loc[:, 'age_group'] = 'Low' 127 | combined.loc[combined.age < 40,'age_group'] = 'Children' 128 | combined.loc[combined.age < 40, 'age_group'] = 'Children' 129 | combined.loc[(combined.age >= 40) & (combined.age < 65), 'age_group'] = 'Adults' 130 | combined.loc[combined.age >= 65, 'age_group'] = 'Elderly' 131 | combined.loc[combined.age < 10,'smoking_status'] = 'never smoked' 132 | smoketab = pd.crosstab([combined.gender, np.round(combined.age, -1)], combined.smoking_status).apply(lambda x: x / x.sum(), axis=1) 133 | combined.smoking_status = combined.apply(lambda x: np.argmax(smoketab.loc[x['gender']].loc[int(np.round(x['age'],-1))]),axis=1) 134 | 135 | bmiByAge = combined[~combined.bmi.isnull()].pivot_table(index=np.round(combined[~combined.bmi.isnull()].age,-1),values='bmi',aggfunc='median') 136 | combined.loc[combined.bmi.isnull(),'bmi'] = combined[combined.bmi.isnull()].apply(lambda x: bmiByAge.loc[int(np.round(x['age'],-1))],axis=1) 137 | combined.loc[:, 'bmi2'] = combined.bmi * combined.bmi 138 | 139 | train,test = combined[:len(train)],combined[len(train):] 140 | return train,test 141 | 142 | def OHE(train,test): 143 | combined = pd.concat([train,test]) 144 | combined.Residence_type = LabelEncoder().fit_transform(combined.Residence_type) 145 | combined.smoking_status = LabelEncoder().fit_transform(combined.smoking_status) 146 | combined.age_group = LabelEncoder().fit_transform(combined.age_group) 147 | combined.work_type = LabelEncoder().fit_transform(combined.work_type) 148 | combined.HTpHD = LabelEncoder().fit_transform(combined.HTpHD) 149 | combined.gender = LabelEncoder().fit_transform(combined.gender) 150 | combined.ever_married = LabelEncoder().fit_transform(combined.ever_married) 151 | return combined[:len(train)],combined[len(train):] 152 | 153 | 154 | train = pd.read_csv('train.csv') 155 | test = pd.read_csv('test.csv') 156 | train,test = preprocess_data(train,test) 157 | 158 | CB(train,test,['age','heart_disease','hypertension','HTpHD','avg_glucose_level','gender','smoking_status'],[1,2,3,5,6]) 159 | 160 | train,test = OHE(train,test) 161 | LR(train,test,['smoking_status','age','heart_disease','hypertension','HTpHD','avg_glucose_level','gender']) 162 | XGB(train,test,['smoking_status','age','heart_disease','hypertension','HTpHD','avg_glucose_level','gender']) 163 | 164 | 165 | cbPredictions = pd.read_csv('output_cb.csv').stroke 166 | lrPredictions = pd.read_csv('output_lr.csv').stroke 167 | xgbPredictions = pd.read_csv('output_xgb.csv').stroke 168 | finalPrediction = (cbPredictions + lrPredictions + xgbPredictions)/3.0 169 | test.loc[:,'stroke'] = finalPrediction 170 | test[['id','stroke']].to_csv('final_output.csv',index=False) 171 | 172 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Solutions to Data Science Hackathons conducted by Analytics Vidhya 2 | 3 | The solution to the following contests have been added: 4 | 5 | - Lord Of The Machines 6 | 7 | - Mckinskey Healthcare Hackathon 8 | 9 | - Loan Prediction III 10 | --------------------------------------------------------------------------------