├── Loan Prediction III
    ├── AV Loan Prediction III - EDA.ipynb
    ├── EDA.py
    └── README.md
├── Lord Of The Machines
    ├── CType_vs_Prob.png
    ├── FinalSubmission.csv
    ├── LordOfTheMachines.py
    ├── README.md
    ├── SendHour_vs_Prob.png
    └── WeekdayType_vs_Prob.png
├── Mckinskey Healthcare Hackathon
    ├── EDA_MckinskeyHealthcareHackathon_AV.ipynb
    ├── README.md
    └── script.py
└── README.md


/Loan Prediction III/EDA.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib as mlt
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sns
  6 | 
  7 | train = pd.read_csv('train.csv')
  8 | test = pd.read_csv('test.csv')
  9 | 
 10 | print(train.columns)
 11 | 
 12 | print('Missing values per attribute in train:')
 13 | print(train.apply(lambda x: x.isnull().sum(), axis=0) * 100 / len(train))
 14 | 
 15 | print('Rows with missing values in train:')
 16 | print(train.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(train))
 17 | 
 18 | 
 19 | print('Missing values per attribute in test:')
 20 | print(test.apply(lambda x: x.isnull().sum(), axis=0) * 100 / len(test))
 21 | 
 22 | print('Rows with missing values in test:')
 23 | print(test.apply(lambda x: (x.isnull().sum() > 0), axis=1).sum() * 100 / len(train))
 24 | 
 25 | # Let's how each variable affects the target variable Loan_Status' individually
 26 | # Map Y to 1 and N to 0
 27 | train.Loan_Status = train.apply(lambda x: (1 if x['Loan_Status'] == 'Y' else 0),axis=1)
 28 | # See percentage of loans apporved
 29 | print(train.Loan_Status.mean() * 100)
 30 | 
 31 | # Continuous
 32 | field = 'ApplicantIncome'
 33 | sns.distplot(train[train.Loan_Status == 1][field],color='g')
 34 | sns.distplot(train[train.Loan_Status == 0][field],color='r')
 35 | plt.legend(['Approve','Rejected'])
 36 | plt.show()
 37 | 
 38 | field = 'CoapplicantIncome'
 39 | sns.distplot(train[train.Loan_Status == 1][field],color='g')
 40 | sns.distplot(train[train.Loan_Status == 0][field],color='r')
 41 | plt.legend(['Approve','Rejected'])
 42 | plt.show()
 43 | 
 44 | field = 'LoanAmount'
 45 | sns.distplot(train[(~train.LoanAmount.isnull()) & (train.Loan_Status == 1)][field],color='g')
 46 | sns.distplot(train[(~train.LoanAmount.isnull()) & (train.Loan_Status == 0)][field],color='r')
 47 | plt.legend(['Approve','Rejected'])
 48 | plt.show()
 49 | 
 50 | field = 'Loan_Amount_Term'
 51 | sns.distplot(train[(~train.Loan_Amount_Term.isnull()) & (train.Loan_Status == 1)][field],color='g')
 52 | sns.distplot(train[(~train.Loan_Amount_Term.isnull()) & (train.Loan_Status == 0)][field],color='r')
 53 | plt.legend(['Approve','Rejected'])
 54 | plt.show()
 55 | 
 56 | print( train.pivot_table(index='Gender',values='Loan_Status',aggfunc=('mean','count')) )
 57 | 
 58 | print( train.pivot_table(index='Married',values='Loan_Status',aggfunc=('mean','count')) )
 59 | 
 60 | print( train.pivot_table(index='Dependents',values='Loan_Status',aggfunc=('mean','count')) )
 61 | 
 62 | print( train.pivot_table(index='Education',values='Loan_Status',aggfunc=('mean','count')) )
 63 | 
 64 | print( train.pivot_table(index='Self_Employed',values='Loan_Status',aggfunc=('mean','count')) )
 65 | 
 66 | print( train.pivot_table(index='Credit_History',values='Loan_Status',aggfunc=('mean','count')) )
 67 | 
 68 | print( train.pivot_table(index='Property_Area',values='Loan_Status',aggfunc=('mean','count')) )
 69 | 
 70 | print( train.pivot_table(index='Loan_Amount_Term',values='Loan_Status',aggfunc=('mean','count')) )
 71 | 
 72 | # On first observation:
 73 | # Credit_History, Education, Married, Gender, Dependents and Property_Area seem to affect the chance of getting a loan approved
 74 | # Credit_History seems to be the most significant variable and also has the most % of missing values which need to be imputed
 75 | 
 76 | # Imputing Missing Values
 77 | # Gender
 78 | # Marreid
 79 | # Dependents
 80 | # Self_Employed
 81 | # LoanAmount
 82 | # Loan_Amount_Term
 83 | # Credit_History
 84 | 
 85 | # Credit_History seems to be the most important variable. 0 -> 0 and 1-> 1 hence we can use that to impute missing values in the train set
 86 | train.loc[(train.Credit_History.isnull()) & (train.Loan_Status == 1),'Credit_History'] = 1
 87 | train.loc[(train.Credit_History.isnull()) & (train.Loan_Status == 0),'Credit_History'] = 0
 88 | # This takes care of missing values in the train set
 89 | # Now let's look at the combined train and test set to impute test set
 90 | combined = pd.concat([train,test])
 91 | 
 92 | # Let's see if credit_history is determined by any other variable
 93 | cols = ['Self_Employed', 'Gender', 'Married' ,'Education', 'Dependents', 'Property_Area']
 94 | pivots = []
 95 | for col in cols:
 96 |     pivots.append( ( combined.pivot_table(index=col,values='Credit_History',aggfunc=('mean')) ) )
 97 | 
 98 | for pivot in pivots:
 99 |     pivot.plot(kind='bar')
100 |     plt.show()
101 | 
102 | field = 'ApplicantIncome'
103 | sns.distplot(combined[(~combined.ApplicantIncome.isnull()) & (combined.Credit_History == 1)][field],color='g')
104 | sns.distplot(combined[(~combined.ApplicantIncome.isnull()) & (combined.Credit_History == 0)][field],color='r')
105 | plt.legend(['Credit_History 1','Credit_History 2'])
106 | plt.show()
107 | 
# Can't see any clear distinction.
# Impute the test set from the train group that matches on all six categorical
# attributes: predict 1 when the group's mean Credit_History exceeds 0.75.
group_cols = ['Self_Employed', 'Property_Area', 'Dependents', 'Gender', 'Education', 'Married']
pivot = train.pivot_table(index=group_cols, values='Credit_History', aggfunc=('mean', 'count'))

missed = 0  # test rows for which no matching train group exists
chPredictions = []
missing_mask = test.Credit_History.isnull()
for i in test.index[missing_mask]:
    row = test.loc[i, group_cols]
    key = tuple(row)
    # A key containing a missing value, or a combination never seen in train,
    # has no group to look up; fall back to the optimistic default of 1.0.
    if row.isnull().any() or key not in pivot.index:
        chance = 1.0
        missed = missed + 1
    else:
        chance = pivot.loc[key, 'mean']
    pred = 1.0 if chance > 0.75 else 0.0
    chPredictions.append(pred)
#print(missed)

# Write all predictions back in one assignment; the per-row chained assignment
# that used to sit in the loop was redundant and may only have written to a
# temporary row copy.
if chPredictions:
    test.loc[missing_mask, "Credit_History"] = chPredictions
131 | 
# Look at the other variables that have missing values.
# For every ordered pair of them, plot the distribution of the second grouped by
# the first. A variable is useful for imputing another only if that grouped
# distribution differs between its values; otherwise it does not inform it.
missingColsCat = ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']
for first in missingColsCat:
    for second in missingColsCat:
        if first != second:
            pd.crosstab(train[first], train[second]).plot(kind='bar')
            plt.show()

combined = pd.concat([train, test])
# Given gender we can talk about marriage distinctly. Male -> Married and Female -> Not Married.
# The mask must select rows where *Married* is missing; the earlier version
# tested Gender.isnull() together with a Gender value, which never matches, and
# then discarded the frame by concatenating again.
married_missing = combined.Married.isnull()
combined.loc[married_missing & (combined.Gender == 'Male'), 'Married'] = 'Yes'
combined.loc[married_missing & (combined.Gender == 'Female'), 'Married'] = 'No'

# Impute the remaining missing values with the mode (mean for LoanAmount).
# fillna is applied to the frame itself rather than in place on a column
# accessor, which is not guaranteed to write through to the frame.
combined = combined.fillna(value={
    'Gender': 'Male',
    'Married': 'Yes',
    'Dependents': '0',
    'Self_Employed': 'No',
    'LoanAmount': combined.LoanAmount.mean(),
    'Loan_Amount_Term': 360.0,
})

# Split back into train and test; copy so later column assignments do not
# operate on slices of `combined`.
n_train = len(train)
train, test = combined[:n_train].copy(), combined[n_train:].copy()
test = test.drop(['Loan_Status'], axis=1)

# Check if missing values have been removed
print('Rows with missing values in train:')
print(train.isnull().any(axis=1).sum() * 100 / len(train))
print('Rows with missing values in test:')
print(test.isnull().any(axis=1).sum() * 100 / len(test))
163 | 
# Let's try to engineer some features.
# Loan amount and applicant/co-applicant income don't seem to make a difference
# on their own, which is surprising. Maybe the ratio of income to loan amount matters.
train = train.copy()  # work on an independent frame, not a slice of `combined`
train['TotalIncome'] = train.ApplicantIncome + train.CoapplicantIncome
train['ILR'] = train.TotalIncome / train.LoanAmount

field = 'ILR'
sns.distplot(train[(~train.ILR.isnull()) & (train.Loan_Status == 1)][field], color='g')
sns.distplot(train[(~train.ILR.isnull()) & (train.Loan_Status == 0)][field], color='r')
plt.legend(['Approve', 'Rejected'])
plt.show()

# Even this doesn't seem to have an effect!
# Since the loan is paid over several years in monthly installments, the ratio of
# the income to the EMI might be more significant.
# The rate of interest should also be considered; let's assume it to be 7% per year.
interestRate = 0.07

# Loan_Amount_Term is converted to years exactly once (a pasted duplicate of
# this section used to divide it by 12 a second time).
# NOTE(review): assumes the raw Loan_Amount_Term is in months - confirm against the data dictionary.
loan_term_months = train.Loan_Amount_Term
train['Loan_Amount_Term'] = loan_term_months / 12

# Total amount repayable with yearly compounding over the term in years
train['LoanRepayAmount'] = train.LoanAmount * (1 + interestRate) ** train.Loan_Amount_Term

# Standard annuity formula: EMI = P * r * (1 + r)^n / ((1 + r)^n - 1), with the
# monthly rate r and the number of monthly installments n. The previous
# denominator was (1 + r)^(n - 1), which cancels down to P * r * (1 + r) and
# therefore did not depend on the term at all.
monthly_rate = interestRate / 12
growth = (1 + monthly_rate) ** loan_term_months
train['LoanEMI'] = train.LoanAmount * monthly_rate * growth / (growth - 1)
train['IEMIR'] = train.TotalIncome / train.LoanEMI

field = 'IEMIR'
sns.distplot(train[(~train.IEMIR.isnull()) & (train.Loan_Status == 1)][field], color='g')
sns.distplot(train[(~train.IEMIR.isnull()) & (train.Loan_Status == 0)][field], color='r')
plt.legend(['Approve', 'Rejected'])
plt.show()

# NOTE(review): the earlier observation that IEMIR "doesn't seem to be very
# significant" was made with the incorrect EMI formula; re-examine this plot.
# Intuitively the IEMIR variable should matter. We'll explore it further later.
198 | 


--------------------------------------------------------------------------------
/Loan Prediction III/README.md:
--------------------------------------------------------------------------------
 1 | # LoanPredictionIII - Analytics_Vidhya
 2 | 
 3 | Solution to Loan Prediction III Practice Problem on Analytics Vidhya
 4 | 
 5 | The problem involves predicting whether a given loan application will be approved or not given information such as the applicant's income, gender, marital status, employment status, loan amount, loan tenure etc.
 6 | 
 7 | My approach:
 8 | 
 9 | - EDA (described in detail in an ipynb notebook in this repository)
10 | 
11 | 1) Looked at individual effect of each variable on the target variable to gain some initial hypothesis of important variables. This included crosstabs for categorical variables and a distribution plot for continuous variables grouped by the target variable value
12 | 
13 | 2) Imputed missing values using group-wise mean/mode and other evidence from pivot tables generated from the data
14 | 
15 | 3) Engineered features such as EMI to Income ratio and total income (applicant + co-applicant income)
16 | 
17 | 4) Used random forests and xgboost linear boosting models on the data. Tuned the parameters using randomized search based on 5-fold cross-validation. Picked the best model out of the two
18 | 
19 | Results:
20 | 
21 | Accuracy:
22 | 
23 | 0.79166 (XGBoost)
24 | 0.784722 (RF)
25 | 
26 | Will try genetic algorithm to select the optimal set of features for the model.
27 | 
28 | 


--------------------------------------------------------------------------------
/Lord Of The Machines/CType_vs_Prob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/CType_vs_Prob.png


--------------------------------------------------------------------------------
/Lord Of The Machines/LordOfTheMachines.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib as mlt
  4 | import matplotlib.pyplot as plt
  5 | from sklearn.preprocessing import LabelEncoder
  6 | from sklearn.linear_model import LogisticRegression
  7 | from sklearn import metrics
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | from sklearn.tree import DecisionTreeClassifier, export_graphviz
 10 | import seaborn as sns
 11 | from sklearn.metrics import roc_curve, auc
 12 | import collections
 13 | import datetime
 14 | from sklearn.feature_extraction.text import CountVectorizer
 15 | from sklearn.model_selection import train_test_split
 16 | 
 17 | 
 18 | def get_new_features(df):
 19 |     df['send_hour'] = df['send_date'].apply(lambda x: int(x[-5:-3]))
 20 |     df['send_hour_category'] = pd.cut(df['send_hour'], range(0, 30, 6), labels=['EM', 'M', 'AN', 'N'])
 21 |     df['weekday_type'] = df['send_date'].apply(
 22 |         lambda x: datetime.datetime.strptime(x[:10], '%d-%m-%Y').date().isoweekday())
 23 |     df['has_hackathon'] = df['subject'].apply(lambda x: ('Hack' in x) or ('hack' in x))
 24 |     return df
 25 | 
 26 | # Pre-processing the data
 27 | # Extract hour and weekday on which email was sent as category
 28 | # Transforming categorical variables to OHE
 29 | def pre_process(df):
 30 |     df['send_hour'] = LabelEncoder().fit_transform(df['send_hour'])
 31 |     df['send_hour_category'] = LabelEncoder().fit_transform(df['send_hour_category'])
 32 |     df['communication_type'] = LabelEncoder().fit_transform(df['communication_type'])
 33 |     df['weekday_type'] = LabelEncoder().fit_transform(df['weekday_type'])
 34 |     return df
 35 | 
 36 | 
 37 | 
 38 | # Loading the data
 39 | train = pd.read_csv('train.csv')
 40 | dfCampaign = pd.read_csv('campaign_data.csv')
 41 | test = pd.read_csv('test.csv')
 42 | 
 43 | # Join data frames to get campaign information along with the open/click value
 44 | dfTrain = train.join(dfCampaign.set_index('campaign_id'),on='campaign_id')
 45 | dfTest =  test.join(dfCampaign.set_index('campaign_id'),on='campaign_id')
 46 | 
 47 | 
 48 | print(dfTrain.columns)
 49 | # Look at all columns
 50 | '''
 51 | Index(['id', 'user_id', 'campaign_id', 'send_date', 'is_open', 'is_click',
 52 |        'communication_type', 'total_links', 'no_of_internal_links',
 53 |        'no_of_images', 'no_of_sections', 'email_body', 'subject', 'email_url'],
 54 |       dtype='object')
 55 | '''
 56 | 
 57 | 
 58 | # Engineer features
 59 | dfTrain = get_new_features(dfTrain)
 60 | dfTest = get_new_features(dfTest)
 61 | 
 62 | 
 63 | # Viewing open/click probabilities based on communication type
 64 | barwidth = 0.35
 65 | ctype_click_pivot = dfTrain.pivot_table(values='is_click',index=['communication_type'],aggfunc=('mean','count','sum')).head()
 66 | ctype_open_pivot = dfTrain.pivot_table(values='is_open',index=['communication_type'],aggfunc=('mean','count','sum')).head()
 67 | index = np.arange(len(ctype_click_pivot.index))
 68 | 
 69 | plt.bar(index+barwidth,ctype_open_pivot['mean'],barwidth,color='r',label='Open')
 70 | plt.bar(index,ctype_click_pivot['mean'],barwidth,color='b',label='Click')
 71 | 
 72 | plt.ylabel('Click/Open Probability Per Communication Type')
 73 | plt.xlabel('Communication Type')
 74 | plt.title('Probability Of Clicking/Opening For Each Communication Type')
 75 | plt.xticks(index+barwidth,ctype_click_pivot.index)
 76 | plt.legend()
 77 | plt.tight_layout()
 78 | plt.savefig('CType_vs_Prob.png')
 79 | 
 80 | # Viewing open/click probabilities based on send hour
 81 | barwidth = 0.35
 82 | hour_click_pivot = dfTrain.pivot_table(values='is_click',index=['send_hour'],aggfunc=('mean','count','sum'))
 83 | hour_open_pivot = dfTrain.pivot_table(values='is_open',index=['send_hour'],aggfunc=('mean','count','sum'))
 84 | index = np.arange(len(hour_click_pivot.index))
 85 | 
 86 | plt.bar(index+barwidth,hour_open_pivot['mean'],barwidth,color='r',label='Open')
 87 | plt.bar(index,hour_click_pivot['mean'],barwidth,color='b',label='Click')
 88 | 
 89 | plt.ylabel('Click/Open Probability Per Send Hour')
 90 | plt.xlabel('Send Hour')
 91 | plt.title('Probability Of Clicking/Opening For Each Send Hour')
 92 | plt.xticks(index+barwidth,hour_click_pivot.index)
 93 | plt.legend()
 94 | plt.tight_layout()
 95 | plt.savefig('SendHour_vs_Prob.png')
 96 | 
 97 | 
 98 | # Viewing open/click probabilities based on day of the week
 99 | barwidth = 0.35
100 | wd_click_pivot = dfTrain.pivot_table(values='is_click',index=['weekday_type'],aggfunc=('mean','count','sum'))
101 | wd_open_pivot = dfTrain.pivot_table(values='is_open',index=['weekday_type'],aggfunc=('mean','count','sum'))
102 | index = np.arange(len(wd_click_pivot.index))
103 | 
104 | 
105 | plt.bar(index+barwidth,wd_open_pivot['mean'],barwidth,color='r',label='Open')
106 | plt.bar(index,wd_click_pivot['mean'],barwidth,color='b',label='Click')
107 | 
108 | plt.ylabel('Click/Open Probability Per Weekday')
109 | plt.xlabel('Weekday')
110 | plt.title('Probability Of Clicking/Opening For Each Weekday')
111 | plt.xticks(index+barwidth,wd_click_pivot.index)
112 | plt.legend()
113 | plt.tight_layout()
114 | plt.savefig('WeekdayType_vs_Prob.png')
115 | 
116 | 
117 | 
# Label-encode the categorical columns of train and test
dfTrain = pre_process(dfTrain)
dfTest = pre_process(dfTest)


# Add placeholder targets to dfTest so train and test can be concatenated
dfTest['is_click'] = 0
dfTest['is_open'] = 0
dfCombined = pd.concat([dfTrain, dfTest])

# Hold-out split, used while doing cross-validation
trainData, valData, trainLabels, valLabels = train_test_split(dfTrain, dfTrain[['is_click', 'is_open']])

# Click and open statistics (mean, sum, count) per UserID
clickProb = dfTrain.pivot_table(values='is_click', index=['user_id'], aggfunc=('mean', 'sum', 'count'))
openProb = dfTrain.pivot_table(values='is_open', index=['user_id'], aggfunc=('mean', 'sum', 'count'))

# Click and open statistics per Communication Type (fallback for unseen users)
typeOpenProb = dfTrain.pivot_table(values='is_open', index=['communication_type'], aggfunc=('mean', 'sum', 'count'))
typeClickProb = dfTrain.pivot_table(values='is_click', index=['communication_type'], aggfunc=('mean', 'sum', 'count'))

# Probability of each campaign being sent, over the combined train and test data
campaignTrainProb = dfCombined['campaign_id'].value_counts() / len(dfCombined.index)

# Probability of each communication type, over the combined data. The counts
# come from dfCombined, so the denominator must be the combined row count too
# (it was len(dfTrain.index), which made the values sum to more than 1).
campaignTrainProbType = dfCombined['communication_type'].value_counts() / len(dfCombined.index)

# Click rate per (user, campaign) pair in the train data
campaignGivenClickPerUser = dfTrain.pivot_table(values='is_click', index='user_id', columns=['campaign_id'], aggfunc='mean')

# Everything below is conditioned on a click, so select the clicked rows once
clickedTrain = dfTrain[dfTrain['is_click'] == 1]
totalClicks = len(clickedTrain)

# Probability of each campaign given that the user has clicked
campaignTrainProbGivenClick = clickedTrain['campaign_id'].value_counts() / totalClicks

# Probability of each campaign feature value given that the user has clicked
ccIL = clickedTrain['no_of_internal_links'].value_counts() / totalClicks
ccNS = clickedTrain['no_of_sections'].value_counts() / totalClicks
ccNI = clickedTrain['no_of_images'].value_counts() / totalClicks
ccTL = clickedTrain['total_links'].value_counts() / totalClicks
ccCT = clickedTrain['communication_type'].value_counts() / totalClicks
161 | 
162 | 
163 | # Naive Bayes Estimates
164 | # Calculate probability from above calculated data frames if available else use communication type based or overall mean
165 | # as a proxy
166 | 
def NBEstimate_1(x):
    """Naive Bayes click estimate that treats the campaign id as the evidence.

    Returns ``P(campaign | click) * P(click) / P(campaign)`` for one row.

    Reads the module-level tables ``campaignTrainProb``, ``clickProb``,
    ``typeClickProb``, ``campaignGivenClickPerUser`` and
    ``campaignTrainProbGivenClick``.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``campaign_id``, ``user_id`` and ``communication_type``.

    Returns
    -------
    float
        The unnormalised posterior estimate.

    Raises
    ------
    KeyError
        If the campaign is missing from a table that is looked up for it.
    """
    campaign = x['campaign_id']
    user = x['user_id']
    comm_type = x['communication_type']
    known_user = user in clickProb.index

    pCampaign = campaignTrainProb.loc[campaign]

    # P(click): the user's own click rate, else the rate of the communication
    # type, else 0. (.at replaces get_value, which pandas 1.0 removed.)
    if known_user:
        pClick = clickProb.at[user, 'mean']
    elif comm_type in typeClickProb.index:
        pClick = typeClickProb.at[comm_type, 'mean']
    else:
        pClick = 0.0

    # P(campaign | click): the user's click rate on this campaign when it is
    # available, otherwise the campaign's share of all clicks.
    if known_user:
        pCC = campaignGivenClickPerUser.at[user, campaign]
        if np.isnan(pCC):
            pCC = campaignTrainProbGivenClick.loc[campaign]
    else:
        pCC = campaignTrainProbGivenClick.loc[campaign]
    return (pCC * pClick) / pCampaign
194 | 
195 | 
def _feature_prob_given_click(table, value):
    """Return ``table``'s entry for ``value``, or the table mean when absent."""
    if value in table.index:
        return table.at[value]
    return table.mean()


def NBEstimate_2(x):
    """Naive Bayes click estimate that describes a campaign by its features.

    ``P(campaign | click)`` is taken as the product of the per-feature
    probabilities from ``ccIL``, ``ccNI``, ``ccNS``, ``ccTL`` and ``ccCT``; a
    feature value missing from its table contributes the table's mean.
    Returns ``P(campaign | click) * P(click) / P(campaign)`` for one row.

    Also reads the module-level tables ``campaignTrainProb``, ``clickProb``
    and ``typeClickProb``.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``campaign_id``, ``user_id``, ``communication_type``,
        ``no_of_internal_links``, ``no_of_images``, ``no_of_sections`` and
        ``total_links``.

    Returns
    -------
    float
        The unnormalised posterior estimate.

    Raises
    ------
    KeyError
        If ``campaign_id`` is not in ``campaignTrainProb``.
    """
    user = x['user_id']
    comm_type = x['communication_type']

    pCampaign = campaignTrainProb.loc[x['campaign_id']]

    # P(click): the user's own click rate, else the rate of the communication
    # type, else 0. (.at replaces get_value, which pandas 1.0 removed.)
    if user in clickProb.index:
        pClick = clickProb.at[user, 'mean']
    elif comm_type in typeClickProb.index:
        pClick = typeClickProb.at[comm_type, 'mean']
    else:
        pClick = 0.0

    # Features are assumed independent, so their probabilities multiply
    pCC = 1.0
    for table, value in ((ccIL, x['no_of_internal_links']),
                         (ccNI, x['no_of_images']),
                         (ccNS, x['no_of_sections']),
                         (ccTL, x['total_links']),
                         (ccCT, comm_type)):
        pCC = pCC * _feature_prob_given_click(table, value)
    return (pCC * pClick) / pCampaign
221 | 
222 | 
# Click Probability for each UserID
# Use mean click probability for a particular communication type in case of new UserID
def cpEstimate(x):
    """Return the click probability estimate for one row.

    Looks up the user's mean click rate in the module-level ``clickProb``
    table; for a user that is not in it, falls back to the mean click rate of
    the row's communication type in ``typeClickProb``, and to 0.0 when that is
    unknown as well.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``user_id`` and ``communication_type``.
    """
    user = x['user_id']
    if user in clickProb.index:
        # .at replaces DataFrame.get_value, which pandas 1.0 removed
        return clickProb.at[user, 'mean']
    comm_type = x['communication_type']
    if comm_type in typeClickProb.index:
        return typeClickProb.at[comm_type, 'mean']
    return 0.0
233 | 
# Use mean open probability for a particular communication type in case of new UserID
def opEstimate(x):
    """Return the open probability estimate for one row.

    Looks up the user's mean open rate in the module-level ``openProb`` table;
    for a user that is not in it, falls back to the mean open rate of the
    row's communication type in ``typeOpenProb``, and to 0.0 when that is
    unknown as well.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide ``user_id`` and ``communication_type``.
    """
    user = x['user_id']
    if user in openProb.index:
        # .at replaces DataFrame.get_value, which pandas 1.0 removed
        return openProb.at[user, 'mean']
    comm_type = x['communication_type']
    if comm_type in typeOpenProb.index:
        return typeOpenProb.at[comm_type, 'mean']
    return 0.0
243 | 
# Per-row click ('cp') and open ('op') probability features, train first, then test
for frame in (dfTrain, dfTest):
    frame['cp'] = frame.apply(cpEstimate, axis=1)
    frame['op'] = frame.apply(opEstimate, axis=1)

# Naive Bayes predictions, test first, then train
for frame in (dfTest, dfTrain):
    frame['predictionNB'] = frame.apply(NBEstimate_2, axis=1)


# Stacking: logistic regression on the open/click probabilities and the NB estimate.
# The same model object is refitted for each feature set; clicks are weighted 100:1.
model = LogisticRegression(class_weight={0:1,1:100},max_iter=100)
outcome = 'is_click'
feature_sets = (
    ('predictionLR1', ['op','cp','predictionNB']),
    ('predictionLR2', ['op','cp']),
    ('predictionLR3', ['predictionNB','cp']),
)
for output_column, predictors in feature_sets:
    modelFit = model.fit(dfTrain[predictors], dfTrain[outcome])
    prediction = model.predict_proba(dfTest[predictors])
    # Second column of predict_proba is the probability of class 1 (click)
    dfTest[output_column] = prediction[:, 1]


# Validation AUC check used during development:
# fpr,tpr,_ = roc_curve(valLabels['is_click'],valData['predictionNB'])
# print(auc(fpr,tpr))

# Weighted average ensemble of the Naive Bayes and Logistic Regression outputs
ensemble_weights = {
    'predictionLR1': 0.4,
    'predictionLR2': 0.4,
    'predictionLR3': 0.1,
    'predictionNB': 0.1,
}
dfTest['is_click'] = sum(weight * dfTest[column] for column, weight in ensemble_weights.items())
# Earlier blend: (0.3*dfTest['predictionNB']+0.7*dfTest['predictionLR'])
dfTest[['id','is_click']].to_csv('FinalSubmission.csv',index=False)
282 | 
283 | 
284 | 
285 | # Attempt at using Field-Aware Factorization Machine using xlearn package
286 | #
287 | #
288 | # dfCombined = pd.concat([dfTrain,dfTest])
289 | # x = dfCombined
290 | # unwanted = ['campaign_id','email_body','email_url','has_hackathon','id','predictionNB','send_date','send_hour_category',
291 | #             'subject','user_id','is_open']
292 | # x.drop(unwanted,inplace=True,axis=1)
293 | #
294 | # features = list(x.columns)
295 | # features.remove('is_click')
296 | # x.loc[:,'no_of_images'] = pd.cut(x['no_of_images'],6,labels=False)
297 | # x.loc[:,'no_of_internal_links'] = pd.cut(x['no_of_internal_links'],6,labels=False)
298 | # x.loc[:,'no_of_sections'] = pd.cut(x['no_of_sections'],6,labels=False)
299 | # x.loc[:,'total_links'] = pd.cut(x['total_links'],6,labels=False)
300 | # x.loc[:,'op'] = pd.cut(x['op'],6,labels=False)
301 | # x.loc[:,'cp'] = pd.cut(x['cp'],6,labels=False)
302 | #
303 | # test = x[dfTrain.shape[0]:].copy()
304 | # train = x[:dfTrain.shape[0]].copy()
305 | # train = train.sample(frac=1).reset_index(drop=True)
306 | # categories = features.copy()
307 | # numerics = []
308 | # # FFM TRY
309 | # def convert_to_ffm(df, type, numerics, categories, features):
310 | #     currentcode = len(numerics)
311 | #     catdict = {}
312 | #     catcodes = {}
313 | #     # Flagging categorical and numerical fields
314 | #     for x in numerics:
315 | #         catdict[x] = 0
316 | #     for x in categories:
317 | #         catdict[x] = 1
318 | #
319 | #     nrows = df.shape[0]
320 | #     ncolumns = len(features)
321 | #     with open(str(type) + "_ffm.txt", "w") as text_file:
322 | #
323 | #     # Looping over rows to convert each row to libffm format
324 | #         for n, r in enumerate(range(nrows)):
325 | #             datastring = ""
326 | #             datarow = df.iloc[r].to_dict()
327 | #             datastring += str(int(datarow['is_click']))
328 | #             # For numerical fields, we are creating a dummy field here
329 | #             for i, x in enumerate(catdict.keys()):
330 | #                 if (catdict[x] == 0):
331 | #                     datastring = datastring + " " + str(i) + ":" + str(i) + ":" + str(datarow[x])
332 | #                 else:
333 | #                     # For a new field appearing in a training example
334 | #                     if (x not in catcodes):
335 | #                         catcodes[x] = {}
336 | #                         currentcode += 1
337 | #                         catcodes[x][datarow[x]] = currentcode  # encoding the feature
338 | #                         # For already encoded fields
339 | #                     elif (datarow[x] not in catcodes[x]):
340 | #                         currentcode += 1
341 | #                         catcodes[x][datarow[x]] = currentcode  # encoding the feature
342 | #                     code = catcodes[x][datarow[x]]
343 | #                     datastring = datastring + " " + str(i) + ":" + str(int(code)) + ":1"
344 | #
345 | #         datastring += '\n'
346 | #         text_file.write(datastring)
347 | #
348 | # import xlearn as xl
349 | # convert_to_ffm(train,'train',numerics,categories,features)
350 | # convert_to_ffm(test,'test',numerics,categories,features)


--------------------------------------------------------------------------------
/Lord Of The Machines/README.md:
--------------------------------------------------------------------------------
 1 | # Lord Of The Machines
 2 | ## Data Science Hackathon conducted by Analytics Vidhya
 3 | 
 4 | (Contest Link: https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/)
 5 | 
 6 | Team: alpha1995
 7 | 
 8 | Public LB Score: 0.6837
 9 | 
10 | Private LB Score: 0.6786
11 | 
12 | Overall Rank: 19/153
13 | 
14 | Sharing our solution to the Lord Of The Machines data science hackathon conducted by Analytics Vidhya
15 | The problem involved predicting the click probability of links inside a mailer for email campaigns from January 2018 to March 2018.
16 | 
17 | ### Dataset
18 | Details can be found in the contest link mentioned above. We are given details about when a campaign was sent to a particular user, some information about the campaign such as number of links, images, subject, body etc. and whether the user opened and/or clicked the link.
19 | We are supposed to predict the probability of user clicking on a link.
20 | 
21 | ### Data Processing & Exploring
22 | We started by joining the train and test dataset with the campaign data and then engineered the following features:
23 | 
24 | (1) Hour when email was sent (send_hour)
25 | 
26 | (2) Day of the week when email was sent (weekday_type)
27 | 
28 | This was followed by visualizing the probability of opening/clicking of links based on:
29 | (1) Communication Type
30 | 
31 |  ![CType](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/CType_vs_Prob.png)
32 | 
33 | (2) Send Hour
34 | 
35 | ![SendHour](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/SendHour_vs_Prob.png)
36 | 
37 | (3) Day Of The Week
38 | 
39 | ![WeekdayType](https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Lord%20Of%20The%20Machines/WeekdayType_vs_Prob.png)
40 | 
41 | 
42 | Our first attempt was to calculate the probability that a user will open or click a link based on UserID for each user. For UserID's present in test set that are not present in the training set, I used the mean probability of opening/clicking across all campaigns having the same communication_type. This gave a public leaderboard score of 0.5714.
43 | 
44 | We then created a basic logistic regression model based on the open and click probabilities(OP and CP) so calculated. This gave a public leaderboard score of 0.6821.
45 | 
46 | We then generated estimates based on Naive Bayes(NB).
47 | P(Click|Campaign) = P(Campaign|Click) * P(Click) / P(Campaign)
48 | The above were calculated as follows:
49 | 
50 | 
51 | Probabilities were calculated and replaced by aggregated proxies when certain values were missing (for example new UserID's in test set).
52 | 
53 | 
54 | P(Click): Number of clicks / Number of emails sent to a particular UserID. In case of a new UserID, the mean probability for the email's Communication Type was used as a proxy
55 | 
56 | P(Campaign): Number of emails for given Campaign_ID / Total number of emails sent (calculated over both train and test)
57 | 
58 | P(Campaign|Click): This is the most interesting part. We represent a campaign by its features (number of links, images, sections, communication_type etc.). Numerical features are converted to categorical using cut. Let's denote these features as k1, k2 etc.
59 | Therefore, each campaign is [k1, k2, ... kn].
60 | 
61 | Assuming independence of features,
62 | 
63 | P([k1, k2, k3 ... kn] | Click) = P([k1] | Click) * P([k2] | Click) ... P([kn] | Click)
64 | 
65 | P([ki]|Click) is simply the number of times a campaign had feature ki across all campaigns sent to the user. This way we were able to incorporate the features of a campaign into the naive bayes estimate.
66 | 
67 | 
68 | The Naive Bayes method gave a public leaderboard score of 0.644.
69 | 
70 | We finally used the Naive Bayes estimates as another feature for the Logistic Regression model and created an ensemble out of the following models:
71 | 
72 | 1) Naive Bayes
73 | 
74 | 2) Logistic Regression (OP , CP , NB)
75 | 
76 | 3) Logistic Regression (OP , CP)
77 | 
78 | 4) Logistic Regression (CP , NB)
79 | 
80 | A weighted average of the above produced our final output having a public leaderboard score of 0.6837 and a private leaderboard score of 0.6786.
81 | 
82 | Adding communication type, number of links, images etc. directly as features did not seem to increase our score, hence they were not used as features in the logistic regression model. Instead, the NB estimate calculated using these features was incorporated.
83 | 
84 | ### Techniques Not Tried
85 | - Target encoding of categorical variables
86 | - Aggregating text features of email to create a user profile feature
87 | - Features such as time between emails
88 | 
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/Lord Of The Machines/SendHour_vs_Prob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/SendHour_vs_Prob.png


--------------------------------------------------------------------------------
/Lord Of The Machines/WeekdayType_vs_Prob.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/50659f381a773d2bed9bae722c535d734b0f1e6b/Lord Of The Machines/WeekdayType_vs_Prob.png


--------------------------------------------------------------------------------
/Mckinskey Healthcare Hackathon/README.md:
--------------------------------------------------------------------------------
 1 | # Mckinskey Healthcare Hackathon by Analytics Vidhya
 2 | ## Stroke Prediction - Data Science Hackathon
 3 | 
 4 | (Contest Link: https://datahack.analyticsvidhya.com/contest/mckinsey-analytics-online-hackathon/)
 5 | 
 6 | Sharing my solution to the healthcare hackathon organized by Mckinskey on Analytics Vidhya. We were given several health, demographic and lifestyle details about patients including details such as age and gender, along with several health parameters (e.g. hypertension, body mass index) and lifestyle related variables (e.g. smoking status, occupation type). 
 7 | We were supposed to predict the probability of a stroke happening to a patient. This would help doctors take proactive health measures for these patients.
 8 | 
 9 | Evaluation metric was AUC ROC
10 | 
11 | Public LB Score: 0.8278
12 | 
13 | Private LB Score: 0.8577
14 | 
15 | Overall Rank: 28/584
16 | 
17 | ### Dataset
18 | Details regarding the dataset can be found on the contest site. We were given details of a patient such as smoking status, gender, health conditions, lifestyle conditions etc. and were supposed to predict the probability of suffering a stroke for that patient.
19 | 
20 | ### Pre-processing & EDA
21 | Exploratory data analysis has been presented in an IPython notebook here https://github.com/tusharsircar95/Data-Science-Hackathons-Analytics-Vidhya/blob/master/Mckinskey%20Healthcare%20Hackathon/EDA_MckinskeyHealthcareHackathon_AV.ipynb
22 | I looked at the distribution of all variables, the possible values they can take and how they affect the target variable. Also, went through some studies that indicate which factors are responsible for strokes. Based on that narrowed down a few features that seemed to be important.
23 | 
24 | Next, I imputed missing values for smoking_status and bmi using a group based mode and median respectively. I created a new feature HTpHD which is the sum of heart_disease and hypertension. The intuition behind this was to create a variable denoting the severeness of a disease affecting the patient.
25 | 
26 | ### Modelling
27 | 
28 | I have used logistic regression, xgboost trees and catboost models. I tuned the parameters for each using randomized search with 5-fold cross-validation. Features were selected based on how they affected the cross-validation accuracy.
29 | 
30 | To handle the class imbalance I have used class weights inversely proportional to the frequency of that class.
31 | 
32 | Final features selected were: ['smoking_status', 'age', 'heart_disease', 'hypertension', 'HTpHD', 'avg_glucose_level', 'gender']
33 | 
34 | A simple average of the best performing models of each classifier was taken as the final submission.
35 | 
36 | NOTE: I think the cross-validation strategy helped quite a lot. Even though it gave an okayish public leaderboard score, the private leaderboard score went up drastically.
37 | 
38 | #### Techniques Not Tried
39 | - Thought about using pair-wise features such as Gender+BMI and Gender+Hypertension but didn't implement it.
40 | - Also, categorical variables such as (obese, underweight, overweight) or (glucose levels normal, abnormal) could have been used
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 


--------------------------------------------------------------------------------
/Mckinskey Healthcare Hackathon/script.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib as mlt
  4 | import matplotlib.pyplot as plt
  5 | from sklearn.preprocessing import LabelEncoder
  6 | from sklearn.linear_model import LogisticRegression,LinearRegression
  7 | from sklearn import metrics
  8 | from sklearn.ensemble import RandomForestClassifier
  9 | from sklearn.tree import DecisionTreeClassifier, export_graphviz
 10 | import seaborn as sns
 11 | from sklearn.metrics import roc_curve, auc,accuracy_score,confusion_matrix,make_scorer
 12 | import collections
 13 | import datetime
 14 | from sklearn.feature_extraction.text import CountVectorizer
 15 | from sklearn.neural_network import MLPClassifier
 16 | from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold,RandomizedSearchCV
 17 | from xgboost import XGBClassifier,plot_importance
 18 | from matplotlib import pyplot
 19 | from catboost import CatBoostClassifier
 20 | 
 21 | def LR(train,test,params):
 22 |     X_Train,X_Val,Y_Train,Y_Val = train_test_split(train,train.stroke,random_state=101)
 23 |     model = LogisticRegression(class_weight='balanced',random_state=101)
 24 |     model.fit(X_Train[params],Y_Train)
 25 | 
 26 |     prediction = model.predict_proba(X_Val[params])
 27 |     prediction = prediction[:,1]
 28 |     print('Validation Accuracy: ')
 29 |     fpr,tpr,_ = roc_curve(Y_Val, prediction)
 30 |     print(auc(fpr,tpr))
 31 | 
 32 |     prediction = model.predict_proba(X_Train[params])
 33 |     prediction = prediction[:, 1]
 34 |     print('Train Accuracy: ')
 35 |     fpr, tpr, _ = roc_curve(Y_Train, prediction)
 36 |     print(auc(fpr, tpr))
 37 | 
 38 |     model = LogisticRegression(class_weight='balanced',random_state=101)
 39 |     model.fit(train[params], train.stroke)
 40 |     prediction = model.predict_proba(test[params])
 41 |     prediction = prediction[:,1]
 42 |     test.loc[:,'stroke'] = prediction
 43 |     test[['id','stroke']].to_csv('output_lr.csv',index=False)
 44 | 
 45 | def XGB(train,test,params):
 46 |     parameters = {
 47 |         'min_child_weight': [5, 10],
 48 |         'gamma': [0.5, 2, 5],
 49 |         'subsample': [0.4, 0.6, 0.8, 1.0],
 50 |         'colsample_bytree': [0.6, 0.8, 1.0],
 51 |         'max_depth': [3, 5]
 52 |     }
 53 |     folds = 5
 54 |     skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
 55 |     model = XGBClassifier(n_estimators=100, learning_rate=0.02, objective='binary:logistic',
 56 |                           silent=True, nthread=1)
 57 | 
 58 |     # define scoring function
 59 |     def custom_auc(ground_truth, predictions):
 60 |         # I need only one column of predictions["0" and "1"]. You can get an error here
 61 |         # while trying to return both columns at once
 62 |         fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1)
 63 |         return auc(fpr, tpr)
 64 | 
 65 |         # to be standart sklearn's scorer
 66 | 
 67 |     my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)
 68 | 
 69 |     search = RandomizedSearchCV(model, parameters,
 70 |                           cv=skf.split(train[params], train.stroke),scoring=my_auc,n_iter=15)
 71 |     search.fit(train[params],train.stroke)
 72 | 
 73 |     print('Best Score: ' + str(search.best_score_))
 74 |     print('Best Params: ')
 75 |     print(search.best_params_)
 76 | 
 77 |     prediction = search.predict_proba(test[params])
 78 |     prediction = prediction[:, 1]
 79 |     test.loc[:,'stroke'] = prediction
 80 |     test[['id','stroke']].to_csv('output_xgb.csv',index=False)
 81 | 
 82 | def CB(train,test,params,idx):
 83 | 
 84 |     parameters = {
 85 |         'iterations': [20,50,100],
 86 |         'learning_rate': [0.1, 0.01, 0.001],
 87 |         'depth': [3, 5, 7],
 88 |     }
 89 |     folds = 5
 90 |     skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
 91 |     model = CatBoostClassifier(loss_function='Logloss')
 92 | 
 93 |     # define scoring function
 94 |     def custom_auc(ground_truth, predictions):
 95 |         # I need only one column of predictions["0" and "1"]. You can get an error here
 96 |         # while trying to return both columns at once
 97 |         fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1)
 98 |         return auc(fpr, tpr)
 99 | 
100 |         # to be standart sklearn's scorer
101 | 
102 |     my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)
103 | 
104 |     search = RandomizedSearchCV(model, parameters,
105 |                           cv=skf.split(train[params], train.stroke),scoring=my_auc,n_iter=15)
106 |     search.fit(train[params],train.stroke,cat_features=idx)
107 | 
108 |     print('Best Score: ' + str(search.best_score_))
109 |     print('Best Params: ')
110 |     print(search.best_params_)
111 | 
112 |     prediction = search.predict_proba(test[params])
113 |     prediction = prediction[:, 1]
114 |     test.loc[:,'stroke'] = prediction
115 |     test[['id','stroke']].to_csv('output_cb.csv',index=False)
116 | 
117 | 
def preprocess_data(train,test):
    """Engineer features and impute missing values on train and test jointly.

    Added columns: ``age2``, ``age3``, ``HTHD`` (product of heart_disease
    and hypertension), ``HTpHD`` (their sum), ``age_group`` and ``bmi2``.

    Imputation:
      * ``smoking_status`` is forced to ``'never smoked'`` for age < 10.
        Remaining missing values get the most frequent observed status of
        the row's (gender, age rounded to the nearest ten) group; observed
        values are left untouched.
      * Missing ``bmi`` gets the median observed bmi of the row's rounded
        age group.
      Groups with no observed value fall back to the overall mode/median.

    Args:
        train: DataFrame with at least age, gender, heart_disease,
            hypertension, smoking_status and bmi columns.
        test: DataFrame with the same feature columns.

    Returns:
        ``(train, test)`` as new DataFrames carrying the row labels of the
        inputs; the inputs themselves are not modified.
    """
    n_train = len(train)
    # Train and test normally share row labels (0..n-1 each). A fresh unique
    # index keeps the masked assignments below unambiguous; the callers'
    # labels are restored before returning.
    combined = pd.concat([train, test], ignore_index=True)

    combined['age2'] = combined.age * combined.age
    combined['age3'] = combined.age * combined.age * combined.age
    combined['HTHD'] = combined.heart_disease * combined.hypertension
    combined['HTpHD'] = combined.heart_disease + combined.hypertension

    # 'Low' only survives for rows whose age is missing.
    combined['age_group'] = 'Low'
    combined.loc[combined.age < 40, 'age_group'] = 'Children'
    combined.loc[(combined.age >= 40) & (combined.age < 65), 'age_group'] = 'Adults'
    combined.loc[combined.age >= 65, 'age_group'] = 'Elderly'

    age_decade = np.round(combined.age, -1)

    combined.loc[combined.age < 10, 'smoking_status'] = 'never smoked'
    smoke_missing = combined.smoking_status.isnull()
    if smoke_missing.any() and not smoke_missing.all():
        observed = ~smoke_missing
        smoke_counts = pd.crosstab(
            [combined.gender[observed], age_decade[observed]],
            combined.smoking_status[observed])
        # idxmax returns the column *label* (the status string); ties go to
        # the first column, as np.argmax did on the normalised table.
        group_mode = smoke_counts.idxmax(axis=1).to_dict()
        overall_mode = combined.smoking_status.mode().iloc[0]
        combined.loc[smoke_missing, 'smoking_status'] = [
            group_mode.get(key, overall_mode)
            for key in zip(combined.gender[smoke_missing], age_decade[smoke_missing])
        ]

    bmi_missing = combined.bmi.isnull()
    if bmi_missing.any():
        decade_median = combined.bmi.groupby(age_decade).median()
        imputed = age_decade[bmi_missing].map(decade_median)
        combined.loc[bmi_missing, 'bmi'] = imputed.fillna(combined.bmi.median())
    combined['bmi2'] = combined.bmi * combined.bmi

    train_out = combined.iloc[:n_train].copy()
    test_out = combined.iloc[n_train:].copy()
    train_out.index = train.index
    test_out.index = test.index
    return train_out, test_out
141 | 
def OHE(train,test):
    """Integer-encode the categorical columns of train and test together.

    Despite the name, this is label encoding rather than one-hot encoding:
    each listed column is replaced by the integer codes produced by a
    fresh ``LabelEncoder`` fitted on the stacked train + test values, so
    both frames share one code mapping per column.

    Args:
        train: DataFrame containing the columns listed below.
        test: DataFrame containing the same columns.

    Returns:
        ``(train, test)`` slices of the encoded, stacked frame.
    """
    stacked = pd.concat([train, test])
    encoded_columns = [
        'Residence_type',
        'smoking_status',
        'age_group',
        'work_type',
        'HTpHD',
        'gender',
        'ever_married',
    ]
    for column in encoded_columns:
        stacked[column] = LabelEncoder().fit_transform(stacked[column])

    n_train = len(train)
    return stacked[:n_train], stacked[n_train:]
152 | 
153 | 
# ---- Driver: preprocess, train the three models, average their outputs ----
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train, test = preprocess_data(train, test)

# CatBoost runs before label encoding; the index list is handed to CB as
# its ``idx`` argument and refers to positions within cb_features.
cb_features = ['age', 'heart_disease', 'hypertension', 'HTpHD',
               'avg_glucose_level', 'gender', 'smoking_status']
CB(train, test, cb_features, [1, 2, 3, 5, 6])

# Logistic regression and XGBoost are trained on the label-encoded frames.
train, test = OHE(train, test)
encoded_features = ['smoking_status', 'age', 'heart_disease', 'hypertension',
                    'HTpHD', 'avg_glucose_level', 'gender']
LR(train, test, encoded_features)
XGB(train, test, encoded_features)

# Final submission: unweighted mean of the three per-model prediction files.
cb_scores = pd.read_csv('output_cb.csv').stroke
lr_scores = pd.read_csv('output_lr.csv').stroke
xgb_scores = pd.read_csv('output_xgb.csv').stroke
test.loc[:, 'stroke'] = (cb_scores + lr_scores + xgb_scores) / 3.0
test[['id', 'stroke']].to_csv('final_output.csv', index=False)
171 | 
172 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Solutions to Data Science Hackathons conducted by Analytics Vidhya
 2 | 
 3 | The solution to the following contests have been added:
 4 | 
 5 | - Lord Of The Machines
 6 | 
 7 | - Mckinskey Healthcare Hackathon
 8 | 
 9 | - Loan Prediction III
10 | 


--------------------------------------------------------------------------------