├── AV_BlackFridayHack ├── finalModel.py ├── prepData.py └── readme.md ├── AV_ChurnPrediction_Nov2017 ├── buildModel.py └── readme.md ├── AV_ClubMahindra_May2019 ├── FeatureEngg.ipynb ├── ModelBuild.ipynb └── README.md ├── AV_DHS_2017 ├── Exploratory_Data_Analysis.ipynb ├── FeatureEngineering_Walkthrough.ipynb ├── Feature_Engineering.ipynb ├── Modeling.ipynb ├── Stacking_Walkthrough.ipynb └── readme.md ├── AV_DHS_2018 ├── DataExploration.ipynb └── readme.md ├── AV_Genpact_2018 ├── final_model.py └── readme.md ├── AV_Hack3 ├── buildModel.py └── readme.md ├── AV_Hackathon_July11 ├── benchmark.R ├── benchmark.py └── readme.md ├── AV_Knocktober ├── DataExploration.ipynb ├── getOutcome.py ├── readme.md ├── srk_final.py └── vopani_final.R ├── AV_LTFS_April2019 ├── ModelBuild.ipynb └── README.md ├── AV_LordOfTheMachines ├── Explorations.ipynb ├── build_model.py ├── build_model_xgb.py ├── ensemble.py └── readme.md ├── AV_MiniHack1 ├── model_ens.py ├── model_lr.py ├── model_xgb.py └── readme.md ├── AV_MiniHack2_SimpleBuy ├── finalModel.py └── readme.md ├── AV_SmartRecruits ├── finalModel.py └── readme.md ├── AV_TheSeersAccuracy ├── createFeatures.py ├── finalModel.py ├── readme.md └── splitDevVal.py ├── LICENSE └── README.md /AV_BlackFridayHack/finalModel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.cross_validation import KFold 5 | from sklearn import ensemble 6 | from sklearn import metrics 7 | from sklearn.preprocessing import LabelEncoder 8 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 9 | import xgboost as xgb 10 | 11 | gender_dict = {'F':0, 'M':1} 12 | age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6} 13 | city_dict = {'A':0, 'B':1, 'C':2} 14 | stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4} 15 | 16 | def runXGB(train_X, train_y, test_X): 17 | params = {} 18 | params["objective"] = "reg:linear" 19 | params["eta"] = 0.03 20 | params["min_child_weight"] = 10 21 | params["subsample"] = 0.8 22 | params["colsample_bytree"] = 0.7 23 | params["silent"] = 1 24 | params["max_depth"] = 10 25 | #params["max_delta_step"]=2 26 | params["seed"] = 0 27 | #params['eval_metric'] = "auc" 28 | plst = list(params.items()) 29 | num_rounds = 1100 30 | 31 | xgtrain = xgb.DMatrix(train_X, label=train_y) 32 | xgtest = xgb.DMatrix(test_X) 33 | model = xgb.train(plst, xgtrain, num_rounds) 34 | pred_test_y = model.predict(xgtest) 35 | return pred_test_y 36 | 37 | def getCountVar(compute_df, count_df, var_name): 38 | grouped_df = count_df.groupby(var_name) 39 | count_dict = {} 40 | for name, group in grouped_df: 41 | count_dict[name] = group.shape[0] 42 | 43 | count_list = [] 44 | for index, row in compute_df.iterrows(): 45 | name = row[var_name] 46 | count_list.append(count_dict.get(name, 0)) 47 | return count_list 48 | 49 | def getPurchaseVar(compute_df, purchase_df, var_name): 50 | grouped_df = purchase_df.groupby(var_name) 51 | min_dict = {} 52 | max_dict = {} 53 | mean_dict = {} 54 | twentyfive_dict = {} 55 | seventyfive_dict = {} 56 | for name, group in grouped_df: 57 | min_dict[name] = min(np.array(group["Purchase"])) 58 | max_dict[name] = max(np.array(group["Purchase"])) 59 | mean_dict[name] = np.mean(np.array(group["Purchase"])) 60 | twentyfive_dict[name] = np.percentile(np.array(group["Purchase"]),25) 61 | seventyfive_dict[name] = np.percentile(np.array(group["Purchase"]),75) 62 | 63 | min_list = [] 64 | max_list = [] 65 
| mean_list = [] 66 | twentyfive_list = [] 67 | seventyfive_list = [] 68 | for index, row in compute_df.iterrows(): 69 | name = row[var_name] 70 | min_list.append(min_dict.get(name,0)) 71 | max_list.append(max_dict.get(name,0)) 72 | mean_list.append(mean_dict.get(name,0)) 73 | twentyfive_list.append( twentyfive_dict.get(name,0)) 74 | seventyfive_list.append( seventyfive_dict.get(name,0)) 75 | 76 | return min_list, max_list, mean_list, twentyfive_list, seventyfive_list 77 | 78 | 79 | if __name__ == "__main__": 80 | data_path = "../Data/" 81 | train_file = data_path + "train_mod.csv" 82 | test_file = data_path + "test_mod.csv" 83 | 84 | train_df = pd.read_csv(train_file) 85 | test_df = pd.read_csv(test_file) 86 | print train_df.shape, test_df.shape 87 | 88 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "User_ID") 89 | train_df["User_ID_MinPrice"] = min_price_list 90 | train_df["User_ID_MaxPrice"] = max_price_list 91 | train_df["User_ID_MeanPrice"] = mean_price_list 92 | train_df["User_ID_25PercPrice"] = twentyfive_price_list 93 | train_df["User_ID_75PercPrice"] = seventyfive_price_list 94 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "User_ID") 95 | test_df["User_ID_MinPrice"] = min_price_list 96 | test_df["User_ID_MaxPrice"] = max_price_list 97 | test_df["User_ID_MeanPrice"] = mean_price_list 98 | test_df["User_ID_25PercPrice"] = twentyfive_price_list 99 | test_df["User_ID_75PercPrice"] = seventyfive_price_list 100 | #print np.unique(test_df["User_ID_MeanPrice"])[:10] 101 | 102 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_ID") 103 | train_df["Product_ID_MinPrice"] = min_price_list 104 | train_df["Product_ID_MaxPrice"] = max_price_list 105 | train_df["Product_ID_MeanPrice"] = mean_price_list 106 | train_df["Product_ID_25PercPrice"] = twentyfive_price_list 107 | train_df["Product_ID_75PercPrice"] = seventyfive_price_list 108 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_ID") 109 | test_df["Product_ID_MinPrice"] = min_price_list 110 | test_df["Product_ID_MaxPrice"] = max_price_list 111 | test_df["Product_ID_MeanPrice"] = mean_price_list 112 | test_df["Product_ID_25PercPrice"] = twentyfive_price_list 113 | test_df["Product_ID_75PercPrice"] = seventyfive_price_list 114 | #print np.unique(test_df["Product_ID_MeanPrice"])[:10] 115 | 116 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_1") 117 | train_df["Product_Cat1_MinPrice"] = min_price_list 118 | train_df["Product_Cat1_MaxPrice"] = max_price_list 119 | train_df["Product_Cat1_MeanPrice"] = mean_price_list 120 | train_df["Product_Cat1_25PercPrice"] = twentyfive_price_list 121 | train_df["Product_Cat1_75PercPrice"] = seventyfive_price_list 122 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_1") 123 | test_df["Product_Cat1_MinPrice"] = min_price_list 124 | test_df["Product_Cat1_MaxPrice"] = max_price_list 125 | test_df["Product_Cat1_MeanPrice"] = mean_price_list 126 | test_df["Product_Cat1_25PercPrice"] = twentyfive_price_list 127 | test_df["Product_Cat1_75PercPrice"] = 
seventyfive_price_list 128 | print np.unique(test_df["Product_Cat1_MeanPrice"])[:10] 129 | 130 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_2") 131 | train_df["Product_Cat2_MinPrice"] = min_price_list 132 | train_df["Product_Cat2_MaxPrice"] = max_price_list 133 | train_df["Product_Cat2_MeanPrice"] = mean_price_list 134 | train_df["Product_Cat2_25PercPrice"] = twentyfive_price_list 135 | train_df["Product_Cat2_75PercPrice"] = seventyfive_price_list 136 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_2") 137 | test_df["Product_Cat2_MinPrice"] = min_price_list 138 | test_df["Product_Cat2_MaxPrice"] = max_price_list 139 | test_df["Product_Cat2_MeanPrice"] = mean_price_list 140 | test_df["Product_Cat2_25PercPrice"] = twentyfive_price_list 141 | test_df["Product_Cat2_75PercPrice"] = seventyfive_price_list 142 | print np.unique(test_df["Product_Cat2_MeanPrice"])[:10] 143 | 144 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_3") 145 | train_df["Product_Cat3_MinPrice"] = min_price_list 146 | train_df["Product_Cat3_MaxPrice"] = max_price_list 147 | train_df["Product_Cat3_MeanPrice"] = mean_price_list 148 | train_df["Product_Cat3_25PercPrice"] = twentyfive_price_list 149 | train_df["Product_Cat3_75PercPrice"] = seventyfive_price_list 150 | min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_3") 151 | test_df["Product_Cat3_MinPrice"] = min_price_list 152 | test_df["Product_Cat3_MaxPrice"] = max_price_list 153 | test_df["Product_Cat3_MeanPrice"] = mean_price_list 154 | test_df["Product_Cat3_25PercPrice"] = twentyfive_price_list 155 | test_df["Product_Cat3_75PercPrice"] = seventyfive_price_list 156 | print np.unique(test_df["Product_Cat3_MeanPrice"])[:10] 157 | 158 | 159 | 160 | train_y = np.array(train_df["Purchase"]) 161 | test_user_id = np.array(test_df["User_ID"]) 162 | test_product_id = np.array(test_df["Product_ID"]) 163 | 164 | train_df.drop(["Purchase"], axis=1, inplace=True) 165 | 166 | cat_columns_list = ["User_ID", "Product_ID"] 167 | for var in cat_columns_list: 168 | lb = LabelEncoder() 169 | full_var_data = pd.concat((train_df[var],test_df[var]),axis=0).astype('str') 170 | temp = lb.fit_transform(np.array(full_var_data)) 171 | train_df[var] = lb.transform(np.array( train_df[var] ).astype('str')) 172 | test_df[var] = lb.transform(np.array( test_df[var] ).astype('str')) 173 | 174 | train_X = np.array(train_df).astype('float') 175 | test_X = np.array(test_df).astype('float') 176 | print train_X.shape, test_X.shape 177 | 178 | print "Running model.." 
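# --- Editor's aside (illustration only, not part of the original script) ---
# getPurchaseVar() above derives per-group purchase statistics (min, max,
# mean, 25th and 75th percentile) with plain dicts and iterrows(). The same
# idea can be expressed more compactly with a pandas groupby + merge; a
# minimal, self-contained sketch on a hypothetical toy frame:
toy = pd.DataFrame({"User_ID": [1, 1, 2, 2, 2],
                    "Purchase": [100., 300., 50., 70., 90.]})
grp = toy.groupby("User_ID")["Purchase"]
stats = pd.DataFrame({"User_ID_MinPrice": grp.min(),
                      "User_ID_MaxPrice": grp.max(),
                      "User_ID_MeanPrice": grp.mean(),
                      "User_ID_25PercPrice": grp.quantile(0.25),
                      "User_ID_75PercPrice": grp.quantile(0.75)}).reset_index()
toy = toy.merge(stats, on="User_ID", how="left")  # one row per original record
# ---------------------------------------------------------------------------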
179 | pred_test_y = runXGB(train_X, train_y, test_X) 180 | pred_test_y[pred_test_y<0] = 1 181 | 182 | out_df = pd.DataFrame({"User_ID":test_user_id}) 183 | out_df["Product_ID"] = test_product_id 184 | out_df["Purchase"] = pred_test_y 185 | out_df.to_csv("sub20.csv", index=False) 186 | -------------------------------------------------------------------------------- /AV_BlackFridayHack/prepData.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import pandas as pd 3 | import numpy as np 4 | from sklearn.cross_validation import KFold 5 | from sklearn import ensemble 6 | from sklearn import metrics 7 | from sklearn.preprocessing import LabelEncoder 8 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/") 9 | import xgboost as xgb 10 | 11 | gender_dict = {'F':0, 'M':1} 12 | age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6} 13 | city_dict = {'A':0, 'B':1, 'C':2} 14 | stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4} 15 | 16 | def getCountVar(compute_df, count_df, var_name): 17 | grouped_df = count_df.groupby(var_name) 18 | count_dict = {} 19 | for name, group in grouped_df: 20 | count_dict[name] = group.shape[0] 21 | 22 | count_list = [] 23 | for index, row in compute_df.iterrows(): 24 | name = row[var_name] 25 | count_list.append(count_dict.get(name, 0)) 26 | return count_list 27 | 28 | if __name__ == "__main__": 29 | data_path = "../Data/" 30 | train_file = data_path + "train.csv" 31 | test_file = data_path + "test.csv" 32 | 33 | train_df = pd.read_csv(train_file) 34 | test_df = pd.read_csv(test_file) 35 | print train_df.shape, test_df.shape 36 | 37 | train_df["Gender"] = train_df["Gender"].apply(lambda x: gender_dict[x]) 38 | test_df["Gender"] = test_df["Gender"].apply(lambda x: gender_dict[x]) 39 | 40 | train_df["Age"] = train_df["Age"].apply(lambda x: age_dict[x]) 41 | test_df["Age"] = test_df["Age"].apply(lambda x: age_dict[x]) 42 | 43 | train_df["City_Category"] = train_df["City_Category"].apply(lambda x: city_dict[x]) 44 | test_df["City_Category"] = test_df["City_Category"].apply(lambda x: city_dict[x]) 45 | 46 | train_df["Stay_In_Current_City_Years"] = train_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x]) 47 | test_df["Stay_In_Current_City_Years"] = test_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x]) 48 | 49 | 50 | print "Getting count features.." 
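# --- Editor's aside (illustration only, not part of the original script) ---
# getCountVar() above builds frequency counts with a dict and iterrows().
# A vectorised sketch of the same count encoding, on a hypothetical toy
# frame, using value_counts()/map():
toy_train = pd.DataFrame({"Age": [0, 0, 1, 2, 2, 2]})
toy_test = pd.DataFrame({"Age": [1, 3]})
age_counts = toy_train["Age"].value_counts()
toy_train["Age_Count"] = toy_train["Age"].map(age_counts)
toy_test["Age_Count"] = toy_test["Age"].map(age_counts).fillna(0)  # unseen level -> 0, like count_dict.get(name, 0)
# ---------------------------------------------------------------------------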
51 | train_df["Age_Count"] = getCountVar(train_df, train_df, "Age") 52 | test_df["Age_Count"] = getCountVar(test_df, train_df, "Age") 53 | print "Age", np.unique(test_df["Age_Count"]) 54 | 55 | train_df["Occupation_Count"] = getCountVar(train_df, train_df, "Occupation") 56 | test_df["Occupation_Count"] = getCountVar(test_df, train_df, "Occupation") 57 | print "Occupation", np.unique(test_df["Occupation_Count"]) 58 | 59 | train_df["Product_Category_1_Count"] = getCountVar(train_df, train_df, "Product_Category_1") 60 | test_df["Product_Category_1_Count"] = getCountVar(test_df, train_df, "Product_Category_1") 61 | print "Cat 1 ",np.unique(test_df["Product_Category_1_Count"]) 62 | 63 | train_df["Product_Category_2_Count"] = getCountVar(train_df, train_df, "Product_Category_2") 64 | test_df["Product_Category_2_Count"] = getCountVar(test_df, train_df, "Product_Category_2") 65 | print "Cat 2 ", np.unique(test_df["Product_Category_2_Count"]) 66 | 67 | train_df["Product_Category_3_Count"] = getCountVar(train_df, train_df, "Product_Category_3") 68 | test_df["Product_Category_3_Count"] = getCountVar(test_df, train_df, "Product_Category_3") 69 | print "Cat 3 ", np.unique(test_df["Product_Category_3_Count"]) 70 | 71 | train_df["User_ID_Count"] = getCountVar(train_df, train_df, "User_ID") 72 | test_df["User_ID_Count"] = getCountVar(test_df, train_df, "User_ID") 73 | print "User id ", np.unique(test_df["User_ID_Count"])[:10] 74 | 75 | train_df["Product_ID_Count"] = getCountVar(train_df, train_df, "Product_ID") 76 | test_df["Product_ID_Count"] = getCountVar(test_df, train_df, "Product_ID") 77 | print "Product id ", np.unique(test_df["Product_ID_Count"])[:10] 78 | 79 | train_df.fillna(-999, inplace=True) 80 | test_df.fillna(-999, inplace=True) 81 | 82 | train_df.to_csv(data_path+"train_mod.csv", index=False) 83 | test_df.to_csv(data_path+"test_mod.csv", index=False) 84 | 85 | -------------------------------------------------------------------------------- /AV_BlackFridayHack/readme.md: -------------------------------------------------------------------------------- 1 | #####Codes for Black Friday Hack##### 2 | 3 | prepData.py - Creates count based variables and store it as new csv 4 | 5 | finalModel.py - Creates the final model after creating few DV based variables 6 | 7 | -------------------------------------------------------------------------------- /AV_ChurnPrediction_Nov2017/buildModel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import operator 4 | import pandas as pd 5 | import numpy as np 6 | import xgboost as xgb 7 | import lightgbm as lgb 8 | from sklearn import preprocessing, metrics, ensemble, neighbors, linear_model, tree, model_selection 9 | from sklearn.model_selection import KFold, StratifiedKFold 10 | from sklearn import manifold, decomposition 11 | from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection 12 | 13 | def create_feature_map(features): 14 | outfile = open('xgb.fmap', 'w') 15 | for i, feat in enumerate(features): 16 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 17 | outfile.close() 18 | 19 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05): 20 | params = {} 21 | params["objective"] = "binary:logistic" 22 | params['eval_metric'] = 'auc' 23 | params["eta"] = eta 24 | params["subsample"] = 0.7 25 | params["min_child_weight"] = 1 26 | params["colsample_bytree"] = 0.7 27 | params["max_depth"] = dep 28 
| 29 | params["silent"] = 1 30 | params["seed"] = seed_val 31 | #params["max_delta_step"] = 2 32 | #params["gamma"] = 0.5 33 | num_rounds = rounds 34 | 35 | plst = list(params.items()) 36 | xgtrain = xgb.DMatrix(train_X, label=train_y) 37 | 38 | if test_y is not None: 39 | xgtest = xgb.DMatrix(test_X, label=test_y) 40 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 41 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20) 42 | else: 43 | xgtest = xgb.DMatrix(test_X) 44 | model = xgb.train(plst, xgtrain, num_rounds) 45 | 46 | if feature_names is not None: 47 | create_feature_map(feature_names) 48 | model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True) 49 | importance = model.get_fscore(fmap='xgb.fmap') 50 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 51 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 52 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 53 | imp_df.to_csv("imp_feat.txt", index=False) 54 | 55 | pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit) 56 | pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit) 57 | 58 | loss = 0 59 | if test_y is not None: 60 | loss = metrics.roc_auc_score(test_y, pred_test_y) 61 | return pred_test_y, loss, pred_test_y2 62 | else: 63 | return pred_test_y, loss, pred_test_y2 64 | 65 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05): 66 | params = {} 67 | params["objective"] = "binary" 68 | params['metric'] = 'auc' 69 | params["max_depth"] = dep 70 | params["min_data_in_leaf"] = 20 71 | params["learning_rate"] = eta 72 | params["bagging_fraction"] = 0.7 73 | params["feature_fraction"] = 0.7 74 | params["bagging_freq"] = 5 75 | params["bagging_seed"] = seed_val 76 | params["verbosity"] = 0 77 | num_rounds = rounds 78 | 79 | plst = list(params.items()) 80 | lgtrain = lgb.Dataset(train_X, label=train_y) 81 | 82 | if test_y is not None: 83 | lgtest = lgb.Dataset(test_X, label=test_y) 84 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 85 | else: 86 | lgtest = lgb.DMatrix(test_X) 87 | model = lgb.train(params, lgtrain, num_rounds) 88 | 89 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 90 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 91 | 92 | loss = 0 93 | if test_y is not None: 94 | loss = metrics.roc_auc_score(test_y, pred_test_y) 95 | print loss 96 | return pred_test_y, loss, pred_test_y2 97 | else: 98 | return pred_test_y, loss, pred_test_y2 99 | 100 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=20, leaf=10, feat=0.2): 101 | model = ensemble.ExtraTreesClassifier( 102 | n_estimators = 100, 103 | max_depth = depth, 104 | min_samples_split = 2, 105 | min_samples_leaf = leaf, 106 | max_features = feat, 107 | n_jobs = 6, 108 | random_state = 0) 109 | model.fit(train_X, train_y) 110 | train_preds = model.predict_proba(train_X)[:,1] 111 | test_preds = model.predict_proba(test_X)[:,1] 112 | test_preds2 = model.predict_proba(test_X2)[:,1] 113 | test_loss = 0 114 | if test_y is not None: 115 | train_loss = metrics.roc_auc_score(train_y, train_preds) 116 | test_loss = metrics.roc_auc_score(test_y, test_preds) 117 | print "Depth, leaf, feat : ", depth, leaf, feat 118 | print "Train and Test loss : ", train_loss, test_loss 119 | return test_preds, test_loss, 
test_preds2 120 | 121 | if __name__ == "__main__": 122 | #model_name = "ET" 123 | for model_name in ["LGB1", "XGB1"]: 124 | data_path = "../input/" 125 | train_df = pd.read_csv(data_path + "train.csv") 126 | test_df = pd.read_csv(data_path + "test.csv") 127 | 128 | # process columns, apply LabelEncoder to categorical features 129 | for c in train_df.columns: 130 | if train_df[c].dtype == 'object' and c not in ["Responders", "UCIC_ID"]: 131 | lbl = preprocessing.LabelEncoder() 132 | lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 133 | train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 134 | test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 135 | 136 | train_df.fillna(-99, inplace=True) 137 | test_df.fillna(-99, inplace=True) 138 | 139 | ################### Feature Engineeering ############################### 140 | f1_f2_list = [["D_prev1", "D_prev2"], ["D_prev2", "D_prev3"], ["D_prev3", "D_prev4"], ["D_prev4", "D_prev5"], ["D_prev5", "D_prev6"], 141 | ["CR_AMB_Prev1", "CR_AMB_Prev3"], ["CR_AMB_Prev1", "CR_AMB_Prev4"], ["CR_AMB_Prev1", "CR_AMB_Prev5"], ["CR_AMB_Prev1", "CR_AMB_Prev6"], 142 | ["EOP_prev1", "CR_AMB_Prev1"], ["EOP_prev2", "CR_AMB_Prev2"], ["EOP_prev3", "CR_AMB_Prev3"], ["EOP_prev4", "CR_AMB_Prev4"], ["EOP_prev5", "CR_AMB_Prev5"], ["EOP_prev6", "CR_AMB_Prev6"], 143 | ["EOP_prev1", "EOP_prev2"], ["EOP_prev2", "EOP_prev3"], ["EOP_prev3", "EOP_prev4"], ["EOP_prev4", "EOP_prev5"], ["EOP_prev5", "EOP_prev6"], 144 | ["CR_AMB_Prev2", "CR_AMB_Prev4"], ["CR_AMB_Prev2", "CR_AMB_Prev5"], ["CR_AMB_Prev2", "CR_AMB_Prev6"], 145 | ["EOP_prev1", "CR_AMB_Prev2"], ["EOP_prev1", "CR_AMB_Prev3"], ["EOP_prev1", "CR_AMB_Prev4"], ["EOP_prev1", "CR_AMB_Prev5"], ["EOP_prev1", "CR_AMB_Prev6"], 146 | ["CR_AMB_Drop_Build_1", "CR_AMB_Drop_Build_2"], ["CR_AMB_Drop_Build_2", "CR_AMB_Drop_Build_3"], ["CR_AMB_Drop_Build_3", "CR_AMB_Drop_Build_4"], 147 | ["BAL_prev1", "BAL_prev2"], ["BAL_prev2", "BAL_prev3"], ["BAL_prev3", "BAL_prev4"], 148 | ["BAL_prev1", "CR_AMB_Prev1"], ["BAL_prev2", "CR_AMB_Prev2"], ["BAL_prev3", "CR_AMB_Prev3"], 149 | ["I_AQB_PrevQ1", "I_AQB_PrevQ2"], ["I_NRV_PrevQ1", "I_NRV_PrevQ2"], 150 | ["D_prev1", "D_prev3"], ["D_prev1", "D_prev4"], ["D_prev1", "D_prev6"], 151 | 152 | ] 153 | for f1, f2 in f1_f2_list: 154 | train_df["Ratio_"+f1+"_"+f2] = train_df[f1].astype('float') / np.maximum(train_df[f2],1.) 155 | test_df["Ratio_"+f1+"_"+f2] = test_df[f1].astype('float') / np.maximum(test_df[f2],1.) 156 | 157 | 158 | print "Preparing response variable.." 159 | cols_to_leave = ["Responders", "UCIC_ID"] 160 | cols_to_use = [col for col in train_df.columns if col not in cols_to_leave] 161 | train_X = train_df[cols_to_use] 162 | test_X = test_df[cols_to_use] 163 | train_y = (train_df["Responders"]).values 164 | train_id = train_df["UCIC_ID"].values 165 | test_id = test_df["UCIC_ID"].values 166 | 167 | print "Model building.." 
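# --- Editor's aside (hypothetical, not part of the original script) --------
# The loop below writes out-of-fold validation predictions to
# ./meta_models/val and fold-averaged test predictions to ./meta_models/test.
# Such files are the usual inputs to a second-level (stacking) model, which
# is not included in this folder. One way that step could look, run only
# after the csv files below exist (sketch, assumptions flagged inline):
#
#   import glob
#   meta_val_files = sorted(glob.glob("./meta_models/val/pred_val_*.csv"))
#   meta_test_files = sorted(glob.glob("./meta_models/test/pred_test_*.csv"))
#   meta_train = pd.concat([pd.read_csv(f)["Responders"] for f in meta_val_files], axis=1).values
#   meta_test = pd.concat([pd.read_csv(f)["Responders"] for f in meta_test_files], axis=1).values
#   stacker = linear_model.LogisticRegression()  # linear_model is imported at the top of this script
#   stacker.fit(meta_train, train_y)             # train_y: the Responders labels defined above
#   stacked_pred = stacker.predict_proba(meta_test)[:, 1]
# ---------------------------------------------------------------------------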
168 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018) 169 | cv_scores = [] 170 | pred_test_full = 0 171 | pred_val_full = np.zeros(train_X.shape[0]) 172 | for dev_index, val_index in kf.split(train_X): 173 | dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:] 174 | dev_y, val_y = train_y[dev_index], train_y[val_index] 175 | 176 | if model_name == "XGB1": 177 | pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=8, feature_names=dev_X.columns.tolist()) 178 | elif model_name == "LGB1": 179 | pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=8) 180 | pred_val_full[val_index] = pred_val 181 | pred_test_full = pred_test_full + pred_test 182 | cv_scores.append(loss) 183 | print cv_scores 184 | pred_test_full /= 5. 185 | print metrics.roc_auc_score(train_y, pred_val_full) 186 | 187 | out_df = pd.DataFrame({"UCIC_ID":test_id}) 188 | out_df["Responders"] = pred_test_full 189 | out_df.to_csv("./meta_models/test/pred_test_v5_"+model_name+".csv", index=False) 190 | 191 | out_df = pd.DataFrame({"UCIC_ID":train_id}) 192 | out_df["Responders"] = pred_val_full 193 | out_df.to_csv("./meta_models/val/pred_val_v5_"+model_name+".csv", index=False) 194 | -------------------------------------------------------------------------------- /AV_ChurnPrediction_Nov2017/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the Analytics Vidhya Hackathon - [Churn Prediction](https://datahack.analyticsvidhya.com/contest/data-science-hackathon-churn-prediction/) 2 | -------------------------------------------------------------------------------- /AV_ClubMahindra_May2019/README.md: -------------------------------------------------------------------------------- 1 | Codes for the [Analytics Vidhya Hackathon - Club Mahindra DataOlympics](https://datahack.analyticsvidhya.com/contest/club-mahindra-dataolympics/) 2 | 3 | Finished [4th](https://datahack.analyticsvidhya.com/contest/club-mahindra-dataolympics/pvt_lb) on this competition 4 | -------------------------------------------------------------------------------- /AV_DHS_2017/Exploratory_Data_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Exploration\n", 8 | "\n", 9 | "First, let us start with something numeric. We shall look into this [Kaggle competition - Zillow Prize Estimate](https://www.kaggle.com/c/zillow-prize-1)\n", 10 | "\n", 11 | "1. [Python EDA Notebook](https://www.kaggle.com/c/zillow-prize-1)\n", 12 | "2. [R EDA Notebook](https://www.kaggle.com/philippsp/exploratory-analysis-zillow) by Philipp\n", 13 | "\n", 14 | "\n", 15 | "\n", 16 | "\n", 17 | "We will look at the Data Exploration scripts of [Kaggle Competition - Spooky Author Identification](https://www.kaggle.com/c/spooky-author-identification)\n", 18 | "\n", 19 | "1. [Python EDA Notebook](https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial) by Anisotropic\n", 20 | "2. 
[R EDA Notebook](https://www.kaggle.com/headsortails/treemap-house-of-horror-spooky-eda-lda-features) by Heads or Tails" 21 | ] 22 | } 23 | ], 24 | "metadata": { 25 | "kernelspec": { 26 | "display_name": "Python 2", 27 | "language": "python", 28 | "name": "python2" 29 | }, 30 | "language_info": { 31 | "codemirror_mode": { 32 | "name": "ipython", 33 | "version": 2 34 | }, 35 | "file_extension": ".py", 36 | "mimetype": "text/x-python", 37 | "name": "python", 38 | "nbconvert_exporter": "python", 39 | "pygments_lexer": "ipython2", 40 | "version": "2.7.10" 41 | } 42 | }, 43 | "nbformat": 4, 44 | "nbformat_minor": 2 45 | } 46 | -------------------------------------------------------------------------------- /AV_DHS_2017/Feature_Engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Feature Engineering\n", 8 | "\n", 9 | "Some codes related to feature engineering can be seen in this notebook\n", 10 | "\n", 11 | "### Count Encoding" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import numpy as np\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "def getCountVar(compute_df, count_df, var_name, count_var=\"v1\"):\n", 26 | " \"\"\"\n", 27 | " compute_df : Data frame for which the count encoding should be done\n", 28 | " count_df : Data frame from which the counts should be taken\n", 29 | " var_name : categorical variable for count encoding\n", 30 | " count_var : some other variable from the dataset (used as dummy variable to get count)\n", 31 | " \"\"\"\n", 32 | " grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')\n", 33 | " grouped_df.columns = [var_name, \"var_count\"]\n", 34 | " merged_df = pd.merge(compute_df, grouped_df, how=\"left\", on=var_name)\n", 35 | " merged_df.fillna(-1, inplace=True)\n", 36 | " return list(merged_df[\"var_count\"])" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Target Encoding" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 4, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "from sklearn import model_selection\n", 55 | "\n", 56 | "def getDVEncodeVar(compute_df, target_df, var_name, target_var=\"RESPONDERS\", min_cutoff=1):\n", 57 | " if type(var_name) != type([]):\n", 58 | " var_name = [var_name]\n", 59 | " grouped_df = target_df.groupby(var_name)[target_var].agg([\"mean\"]).reset_index()\n", 60 | " grouped_df.columns = var_name + [\"mean_value\"]\n", 61 | " merged_df = pd.merge(compute_df, grouped_df, how=\"left\", on=var_name)\n", 62 | " merged_df.fillna(-1, inplace=True)\n", 63 | " return list(merged_df[\"mean_value\"])\n", 64 | "\n", 65 | "\n", 66 | "def do_target_encode():\n", 67 | " kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)\n", 68 | " for col in [\"ZIP_CODE_FINAL\", \"DESIGNATION_FINAL\"]:\n", 69 | " train_enc_values = np.zeros(train_df.shape[0])\n", 70 | " test_enc_values = 0\n", 71 | " for dev_index, val_index in kf.split(train_df):\n", 72 | " new_train_df = train_df[[col, \"RESPONDERS\"]]\n", 73 | " dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]\n", 74 | " train_enc_values[val_index] = np.array( getDVEncodeVar(val_X[[col]], dev_X, col))\n", 75 | " test_enc_values += np.array( 
getDVEncodeVar(test_df[[col]], dev_X, col))\n", 76 | " test_enc_values /= 5.\n", 77 | " train_df[col + \"_enc\"] = train_enc_values\n", 78 | " test_df[col + \"_enc\"] = test_enc_values\n", 79 | " print train_df[col + \"_enc\"].describe()\n", 80 | " print test_df[col + \"_enc\"].describe()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "### Interaction features \n", 88 | "\n", 89 | "[XGBoost Feature Interactions and Importance](https://github.com/Far0n/xgbfi) by Faron" 90 | ] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 2", 96 | "language": "python", 97 | "name": "python2" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 2 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython2", 109 | "version": "2.7.10" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 2 114 | } 115 | -------------------------------------------------------------------------------- /AV_DHS_2017/Modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Modeling\n", 11 | "\n", 12 | "Here we will have some sample codes and links with respect to modeling section.\n", 13 | "\n", 14 | "\n", 15 | "## Modeling Bigger Datasets \n", 16 | "\n", 17 | "1. [FTRL Implementation](https://www.kaggle.com/jiweiliu/ftrl-starter-code/code)\n", 18 | "2. [LibFFM](https://github.com/guestwalk/libffm)\n", 19 | "3. [Voapal Wabbit](https://github.com/JohnLangford/vowpal_wabbit/wiki)\n", 20 | "4. [Incremental Learning](http://scikit-learn.org/stable/modules/scaling_strategies.html#incremental-learning)\n", 21 | "\n", 22 | "## Time Series Forecasting\n", 23 | "\n", 24 | "1. [R Tutorial](https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/)\n", 25 | "\n", 26 | "2. [Python Tutorial](https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/)\n", 27 | "\n", 28 | "\n", 29 | "## Bayesian Optimization\n", 30 | "\n", 31 | "Some python libraries are\n", 32 | "\n", 33 | "1. [Hyperopt](http://hyperopt.github.io/hyperopt/)\n", 34 | "\n", 35 | "2. [Spearmint](https://github.com/JasperSnoek/spearmint)\n", 36 | "\n", 37 | "3. 
[Bayesian Optimization](https://github.com/fmfn/BayesianOptimization) \n", 38 | "\n", 39 | "Example code can be seen in this [Kaggle Kernel](https://www.kaggle.com/dreeux/hyperparameter-tuning-using-hyperopt)\n", 40 | "\n", 41 | "### Random Forest ###" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 5, 47 | "metadata": { 48 | "collapsed": true, 49 | "deletable": true, 50 | "editable": true 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "def runRF(train_X, train_y, test_X, test_y=None, test_X2=None, depth=20, leaf=10, feat=0.2):\n", 55 | " model = ensemble.RandomForestClassifier(\n", 56 | " n_estimators = 1000,\n", 57 | " max_depth = depth,\n", 58 | " min_samples_split = 2,\n", 59 | " min_samples_leaf = leaf,\n", 60 | " max_features = feat,\n", 61 | " n_jobs = 4,\n", 62 | " random_state = 0)\n", 63 | " model.fit(train_X, train_y)\n", 64 | " train_preds = model.predict_proba(train_X)[:,1]\n", 65 | " test_preds = model.predict_proba(test_X)[:,1]\n", 66 | " test_preds2 = model.predict_proba(test_X2)[:,1]\n", 67 | " test_loss = 0\n", 68 | " \n", 69 | " train_loss = metrics.log_loss(train_y, train_preds)\n", 70 | " test_loss = metrics.log_loss(test_y, test_preds)\n", 71 | " print \"Train and Test loss : \", train_loss, test_loss\n", 72 | " return test_preds, test_loss, test_preds2" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "deletable": true, 79 | "editable": true 80 | }, 81 | "source": [ 82 | "### XGBoost / Light GBM" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": { 89 | "collapsed": true, 90 | "deletable": true, 91 | "editable": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, rounds=500, dep=8, eta=0.05):\n", 96 | " params = {}\n", 97 | " params[\"objective\"] = \"binary:logistic\"\n", 98 | " params['eval_metric'] = 'auc'\n", 99 | " params[\"eta\"] = eta\n", 100 | " params[\"subsample\"] = 0.7\n", 101 | " params[\"min_child_weight\"] = 1\n", 102 | " params[\"colsample_bytree\"] = 0.7\n", 103 | " params[\"max_depth\"] = dep\n", 104 | " params[\"silent\"] = 1\n", 105 | " params[\"seed\"] = seed_val\n", 106 | " #params[\"max_delta_step\"] = 2\n", 107 | " #params[\"gamma\"] = 0.5\n", 108 | " num_rounds = rounds\n", 109 | "\n", 110 | " plst = list(params.items())\n", 111 | " xgtrain = xgb.DMatrix(train_X, label=train_y)\n", 112 | "\n", 113 | " xgtest = xgb.DMatrix(test_X, label=test_y)\n", 114 | " watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]\n", 115 | " model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)\n", 116 | "\n", 117 | "\n", 118 | " pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)\n", 119 | " pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)\n", 120 | " \n", 121 | " loss = metrics.roc_auc_score(test_y, pred_test_y)\n", 122 | " return pred_test_y, loss, pred_test_y2" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": { 128 | "deletable": true, 129 | "editable": true 130 | }, 131 | "source": [ 132 | "### Neural Networks / Deep Learning" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 4, 138 | "metadata": { 139 | "collapsed": true, 140 | "deletable": true, 141 | "editable": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "def runNN(train_X, train_y, test_X, test_y=None, test_X2=None, epochs=100, scale=False):\n", 146 | " if scale:\n", 
147 | " sc = preprocessing.StandardScaler()\n", 148 | " all_X = pd.concat([train_X, test_X, test_X2], axis=0)\n", 149 | " sc.fit(all_X)\n", 150 | " train_X = sc.transform(train_X)\n", 151 | " test_X = sc.transform(test_X)\n", 152 | " test_X2 = sc.transform(test_X2)\n", 153 | "\n", 154 | " random.seed(12345)\n", 155 | " np.random.seed(12345)\n", 156 | " model = Sequential()\n", 157 | " model.add(Dense(200, input_shape=(train_X.shape[1],), init='he_uniform')) #, W_regularizer=regularizers.l1(0.002)))\n", 158 | " model.add(Activation('relu'))\n", 159 | " model.add(Dropout(0.3))\n", 160 | "\n", 161 | " #model.add(Dense(50, init='he_uniform'))\n", 162 | " #model.add(Activation('relu'))\n", 163 | " #model.add(Dropout(0.3))\n", 164 | "\n", 165 | " #model.add(Dense(100, init='he_uniform'))\n", 166 | " #model.add(Activation('relu'))\n", 167 | " #model.add(Dropout(0.3))\n", 168 | "\n", 169 | " model.add(Dense(1, init='he_uniform'))\n", 170 | " model.add(Activation('sigmoid'))\n", 171 | " model.compile(loss='binary_crossentropy', optimizer='adagrad')\n", 172 | " \n", 173 | " ### Model fitting takes place ###\n", 174 | " model.fit(train_X, train_y, batch_size=512, nb_epoch=epochs, validation_data=(test_X, test_y), verbose=2, shuffle=True)\n", 175 | " \n", 176 | " preds = model.predict(test_X, verbose=0)\n", 177 | " preds_test2 = model.predict(test_X2, verbose=0)\n", 178 | " loss = metrics.log_loss(test_y, preds)\n", 179 | " return preds.ravel(), loss, preds_test2.ravel()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "deletable": true, 186 | "editable": true 187 | }, 188 | "source": [ 189 | "## Ensembling\n", 190 | "\n", 191 | "Codes for basic ensembling methods can be seen in this [github link by MLWave](https://github.com/MLWave/Kaggle-Ensemble-Guide)\n", 192 | "\n", 193 | "## Stacking \n", 194 | "\n", 195 | "1. [StackNet](https://github.com/kaz-Anova/StackNet) by Marios KazAnova\n", 196 | "2. [Stacked Ensembles](https://h2o-release.s3.amazonaws.com/h2o/rel-ueno/2/docs-website/h2o-docs/data-science/stacked-ensembles.html) by H2O" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 2", 212 | "language": "python", 213 | "name": "python2" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 2 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython2", 225 | "version": "2.7.10" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /AV_DHS_2017/readme.md: -------------------------------------------------------------------------------- 1 | Codes and materials related to [Analytics Vidhya Datahack Summit workshop 2017](https://www.analyticsvidhya.com/datahacksummit/workshops/the-masterclass-how-to-win-data-science-challenges/) can be seen in this folder 2 | -------------------------------------------------------------------------------- /AV_DHS_2018/readme.md: -------------------------------------------------------------------------------- 1 | Codes for DHS 2018 is present here. 
2 | -------------------------------------------------------------------------------- /AV_Genpact_2018/final_model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn import preprocessing, model_selection, metrics, ensemble 5 | import lightgbm as lgb 6 | 7 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=10, seed=0, rounds=20000): 8 | params = {} 9 | params["objective"] = "regression" 10 | params['metric'] = 'rmse' 11 | params["max_depth"] = dep 12 | params["min_data_in_leaf"] = 100 13 | params["learning_rate"] = 0.04 14 | params["bagging_fraction"] = 0.7 15 | params["feature_fraction"] = 0.5 16 | params["bagging_freq"] = 5 17 | params["bagging_seed"] = seed 18 | #params["lambda_l2"] = 0.01 19 | params["verbosity"] = -1 20 | num_rounds = rounds 21 | 22 | plst = list(params.items()) 23 | lgtrain = lgb.Dataset(train_X, label=train_y) 24 | 25 | if test_y is not None: 26 | lgtest = lgb.Dataset(test_X, label=test_y) 27 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100) 28 | else: 29 | lgtest = lgb.Dataset(test_X) 30 | model = lgb.train(params, lgtrain, num_rounds) 31 | 32 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 33 | if test_X2 is not None: 34 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 35 | imps = model.feature_importance() 36 | names = model.feature_name() 37 | for fi, fn in enumerate(names): 38 | print(fn, imps[fi]) 39 | 40 | loss = 0 41 | if test_y is not None: 42 | loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y)) 43 | print(loss) 44 | return pred_test_y, loss, pred_test_y2, model.best_iteration 45 | else: 46 | return pred_test_y 47 | 48 | 49 | def run_model(week_num): 50 | print("WEEK NUMBER IS : ", week_num) 51 | week_shift_map = { 52 | 146 : ["target_shift1", "target_shift2", "target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"], 53 | 147 : ["target_shift2", "target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"], 54 | 148 : ["target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"], 55 | 149 : ["target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"], 56 | 150 : ["target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"], 57 | 151 : ["target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"], 58 | 152 : ["target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"], 59 | 153 : ["target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"], 60 | 154 : ["target_shift9", "target_shift10", "target_shift11", "target_shift13"], 61 | 155 : ["target_shift10", "target_shift11", "target_shift13"] 62 | } 63 | 64 | train_df = pd.read_csv("../input/train.csv") 65 | test_df = 
pd.read_csv("../input/test_QoiMO9B.csv") 66 | center_df = pd.read_csv("../input/fulfilment_center_info.csv") 67 | meal_df = pd.read_csv("../input/meal_info.csv") 68 | 69 | train_df = pd.merge(train_df, center_df, on="center_id", how="left") 70 | test_df = pd.merge(test_df, center_df, on="center_id", how="left") 71 | train_df = pd.merge(train_df, meal_df, on="meal_id", how="left") 72 | test_df = pd.merge(test_df, meal_df, on="meal_id", how="left") 73 | 74 | cat_cols = ["center_type", "category", "cuisine"] 75 | for c in cat_cols: 76 | lbl = preprocessing.LabelEncoder() 77 | lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 78 | train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 79 | test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 80 | 81 | train_df["discount_ratio"] = train_df["base_price"] / train_df["checkout_price"] 82 | test_df["discount_ratio"] = test_df["base_price"] / test_df["checkout_price"] 83 | 84 | train_df["train_set"] = 1 85 | test_df["train_set"] = 0 86 | test_df["num_orders"] = -99 87 | 88 | print(train_df.shape) 89 | all_df = pd.concat([train_df, test_df]) 90 | all_df = all_df.sort_values(by=["center_id", "meal_id", "week"]).reset_index(drop=True) 91 | print(all_df.shape) 92 | all_df["target_shift1"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(1) 93 | all_df["target_shift2"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(2) 94 | all_df["target_shift3"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(3) 95 | all_df["target_shift4"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(4) 96 | all_df["target_shift5"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(5) 97 | all_df["target_shift6"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(6) 98 | all_df["target_shift7"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(7) 99 | all_df["target_shift8"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(8) 100 | all_df["target_shift9"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(9) 101 | all_df["target_shift10"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(10) 102 | all_df["target_shift11"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(11) 103 | all_df["target_shift12"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(12) 104 | all_df["target_shift13"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(13) 105 | 106 | all_df["discount_shift1"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(1) 107 | all_df["discount_shift2"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(2) 108 | all_df["discount_shift3"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(3) 109 | 110 | #### center shift features ### 111 | #gdf = all_df.groupby(["center_id", "category", "week"])["target_shift11"].agg(['sum']).reset_index() 112 | #gdf.columns = ["center_id", "category", "week", "center_week_orders11"] 113 | #all_df = all_df.merge(gdf, on=["center_id", "category", "week"], how="left") 114 | gdf = all_df.groupby(["category"])["id"].agg(['size']).reset_index() 115 | gdf.columns = ["category", "cat_count"] 116 | all_df = all_df.merge(gdf, on=["category"], how="left") 117 | 118 | gdf = all_df.groupby(["cuisine"])["id"].agg(['size']).reset_index() 119 | gdf.columns = ["cuisine", "cui_count"] 120 | all_df = all_df.merge(gdf, on=["cuisine"], how="left") 121 | 122 | gdf = 
all_df.groupby(["city_code"])["id"].agg(['size']).reset_index() 123 | gdf.columns = ["city_code", "city_count"] 124 | all_df = all_df.merge(gdf, on=["city_code"], how="left") 125 | 126 | gdf = all_df.groupby(["region_code"])["id"].agg(['size']).reset_index() 127 | gdf.columns = ["region_code", "region_count"] 128 | all_df = all_df.merge(gdf, on=["region_code"], how="left") 129 | 130 | #gdf = all_df.groupby(["city_code", "category"])["id"].agg(['size']).reset_index() 131 | #gdf.columns = ["city_code", "category", "city_cat_count"] 132 | #all_df = all_df.merge(gdf, on=["city_code", "category"], how="left") 133 | 134 | #gdf = all_df.groupby(["city_code", "cuisine"])["id"].agg(['size']).reset_index() 135 | #gdf.columns = ["city_code", "cuisine", "city_cui_count"] 136 | #all_df = all_df.merge(gdf, on=["city_code", "cuisine"], how="left") 137 | 138 | #gdf = all_df.groupby(["region_code", "category"])["id"].agg(['size']).reset_index() 139 | #gdf.columns = ["region_code", "category", "region_cat_count"] 140 | #all_df = all_df.merge(gdf, on=["region_code", "category"], how="left") 141 | 142 | #gdf = all_df.groupby(["region_code", "cuisine"])["id"].agg(['size']).reset_index() 143 | #gdf.columns = ["region_code", "cuisine", "region_cui_count"] 144 | #all_df = all_df.merge(gdf, on=["region_code", "cuisine"], how="left") 145 | 146 | ### Center count features ### 147 | gdf = all_df.groupby(["center_id", "week"])["id"].agg(['size']).reset_index() 148 | gdf.columns = ["center_id", "week", "center_week_count"] 149 | all_df = all_df.merge(gdf, on=["center_id", "week"], how="left") 150 | 151 | gdf = all_df.groupby(["center_id", "category"])["id"].count().reset_index() 152 | gdf.columns = ["center_id", "category", "center_cat_count"] 153 | all_df = all_df.merge(gdf, on=["center_id", "category"], how="left") 154 | 155 | gdf = all_df.groupby(["center_id", "category", "week"])["id"].count().reset_index() 156 | gdf.columns = ["center_id", "category", "week", "center_cat_week_count"] 157 | #gdf = gdf.sort_values(by=["center_id", "category", "week"]).reset_index(drop=True) 158 | #gdf["center_cat_week1_count"] = gdf.groupby(["center_id", "category", "week"])["center_cat_week_count"].shift(1) 159 | all_df = all_df.merge(gdf, on=["center_id", "category", "week"], how="left") 160 | 161 | gdf = all_df.groupby(["center_id", "cuisine"])["id"].count().reset_index() 162 | gdf.columns = ["center_id", "cuisine", "center_cui_count"] 163 | all_df = all_df.merge(gdf, on=["center_id", "cuisine"], how="left") 164 | 165 | 166 | ### Meal count features ### 167 | gdf = all_df.groupby(["meal_id"])["id"].count().reset_index() 168 | gdf.columns = ["meal_id", "meal_count"] 169 | all_df = all_df.merge(gdf, on=["meal_id"], how="left") 170 | 171 | gdf = all_df.groupby(["region_code", "meal_id"])["id"].count().reset_index() 172 | gdf.columns = ["region_code", "meal_id", "region_meal_count"] 173 | all_df = all_df.merge(gdf, on=["region_code", "meal_id"], how="left") 174 | 175 | gdf = all_df.groupby(["meal_id", "week"])["id"].count().reset_index() 176 | gdf.columns = ["meal_id", "week", "meal_week_count"] 177 | all_df = all_df.merge(gdf, on=["meal_id", "week"], how="left") 178 | 179 | gdf = all_df.groupby(["center_type", "meal_id", "week"])["id"].count().reset_index() 180 | gdf.columns = ["center_type", "meal_id", "week", "type_meal_week_count"] 181 | all_df = all_df.merge(gdf, on=["center_type", "meal_id", "week"], how="left") 182 | 183 | gdf = all_df.groupby(["region_code", "meal_id", "week"])["id"].count().reset_index() 184 | gdf.columns = 
["region_code", "meal_id", "week", "region_meal_week_count"] 185 | all_df = all_df.merge(gdf, on=["region_code", "meal_id", "week"], how="left") 186 | 187 | gdf = all_df.groupby(["city_code", "meal_id", "week"])["id"].count().reset_index() 188 | gdf.columns = ["city_code", "meal_id", "week", "city_meal_week_count"] 189 | all_df = all_df.merge(gdf, on=["city_code", "meal_id", "week"], how="left") 190 | 191 | ### Price rank ### 192 | all_df["meal_price_rank"] = all_df.groupby("meal_id")["checkout_price"].rank() 193 | all_df["meal_city_price_rank"] = all_df.groupby(["meal_id", "city_code"])["checkout_price"].rank() 194 | all_df["meal_region_price_rank"] = all_df.groupby(["meal_id", "region_code"])["checkout_price"].rank() 195 | all_df["meal_week_price_rank"] = all_df.groupby(["meal_id", "week"])["checkout_price"].rank() 196 | 197 | all_df["center_price_rank"] = all_df.groupby("center_id")["checkout_price"].rank() 198 | all_df["center_week_price_rank"] = all_df.groupby(["center_id", "week"])["checkout_price"].rank() 199 | all_df["center_cat_price_rank"] = all_df.groupby(["center_id", "category"])["checkout_price"].rank() 200 | 201 | ### Week features ### 202 | gdf = all_df.groupby(["meal_id"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index() 203 | gdf.columns = ["meal_id", "meal_price_min", "meal_price_max", "meal_price_mean", "meal_price_std"] 204 | all_df = all_df.merge(gdf, on=["meal_id"], how="left") 205 | 206 | gdf = all_df.groupby(["meal_id"])["base_price"].agg(["min", "max", "mean", "std"]).reset_index() 207 | gdf.columns = ["meal_id", "disc_price_min", "disc_price_max", "disc_price_mean", "disc_price_std"] 208 | all_df = all_df.merge(gdf, on=["meal_id"], how="left") 209 | 210 | gdf = all_df.groupby(["city_code","meal_id", "week"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index() 211 | gdf.columns = ["city_code", "meal_id", "week", "meal_price2_min", "meal_price2_max", "meal_price2_mean", "meal_price2_std"] 212 | all_df = all_df.merge(gdf, on=["city_code", "meal_id", "week"], how="left") 213 | 214 | gdf = all_df.groupby(["city_code", "category"])["checkout_price"].agg(["mean", "std"]).reset_index() 215 | gdf.columns = ["city_code", "category", "meal_price3_mean", "meal_price3_std"] 216 | all_df = all_df.merge(gdf, on=["city_code", "category"], how="left") 217 | 218 | #gdf = all_df.groupby(["region_code","meal_id", "week"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index() 219 | #gdf.columns = ["region_code", "meal_id", "week", "meal_price4_min", "meal_price4_max", "meal_price4_mean", "meal_price4_std"] 220 | #all_df = all_df.merge(gdf, on=["region_code", "meal_id", "week"], how="left") 221 | 222 | 223 | ### New ones ### 224 | #all_df["ratio1"] = all_df["target_shift10"] / all_df["op_area"] 225 | #all_df["ratio2"] = all_df["target_shift10"] / all_df["checkout_price"] 226 | 227 | ### overall mean sum ### 228 | #gdf = all_df.groupby(["meal_id", "week"])["target_shift10"].sum().reset_index() 229 | #gdf.columns = ["meal_id", "week", "city_meal_week_lag10"] 230 | #all_df = all_df.merge(gdf, on=["meal_id", "week"], how="left") 231 | 232 | train_df = all_df[all_df["train_set"]==1].reset_index(drop=True) 233 | test_df = all_df[all_df["train_set"]==0].reset_index(drop=True) 234 | test_df = test_df[test_df["week"] == week_num].reset_index(drop=True) 235 | 236 | dev_df = train_df[train_df["week"]<=135] 237 | #dev_df = dev_df[dev_df["week"]>20] 238 | val_df = train_df[train_df["week"]>135] 239 | train_y = np.log1p(train_df["num_orders"].values) 
240 | dev_y = np.log1p(dev_df["num_orders"].values) 241 | val_y = np.log1p(val_df["num_orders"].values) 242 | cols_to_use = ["center_id", "meal_id", "checkout_price", "base_price", "discount_ratio", "emailer_for_promotion", "homepage_featured"] 243 | cols_to_use += ["city_code","region_code","center_type","op_area"] 244 | cols_to_use += ["category", "cuisine"] 245 | cols_to_use += ["cat_count", "cui_count", "city_count", "region_count"] 246 | #cols_to_use += ["city_cat_count", "city_cui_count", "region_cat_count", "region_cui_count"] 247 | cols_to_use += ["center_cat_count", "center_cui_count", "center_week_count"] 248 | cols_to_use += ["meal_week_count", "type_meal_week_count", "region_meal_week_count", "city_meal_week_count", "meal_count", "region_meal_count"] 249 | cols_to_use += ["meal_price_rank", "meal_city_price_rank", "meal_region_price_rank", "meal_week_price_rank"] 250 | cols_to_use += ["center_price_rank", "center_cat_price_rank", "center_week_price_rank"] 251 | cols_to_use += ["meal_price_min", "meal_price_max", "meal_price_mean", "meal_price_std"] 252 | cols_to_use += ["disc_price_min", "disc_price_max", "disc_price_mean", "disc_price_std"] 253 | cols_to_use += ["meal_price2_min", "meal_price2_max", "meal_price2_mean", "meal_price2_std"] 254 | cols_to_use += ["meal_price3_mean", "meal_price3_std"] 255 | cols_to_use += week_shift_map[week_num] 256 | 257 | train_X = train_df[cols_to_use] 258 | dev_X = dev_df[cols_to_use] 259 | val_X = val_df[cols_to_use] 260 | test_X = test_df[cols_to_use] 261 | print(val_X.tail()) 262 | 263 | pred_val, loss, pred_test, nrounds = runLGB(dev_X, dev_y, val_X, val_y, test_X) 264 | pred_test1 = runLGB(train_X, train_y, test_X, rounds=nrounds) 265 | pred_test2 = runLGB(train_X, train_y, test_X, rounds=nrounds, seed=2018) 266 | pred_test = 0.5*pred_test1 + 0.5*pred_test2 267 | 268 | test_id = list(test_df["id"].values) 269 | test_preds = list(np.expm1(pred_test)) 270 | return test_id, test_preds, loss 271 | 272 | if __name__ == "__main__": 273 | test_ids = [] 274 | preds = [] 275 | cv = [] 276 | for week_num in [146, 147, 148, 149, 150, 151, 152, 153, 154, 155]: 277 | ids, prs, ll = run_model(week_num) 278 | test_ids.extend(ids) 279 | preds.extend(prs) 280 | cv.append(ll) 281 | print(cv) 282 | sub_df = pd.DataFrame({"id":test_ids}) 283 | sub_df["num_orders"] = preds 284 | sub_df.to_csv("sub8.csv", index=False) 285 | -------------------------------------------------------------------------------- /AV_Genpact_2018/readme.md: -------------------------------------------------------------------------------- 1 | Code for the Genpact Hackathon 2 | https://datahack.analyticsvidhya.com/contest/genpact-machine-learning-hackathon/ 3 | -------------------------------------------------------------------------------- /AV_Hack3/buildModel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Code for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong ! 
4 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838 5 | __author__ : SRK 6 | """ 7 | import sys 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.preprocessing import LabelEncoder 11 | from sklearn.cross_validation import KFold 12 | from sklearn import ensemble 13 | from sklearn.metrics import mean_squared_error 14 | import xgboost as xgb 15 | 16 | if __name__ == "__main__": 17 | # setting the input path and reading the data into dataframe # 18 | data_path = "../Data/" 19 | train = pd.read_csv(data_path+"Train.csv") 20 | test = pd.read_csv(data_path+"Test.csv") 21 | 22 | ## mapping the var8 with the given data and create a new column ## 23 | var8_map_dict = {"HXYB":0, "HXYC":0, "HXYD":0, "HXYE":0, "HXYF":1, "HXFG":1, "HXYH":1, "HXYI":1, "HXYJ":2, "HXYK":2, "HXYL":2, "HXYM":3, "HXYN":3, "HXYO":3} 24 | train_var8_map = [] 25 | for var_val in train["Var8"]: 26 | if var8_map_dict.has_key(var_val): 27 | train_var8_map.append(var8_map_dict[var_val]) 28 | else: 29 | train_var8_map.append(4) # just in case if the value is missing in dict, assign 4 30 | test_var8_map = [] 31 | for var_val in test["Var8"]: 32 | if var8_map_dict.has_key(var_val): 33 | test_var8_map.append(var8_map_dict[var_val]) 34 | else: 35 | test_var8_map.append(4) 36 | train["Var8Map"] = train_var8_map 37 | test["Var8Map"] = test_var8_map 38 | 39 | ## categical column name list ## 40 | categorical_columns = ['Var4', 'institute_city', 'institute_state', 'Var8', 'institute_country', 'Var10', 'Var11', 'Var12', 'Var13', 'Var14', 'Var15', 'Instructor_Past_Performance', 'Instructor_Association_Industry_Expert', 'project_subject', 'subject_area', 'secondary_subject', 'secondary_area', 'Resource_Category', 'Resource_Sub_Category', 'Var23', 'Var24'] 41 | 42 | ## Getting the ID and DV from the data frame ## 43 | train_y = np.array(train["Project_Valuation"]) 44 | train_y[train_y>6121] = 6121 45 | train_id = np.array(train["ID"]) 46 | test_id = np.array(test["ID"]) 47 | 48 | ## Creating the IDVs from the train and test dataframe ## 49 | train_X = train.copy() 50 | test_X = test.copy() 51 | 52 | ## Fill up the na values with -999 ## 53 | train_X = train_X.fillna(-999) 54 | test_X = test_X.fillna(-999) 55 | 56 | ## One hot encoding the categorical variables ## 57 | for var in categorical_columns: 58 | lb = LabelEncoder() 59 | full_var_data = pd.concat((train_X[var],test_X[var]),axis=0).astype('str') 60 | lb.fit( full_var_data ) 61 | train_X[var] = lb.transform(train_X[var].astype('str')) 62 | test_X[var] = lb.transform(test_X[var].astype('str')) 63 | 64 | ## Dropping the unnecessary columns from IDVs ## 65 | train_X = np.array( train_X.drop(['ID','Project_Valuation'],axis=1) ) 66 | test_X = np.array( test_X.drop(['ID','Unnamed: 26'],axis=1) ) 67 | print "Train shape is : ",train_X.shape 68 | print "Test shape is : ",test_X.shape 69 | 70 | 71 | ################################ MODEL BUILDING ################################################## 72 | print "Building RF1" 73 | reg = ensemble.RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_leaf=7, max_features="auto", n_jobs=4, random_state=0) 74 | reg.fit(train_X, train_y) 75 | pred_test_y_rf1 = reg.predict(test_X) 76 | 77 | print "Building RF2" 78 | reg = ensemble.RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=2, max_features=0.8, n_jobs=4, random_state=0) 79 | reg.fit(train_X, train_y) 80 | pred_test_y_rf2 = reg.predict(test_X) 81 | 82 | print "Building GB1" 83 | reg = 
ensemble.GradientBoostingRegressor(n_estimators=400, max_depth=7, min_samples_leaf=8, max_features=0.3, subsample=0.6, learning_rate=0.01, random_state=0) 84 | reg.fit(train_X, train_y) 85 | pred_test_y_gb1 = reg.predict(test_X) 86 | 87 | print "Building GB2" 88 | reg = ensemble.GradientBoostingRegressor(n_estimators=600, max_depth=6, min_samples_leaf=8, max_features=0.3, subsample=0.6, learning_rate=0.01, random_state=0) 89 | reg.fit(train_X, train_y) 90 | pred_test_y_gb2 = reg.predict(test_X) 91 | 92 | print "Building XGB1" 93 | params = {} 94 | params["objective"] = "reg:linear" 95 | params["eta"] = 0.005 96 | params["min_child_weight"] = 10 97 | params["subsample"] = 0.7 98 | params["colsample_bytree"] = 0.6 99 | params["scale_pos_weight"] = 0.8 100 | params["silent"] = 1 101 | params["max_depth"] = 5 102 | params["max_delta_step"]=2 103 | params["seed"] = 0 104 | plst = list(params.items()) 105 | xgtrain = xgb.DMatrix(train_X, label=train_y) 106 | xgtest = xgb.DMatrix(test_X) 107 | num_rounds = 1100 108 | model = xgb.train(plst, xgtrain, num_rounds) 109 | pred_test_y_xgb1 = model.predict(xgtest) 110 | 111 | print "Building XGB2" 112 | params = {} 113 | params["objective"] = "reg:linear" 114 | params["eta"] = 0.005 115 | params["min_child_weight"] = 6 116 | params["subsample"] = 0.7 117 | params["colsample_bytree"] = 0.6 118 | params["scale_pos_weight"] = 0.8 119 | params["silent"] = 1 120 | params["max_depth"] = 8 121 | params["max_delta_step"]=2 122 | params["seed"] = 0 123 | plst = list(params.items()) 124 | xgtrain = xgb.DMatrix(train_X, label=train_y) 125 | xgtest = xgb.DMatrix(test_X) 126 | num_rounds = 800 127 | model = xgb.train(plst, xgtrain, num_rounds) 128 | pred_test_y_xgb2 = model.predict(xgtest) 129 | 130 | ## Averaging the six models ## 131 | pred_test_y = 0.15*pred_test_y_rf1 + 0.15*pred_test_y_rf2 + 0.15*pred_test_y_gb1 + 0.15*pred_test_y_gb2 + 0.2*pred_test_y_xgb1 + 0.2*pred_test_y_xgb2 132 | 133 | ## Writing the submission file ## 134 | out_df = pd.DataFrame({"ID":test_id, "Project_Valuation":pred_test_y}) 135 | out_df.to_csv("sub_2.csv", index=False) 136 | 137 | -------------------------------------------------------------------------------- /AV_Hack3/readme.md: -------------------------------------------------------------------------------- 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong ! 2 | 3 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838 4 | 5 | ###### My approach for the hackathon is as follows: 6 | 7 | 1. Converted all the categorical variables into one-hot encoded variables 8 | 9 | 2. Truncate the "Project Evaluation" value at 99.9th percentile value (value is 6121) 10 | 11 | 3. Built tree based models by selecting the params through cross validation 12 | 13 | a. Random Forest (2 models with different params - 1 with shorter trees and 1 with deep trees) 14 | 15 | b. Gradient Boosting (2 models with different params) 16 | 17 | c. Extreme Gradient Boosting (2 models with different params) 18 | 19 | 4. 
Simple weighted average of all the six models based on local validation 20 | 21 | -------------------------------------------------------------------------------- /AV_Hackathon_July11/benchmark.R: -------------------------------------------------------------------------------- 1 | ## setting the working directory ## 2 | setwd("../Data/") 3 | 4 | ## reading the train and test files ## 5 | train = read.csv("train.csv") 6 | test = read.csv("test.csv") 7 | 8 | ## removing the categorical columns for benchmark script. Create dummy variables for further improvement ## 9 | test_id = test["id"] 10 | train["id"] = NULL 11 | test["id"] = NULL 12 | train["Category_article"] = NULL 13 | test["Category_article"] = NULL 14 | train["Day_of_publishing"] = NULL 15 | test["Day_of_publishing"] = NULL 16 | 17 | ## creating a linear regression model and predicting on teset set ## 18 | ## change the modeling methodology and try different models ## 19 | model = lm(shares~., data=train) 20 | summary(model) 21 | preds = predict(model, test, type='response') 22 | 23 | ## writing the outputs to csv file ## 24 | out_df = data.frame(test_id, preds) 25 | names(out_df) = c("id", "predictions") 26 | write.csv(out_df, "benchmark_R.csv", row.names=F, quote=F) 27 | -------------------------------------------------------------------------------- /AV_Hackathon_July11/benchmark.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Benchmark script for Analytics Vidhya Online Hackathon using Linear Regression. 4 | __author__ : SRK 5 | Date : July 11, 2015 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from sklearn.feature_extraction import DictVectorizer 11 | from sklearn.linear_model import LinearRegression 12 | 13 | if __name__ == "__main__": 14 | ## specify the location of input files ## 15 | data_path = "../Data/" 16 | train_file = data_path + "train.csv" 17 | test_file = data_path + "test.csv" 18 | names_categorical = ['Category_article', 'Day_of_publishing'] 19 | 20 | ## creating pandas data frame for train and test ## 21 | train = pd.read_csv(train_file) 22 | test = pd.read_csv(test_file) 23 | 24 | # strpping the leading space in column names (some of them have leading spaces while reading using pandas read_csv) # 25 | train.columns = [i.strip() for i in list(train.columns.values)] 26 | test.columns = [i.strip() for i in list(test.columns.values)] 27 | 28 | ## getting the DV and ID values ## 29 | train_y = train["shares"] 30 | train_id = train["id"] 31 | test_id = test["id"] 32 | 33 | ## dropping the categorical columns, ID and DV from dataframe ## 34 | train_X = train.drop( ["id"]+names_categorical+["shares"], axis=1) 35 | test_X = test.drop( ["id"]+names_categorical, axis=1) 36 | print "Train, test shape : ", train_X.shape, test_X.shape 37 | 38 | ## building a linear regression model and predicting on test set ## 39 | lm_model = LinearRegression() 40 | lm_model.fit(train_X, train_y) 41 | pred_test_y = lm_model.predict(test_X) 42 | 43 | ## Writing it to output csv files ## 44 | out_df = pd.DataFrame({"id":test_id, "predictions":pred_test_y}) 45 | out_df.to_csv("benchmark.csv", index=False) 46 | -------------------------------------------------------------------------------- /AV_Hackathon_July11/readme.md: -------------------------------------------------------------------------------- 1 | This folder has codes for Analytics Vidhya Hackathon held on July 11,2015 2 | 
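Both benchmark scripts above drop the two categorical columns and note that dummy variables are the obvious next improvement. A minimal sketch of that idea in Python, reusing the column names from benchmark.py (this script is not part of the original folder):

import pandas as pd
from sklearn.linear_model import LinearRegression

train = pd.read_csv("../Data/train.csv")
test = pd.read_csv("../Data/test.csv")
# strip the stray leading spaces in the column names, as benchmark.py does
train.columns = [c.strip() for c in train.columns]
test.columns = [c.strip() for c in test.columns]

cat_cols = ["Category_article", "Day_of_publishing"]
# one-hot encode train and test together so both get identical dummy columns
full = pd.concat([train.drop("shares", axis=1), test], axis=0)
full = pd.get_dummies(full, columns=cat_cols)
train_X = full.iloc[:train.shape[0]].drop("id", axis=1)
test_X = full.iloc[train.shape[0]:].drop("id", axis=1)

model = LinearRegression()
model.fit(train_X, train["shares"])
out = pd.DataFrame({"id": test["id"], "predictions": model.predict(test_X)})
out.to_csv("benchmark_dummies.csv", index=False)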
-------------------------------------------------------------------------------- /AV_Knocktober/getOutcome.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | data_path = "../input/Train/" 5 | first_camp = pd.read_csv( data_path + "First_Health_Camp_Attended.csv" ) 6 | second_camp = pd.read_csv( data_path + "Second_Health_Camp_Attended.csv" ) 7 | third_camp = pd.read_csv( data_path + "Third_Health_Camp_Attended.csv" ) 8 | print first_camp.shape, second_camp.shape, third_camp.shape 9 | 10 | col_names = [['Patient_ID','Health_Camp_ID','Outcome']] 11 | first_camp = first_camp[['Patient_ID','Health_Camp_ID','Health_Score']] 12 | first_camp.columns = col_names 13 | second_camp = second_camp[['Patient_ID','Health_Camp_ID','Health Score']] 14 | second_camp.columns = col_names 15 | third_camp = third_camp[['Patient_ID','Health_Camp_ID','Number_of_stall_visited']] 16 | third_camp = third_camp[third_camp['Number_of_stall_visited']>0] 17 | third_camp.columns = col_names 18 | print third_camp.shape 19 | 20 | all_camps = pd.concat([first_camp, second_camp, third_camp]) 21 | all_camps['Outcome'] = 1 22 | print all_camps.shape 23 | 24 | train = pd.read_csv(data_path + "Train.csv") 25 | print train.shape 26 | 27 | train = train.merge(all_camps, on=['Patient_ID','Health_Camp_ID'], how='left') 28 | train['Outcome'] = train['Outcome'].fillna(0).astype('int') 29 | train.to_csv(data_path+'train_with_outcome.csv', index=False) 30 | print train.Outcome.value_counts() 31 | -------------------------------------------------------------------------------- /AV_Knocktober/readme.md: -------------------------------------------------------------------------------- 1 | Codes and Files used for [AV Data hack](https://datahack.analyticsvidhya.com/contest/all/) - [Knocktober](https://datahack.analyticsvidhya.com/contest/knocktober-2016/) 2 | 3 | We ([Rohan Rao](https://github.com/rohanrao91) and myself) finished first in this competition and the leaderboard can be accessed [here](https://datahack.analyticsvidhya.com/contest/knocktober-2016/lb) 4 | 5 | The code file - vopani_final.R is written by Rohan Rao and you can see more about his comments [here](https://github.com/rohanrao91/AnalyticsVidhya_Knocktober) 6 | 7 | The code file - srk_final.py is written by me. Finally we blended both our models which ended up at first position. 
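The blending script itself is not included in this folder; a minimal sketch of how the two submission files written by srk_final.py and vopani_final.R could be combined (the equal weights are illustrative, not the weights actually used):

import pandas as pd

srk = pd.read_csv("sub_srk.csv")
vopani = pd.read_csv("sub_vopani.csv")

# align on the id columns rather than on row order, since the R script filters
# and reorders the test rows before scoring them
blend = srk.merge(vopani, on=["Patient_ID", "Health_Camp_ID"], how="left",
                  suffixes=("_srk", "_vopani"))
blend["Outcome"] = 0.5 * blend["Outcome_srk"] + \
                   0.5 * blend["Outcome_vopani"].fillna(blend["Outcome_srk"])
blend[["Patient_ID", "Health_Camp_ID", "Outcome"]].to_csv("sub_blend.csv", index=False)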
8 | -------------------------------------------------------------------------------- /AV_Knocktober/srk_final.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import operator 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn import preprocessing, model_selection, metrics, ensemble 6 | import xgboost as xgb 7 | 8 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 9 | grouped_df = count_df.groupby(var_name, as_index=False).agg('size').reset_index() 10 | grouped_df.columns = [var_name, "var_count"] 11 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 12 | merged_df.fillna(-1, inplace=True) 13 | return list(merged_df["var_count"]) 14 | 15 | def create_feature_map(features): 16 | outfile = open('xgb.fmap', 'w') 17 | for i, feat in enumerate(features): 18 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 19 | outfile.close() 20 | 21 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, extra_X=None, seed_val=0, num_rounds=200): 22 | params = {} 23 | params["objective"] = "binary:logistic" 24 | params['eval_metric'] = 'auc' 25 | params["eta"] = 0.02 26 | params["subsample"] = 0.8 27 | params["min_child_weight"] = 5 28 | params["colsample_bytree"] = 0.7 29 | params["max_depth"] = 6 30 | params["silent"] = 1 31 | params["seed"] = seed_val 32 | 33 | plst = list(params.items()) 34 | xgtrain = xgb.DMatrix(train_X, label=train_y) 35 | 36 | if test_y is not None: 37 | xgtest = xgb.DMatrix(test_X, label=test_y) 38 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 39 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=300) 40 | else: 41 | xgtest = xgb.DMatrix(test_X) 42 | model = xgb.train(plst, xgtrain, num_rounds) 43 | 44 | if feature_names is not None: 45 | create_feature_map(feature_names) 46 | model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True) 47 | importance = model.get_fscore(fmap='xgb.fmap') 48 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 49 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 50 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 51 | imp_df.to_csv("imp_feat.txt", index=False) 52 | 53 | pred_test_y = model.predict(xgtest) 54 | loss = 0 55 | 56 | if extra_X is not None: 57 | xgtest = xgb.DMatrix(extra_X) 58 | pred_extra_y = model.predict(xgtest) 59 | return pred_test_y, pred_extra_y, loss 60 | 61 | if test_y is not None: 62 | loss = metrics.roc_auc_score(test_y, pred_test_y) 63 | print loss 64 | return pred_test_y, loss 65 | else: 66 | return pred_test_y,loss 67 | 68 | if __name__ == "__main__": 69 | ## Reading the files and converting the dates ## 70 | data_path = "../input/Train/" 71 | train = pd.read_csv(data_path + "train_with_outcome.csv") 72 | test = pd.read_csv(data_path + "Test.csv") 73 | train['Registration_Date'].fillna('10-jan-90', inplace=True) 74 | test['Registration_Date'].fillna('10-jan-90', inplace=True) 75 | train['Registration_Date'] = pd.to_datetime(train['Registration_Date'], format="%d-%b-%y") 76 | test['Registration_Date'] = pd.to_datetime(test['Registration_Date'], format="%d-%b-%y") 77 | train['Registration_Date'] = train['Registration_Date'].apply(lambda x: x.toordinal()) 78 | test['Registration_Date'] = test['Registration_Date'].apply(lambda x: x.toordinal()) 79 | print train.shape, test.shape 80 | 81 | ## Getting patient details and merging with train and test ## 82 | patient = pd.read_csv(data_path + "Patient_Profile.csv", 
na_values=['None','']) 83 | patient['First_Interaction'] = pd.to_datetime(patient['First_Interaction'], format="%d-%b-%y") 84 | patient['First_Interaction'] = patient['First_Interaction'].apply(lambda x: x.toordinal()) 85 | print patient.shape 86 | train = train.merge(patient, on=['Patient_ID'], how='left') 87 | test = test.merge(patient, on=['Patient_ID'], how='left') 88 | print train.shape, test.shape 89 | 90 | ## Getting health camp details and merging with train and test ## 91 | hc_details = pd.read_csv(data_path + "Health_Camp_Detail.csv") 92 | hc_ids = list(hc_details.Health_Camp_ID.values) 93 | hc_details['Camp_Start_Date'] = pd.to_datetime(hc_details['Camp_Start_Date'], format="%d-%b-%y") 94 | hc_details['Camp_End_Date'] = pd.to_datetime(hc_details['Camp_End_Date'], format="%d-%b-%y") 95 | hc_details['Camp_Start_Date'] = hc_details['Camp_Start_Date'].apply(lambda x: x.toordinal()) 96 | hc_details['Camp_End_Date'] = hc_details['Camp_End_Date'].apply(lambda x: x.toordinal()) 97 | hc_details['Camp_Duration_Days'] = hc_details['Camp_End_Date'] - hc_details['Camp_Start_Date'] 98 | print hc_details.head() 99 | train = train.merge(hc_details, on=['Health_Camp_ID'], how='left') 100 | test = test.merge(hc_details, on=['Health_Camp_ID'], how='left') 101 | print train.shape, test.shape 102 | 103 | ## Reading the camp files ## 104 | first_camp_details = pd.read_csv(data_path + "First_Health_Camp_Attended.csv") 105 | first_camp_details = first_camp_details[["Patient_ID","Health_Camp_ID","Donation","Health_Score"]] 106 | train = train.merge(first_camp_details, on=["Patient_ID","Health_Camp_ID"], how='left') 107 | third_camp_details = pd.read_csv(data_path + "Third_Health_Camp_Attended.csv") 108 | third_camp_details = third_camp_details[["Patient_ID","Health_Camp_ID","Number_of_stall_visited","Last_Stall_Visited_Number"]] 109 | train = train.merge(third_camp_details, on=["Patient_ID","Health_Camp_ID"], how='left') 110 | train["Number_of_stall_visited"].fillna(0, inplace=True) 111 | train["Donation"].fillna(0, inplace=True) 112 | train["Health_Score"].fillna(0, inplace=True) 113 | print train.shape, test.shape 114 | 115 | 116 | ## Filling NA with -99 ## 117 | train.fillna(-99, inplace=True) 118 | test.fillna(-99, inplace=True) 119 | 120 | ## print create additional features ## 121 | print "Getting additional features." 
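    # The date columns were converted to ordinals above, so the plain subtractions
    # below give day gaps: registration vs. camp start/end, and registration and
    # camp start vs. the patient's first interaction.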
122 | train["Diff_CampStart_Registration"] = train["Camp_Start_Date"] - train["Registration_Date"] 123 | test["Diff_CampStart_Registration"] = test["Camp_Start_Date"] - test["Registration_Date"] 124 | 125 | train["Diff_CampEnd_Registration"] = train["Camp_End_Date"] - train["Registration_Date"] 126 | test["Diff_CampEnd_Registration"] = test["Camp_End_Date"] - test["Registration_Date"] 127 | 128 | train["Diff_Registration_FirstInteraction"] = train["Registration_Date"] - train["First_Interaction"] 129 | test["Diff_Registration_FirstInteraction"] = test["Registration_Date"] - test["First_Interaction"] 130 | 131 | train["Diff_CampStart_FirstInteraction"] = train["Camp_Start_Date"] - train["First_Interaction"] 132 | test["Diff_CampStart_FirstInteraction"] = test["Camp_Start_Date"] - test["First_Interaction"] 133 | print train.shape, test.shape 134 | 135 | ## Getitng the cat columns and label encode them ## 136 | cat_columns = [] 137 | for col in train.columns: 138 | if train[col].dtype == 'object': 139 | print col 140 | cat_columns.append(col) 141 | enc = preprocessing.LabelEncoder() 142 | full_list = list(train[col].values) + list(test[col].values) 143 | enc.fit(full_list) 144 | train[col] = enc.transform(list(train[col].values)) 145 | test[col] = enc.transform(list(test[col].values)) 146 | 147 | # getting count # 148 | for col in ["Patient_ID", "Health_Camp_ID"]: 149 | print "Count : ", col 150 | full_df = pd.concat([train, test]) 151 | train["Count_"+col] = getCountVar(train, full_df, col) 152 | test["Count_"+col] = getCountVar(test, full_df, col) 153 | 154 | 155 | ## do sorting so as to compute the next variables ## 156 | train = train.sort_values(['Camp_Start_Date', 'Camp_End_Date', 'Patient_ID']).reset_index(drop=True) 157 | test = test.sort_values(['Camp_Start_Date', 'Camp_End_Date', 'Patient_ID']).reset_index(drop=True) 158 | print train.head() 159 | 160 | print "First pass to get necessary details.." 
161 | people_camp_dict = {} 162 | people_date_dict = {} 163 | people_dv_dict = {} 164 | people_cat1_dict = {} 165 | people_cdate_dict = {} 166 | people_donation_dict = {} 167 | people_num_stall_dict = {} 168 | people_last_stall_dict = {} 169 | people_fscore_dict = {} 170 | for ind, row in train.iterrows(): 171 | pid = row['Patient_ID'] 172 | cid = row['Health_Camp_ID'] 173 | reg_date = row['Registration_Date'] 174 | dv = row['Outcome'] 175 | cat1 = row['Category1'] 176 | cdate = row['Camp_Start_Date'] 177 | donation = row['Donation'] 178 | num_stall = row['Number_of_stall_visited'] 179 | fscore = row['Health_Score'] 180 | 181 | tlist = people_camp_dict.get(pid,[]) 182 | tlist.append(cid) 183 | people_camp_dict[pid] = tlist[:] 184 | 185 | tlist = people_date_dict.get(pid,[]) 186 | tlist.append(reg_date) 187 | people_date_dict[pid] = tlist[:] 188 | 189 | tlist = people_dv_dict.get(pid, []) 190 | tlist.append(dv) 191 | people_dv_dict[pid] = tlist[:] 192 | 193 | tlist = people_donation_dict.get(pid, []) 194 | tlist.append(donation) 195 | people_donation_dict[pid] = tlist[:] 196 | 197 | tlist = people_num_stall_dict.get(pid, []) 198 | tlist.append(num_stall) 199 | people_num_stall_dict[pid] = tlist[:] 200 | 201 | tlist = people_fscore_dict.get(pid, []) 202 | tlist.append(fscore) 203 | people_fscore_dict[pid] = tlist[:] 204 | 205 | tlist = people_cat1_dict.get(pid, []) 206 | tlist.append(cat1) 207 | people_cat1_dict[pid] = tlist[:] 208 | 209 | tlist = people_cdate_dict.get(pid, []) 210 | tlist.append(cdate) 211 | people_cdate_dict[pid] = tlist[:] 212 | 213 | print "Creating features now using dict for train.." 214 | last_date_list = [] 215 | last_dv_list = [] 216 | last_cat1_list = [] 217 | mean_dv_list = [] 218 | last_cdate_list = [] 219 | last_donation_list = [] 220 | last_num_stall_list = [] 221 | last_fscore_list=[] 222 | for ind, row in train.iterrows(): 223 | pid = row['Patient_ID'] 224 | reg_date = row['Registration_Date'] 225 | cat1 = row['Category1'] 226 | cid = row['Health_Camp_ID'] 227 | cdate = row['Camp_Start_Date'] 228 | 229 | camp_list = people_camp_dict[pid] 230 | for ind, camp in enumerate(camp_list): 231 | if camp == cid: 232 | use_index = ind 233 | break 234 | 235 | tlist = people_date_dict[pid][:use_index] 236 | if len(tlist)>0: 237 | last_date_list.append(reg_date-tlist[-1]) 238 | else: 239 | last_date_list.append(-99) 240 | 241 | tlist = people_dv_dict[pid][:use_index] 242 | if len(tlist)>0: 243 | last_dv_list.append(tlist[-1]) 244 | mean_dv_list.append(np.mean(tlist)) 245 | else: 246 | last_dv_list.append(-99) 247 | mean_dv_list.append(-99) 248 | 249 | tlist = people_donation_dict[pid][:use_index] 250 | if len(tlist)>0: 251 | last_donation_list.append(np.sum(tlist)) 252 | else: 253 | last_donation_list.append(-99) 254 | 255 | tlist = people_num_stall_dict[pid][:use_index] 256 | if len(tlist)>0: 257 | last_num_stall_list.append(np.sum(tlist)) 258 | else: 259 | last_num_stall_list.append(-99) 260 | 261 | tlist = people_fscore_dict[pid][:use_index] 262 | if len(tlist)>0: 263 | last_fscore_list.append(np.mean([i for i in tlist if i!=0])) 264 | else: 265 | last_fscore_list.append(-99) 266 | 267 | tlist = people_cat1_dict[pid][:use_index] 268 | if len(tlist)>0: 269 | last_cat1_list.append(tlist[-1]) 270 | else: 271 | last_cat1_list.append(-99) 272 | 273 | tlist = people_date_dict[pid][use_index+1:] 274 | if len(tlist)>0: 275 | last_cdate_list.append(reg_date-tlist[0]) 276 | else: 277 | last_cdate_list.append(-99) 278 | 279 | print last_fscore_list[:50] 280 | 281 | 
train["Last_Reg_Date"] = last_date_list[:] 282 | train["Mean_Outcome"] = mean_dv_list[:] 283 | train["Last_Cat1"] = last_cat1_list[:] 284 | train["Next_Reg_Date"] = last_cdate_list 285 | train["Sum_Donations"] = last_donation_list[:] 286 | train["Sum_NumStalls"] = last_num_stall_list[:] 287 | train["Mean_Fscore"] = last_fscore_list[:] 288 | 289 | print "Prepare dict using val.." 290 | for ind, row in test.iterrows(): 291 | pid = row['Patient_ID'] 292 | cid = row['Health_Camp_ID'] 293 | reg_date = row['Registration_Date'] 294 | cat1 = row['Category1'] 295 | cdate = row['Camp_Start_Date'] 296 | 297 | tlist = people_camp_dict.get(pid,[]) 298 | tlist.append(cid) 299 | people_camp_dict[pid] = tlist[:] 300 | 301 | tlist = people_date_dict.get(pid,[]) 302 | tlist.append(reg_date) 303 | people_date_dict[pid] = tlist[:] 304 | 305 | tlist = people_cat1_dict.get(pid, []) 306 | tlist.append(cat1) 307 | people_cat1_dict[pid] = tlist[:] 308 | 309 | tlist = people_cdate_dict.get(pid, []) 310 | tlist.append(cdate) 311 | people_cdate_dict[pid] = tlist[:] 312 | 313 | print "Creating features for val using dict.." 314 | last_date_list = [] 315 | last_dv_list = [] 316 | last_cat1_list = [] 317 | mean_dv_list = [] 318 | last_cdate_list = [] 319 | last_donation_list = [] 320 | last_num_stall_list = [] 321 | last_fscore_list = [] 322 | for ind, row in test.iterrows(): 323 | pid = row['Patient_ID'] 324 | reg_date = row['Registration_Date'] 325 | cat1 = row['Category1'] 326 | cid = row['Health_Camp_ID'] 327 | cdate = row['Camp_Start_Date'] 328 | 329 | camp_list = people_camp_dict[pid] 330 | for ind, camp in enumerate(camp_list): 331 | if camp == cid: 332 | use_index = ind 333 | break 334 | 335 | tlist = people_date_dict[pid][:use_index] 336 | if len(tlist)>0: 337 | last_date_list.append(reg_date-tlist[-1]) 338 | else: 339 | last_date_list.append(-99) 340 | 341 | tlist = people_dv_dict.get(pid, []) 342 | if len(tlist)>0: 343 | last_dv_list.append(tlist[-1]) 344 | mean_dv_list.append(np.mean(tlist)) 345 | else: 346 | last_dv_list.append(-99) 347 | mean_dv_list.append(-99) 348 | 349 | tlist = people_donation_dict.get(pid, []) 350 | if len(tlist)>0: 351 | last_donation_list.append(np.sum(tlist)) 352 | else: 353 | last_donation_list.append(-99) 354 | 355 | tlist = people_num_stall_dict.get(pid, []) 356 | if len(tlist)>0: 357 | last_num_stall_list.append(np.sum(tlist)) 358 | else: 359 | last_num_stall_list.append(-99) 360 | 361 | tlist = people_fscore_dict.get(pid, []) 362 | if len(tlist)>0: 363 | last_fscore_list.append(np.mean([i for i in tlist if i!=0])) 364 | else: 365 | last_fscore_list.append(-99) 366 | 367 | tlist = people_cat1_dict[pid][:use_index] 368 | if len(tlist)>0: 369 | last_cat1_list.append(tlist[-1]) 370 | else: 371 | last_cat1_list.append(-99) 372 | 373 | tlist = people_date_dict[pid][use_index+1:] 374 | if len(tlist)>0: 375 | last_cdate_list.append(reg_date-tlist[0]) 376 | else: 377 | last_cdate_list.append(-99) 378 | 379 | test["Last_Reg_Date"] = last_date_list[:] 380 | test["Mean_Outcome"] = mean_dv_list[:] 381 | test["Last_Cat1"] = last_cat1_list[:] 382 | test["Next_Reg_Date"] = last_cdate_list[:] 383 | test["Sum_Donations"] = last_donation_list[:] 384 | test["Sum_NumStalls"] = last_num_stall_list[:] 385 | test["Mean_Fscore"] = last_fscore_list[:] 386 | 387 | train.fillna(-99, inplace=True) 388 | test.fillna(-99, inplace=True) 389 | 390 | print "Getting dv and id values" 391 | train_y = train.Outcome.values 392 | 393 | ## Columns to drop ## 394 | print "Dropping columns.." 
395 | drop_cols = ["Camp_Start_Date", "Camp_End_Date", "Registration_Date"] #, "First_Interaction"] 396 | drop_cols = drop_cols + ["LinkedIn_Shared", "Facebook_Shared", "Twitter_Shared", "Online_Follower", "Var4"] 397 | train.drop(drop_cols, axis=1, inplace=True) 398 | test.drop(drop_cols, axis=1, inplace=True) 399 | print train.shape, test.shape 400 | 401 | # preparing train and test # 402 | print "Choose the columns to use.." 403 | xcols = [col for col in train.columns if col not in ["Outcome", "Health_Camp_ID", "Patient_ID", "Der_Var1", "Number_of_stall_visited","Last_Stall_Visited_Number", "Donation", "Health_Score", "Mean_Fscore"]] 404 | print xcols 405 | train_X = np.array(train[xcols]) 406 | test_X = np.array(test[xcols]) 407 | print train_X.shape, test_X.shape 408 | 409 | print "Final Model.." 410 | preds = 0 411 | for seed_val, num_rounds in [[0,200], [2016,250], [1323, 225]]: 412 | print seed_val, num_rounds 413 | temp_preds, loss = runXGB(train_X, train_y, test_X, feature_names=xcols, seed_val=seed_val, num_rounds=num_rounds) 414 | preds += temp_preds 415 | preds = preds/3. 416 | 417 | out_df = pd.DataFrame({"Patient_ID":test.Patient_ID.values}) 418 | out_df["Health_Camp_ID"] = test.Health_Camp_ID.values 419 | out_df["Outcome"] = preds 420 | out_df.to_csv("sub_srk.csv", index=False) 421 | -------------------------------------------------------------------------------- /AV_Knocktober/vopani_final.R: -------------------------------------------------------------------------------- 1 | ## setting working directory 2 | path <- "/Volumes/External SD/AnalyticsVidhya/Knocktober" 3 | setwd(path) 4 | 5 | seed <- 235 6 | set.seed(seed) 7 | 8 | 9 | ## loading libraries 10 | library(data.table) 11 | library(xgboost) 12 | 13 | 14 | ## loading data 15 | train <- fread("./raw/Train.csv") 16 | test <- fread("./raw/Test_D7W1juQ.csv") 17 | 18 | health_camp <- fread("./raw/Health_Camp_Detail.csv") 19 | 20 | health_1 <- fread("./raw/First_Health_Camp_Attended.csv") 21 | health_2 <- fread("./raw/Second_Health_Camp_Attended.csv") 22 | health_3 <- fread("./raw/Third_Health_Camp_Attended.csv") 23 | 24 | health_1[, V5 := NULL] 25 | setnames(health_1, "Health_Score", "Health_Score_1") 26 | setnames(health_2, "Health Score", "Health_Score_2") 27 | 28 | patient <- fread("./raw/Patient_Profile.csv") 29 | 30 | train[, train_flag := 1] 31 | test[, train_flag := 0] 32 | 33 | 34 | ## processing data 35 | X_panel <- rbind(train, test) 36 | 37 | X_panel <- merge(X_panel, health_1, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID")) 38 | X_panel <- merge(X_panel, health_2, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID")) 39 | X_panel <- merge(X_panel, health_3, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID")) 40 | 41 | X_panel <- merge(X_panel, health_camp, all.x = TRUE, by = "Health_Camp_ID") 42 | X_panel <- merge(X_panel, patient, all.x = TRUE, by = "Patient_ID") 43 | 44 | X_panel[, target := 0] 45 | 46 | X_panel$target[X_panel$Category1 != "Third" & (X_panel$Health_Score_1 > 0 | X_panel$Health_Score_2 > 0)] <- 1 47 | X_panel$target[X_panel$Category1 == "Third" & X_panel$Number_of_stall_visited > 0] <- 1 48 | 49 | X_panel[, ":="(Registration_Date = as.Date(Registration_Date, "%d-%b-%y"), 50 | Camp_Start_Date = as.Date(Camp_Start_Date, "%d-%b-%y"), 51 | Camp_End_Date = as.Date(Camp_End_Date, "%d-%b-%y"), 52 | First_Interaction = as.Date(First_Interaction, "%d-%b-%y"), 53 | Category1 = as.numeric(as.factor(Category1)), 54 | Category2 = as.numeric(as.factor(Category2)), 55 | City_Type = 
as.numeric(as.factor(City_Type)), 56 | Income = as.numeric(as.factor(Income)), 57 | Employer_Category = as.numeric(as.factor(Employer_Category)), 58 | Education_Score = as.numeric(Education_Score), 59 | Age = as.numeric(Age))] 60 | 61 | setorder(X_panel, Patient_ID, Registration_Date) 62 | X_panel$order <- seq(1, nrow(X_panel)) 63 | 64 | X_date <- X_panel[, c("Patient_ID", "Registration_Date", "order"), with = FALSE] 65 | X_date$order <- X_date$order + 1 66 | names(X_date)[2] <- "Prev_Date" 67 | 68 | X_panel <- merge(X_panel, X_date, all.x = TRUE, by = c("Patient_ID", "order")) 69 | 70 | X_date$order <- X_date$order - 2 71 | names(X_date)[2] <- "Next_Date" 72 | 73 | X_panel <- merge(X_panel, X_date, all.x = TRUE, by = c("Patient_ID", "order")) 74 | 75 | X_panel[, ":="(Start_Date_Diff = as.numeric(Registration_Date - Camp_Start_Date), 76 | End_Date_Diff = as.numeric(Camp_End_Date - Registration_Date), 77 | Interaction_Date_Diff = as.numeric(Registration_Date - First_Interaction), 78 | Prev_Date_Diff = as.numeric(Registration_Date - Prev_Date), 79 | Next_Date_Diff = as.numeric(Registration_Date - Next_Date), 80 | Camp_Start_Year = year(Camp_Start_Date), 81 | Registration_Year = year(Registration_Date), 82 | Registration_Month = month(Registration_Date), 83 | Registration_Day = wday(Registration_Date))] 84 | 85 | X_panel <- X_panel[Camp_Start_Year >= 2005] 86 | X_panel <- X_panel[!is.na(Registration_Date)] 87 | X_panel <- X_panel[Category3 == 2] 88 | 89 | X_patient <- X_panel[, .(Count_Patient = .N), .(Patient_ID)] 90 | X_panel <- merge(X_panel, X_patient, by = "Patient_ID") 91 | 92 | X_patient_date <- X_panel[, .(Count_Patient_Date = .N), .(Patient_ID, Registration_Date)] 93 | X_panel <- merge(X_panel, X_patient_date, by = c("Patient_ID", "Registration_Date")) 94 | 95 | X_donation <- X_panel[Donation > 0, .(Min_Date_Donation = min(Registration_Date)), .(Patient_ID)] 96 | X_panel <- merge(X_panel, X_donation, all.x = T, by = "Patient_ID") 97 | 98 | X_panel[, Donation_Flag := ifelse(is.na(Min_Date_Donation), 0, ifelse(Registration_Date > Min_Date_Donation, 1, 0))] 99 | 100 | X_train <- X_panel[train_flag == 1] 101 | X_test <- X_panel[train_flag == 0] 102 | 103 | X_features <- c("Count_Patient", "Count_Patient_Date", "Donation_Flag", 104 | "City_Type", "Income", "Education_Score", "Age", 105 | "Category1", "Category2", 106 | "Start_Date_Diff", "End_Date_Diff", "Prev_Date_Diff", "Next_Date_Diff") 107 | X_target <- X_train$target 108 | 109 | xgtrain <- xgb.DMatrix(data = as.matrix(X_train[, X_features, with = FALSE]), label = X_target, missing = NA) 110 | xgtest <- xgb.DMatrix(data = as.matrix(X_test[, X_features, with = FALSE]), missing = NA) 111 | 112 | 113 | ## xgboost 114 | params <- list() 115 | params$objective <- "binary:logistic" 116 | params$eta <- 0.1 117 | params$max_depth <- 5 118 | params$subsample <- 0.9 119 | params$colsample_bytree <- 0.9 120 | params$min_child_weight <- 2 121 | params$eval_metric <- "auc" 122 | 123 | model_xgb_cv <- xgb.cv(params=params, xgtrain, nrounds = 100, nfold = 5, early.stop.round = 30, prediction = TRUE) 124 | 125 | model_xgb <- xgb.train(params = params, xgtrain, nrounds = 100) 126 | 127 | vimp <- xgb.importance(model = model_xgb, feature_names = X_features) 128 | View(vimp) 129 | 130 | 131 | ## submission 132 | pred <- predict(model_xgb, xgtest) 133 | 134 | submit <- data.table(Patient_ID = X_test$Patient_ID, 135 | Health_Camp_ID = X_test$Health_Camp_ID, 136 | Outcome = pred) 137 | 138 | write.csv(submit, "./sub_vopani.csv", row.names = FALSE) 139 | 
-------------------------------------------------------------------------------- /AV_LTFS_April2019/README.md: -------------------------------------------------------------------------------- 1 | Code for the Analytics Vidhya L&T Financial Services Hackathon 2 | 3 | https://datahack.analyticsvidhya.com/contest/ltfs-datascience-finhack-an-online-hackathon/ 4 | 5 | Thanks to Ziron, we finished [4th](https://datahack.analyticsvidhya.com/contest/ltfs-datascience-finhack-an-online-hackathon/pvt_lb) on this one. 6 | -------------------------------------------------------------------------------- /AV_LordOfTheMachines/build_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | 6 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 7 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 8 | grouped_df.columns = var_name + ["var_count"] 9 | 10 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 11 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 12 | return list(merged_df["var_count"]) 13 | 14 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 15 | if type(var_name) != type([]): 16 | var_name = [var_name] 17 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 18 | grouped_df.columns = var_name + ["mean_value"] 19 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 20 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 21 | return list(merged_df["mean_value"]) 22 | 23 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 24 | if type(var_name) != type([]): 25 | var_name = [var_name] 26 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 27 | grouped_df.columns = var_name + ["sum_value"] 28 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 29 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 30 | return list(merged_df["sum_value"]) 31 | 32 | 33 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 34 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 35 | model.fit(train_X, train_y) 36 | print model.coef_, model.intercept_ 37 | train_preds = model.predict_proba(train_X)[:,1] 38 | test_preds = model.predict_proba(test_X)[:,1] 39 | test_preds2 = model.predict_proba(test_X2)[:,1] 40 | test_loss = 0 41 | if test_y is not None: 42 | train_loss = metrics.roc_auc_score(train_y, train_preds) 43 | test_loss = metrics.roc_auc_score(test_y, test_preds) 44 | print "Train and Test loss : ", train_loss, test_loss 45 | return test_preds, test_loss, test_preds2 46 | 47 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 48 | model = ensemble.ExtraTreesClassifier( 49 | n_estimators = 300, 50 | max_depth = depth, 51 | min_samples_split = 10, 52 | min_samples_leaf = leaf, 53 | max_features = feat, 54 | n_jobs = 6, 55 | random_state = 0) 56 | model.fit(train_X, train_y) 57 | train_preds = model.predict_proba(train_X)[:,1] 58 | test_preds = model.predict_proba(test_X)[:,1] 59 | test_preds2 = model.predict_proba(test_X2)[:,1] 60 | test_loss = 0 61 | if test_y is not None: 62 | train_loss = metrics.roc_auc_score(train_y, train_preds) 63 | 
test_loss = metrics.roc_auc_score(test_y, test_preds) 64 | print "Depth, leaf, feat : ", depth, leaf, feat 65 | print "Train and Test loss : ", train_loss, test_loss 66 | return test_preds, test_loss, test_preds2 67 | 68 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 69 | params = {} 70 | params["objective"] = "binary" 71 | params['metric'] = 'auc' 72 | params["max_depth"] = dep 73 | params["min_data_in_leaf"] = 100 74 | params["learning_rate"] = eta 75 | params["bagging_fraction"] = 0.7 76 | params["feature_fraction"] = 0.7 77 | params["bagging_freq"] = 5 78 | params["bagging_seed"] = seed_val 79 | params["verbosity"] = -1 80 | num_rounds = rounds 81 | 82 | plst = list(params.items()) 83 | lgtrain = lgb.Dataset(train_X, label=train_y) 84 | 85 | if test_y is not None: 86 | lgtest = lgb.Dataset(test_X, label=test_y) 87 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 88 | else: 89 | lgtest = lgb.DMatrix(test_X) 90 | model = lgb.train(params, lgtrain, num_rounds) 91 | 92 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 93 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 94 | 95 | loss = 0 96 | if test_y is not None: 97 | loss = metrics.roc_auc_score(test_y, pred_test_y) 98 | print loss 99 | return pred_test_y, loss, pred_test_y2 100 | else: 101 | return pred_test_y, loss, pred_test_y2 102 | 103 | if __name__ == "__main__": 104 | print "Reading input files..." 105 | train_df = pd.read_csv("../input/train_feat.csv") 106 | test_df = pd.read_csv("../input/test_feat.csv") 107 | campaign_df = pd.read_csv("../input/campaign_data.csv") 108 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 109 | print train_df.shape, test_df.shape 110 | print train_df.head() 111 | 112 | 113 | print np.sort(train_df["campaign_id"].unique()) 114 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 115 | 116 | print "Merging with campaign data.." 117 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 118 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 119 | print train_df.shape, test_df.shape 120 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017) 121 | 122 | train_y_open = train_df["is_open"].values 123 | train_y = train_df["is_click"].values 124 | test_id = test_df["id"].values 125 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 126 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 127 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 128 | #cols_to_use = [] 129 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 130 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 131 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 132 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 133 | 134 | #print "Label encoding.." 
135 | #for c in ["communication_type"]: 136 | # cols_to_use.append(c) 137 | # lbl = preprocessing.LabelEncoder() 138 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 139 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 140 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 141 | 142 | 143 | #print "Full Count encoding.." 144 | #full_df = train_df.append(test_df) 145 | #print full_df.shape 146 | #for col in [["user_id"]]: 147 | # if isinstance(col, list): 148 | # col_name = "_".join(col) 149 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 150 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 151 | # cols_to_use.append(col_name + "_full_count") 152 | 153 | 154 | print "Count encoding.." 155 | for col in [["user_id"], ["user_id", "communication_type"]]: 156 | #for col in [["user_id"]]: 157 | train_enc_values = np.zeros(train_df.shape[0]) 158 | test_enc_values = 0 159 | for dev_index, val_index in kf.split(train_unique_campaigns): 160 | #for [dev_camp, val_camp] in camp_indices: 161 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 162 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 163 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 164 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 165 | test_enc_values /= 5. 166 | if isinstance(col, list): 167 | col = "_".join(col) 168 | train_df[col + "_count"] = train_enc_values 169 | test_df[col + "_count"] = test_enc_values 170 | cols_to_use.append(col + "_count") 171 | 172 | 173 | 174 | print "Target encoding.." 175 | for col in [["user_id"], ["user_id", "communication_type"]]: 176 | #for col in [["user_id"]]: 177 | train_enc_values = np.zeros(train_df.shape[0]) 178 | test_enc_values = 0 179 | for dev_index, val_index in kf.split(train_unique_campaigns): 180 | #for [dev_camp, val_camp] in camp_indices: 181 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 182 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 183 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 184 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 185 | test_enc_values /= 5. 186 | if isinstance(col, list): 187 | col = "_".join(col) 188 | train_df[col + "_enc"] = train_enc_values 189 | test_df[col + "_enc"] = test_enc_values 190 | cols_to_use.append(col + "_enc") 191 | 192 | 193 | print "Open Target encoding.." 
194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 203 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 204 | test_enc_values /= 5. 205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_open_enc"] = train_enc_values 208 | test_df[col + "_open_enc"] = test_enc_values 209 | cols_to_use.append(col + "_open_enc") 210 | 211 | 212 | 213 | 214 | """ 215 | print "Open Alone Target encoding.." 216 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 217 | for col in [["user_id"]]: 218 | train_enc_values = np.zeros(train_df.shape[0]) 219 | test_enc_values = 0 220 | for dev_index, val_index in kf.split(train_unique_campaigns): 221 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 222 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 223 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 224 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 225 | test_enc_values /= 5. 226 | if isinstance(col, list): 227 | col = "_".join(col) 228 | train_df[col + "_open_sum_enc"] = train_enc_values 229 | test_df[col + "_open_sum_enc"] = test_enc_values 230 | cols_to_use.append(col + "_open_sum_enc") 231 | """ 232 | 233 | 234 | print cols_to_use 235 | train_X = train_df[cols_to_use] 236 | test_X = test_df[cols_to_use] 237 | print train_X.describe() 238 | print test_X.describe() 239 | 240 | #train_X.fillna(-1, inplace=True) 241 | #test_X.fillna(-1, inplace=True) 242 | 243 | print "Model building.." 244 | model_name = "LGB" 245 | cv_scores = [] 246 | pred_test_full = 0 247 | pred_val_full = np.zeros(train_df.shape[0]) 248 | for dev_index, val_index in kf.split(train_unique_campaigns): 249 | #for [dev_camp, val_camp] in camp_indices: 250 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 251 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 252 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 253 | print dev_X.shape, val_X.shape 254 | 255 | if model_name == "LGB": 256 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 257 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 258 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 259 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 260 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 261 | loss = (loss1 + loss2 + loss3)/3. 
262 | elif model_name == "ET": 263 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 264 | elif model_name == "LR": 265 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 266 | 267 | pred_test_full += pred_test 268 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 269 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 270 | cv_scores.append(loss) 271 | print cv_scores 272 | pred_test_full /= 5. 273 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 274 | 275 | sub_df = pd.DataFrame({"id":test_id}) 276 | sub_df["is_click"] = pred_test_full 277 | sub_df.to_csv("srk_sub47.csv", index=False) 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /AV_LordOfTheMachines/build_model_xgb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model 4 | import lightgbm as lgb 5 | import xgboost as xgb 6 | 7 | def getCountVar(compute_df, count_df, var_name, count_var="v1"): 8 | grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index() 9 | grouped_df.columns = var_name + ["var_count"] 10 | 11 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 12 | merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True) 13 | return list(merged_df["var_count"]) 14 | 15 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 16 | if type(var_name) != type([]): 17 | var_name = [var_name] 18 | grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index() 19 | grouped_df.columns = var_name + ["mean_value"] 20 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 21 | merged_df.fillna(np.mean(target_df[target_var].values), inplace=True) 22 | return list(merged_df["mean_value"]) 23 | 24 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1): 25 | if type(var_name) != type([]): 26 | var_name = [var_name] 27 | grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index() 28 | grouped_df.columns = var_name + ["sum_value"] 29 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 30 | merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True) 31 | return list(merged_df["sum_value"]) 32 | 33 | 34 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None): 35 | model = linear_model.LogisticRegression(fit_intercept=True, C=0.3) 36 | model.fit(train_X, train_y) 37 | print model.coef_, model.intercept_ 38 | train_preds = model.predict_proba(train_X)[:,1] 39 | test_preds = model.predict_proba(test_X)[:,1] 40 | test_preds2 = model.predict_proba(test_X2)[:,1] 41 | test_loss = 0 42 | if test_y is not None: 43 | train_loss = metrics.roc_auc_score(train_y, train_preds) 44 | test_loss = metrics.roc_auc_score(test_y, test_preds) 45 | print "Train and Test loss : ", train_loss, test_loss 46 | return test_preds, test_loss, test_preds2 47 | 48 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3): 49 | model = ensemble.ExtraTreesClassifier( 50 | n_estimators = 300, 51 | max_depth = depth, 52 | min_samples_split = 10, 53 | min_samples_leaf = leaf, 54 | max_features = feat, 55 | n_jobs = 6, 56 | random_state = 0) 57 | model.fit(train_X, 
train_y) 58 | train_preds = model.predict_proba(train_X)[:,1] 59 | test_preds = model.predict_proba(test_X)[:,1] 60 | test_preds2 = model.predict_proba(test_X2)[:,1] 61 | test_loss = 0 62 | if test_y is not None: 63 | train_loss = metrics.roc_auc_score(train_y, train_preds) 64 | test_loss = metrics.roc_auc_score(test_y, test_preds) 65 | print "Depth, leaf, feat : ", depth, leaf, feat 66 | print "Train and Test loss : ", train_loss, test_loss 67 | return test_preds, test_loss, test_preds2 68 | 69 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001): 70 | params = {} 71 | params["objective"] = "binary" 72 | params['metric'] = 'auc' 73 | params["max_depth"] = dep 74 | params["min_data_in_leaf"] = 100 75 | params["learning_rate"] = eta 76 | params["bagging_fraction"] = 0.7 77 | params["feature_fraction"] = 0.7 78 | params["bagging_freq"] = 5 79 | params["bagging_seed"] = seed_val 80 | params["verbosity"] = -1 81 | num_rounds = rounds 82 | 83 | plst = list(params.items()) 84 | lgtrain = lgb.Dataset(train_X, label=train_y) 85 | 86 | if test_y is not None: 87 | lgtest = lgb.Dataset(test_X, label=test_y) 88 | model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20) 89 | else: 90 | lgtest = lgb.DMatrix(test_X) 91 | model = lgb.train(params, lgtrain, num_rounds) 92 | 93 | pred_test_y = model.predict(test_X, num_iteration=model.best_iteration) 94 | pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration) 95 | 96 | loss = 0 97 | if test_y is not None: 98 | loss = metrics.roc_auc_score(test_y, pred_test_y) 99 | print loss 100 | return pred_test_y, loss, pred_test_y2 101 | else: 102 | return pred_test_y, loss, pred_test_y2 103 | 104 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.001): 105 | params = {} 106 | params["objective"] = "binary:logistic" 107 | params['eval_metric'] = 'auc' 108 | params["eta"] = eta 109 | params["subsample"] = 0.7 110 | params["min_child_weight"] = 10 111 | params["colsample_bytree"] = 0.7 112 | params["max_depth"] = dep 113 | params["silent"] = 1 114 | params["seed"] = seed_val 115 | #params["max_delta_step"] = 2 116 | #params["gamma"] = 0.5 117 | num_rounds = rounds 118 | 119 | plst = list(params.items()) 120 | xgtrain = xgb.DMatrix(train_X, label=train_y) 121 | 122 | if test_y is not None: 123 | xgtest = xgb.DMatrix(test_X, label=test_y) 124 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 125 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20) 126 | else: 127 | xgtest = xgb.DMatrix(test_X) 128 | model = xgb.train(plst, xgtrain, num_rounds) 129 | 130 | pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit) 131 | pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit) 132 | 133 | loss = 0 134 | if test_y is not None: 135 | loss = metrics.log_loss(test_y, pred_test_y) 136 | print loss 137 | return pred_test_y, loss, pred_test_y2 138 | else: 139 | return pred_test_y, loss, pred_test_y2 140 | 141 | 142 | if __name__ == "__main__": 143 | print "Reading input files..." 
144 | train_df = pd.read_csv("../input/train_feat.csv") 145 | test_df = pd.read_csv("../input/test_feat.csv") 146 | campaign_df = pd.read_csv("../input/campaign_data.csv") 147 | train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1) 148 | print train_df.shape, test_df.shape 149 | print train_df.head() 150 | 151 | 152 | print np.sort(train_df["campaign_id"].unique()) 153 | #camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]] 154 | 155 | print "Merging with campaign data.." 156 | train_df = pd.merge(train_df, campaign_df, on="campaign_id") 157 | test_df = pd.merge(test_df, campaign_df, on="campaign_id") 158 | print train_df.shape, test_df.shape 159 | kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=98765) 160 | 161 | train_y_open = train_df["is_open"].values 162 | train_y = train_df["is_click"].values 163 | test_id = test_df["id"].values 164 | train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 165 | cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"] 166 | #cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"] 167 | #cols_to_use = [] 168 | #cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"] 169 | cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"] 170 | cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"] 171 | #cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"] 172 | 173 | #print "Label encoding.." 174 | #for c in ["communication_type"]: 175 | # cols_to_use.append(c) 176 | # lbl = preprocessing.LabelEncoder() 177 | # lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str'))) 178 | # train_df[c] = lbl.transform(list(train_df[c].values.astype('str'))) 179 | # test_df[c] = lbl.transform(list(test_df[c].values.astype('str'))) 180 | 181 | 182 | #print "Full Count encoding.." 183 | #full_df = train_df.append(test_df) 184 | #print full_df.shape 185 | #for col in [["user_id"]]: 186 | # if isinstance(col, list): 187 | # col_name = "_".join(col) 188 | # train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id')) 189 | # test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id')) 190 | # cols_to_use.append(col_name + "_full_count") 191 | 192 | 193 | print "Count encoding.." 194 | for col in [["user_id"], ["user_id", "communication_type"]]: 195 | #for col in [["user_id"]]: 196 | train_enc_values = np.zeros(train_df.shape[0]) 197 | test_enc_values = 0 198 | for dev_index, val_index in kf.split(train_unique_campaigns): 199 | #for [dev_camp, val_camp] in camp_indices: 200 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 201 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 202 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click')) 203 | test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click')) 204 | test_enc_values /= 5. 
205 | if isinstance(col, list): 206 | col = "_".join(col) 207 | train_df[col + "_count"] = train_enc_values 208 | test_df[col + "_count"] = test_enc_values 209 | cols_to_use.append(col + "_count") 210 | 211 | 212 | 213 | print "Target encoding.." 214 | for col in [["user_id"], ["user_id", "communication_type"]]: 215 | #for col in [["user_id"]]: 216 | train_enc_values = np.zeros(train_df.shape[0]) 217 | test_enc_values = 0 218 | for dev_index, val_index in kf.split(train_unique_campaigns): 219 | #for [dev_camp, val_camp] in camp_indices: 220 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 221 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 222 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click')) 223 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click')) 224 | test_enc_values /= 5. 225 | if isinstance(col, list): 226 | col = "_".join(col) 227 | train_df[col + "_enc"] = train_enc_values 228 | test_df[col + "_enc"] = test_enc_values 229 | cols_to_use.append(col + "_enc") 230 | 231 | 232 | print "Open Target encoding.." 233 | for col in [["user_id"], ["user_id", "communication_type"]]: 234 | #for col in [["user_id"]]: 235 | train_enc_values = np.zeros(train_df.shape[0]) 236 | test_enc_values = 0 237 | for dev_index, val_index in kf.split(train_unique_campaigns): 238 | #for [dev_camp, val_camp] in camp_indices: 239 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 240 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 241 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open')) 242 | test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open')) 243 | test_enc_values /= 5. 244 | if isinstance(col, list): 245 | col = "_".join(col) 246 | train_df[col + "_open_enc"] = train_enc_values 247 | test_df[col + "_open_enc"] = test_enc_values 248 | cols_to_use.append(col + "_open_enc") 249 | 250 | 251 | 252 | 253 | """ 254 | print "Open Alone Target encoding.." 255 | #for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]: 256 | for col in [["user_id"]]: 257 | train_enc_values = np.zeros(train_df.shape[0]) 258 | test_enc_values = 0 259 | for dev_index, val_index in kf.split(train_unique_campaigns): 260 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 261 | dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)] 262 | train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open')) 263 | test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open')) 264 | test_enc_values /= 5. 
265 | if isinstance(col, list): 266 | col = "_".join(col) 267 | train_df[col + "_open_sum_enc"] = train_enc_values 268 | test_df[col + "_open_sum_enc"] = test_enc_values 269 | cols_to_use.append(col + "_open_sum_enc") 270 | """ 271 | 272 | 273 | print cols_to_use 274 | train_X = train_df[cols_to_use] 275 | test_X = test_df[cols_to_use] 276 | print train_X.describe() 277 | print test_X.describe() 278 | 279 | #train_X.fillna(-1, inplace=True) 280 | #test_X.fillna(-1, inplace=True) 281 | 282 | print "Model building.." 283 | model_name = "XGB" 284 | cv_scores = [] 285 | pred_test_full = 0 286 | pred_val_full = np.zeros(train_df.shape[0]) 287 | for dev_index, val_index in kf.split(train_unique_campaigns): 288 | #for [dev_camp, val_camp] in camp_indices: 289 | dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist() 290 | dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)] 291 | dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)] 292 | print dev_X.shape, val_X.shape 293 | 294 | if model_name == "LGB": 295 | pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 296 | pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 297 | pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 298 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 299 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 300 | loss = (loss1 + loss2 + loss3)/3. 301 | elif model_name == "XGB": 302 | pred_val1, loss1, pred_test1 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4) 303 | pred_val2, loss2, pred_test2 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018) 304 | pred_val3, loss3, pred_test3 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876) 305 | pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 306 | pred_test = (pred_test1 + pred_test2 + pred_test3)/3. 307 | loss = (loss1 + loss2 + loss3)/3. 308 | elif model_name == "ET": 309 | pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3) 310 | elif model_name == "LR": 311 | pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X) 312 | 313 | pred_test_full += pred_test 314 | pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val 315 | loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val) 316 | cv_scores.append(loss) 317 | print cv_scores 318 | pred_test_full /= 5. 
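# ---- editor's aside (illustrative sketch, not part of the original script) ----
# The cross-validation above folds on unique campaign_id values rather than on
# rows, so every e-mail of a campaign falls on the same side of each split and
# validation mimics scoring campaigns the model has never seen. scikit-learn's
# GroupKFold expresses the same grouping directly (hypothetical helper; the
# script's manual KFold over campaign ids additionally shuffles the campaigns):
from sklearn.model_selection import GroupKFold

def campaign_folds(frame, target_col="is_click", group_col="campaign_id", n_splits=5):
    gkf = GroupKFold(n_splits=n_splits)
    return gkf.split(frame, frame[target_col], groups=frame[group_col])
# ---- end editor's aside ----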
319 | print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full) 320 | 321 | sub_df = pd.DataFrame({"id":test_id}) 322 | sub_df["is_click"] = pred_test_full 323 | sub_df.to_csv("srk_sub48.csv", index=False) 324 | 325 | 326 | 327 | -------------------------------------------------------------------------------- /AV_LordOfTheMachines/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | s1 = pd.read_csv("../Submissions/srk_sub47.csv") 5 | s2 = pd.read_csv("../Submissions/srk_sub48.csv") 6 | #s3 = pd.read_csv("../Submissions/srk_sub23.csv") 7 | #s4 = pd.read_csv("../Submissions/srk_sub24.csv") 8 | 9 | #s1["is_click"] = 0.35*(0.5*s1["is_click"] + 0.5*s2["is_click"]) + 0.65*(0.65*(s3["is_click"])+0.35*(s4["is_click"])) 10 | s1["is_click"] = 0.5*s1["is_click"] + 0.5*s2["is_click"] 11 | s1.to_csv("srk_sub49.csv", index=False) 12 | -------------------------------------------------------------------------------- /AV_LordOfTheMachines/readme.md: -------------------------------------------------------------------------------- 1 | This folder has the code files for the [Hackaton - Lord Of The Machines](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/) 2 | 3 | We finished [third](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/pvt_lb) on this comptition. 4 | 5 | Order of files to run 6 | 1. Explorations.ipynb - Code file to create the features. 7 | 2. build_model.py - Code file to build the Light GBM model 8 | 3. build_model_xgb.py - Code file to build the XGB model 9 | 4. ensemble.py - Code file to merge both the results. 10 | -------------------------------------------------------------------------------- /AV_MiniHack1/model_ens.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | data_path = "./" 5 | s1 = pd.read_csv(data_path + "sub_lr.csv") 6 | s2 = pd.read_csv(data_path + "sub_xgb.csv") 7 | 8 | s1["Count"] = 0.5*s1["Count"] + 0.5*s2["Count"] 9 | s1.to_csv("sub_ens.csv", index=False) 10 | -------------------------------------------------------------------------------- /AV_MiniHack1/model_lr.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from math import sqrt 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 6 | from sklearn.cross_validation import KFold 7 | from sklearn import ensemble, preprocessing 8 | from sklearn import linear_model as lm 9 | from sklearn.metrics import mean_squared_error as mse 10 | 11 | def rmse(act_y, pred_y): 12 | return np.sqrt(mse(act_y, pred_y)) 13 | 14 | if __name__ == "__main__": 15 | # Data path of the input files # 16 | data_path = "../Data/" 17 | train_file = data_path + "Train_JPXjxg6.csv" 18 | test_file = data_path + "Test_mvj827l.csv" 19 | 20 | print "Reading the files into dataframes.." 21 | train_df = pd.read_csv(train_file) 22 | test_df = pd.read_csv(test_file) 23 | 24 | print "Converting to date format.." 25 | train_df["Date"] = (pd.to_datetime(train_df["Datetime"], format="%d-%m-%Y %H:%M")) 26 | test_df["Date"] = (pd.to_datetime(test_df["Datetime"], format="%d-%m-%Y %H:%M")) 27 | 28 | print "Getting the dv and id column.." 29 | train_y = np.array(train_df.Count.values) 30 | test_id = test_df.Datetime.values 31 | 32 | print "Creating variables from date field.." 
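# ---- editor's aside (illustrative sketch, not part of the original script) ----
# The .apply(lambda x: ...) calls below pull calendar parts out one row at a
# time. The pandas .dt accessor computes the same features vectorised; a sketch
# on a hypothetical frame whose "Date" column is already parsed to datetimes:
import pandas as pd

def add_date_parts(df):
    out = df.copy()
    out["Year"] = out["Date"].dt.year
    out["Hour"] = out["Date"].dt.hour
    out["WeekDay"] = out["Date"].dt.weekday                     # Monday=0 ... Sunday=6
    out["DayCount"] = out["Date"].map(pd.Timestamp.toordinal)   # proleptic ordinal, day 1 = 0001-01-01
    return out
# ---- end editor's aside ----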
33 | train_df["Year"] = train_df["Date"].apply(lambda x: x.year) 34 | test_df["Year"] = test_df["Date"].apply(lambda x: x.year) 35 | train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour) 36 | test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour) 37 | train_df["WeekDay"] = train_df["Date"].apply(lambda x: x.weekday()) 38 | test_df["WeekDay"] = test_df["Date"].apply(lambda x: x.weekday()) 39 | train_df["DayCount"] = train_df["Date"].apply(lambda x: x.toordinal()) 40 | test_df["DayCount"] = test_df["Date"].apply(lambda x: x.toordinal()) 41 | 42 | train = train_df.drop(["Datetime","Date","Count"], axis=1) 43 | test = test_df.drop(["Datetime","Date"], axis=1) 44 | 45 | print "One hot encoding.." 46 | temp_train_arr = np.empty([train.shape[0],0]) 47 | temp_test_arr = np.empty([test.shape[0],0]) 48 | cols_to_drop = [] 49 | for var in train.columns: 50 | if var in ["Hour", "WeekDay"]: 51 | print var 52 | lb = preprocessing.LabelEncoder() 53 | full_var_data = pd.concat((train[var],test[var]),axis=0).astype('str') 54 | temp = lb.fit_transform(np.array(full_var_data)) 55 | train[var] = lb.transform(np.array( train[var] ).astype('str')) 56 | test[var] = lb.transform(np.array( test[var] ).astype('str')) 57 | 58 | cols_to_drop.append(var) 59 | ohe = preprocessing.OneHotEncoder(sparse=False) 60 | ohe.fit(temp.reshape(-1,1)) 61 | temp_arr = ohe.transform(train[var].reshape(-1,1)) 62 | temp_train_arr = np.hstack([temp_train_arr, temp_arr]) 63 | temp_arr = ohe.transform(test[var].reshape(-1,1)) 64 | temp_test_arr = np.hstack([temp_test_arr, temp_arr]) 65 | 66 | train = train.drop(cols_to_drop, axis=1) 67 | test = test.drop(cols_to_drop, axis=1) 68 | train = np.hstack( [np.array(train),temp_train_arr]).astype("float") 69 | test = np.hstack( [np.array(test),temp_test_arr]).astype("float") 70 | print train.shape, test.shape 71 | 72 | # Use the lastest data # 73 | train_X = np.array(train)[16000:] 74 | train_y = train_y[16000:] 75 | test_X = np.array(test) 76 | 77 | # Train the linear model and predict on test data # 78 | reg = lm.LinearRegression() 79 | reg.fit(train_X, train_y) 80 | preds = reg.predict(test_X).astype('int') 81 | 82 | # writing to out file # 83 | sample = pd.read_csv(data_path + "Test_mvj827l.csv") 84 | sample["Count"] = preds 85 | sample.to_csv("sub_lr.csv", index=False) 86 | -------------------------------------------------------------------------------- /AV_MiniHack1/model_xgb.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from math import sqrt 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 6 | from sklearn.cross_validation import KFold 7 | from sklearn import ensemble 8 | from sklearn import linear_model as lm 9 | from sklearn.metrics import mean_squared_error as mse 10 | import xgboost as xgb 11 | 12 | def runXGB(train_X, train_y, test_X, test_y=None): 13 | params = {} 14 | params["objective"] = "reg:linear" 15 | params["eta"] = 0.02 16 | params["min_child_weight"] = 8 17 | params["subsample"] = 0.9 18 | params["colsample_bytree"] = 0.8 19 | params["silent"] = 1 20 | params["max_depth"] = 8 21 | params["seed"] = 1 22 | plst = list(params.items()) 23 | num_rounds = 500 24 | 25 | xgtrain = xgb.DMatrix(train_X, label=train_y) 26 | xgtest = xgb.DMatrix(test_X) 27 | model = xgb.train(plst, xgtrain, num_rounds) 28 | pred_test_y = model.predict(xgtest) 29 | return pred_test_y 30 | 31 | def rmse(act_y, pred_y): 32 | return np.sqrt(mse(act_y, pred_y)) 33 | 34 | 35 | 
if __name__ == "__main__": 36 | # Input data path # 37 | data_path = "../Data/" 38 | train_file = data_path + "Train_JPXjxg6.csv" 39 | test_file = data_path + "Test_mvj827l.csv" 40 | 41 | # Reading the csv file into pandas dataframe # 42 | train_df = pd.read_csv(train_file) 43 | test_df = pd.read_csv(test_file) 44 | 45 | print "Converting to date format" 46 | train_df["Date"] = (pd.to_datetime(train_df["Datetime"], format="%d-%m-%Y %H:%M")) 47 | test_df["Date"] = (pd.to_datetime(test_df["Datetime"], format="%d-%m-%Y %H:%M")) 48 | 49 | # Getting the dv and id values # 50 | train_y = np.array(train_df.Count.values) 51 | test_id = test_df.Datetime.values 52 | 53 | print "Processing Date field.." 54 | train_df["DayOfMonth"] = train_df["Date"].apply(lambda x: x.day) 55 | test_df["DayOfMonth"] = test_df["Date"].apply(lambda x: x.day) 56 | train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour) 57 | test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour) 58 | train_df["WeekDay"] = train_df["Date"].apply(lambda x: x.weekday()) 59 | test_df["WeekDay"] = test_df["Date"].apply(lambda x: x.weekday()) 60 | train_df["DayCount"] = train_df["Date"].apply(lambda x: x.toordinal()) 61 | test_df["DayCount"] = test_df["Date"].apply(lambda x: x.toordinal()) 62 | 63 | # Dropping the columns that are not needed # 64 | train_df.drop(["Datetime","Date","Count"], axis=1, inplace=True) 65 | test_df.drop(["Datetime","Date"], axis=1, inplace=True) 66 | 67 | # Running the xgb model # 68 | preds = runXGB(np.array(train_df), train_y, np.array(test_df)) 69 | preds = preds.astype('int') 70 | 71 | # Saving the predictions # 72 | sample = pd.read_csv(data_path + "Test_mvj827l.csv") 73 | sample["Count"] = preds 74 | sample.to_csv("sub_xgb.csv", index=False) 75 | -------------------------------------------------------------------------------- /AV_MiniHack1/readme.md: -------------------------------------------------------------------------------- 1 | This folder contains the codes for the [Analytics Vidhya - Mini DataHack] (http://datahack.analyticsvidhya.com/contest/mini-datahack) 2 | 3 | Order of files to run 4 | 1. model_lr.py - Linear Regression model 5 | 2. model_xgb.py - XGBoost model 6 | 3. 
model_ens.py - Averaging both 7 | -------------------------------------------------------------------------------- /AV_MiniHack2_SimpleBuy/finalModel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from math import sqrt 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 6 | from sklearn.cross_validation import KFold 7 | from sklearn import ensemble 8 | from sklearn import linear_model as lm 9 | from sklearn.metrics import mean_squared_error as mse 10 | import xgboost as xgb 11 | 12 | 13 | def runXGB(train_X, train_y, test_X, test_y=None): 14 | params = {} 15 | params["objective"] = "reg:linear" 16 | params["eta"] = 0.002 17 | params["min_child_weight"] = 1 18 | params["subsample"] = 0.9 19 | params["colsample_bytree"] = 0.8 20 | params["silent"] = 1 21 | params["max_depth"] = 8 22 | params["seed"] = 1 23 | plst = list(params.items()) 24 | num_rounds = 900 25 | 26 | xgtrain = xgb.DMatrix(train_X, label=train_y) 27 | xgtest = xgb.DMatrix(test_X) 28 | model = xgb.train(plst, xgtrain, num_rounds) 29 | pred_test_y = model.predict(xgtest) 30 | return pred_test_y 31 | 32 | def rmse(act_y, pred_y): 33 | return np.sqrt(mse(act_y, pred_y)) 34 | 35 | if __name__ == "__main__": 36 | data_path = "../Data/" 37 | train_file = data_path + "Train_KQyJ5eh.csv" 38 | test_file = data_path + "Test_HmLwURQ.csv" 39 | 40 | train_df = pd.read_csv(train_file) 41 | test_df = pd.read_csv(test_file) 42 | 43 | print "Converting to date format" 44 | train_df["Date_mod"] = (pd.to_datetime(train_df["Date"], format="%d-%b-%y")) 45 | test_df["Date_mod"] = (pd.to_datetime(test_df["Date"], format="%d-%b-%y")) 46 | 47 | train_y = np.array(train_df.Number_SKU_Sold.values) 48 | train_y[train_y > 20000000] = 20000000 49 | test_id = test_df.Date.values 50 | 51 | print "Processing Dates.." 
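# ---- editor's aside (illustrative sketch, not part of the original script) ----
# Above, extreme Number_SKU_Sold values are capped at 20000000 before fitting so
# a few spike days do not dominate the squared-error objective of reg:linear.
# np.clip expresses the same winsorisation in one call (the threshold is the
# script's; the helper name is hypothetical):
import numpy as np

def cap_target(y, upper=20000000):
    return np.clip(np.asarray(y, dtype=float), None, upper)
# ---- end editor's aside ----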
52 | train_df["DayOfMonth"] = train_df["Date_mod"].apply(lambda x: x.day) 53 | test_df["DayOfMonth"] = test_df["Date_mod"].apply(lambda x: x.day) 54 | train_df["Month"] = train_df["Date_mod"].apply(lambda x: x.month) 55 | test_df["Month"] = test_df["Date_mod"].apply(lambda x: x.month) 56 | #train_df["Year"] = train_df["Date"].apply(lambda x: x.year) 57 | #test_df["Year"] = test_df["Date"].apply(lambda x: x.year) 58 | #train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour) 59 | #test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour) 60 | train_df["WeekDay"] = train_df["Date_mod"].apply(lambda x: x.weekday()) 61 | test_df["WeekDay"] = test_df["Date_mod"].apply(lambda x: x.weekday()) 62 | #train_df["WeekNo"] = train_df["Date_mod"].apply(lambda x: x.isocalendar()[1]) 63 | #test_df["WeekNo"] = test_df["Date_mod"].apply(lambda x: x.isocalendar()[1]) 64 | train_df["DayOfYear"] = train_df["Date_mod"].apply(lambda x: x.timetuple().tm_yday) 65 | test_df["DayOfYear"] = test_df["Date_mod"].apply(lambda x: x.timetuple().tm_yday) 66 | train_df["DayCount"] = train_df["Date_mod"].apply(lambda x: x.toordinal()) 67 | test_df["DayCount"] = test_df["Date_mod"].apply(lambda x: x.toordinal()) 68 | 69 | 70 | 71 | train_df.drop(["Date_mod","Date","Number_SKU_Sold"], axis=1, inplace=True) 72 | test_df.drop(["Date_mod","Date"], axis=1, inplace=True) 73 | 74 | print train_df.shape, test_df.shape 75 | print train_df.head() 76 | print test_df.head() 77 | 78 | preds_xgb = runXGB(np.array(train_df)[299:,:], train_y[299:], np.array(test_df)) 79 | 80 | 81 | reg = lm.LinearRegression() 82 | reg.fit(np.array(train_df)[:,:], train_y[:]) 83 | preds_lm = reg.predict( np.array(test_df)) 84 | 85 | train_y[train_y > 15000000] = 15000000 86 | preds = 0.8*preds_xgb + 0.2*preds_lm 87 | 88 | preds[357] = 70000000 89 | 90 | # Saving the predictions # 91 | sample = pd.read_csv(data_path + "Sample_Submission_6FjDs3p.csv") 92 | sample["Number_SKU_Sold"] = preds 93 | sample.to_csv("sub.csv", index=False) 94 | -------------------------------------------------------------------------------- /AV_MiniHack2_SimpleBuy/readme.md: -------------------------------------------------------------------------------- 1 | Codes for the AV Time Series Mini Hack - SimpleBuy Sales 2 | -------------------------------------------------------------------------------- /AV_SmartRecruits/finalModel.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import csv 3 | import operator 4 | import pandas as pd 5 | import numpy as np 6 | from sklearn import preprocessing 7 | import xgboost as xgb 8 | from sklearn.metrics import roc_auc_score 9 | from sklearn.cross_validation import KFold 10 | 11 | data_path = "../input/" 12 | train_file_name = "Train_pjb2QcD.csv" 13 | test_file_name = "Test_wyCirpO.csv" 14 | 15 | def getCountVar(compute_df, count_df, var_name, count_var="Manager_Num_Application"): 16 | grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count') 17 | grouped_df.columns = [var_name, "var_count"] 18 | merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name) 19 | merged_df.fillna(-1, inplace=True) 20 | return list(merged_df["var_count"]) 21 | 22 | def create_feature_map(features): 23 | outfile = open('xgb.fmap', 'w') 24 | for i, feat in enumerate(features): 25 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 26 | outfile.close() 27 | 28 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0): 29 | params = {} 30 | params["objective"] = 
"binary:logistic" 31 | params['eval_metric'] = 'auc' 32 | params["eta"] = 0.01 #0.00334 33 | params["min_child_weight"] = 1 34 | params["subsample"] = 0.8 35 | params["colsample_bytree"] = 0.3 36 | params["silent"] = 1 37 | params["max_depth"] = 6 38 | params["seed"] = seed_val 39 | #params["max_delta_step"] = 2 40 | #params["gamma"] = 0.5 41 | num_rounds = 1000 #2500 42 | 43 | plst = list(params.items()) 44 | xgtrain = xgb.DMatrix(train_X, label=train_y) 45 | 46 | if test_y is not None: 47 | xgtest = xgb.DMatrix(test_X, label=test_y) 48 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 49 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=500) 50 | else: 51 | xgtest = xgb.DMatrix(test_X) 52 | model = xgb.train(plst, xgtrain, num_rounds) 53 | 54 | if feature_names: 55 | create_feature_map(feature_names) 56 | model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True) 57 | importance = model.get_fscore(fmap='xgb.fmap') 58 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 59 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 60 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 61 | imp_df.to_csv("imp_feat.txt", index=False) 62 | 63 | pred_test_y = model.predict(xgtest) 64 | 65 | if test_y is not None: 66 | loss = roc_auc_score(test_y, pred_test_y) 67 | print loss 68 | return pred_test_y, loss 69 | else: 70 | return pred_test_y 71 | 72 | 73 | if __name__ == "__main__": 74 | print "Reading files.." 75 | train = pd.read_csv(data_path + train_file_name) 76 | test = pd.read_csv(data_path + test_file_name) 77 | print train.shape, test.shape 78 | 79 | print "Rank vars.." 80 | prev_date = 0 81 | count_dict = {} 82 | for name, row in train.iterrows(): 83 | count_dict[ row["Application_Receipt_Date"] ] = count_dict.get(row["Application_Receipt_Date"],0) + 1 84 | for name, row in test.iterrows(): 85 | count_dict[ row["Application_Receipt_Date"] ] = count_dict.get(row["Application_Receipt_Date"],0) + 1 86 | 87 | prev_date = 0 88 | rank_list = [] 89 | count_list = [] 90 | rankpct_list = [] 91 | for name, row in train.iterrows(): 92 | date_value = row["Application_Receipt_Date"] 93 | if date_value != prev_date: 94 | rank = 1 95 | prev_date = date_value 96 | else: 97 | rank += 1 98 | rank_list.append( rank ) 99 | count_list.append( count_dict[date_value] ) 100 | rankpct_list.append( float(rank) / count_dict[date_value] ) 101 | train["dayrank"] = rank_list[:] 102 | train["daycount"] = count_list[:] 103 | train["dayrankpct"] = rankpct_list[:] 104 | 105 | prev_date = 0 106 | rank_list = [] 107 | count_list = [] 108 | rankpct_list = [] 109 | for name, row in test.iterrows(): 110 | date_value = row["Application_Receipt_Date"] 111 | if date_value != prev_date: 112 | rank = 1 113 | prev_date = date_value 114 | else: 115 | rank += 1 116 | rank_list.append( rank ) 117 | count_list.append( count_dict[date_value] ) 118 | rankpct_list.append( float(rank) / count_dict[date_value] ) 119 | test["dayrank"] = rank_list[:] 120 | test["daycount"] = count_list[:] 121 | test["dayrankpct"] = rankpct_list[:] 122 | print train.dayrank.describe() 123 | print test.dayrank.describe() 124 | 125 | print "Getting DV and ID.." 126 | train_y = train.Business_Sourced.values 127 | train_ID = train.ID.values 128 | test_ID = test.ID.values 129 | 130 | print "New feats.." 131 | print "Some more features.." 
132 | new_feats = ["DOJ_DOB", "DOB_Applicant_Gender", "DOB_Qualification", "DOB_Gender_Qual"] 133 | train["DOJ_DOB"] = train["Manager_DOJ"].astype('str') + "_" + train["Manager_DoB"].astype('str') 134 | train["DOB_Applicant_Gender"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Gender"].astype('str') 135 | train["DOB_Qualification"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Qualification"].astype('str') 136 | train["DOB_Gender_Qual"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Gender"].astype('str') + "_" + train["Applicant_Qualification"].astype('str') 137 | test["DOJ_DOB"] = test["Manager_DOJ"].astype('str') + "_" + test["Manager_DoB"].astype('str') 138 | test["DOB_Applicant_Gender"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Gender"].astype('str') 139 | test["DOB_Qualification"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Qualification"].astype('str') 140 | test["DOB_Gender_Qual"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Gender"].astype('str') + "_" + test["Applicant_Qualification"].astype('str') 141 | 142 | print "Label encoding.." 143 | cat_columns = ["Applicant_Gender", "Applicant_Marital_Status", "Applicant_Occupation", "Applicant_Qualification", "Manager_Joining_Designation", "Manager_Current_Designation", "Manager_Status", "Manager_Gender"] 144 | for f in cat_columns + new_feats: 145 | print(f), len(np.unique(train[f].values)) 146 | lbl = preprocessing.LabelEncoder() 147 | lbl.fit(list(train[f].values) + list(test[f].values)) 148 | train[f] = lbl.transform(list(train[f].values)) 149 | test[f] = lbl.transform(list(test[f].values)) 150 | new_train = pd.concat([ train[['Manager_Num_Application',f]], test[['Manager_Num_Application',f]] ]) 151 | train["CountVar_"+str(f)] = getCountVar(train[['Manager_Num_Application',f]], new_train[['Manager_Num_Application', f]], f) 152 | test["CountVar_"+str(f)] = getCountVar(test[['Manager_Num_Application',f]], new_train[['Manager_Num_Application',f]], f) 153 | 154 | print "Working on dates.." 155 | for date_col in ["Application_Receipt_Date", "Applicant_BirthDate", "Manager_DOJ", "Manager_DoB"]: 156 | print date_col 157 | train[date_col].fillna("1/1/1900", inplace=True) 158 | test[date_col].fillna("1/1/1900", inplace=True) 159 | train[date_col] = (pd.to_datetime(train[date_col], format="%m/%d/%Y")) 160 | test[date_col] = (pd.to_datetime(test[date_col], format="%m/%d/%Y")) 161 | train[date_col] = train[date_col].apply(lambda x: x.toordinal()) 162 | test[date_col] = test[date_col].apply(lambda x: x.toordinal()) 163 | 164 | dev_index = np.where(train["Application_Receipt_Date"]<=733100)[0] 165 | val_index = np.where(train["Application_Receipt_Date"]>733100)[0] 166 | print "Dropping unwanted cols.." 167 | drop_cols = [] 168 | train.drop(["ID", "Business_Sourced"]+drop_cols, axis=1, inplace=True) 169 | test.drop(["ID"] + drop_cols, axis=1, inplace=True) 170 | 171 | print "Fill NA.." 172 | train.fillna(-999, inplace=True) 173 | test.fillna(-999, inplace=True) 174 | 175 | print "New features.." 176 | train["Manager_Business2"] = train["Manager_Business"] - train["Manager_Business2"] 177 | test["Manager_Business2"] = test["Manager_Business"] - test["Manager_Business2"] 178 | train["Manager_Num_Products2"] = train["Manager_Num_Products"] - train["Manager_Num_Products2"] 179 | test["Manager_Num_Products2"] = test["Manager_Num_Products"] - test["Manager_Num_Products2"] 180 | 181 | print "Converting to array.." 
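# ---- editor's aside (illustrative sketch, not part of the original script) ----
# The dev/val split above is time based: Application_Receipt_Date has been
# converted to a proleptic ordinal and rows up to ordinal 733100 form the
# development set, so validation applications come strictly later in time than
# the ones the model is tuned on. datetime.date.fromordinal shows what such a
# cut-off means in calendar terms (hypothetical helper name):
import datetime

def split_by_ordinal(df, date_ordinal_col, cutoff_ordinal):
    print("cut-off date: %s" % datetime.date.fromordinal(cutoff_ordinal))
    dev_mask = df[date_ordinal_col] <= cutoff_ordinal
    return df[dev_mask], df[~dev_mask]
# ---- end editor's aside ----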
182 | feat_names = list(train.columns) 183 | train = np.array(train) 184 | test = np.array(test) 185 | print train.shape, test.shape 186 | assert train.shape[1] == test.shape[1] 187 | 188 | full_preds = 0 189 | for rs in [1, 1343, 445234]: 190 | preds = runXGB(train, train_y, test, feature_names=feat_names, seed_val = rs) 191 | full_preds += preds 192 | full_preds /= 3. 193 | 194 | out_df = pd.DataFrame({"ID":test_ID}) 195 | out_df["Business_Sourced"] = full_preds 196 | out_df.to_csv("final.csv", index=False) 197 | -------------------------------------------------------------------------------- /AV_SmartRecruits/readme.md: -------------------------------------------------------------------------------- 1 | Codes for AV hackathon - The Smart Recruits 2 | -------------------------------------------------------------------------------- /AV_TheSeersAccuracy/createFeatures.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | import numpy as np 4 | import datetime 5 | from sklearn.preprocessing import LabelEncoder 6 | 7 | def getFeatures(df, dv_list=set(), start_date=datetime.datetime(2006,1,1)): 8 | grouped_df = df.groupby("Client_ID") 9 | for name, group in grouped_df: 10 | #if group.shape[0] < 2: 11 | # continue 12 | #print name 13 | out = [name] 14 | #print group 15 | 16 | # time since last transaction # 17 | max_date = max(group["Transaction_Date"]) 18 | out.append( (start_date - max_date).days ) 19 | 20 | # Number of transactions # 21 | out.append(group.shape[0]) 22 | 23 | # Mean EMI # 24 | out.append( np.mean(group["Number_of_EMI"]) ) 25 | 26 | # Mean var1 # 27 | out.append( np.mean(group["Var1"]) ) 28 | 29 | # Mean Var2 # 30 | out.append( np.mean(group["Var2"]) ) 31 | 32 | # Mean Var3 # 33 | out.append( np.mean(group["Var3"]) ) 34 | 35 | # Mean Transaction_Amount # 36 | out.append( np.mean(group["Transaction_Amount"]) ) 37 | 38 | # Mean Purchased_in_Sale # 39 | out.append( np.mean(group["Purchased_in_Sale"]) ) 40 | 41 | # get last purchase # 42 | last_purchase = group[group["Transaction_Date"] == max_date] 43 | #print "Last Purchase is : ", last_purchase 44 | 45 | # last purchase in sale # 46 | out.append( int(last_purchase["Purchased_in_Sale"].iloc[-1]) ) 47 | 48 | # last EMI # 49 | out.append( int(last_purchase["Number_of_EMI"].iloc[-1]) ) 50 | 51 | # last store # 52 | out.append( int(last_purchase["Store_ID"].iloc[-1]) ) 53 | 54 | # last var1 # 55 | out.append( int(last_purchase["Var1"].iloc[-1]) ) 56 | 57 | # last var2 # 58 | out.append( int(last_purchase["Var2"].iloc[-1]) ) 59 | 60 | # last var3 # 61 | out.append( int(last_purchase["Var3"].iloc[-1]) ) 62 | 63 | # Gender # 64 | out.append( int(last_purchase["Gender"].iloc[-1]) ) 65 | 66 | # Last Referred_Friend # 67 | out.append( int(last_purchase["Referred_Friend"].iloc[-1]) ) 68 | 69 | # Last SE category # 70 | out.append( int(last_purchase["Sales_Executive_Category"].iloc[-1]) ) 71 | 72 | # Last SE ID # 73 | out.append( int(last_purchase["Sales_Executive_ID"].iloc[-1]) ) 74 | 75 | # Last Lead Source # 76 | out.append( int(last_purchase["Lead_Source_Category"].iloc[-1]) ) 77 | 78 | # Last Payment Mode # 79 | out.append( int(last_purchase["Payment_Mode"].iloc[-1]) ) 80 | 81 | # last product category # 82 | out.append( int(last_purchase["Product_Category"].iloc[-1]) ) 83 | 84 | # last transaction amount # 85 | out.append( int(last_purchase["Transaction_Amount"].iloc[-1]) ) 86 | 87 | # time since first transaction # 88 | min_date = 
min(group["Transaction_Date"]) 89 | out.append( (start_date - min_date).days ) 90 | 91 | # total time # 92 | out.append((max_date - min_date).days) 93 | 94 | # frequency # 95 | out.append( (max_date - min_date).days / float(group.shape[0]) ) 96 | 97 | # number of unique stores visited # 98 | out.append( len( np.unique( group["Store_ID"] )) ) 99 | 100 | # number of unique purchased in sale # 101 | out.append( len( np.unique( group["Purchased_in_Sale"] )) ) 102 | 103 | # number of unique var1 # 104 | out.append( len( np.unique( group["Var1"] )) ) 105 | 106 | # number of unique var2 # 107 | out.append( len( np.unique( group["Var2"] )) ) 108 | 109 | # number of unique var3 # 110 | out.append( len( np.unique( group["Var3"] )) ) 111 | 112 | # number of unique SE id # 113 | out.append( len( np.unique( group["Sales_Executive_ID"] )) ) 114 | 115 | # number of unique SE cat # 116 | out.append( len( np.unique( group["Sales_Executive_Category"] )) ) 117 | 118 | # number of unique LS cat # 119 | out.append( len( np.unique( group["Lead_Source_Category"] )) ) 120 | 121 | # number of unique paymenr mode # 122 | out.append( len( np.unique( group["Payment_Mode"])) ) 123 | 124 | # number of unique product category # 125 | out.append( len( np.unique( group["Product_Category"])) ) 126 | 127 | # getting year of birth # 128 | yob = int((last_purchase["DOB"].iloc[-1]).year) 129 | if yob > 2000: 130 | yob = yob-100 131 | out.append(yob) 132 | 133 | # number of unique dob # 134 | out.append( len( np.unique( group["DOB"])) ) 135 | 136 | # number of purchases in last one year # 137 | yop = (start_date.year - 1) 138 | temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) ) 139 | out.append(sum(temp_arr)) 140 | out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) ) 141 | 142 | # number of purchases in last two years # 143 | yop = (start_date.year - 2) 144 | temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) ) 145 | out.append(sum(temp_arr)) 146 | out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) ) 147 | 148 | # number of purchases in last three years # 149 | yop = (start_date.year - 3) 150 | temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) ) 151 | out.append(sum(temp_arr)) 152 | out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) ) 153 | 154 | # DV # 155 | if name in dv_list: 156 | out.append(1) 157 | else: 158 | out.append(0) 159 | 160 | yield out 161 | 162 | if __name__ == "__main__": 163 | train = pd.read_csv("../Data/dev.csv") 164 | repeat_clients = set(np.unique(pd.read_csv("../Data/val.csv")["Client_ID"])) 165 | print len(repeat_clients) 166 | test = pd.read_csv("../Data/Train_seers_accuracy.csv") 167 | 168 | print "Label Encoding.." 
169 | for var in test.columns: 170 | if test[var].dtypes == object : 171 | if var in ["Transaction_Date", "DOB"]: 172 | continue 173 | print var 174 | lb = LabelEncoder() 175 | full_var_data = pd.concat((train[var],test[var]),axis=0).astype('str') 176 | lb.fit(np.array(full_var_data)) 177 | train[var] = lb.transform(np.array( train[var] ).astype('str')) 178 | test[var] = lb.transform(np.array( test[var] ).astype('str')) 179 | 180 | train["Transaction_Date"] = pd.to_datetime(train["Transaction_Date"], format="%d-%b-%y") 181 | test["Transaction_Date"] = pd.to_datetime(test["Transaction_Date"], format="%d-%b-%y") 182 | print min(train["Transaction_Date"]) 183 | print max(train["Transaction_Date"]) 184 | train["DOB"] = pd.to_datetime(train["DOB"], format="%d-%b-%y") 185 | test["DOB"] = pd.to_datetime(test["DOB"], format="%d-%b-%y") 186 | print min(train["DOB"]) 187 | print max(train["DOB"]) 188 | 189 | print "Processing train.." 190 | out_file = open("train_features3.csv","w") 191 | writer = csv.writer(out_file) 192 | header = ["Client_ID", "TimeSinceLastTrans", "NumberOfTrans", "MeanEMI", "MeanVar1", "MeanVar2", "MeanVar3", "MeanTransactionAmount", "MeanPurchasedInSale", "LastPurchasedInSale", "LastEMI", "LastStoreID", "LastVar1", "LastVar2", "LastVar3", "Gender", "LastReferredFriend", "LastSECat", "LastSEID", "LastLeadSource", "LastPayMode", "LastProdCat", "LastTransAmt", "TimeSinceFirstTrans", "TotalTime", "FreqTrans", "NumUniqueStore", "NumUniPurchasedInSale", "NumUniVar1", "NumUniVar2", "NumUniVar3", "NumUniSEID", "NumUniSECat", "NumUniLScat", "NumUniPayMode", "NumUniProdCat", "YoB", "NumUniDOB", "Last1YCount", "Last1YTA", "Last2YCount", "Last2YTA", "Last3YCount", "Last3YTA", "DV"] 193 | len_header = len(header) 194 | writer.writerow(header) 195 | count = 0 196 | for feature_list in getFeatures(train, repeat_clients, start_date=datetime.datetime(2006,1,1)): 197 | assert len_header == len(feature_list) 198 | writer.writerow( feature_list ) 199 | #break 200 | count +=1 201 | if count%10000 == 0: 202 | print count 203 | out_file.close() 204 | 205 | print "Processing test..." 
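# ---- editor's aside (illustrative sketch, not part of the original script) ----
# Note the shifted start_date: train features were just built on the dev file
# (2003-2005 transactions) as of 2006-01-01, with the label marking clients who
# returned in 2006, while the test pass below reuses the same generator on the
# full 2003-2006 history as of 2007-01-01, keeping both feature sets aligned one
# year apart. The csv.writer loop can equivalently be collected into a DataFrame
# (hypothetical helper built on the script's own getFeatures):
import pandas as pd

def features_to_frame(source_df, column_names, **kwargs):
    rows = list(getFeatures(source_df, **kwargs))   # materialise the generator
    return pd.DataFrame(rows, columns=column_names)

# e.g. features_to_frame(test, header, start_date=datetime.datetime(2007, 1, 1))
#          .to_csv("test_features3.csv", index=False)
# ---- end editor's aside ----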
206 | out_file = open("test_features3.csv","w") 207 | writer = csv.writer(out_file) 208 | #header = ["Client_ID", "TimeSinceLastTrans", "NumberOfTrans", "MeanEMI", "MeanVar1", "MeanVar2", "MeanVar3", "MeanTransactionAmount", "MeanPurchasedInSale", "LastPurchasedInSale", "DV"] 209 | #len_header = len(header) 210 | writer.writerow(header) 211 | count = 0 212 | for feature_list in getFeatures(test, start_date=datetime.datetime(2007,1,1)): 213 | assert len_header == len(feature_list) 214 | writer.writerow( feature_list ) 215 | count += 1 216 | if count%10000 == 0: 217 | print count 218 | out_file.close() 219 | -------------------------------------------------------------------------------- /AV_TheSeersAccuracy/finalModel.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import pandas as pd 3 | import numpy as np 4 | import datetime 5 | import operator 6 | from sklearn import preprocessing 7 | from sklearn.cross_validation import KFold 8 | from sklearn import ensemble 9 | from sklearn.metrics import roc_auc_score,log_loss 10 | import xgboost as xgb 11 | import random 12 | 13 | def create_feature_map(features): 14 | outfile = open('xgb.fmap', 'w') 15 | for i, feat in enumerate(features): 16 | outfile.write('{0}\t{1}\tq\n'.format(i,feat)) 17 | outfile.close() 18 | 19 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, round_val=1650): 20 | params = {} 21 | params["objective"] = "binary:logistic" 22 | params['eval_metric'] = 'auc' 23 | params["eta"] = 0.01 24 | params["min_child_weight"] = 2 25 | params["subsample"] = 0.55 26 | params["colsample_bytree"] = 0.9 27 | params["silent"] = 1 28 | params["max_depth"] = 4 29 | params["seed"] = seed_val 30 | params["max_delta_step"] = 2 31 | num_rounds = round_val 32 | 33 | plst = list(params.items()) 34 | xgtrain = xgb.DMatrix(train_X, label=train_y) 35 | 36 | if test_y is not None: 37 | xgtest = xgb.DMatrix(test_X, label=test_y) 38 | watchlist = [ (xgtrain,'train'), (xgtest, 'test') ] 39 | model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=10000) 40 | else: 41 | xgtest = xgb.DMatrix(test_X) 42 | model = xgb.train(plst, xgtrain, num_rounds) 43 | 44 | if feature_names: 45 | create_feature_map(feature_names) 46 | model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True) 47 | importance = model.get_fscore(fmap='xgb.fmap') 48 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 49 | imp_df = pd.DataFrame(importance, columns=['feature','fscore']) 50 | imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum() 51 | imp_df.to_csv("imp_feat.txt", index=False) 52 | 53 | pred_test_y = model.predict(xgtest) 54 | 55 | if test_y is not None: 56 | loss = roc_auc_score(test_y, pred_test_y) 57 | print loss 58 | 59 | return pred_test_y, loss 60 | else: 61 | return pred_test_y 62 | 63 | 64 | if __name__ == "__main__": 65 | print "Reading csv.." 66 | otrain = pd.read_csv("./train_features3.csv") 67 | otest = pd.read_csv("./test_features3.csv") 68 | print otrain.shape, otest.shape 69 | 70 | print "Getting DV.." 71 | train_y = np.array( otrain.DV.values ) 72 | train_id = np.array( otrain.Client_ID.values ) 73 | test_id = np.array( otest.Client_ID.values ) 74 | 75 | print "Dropping.." 
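# ---- editor's aside (illustrative sketch, not part of the original script) ----
# runXGB above writes an .fmap file and reads split counts back with get_fscore
# to rank features. Recent xgboost versions expose the ranking directly from the
# Booster, including gain-based importance, without the feature-map round trip
# (a sketch; it assumes feature names were attached to the DMatrix via its
# feature_names argument, otherwise the keys come back as f0, f1, ...):
import pandas as pd

def importance_frame(model):
    scores = model.get_score(importance_type="gain")     # {feature: average split gain}
    imp = pd.DataFrame(sorted(scores.items(), key=lambda kv: kv[1], reverse=True),
                       columns=["feature", "gain"])
    imp["gain"] = imp["gain"] / imp["gain"].sum()
    return imp
# ---- end editor's aside ----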
76 | otrain = otrain.drop(['DV'], axis=1) 77 | otest = otest.drop(["DV"], axis=1) 78 | 79 | use_cols = ['Client_ID', 'TimeSinceLastTrans', 'NumberOfTrans', 'MeanEMI', 'MeanVar1', 'MeanVar2', 'MeanVar3', 'MeanTransactionAmount', 'MeanPurchasedInSale', 'LastPurchasedInSale', 'LastEMI', 'LastStoreID', 'LastVar1', 'LastVar2', 'Gender', 'LastReferredFriend', 'LastSECat', 'LastSEID', 'LastLeadSource', 'LastPayMode', 'LastProdCat', 'LastTransAmt'] 80 | train = otrain[use_cols] 81 | test = otest[use_cols] 82 | 83 | feat_names = list(train.columns) 84 | print "Converting to array.." 85 | train = np.array(train).astype('float') 86 | test = np.array(test).astype('float') 87 | print train.shape, test.shape 88 | 89 | assert train.shape[1] == test.shape[1] 90 | print "Final Model.." 91 | preds = runXGB(train, train_y, test, seed_val=0, round_val=1200) 92 | 93 | out_df = pd.DataFrame({"Client_ID":test_id}) 94 | out_df["Cross_Sell"] = preds 95 | out_df.to_csv("submission.csv", index=False) 96 | 97 | 98 | -------------------------------------------------------------------------------- /AV_TheSeersAccuracy/readme.md: -------------------------------------------------------------------------------- 1 | Code for the Hackathon - [The Seers Accuracy](http://datahack.analyticsvidhya.com/contest/the-seers-accuracy) by [Analytics Vidhya](http://www.analyticsvidhya.com/) 2 | 3 | ####Objective 4 | The objective of the competition is to predict whether the customer will come back in the next one year or not. 5 | 6 | ####Approach 7 | We had transaction data of all the customers from Jan 2003 to Dec 2006. The idea is to predict whether the customer will come back in 2007 or not. 8 | 9 | 1. The first step was to create a proper validation framework since there was no "target" variable 10 | 2. I have used transaction data from 2003 to 2005 to create the features. People who came back in 2006 were tagged as 1 and others were tagged as 0, thereby getting the target column 11 | 3. Feature selection, models tuning were done using this validation sample. 12 | 4. For the final model, features were created using all the given data (2003 to 2006) and prediction was done for 2007. 13 | 5. People were using different types of approaches as well. [Vopani](https://github.com/rohanrao91/AnalyticsVidhya_SeersAccuracy) followed a two stage validation approach using both 2005 and 2006 as validation samples. 
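Below is a minimal editorial sketch of the validation set-up described in the Approach above (an illustration, not a file from this repository; it assumes the raw transactions file is available locally): features come from the 2003-2005 history and the target flags clients who transacted again in 2006.

```python
import pandas as pd

txn = pd.read_csv("Train_seers_accuracy.csv")
txn["Transaction_Date"] = pd.to_datetime(txn["Transaction_Date"], format="%d-%b-%y")

history = txn[txn["Transaction_Date"].dt.year <= 2005]   # feature window
future = txn[txn["Transaction_Date"].dt.year == 2006]    # label window

labels = (history[["Client_ID"]].drop_duplicates()
          .assign(DV=lambda d: d["Client_ID"].isin(future["Client_ID"]).astype(int)))
```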
14 | 15 | ####Codes 16 | ######splitDevVal.py 17 | Code to split the data into development(2003 to 2005 data) and validation sample(2006 data) 18 | 19 | ######createFeatures.py 20 | Code to create the features from the given input dataset for both validation and final model 21 | 22 | ######finalModel.py 23 | Code to get the final submission file 24 | -------------------------------------------------------------------------------- /AV_TheSeersAccuracy/splitDevVal.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from datetime import datetime 3 | 4 | with open("../Data/Train_seers_accuracy.csv") as train_file: 5 | dev_file = open("../Data/dev.csv","w") 6 | val_file = open("../Data/val.csv","w") 7 | 8 | dev_writer = csv.writer(dev_file) 9 | val_writer = csv.writer(val_file) 10 | 11 | reader = csv.reader(train_file) 12 | header = reader.next() 13 | dev_writer.writerow(header) 14 | val_writer.writerow(header) 15 | date_index = header.index("Transaction_Date") 16 | 17 | dev_counter = 0 18 | val_counter = 0 19 | total_counter = 0 20 | for row in reader: 21 | #print row 22 | date_val = datetime.strptime(row[date_index], "%d-%b-%y") 23 | if date_val.year == 2006: 24 | val_writer.writerow(row) 25 | val_counter += 1 26 | else: 27 | dev_writer.writerow(row) 28 | dev_counter += 1 29 | total_counter += 1 30 | if total_counter % 10000 == 0: 31 | print total_counter, dev_counter, val_counter 32 | 33 | dev_file.close() 34 | val_file.close() 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 SudalaiRajkumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Machine Learning 2 | 3 | Codes related to various Machine Learning Hackathons. 4 | 5 | 6 | --------------------------------------------------------------------------------