├── AV_BlackFridayHack
    ├── finalModel.py
    ├── prepData.py
    └── readme.md
├── AV_ChurnPrediction_Nov2017
    ├── buildModel.py
    └── readme.md
├── AV_ClubMahindra_May2019
    ├── FeatureEngg.ipynb
    ├── ModelBuild.ipynb
    └── README.md
├── AV_DHS_2017
    ├── Exploratory_Data_Analysis.ipynb
    ├── FeatureEngineering_Walkthrough.ipynb
    ├── Feature_Engineering.ipynb
    ├── Modeling.ipynb
    ├── Stacking_Walkthrough.ipynb
    └── readme.md
├── AV_DHS_2018
    ├── DataExploration.ipynb
    └── readme.md
├── AV_Genpact_2018
    ├── final_model.py
    └── readme.md
├── AV_Hack3
    ├── buildModel.py
    └── readme.md
├── AV_Hackathon_July11
    ├── benchmark.R
    ├── benchmark.py
    └── readme.md
├── AV_Knocktober
    ├── DataExploration.ipynb
    ├── getOutcome.py
    ├── readme.md
    ├── srk_final.py
    └── vopani_final.R
├── AV_LTFS_April2019
    ├── ModelBuild.ipynb
    └── README.md
├── AV_LordOfTheMachines
    ├── Explorations.ipynb
    ├── build_model.py
    ├── build_model_xgb.py
    ├── ensemble.py
    └── readme.md
├── AV_MiniHack1
    ├── model_ens.py
    ├── model_lr.py
    ├── model_xgb.py
    └── readme.md
├── AV_MiniHack2_SimpleBuy
    ├── finalModel.py
    └── readme.md
├── AV_SmartRecruits
    ├── finalModel.py
    └── readme.md
├── AV_TheSeersAccuracy
    ├── createFeatures.py
    ├── finalModel.py
    ├── readme.md
    └── splitDevVal.py
├── LICENSE
└── README.md


/AV_BlackFridayHack/finalModel.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import pandas as pd
  3 | import numpy as np
  4 | from sklearn.cross_validation import KFold
  5 | from sklearn import ensemble
  6 | from sklearn import metrics
  7 | from sklearn.preprocessing import LabelEncoder
  8 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/")
  9 | import xgboost as xgb
 10 | 
 11 | gender_dict = {'F':0, 'M':1}
 12 | age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
 13 | city_dict = {'A':0, 'B':1, 'C':2}
 14 | stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4}
 15 | 
 16 | def runXGB(train_X, train_y, test_X):
 17 |         params = {}
 18 |         params["objective"] = "reg:linear"
 19 |         params["eta"] = 0.03
 20 |         params["min_child_weight"] = 10
 21 |         params["subsample"] = 0.8
 22 |         params["colsample_bytree"] = 0.7
 23 |         params["silent"] = 1
 24 |         params["max_depth"] = 10
 25 |         #params["max_delta_step"]=2
 26 |         params["seed"] = 0
 27 |         #params['eval_metric'] = "auc"
 28 |         plst = list(params.items())
 29 |         num_rounds = 1100
 30 | 
 31 | 	xgtrain = xgb.DMatrix(train_X, label=train_y)
 32 | 	xgtest = xgb.DMatrix(test_X)
 33 | 	model = xgb.train(plst, xgtrain, num_rounds)
 34 | 	pred_test_y = model.predict(xgtest)
 35 | 	return pred_test_y
 36 | 
 37 | def getCountVar(compute_df, count_df, var_name):
 38 |         grouped_df = count_df.groupby(var_name)
 39 |         count_dict = {}
 40 |         for name, group in grouped_df:
 41 |                 count_dict[name] = group.shape[0]
 42 | 
 43 |         count_list = []
 44 |         for index, row in compute_df.iterrows():
 45 |                 name = row[var_name]
 46 |                 count_list.append(count_dict.get(name, 0))
 47 |         return count_list
 48 | 
 49 | def getPurchaseVar(compute_df, purchase_df, var_name):
 50 |         grouped_df = purchase_df.groupby(var_name)
 51 |         min_dict = {}
 52 |         max_dict = {}
 53 |         mean_dict = {}
 54 |         twentyfive_dict = {}
 55 |         seventyfive_dict = {}
 56 |         for name, group in grouped_df:
 57 |                 min_dict[name] = min(np.array(group["Purchase"]))
 58 |                 max_dict[name] = max(np.array(group["Purchase"]))
 59 |                 mean_dict[name] = np.mean(np.array(group["Purchase"]))
 60 |                 twentyfive_dict[name] = np.percentile(np.array(group["Purchase"]),25)
 61 |                 seventyfive_dict[name] = np.percentile(np.array(group["Purchase"]),75)
 62 | 
 63 |         min_list = []
 64 |         max_list = []
 65 |         mean_list = []
 66 |         twentyfive_list = []
 67 |         seventyfive_list = []
 68 |         for index, row in compute_df.iterrows():
 69 |                 name = row[var_name]
 70 |                 min_list.append(min_dict.get(name,0))
 71 |                 max_list.append(max_dict.get(name,0))
 72 |                 mean_list.append(mean_dict.get(name,0))
 73 |                 twentyfive_list.append( twentyfive_dict.get(name,0))
 74 |                 seventyfive_list.append( seventyfive_dict.get(name,0))
 75 | 
 76 |         return min_list, max_list, mean_list, twentyfive_list, seventyfive_list
 77 | 
 78 | 
 79 | if __name__ == "__main__":
 80 | 	data_path = "../Data/"
 81 | 	train_file = data_path + "train_mod.csv"
 82 | 	test_file = data_path +  "test_mod.csv"
 83 | 
 84 | 	train_df = pd.read_csv(train_file)
 85 | 	test_df = pd.read_csv(test_file)
 86 | 	print train_df.shape, test_df.shape
 87 | 
 88 | 	min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "User_ID")
 89 |         train_df["User_ID_MinPrice"] = min_price_list
 90 |         train_df["User_ID_MaxPrice"] = max_price_list
 91 |         train_df["User_ID_MeanPrice"] = mean_price_list
 92 |         train_df["User_ID_25PercPrice"] = twentyfive_price_list
 93 |         train_df["User_ID_75PercPrice"] = seventyfive_price_list
 94 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "User_ID")
 95 |         test_df["User_ID_MinPrice"] = min_price_list
 96 |         test_df["User_ID_MaxPrice"] = max_price_list
 97 |         test_df["User_ID_MeanPrice"] = mean_price_list
 98 |         test_df["User_ID_25PercPrice"] = twentyfive_price_list
 99 |         test_df["User_ID_75PercPrice"] = seventyfive_price_list
100 |         #print np.unique(test_df["User_ID_MeanPrice"])[:10]
101 | 
102 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_ID")
103 |         train_df["Product_ID_MinPrice"] = min_price_list
104 |         train_df["Product_ID_MaxPrice"] = max_price_list
105 |         train_df["Product_ID_MeanPrice"] = mean_price_list
106 |         train_df["Product_ID_25PercPrice"] = twentyfive_price_list
107 |         train_df["Product_ID_75PercPrice"] = seventyfive_price_list
108 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_ID")
109 |         test_df["Product_ID_MinPrice"] = min_price_list
110 |         test_df["Product_ID_MaxPrice"] = max_price_list
111 |         test_df["Product_ID_MeanPrice"] = mean_price_list
112 |         test_df["Product_ID_25PercPrice"] = twentyfive_price_list
113 |         test_df["Product_ID_75PercPrice"] = seventyfive_price_list
114 |         #print np.unique(test_df["Product_ID_MeanPrice"])[:10]
115 | 
116 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_1")
117 |         train_df["Product_Cat1_MinPrice"] = min_price_list
118 |         train_df["Product_Cat1_MaxPrice"] = max_price_list
119 |         train_df["Product_Cat1_MeanPrice"] = mean_price_list
120 |         train_df["Product_Cat1_25PercPrice"] = twentyfive_price_list
121 |         train_df["Product_Cat1_75PercPrice"] = seventyfive_price_list
122 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_1")
123 |         test_df["Product_Cat1_MinPrice"] = min_price_list
124 |         test_df["Product_Cat1_MaxPrice"] = max_price_list
125 |         test_df["Product_Cat1_MeanPrice"] = mean_price_list
126 |         test_df["Product_Cat1_25PercPrice"] = twentyfive_price_list
127 |         test_df["Product_Cat1_75PercPrice"] = seventyfive_price_list
128 |         print np.unique(test_df["Product_Cat1_MeanPrice"])[:10]
129 | 
130 | 	min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_2")
131 |         train_df["Product_Cat2_MinPrice"] = min_price_list
132 |         train_df["Product_Cat2_MaxPrice"] = max_price_list
133 |         train_df["Product_Cat2_MeanPrice"] = mean_price_list
134 |         train_df["Product_Cat2_25PercPrice"] = twentyfive_price_list
135 |         train_df["Product_Cat2_75PercPrice"] = seventyfive_price_list
136 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_2")
137 |         test_df["Product_Cat2_MinPrice"] = min_price_list
138 |         test_df["Product_Cat2_MaxPrice"] = max_price_list
139 |         test_df["Product_Cat2_MeanPrice"] = mean_price_list
140 |         test_df["Product_Cat2_25PercPrice"] = twentyfive_price_list
141 |         test_df["Product_Cat2_75PercPrice"] = seventyfive_price_list
142 |         print np.unique(test_df["Product_Cat2_MeanPrice"])[:10]
143 | 
144 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(train_df, train_df, "Product_Category_3")
145 |         train_df["Product_Cat3_MinPrice"] = min_price_list
146 |         train_df["Product_Cat3_MaxPrice"] = max_price_list
147 |         train_df["Product_Cat3_MeanPrice"] = mean_price_list
148 |         train_df["Product_Cat3_25PercPrice"] = twentyfive_price_list
149 |         train_df["Product_Cat3_75PercPrice"] = seventyfive_price_list
150 |         min_price_list, max_price_list, mean_price_list, twentyfive_price_list, seventyfive_price_list = getPurchaseVar(test_df, train_df, "Product_Category_3")
151 |         test_df["Product_Cat3_MinPrice"] = min_price_list
152 |         test_df["Product_Cat3_MaxPrice"] = max_price_list
153 |         test_df["Product_Cat3_MeanPrice"] = mean_price_list
154 |         test_df["Product_Cat3_25PercPrice"] = twentyfive_price_list
155 |         test_df["Product_Cat3_75PercPrice"] = seventyfive_price_list
156 |         print np.unique(test_df["Product_Cat3_MeanPrice"])[:10]
157 | 
158 | 
159 | 
160 | 	train_y = np.array(train_df["Purchase"])
161 | 	test_user_id = np.array(test_df["User_ID"])
162 | 	test_product_id = np.array(test_df["Product_ID"])
163 | 
164 | 	train_df.drop(["Purchase"], axis=1, inplace=True)
165 | 
166 | 	cat_columns_list = ["User_ID", "Product_ID"]
167 | 	for var in cat_columns_list:
168 |                 lb = LabelEncoder()
169 |                 full_var_data = pd.concat((train_df[var],test_df[var]),axis=0).astype('str')
170 |                 temp = lb.fit_transform(np.array(full_var_data))
171 |                 train_df[var] = lb.transform(np.array( train_df[var] ).astype('str'))
172 |                 test_df[var] = lb.transform(np.array( test_df[var] ).astype('str'))
173 | 
174 | 	train_X = np.array(train_df).astype('float')
175 | 	test_X = np.array(test_df).astype('float')
176 | 	print train_X.shape, test_X.shape
177 | 
178 | 	print "Running model.."	
179 | 	pred_test_y = runXGB(train_X, train_y, test_X)
180 | 	pred_test_y[pred_test_y<0] = 1
181 | 
182 | 	out_df = pd.DataFrame({"User_ID":test_user_id})
183 | 	out_df["Product_ID"] = test_product_id
184 | 	out_df["Purchase"] = pred_test_y
185 | 	out_df.to_csv("sub20.csv", index=False)
186 | 


--------------------------------------------------------------------------------
/AV_BlackFridayHack/prepData.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import pandas as pd
 3 | import numpy as np
 4 | from sklearn.cross_validation import KFold
 5 | from sklearn import ensemble
 6 | from sklearn import metrics
 7 | from sklearn.preprocessing import LabelEncoder
 8 | sys.path.append("/home/sudalai/Softwares/XGB_pointfour/xgboost-master/wrapper/")
 9 | import xgboost as xgb
10 | 
11 | gender_dict = {'F':0, 'M':1}
12 | age_dict = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
13 | city_dict = {'A':0, 'B':1, 'C':2}
14 | stay_dict = {'0':0, '1':1, '2':2, '3':3, '4+':4}
15 | 
16 | def getCountVar(compute_df, count_df, var_name):
17 | 	grouped_df = count_df.groupby(var_name)
18 | 	count_dict = {}
19 | 	for name, group in grouped_df:
20 | 		count_dict[name] = group.shape[0]
21 | 
22 | 	count_list = []
23 | 	for index, row in compute_df.iterrows():
24 | 		name = row[var_name]
25 | 		count_list.append(count_dict.get(name, 0))
26 | 	return count_list
27 | 
28 | if __name__ == "__main__":
29 | 	data_path = "../Data/"
30 | 	train_file = data_path + "train.csv"
31 | 	test_file = data_path +  "test.csv"
32 | 
33 | 	train_df = pd.read_csv(train_file)
34 | 	test_df = pd.read_csv(test_file)
35 | 	print train_df.shape, test_df.shape
36 | 
37 | 	train_df["Gender"] = train_df["Gender"].apply(lambda x: gender_dict[x])
38 | 	test_df["Gender"] = test_df["Gender"].apply(lambda x: gender_dict[x])
39 | 
40 | 	train_df["Age"] = train_df["Age"].apply(lambda x: age_dict[x])
41 | 	test_df["Age"] = test_df["Age"].apply(lambda x: age_dict[x])
42 | 
43 | 	train_df["City_Category"] = train_df["City_Category"].apply(lambda x: city_dict[x])
44 |         test_df["City_Category"] = test_df["City_Category"].apply(lambda x: city_dict[x])
45 | 
46 | 	train_df["Stay_In_Current_City_Years"] = train_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x])
47 |         test_df["Stay_In_Current_City_Years"] = test_df["Stay_In_Current_City_Years"].apply(lambda x: stay_dict[x])
48 | 
49 | 	
50 | 	print "Getting count features.."
51 | 	train_df["Age_Count"] = getCountVar(train_df, train_df, "Age")
52 | 	test_df["Age_Count"] = getCountVar(test_df, train_df, "Age")
53 | 	print "Age", np.unique(test_df["Age_Count"])
54 | 
55 | 	train_df["Occupation_Count"] = getCountVar(train_df, train_df, "Occupation")
56 |         test_df["Occupation_Count"] = getCountVar(test_df, train_df, "Occupation")
57 |         print "Occupation", np.unique(test_df["Occupation_Count"])
58 | 
59 | 	train_df["Product_Category_1_Count"] = getCountVar(train_df, train_df, "Product_Category_1")
60 |         test_df["Product_Category_1_Count"] = getCountVar(test_df, train_df, "Product_Category_1")
61 |         print "Cat 1 ",np.unique(test_df["Product_Category_1_Count"])
62 | 
63 | 	train_df["Product_Category_2_Count"] = getCountVar(train_df, train_df, "Product_Category_2")
64 |         test_df["Product_Category_2_Count"] = getCountVar(test_df, train_df, "Product_Category_2")
65 |         print "Cat 2 ", np.unique(test_df["Product_Category_2_Count"])
66 | 
67 | 	train_df["Product_Category_3_Count"] = getCountVar(train_df, train_df, "Product_Category_3")
68 |         test_df["Product_Category_3_Count"] = getCountVar(test_df, train_df, "Product_Category_3")
69 |         print "Cat 3 ", np.unique(test_df["Product_Category_3_Count"])
70 | 
71 | 	train_df["User_ID_Count"] = getCountVar(train_df, train_df, "User_ID")
72 |         test_df["User_ID_Count"] = getCountVar(test_df, train_df, "User_ID")
73 |         print "User id ", np.unique(test_df["User_ID_Count"])[:10]
74 | 
75 | 	train_df["Product_ID_Count"] = getCountVar(train_df, train_df, "Product_ID")
76 |         test_df["Product_ID_Count"] = getCountVar(test_df, train_df, "Product_ID")
77 |         print "Product id ", np.unique(test_df["Product_ID_Count"])[:10]
78 | 	
79 | 	train_df.fillna(-999, inplace=True)
80 | 	test_df.fillna(-999, inplace=True)
81 | 
82 | 	train_df.to_csv(data_path+"train_mod.csv", index=False)
83 | 	test_df.to_csv(data_path+"test_mod.csv", index=False)
84 | 
85 | 


--------------------------------------------------------------------------------
/AV_BlackFridayHack/readme.md:
--------------------------------------------------------------------------------
1 | #####Codes for Black Friday Hack#####
2 | 
3 | prepData.py - Creates count based variables and store it as new csv
4 | 
5 | finalModel.py - Creates the final model after creating few DV based variables
6 | 
7 | 


--------------------------------------------------------------------------------
/AV_ChurnPrediction_Nov2017/buildModel.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import random
  3 | import operator
  4 | import pandas as pd
  5 | import numpy as np
  6 | import xgboost as xgb
  7 | import lightgbm as lgb
  8 | from sklearn import preprocessing, metrics, ensemble, neighbors, linear_model, tree, model_selection
  9 | from sklearn.model_selection import KFold, StratifiedKFold
 10 | from sklearn import manifold, decomposition
 11 | from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
 12 | 
 13 | def create_feature_map(features):
 14 | 	outfile = open('xgb.fmap', 'w')
 15 | 	for i, feat in enumerate(features):
 16 | 		outfile.write('{0}\t{1}\tq\n'.format(i,feat))
 17 | 	outfile.close()
 18 | 
 19 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
 20 | 	params = {}
 21 | 	params["objective"] = "binary:logistic"
 22 | 	params['eval_metric'] = 'auc'
 23 | 	params["eta"] = eta
 24 | 	params["subsample"] = 0.7
 25 | 	params["min_child_weight"] = 1
 26 | 	params["colsample_bytree"] = 0.7
 27 | 	params["max_depth"] = dep
 28 | 
 29 | 	params["silent"] = 1
 30 | 	params["seed"] = seed_val
 31 | 	#params["max_delta_step"] = 2
 32 | 	#params["gamma"] = 0.5
 33 | 	num_rounds = rounds
 34 | 
 35 | 	plst = list(params.items())
 36 | 	xgtrain = xgb.DMatrix(train_X, label=train_y)
 37 | 
 38 | 	if test_y is not None:
 39 | 		xgtest = xgb.DMatrix(test_X, label=test_y)
 40 | 		watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
 41 | 		model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
 42 | 	else:
 43 | 		xgtest = xgb.DMatrix(test_X)
 44 | 		model = xgb.train(plst, xgtrain, num_rounds)
 45 | 
 46 | 	if feature_names is not None:
 47 | 		create_feature_map(feature_names)
 48 | 		model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
 49 | 		importance = model.get_fscore(fmap='xgb.fmap')
 50 | 		importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
 51 | 		imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
 52 | 		imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
 53 | 		imp_df.to_csv("imp_feat.txt", index=False)
 54 | 
 55 | 	pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
 56 | 	pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)
 57 | 
 58 | 	loss = 0
 59 | 	if test_y is not None:
 60 | 		loss = metrics.roc_auc_score(test_y, pred_test_y)
 61 | 		return pred_test_y, loss, pred_test_y2
 62 | 	else:
 63 | 		return pred_test_y, loss, pred_test_y2
 64 | 
 65 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
 66 | 	params = {}
 67 | 	params["objective"] = "binary"
 68 | 	params['metric'] = 'auc'
 69 | 	params["max_depth"] = dep
 70 | 	params["min_data_in_leaf"] = 20
 71 | 	params["learning_rate"] = eta
 72 | 	params["bagging_fraction"] = 0.7
 73 | 	params["feature_fraction"] = 0.7
 74 | 	params["bagging_freq"] = 5
 75 | 	params["bagging_seed"] = seed_val
 76 | 	params["verbosity"] = 0
 77 | 	num_rounds = rounds
 78 | 
 79 | 	plst = list(params.items())
 80 | 	lgtrain = lgb.Dataset(train_X, label=train_y)
 81 | 
 82 | 	if test_y is not None:
 83 | 		lgtest = lgb.Dataset(test_X, label=test_y)
 84 | 		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
 85 | 	else:
 86 | 		lgtest = lgb.DMatrix(test_X)
 87 | 		model = lgb.train(params, lgtrain, num_rounds)
 88 | 
 89 | 	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
 90 | 	pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
 91 | 
 92 | 	loss = 0
 93 | 	if test_y is not None:
 94 | 		loss = metrics.roc_auc_score(test_y, pred_test_y)
 95 | 		print loss
 96 | 		return pred_test_y, loss, pred_test_y2
 97 | 	else:
 98 | 		return pred_test_y, loss, pred_test_y2
 99 | 
100 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=20, leaf=10, feat=0.2):
101 | 	model = ensemble.ExtraTreesClassifier(
102 | 			n_estimators = 100,
103 | 					max_depth = depth,
104 | 					min_samples_split = 2,
105 | 					min_samples_leaf = leaf,
106 | 					max_features =  feat,
107 | 					n_jobs = 6,
108 | 					random_state = 0)
109 | 	model.fit(train_X, train_y)
110 | 	train_preds = model.predict_proba(train_X)[:,1]
111 | 	test_preds = model.predict_proba(test_X)[:,1]
112 | 	test_preds2 = model.predict_proba(test_X2)[:,1]
113 | 	test_loss = 0
114 | 	if test_y is not None:
115 | 		train_loss = metrics.roc_auc_score(train_y, train_preds)
116 | 		test_loss = metrics.roc_auc_score(test_y, test_preds)
117 | 		print "Depth, leaf, feat : ", depth, leaf, feat
118 | 		print "Train and Test loss : ", train_loss, test_loss
119 | 	return test_preds, test_loss, test_preds2
120 | 
121 | if __name__ == "__main__":
122 | 	#model_name = "ET"
123 | 	for model_name in ["LGB1", "XGB1"]:
124 | 		data_path = "../input/"
125 | 		train_df = pd.read_csv(data_path + "train.csv")
126 | 		test_df = pd.read_csv(data_path + "test.csv")
127 | 
128 | 		# process columns, apply LabelEncoder to categorical features
129 | 		for c in train_df.columns:
130 | 			if train_df[c].dtype == 'object' and c not in ["Responders", "UCIC_ID"]:
131 | 				lbl = preprocessing.LabelEncoder()
132 | 				lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str')))
133 | 				train_df[c] = lbl.transform(list(train_df[c].values.astype('str')))
134 | 				test_df[c] = lbl.transform(list(test_df[c].values.astype('str')))
135 | 
136 | 		train_df.fillna(-99, inplace=True)
137 | 		test_df.fillna(-99, inplace=True)
138 | 
139 | 		################### Feature Engineeering ###############################
140 | 		f1_f2_list = [["D_prev1", "D_prev2"], ["D_prev2", "D_prev3"], ["D_prev3", "D_prev4"], ["D_prev4", "D_prev5"], ["D_prev5", "D_prev6"],
141 | 					["CR_AMB_Prev1", "CR_AMB_Prev3"], ["CR_AMB_Prev1", "CR_AMB_Prev4"], ["CR_AMB_Prev1", "CR_AMB_Prev5"], ["CR_AMB_Prev1", "CR_AMB_Prev6"],
142 | 					["EOP_prev1", "CR_AMB_Prev1"], ["EOP_prev2", "CR_AMB_Prev2"], ["EOP_prev3", "CR_AMB_Prev3"], ["EOP_prev4", "CR_AMB_Prev4"], ["EOP_prev5", "CR_AMB_Prev5"], ["EOP_prev6", "CR_AMB_Prev6"],
143 | 					["EOP_prev1", "EOP_prev2"], ["EOP_prev2", "EOP_prev3"], ["EOP_prev3", "EOP_prev4"], ["EOP_prev4", "EOP_prev5"], ["EOP_prev5", "EOP_prev6"],
144 | 					["CR_AMB_Prev2", "CR_AMB_Prev4"], ["CR_AMB_Prev2", "CR_AMB_Prev5"], ["CR_AMB_Prev2", "CR_AMB_Prev6"],
145 | 					["EOP_prev1", "CR_AMB_Prev2"], ["EOP_prev1", "CR_AMB_Prev3"], ["EOP_prev1", "CR_AMB_Prev4"], ["EOP_prev1", "CR_AMB_Prev5"], ["EOP_prev1", "CR_AMB_Prev6"],
146 | 					["CR_AMB_Drop_Build_1", "CR_AMB_Drop_Build_2"], ["CR_AMB_Drop_Build_2", "CR_AMB_Drop_Build_3"], ["CR_AMB_Drop_Build_3", "CR_AMB_Drop_Build_4"],
147 | 					["BAL_prev1", "BAL_prev2"], ["BAL_prev2", "BAL_prev3"], ["BAL_prev3", "BAL_prev4"],
148 | 					["BAL_prev1", "CR_AMB_Prev1"], ["BAL_prev2", "CR_AMB_Prev2"], ["BAL_prev3", "CR_AMB_Prev3"],
149 | 					["I_AQB_PrevQ1", "I_AQB_PrevQ2"], ["I_NRV_PrevQ1", "I_NRV_PrevQ2"],
150 | 					["D_prev1", "D_prev3"], ["D_prev1", "D_prev4"], ["D_prev1", "D_prev6"],
151 | 					
152 | 					]
153 | 		for f1, f2 in f1_f2_list:
154 | 			train_df["Ratio_"+f1+"_"+f2] = train_df[f1].astype('float') / np.maximum(train_df[f2],1.)
155 | 			test_df["Ratio_"+f1+"_"+f2] = test_df[f1].astype('float') / np.maximum(test_df[f2],1.)
156 | 
157 | 
158 | 		print "Preparing response variable.."
159 | 		cols_to_leave = ["Responders", "UCIC_ID"]
160 | 		cols_to_use = [col for col in train_df.columns if col not in cols_to_leave]
161 | 		train_X = train_df[cols_to_use]
162 | 		test_X = test_df[cols_to_use]
163 | 		train_y = (train_df["Responders"]).values 
164 | 		train_id = train_df["UCIC_ID"].values
165 | 		test_id = test_df["UCIC_ID"].values
166 | 
167 | 		print "Model building.."
168 | 		kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
169 | 		cv_scores = []
170 | 		pred_test_full = 0
171 | 		pred_val_full = np.zeros(train_X.shape[0])
172 | 		for dev_index, val_index in kf.split(train_X):
173 | 			dev_X, val_X = train_X.iloc[dev_index,:], train_X.iloc[val_index,:]
174 | 			dev_y, val_y = train_y[dev_index], train_y[val_index]
175 | 
176 | 			if model_name == "XGB1":
177 | 				pred_val, loss, pred_test = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=8, feature_names=dev_X.columns.tolist())
178 | 			elif model_name == "LGB1":
179 | 				pred_val, loss, pred_test = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=8)
180 | 			pred_val_full[val_index] = pred_val
181 | 			pred_test_full = pred_test_full + pred_test
182 | 			cv_scores.append(loss)
183 | 			print cv_scores
184 | 		pred_test_full /= 5.
185 | 		print metrics.roc_auc_score(train_y, pred_val_full)
186 | 
187 | 		out_df = pd.DataFrame({"UCIC_ID":test_id})
188 | 		out_df["Responders"] = pred_test_full
189 | 		out_df.to_csv("./meta_models/test/pred_test_v5_"+model_name+".csv", index=False)
190 | 
191 | 		out_df = pd.DataFrame({"UCIC_ID":train_id})
192 | 		out_df["Responders"] = pred_val_full
193 | 		out_df.to_csv("./meta_models/val/pred_val_v5_"+model_name+".csv", index=False)
194 | 


--------------------------------------------------------------------------------
/AV_ChurnPrediction_Nov2017/readme.md:
--------------------------------------------------------------------------------
1 | Codes for the Analytics Vidhya Hackathon - [Churn Prediction](https://datahack.analyticsvidhya.com/contest/data-science-hackathon-churn-prediction/)
2 | 


--------------------------------------------------------------------------------
/AV_ClubMahindra_May2019/README.md:
--------------------------------------------------------------------------------
1 | Codes for the [Analytics Vidhya Hackathon - Club Mahindra DataOlympics](https://datahack.analyticsvidhya.com/contest/club-mahindra-dataolympics/)
2 | 
3 | Finished [4th](https://datahack.analyticsvidhya.com/contest/club-mahindra-dataolympics/pvt_lb) on this competition
4 | 


--------------------------------------------------------------------------------
/AV_DHS_2017/Exploratory_Data_Analysis.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "# Data Exploration\n",
 8 |     "\n",
 9 |     "First, let us start with something numeric. We shall look into this [Kaggle competition - Zillow Prize Estimate](https://www.kaggle.com/c/zillow-prize-1)\n",
10 |     "\n",
11 |     "1. [Python EDA Notebook](https://www.kaggle.com/c/zillow-prize-1)\n",
12 |     "2. [R EDA Notebook](https://www.kaggle.com/philippsp/exploratory-analysis-zillow) by Philipp\n",
13 |     "\n",
14 |     "\n",
15 |     "\n",
16 |     "\n",
17 |     "We will look at the Data Exploration scripts of [Kaggle Competition - Spooky Author Identification](https://www.kaggle.com/c/spooky-author-identification)\n",
18 |     "\n",
19 |     "1. [Python EDA Notebook](https://www.kaggle.com/arthurtok/spooky-nlp-and-topic-modelling-tutorial) by Anisotropic\n",
20 |     "2. [R EDA Notebook](https://www.kaggle.com/headsortails/treemap-house-of-horror-spooky-eda-lda-features) by Heads or Tails"
21 |    ]
22 |   }
23 |  ],
24 |  "metadata": {
25 |   "kernelspec": {
26 |    "display_name": "Python 2",
27 |    "language": "python",
28 |    "name": "python2"
29 |   },
30 |   "language_info": {
31 |    "codemirror_mode": {
32 |     "name": "ipython",
33 |     "version": 2
34 |    },
35 |    "file_extension": ".py",
36 |    "mimetype": "text/x-python",
37 |    "name": "python",
38 |    "nbconvert_exporter": "python",
39 |    "pygments_lexer": "ipython2",
40 |    "version": "2.7.10"
41 |   }
42 |  },
43 |  "nbformat": 4,
44 |  "nbformat_minor": 2
45 | }
46 | 


--------------------------------------------------------------------------------
/AV_DHS_2017/Feature_Engineering.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Feature Engineering\n",
  8 |     "\n",
  9 |     "Some codes related to feature engineering can be seen in this notebook\n",
 10 |     "\n",
 11 |     "### Count Encoding"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 1,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import numpy as np\n",
 23 |     "import pandas as pd\n",
 24 |     "\n",
 25 |     "def getCountVar(compute_df, count_df, var_name, count_var=\"v1\"):\n",
 26 |     "    \"\"\"\n",
 27 |     "    compute_df : Data frame for which the count encoding should be done\n",
 28 |     "    count_df : Data frame from which the counts should be taken\n",
 29 |     "    var_name : categorical variable for count encoding\n",
 30 |     "    count_var : some other variable from the dataset (used as dummy variable to get count)\n",
 31 |     "    \"\"\"\n",
 32 |     "    grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')\n",
 33 |     "    grouped_df.columns = [var_name, \"var_count\"]\n",
 34 |     "    merged_df = pd.merge(compute_df, grouped_df, how=\"left\", on=var_name)\n",
 35 |     "    merged_df.fillna(-1, inplace=True)\n",
 36 |     "    return list(merged_df[\"var_count\"])"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "### Target Encoding"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 4,
 49 |    "metadata": {
 50 |     "collapsed": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "from sklearn import model_selection\n",
 55 |     "\n",
 56 |     "def getDVEncodeVar(compute_df, target_df, var_name, target_var=\"RESPONDERS\", min_cutoff=1):\n",
 57 |     "    if type(var_name) != type([]):\n",
 58 |     "        var_name = [var_name]\n",
 59 |     "    grouped_df = target_df.groupby(var_name)[target_var].agg([\"mean\"]).reset_index()\n",
 60 |     "    grouped_df.columns = var_name + [\"mean_value\"]\n",
 61 |     "    merged_df = pd.merge(compute_df, grouped_df, how=\"left\", on=var_name)\n",
 62 |     "    merged_df.fillna(-1, inplace=True)\n",
 63 |     "    return list(merged_df[\"mean_value\"])\n",
 64 |     "\n",
 65 |     "\n",
 66 |     "def do_target_encode():\n",
 67 |     "    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)\n",
 68 |     "    for col in [\"ZIP_CODE_FINAL\", \"DESIGNATION_FINAL\"]:\n",
 69 |     "        train_enc_values = np.zeros(train_df.shape[0])\n",
 70 |     "        test_enc_values = 0\n",
 71 |     "        for dev_index, val_index in kf.split(train_df):\n",
 72 |     "            new_train_df = train_df[[col, \"RESPONDERS\"]]\n",
 73 |     "            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]\n",
 74 |     "            train_enc_values[val_index] =  np.array( getDVEncodeVar(val_X[[col]], dev_X, col))\n",
 75 |     "            test_enc_values += np.array( getDVEncodeVar(test_df[[col]], dev_X, col))\n",
 76 |     "        test_enc_values /= 5.\n",
 77 |     "        train_df[col + \"_enc\"] = train_enc_values\n",
 78 |     "        test_df[col + \"_enc\"] = test_enc_values\n",
 79 |     "        print train_df[col + \"_enc\"].describe()\n",
 80 |     "        print test_df[col + \"_enc\"].describe()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "### Interaction features \n",
 88 |     "\n",
 89 |     "[XGBoost Feature Interactions and Importance](https://github.com/Far0n/xgbfi) by Faron"
 90 |    ]
 91 |   }
 92 |  ],
 93 |  "metadata": {
 94 |   "kernelspec": {
 95 |    "display_name": "Python 2",
 96 |    "language": "python",
 97 |    "name": "python2"
 98 |   },
 99 |   "language_info": {
100 |    "codemirror_mode": {
101 |     "name": "ipython",
102 |     "version": 2
103 |    },
104 |    "file_extension": ".py",
105 |    "mimetype": "text/x-python",
106 |    "name": "python",
107 |    "nbconvert_exporter": "python",
108 |    "pygments_lexer": "ipython2",
109 |    "version": "2.7.10"
110 |   }
111 |  },
112 |  "nbformat": 4,
113 |  "nbformat_minor": 2
114 | }
115 | 


--------------------------------------------------------------------------------
/AV_DHS_2017/Modeling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "deletable": true,
  7 |     "editable": true
  8 |    },
  9 |    "source": [
 10 |     "# Modeling\n",
 11 |     "\n",
 12 |     "Here we will have some sample codes and links with respect to modeling section.\n",
 13 |     "\n",
 14 |     "\n",
 15 |     "## Modeling Bigger Datasets \n",
 16 |     "\n",
 17 |     "1. [FTRL Implementation](https://www.kaggle.com/jiweiliu/ftrl-starter-code/code)\n",
 18 |     "2. [LibFFM](https://github.com/guestwalk/libffm)\n",
 19 |     "3. [Voapal Wabbit](https://github.com/JohnLangford/vowpal_wabbit/wiki)\n",
 20 |     "4. [Incremental Learning](http://scikit-learn.org/stable/modules/scaling_strategies.html#incremental-learning)\n",
 21 |     "\n",
 22 |     "## Time Series Forecasting\n",
 23 |     "\n",
 24 |     "1. [R Tutorial](https://www.analyticsvidhya.com/blog/2015/12/complete-tutorial-time-series-modeling/)\n",
 25 |     "\n",
 26 |     "2. [Python Tutorial](https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/)\n",
 27 |     "\n",
 28 |     "\n",
 29 |     "## Bayesian Optimization\n",
 30 |     "\n",
 31 |     "Some python libraries are\n",
 32 |     "\n",
 33 |     "1. [Hyperopt](http://hyperopt.github.io/hyperopt/)\n",
 34 |     "\n",
 35 |     "2. [Spearmint](https://github.com/JasperSnoek/spearmint)\n",
 36 |     "\n",
 37 |     "3. [Bayesian Optimization](https://github.com/fmfn/BayesianOptimization) \n",
 38 |     "\n",
 39 |     "Example code can be seen in this [Kaggle Kernel](https://www.kaggle.com/dreeux/hyperparameter-tuning-using-hyperopt)\n",
 40 |     "\n",
 41 |     "### Random Forest ###"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 5,
 47 |    "metadata": {
 48 |     "collapsed": true,
 49 |     "deletable": true,
 50 |     "editable": true
 51 |    },
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "def runRF(train_X, train_y, test_X, test_y=None, test_X2=None, depth=20, leaf=10, feat=0.2):\n",
 55 |     "    model = ensemble.RandomForestClassifier(\n",
 56 |     "            n_estimators = 1000,\n",
 57 |     "                    max_depth = depth,\n",
 58 |     "                    min_samples_split = 2,\n",
 59 |     "                    min_samples_leaf = leaf,\n",
 60 |     "                    max_features =  feat,\n",
 61 |     "                    n_jobs = 4,\n",
 62 |     "                    random_state = 0)\n",
 63 |     "    model.fit(train_X, train_y)\n",
 64 |     "    train_preds = model.predict_proba(train_X)[:,1]\n",
 65 |     "    test_preds = model.predict_proba(test_X)[:,1]\n",
 66 |     "    test_preds2 = model.predict_proba(test_X2)[:,1]\n",
 67 |     "    test_loss = 0\n",
 68 |     "    \n",
 69 |     "    train_loss = metrics.log_loss(train_y, train_preds)\n",
 70 |     "    test_loss = metrics.log_loss(test_y, test_preds)\n",
 71 |     "    print \"Train and Test loss : \", train_loss, test_loss\n",
 72 |     "    return test_preds, test_loss, test_preds2"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {
 78 |     "deletable": true,
 79 |     "editable": true
 80 |    },
 81 |    "source": [
 82 |     "### XGBoost / Light GBM"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 3,
 88 |    "metadata": {
 89 |     "collapsed": true,
 90 |     "deletable": true,
 91 |     "editable": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, seed_val=0, rounds=500, dep=8, eta=0.05):\n",
 96 |     "    params = {}\n",
 97 |     "    params[\"objective\"] = \"binary:logistic\"\n",
 98 |     "    params['eval_metric'] = 'auc'\n",
 99 |     "    params[\"eta\"] = eta\n",
100 |     "    params[\"subsample\"] = 0.7\n",
101 |     "    params[\"min_child_weight\"] = 1\n",
102 |     "    params[\"colsample_bytree\"] = 0.7\n",
103 |     "    params[\"max_depth\"] = dep\n",
104 |     "    params[\"silent\"] = 1\n",
105 |     "    params[\"seed\"] = seed_val\n",
106 |     "    #params[\"max_delta_step\"] = 2\n",
107 |     "    #params[\"gamma\"] = 0.5\n",
108 |     "    num_rounds = rounds\n",
109 |     "\n",
110 |     "    plst = list(params.items())\n",
111 |     "    xgtrain = xgb.DMatrix(train_X, label=train_y)\n",
112 |     "\n",
113 |     "    xgtest = xgb.DMatrix(test_X, label=test_y)\n",
114 |     "    watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]\n",
115 |     "    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)\n",
116 |     "\n",
117 |     "\n",
118 |     "    pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)\n",
119 |     "    pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)\n",
120 |     "    \n",
121 |     "    loss = metrics.roc_auc_score(test_y, pred_test_y)\n",
122 |     "    return pred_test_y, loss, pred_test_y2"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {
128 |     "deletable": true,
129 |     "editable": true
130 |    },
131 |    "source": [
132 |     "### Neural Networks / Deep Learning"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 4,
138 |    "metadata": {
139 |     "collapsed": true,
140 |     "deletable": true,
141 |     "editable": true
142 |    },
143 |    "outputs": [],
144 |    "source": [
145 |     "def runNN(train_X, train_y, test_X, test_y=None, test_X2=None, epochs=100, scale=False):\n",
146 |     "    if scale:\n",
147 |     "        sc = preprocessing.StandardScaler()\n",
148 |     "        all_X = pd.concat([train_X, test_X, test_X2], axis=0)\n",
149 |     "        sc.fit(all_X)\n",
150 |     "        train_X = sc.transform(train_X)\n",
151 |     "        test_X = sc.transform(test_X)\n",
152 |     "        test_X2 = sc.transform(test_X2)\n",
153 |     "\n",
154 |     "    random.seed(12345)\n",
155 |     "    np.random.seed(12345)\n",
156 |     "    model = Sequential()\n",
157 |     "    model.add(Dense(200, input_shape=(train_X.shape[1],), init='he_uniform')) #, W_regularizer=regularizers.l1(0.002)))\n",
158 |     "    model.add(Activation('relu'))\n",
159 |     "    model.add(Dropout(0.3))\n",
160 |     "\n",
161 |     "    #model.add(Dense(50, init='he_uniform'))\n",
162 |     "    #model.add(Activation('relu'))\n",
163 |     "    #model.add(Dropout(0.3))\n",
164 |     "\n",
165 |     "    #model.add(Dense(100, init='he_uniform'))\n",
166 |     "    #model.add(Activation('relu'))\n",
167 |     "    #model.add(Dropout(0.3))\n",
168 |     "\n",
169 |     "    model.add(Dense(1, init='he_uniform'))\n",
170 |     "    model.add(Activation('sigmoid'))\n",
171 |     "    model.compile(loss='binary_crossentropy', optimizer='adagrad')\n",
172 |     "    \n",
173 |     "    ### Model fitting takes place ###\n",
174 |     "    model.fit(train_X, train_y, batch_size=512, nb_epoch=epochs, validation_data=(test_X, test_y), verbose=2, shuffle=True)\n",
175 |     "    \n",
176 |     "    preds = model.predict(test_X, verbose=0)\n",
177 |     "    preds_test2 = model.predict(test_X2, verbose=0)\n",
178 |     "    loss = metrics.log_loss(test_y, preds)\n",
179 |     "    return preds.ravel(), loss, preds_test2.ravel()"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "markdown",
184 |    "metadata": {
185 |     "deletable": true,
186 |     "editable": true
187 |    },
188 |    "source": [
189 |     "## Ensembling\n",
190 |     "\n",
191 |     "Codes for basic ensembling methods can be seen in this [github link by MLWave](https://github.com/MLWave/Kaggle-Ensemble-Guide)\n",
192 |     "\n",
193 |     "## Stacking \n",
194 |     "\n",
195 |     "1. [StackNet](https://github.com/kaz-Anova/StackNet) by Marios KazAnova\n",
196 |     "2. [Stacked Ensembles](https://h2o-release.s3.amazonaws.com/h2o/rel-ueno/2/docs-website/h2o-docs/data-science/stacked-ensembles.html) by H2O"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": null,
202 |    "metadata": {
203 |     "collapsed": true
204 |    },
205 |    "outputs": [],
206 |    "source": []
207 |   }
208 |  ],
209 |  "metadata": {
210 |   "kernelspec": {
211 |    "display_name": "Python 2",
212 |    "language": "python",
213 |    "name": "python2"
214 |   },
215 |   "language_info": {
216 |    "codemirror_mode": {
217 |     "name": "ipython",
218 |     "version": 2
219 |    },
220 |    "file_extension": ".py",
221 |    "mimetype": "text/x-python",
222 |    "name": "python",
223 |    "nbconvert_exporter": "python",
224 |    "pygments_lexer": "ipython2",
225 |    "version": "2.7.10"
226 |   }
227 |  },
228 |  "nbformat": 4,
229 |  "nbformat_minor": 2
230 | }
231 | 


--------------------------------------------------------------------------------
/AV_DHS_2017/readme.md:
--------------------------------------------------------------------------------
1 | Codes and materials related to [Analytics Vidhya Datahack Summit workshop 2017](https://www.analyticsvidhya.com/datahacksummit/workshops/the-masterclass-how-to-win-data-science-challenges/) can be seen in this folder
2 | 


--------------------------------------------------------------------------------
/AV_DHS_2018/readme.md:
--------------------------------------------------------------------------------
1 | Codes for DHS 2018 is present here.
2 | 


--------------------------------------------------------------------------------
/AV_Genpact_2018/final_model.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import numpy as np
  3 | import pandas as pd
  4 | from sklearn import preprocessing, model_selection, metrics, ensemble
  5 | import lightgbm as lgb
  6 | 
  7 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, dep=10, seed=0, rounds=20000): 
  8 | 	params = {}
  9 | 	params["objective"] = "regression"
 10 | 	params['metric'] = 'rmse'
 11 | 	params["max_depth"] = dep
 12 | 	params["min_data_in_leaf"] = 100
 13 | 	params["learning_rate"] = 0.04
 14 | 	params["bagging_fraction"] = 0.7
 15 | 	params["feature_fraction"] = 0.5
 16 | 	params["bagging_freq"] = 5
 17 | 	params["bagging_seed"] = seed
 18 | 	#params["lambda_l2"] = 0.01
 19 | 	params["verbosity"] = -1
 20 | 	num_rounds = rounds
 21 | 
 22 | 	plst = list(params.items())
 23 | 	lgtrain = lgb.Dataset(train_X, label=train_y)
 24 | 
 25 | 	if test_y is not None:
 26 | 		lgtest = lgb.Dataset(test_X, label=test_y)
 27 | 		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=200, verbose_eval=100)
 28 | 	else:
 29 | 		lgtest = lgb.Dataset(test_X)
 30 | 		model = lgb.train(params, lgtrain, num_rounds)
 31 | 
 32 | 	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
 33 | 	if test_X2 is not None:
 34 | 		pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
 35 | 	imps = model.feature_importance()
 36 | 	names = model.feature_name()
 37 | 	for fi, fn in enumerate(names):
 38 | 		print(fn, imps[fi])
 39 | 
 40 | 	loss = 0
 41 | 	if test_y is not None:
 42 | 		loss = np.sqrt(metrics.mean_squared_error(test_y, pred_test_y))
 43 | 		print(loss)
 44 | 		return pred_test_y, loss, pred_test_y2, model.best_iteration
 45 | 	else:
 46 | 		return pred_test_y
 47 | 
 48 | 
 49 | def run_model(week_num):
 50 | 	print("WEEK NUMBER IS : ", week_num)
 51 | 	week_shift_map = {
 52 | 	146 : ["target_shift1", "target_shift2", "target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"],
 53 | 	147 : ["target_shift2", "target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"],
 54 | 	148 : ["target_shift3", "target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"],
 55 | 	149 : ["target_shift4", "target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift12", "target_shift13"],
 56 | 	150 : ["target_shift5", "target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"],
 57 | 	151 : ["target_shift6", "target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"],
 58 | 	152 : ["target_shift7", "target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"],
 59 | 	153 : ["target_shift8", "target_shift9", "target_shift10", "target_shift11", "target_shift13"],
 60 | 	154 : ["target_shift9", "target_shift10", "target_shift11", "target_shift13"],
 61 | 	155 : ["target_shift10", "target_shift11", "target_shift13"]
 62 | 	}
 63 | 
 64 | 	train_df = pd.read_csv("../input/train.csv")
 65 | 	test_df = pd.read_csv("../input/test_QoiMO9B.csv")
 66 | 	center_df = pd.read_csv("../input/fulfilment_center_info.csv")
 67 | 	meal_df = pd.read_csv("../input/meal_info.csv")
 68 | 
 69 | 	train_df = pd.merge(train_df, center_df, on="center_id", how="left")
 70 | 	test_df = pd.merge(test_df, center_df, on="center_id", how="left")
 71 | 	train_df = pd.merge(train_df, meal_df, on="meal_id", how="left")
 72 | 	test_df = pd.merge(test_df, meal_df, on="meal_id", how="left")
 73 | 
 74 | 	cat_cols = ["center_type", "category", "cuisine"]
 75 | 	for c in cat_cols:
 76 | 		lbl = preprocessing.LabelEncoder()
 77 | 		lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str')))
 78 | 		train_df[c] = lbl.transform(list(train_df[c].values.astype('str')))
 79 | 		test_df[c] = lbl.transform(list(test_df[c].values.astype('str')))
 80 | 
 81 | 	train_df["discount_ratio"] = train_df["base_price"] / train_df["checkout_price"]
 82 | 	test_df["discount_ratio"] = test_df["base_price"] / test_df["checkout_price"]
 83 | 
 84 | 	train_df["train_set"] = 1
 85 | 	test_df["train_set"] = 0
 86 | 	test_df["num_orders"] = -99
 87 | 
 88 | 	print(train_df.shape)
 89 | 	all_df = pd.concat([train_df, test_df])
 90 | 	all_df = all_df.sort_values(by=["center_id", "meal_id", "week"]).reset_index(drop=True)
 91 | 	print(all_df.shape)
 92 | 	all_df["target_shift1"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(1)
 93 | 	all_df["target_shift2"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(2)
 94 | 	all_df["target_shift3"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(3)
 95 | 	all_df["target_shift4"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(4)
 96 | 	all_df["target_shift5"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(5)
 97 | 	all_df["target_shift6"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(6)
 98 | 	all_df["target_shift7"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(7)
 99 | 	all_df["target_shift8"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(8)
100 | 	all_df["target_shift9"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(9)
101 | 	all_df["target_shift10"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(10)
102 | 	all_df["target_shift11"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(11)
103 | 	all_df["target_shift12"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(12)
104 | 	all_df["target_shift13"] = all_df.groupby(["center_id", "meal_id"])["num_orders"].shift(13)
105 | 
106 | 	all_df["discount_shift1"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(1)
107 | 	all_df["discount_shift2"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(2)
108 | 	all_df["discount_shift3"] = all_df.groupby(["center_id", "meal_id"])["discount_ratio"].shift(3)
109 | 
110 | 	#### center shift features ###
111 | 	#gdf = all_df.groupby(["center_id", "category", "week"])["target_shift11"].agg(['sum']).reset_index()
112 | 	#gdf.columns = ["center_id", "category", "week", "center_week_orders11"]
113 | 	#all_df = all_df.merge(gdf, on=["center_id", "category", "week"], how="left")
114 | 	gdf = all_df.groupby(["category"])["id"].agg(['size']).reset_index()
115 | 	gdf.columns = ["category", "cat_count"]
116 | 	all_df = all_df.merge(gdf, on=["category"], how="left")
117 | 
118 | 	gdf = all_df.groupby(["cuisine"])["id"].agg(['size']).reset_index()
119 | 	gdf.columns = ["cuisine", "cui_count"]
120 | 	all_df = all_df.merge(gdf, on=["cuisine"], how="left")
121 | 
122 | 	gdf = all_df.groupby(["city_code"])["id"].agg(['size']).reset_index()
123 | 	gdf.columns = ["city_code", "city_count"]
124 | 	all_df = all_df.merge(gdf, on=["city_code"], how="left")
125 | 
126 | 	gdf = all_df.groupby(["region_code"])["id"].agg(['size']).reset_index()
127 | 	gdf.columns = ["region_code", "region_count"]
128 | 	all_df = all_df.merge(gdf, on=["region_code"], how="left")
129 | 
130 | 	#gdf = all_df.groupby(["city_code", "category"])["id"].agg(['size']).reset_index()
131 | 	#gdf.columns = ["city_code", "category", "city_cat_count"]
132 | 	#all_df = all_df.merge(gdf, on=["city_code", "category"], how="left")
133 | 
134 | 	#gdf = all_df.groupby(["city_code", "cuisine"])["id"].agg(['size']).reset_index()
135 | 	#gdf.columns = ["city_code", "cuisine", "city_cui_count"]
136 | 	#all_df = all_df.merge(gdf, on=["city_code", "cuisine"], how="left")
137 | 
138 | 	#gdf = all_df.groupby(["region_code", "category"])["id"].agg(['size']).reset_index()
139 | 	#gdf.columns = ["region_code", "category", "region_cat_count"]
140 | 	#all_df = all_df.merge(gdf, on=["region_code", "category"], how="left")
141 | 
142 | 	#gdf = all_df.groupby(["region_code", "cuisine"])["id"].agg(['size']).reset_index()
143 | 	#gdf.columns = ["region_code", "cuisine", "region_cui_count"]
144 | 	#all_df = all_df.merge(gdf, on=["region_code", "cuisine"], how="left")
145 | 
146 | 	### Center count features ###
147 | 	gdf = all_df.groupby(["center_id", "week"])["id"].agg(['size']).reset_index()
148 | 	gdf.columns = ["center_id", "week", "center_week_count"]
149 | 	all_df = all_df.merge(gdf, on=["center_id", "week"], how="left")
150 | 
151 | 	gdf = all_df.groupby(["center_id", "category"])["id"].count().reset_index()
152 | 	gdf.columns = ["center_id", "category", "center_cat_count"]
153 | 	all_df = all_df.merge(gdf, on=["center_id", "category"], how="left")
154 | 
155 | 	gdf = all_df.groupby(["center_id", "category", "week"])["id"].count().reset_index()
156 | 	gdf.columns = ["center_id", "category", "week", "center_cat_week_count"]
157 | 	#gdf = gdf.sort_values(by=["center_id", "category", "week"]).reset_index(drop=True)
158 | 	#gdf["center_cat_week1_count"] = gdf.groupby(["center_id", "category", "week"])["center_cat_week_count"].shift(1)
159 | 	all_df = all_df.merge(gdf, on=["center_id", "category", "week"], how="left")
160 | 
161 | 	gdf = all_df.groupby(["center_id", "cuisine"])["id"].count().reset_index()
162 | 	gdf.columns = ["center_id", "cuisine", "center_cui_count"]
163 | 	all_df = all_df.merge(gdf, on=["center_id", "cuisine"], how="left")
164 | 
165 | 
166 | 	### Meal count features ###
167 | 	gdf = all_df.groupby(["meal_id"])["id"].count().reset_index()
168 | 	gdf.columns = ["meal_id", "meal_count"]
169 | 	all_df = all_df.merge(gdf, on=["meal_id"], how="left")
170 | 
171 | 	gdf = all_df.groupby(["region_code", "meal_id"])["id"].count().reset_index()
172 | 	gdf.columns = ["region_code", "meal_id", "region_meal_count"]
173 | 	all_df = all_df.merge(gdf, on=["region_code", "meal_id"], how="left")
174 | 
175 | 	gdf = all_df.groupby(["meal_id", "week"])["id"].count().reset_index()
176 | 	gdf.columns = ["meal_id", "week", "meal_week_count"]
177 | 	all_df = all_df.merge(gdf, on=["meal_id", "week"], how="left")
178 | 
179 | 	gdf = all_df.groupby(["center_type", "meal_id", "week"])["id"].count().reset_index()
180 | 	gdf.columns = ["center_type", "meal_id", "week", "type_meal_week_count"]
181 | 	all_df = all_df.merge(gdf, on=["center_type", "meal_id", "week"], how="left")
182 | 
183 | 	gdf = all_df.groupby(["region_code", "meal_id", "week"])["id"].count().reset_index()
184 | 	gdf.columns = ["region_code", "meal_id", "week", "region_meal_week_count"]
185 | 	all_df = all_df.merge(gdf, on=["region_code", "meal_id", "week"], how="left")
186 | 
187 | 	gdf = all_df.groupby(["city_code", "meal_id", "week"])["id"].count().reset_index()
188 | 	gdf.columns = ["city_code", "meal_id", "week", "city_meal_week_count"]
189 | 	all_df = all_df.merge(gdf, on=["city_code", "meal_id", "week"], how="left")
190 | 
191 | 	### Price rank ###
192 | 	all_df["meal_price_rank"] = all_df.groupby("meal_id")["checkout_price"].rank()
193 | 	all_df["meal_city_price_rank"] = all_df.groupby(["meal_id", "city_code"])["checkout_price"].rank()
194 | 	all_df["meal_region_price_rank"] = all_df.groupby(["meal_id", "region_code"])["checkout_price"].rank()
195 | 	all_df["meal_week_price_rank"] = all_df.groupby(["meal_id", "week"])["checkout_price"].rank()
196 | 
197 | 	all_df["center_price_rank"] = all_df.groupby("center_id")["checkout_price"].rank()
198 | 	all_df["center_week_price_rank"] = all_df.groupby(["center_id", "week"])["checkout_price"].rank()
199 | 	all_df["center_cat_price_rank"] = all_df.groupby(["center_id", "category"])["checkout_price"].rank()
200 | 
201 | 	### Week features ###
202 | 	gdf = all_df.groupby(["meal_id"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index()
203 | 	gdf.columns = ["meal_id", "meal_price_min", "meal_price_max", "meal_price_mean", "meal_price_std"]
204 | 	all_df = all_df.merge(gdf, on=["meal_id"], how="left")
205 | 
206 | 	gdf = all_df.groupby(["meal_id"])["base_price"].agg(["min", "max", "mean", "std"]).reset_index()
207 | 	gdf.columns = ["meal_id", "disc_price_min", "disc_price_max", "disc_price_mean", "disc_price_std"]
208 | 	all_df = all_df.merge(gdf, on=["meal_id"], how="left")
209 | 
210 | 	gdf = all_df.groupby(["city_code","meal_id", "week"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index()
211 | 	gdf.columns = ["city_code", "meal_id", "week", "meal_price2_min", "meal_price2_max", "meal_price2_mean", "meal_price2_std"]
212 | 	all_df = all_df.merge(gdf, on=["city_code", "meal_id", "week"], how="left")
213 | 
214 | 	gdf = all_df.groupby(["city_code", "category"])["checkout_price"].agg(["mean", "std"]).reset_index()
215 | 	gdf.columns = ["city_code", "category", "meal_price3_mean", "meal_price3_std"]
216 | 	all_df = all_df.merge(gdf, on=["city_code", "category"], how="left")
217 | 
218 | 	#gdf = all_df.groupby(["region_code","meal_id", "week"])["checkout_price"].agg(["min", "max", "mean", "std"]).reset_index()
219 | 	#gdf.columns = ["region_code", "meal_id", "week", "meal_price4_min", "meal_price4_max", "meal_price4_mean", "meal_price4_std"]
220 | 	#all_df = all_df.merge(gdf, on=["region_code", "meal_id", "week"], how="left")
221 | 
222 | 
223 | 	### New ones ###
224 | 	#all_df["ratio1"] = all_df["target_shift10"] / all_df["op_area"]	
225 | 	#all_df["ratio2"] = all_df["target_shift10"] / all_df["checkout_price"]	
226 | 
227 | 	### overall mean sum ###
228 | 	#gdf = all_df.groupby(["meal_id", "week"])["target_shift10"].sum().reset_index()
229 | 	#gdf.columns = ["meal_id", "week", "city_meal_week_lag10"]
230 | 	#all_df = all_df.merge(gdf, on=["meal_id", "week"], how="left")
231 | 
232 | 	train_df = all_df[all_df["train_set"]==1].reset_index(drop=True)
233 | 	test_df = all_df[all_df["train_set"]==0].reset_index(drop=True)
234 | 	test_df = test_df[test_df["week"] == week_num].reset_index(drop=True)
235 | 
236 | 	dev_df = train_df[train_df["week"]<=135]	
237 | 	#dev_df = dev_df[dev_df["week"]>20]	
238 | 	val_df = train_df[train_df["week"]>135]
239 | 	train_y = np.log1p(train_df["num_orders"].values)
240 | 	dev_y = np.log1p(dev_df["num_orders"].values)
241 | 	val_y = np.log1p(val_df["num_orders"].values)
242 | 	cols_to_use = ["center_id", "meal_id", "checkout_price", "base_price", "discount_ratio", "emailer_for_promotion", "homepage_featured"]
243 | 	cols_to_use += ["city_code","region_code","center_type","op_area"]
244 | 	cols_to_use += ["category", "cuisine"]
245 | 	cols_to_use += ["cat_count", "cui_count", "city_count", "region_count"]
246 | 	#cols_to_use += ["city_cat_count", "city_cui_count", "region_cat_count", "region_cui_count"]
247 | 	cols_to_use += ["center_cat_count", "center_cui_count", "center_week_count"]
248 | 	cols_to_use += ["meal_week_count", "type_meal_week_count", "region_meal_week_count", "city_meal_week_count", "meal_count", "region_meal_count"]
249 | 	cols_to_use += ["meal_price_rank", "meal_city_price_rank", "meal_region_price_rank", "meal_week_price_rank"]
250 | 	cols_to_use += ["center_price_rank", "center_cat_price_rank", "center_week_price_rank"]
251 | 	cols_to_use += ["meal_price_min", "meal_price_max", "meal_price_mean", "meal_price_std"]
252 | 	cols_to_use += ["disc_price_min", "disc_price_max", "disc_price_mean", "disc_price_std"]
253 | 	cols_to_use += ["meal_price2_min", "meal_price2_max", "meal_price2_mean", "meal_price2_std"]
254 | 	cols_to_use += ["meal_price3_mean", "meal_price3_std"]
255 | 	cols_to_use += week_shift_map[week_num]
256 | 
257 | 	train_X = train_df[cols_to_use]
258 | 	dev_X = dev_df[cols_to_use]
259 | 	val_X = val_df[cols_to_use]
260 | 	test_X = test_df[cols_to_use]
261 | 	print(val_X.tail())
262 | 
263 | 	pred_val, loss, pred_test, nrounds = runLGB(dev_X, dev_y, val_X, val_y, test_X)
264 | 	pred_test1 = runLGB(train_X, train_y, test_X, rounds=nrounds)
265 | 	pred_test2 = runLGB(train_X, train_y, test_X, rounds=nrounds, seed=2018)
266 | 	pred_test = 0.5*pred_test1 + 0.5*pred_test2
267 | 
268 | 	test_id = list(test_df["id"].values)
269 | 	test_preds = list(np.expm1(pred_test))
270 | 	return test_id, test_preds, loss
271 | 
272 | if __name__ == "__main__":
273 | 	test_ids = [] 
274 | 	preds = []
275 | 	cv = []
276 | 	for week_num in [146, 147, 148, 149, 150, 151, 152, 153, 154, 155]:
277 | 		ids, prs, ll = run_model(week_num)
278 | 		test_ids.extend(ids)
279 | 		preds.extend(prs)
280 | 		cv.append(ll)
281 | 		print(cv)
282 | 	sub_df = pd.DataFrame({"id":test_ids})
283 | 	sub_df["num_orders"] = preds
284 | 	sub_df.to_csv("sub8.csv", index=False)
285 | 


--------------------------------------------------------------------------------
/AV_Genpact_2018/readme.md:
--------------------------------------------------------------------------------
1 | Code for the Genpact Hackathon
2 | https://datahack.analyticsvidhya.com/contest/genpact-machine-learning-hackathon/
3 | 


--------------------------------------------------------------------------------
/AV_Hack3/buildModel.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Code for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong !
  4 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838
  5 | __author__ : SRK
  6 | """
  7 | import sys
  8 | import numpy as np
  9 | import pandas as pd
 10 | from sklearn.preprocessing import LabelEncoder
 11 | from sklearn.cross_validation import KFold
 12 | from sklearn import ensemble
 13 | from sklearn.metrics import mean_squared_error
 14 | import xgboost as xgb
 15 | 
 16 | if __name__ == "__main__":
 17 | 	# setting the input path and reading the data into dataframe #
 18 | 	data_path = "../Data/"
 19 | 	train = pd.read_csv(data_path+"Train.csv")
 20 | 	test = pd.read_csv(data_path+"Test.csv")
 21 | 
 22 | 	## mapping the var8 with the  given data and create a new column ##
 23 | 	var8_map_dict = {"HXYB":0, "HXYC":0, "HXYD":0, "HXYE":0, "HXYF":1, "HXFG":1, "HXYH":1, "HXYI":1, "HXYJ":2, "HXYK":2, "HXYL":2, "HXYM":3, "HXYN":3, "HXYO":3}
 24 | 	train_var8_map = []
 25 | 	for var_val in train["Var8"]:
 26 | 		if var8_map_dict.has_key(var_val):
 27 | 			train_var8_map.append(var8_map_dict[var_val])
 28 | 		else:
 29 | 			train_var8_map.append(4)  # just in case if the value is missing in dict, assign 4
 30 | 	test_var8_map = []
 31 | 	for var_val in test["Var8"]:
 32 |                 if var8_map_dict.has_key(var_val):
 33 |                         test_var8_map.append(var8_map_dict[var_val])
 34 |                 else:
 35 |                         test_var8_map.append(4)
 36 | 	train["Var8Map"] = train_var8_map
 37 | 	test["Var8Map"] = test_var8_map
 38 | 
 39 | 	## categical column name list ##
 40 | 	categorical_columns = ['Var4', 'institute_city', 'institute_state', 'Var8', 'institute_country', 'Var10', 'Var11', 'Var12', 'Var13', 'Var14', 'Var15', 'Instructor_Past_Performance', 'Instructor_Association_Industry_Expert', 'project_subject', 'subject_area', 'secondary_subject', 'secondary_area', 'Resource_Category', 'Resource_Sub_Category', 'Var23', 'Var24']
 41 | 
 42 | 	## Getting the ID and DV from the data frame ##
 43 | 	train_y = np.array(train["Project_Valuation"])
 44 | 	train_y[train_y>6121] = 6121
 45 | 	train_id = np.array(train["ID"])
 46 | 	test_id = np.array(test["ID"])
 47 | 
 48 | 	## Creating the IDVs from the train and test dataframe ##
 49 | 	train_X = train.copy()
 50 | 	test_X = test.copy()
 51 | 
 52 | 	## Fill up the na values with -999 ##
 53 | 	train_X = train_X.fillna(-999)
 54 | 	test_X = test_X.fillna(-999)
 55 | 
 56 | 	## One hot encoding the categorical variables ##
 57 | 	for var in categorical_columns:
 58 | 		lb = LabelEncoder()
 59 | 		full_var_data = pd.concat((train_X[var],test_X[var]),axis=0).astype('str')
 60 | 		lb.fit( full_var_data )
 61 | 		train_X[var] = lb.transform(train_X[var].astype('str'))
 62 | 		test_X[var] = lb.transform(test_X[var].astype('str'))
 63 | 
 64 | 	## Dropping the unnecessary columns from IDVs ##
 65 | 	train_X = np.array( train_X.drop(['ID','Project_Valuation'],axis=1) )
 66 | 	test_X = np.array( test_X.drop(['ID','Unnamed: 26'],axis=1) )
 67 | 	print "Train shape is : ",train_X.shape
 68 | 	print "Test shape is : ",test_X.shape
 69 | 
 70 | 	
 71 | 	################################ MODEL BUILDING ##################################################
 72 |         print "Building RF1"
 73 |         reg = ensemble.RandomForestRegressor(n_estimators=500, max_depth=None, min_samples_leaf=7, max_features="auto", n_jobs=4, random_state=0)
 74 |         reg.fit(train_X, train_y)
 75 |         pred_test_y_rf1 = reg.predict(test_X)
 76 | 
 77 |         print "Building RF2"
 78 |         reg = ensemble.RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=2, max_features=0.8, n_jobs=4, random_state=0)
 79 |         reg.fit(train_X, train_y)
 80 |         pred_test_y_rf2 = reg.predict(test_X)
 81 | 
 82 |         print "Building GB1"
 83 |         reg = ensemble.GradientBoostingRegressor(n_estimators=400, max_depth=7, min_samples_leaf=8, max_features=0.3, subsample=0.6, learning_rate=0.01, random_state=0)
 84 |         reg.fit(train_X, train_y)
 85 |         pred_test_y_gb1 = reg.predict(test_X)
 86 | 
 87 |         print "Building GB2"
 88 |         reg = ensemble.GradientBoostingRegressor(n_estimators=600, max_depth=6, min_samples_leaf=8, max_features=0.3, subsample=0.6, learning_rate=0.01, random_state=0)
 89 |         reg.fit(train_X, train_y)
 90 |         pred_test_y_gb2 = reg.predict(test_X)
 91 | 
 92 | 	print "Building XGB1"
 93 |         params = {}
 94 |         params["objective"] = "reg:linear"
 95 |         params["eta"] = 0.005
 96 |         params["min_child_weight"] = 10
 97 |         params["subsample"] = 0.7
 98 |         params["colsample_bytree"] = 0.6
 99 |         params["scale_pos_weight"] = 0.8
100 |         params["silent"] = 1
101 |         params["max_depth"] = 5
102 |         params["max_delta_step"]=2
103 |         params["seed"] = 0
104 |         plst = list(params.items())
105 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
106 |         xgtest = xgb.DMatrix(test_X)
107 |         num_rounds = 1100
108 |         model = xgb.train(plst, xgtrain, num_rounds)
109 |         pred_test_y_xgb1 = model.predict(xgtest)
110 | 
111 | 	print "Building XGB2"
112 | 	params = {}
113 |         params["objective"] = "reg:linear"
114 |         params["eta"] = 0.005
115 |         params["min_child_weight"] = 6
116 |         params["subsample"] = 0.7
117 |         params["colsample_bytree"] = 0.6
118 |         params["scale_pos_weight"] = 0.8
119 |         params["silent"] = 1
120 |         params["max_depth"] = 8
121 |         params["max_delta_step"]=2
122 |         params["seed"] = 0
123 |         plst = list(params.items())
124 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
125 |         xgtest = xgb.DMatrix(test_X)
126 |         num_rounds = 800
127 |         model = xgb.train(plst, xgtrain, num_rounds)
128 |         pred_test_y_xgb2 = model.predict(xgtest)
129 | 
130 | 	## Averaging the six models ##
131 |         pred_test_y = 0.15*pred_test_y_rf1 + 0.15*pred_test_y_rf2 + 0.15*pred_test_y_gb1 + 0.15*pred_test_y_gb2 + 0.2*pred_test_y_xgb1 + 0.2*pred_test_y_xgb2
132 | 
133 | 	## Writing the submission file ##
134 |         out_df = pd.DataFrame({"ID":test_id, "Project_Valuation":pred_test_y})
135 |         out_df.to_csv("sub_2.csv", index=False)
136 | 		
137 | 


--------------------------------------------------------------------------------
/AV_Hack3/readme.md:
--------------------------------------------------------------------------------
 1 | ##### Codes for Analytics Vidhya Online Hackathon 3.0 - Find the Next Brain Wong !
 2 | 
 3 | http://discuss.analyticsvidhya.com/t/online-hackathon-3-0-find-the-next-brain-wong/2838
 4 | 
 5 | ###### My approach for the hackathon is as follows:
 6 | 
 7 | 1. Converted all the categorical variables into one-hot encoded variables
 8 | 
 9 | 2. Truncate the "Project Evaluation" value at 99.9th percentile value (value is 6121) 
10 | 
11 | 3. Built tree based models by selecting the params through cross validation
12 | 
13 |         a. Random Forest (2 models with different params - 1 with shorter trees and 1 with deep trees)
14 | 
15 |         b. Gradient Boosting (2 models with different params)
16 | 
17 |         c. Extreme Gradient Boosting (2 models with different params)
18 | 
19 | 4. Simple weighted average of all the six models based on local validation
20 | 
21 | 


--------------------------------------------------------------------------------
/AV_Hackathon_July11/benchmark.R:
--------------------------------------------------------------------------------
 1 | ## setting the working directory ##
 2 | setwd("../Data/")
 3 | 
 4 | ## reading the train and test files ##
 5 | train = read.csv("train.csv")
 6 | test = read.csv("test.csv")
 7 | 
 8 | ## removing the categorical columns for benchmark script. Create dummy variables for further improvement ##
 9 | test_id = test["id"]
10 | train["id"] = NULL
11 | test["id"] = NULL
12 | train["Category_article"] = NULL
13 | test["Category_article"] = NULL
14 | train["Day_of_publishing"] = NULL
15 | test["Day_of_publishing"] = NULL
16 | 
17 | ## creating a linear regression model and predicting on teset set ##
18 | ## change the modeling methodology and try different models ##
19 | model = lm(shares~., data=train)
20 | summary(model)
21 | preds = predict(model, test, type='response')
22 | 
23 | ## writing the outputs to csv file ##
24 | out_df = data.frame(test_id, preds)
25 | names(out_df) = c("id", "predictions")
26 | write.csv(out_df, "benchmark_R.csv", row.names=F, quote=F)
27 | 


--------------------------------------------------------------------------------
/AV_Hackathon_July11/benchmark.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Benchmark script for Analytics Vidhya Online Hackathon using Linear Regression. 
 4 | __author__ : SRK
 5 | Date : July 11, 2015
 6 | """
 7 | 
 8 | import pandas as pd
 9 | import numpy as np
10 | from sklearn.feature_extraction import DictVectorizer
11 | from sklearn.linear_model import LinearRegression
12 | 
13 | if __name__ == "__main__":
14 | 	## specify the location of input files ##
15 | 	data_path = "../Data/"
16 | 	train_file = data_path + "train.csv"
17 | 	test_file = data_path + "test.csv"
18 | 	names_categorical = ['Category_article', 'Day_of_publishing']
19 | 
20 | 	## creating pandas data frame for train and test ##
21 | 	train = pd.read_csv(train_file)
22 | 	test = pd.read_csv(test_file)
23 | 
24 | 	# strpping the leading space in column names (some of them have leading spaces while reading using pandas read_csv) #
25 | 	train.columns =  [i.strip() for i in list(train.columns.values)]
26 | 	test.columns = [i.strip() for i in list(test.columns.values)]
27 | 
28 | 	## getting the DV and ID values ##
29 | 	train_y = train["shares"]
30 | 	train_id = train["id"]
31 | 	test_id = test["id"]
32 | 
33 | 	## dropping the categorical columns, ID and DV from dataframe ##
34 | 	train_X = train.drop( ["id"]+names_categorical+["shares"], axis=1)
35 | 	test_X = test.drop( ["id"]+names_categorical, axis=1)
36 | 	print "Train, test shape : ", train_X.shape, test_X.shape
37 | 
38 | 	## building a linear regression model and predicting on test set ##
39 | 	lm_model = LinearRegression()
40 | 	lm_model.fit(train_X, train_y)
41 | 	pred_test_y = lm_model.predict(test_X)
42 | 
43 | 	## Writing it to output csv files ##
44 | 	out_df = pd.DataFrame({"id":test_id, "predictions":pred_test_y})
45 | 	out_df.to_csv("benchmark.csv", index=False)
46 | 


--------------------------------------------------------------------------------
/AV_Hackathon_July11/readme.md:
--------------------------------------------------------------------------------
1 | This folder has codes for Analytics Vidhya Hackathon held on July 11,2015 
2 | 


--------------------------------------------------------------------------------
/AV_Knocktober/getOutcome.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | data_path = "../input/Train/"
 5 | first_camp = pd.read_csv( data_path + "First_Health_Camp_Attended.csv" )
 6 | second_camp = pd.read_csv( data_path + "Second_Health_Camp_Attended.csv" )
 7 | third_camp = pd.read_csv( data_path + "Third_Health_Camp_Attended.csv" )
 8 | print first_camp.shape, second_camp.shape, third_camp.shape
 9 | 
10 | col_names = [['Patient_ID','Health_Camp_ID','Outcome']]
11 | first_camp = first_camp[['Patient_ID','Health_Camp_ID','Health_Score']]
12 | first_camp.columns = col_names
13 | second_camp = second_camp[['Patient_ID','Health_Camp_ID','Health Score']]
14 | second_camp.columns = col_names
15 | third_camp = third_camp[['Patient_ID','Health_Camp_ID','Number_of_stall_visited']]
16 | third_camp = third_camp[third_camp['Number_of_stall_visited']>0]
17 | third_camp.columns = col_names
18 | print third_camp.shape
19 | 
20 | all_camps = pd.concat([first_camp, second_camp, third_camp])
21 | all_camps['Outcome'] = 1
22 | print all_camps.shape
23 | 
24 | train = pd.read_csv(data_path + "Train.csv")
25 | print train.shape
26 | 
27 | train = train.merge(all_camps, on=['Patient_ID','Health_Camp_ID'], how='left')
28 | train['Outcome'] = train['Outcome'].fillna(0).astype('int')
29 | train.to_csv(data_path+'train_with_outcome.csv', index=False)
30 | print train.Outcome.value_counts()
31 | 


--------------------------------------------------------------------------------
/AV_Knocktober/readme.md:
--------------------------------------------------------------------------------
1 | Codes and Files used for [AV Data hack](https://datahack.analyticsvidhya.com/contest/all/) - [Knocktober](https://datahack.analyticsvidhya.com/contest/knocktober-2016/)
2 | 
3 | We ([Rohan Rao](https://github.com/rohanrao91) and myself) finished first in this competition and the leaderboard can be accessed [here](https://datahack.analyticsvidhya.com/contest/knocktober-2016/lb)
4 | 
5 | The code file - vopani_final.R is written by Rohan Rao and you can see more about his comments [here](https://github.com/rohanrao91/AnalyticsVidhya_Knocktober)
6 | 
7 | The code file - srk_final.py is written by me. Finally we blended both our models which ended up at first position.
8 | 


--------------------------------------------------------------------------------
/AV_Knocktober/srk_final.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import operator
  3 | import pandas as pd
  4 | import numpy as np
  5 | from sklearn import preprocessing, model_selection, metrics, ensemble
  6 | import xgboost as xgb
  7 | 
  8 | def getCountVar(compute_df, count_df, var_name, count_var="v1"):
  9 |     grouped_df = count_df.groupby(var_name, as_index=False).agg('size').reset_index()
 10 |     grouped_df.columns = [var_name, "var_count"]
 11 |     merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 12 |     merged_df.fillna(-1, inplace=True)
 13 |     return list(merged_df["var_count"])
 14 | 
 15 | def create_feature_map(features):
 16 | 	outfile = open('xgb.fmap', 'w')
 17 | 	for i, feat in enumerate(features):
 18 | 		outfile.write('{0}\t{1}\tq\n'.format(i,feat))
 19 | 	outfile.close()
 20 | 
 21 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, extra_X=None, seed_val=0, num_rounds=200):
 22 | 	params = {}
 23 | 	params["objective"] = "binary:logistic"
 24 | 	params['eval_metric'] = 'auc'
 25 | 	params["eta"] = 0.02 
 26 | 	params["subsample"] = 0.8
 27 | 	params["min_child_weight"] = 5
 28 | 	params["colsample_bytree"] = 0.7
 29 | 	params["max_depth"] = 6
 30 | 	params["silent"] = 1
 31 | 	params["seed"] = seed_val
 32 | 
 33 | 	plst = list(params.items())
 34 | 	xgtrain = xgb.DMatrix(train_X, label=train_y)
 35 | 
 36 | 	if test_y is not None:
 37 | 		xgtest = xgb.DMatrix(test_X, label=test_y)
 38 | 		watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
 39 | 		model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=300)
 40 | 	else:
 41 | 		xgtest = xgb.DMatrix(test_X)
 42 | 		model = xgb.train(plst, xgtrain, num_rounds)
 43 | 
 44 | 	if feature_names is not None:
 45 | 		create_feature_map(feature_names)
 46 | 		model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
 47 | 		importance = model.get_fscore(fmap='xgb.fmap')
 48 | 		importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
 49 | 		imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
 50 | 		imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
 51 | 		imp_df.to_csv("imp_feat.txt", index=False)
 52 | 
 53 | 	pred_test_y = model.predict(xgtest)
 54 | 	loss = 0
 55 | 
 56 | 	if extra_X is not None:
 57 | 		xgtest = xgb.DMatrix(extra_X)
 58 | 		pred_extra_y = model.predict(xgtest)
 59 | 		return pred_test_y, pred_extra_y, loss 
 60 | 
 61 | 	if test_y is not None:
 62 | 		loss = metrics.roc_auc_score(test_y, pred_test_y)
 63 | 		print loss
 64 | 		return pred_test_y, loss
 65 | 	else:
 66 | 	    return pred_test_y,loss
 67 | 
 68 | if __name__ == "__main__":
 69 | 	## Reading the files and converting the dates ##
 70 | 	data_path = "../input/Train/"
 71 | 	train = pd.read_csv(data_path + "train_with_outcome.csv") 
 72 | 	test = pd.read_csv(data_path + "Test.csv") 
 73 | 	train['Registration_Date'].fillna('10-jan-90', inplace=True)
 74 | 	test['Registration_Date'].fillna('10-jan-90', inplace=True)
 75 | 	train['Registration_Date'] = pd.to_datetime(train['Registration_Date'], format="%d-%b-%y")
 76 | 	test['Registration_Date'] = pd.to_datetime(test['Registration_Date'], format="%d-%b-%y")
 77 | 	train['Registration_Date'] = train['Registration_Date'].apply(lambda x: x.toordinal())
 78 | 	test['Registration_Date'] = test['Registration_Date'].apply(lambda x: x.toordinal())
 79 | 	print train.shape, test.shape
 80 | 
 81 | 	## Getting patient details and merging with train and test ##
 82 | 	patient = pd.read_csv(data_path + "Patient_Profile.csv", na_values=['None',''])
 83 | 	patient['First_Interaction'] = pd.to_datetime(patient['First_Interaction'], format="%d-%b-%y")
 84 | 	patient['First_Interaction'] = patient['First_Interaction'].apply(lambda x: x.toordinal())
 85 | 	print patient.shape
 86 | 	train = train.merge(patient, on=['Patient_ID'], how='left')
 87 | 	test = test.merge(patient, on=['Patient_ID'], how='left')
 88 | 	print train.shape, test.shape
 89 | 	
 90 | 	## Getting health camp details and merging with train and test ##
 91 | 	hc_details = pd.read_csv(data_path + "Health_Camp_Detail.csv")
 92 | 	hc_ids = list(hc_details.Health_Camp_ID.values)
 93 | 	hc_details['Camp_Start_Date'] = pd.to_datetime(hc_details['Camp_Start_Date'], format="%d-%b-%y")
 94 | 	hc_details['Camp_End_Date'] = pd.to_datetime(hc_details['Camp_End_Date'], format="%d-%b-%y")
 95 | 	hc_details['Camp_Start_Date'] = hc_details['Camp_Start_Date'].apply(lambda x: x.toordinal())
 96 | 	hc_details['Camp_End_Date'] = hc_details['Camp_End_Date'].apply(lambda x: x.toordinal())
 97 | 	hc_details['Camp_Duration_Days'] = hc_details['Camp_End_Date'] - hc_details['Camp_Start_Date']
 98 | 	print hc_details.head()
 99 | 	train = train.merge(hc_details, on=['Health_Camp_ID'], how='left')
100 | 	test = test.merge(hc_details, on=['Health_Camp_ID'], how='left')
101 | 	print train.shape, test.shape
102 | 
103 | 	## Reading the camp files ##
104 | 	first_camp_details = pd.read_csv(data_path + "First_Health_Camp_Attended.csv")
105 | 	first_camp_details = first_camp_details[["Patient_ID","Health_Camp_ID","Donation","Health_Score"]]
106 | 	train = train.merge(first_camp_details, on=["Patient_ID","Health_Camp_ID"], how='left')
107 | 	third_camp_details = pd.read_csv(data_path + "Third_Health_Camp_Attended.csv")
108 | 	third_camp_details = third_camp_details[["Patient_ID","Health_Camp_ID","Number_of_stall_visited","Last_Stall_Visited_Number"]]	
109 | 	train = train.merge(third_camp_details, on=["Patient_ID","Health_Camp_ID"], how='left')
110 | 	train["Number_of_stall_visited"].fillna(0, inplace=True)
111 | 	train["Donation"].fillna(0, inplace=True)
112 | 	train["Health_Score"].fillna(0, inplace=True)
113 | 	print train.shape, test.shape
114 | 
115 | 
116 | 	## Filling NA with -99 ##
117 | 	train.fillna(-99, inplace=True)	
118 | 	test.fillna(-99, inplace=True)
119 | 
120 | 	## print create additional features ##
121 | 	print "Getting additional features."
122 | 	train["Diff_CampStart_Registration"] = train["Camp_Start_Date"] - train["Registration_Date"]
123 | 	test["Diff_CampStart_Registration"] = test["Camp_Start_Date"] - test["Registration_Date"]
124 | 
125 | 	train["Diff_CampEnd_Registration"] = train["Camp_End_Date"] - train["Registration_Date"]
126 | 	test["Diff_CampEnd_Registration"] = test["Camp_End_Date"] - test["Registration_Date"]
127 | 
128 | 	train["Diff_Registration_FirstInteraction"] = train["Registration_Date"] - train["First_Interaction"]
129 | 	test["Diff_Registration_FirstInteraction"] = test["Registration_Date"] - test["First_Interaction"]
130 | 
131 | 	train["Diff_CampStart_FirstInteraction"] = train["Camp_Start_Date"] - train["First_Interaction"]
132 | 	test["Diff_CampStart_FirstInteraction"] = test["Camp_Start_Date"] - test["First_Interaction"]
133 | 	print train.shape, test.shape
134 | 
135 | 	## Getitng the cat columns and label encode them ##
136 | 	cat_columns = []
137 | 	for col in train.columns:
138 | 		if train[col].dtype == 'object':
139 | 			print col
140 | 			cat_columns.append(col)
141 | 			enc = preprocessing.LabelEncoder()
142 | 			full_list = list(train[col].values) + list(test[col].values)
143 | 			enc.fit(full_list)
144 | 			train[col] = enc.transform(list(train[col].values))
145 | 			test[col]  = enc.transform(list(test[col].values))
146 | 
147 | 	# getting count #
148 | 	for col in ["Patient_ID", "Health_Camp_ID"]:
149 | 		print "Count : ", col
150 | 		full_df = pd.concat([train, test])
151 | 		train["Count_"+col] = getCountVar(train, full_df, col)
152 | 		test["Count_"+col] = getCountVar(test, full_df, col)
153 | 
154 | 
155 | 	## do sorting so as to compute the next variables ##
156 | 	train = train.sort_values(['Camp_Start_Date', 'Camp_End_Date', 'Patient_ID']).reset_index(drop=True)
157 | 	test = test.sort_values(['Camp_Start_Date', 'Camp_End_Date', 'Patient_ID']).reset_index(drop=True)
158 | 	print train.head()
159 | 
160 | 	print "First pass to get necessary details.."
161 | 	people_camp_dict = {}
162 | 	people_date_dict = {}
163 | 	people_dv_dict = {}
164 | 	people_cat1_dict = {}
165 | 	people_cdate_dict = {}
166 | 	people_donation_dict = {}
167 | 	people_num_stall_dict = {}
168 | 	people_last_stall_dict = {}
169 | 	people_fscore_dict = {}
170 | 	for ind, row in train.iterrows():
171 | 		pid = row['Patient_ID']
172 | 		cid = row['Health_Camp_ID']
173 | 		reg_date = row['Registration_Date']
174 | 		dv = row['Outcome']
175 | 		cat1 = row['Category1']
176 | 		cdate = row['Camp_Start_Date']
177 | 		donation = row['Donation']
178 | 		num_stall = row['Number_of_stall_visited']
179 | 		fscore = row['Health_Score']
180 | 	
181 | 		tlist = people_camp_dict.get(pid,[])
182 | 		tlist.append(cid)
183 | 		people_camp_dict[pid] = tlist[:]
184 | 
185 | 		tlist = people_date_dict.get(pid,[])
186 | 		tlist.append(reg_date)
187 | 		people_date_dict[pid] = tlist[:]
188 | 
189 | 		tlist = people_dv_dict.get(pid, [])
190 | 		tlist.append(dv)
191 | 		people_dv_dict[pid] = tlist[:]
192 | 
193 | 		tlist = people_donation_dict.get(pid, [])
194 | 		tlist.append(donation)
195 | 		people_donation_dict[pid] = tlist[:]
196 | 	
197 | 		tlist = people_num_stall_dict.get(pid, [])
198 | 		tlist.append(num_stall)
199 | 		people_num_stall_dict[pid] = tlist[:]
200 | 
201 | 		tlist = people_fscore_dict.get(pid, [])
202 | 		tlist.append(fscore)
203 | 		people_fscore_dict[pid] = tlist[:]
204 | 
205 | 		tlist = people_cat1_dict.get(pid, [])
206 | 		tlist.append(cat1)
207 | 		people_cat1_dict[pid] = tlist[:]
208 | 
209 | 		tlist = people_cdate_dict.get(pid, [])
210 | 		tlist.append(cdate)
211 | 		people_cdate_dict[pid] = tlist[:]
212 | 
213 | 	print "Creating features now using dict for train.."
214 | 	last_date_list = []
215 | 	last_dv_list = []
216 | 	last_cat1_list = []
217 | 	mean_dv_list = []
218 | 	last_cdate_list = []
219 | 	last_donation_list = []
220 | 	last_num_stall_list = []
221 | 	last_fscore_list=[]
222 | 	for ind, row in train.iterrows():
223 | 		pid = row['Patient_ID']
224 | 		reg_date = row['Registration_Date']
225 | 		cat1 = row['Category1']
226 | 		cid = row['Health_Camp_ID']
227 | 		cdate = row['Camp_Start_Date']
228 | 
229 | 		camp_list = people_camp_dict[pid]
230 | 		for ind, camp in enumerate(camp_list):
231 | 			if camp == cid:
232 | 				use_index = ind
233 | 				break
234 | 	
235 | 		tlist = people_date_dict[pid][:use_index]
236 | 		if len(tlist)>0:
237 | 			last_date_list.append(reg_date-tlist[-1])
238 | 		else:
239 | 			last_date_list.append(-99)
240 | 
241 | 		tlist = people_dv_dict[pid][:use_index]
242 | 		if len(tlist)>0:
243 | 			last_dv_list.append(tlist[-1])
244 | 			mean_dv_list.append(np.mean(tlist))
245 | 		else:
246 | 			last_dv_list.append(-99)
247 | 			mean_dv_list.append(-99)
248 | 
249 | 		tlist = people_donation_dict[pid][:use_index]
250 | 		if len(tlist)>0:
251 | 			last_donation_list.append(np.sum(tlist))
252 | 		else:
253 | 			last_donation_list.append(-99)
254 | 
255 | 		tlist = people_num_stall_dict[pid][:use_index]
256 | 		if len(tlist)>0:
257 | 			last_num_stall_list.append(np.sum(tlist))
258 | 		else:
259 | 			last_num_stall_list.append(-99)
260 | 
261 | 		tlist = people_fscore_dict[pid][:use_index]
262 | 		if len(tlist)>0:
263 | 			last_fscore_list.append(np.mean([i for i in tlist if i!=0]))
264 | 		else:
265 | 			last_fscore_list.append(-99)
266 | 
267 | 		tlist = people_cat1_dict[pid][:use_index]
268 | 		if len(tlist)>0:
269 | 			last_cat1_list.append(tlist[-1])
270 | 		else:
271 | 			last_cat1_list.append(-99)
272 | 
273 | 		tlist = people_date_dict[pid][use_index+1:]
274 | 		if len(tlist)>0:
275 | 			last_cdate_list.append(reg_date-tlist[0]) 
276 | 		else:	
277 | 			last_cdate_list.append(-99)
278 | 
279 | 	print last_fscore_list[:50]
280 | 
281 | 	train["Last_Reg_Date"] = last_date_list[:]
282 | 	train["Mean_Outcome"] = mean_dv_list[:]
283 | 	train["Last_Cat1"] = last_cat1_list[:]
284 | 	train["Next_Reg_Date"] = last_cdate_list
285 | 	train["Sum_Donations"] = last_donation_list[:]
286 | 	train["Sum_NumStalls"] = last_num_stall_list[:]
287 | 	train["Mean_Fscore"] = last_fscore_list[:]
288 | 			
289 | 	print "Prepare dict using val.."
290 | 	for ind, row in test.iterrows():
291 | 		pid = row['Patient_ID']
292 | 		cid = row['Health_Camp_ID']
293 | 		reg_date = row['Registration_Date']
294 | 		cat1 = row['Category1']
295 | 		cdate = row['Camp_Start_Date']
296 | 		
297 | 		tlist = people_camp_dict.get(pid,[])
298 | 		tlist.append(cid)
299 | 		people_camp_dict[pid] = tlist[:]
300 | 		
301 | 		tlist = people_date_dict.get(pid,[])
302 | 		tlist.append(reg_date)
303 | 		people_date_dict[pid] = tlist[:]
304 | 		
305 | 		tlist = people_cat1_dict.get(pid, [])
306 | 		tlist.append(cat1)
307 | 		people_cat1_dict[pid] = tlist[:]
308 | 
309 | 		tlist = people_cdate_dict.get(pid, [])
310 | 		tlist.append(cdate)
311 | 		people_cdate_dict[pid] = tlist[:]
312 | 	
313 | 	print "Creating features for val using dict.."	
314 | 	last_date_list = []
315 | 	last_dv_list = []
316 | 	last_cat1_list = []
317 | 	mean_dv_list = []
318 | 	last_cdate_list = []
319 | 	last_donation_list = []
320 | 	last_num_stall_list = []
321 | 	last_fscore_list = []
322 | 	for ind, row in test.iterrows():
323 | 		pid = row['Patient_ID']
324 | 		reg_date = row['Registration_Date']
325 | 		cat1 = row['Category1']
326 | 		cid = row['Health_Camp_ID']
327 | 		cdate = row['Camp_Start_Date']
328 | 		
329 | 		camp_list = people_camp_dict[pid]
330 | 		for ind, camp in enumerate(camp_list):
331 | 			if camp == cid:
332 | 				use_index = ind
333 | 				break
334 | 		
335 | 		tlist = people_date_dict[pid][:use_index]
336 | 		if len(tlist)>0:
337 | 			last_date_list.append(reg_date-tlist[-1])
338 | 		else:
339 | 			last_date_list.append(-99)
340 | 		
341 | 		tlist = people_dv_dict.get(pid, [])
342 | 		if len(tlist)>0:
343 | 			last_dv_list.append(tlist[-1])
344 | 			mean_dv_list.append(np.mean(tlist))
345 | 		else:
346 | 			last_dv_list.append(-99)
347 | 			mean_dv_list.append(-99)
348 | 
349 | 		tlist = people_donation_dict.get(pid, [])
350 | 		if len(tlist)>0:
351 | 			last_donation_list.append(np.sum(tlist))
352 | 		else:
353 | 			last_donation_list.append(-99)
354 | 
355 | 		tlist = people_num_stall_dict.get(pid, [])
356 | 		if len(tlist)>0:
357 | 			last_num_stall_list.append(np.sum(tlist))
358 | 		else:
359 | 			last_num_stall_list.append(-99)
360 | 
361 | 		tlist = people_fscore_dict.get(pid, [])
362 | 		if len(tlist)>0:
363 | 			last_fscore_list.append(np.mean([i for i in tlist if i!=0]))
364 | 		else:
365 | 			last_fscore_list.append(-99)
366 | 		
367 | 		tlist = people_cat1_dict[pid][:use_index]
368 | 		if len(tlist)>0:
369 | 			last_cat1_list.append(tlist[-1])
370 | 		else:
371 | 			last_cat1_list.append(-99)
372 | 
373 | 		tlist = people_date_dict[pid][use_index+1:]
374 | 		if len(tlist)>0:
375 | 			last_cdate_list.append(reg_date-tlist[0])
376 | 		else:
377 | 			last_cdate_list.append(-99)
378 | 
379 | 	test["Last_Reg_Date"] = last_date_list[:]
380 | 	test["Mean_Outcome"] = mean_dv_list[:]
381 | 	test["Last_Cat1"] = last_cat1_list[:]
382 | 	test["Next_Reg_Date"] = last_cdate_list[:]
383 | 	test["Sum_Donations"] = last_donation_list[:]
384 | 	test["Sum_NumStalls"] = last_num_stall_list[:]
385 | 	test["Mean_Fscore"] = last_fscore_list[:]
386 | 
387 | 	train.fillna(-99, inplace=True)
388 | 	test.fillna(-99, inplace=True)
389 | 	
390 | 	print "Getting dv and id values"
391 | 	train_y = train.Outcome.values
392 | 
393 | 	## Columns to drop ##
394 | 	print "Dropping columns.."
395 | 	drop_cols = ["Camp_Start_Date", "Camp_End_Date", "Registration_Date"] #, "First_Interaction"]
396 | 	drop_cols = drop_cols + ["LinkedIn_Shared", "Facebook_Shared", "Twitter_Shared", "Online_Follower", "Var4"]
397 | 	train.drop(drop_cols, axis=1, inplace=True) 
398 | 	test.drop(drop_cols, axis=1, inplace=True) 
399 | 	print train.shape, test.shape
400 | 
401 | 	# preparing train and test #
402 | 	print "Choose the columns to use.."
403 | 	xcols = [col for col in train.columns if col not in ["Outcome", "Health_Camp_ID", "Patient_ID", "Der_Var1", "Number_of_stall_visited","Last_Stall_Visited_Number", "Donation", "Health_Score", "Mean_Fscore"]]
404 | 	print xcols
405 | 	train_X = np.array(train[xcols])
406 | 	test_X = np.array(test[xcols])
407 | 	print train_X.shape, test_X.shape
408 | 
409 | 	print "Final Model.."
410 | 	preds = 0
411 | 	for seed_val, num_rounds in [[0,200], [2016,250], [1323, 225]]:
412 | 		print seed_val, num_rounds
413 | 		temp_preds, loss = runXGB(train_X, train_y, test_X, feature_names=xcols, seed_val=seed_val, num_rounds=num_rounds)
414 | 		preds += temp_preds
415 | 	preds = preds/3.
416 | 
417 | 	out_df = pd.DataFrame({"Patient_ID":test.Patient_ID.values})
418 | 	out_df["Health_Camp_ID"] = test.Health_Camp_ID.values
419 | 	out_df["Outcome"] =  preds
420 | 	out_df.to_csv("sub_srk.csv", index=False)
421 | 


--------------------------------------------------------------------------------
/AV_Knocktober/vopani_final.R:
--------------------------------------------------------------------------------
  1 | ## setting working directory
  2 | path <- "/Volumes/External SD/AnalyticsVidhya/Knocktober"
  3 | setwd(path)
  4 | 
  5 | seed <- 235
  6 | set.seed(seed)
  7 | 
  8 | 
  9 | ## loading libraries
 10 | library(data.table)
 11 | library(xgboost)
 12 | 
 13 | 
 14 | ## loading data
 15 | train <- fread("./raw/Train.csv")
 16 | test <- fread("./raw/Test_D7W1juQ.csv")
 17 | 
 18 | health_camp <- fread("./raw/Health_Camp_Detail.csv")
 19 | 
 20 | health_1 <- fread("./raw/First_Health_Camp_Attended.csv")
 21 | health_2 <- fread("./raw/Second_Health_Camp_Attended.csv")
 22 | health_3 <- fread("./raw/Third_Health_Camp_Attended.csv")
 23 | 
 24 | health_1[, V5 := NULL]
 25 | setnames(health_1, "Health_Score", "Health_Score_1")
 26 | setnames(health_2, "Health Score", "Health_Score_2")
 27 | 
 28 | patient <- fread("./raw/Patient_Profile.csv")
 29 | 
 30 | train[, train_flag := 1]
 31 | test[, train_flag := 0]
 32 | 
 33 | 
 34 | ## processing data
 35 | X_panel <- rbind(train, test)
 36 | 
 37 | X_panel <- merge(X_panel, health_1, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID"))
 38 | X_panel <- merge(X_panel, health_2, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID"))
 39 | X_panel <- merge(X_panel, health_3, all.x = TRUE, by = c("Patient_ID", "Health_Camp_ID"))
 40 | 
 41 | X_panel <- merge(X_panel, health_camp, all.x = TRUE, by = "Health_Camp_ID")
 42 | X_panel <- merge(X_panel, patient, all.x = TRUE, by = "Patient_ID")
 43 | 
 44 | X_panel[, target := 0]
 45 | 
 46 | X_panel$target[X_panel$Category1 != "Third" & (X_panel$Health_Score_1 > 0 | X_panel$Health_Score_2 > 0)] <- 1
 47 | X_panel$target[X_panel$Category1 == "Third" & X_panel$Number_of_stall_visited > 0] <- 1
 48 | 
 49 | X_panel[, ":="(Registration_Date = as.Date(Registration_Date, "%d-%b-%y"),
 50 |                Camp_Start_Date = as.Date(Camp_Start_Date, "%d-%b-%y"),
 51 |                Camp_End_Date = as.Date(Camp_End_Date, "%d-%b-%y"),
 52 |                First_Interaction = as.Date(First_Interaction, "%d-%b-%y"),
 53 |                Category1 = as.numeric(as.factor(Category1)),
 54 |                Category2 = as.numeric(as.factor(Category2)),
 55 |                City_Type = as.numeric(as.factor(City_Type)),
 56 |                Income = as.numeric(as.factor(Income)),
 57 |                Employer_Category = as.numeric(as.factor(Employer_Category)),
 58 |                Education_Score = as.numeric(Education_Score),
 59 |                Age = as.numeric(Age))]
 60 | 
 61 | setorder(X_panel, Patient_ID, Registration_Date)
 62 | X_panel$order <- seq(1, nrow(X_panel))
 63 | 
 64 | X_date <- X_panel[, c("Patient_ID", "Registration_Date", "order"), with = FALSE]
 65 | X_date$order <- X_date$order + 1
 66 | names(X_date)[2] <- "Prev_Date"
 67 | 
 68 | X_panel <- merge(X_panel, X_date, all.x = TRUE, by = c("Patient_ID", "order"))
 69 | 
 70 | X_date$order <- X_date$order - 2
 71 | names(X_date)[2] <- "Next_Date"
 72 | 
 73 | X_panel <- merge(X_panel, X_date, all.x = TRUE, by = c("Patient_ID", "order"))
 74 | 
 75 | X_panel[, ":="(Start_Date_Diff = as.numeric(Registration_Date - Camp_Start_Date),
 76 |                End_Date_Diff = as.numeric(Camp_End_Date - Registration_Date),
 77 |                Interaction_Date_Diff = as.numeric(Registration_Date - First_Interaction),
 78 |                Prev_Date_Diff = as.numeric(Registration_Date - Prev_Date),
 79 |                Next_Date_Diff = as.numeric(Registration_Date - Next_Date),
 80 |                Camp_Start_Year = year(Camp_Start_Date),
 81 |                Registration_Year = year(Registration_Date),
 82 |                Registration_Month = month(Registration_Date),
 83 |                Registration_Day = wday(Registration_Date))]
 84 | 
 85 | X_panel <- X_panel[Camp_Start_Year >= 2005]
 86 | X_panel <- X_panel[!is.na(Registration_Date)]
 87 | X_panel <- X_panel[Category3 == 2]
 88 | 
 89 | X_patient <- X_panel[, .(Count_Patient = .N), .(Patient_ID)]
 90 | X_panel <- merge(X_panel, X_patient, by = "Patient_ID")
 91 | 
 92 | X_patient_date <- X_panel[, .(Count_Patient_Date = .N), .(Patient_ID, Registration_Date)]
 93 | X_panel <- merge(X_panel, X_patient_date, by = c("Patient_ID", "Registration_Date"))
 94 | 
 95 | X_donation <- X_panel[Donation > 0, .(Min_Date_Donation = min(Registration_Date)), .(Patient_ID)]
 96 | X_panel <- merge(X_panel, X_donation, all.x = T, by = "Patient_ID")
 97 | 
 98 | X_panel[, Donation_Flag := ifelse(is.na(Min_Date_Donation), 0, ifelse(Registration_Date > Min_Date_Donation, 1, 0))]
 99 | 
100 | X_train <- X_panel[train_flag == 1]
101 | X_test <- X_panel[train_flag == 0]
102 | 
103 | X_features <- c("Count_Patient", "Count_Patient_Date", "Donation_Flag",
104 |                 "City_Type", "Income", "Education_Score", "Age",
105 |                 "Category1", "Category2",
106 |                 "Start_Date_Diff", "End_Date_Diff", "Prev_Date_Diff", "Next_Date_Diff")
107 | X_target <- X_train$target
108 | 
109 | xgtrain <- xgb.DMatrix(data = as.matrix(X_train[, X_features, with = FALSE]), label = X_target, missing = NA)
110 | xgtest <- xgb.DMatrix(data = as.matrix(X_test[, X_features, with = FALSE]), missing = NA)
111 | 
112 | 
113 | ## xgboost
114 | params <- list()
115 | params$objective <- "binary:logistic"
116 | params$eta <- 0.1
117 | params$max_depth <- 5
118 | params$subsample <- 0.9
119 | params$colsample_bytree <- 0.9
120 | params$min_child_weight <- 2
121 | params$eval_metric <- "auc"
122 | 
123 | model_xgb_cv <- xgb.cv(params=params, xgtrain, nrounds = 100, nfold = 5, early.stop.round = 30, prediction = TRUE)
124 | 
125 | model_xgb <- xgb.train(params = params, xgtrain, nrounds = 100)
126 | 
127 | vimp <- xgb.importance(model = model_xgb, feature_names = X_features)
128 | View(vimp)
129 | 
130 | 
131 | ## submission
132 | pred <- predict(model_xgb, xgtest)
133 | 
134 | submit <- data.table(Patient_ID = X_test$Patient_ID,
135 |                      Health_Camp_ID = X_test$Health_Camp_ID,
136 |                      Outcome = pred)
137 | 
138 | write.csv(submit, "./sub_vopani.csv", row.names = FALSE)
139 | 


--------------------------------------------------------------------------------
/AV_LTFS_April2019/README.md:
--------------------------------------------------------------------------------
1 | Code for the Analytics Vidhya L&T Financial Services Hackathon
2 | 
3 | https://datahack.analyticsvidhya.com/contest/ltfs-datascience-finhack-an-online-hackathon/
4 | 
5 | Thanks to Ziron, we finished [4th](https://datahack.analyticsvidhya.com/contest/ltfs-datascience-finhack-an-online-hackathon/pvt_lb) on this one. 
6 | 


--------------------------------------------------------------------------------
/AV_LordOfTheMachines/build_model.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model
  4 | import lightgbm as lgb
  5 | 
  6 | def getCountVar(compute_df, count_df, var_name, count_var="v1"):
  7 | 	grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index()
  8 | 	grouped_df.columns = var_name + ["var_count"]
  9 | 
 10 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 11 | 	merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True)
 12 | 	return list(merged_df["var_count"])
 13 | 
 14 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1):
 15 | 	if type(var_name) != type([]):
 16 | 		var_name = [var_name]
 17 | 	grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
 18 | 	grouped_df.columns = var_name + ["mean_value"]
 19 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 20 | 	merged_df.fillna(np.mean(target_df[target_var].values), inplace=True)
 21 | 	return list(merged_df["mean_value"])
 22 | 
 23 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1):
 24 | 	if type(var_name) != type([]):
 25 | 		var_name = [var_name]
 26 | 	grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index()
 27 | 	grouped_df.columns = var_name + ["sum_value"]
 28 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 29 | 	merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True)
 30 | 	return list(merged_df["sum_value"])
 31 | 
 32 | 
 33 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None):
 34 | 	model = linear_model.LogisticRegression(fit_intercept=True, C=0.3)
 35 | 	model.fit(train_X, train_y)
 36 | 	print model.coef_, model.intercept_
 37 | 	train_preds = model.predict_proba(train_X)[:,1]
 38 | 	test_preds = model.predict_proba(test_X)[:,1]
 39 | 	test_preds2 = model.predict_proba(test_X2)[:,1]
 40 | 	test_loss = 0
 41 | 	if test_y is not None:
 42 | 		train_loss = metrics.roc_auc_score(train_y, train_preds)
 43 | 		test_loss = metrics.roc_auc_score(test_y, test_preds)
 44 | 		print "Train and Test loss : ", train_loss, test_loss
 45 | 	return test_preds, test_loss, test_preds2
 46 | 
 47 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3):
 48 | 	model = ensemble.ExtraTreesClassifier(
 49 | 			n_estimators = 300,
 50 | 					max_depth = depth,
 51 | 					min_samples_split = 10,
 52 | 					min_samples_leaf = leaf,
 53 | 					max_features =  feat,
 54 | 					n_jobs = 6,
 55 | 					random_state = 0)
 56 | 	model.fit(train_X, train_y)
 57 | 	train_preds = model.predict_proba(train_X)[:,1]
 58 | 	test_preds = model.predict_proba(test_X)[:,1]
 59 | 	test_preds2 = model.predict_proba(test_X2)[:,1]
 60 | 	test_loss = 0
 61 | 	if test_y is not None:
 62 | 		train_loss = metrics.roc_auc_score(train_y, train_preds)
 63 | 		test_loss = metrics.roc_auc_score(test_y, test_preds)
 64 | 		print "Depth, leaf, feat : ", depth, leaf, feat
 65 | 		print "Train and Test loss : ", train_loss, test_loss
 66 | 	return test_preds, test_loss, test_preds2
 67 | 
 68 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001):
 69 | 	params = {}
 70 | 	params["objective"] = "binary"
 71 | 	params['metric'] = 'auc'
 72 | 	params["max_depth"] = dep
 73 | 	params["min_data_in_leaf"] = 100
 74 | 	params["learning_rate"] = eta
 75 | 	params["bagging_fraction"] = 0.7
 76 | 	params["feature_fraction"] = 0.7
 77 | 	params["bagging_freq"] = 5
 78 | 	params["bagging_seed"] = seed_val
 79 | 	params["verbosity"] = -1
 80 | 	num_rounds = rounds
 81 | 
 82 | 	plst = list(params.items())
 83 | 	lgtrain = lgb.Dataset(train_X, label=train_y)
 84 | 
 85 | 	if test_y is not None:
 86 | 		lgtest = lgb.Dataset(test_X, label=test_y)
 87 | 		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
 88 | 	else:
 89 | 		lgtest = lgb.DMatrix(test_X)
 90 | 		model = lgb.train(params, lgtrain, num_rounds)
 91 | 
 92 | 	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
 93 | 	pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
 94 | 
 95 | 	loss = 0
 96 | 	if test_y is not None:
 97 | 		loss = metrics.roc_auc_score(test_y, pred_test_y)
 98 | 		print loss
 99 | 		return pred_test_y, loss, pred_test_y2
100 | 	else:
101 | 		return pred_test_y, loss, pred_test_y2
102 | 
103 | if __name__ == "__main__":
104 | 	print "Reading input files..."
105 | 	train_df = pd.read_csv("../input/train_feat.csv")
106 | 	test_df = pd.read_csv("../input/test_feat.csv")
107 | 	campaign_df = pd.read_csv("../input/campaign_data.csv")
108 | 	train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1)
109 | 	print train_df.shape, test_df.shape
110 | 	print train_df.head()
111 | 
112 | 
113 | 	print np.sort(train_df["campaign_id"].unique())
114 | 	#camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]]
115 | 
116 | 	print "Merging with campaign data.."
117 | 	train_df = pd.merge(train_df, campaign_df, on="campaign_id")
118 | 	test_df = pd.merge(test_df, campaign_df, on="campaign_id")
119 | 	print train_df.shape, test_df.shape
120 | 	kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2017)
121 | 
122 | 	train_y_open = train_df["is_open"].values
123 | 	train_y = train_df["is_click"].values
124 | 	test_id = test_df["id"].values
125 | 	train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 
126 | 	cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"]
127 | 	#cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"]
128 | 	#cols_to_use = []
129 | 	#cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"]
130 | 	cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"]
131 | 	cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"]
132 | 	#cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"]
133 | 		
134 | 	#print "Label encoding.."
135 | 	#for c in ["communication_type"]:
136 | 	#		cols_to_use.append(c)
137 | 	#		lbl = preprocessing.LabelEncoder()
138 | 	#		lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str')))
139 | 	#		train_df[c] = lbl.transform(list(train_df[c].values.astype('str')))
140 | 	#		test_df[c] = lbl.transform(list(test_df[c].values.astype('str')))
141 | 	
142 | 	
143 | 	#print "Full Count encoding.."
144 | 	#full_df = train_df.append(test_df)
145 | 	#print full_df.shape
146 | 	#for col in [["user_id"]]:
147 | 	#	if isinstance(col, list):
148 | 	#		col_name = "_".join(col)
149 | 	#	train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id'))
150 | 	#	test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id'))
151 | 	#	cols_to_use.append(col_name + "_full_count")
152 | 
153 | 			
154 | 	print "Count encoding.."
155 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
156 | 	#for col in [["user_id"]]:
157 | 		train_enc_values = np.zeros(train_df.shape[0])
158 | 		test_enc_values = 0
159 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
160 | 		#for [dev_camp, val_camp] in camp_indices:
161 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
162 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
163 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click'))
164 | 			test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click'))
165 | 		test_enc_values /= 5.
166 | 		if isinstance(col, list):
167 | 			col = "_".join(col)
168 | 		train_df[col + "_count"] = train_enc_values
169 | 		test_df[col + "_count"] = test_enc_values
170 | 		cols_to_use.append(col + "_count")
171 | 		
172 | 
173 | 		
174 | 	print "Target encoding.."
175 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
176 | 	#for col in [["user_id"]]:
177 | 		train_enc_values = np.zeros(train_df.shape[0])
178 | 		test_enc_values = 0
179 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
180 | 		#for [dev_camp, val_camp] in camp_indices:
181 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
182 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
183 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click'))
184 | 			test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click'))
185 | 		test_enc_values /= 5.
186 | 		if isinstance(col, list):
187 | 			col = "_".join(col)
188 | 		train_df[col + "_enc"] = train_enc_values
189 | 		test_df[col + "_enc"] = test_enc_values
190 | 		cols_to_use.append(col + "_enc")
191 | 	
192 | 
193 | 	print "Open Target encoding.."
194 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
195 | 	#for col in [["user_id"]]:
196 | 		train_enc_values = np.zeros(train_df.shape[0])
197 | 		test_enc_values = 0
198 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
199 | 		#for [dev_camp, val_camp] in camp_indices:
200 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
201 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
202 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open'))
203 | 			test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open'))
204 | 		test_enc_values /= 5.
205 | 		if isinstance(col, list):
206 | 			col = "_".join(col)
207 | 		train_df[col + "_open_enc"] = train_enc_values
208 | 		test_df[col + "_open_enc"] = test_enc_values
209 | 		cols_to_use.append(col + "_open_enc")
210 | 			
211 | 	
212 | 
213 | 
214 | 	"""	
215 | 	print "Open Alone Target encoding.."
216 | 	#for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]:
217 | 	for col in [["user_id"]]:
218 | 		train_enc_values = np.zeros(train_df.shape[0])
219 | 		test_enc_values = 0
220 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
221 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
222 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
223 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open'))
224 | 			test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open'))
225 | 		test_enc_values /= 5.
226 | 		if isinstance(col, list):
227 | 			col = "_".join(col)
228 | 		train_df[col + "_open_sum_enc"] = train_enc_values
229 | 		test_df[col + "_open_sum_enc"] = test_enc_values
230 | 		cols_to_use.append(col + "_open_sum_enc")	
231 | 	"""
232 | 	
233 | 	
234 | 	print cols_to_use
235 | 	train_X = train_df[cols_to_use]
236 | 	test_X = test_df[cols_to_use]
237 | 	print train_X.describe()
238 | 	print test_X.describe()
239 | 
240 | 	#train_X.fillna(-1, inplace=True)
241 | 	#test_X.fillna(-1, inplace=True)	
242 | 
243 | 	print "Model building.."
244 | 	model_name = "LGB"
245 | 	cv_scores = []
246 | 	pred_test_full = 0
247 | 	pred_val_full = np.zeros(train_df.shape[0])	
248 | 	for dev_index, val_index in kf.split(train_unique_campaigns):
249 | 	#for [dev_camp, val_camp] in camp_indices:
250 | 		dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
251 | 		dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)]
252 | 		dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)]
253 | 		print dev_X.shape, val_X.shape
254 | 
255 | 		if model_name == "LGB":
256 | 			pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4)
257 | 			pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018)
258 | 			pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876)
259 | 			pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 
260 | 			pred_test = (pred_test1 + pred_test2 + pred_test3)/3.
261 | 			loss = (loss1 + loss2 + loss3)/3. 
262 | 		elif model_name == "ET":
263 | 			pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3)
264 | 		elif model_name == "LR":
265 | 			pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X)
266 | 
267 | 		pred_test_full += pred_test
268 | 		pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val
269 | 		loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val)
270 | 		cv_scores.append(loss)
271 | 		print cv_scores
272 | 	pred_test_full /= 5.
273 | 	print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full)
274 | 
275 | 	sub_df = pd.DataFrame({"id":test_id})
276 | 	sub_df["is_click"] = pred_test_full
277 | 	sub_df.to_csv("srk_sub47.csv", index=False)
278 | 
279 | 
280 | 	
281 | 


--------------------------------------------------------------------------------
/AV_LordOfTheMachines/build_model_xgb.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | from sklearn import metrics, model_selection, ensemble, preprocessing, linear_model
  4 | import lightgbm as lgb
  5 | import xgboost as xgb
  6 | 
  7 | def getCountVar(compute_df, count_df, var_name, count_var="v1"):
  8 | 	grouped_df = count_df.groupby(var_name)[count_var].agg('count').reset_index()
  9 | 	grouped_df.columns = var_name + ["var_count"]
 10 | 
 11 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 12 | 	merged_df.fillna(np.mean(grouped_df["var_count"].values), inplace=True)
 13 | 	return list(merged_df["var_count"])
 14 | 
 15 | def getDVEncodeVar(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1):
 16 | 	if type(var_name) != type([]):
 17 | 		var_name = [var_name]
 18 | 	grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
 19 | 	grouped_df.columns = var_name + ["mean_value"]
 20 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 21 | 	merged_df.fillna(np.mean(target_df[target_var].values), inplace=True)
 22 | 	return list(merged_df["mean_value"])
 23 | 
 24 | def getDVEncodeVar2(compute_df, target_df, var_name, target_var="is_click", min_cutoff=1):
 25 | 	if type(var_name) != type([]):
 26 | 		var_name = [var_name]
 27 | 	grouped_df = target_df.groupby(var_name)[target_var].agg(["sum"]).reset_index()
 28 | 	grouped_df.columns = var_name + ["sum_value"]
 29 | 	merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 30 | 	merged_df.fillna(np.mean(grouped_df["sum_value"].values), inplace=True)
 31 | 	return list(merged_df["sum_value"])
 32 | 
 33 | 
 34 | def runLR(train_X, train_y, test_X, test_y=None, test_X2=None):
 35 | 	model = linear_model.LogisticRegression(fit_intercept=True, C=0.3)
 36 | 	model.fit(train_X, train_y)
 37 | 	print model.coef_, model.intercept_
 38 | 	train_preds = model.predict_proba(train_X)[:,1]
 39 | 	test_preds = model.predict_proba(test_X)[:,1]
 40 | 	test_preds2 = model.predict_proba(test_X2)[:,1]
 41 | 	test_loss = 0
 42 | 	if test_y is not None:
 43 | 		train_loss = metrics.roc_auc_score(train_y, train_preds)
 44 | 		test_loss = metrics.roc_auc_score(test_y, test_preds)
 45 | 		print "Train and Test loss : ", train_loss, test_loss
 46 | 	return test_preds, test_loss, test_preds2
 47 | 
 48 | def runET(train_X, train_y, test_X, test_y=None, test_X2=None, depth=10, leaf=5, feat=0.3):
 49 | 	model = ensemble.ExtraTreesClassifier(
 50 | 			n_estimators = 300,
 51 | 					max_depth = depth,
 52 | 					min_samples_split = 10,
 53 | 					min_samples_leaf = leaf,
 54 | 					max_features =  feat,
 55 | 					n_jobs = 6,
 56 | 					random_state = 0)
 57 | 	model.fit(train_X, train_y)
 58 | 	train_preds = model.predict_proba(train_X)[:,1]
 59 | 	test_preds = model.predict_proba(test_X)[:,1]
 60 | 	test_preds2 = model.predict_proba(test_X2)[:,1]
 61 | 	test_loss = 0
 62 | 	if test_y is not None:
 63 | 		train_loss = metrics.roc_auc_score(train_y, train_preds)
 64 | 		test_loss = metrics.roc_auc_score(test_y, test_preds)
 65 | 		print "Depth, leaf, feat : ", depth, leaf, feat
 66 | 		print "Train and Test loss : ", train_loss, test_loss
 67 | 	return test_preds, test_loss, test_preds2
 68 | 
 69 | def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=3, eta=0.001):
 70 | 	params = {}
 71 | 	params["objective"] = "binary"
 72 | 	params['metric'] = 'auc'
 73 | 	params["max_depth"] = dep
 74 | 	params["min_data_in_leaf"] = 100
 75 | 	params["learning_rate"] = eta
 76 | 	params["bagging_fraction"] = 0.7
 77 | 	params["feature_fraction"] = 0.7
 78 | 	params["bagging_freq"] = 5
 79 | 	params["bagging_seed"] = seed_val
 80 | 	params["verbosity"] = -1
 81 | 	num_rounds = rounds
 82 | 
 83 | 	plst = list(params.items())
 84 | 	lgtrain = lgb.Dataset(train_X, label=train_y)
 85 | 
 86 | 	if test_y is not None:
 87 | 		lgtest = lgb.Dataset(test_X, label=test_y)
 88 | 		model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest], early_stopping_rounds=100, verbose_eval=20)
 89 | 	else:
 90 | 		lgtest = lgb.DMatrix(test_X)
 91 | 		model = lgb.train(params, lgtrain, num_rounds)
 92 | 
 93 | 	pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)
 94 | 	pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)
 95 | 
 96 | 	loss = 0
 97 | 	if test_y is not None:
 98 | 		loss = metrics.roc_auc_score(test_y, pred_test_y)
 99 | 		print loss
100 | 		return pred_test_y, loss, pred_test_y2
101 | 	else:
102 | 		return pred_test_y, loss, pred_test_y2
103 | 
104 | def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None, feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.001):
105 |         params = {}
106 |         params["objective"] = "binary:logistic"
107 |         params['eval_metric'] = 'auc'
108 |         params["eta"] = eta
109 |         params["subsample"] = 0.7
110 |         params["min_child_weight"] = 10
111 |         params["colsample_bytree"] = 0.7
112 |         params["max_depth"] = dep
113 |         params["silent"] = 1
114 |         params["seed"] = seed_val
115 |         #params["max_delta_step"] = 2
116 |         #params["gamma"] = 0.5
117 |         num_rounds = rounds
118 | 
119 |         plst = list(params.items())
120 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
121 | 
122 |         if test_y is not None:
123 |                 xgtest = xgb.DMatrix(test_X, label=test_y)
124 |                 watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
125 |                 model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=20)
126 |         else:
127 |                 xgtest = xgb.DMatrix(test_X)
128 |                 model = xgb.train(plst, xgtrain, num_rounds)
129 | 
130 |         pred_test_y = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
131 |         pred_test_y2 = model.predict(xgb.DMatrix(test_X2), ntree_limit=model.best_ntree_limit)
132 | 
133 |         loss = 0
134 |         if test_y is not None:
135 |                 loss = metrics.log_loss(test_y, pred_test_y)
136 |                 print loss
137 |                 return pred_test_y, loss, pred_test_y2
138 |         else:
139 |                 return pred_test_y, loss, pred_test_y2
140 | 
141 | 
142 | if __name__ == "__main__":
143 | 	print "Reading input files..."
144 | 	train_df = pd.read_csv("../input/train_feat.csv")
145 | 	test_df = pd.read_csv("../input/test_feat.csv")
146 | 	campaign_df = pd.read_csv("../input/campaign_data.csv")
147 | 	train_df["is_open_alone"] = train_df["is_click"].astype('float') / np.maximum(train_df["is_open"],1)
148 | 	print train_df.shape, test_df.shape
149 | 	print train_df.head()
150 | 
151 | 
152 | 	print np.sort(train_df["campaign_id"].unique())
153 | 	#camp_indices = [[range(29, 47), range(47,56)], [range(47,56), range(29, 47)]]
154 | 
155 | 	print "Merging with campaign data.."
156 | 	train_df = pd.merge(train_df, campaign_df, on="campaign_id")
157 | 	test_df = pd.merge(test_df, campaign_df, on="campaign_id")
158 | 	print train_df.shape, test_df.shape
159 | 	kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=98765)
160 | 
161 | 	train_y_open = train_df["is_open"].values
162 | 	train_y = train_df["is_click"].values
163 | 	test_id = test_df["id"].values
164 | 	train_unique_campaigns = np.array(train_df["campaign_id"].unique()) 
165 | 	cols_to_use = ["user_cum_count", "user_count", "user_date_diff", "user_camp_diff", "hour"] #, "total_links","no_of_internal_links","no_of_images","no_of_sections"]
166 | 	#cols_to_use = ["user_cum_count", "user_count", "user_camp_diff"]
167 | 	#cols_to_use = []
168 | 	#cols_to_use = cols_to_use + ["first_open", "first_click", "second_open", "second_click", "third_open", "third_click"]
169 | 	cols_to_use = cols_to_use + ["user_min_date", "user_mean_date", "user_max_date", "user_std_date"]
170 | 	cols_to_use = cols_to_use + ["camp_"+str(i) for i in range(29,81)] + ["camps_sent"]
171 | 	#cols_to_use = cols_to_use + ["user_std_date_click", "user_std_date_open"]
172 | 		
173 | 	#print "Label encoding.."
174 | 	#for c in ["communication_type"]:
175 | 	#		cols_to_use.append(c)
176 | 	#		lbl = preprocessing.LabelEncoder()
177 | 	#		lbl.fit(list(train_df[c].values.astype('str')) + list(test_df[c].values.astype('str')))
178 | 	#		train_df[c] = lbl.transform(list(train_df[c].values.astype('str')))
179 | 	#		test_df[c] = lbl.transform(list(test_df[c].values.astype('str')))
180 | 	
181 | 	
182 | 	#print "Full Count encoding.."
183 | 	#full_df = train_df.append(test_df)
184 | 	#print full_df.shape
185 | 	#for col in [["user_id"]]:
186 | 	#	if isinstance(col, list):
187 | 	#		col_name = "_".join(col)
188 | 	#	train_df[col_name + "_full_count"] = np.array( getCountVar(train_df, full_df, col, 'id'))
189 | 	#	test_df[col_name + "_full_count"] = np.array( getCountVar(test_df, full_df, col, 'id'))
190 | 	#	cols_to_use.append(col_name + "_full_count")
191 | 
192 | 			
193 | 	print "Count encoding.."
194 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
195 | 	#for col in [["user_id"]]:
196 | 		train_enc_values = np.zeros(train_df.shape[0])
197 | 		test_enc_values = 0
198 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
199 | 		#for [dev_camp, val_camp] in camp_indices:
200 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
201 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
202 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getCountVar(val_X[col], dev_X, col, 'is_click'))
203 | 			test_enc_values += np.array( getCountVar(test_df[col], dev_X, col, 'is_click'))
204 | 		test_enc_values /= 5.
205 | 		if isinstance(col, list):
206 | 			col = "_".join(col)
207 | 		train_df[col + "_count"] = train_enc_values
208 | 		test_df[col + "_count"] = test_enc_values
209 | 		cols_to_use.append(col + "_count")
210 | 		
211 | 
212 | 		
213 | 	print "Target encoding.."
214 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
215 | 	#for col in [["user_id"]]:
216 | 		train_enc_values = np.zeros(train_df.shape[0])
217 | 		test_enc_values = 0
218 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
219 | 		#for [dev_camp, val_camp] in camp_indices:
220 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
221 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
222 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_click'))
223 | 			test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_click'))
224 | 		test_enc_values /= 5.
225 | 		if isinstance(col, list):
226 | 			col = "_".join(col)
227 | 		train_df[col + "_enc"] = train_enc_values
228 | 		test_df[col + "_enc"] = test_enc_values
229 | 		cols_to_use.append(col + "_enc")
230 | 	
231 | 
232 | 	print "Open Target encoding.."
233 | 	for col in [["user_id"], ["user_id", "communication_type"]]:
234 | 	#for col in [["user_id"]]:
235 | 		train_enc_values = np.zeros(train_df.shape[0])
236 | 		test_enc_values = 0
237 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
238 | 		#for [dev_camp, val_camp] in camp_indices:
239 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
240 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
241 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar(val_X[col], dev_X, col, 'is_open'))
242 | 			test_enc_values += np.array( getDVEncodeVar(test_df[col], dev_X, col, 'is_open'))
243 | 		test_enc_values /= 5.
244 | 		if isinstance(col, list):
245 | 			col = "_".join(col)
246 | 		train_df[col + "_open_enc"] = train_enc_values
247 | 		test_df[col + "_open_enc"] = test_enc_values
248 | 		cols_to_use.append(col + "_open_enc")
249 | 			
250 | 	
251 | 
252 | 
253 | 	"""	
254 | 	print "Open Alone Target encoding.."
255 | 	#for col in [["user_id"], ["user_id", "communication_type"], ["user_id", "no_of_sections"]]:
256 | 	for col in [["user_id"]]:
257 | 		train_enc_values = np.zeros(train_df.shape[0])
258 | 		test_enc_values = 0
259 | 		for dev_index, val_index in kf.split(train_unique_campaigns):
260 | 			dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
261 | 			dev_X, val_X = train_df[train_df['campaign_id'].isin(dev_camp)], train_df[~train_df['campaign_id'].isin(dev_camp)]
262 | 			train_enc_values[train_df['campaign_id'].isin(val_camp)] = np.array( getDVEncodeVar2(val_X[col], dev_X, col, 'is_open'))
263 | 			test_enc_values += np.array( getDVEncodeVar2(test_df[col], dev_X, col, 'is_open'))
264 | 		test_enc_values /= 5.
265 | 		if isinstance(col, list):
266 | 			col = "_".join(col)
267 | 		train_df[col + "_open_sum_enc"] = train_enc_values
268 | 		test_df[col + "_open_sum_enc"] = test_enc_values
269 | 		cols_to_use.append(col + "_open_sum_enc")	
270 | 	"""
271 | 	
272 | 	
273 | 	print cols_to_use
274 | 	train_X = train_df[cols_to_use]
275 | 	test_X = test_df[cols_to_use]
276 | 	print train_X.describe()
277 | 	print test_X.describe()
278 | 
279 | 	#train_X.fillna(-1, inplace=True)
280 | 	#test_X.fillna(-1, inplace=True)	
281 | 
282 | 	print "Model building.."
283 | 	model_name = "XGB"
284 | 	cv_scores = []
285 | 	pred_test_full = 0
286 | 	pred_val_full = np.zeros(train_df.shape[0])	
287 | 	for dev_index, val_index in kf.split(train_unique_campaigns):
288 | 	#for [dev_camp, val_camp] in camp_indices:
289 | 		dev_camp, val_camp = train_unique_campaigns[dev_index].tolist(), train_unique_campaigns[val_index].tolist()
290 | 		dev_X, val_X = train_X[train_df['campaign_id'].isin(dev_camp)], train_X[train_df['campaign_id'].isin(val_camp)]
291 | 		dev_y, val_y = train_y[train_df['campaign_id'].isin(dev_camp)], train_y[train_df['campaign_id'].isin(val_camp)]
292 | 		print dev_X.shape, val_X.shape
293 | 
294 | 		if model_name == "LGB":
295 | 			pred_val1, loss1, pred_test1 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4)
296 | 			pred_val2, loss2, pred_test2 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018)
297 | 			pred_val3, loss3, pred_test3 = runLGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876)
298 | 			pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 
299 | 			pred_test = (pred_test1 + pred_test2 + pred_test3)/3.
300 | 			loss = (loss1 + loss2 + loss3)/3. 
301 | 		elif model_name == "XGB":
302 | 			pred_val1, loss1, pred_test1 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4)
303 | 			pred_val2, loss2, pred_test2 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=2018)
304 | 			pred_val3, loss3, pred_test3 = runXGB(dev_X, dev_y, val_X, val_y, test_X, rounds=5000, dep=4, seed_val=9876)
305 | 			pred_val = (pred_val1 + pred_val2 + pred_val3)/3. 
306 | 			pred_test = (pred_test1 + pred_test2 + pred_test3)/3.
307 | 			loss = (loss1 + loss2 + loss3)/3. 
308 | 		elif model_name == "ET":
309 | 			pred_val, loss, pred_test = runET(dev_X, dev_y, val_X, val_y, test_X, depth=20, leaf=20, feat=0.3)
310 | 		elif model_name == "LR":
311 | 			pred_val, loss, pred_test = runLR(dev_X, dev_y, val_X, val_y, test_X)
312 | 
313 | 		pred_test_full += pred_test
314 | 		pred_val_full[train_df['campaign_id'].isin(val_camp)] = pred_val
315 | 		loss = metrics.roc_auc_score(train_y[train_df['campaign_id'].isin(val_camp)], pred_val)
316 | 		cv_scores.append(loss)
317 | 		print cv_scores
318 | 	pred_test_full /= 5.
319 | 	print np.mean(cv_scores), metrics.roc_auc_score(train_y, pred_val_full)
320 | 
321 | 	sub_df = pd.DataFrame({"id":test_id})
322 | 	sub_df["is_click"] = pred_test_full
323 | 	sub_df.to_csv("srk_sub48.csv", index=False)
324 | 
325 | 
326 | 	
327 | 


--------------------------------------------------------------------------------
/AV_LordOfTheMachines/ensemble.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | 
 4 | s1 = pd.read_csv("../Submissions/srk_sub47.csv")
 5 | s2 = pd.read_csv("../Submissions/srk_sub48.csv")
 6 | #s3 = pd.read_csv("../Submissions/srk_sub23.csv")
 7 | #s4 = pd.read_csv("../Submissions/srk_sub24.csv")
 8 | 
 9 | #s1["is_click"] = 0.35*(0.5*s1["is_click"] + 0.5*s2["is_click"]) + 0.65*(0.65*(s3["is_click"])+0.35*(s4["is_click"]))
10 | s1["is_click"] = 0.5*s1["is_click"] + 0.5*s2["is_click"]
11 | s1.to_csv("srk_sub49.csv", index=False)
12 | 


--------------------------------------------------------------------------------
/AV_LordOfTheMachines/readme.md:
--------------------------------------------------------------------------------
 1 | This folder has the code files for the [Hackaton - Lord Of The Machines](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/)
 2 | 
 3 | We finished [third](https://datahack.analyticsvidhya.com/contest/lord-of-the-machines/pvt_lb) on this comptition.  
 4 | 
 5 | Order of files to run
 6 | 1. Explorations.ipynb - Code file to create the features.
 7 | 2. build_model.py - Code file to build the Light GBM model
 8 | 3. build_model_xgb.py - Code file to build the XGB model
 9 | 4. ensemble.py - Code file to merge both the results.
10 | 


--------------------------------------------------------------------------------
/AV_MiniHack1/model_ens.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | data_path = "./"
 5 | s1 = pd.read_csv(data_path + "sub_lr.csv")
 6 | s2 = pd.read_csv(data_path + "sub_xgb.csv")
 7 | 
 8 | s1["Count"] = 0.5*s1["Count"] + 0.5*s2["Count"]
 9 | s1.to_csv("sub_ens.csv", index=False)
10 | 


--------------------------------------------------------------------------------
/AV_MiniHack1/model_lr.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from math import sqrt
 3 | import numpy as np
 4 | import pandas as pd
 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 6 | from sklearn.cross_validation import KFold
 7 | from sklearn import ensemble, preprocessing
 8 | from sklearn import linear_model as lm
 9 | from sklearn.metrics import mean_squared_error as mse
10 | 
11 | def rmse(act_y, pred_y):
12 | 	return np.sqrt(mse(act_y, pred_y))
13 | 
14 | if __name__ == "__main__":
15 | 	# Data path of the input files #
16 |         data_path = "../Data/"
17 |         train_file = data_path + "Train_JPXjxg6.csv"
18 |         test_file = data_path + "Test_mvj827l.csv"
19 | 
20 | 	print "Reading the files into dataframes.."
21 | 	train_df = pd.read_csv(train_file)
22 | 	test_df = pd.read_csv(test_file)
23 | 
24 |         print "Converting to date format.."
25 |         train_df["Date"] = (pd.to_datetime(train_df["Datetime"], format="%d-%m-%Y %H:%M"))
26 |         test_df["Date"] = (pd.to_datetime(test_df["Datetime"], format="%d-%m-%Y %H:%M"))
27 | 
28 | 	print "Getting the dv and id column.."
29 | 	train_y = np.array(train_df.Count.values)
30 | 	test_id = test_df.Datetime.values
31 | 
32 |         print "Creating variables from date field.."
33 |         train_df["Year"] = train_df["Date"].apply(lambda x: x.year)
34 |         test_df["Year"] = test_df["Date"].apply(lambda x: x.year)
35 | 	train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour)
36 |         test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour)
37 | 	train_df["WeekDay"] = train_df["Date"].apply(lambda x: x.weekday())
38 | 	test_df["WeekDay"] = test_df["Date"].apply(lambda x: x.weekday())
39 | 	train_df["DayCount"] = train_df["Date"].apply(lambda x: x.toordinal())
40 |         test_df["DayCount"] = test_df["Date"].apply(lambda x: x.toordinal())
41 | 
42 | 	train = train_df.drop(["Datetime","Date","Count"], axis=1)
43 | 	test = test_df.drop(["Datetime","Date"], axis=1)
44 | 
45 | 	print "One hot encoding.."
46 |         temp_train_arr = np.empty([train.shape[0],0])
47 |         temp_test_arr = np.empty([test.shape[0],0])
48 |         cols_to_drop = []
49 |         for var in train.columns:
50 |                 if var in ["Hour", "WeekDay"]:
51 |                         print var
52 |                         lb = preprocessing.LabelEncoder()
53 |                         full_var_data = pd.concat((train[var],test[var]),axis=0).astype('str')
54 |                         temp = lb.fit_transform(np.array(full_var_data))
55 |                         train[var] = lb.transform(np.array( train[var] ).astype('str'))
56 |                         test[var] = lb.transform(np.array( test[var] ).astype('str'))
57 | 
58 |                         cols_to_drop.append(var)
59 |                         ohe = preprocessing.OneHotEncoder(sparse=False)
60 |                         ohe.fit(temp.reshape(-1,1))
61 |                         temp_arr = ohe.transform(train[var].reshape(-1,1))
62 |                         temp_train_arr = np.hstack([temp_train_arr, temp_arr])
63 |                         temp_arr = ohe.transform(test[var].reshape(-1,1))
64 |                         temp_test_arr = np.hstack([temp_test_arr, temp_arr])
65 | 
66 | 	train = train.drop(cols_to_drop, axis=1)
67 |         test = test.drop(cols_to_drop, axis=1)
68 | 	train = np.hstack( [np.array(train),temp_train_arr]).astype("float")
69 |         test = np.hstack( [np.array(test),temp_test_arr]).astype("float")
70 | 	print train.shape, test.shape
71 | 
72 | 	# Use the lastest data #
73 | 	train_X = np.array(train)[16000:]
74 | 	train_y = train_y[16000:]
75 | 	test_X = np.array(test)
76 | 
77 | 	# Train the linear model and predict on test data #
78 | 	reg = lm.LinearRegression()
79 | 	reg.fit(train_X, train_y)
80 | 	preds = reg.predict(test_X).astype('int')
81 | 	
82 | 	# writing to out file #
83 | 	sample = pd.read_csv(data_path + "Test_mvj827l.csv")
84 | 	sample["Count"] = preds
85 | 	sample.to_csv("sub_lr.csv", index=False)
86 | 


--------------------------------------------------------------------------------
/AV_MiniHack1/model_xgb.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from math import sqrt
 3 | import numpy as np
 4 | import pandas as pd
 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 6 | from sklearn.cross_validation import KFold
 7 | from sklearn import ensemble
 8 | from sklearn import linear_model as lm
 9 | from sklearn.metrics import mean_squared_error as mse
10 | import xgboost as xgb
11 | 
12 | def runXGB(train_X, train_y, test_X, test_y=None):
13 |         params = {}
14 |         params["objective"] = "reg:linear"
15 |         params["eta"] = 0.02
16 |         params["min_child_weight"] = 8
17 |         params["subsample"] = 0.9
18 |         params["colsample_bytree"] = 0.8
19 |         params["silent"] = 1
20 |         params["max_depth"] = 8
21 |         params["seed"] = 1
22 |         plst = list(params.items())
23 |         num_rounds = 500
24 | 
25 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
26 |         xgtest = xgb.DMatrix(test_X)
27 |         model = xgb.train(plst, xgtrain, num_rounds)
28 |         pred_test_y = model.predict(xgtest)
29 |         return pred_test_y
30 | 
31 | def rmse(act_y, pred_y):
32 | 	return np.sqrt(mse(act_y, pred_y))
33 | 
34 | 
35 | if __name__ == "__main__":
36 | 	# Input data path #
37 |         data_path = "../Data/"
38 |         train_file = data_path + "Train_JPXjxg6.csv"
39 |         test_file = data_path + "Test_mvj827l.csv"
40 | 
41 | 	# Reading the csv file into pandas dataframe #
42 | 	train_df = pd.read_csv(train_file)
43 | 	test_df = pd.read_csv(test_file)
44 | 
45 |         print "Converting to date format"
46 |         train_df["Date"] = (pd.to_datetime(train_df["Datetime"], format="%d-%m-%Y %H:%M"))
47 |         test_df["Date"] = (pd.to_datetime(test_df["Datetime"], format="%d-%m-%Y %H:%M"))
48 | 
49 | 	# Getting the dv and id values #
50 | 	train_y = np.array(train_df.Count.values)
51 | 	test_id = test_df.Datetime.values
52 | 
53 |         print "Processing Date field.."
54 |         train_df["DayOfMonth"] = train_df["Date"].apply(lambda x: x.day)
55 |         test_df["DayOfMonth"] = test_df["Date"].apply(lambda x: x.day)
56 | 	train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour)
57 |         test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour)
58 | 	train_df["WeekDay"] = train_df["Date"].apply(lambda x: x.weekday())
59 | 	test_df["WeekDay"] = test_df["Date"].apply(lambda x: x.weekday())
60 | 	train_df["DayCount"] = train_df["Date"].apply(lambda x: x.toordinal())
61 |         test_df["DayCount"] = test_df["Date"].apply(lambda x: x.toordinal())
62 | 
63 | 	# Dropping the columns that are not needed #	
64 | 	train_df.drop(["Datetime","Date","Count"], axis=1, inplace=True)
65 | 	test_df.drop(["Datetime","Date"], axis=1, inplace=True)
66 | 
67 | 	# Running the xgb model #
68 | 	preds = runXGB(np.array(train_df), train_y, np.array(test_df))
69 |         preds = preds.astype('int')
70 | 
71 | 	# Saving the predictions #
72 |         sample = pd.read_csv(data_path + "Test_mvj827l.csv")
73 |         sample["Count"] = preds
74 |         sample.to_csv("sub_xgb.csv", index=False)
75 | 


--------------------------------------------------------------------------------
/AV_MiniHack1/readme.md:
--------------------------------------------------------------------------------
1 | This folder contains the codes for the [Analytics Vidhya - Mini DataHack] (http://datahack.analyticsvidhya.com/contest/mini-datahack)
2 | 
3 | Order of files to run
4 |  1. model_lr.py - Linear Regression model
5 |  2. model_xgb.py - XGBoost model
6 |  3. model_ens.py - Averaging both
7 | 


--------------------------------------------------------------------------------
/AV_MiniHack2_SimpleBuy/finalModel.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from math import sqrt
 3 | import numpy as np
 4 | import pandas as pd
 5 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 6 | from sklearn.cross_validation import KFold
 7 | from sklearn import ensemble
 8 | from sklearn import linear_model as lm
 9 | from sklearn.metrics import mean_squared_error as mse
10 | import xgboost as xgb
11 | 
12 | 
13 | def runXGB(train_X, train_y, test_X, test_y=None):
14 |         params = {}
15 |         params["objective"] = "reg:linear"
16 |         params["eta"] = 0.002
17 |         params["min_child_weight"] = 1
18 |         params["subsample"] = 0.9
19 |         params["colsample_bytree"] = 0.8
20 |         params["silent"] = 1
21 |         params["max_depth"] = 8
22 |         params["seed"] = 1
23 |         plst = list(params.items())
24 |         num_rounds = 900
25 | 
26 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
27 |         xgtest = xgb.DMatrix(test_X)
28 |         model = xgb.train(plst, xgtrain, num_rounds)
29 |         pred_test_y = model.predict(xgtest)
30 |         return pred_test_y
31 | 
32 | def rmse(act_y, pred_y):
33 | 	return np.sqrt(mse(act_y, pred_y))
34 | 
35 | if __name__ == "__main__":
36 |         data_path = "../Data/"
37 |         train_file = data_path + "Train_KQyJ5eh.csv"
38 |         test_file = data_path + "Test_HmLwURQ.csv"
39 | 
40 | 	train_df = pd.read_csv(train_file)
41 | 	test_df = pd.read_csv(test_file)
42 | 
43 |         print "Converting to date format"
44 |         train_df["Date_mod"] = (pd.to_datetime(train_df["Date"], format="%d-%b-%y"))
45 |         test_df["Date_mod"] = (pd.to_datetime(test_df["Date"], format="%d-%b-%y"))
46 | 
47 | 	train_y = np.array(train_df.Number_SKU_Sold.values)
48 | 	train_y[train_y > 20000000] = 20000000
49 | 	test_id = test_df.Date.values
50 | 
51 |         print "Processing Dates.."
52 |         train_df["DayOfMonth"] = train_df["Date_mod"].apply(lambda x: x.day)
53 |         test_df["DayOfMonth"] = test_df["Date_mod"].apply(lambda x: x.day)
54 |         train_df["Month"] = train_df["Date_mod"].apply(lambda x: x.month)
55 |         test_df["Month"] = test_df["Date_mod"].apply(lambda x: x.month)
56 |         #train_df["Year"] = train_df["Date"].apply(lambda x: x.year)
57 |         #test_df["Year"] = test_df["Date"].apply(lambda x: x.year)
58 | 	#train_df["Hour"] = train_df["Date"].apply(lambda x: x.hour)
59 |         #test_df["Hour"] = test_df["Date"].apply(lambda x: x.hour)
60 | 	train_df["WeekDay"] = train_df["Date_mod"].apply(lambda x: x.weekday())
61 | 	test_df["WeekDay"] = test_df["Date_mod"].apply(lambda x: x.weekday())
62 |         #train_df["WeekNo"] = train_df["Date_mod"].apply(lambda x: x.isocalendar()[1])
63 |         #test_df["WeekNo"] = test_df["Date_mod"].apply(lambda x: x.isocalendar()[1])
64 |         train_df["DayOfYear"] = train_df["Date_mod"].apply(lambda x: x.timetuple().tm_yday)
65 |         test_df["DayOfYear"] = test_df["Date_mod"].apply(lambda x: x.timetuple().tm_yday)
66 | 	train_df["DayCount"] = train_df["Date_mod"].apply(lambda x: x.toordinal())
67 |         test_df["DayCount"] = test_df["Date_mod"].apply(lambda x: x.toordinal())
68 | 	
69 | 	
70 | 
71 | 	train_df.drop(["Date_mod","Date","Number_SKU_Sold"], axis=1, inplace=True)
72 | 	test_df.drop(["Date_mod","Date"], axis=1, inplace=True)
73 | 
74 | 	print train_df.shape, test_df.shape
75 | 	print train_df.head()
76 | 	print test_df.head()
77 | 
78 | 	preds_xgb = runXGB(np.array(train_df)[299:,:], train_y[299:], np.array(test_df))
79 | 
80 | 	
81 | 	reg = lm.LinearRegression()
82 | 	reg.fit(np.array(train_df)[:,:], train_y[:])
83 | 	preds_lm = reg.predict( np.array(test_df))
84 | 
85 | 	train_y[train_y > 15000000] = 15000000
86 | 	preds = 0.8*preds_xgb + 0.2*preds_lm
87 | 
88 | 	preds[357] = 70000000
89 | 
90 | 	# Saving the predictions #
91 |         sample = pd.read_csv(data_path + "Sample_Submission_6FjDs3p.csv")
92 |         sample["Number_SKU_Sold"] = preds
93 |         sample.to_csv("sub.csv", index=False)
94 | 


--------------------------------------------------------------------------------
/AV_MiniHack2_SimpleBuy/readme.md:
--------------------------------------------------------------------------------
1 | Codes for the AV Time Series Mini Hack - SimpleBuy Sales 
2 | 


--------------------------------------------------------------------------------
/AV_SmartRecruits/finalModel.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import csv
  3 | import operator
  4 | import pandas as pd
  5 | import numpy as np
  6 | from sklearn import preprocessing
  7 | import xgboost as xgb
  8 | from sklearn.metrics import roc_auc_score
  9 | from sklearn.cross_validation import KFold
 10 | 
 11 | data_path = "../input/"
 12 | train_file_name = "Train_pjb2QcD.csv"
 13 | test_file_name = "Test_wyCirpO.csv"
 14 | 
 15 | def getCountVar(compute_df, count_df, var_name, count_var="Manager_Num_Application"):
 16 |         grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')
 17 |         grouped_df.columns = [var_name, "var_count"]
 18 |         merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
 19 |         merged_df.fillna(-1, inplace=True)
 20 |         return list(merged_df["var_count"])
 21 | 
 22 | def create_feature_map(features):
 23 |         outfile = open('xgb.fmap', 'w')
 24 |         for i, feat in enumerate(features):
 25 |                 outfile.write('{0}\t{1}\tq\n'.format(i,feat))
 26 |         outfile.close()
 27 | 
 28 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0):
 29 |         params = {}
 30 |         params["objective"] = "binary:logistic"
 31 |         params['eval_metric'] = 'auc'
 32 |         params["eta"] = 0.01 #0.00334
 33 |         params["min_child_weight"] = 1
 34 |         params["subsample"] = 0.8
 35 |         params["colsample_bytree"] = 0.3
 36 |         params["silent"] = 1
 37 |         params["max_depth"] = 6
 38 |         params["seed"] = seed_val
 39 |         #params["max_delta_step"] = 2
 40 |         #params["gamma"] = 0.5
 41 |         num_rounds = 1000 #2500
 42 | 
 43 |         plst = list(params.items())
 44 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
 45 | 
 46 |         if test_y is not None:
 47 |                 xgtest = xgb.DMatrix(test_X, label=test_y)
 48 |                 watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
 49 |                 model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=500)
 50 |         else:
 51 |                 xgtest = xgb.DMatrix(test_X)
 52 |                 model = xgb.train(plst, xgtrain, num_rounds)
 53 | 
 54 |         if feature_names:
 55 |                         create_feature_map(feature_names)
 56 |                         model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
 57 |                         importance = model.get_fscore(fmap='xgb.fmap')
 58 |                         importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
 59 |                         imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
 60 |                         imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
 61 |                         imp_df.to_csv("imp_feat.txt", index=False)
 62 | 
 63 |         pred_test_y = model.predict(xgtest)
 64 | 
 65 |         if test_y is not None:
 66 |                 loss = roc_auc_score(test_y, pred_test_y)
 67 |                 print loss
 68 |         	return pred_test_y, loss
 69 | 	else:
 70 | 		return pred_test_y
 71 | 
 72 | 
 73 | if __name__ == "__main__":
 74 | 	print "Reading files.."
 75 |         train = pd.read_csv(data_path + train_file_name)
 76 |         test = pd.read_csv(data_path + test_file_name)
 77 |         print train.shape, test.shape
 78 | 
 79 | 	print "Rank vars.."
 80 |         prev_date = 0
 81 |         count_dict = {}
 82 |         for name, row in train.iterrows():
 83 |                 count_dict[ row["Application_Receipt_Date"] ] = count_dict.get(row["Application_Receipt_Date"],0) + 1
 84 |         for name, row in test.iterrows():
 85 |                 count_dict[ row["Application_Receipt_Date"] ] = count_dict.get(row["Application_Receipt_Date"],0) + 1
 86 | 
 87 |         prev_date = 0
 88 |         rank_list = []
 89 |         count_list = []
 90 |         rankpct_list = []
 91 |         for name, row in train.iterrows():
 92 |                 date_value = row["Application_Receipt_Date"]
 93 |                 if date_value != prev_date:
 94 |                         rank = 1
 95 |                         prev_date = date_value
 96 |                 else:
 97 |                         rank += 1
 98 |                 rank_list.append( rank )
 99 |                 count_list.append( count_dict[date_value] )
100 |                 rankpct_list.append( float(rank) / count_dict[date_value] )
101 |         train["dayrank"] = rank_list[:]
102 |         train["daycount"] = count_list[:]
103 |         train["dayrankpct"] = rankpct_list[:]
104 | 
105 |         prev_date = 0
106 |         rank_list = []
107 |         count_list = []
108 |         rankpct_list = []
109 |         for name, row in test.iterrows():
110 |                 date_value = row["Application_Receipt_Date"]
111 |                 if date_value != prev_date:
112 |                         rank = 1
113 |                         prev_date = date_value
114 |                 else:
115 |                         rank += 1
116 |                 rank_list.append( rank )
117 |                 count_list.append( count_dict[date_value] )
118 |                 rankpct_list.append( float(rank) / count_dict[date_value] )
119 |         test["dayrank"] = rank_list[:]
120 |         test["daycount"] = count_list[:]
121 |         test["dayrankpct"] = rankpct_list[:]
122 |         print train.dayrank.describe()
123 |         print test.dayrank.describe()
124 | 
125 |         print "Getting DV and ID.."
126 |         train_y = train.Business_Sourced.values
127 |         train_ID = train.ID.values
128 |         test_ID = test.ID.values
129 | 
130 | 	print "New feats.."
131 | 	print "Some more features.."
132 |         new_feats = ["DOJ_DOB", "DOB_Applicant_Gender", "DOB_Qualification", "DOB_Gender_Qual"] 
133 |         train["DOJ_DOB"] = train["Manager_DOJ"].astype('str') + "_" + train["Manager_DoB"].astype('str')
134 |         train["DOB_Applicant_Gender"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Gender"].astype('str')
135 |         train["DOB_Qualification"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Qualification"].astype('str')
136 | 	train["DOB_Gender_Qual"] = train["Manager_DoB"].astype('str') + "_" + train["Applicant_Gender"].astype('str') + "_" + train["Applicant_Qualification"].astype('str')
137 |         test["DOJ_DOB"] = test["Manager_DOJ"].astype('str') + "_" + test["Manager_DoB"].astype('str')
138 |         test["DOB_Applicant_Gender"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Gender"].astype('str')
139 |         test["DOB_Qualification"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Qualification"].astype('str')
140 | 	test["DOB_Gender_Qual"] = test["Manager_DoB"].astype('str') + "_" + test["Applicant_Gender"].astype('str') + "_" + test["Applicant_Qualification"].astype('str')
141 | 
142 | 	print "Label encoding.."
143 | 	cat_columns = ["Applicant_Gender", "Applicant_Marital_Status", "Applicant_Occupation", "Applicant_Qualification", "Manager_Joining_Designation", "Manager_Current_Designation", "Manager_Status", "Manager_Gender"]
144 | 	for f in cat_columns + new_feats:
145 |                         print(f), len(np.unique(train[f].values))
146 |                         lbl = preprocessing.LabelEncoder()
147 |                         lbl.fit(list(train[f].values) + list(test[f].values))
148 |                         train[f] = lbl.transform(list(train[f].values))
149 |                         test[f] = lbl.transform(list(test[f].values))
150 |                         new_train = pd.concat([ train[['Manager_Num_Application',f]], test[['Manager_Num_Application',f]] ])
151 |                         train["CountVar_"+str(f)] = getCountVar(train[['Manager_Num_Application',f]], new_train[['Manager_Num_Application', f]], f)
152 |                         test["CountVar_"+str(f)] = getCountVar(test[['Manager_Num_Application',f]], new_train[['Manager_Num_Application',f]], f)
153 | 
154 | 	print "Working on dates.."
155 | 	for date_col in ["Application_Receipt_Date", "Applicant_BirthDate", "Manager_DOJ", "Manager_DoB"]:
156 | 		print date_col
157 | 		train[date_col].fillna("1/1/1900", inplace=True)
158 | 		test[date_col].fillna("1/1/1900", inplace=True)
159 | 		train[date_col] = (pd.to_datetime(train[date_col], format="%m/%d/%Y"))
160 |         	test[date_col] = (pd.to_datetime(test[date_col], format="%m/%d/%Y"))
161 | 		train[date_col] = train[date_col].apply(lambda x: x.toordinal())
162 | 	        test[date_col] = test[date_col].apply(lambda x: x.toordinal())
163 | 
164 | 	dev_index = np.where(train["Application_Receipt_Date"]<=733100)[0]
165 | 	val_index = np.where(train["Application_Receipt_Date"]>733100)[0]
166 | 	print "Dropping unwanted cols.."
167 | 	drop_cols = []
168 | 	train.drop(["ID", "Business_Sourced"]+drop_cols, axis=1, inplace=True)
169 | 	test.drop(["ID"] + drop_cols, axis=1, inplace=True)
170 | 
171 | 	print "Fill NA.."
172 | 	train.fillna(-999, inplace=True)
173 | 	test.fillna(-999, inplace=True)
174 | 
175 | 	print "New features.."
176 |         train["Manager_Business2"] = train["Manager_Business"] - train["Manager_Business2"]
177 |         test["Manager_Business2"] = test["Manager_Business"] - test["Manager_Business2"]
178 |         train["Manager_Num_Products2"] = train["Manager_Num_Products"] - train["Manager_Num_Products2"]
179 |         test["Manager_Num_Products2"] = test["Manager_Num_Products"] - test["Manager_Num_Products2"]
180 | 
181 | 	print "Converting to array.."
182 | 	feat_names = list(train.columns)
183 | 	train = np.array(train)
184 | 	test = np.array(test)
185 | 	print train.shape, test.shape
186 | 	assert train.shape[1] == test.shape[1]
187 | 
188 | 	full_preds = 0
189 | 	for rs in [1, 1343, 445234]:
190 | 	        preds = runXGB(train, train_y, test, feature_names=feat_names, seed_val = rs)
191 | 		full_preds += preds
192 | 	full_preds /= 3.
193 | 	
194 | 	out_df = pd.DataFrame({"ID":test_ID})
195 | 	out_df["Business_Sourced"] = full_preds
196 | 	out_df.to_csv("final.csv", index=False)
197 | 


--------------------------------------------------------------------------------
/AV_SmartRecruits/readme.md:
--------------------------------------------------------------------------------
1 | Codes for AV hackathon - The Smart Recruits
2 | 


--------------------------------------------------------------------------------
/AV_TheSeersAccuracy/createFeatures.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import pandas as pd
  3 | import numpy as np
  4 | import datetime
  5 | from sklearn.preprocessing import LabelEncoder
  6 | 
  7 | def getFeatures(df, dv_list=set(), start_date=datetime.datetime(2006,1,1)):
  8 | 	grouped_df = df.groupby("Client_ID")
  9 | 	for name, group in grouped_df:
 10 | 		#if group.shape[0] < 2:
 11 | 		#	continue
 12 | 		#print name
 13 | 		out = [name]
 14 | 		#print group
 15 | 
 16 | 		# time since last transaction #
 17 | 		max_date = max(group["Transaction_Date"])
 18 | 		out.append( (start_date - max_date).days )
 19 | 
 20 | 		# Number of transactions #
 21 | 		out.append(group.shape[0])
 22 | 
 23 | 		# Mean EMI #
 24 | 		out.append( np.mean(group["Number_of_EMI"]) )
 25 | 
 26 | 		# Mean var1 #
 27 | 		out.append( np.mean(group["Var1"]) )
 28 | 
 29 | 		# Mean Var2 #
 30 | 		out.append( np.mean(group["Var2"]) )
 31 | 
 32 | 		# Mean Var3 #
 33 |                 out.append( np.mean(group["Var3"]) )
 34 | 
 35 | 		# Mean Transaction_Amount #
 36 | 		out.append( np.mean(group["Transaction_Amount"]) )
 37 | 
 38 | 		# Mean Purchased_in_Sale #
 39 | 		out.append( np.mean(group["Purchased_in_Sale"]) )
 40 | 
 41 | 		# get last purchase #
 42 | 		last_purchase = group[group["Transaction_Date"] == max_date]
 43 | 		#print "Last Purchase is : ", last_purchase
 44 | 
 45 | 		# last purchase in sale #
 46 | 		out.append( int(last_purchase["Purchased_in_Sale"].iloc[-1]) )
 47 | 
 48 | 		# last EMI #
 49 | 		out.append( int(last_purchase["Number_of_EMI"].iloc[-1]) )
 50 | 
 51 | 		# last store #
 52 | 		out.append( int(last_purchase["Store_ID"].iloc[-1]) )
 53 | 
 54 | 		# last var1 #
 55 |                 out.append( int(last_purchase["Var1"].iloc[-1]) )
 56 | 
 57 | 		# last var2 #
 58 |                 out.append( int(last_purchase["Var2"].iloc[-1]) )
 59 | 
 60 | 		# last var3 #
 61 |                 out.append( int(last_purchase["Var3"].iloc[-1]) )
 62 | 
 63 | 		# Gender #
 64 |                 out.append( int(last_purchase["Gender"].iloc[-1]) )
 65 | 
 66 | 		# Last Referred_Friend #
 67 | 		out.append( int(last_purchase["Referred_Friend"].iloc[-1]) )
 68 | 
 69 | 		# Last SE category # 
 70 | 		out.append( int(last_purchase["Sales_Executive_Category"].iloc[-1]) )
 71 | 
 72 | 		# Last SE ID #
 73 | 		out.append( int(last_purchase["Sales_Executive_ID"].iloc[-1]) )
 74 | 		
 75 | 		# Last Lead Source #
 76 | 		out.append( int(last_purchase["Lead_Source_Category"].iloc[-1]) )
 77 | 
 78 | 		# Last Payment Mode #
 79 | 		out.append( int(last_purchase["Payment_Mode"].iloc[-1]) )
 80 | 
 81 | 		# last product category #
 82 | 		out.append( int(last_purchase["Product_Category"].iloc[-1]) )
 83 | 
 84 | 		# last transaction amount #
 85 | 		out.append( int(last_purchase["Transaction_Amount"].iloc[-1]) )
 86 | 
 87 | 		# time since first transaction #
 88 | 		min_date = min(group["Transaction_Date"])
 89 |                 out.append( (start_date - min_date).days )
 90 | 
 91 | 		# total time #
 92 | 		out.append((max_date - min_date).days)
 93 | 
 94 | 		# frequency #
 95 | 		out.append( (max_date - min_date).days / float(group.shape[0]) )
 96 | 
 97 | 		# number of unique stores visited #
 98 | 		out.append( len( np.unique( group["Store_ID"] )) )
 99 | 
100 | 		# number of unique purchased in sale #
101 |                 out.append( len( np.unique( group["Purchased_in_Sale"] )) )
102 | 
103 | 		# number of unique var1 #
104 | 		out.append( len( np.unique( group["Var1"] )) )
105 | 
106 | 		# number of unique var2 #
107 |                 out.append( len( np.unique( group["Var2"] )) )
108 | 
109 | 		# number of unique var3 #
110 |                 out.append( len( np.unique( group["Var3"] )) )
111 | 
112 | 		# number of unique SE id #
113 | 		out.append( len( np.unique( group["Sales_Executive_ID"] )) )
114 | 	
115 | 		# number of unique SE cat #
116 | 		out.append( len( np.unique( group["Sales_Executive_Category"] )) )
117 | 
118 | 		# number of unique LS cat #
119 |                 out.append( len( np.unique( group["Lead_Source_Category"] )) )
120 | 		
121 | 		# number of unique paymenr mode #
122 | 		out.append( len( np.unique( group["Payment_Mode"])) )
123 | 
124 | 		# number of unique product category #
125 | 		out.append( len( np.unique( group["Product_Category"])) )
126 | 	
127 | 		# getting year of birth #
128 |                 yob = int((last_purchase["DOB"].iloc[-1]).year)
129 | 		if yob > 2000:
130 | 			yob = yob-100
131 | 		out.append(yob)	
132 | 
133 | 		# number of unique dob #
134 |                 out.append( len( np.unique( group["DOB"])) )		
135 | 
136 | 		# number of purchases in last one year #
137 | 		yop = (start_date.year - 1)
138 | 		temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) )
139 | 		out.append(sum(temp_arr))
140 | 		out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) )
141 | 
142 | 		# number of purchases in last two years #
143 |                 yop = (start_date.year - 2)
144 |                 temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) )
145 |                 out.append(sum(temp_arr))
146 | 		out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) )
147 | 
148 | 		# number of purchases in last three years #
149 |                 yop = (start_date.year - 3)
150 |                 temp_arr = np.array( group["Transaction_Date"].apply(lambda x: int(x.year>=yop)) )
151 |                 out.append(sum(temp_arr))
152 | 		out.append( np.sum( np.array(group["Transaction_Amount"]) * temp_arr ) )
153 | 
154 | 		# DV #
155 | 		if name in dv_list:
156 | 			out.append(1)
157 | 		else:
158 | 			out.append(0)
159 | 
160 | 		yield out
161 | 
162 | if __name__ == "__main__":
163 | 	train = pd.read_csv("../Data/dev.csv")
164 | 	repeat_clients = set(np.unique(pd.read_csv("../Data/val.csv")["Client_ID"]))
165 | 	print len(repeat_clients)
166 | 	test = pd.read_csv("../Data/Train_seers_accuracy.csv")
167 | 
168 | 	print "Label Encoding.."
169 |         for var in test.columns:
170 |                 if test[var].dtypes == object :
171 | 			if var in ["Transaction_Date", "DOB"]:
172 | 				continue
173 |                         print var
174 |                         lb = LabelEncoder()
175 |                         full_var_data = pd.concat((train[var],test[var]),axis=0).astype('str')
176 |                         lb.fit(np.array(full_var_data))
177 |                         train[var] = lb.transform(np.array( train[var] ).astype('str'))
178 |                         test[var] = lb.transform(np.array( test[var] ).astype('str'))
179 | 
180 | 	train["Transaction_Date"] = pd.to_datetime(train["Transaction_Date"], format="%d-%b-%y")
181 | 	test["Transaction_Date"] = pd.to_datetime(test["Transaction_Date"], format="%d-%b-%y")
182 | 	print min(train["Transaction_Date"])
183 | 	print max(train["Transaction_Date"])
184 | 	train["DOB"] = pd.to_datetime(train["DOB"], format="%d-%b-%y")
185 |         test["DOB"] = pd.to_datetime(test["DOB"], format="%d-%b-%y")
186 |         print min(train["DOB"])
187 |         print max(train["DOB"])
188 | 
189 | 	print "Processing train.."
190 | 	out_file = open("train_features3.csv","w")
191 | 	writer = csv.writer(out_file)
192 | 	header = ["Client_ID", "TimeSinceLastTrans", "NumberOfTrans", "MeanEMI", "MeanVar1", "MeanVar2", "MeanVar3", "MeanTransactionAmount", "MeanPurchasedInSale", "LastPurchasedInSale", "LastEMI", "LastStoreID", "LastVar1", "LastVar2", "LastVar3", "Gender", "LastReferredFriend", "LastSECat", "LastSEID", "LastLeadSource", "LastPayMode", "LastProdCat", "LastTransAmt", "TimeSinceFirstTrans", "TotalTime", "FreqTrans", "NumUniqueStore", "NumUniPurchasedInSale", "NumUniVar1", "NumUniVar2", "NumUniVar3", "NumUniSEID", "NumUniSECat", "NumUniLScat", "NumUniPayMode", "NumUniProdCat", "YoB", "NumUniDOB", "Last1YCount", "Last1YTA", "Last2YCount", "Last2YTA", "Last3YCount", "Last3YTA", "DV"]
193 | 	len_header = len(header)
194 | 	writer.writerow(header)
195 | 	count = 0
196 | 	for feature_list in getFeatures(train, repeat_clients, start_date=datetime.datetime(2006,1,1)):
197 | 		assert len_header == len(feature_list)
198 | 		writer.writerow( feature_list )
199 | 		#break
200 | 		count +=1
201 | 		if count%10000 == 0:
202 | 			print count
203 | 	out_file.close()
204 | 
205 | 	print "Processing test..."
206 | 	out_file = open("test_features3.csv","w")
207 |         writer = csv.writer(out_file)
208 | 	#header = ["Client_ID", "TimeSinceLastTrans", "NumberOfTrans", "MeanEMI", "MeanVar1", "MeanVar2", "MeanVar3", "MeanTransactionAmount", "MeanPurchasedInSale", "LastPurchasedInSale", "DV"]
209 | 	#len_header = len(header)
210 |         writer.writerow(header)
211 | 	count = 0
212 |         for feature_list in getFeatures(test, start_date=datetime.datetime(2007,1,1)):
213 | 		assert len_header == len(feature_list)
214 |                 writer.writerow( feature_list )
215 | 		count += 1
216 | 		if count%10000 == 0:
217 |                         print count
218 |         out_file.close()
219 | 


--------------------------------------------------------------------------------
/AV_TheSeersAccuracy/finalModel.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | import pandas as pd
 3 | import numpy as np
 4 | import datetime
 5 | import operator
 6 | from sklearn import preprocessing
 7 | from sklearn.cross_validation import KFold
 8 | from sklearn import ensemble
 9 | from sklearn.metrics import roc_auc_score,log_loss
10 | import xgboost as xgb
11 | import random
12 | 
13 | def create_feature_map(features):
14 |         outfile = open('xgb.fmap', 'w')
15 |         for i, feat in enumerate(features):
16 |                 outfile.write('{0}\t{1}\tq\n'.format(i,feat))
17 |         outfile.close()
18 | 
19 | def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, round_val=1650):
20 |         params = {}
21 |         params["objective"] = "binary:logistic"
22 |         params['eval_metric'] = 'auc'
23 |         params["eta"] = 0.01 
24 |         params["min_child_weight"] = 2
25 |         params["subsample"] = 0.55
26 |         params["colsample_bytree"] = 0.9
27 |         params["silent"] = 1
28 |         params["max_depth"] = 4
29 |         params["seed"] = seed_val
30 |         params["max_delta_step"] = 2
31 |         num_rounds = round_val
32 | 
33 |         plst = list(params.items())
34 |         xgtrain = xgb.DMatrix(train_X, label=train_y)
35 | 
36 |         if test_y is not None:
37 |                 xgtest = xgb.DMatrix(test_X, label=test_y)
38 |                 watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
39 |                 model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=10000)
40 |         else:
41 |                 xgtest = xgb.DMatrix(test_X)
42 |                 model = xgb.train(plst, xgtrain, num_rounds)
43 | 
44 |         if feature_names:
45 |                         create_feature_map(feature_names)
46 |                         model.dump_model('xgbmodel.txt', 'xgb.fmap', with_stats=True)
47 |                         importance = model.get_fscore(fmap='xgb.fmap')
48 |                         importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
49 |                         imp_df = pd.DataFrame(importance, columns=['feature','fscore'])
50 |                         imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
51 |                         imp_df.to_csv("imp_feat.txt", index=False)
52 | 
53 |         pred_test_y = model.predict(xgtest)
54 | 
55 |         if test_y is not None:
56 |                 loss = roc_auc_score(test_y, pred_test_y)
57 |                 print loss
58 | 
59 | 	        return pred_test_y, loss
60 | 	else:
61 | 		return pred_test_y
62 | 
63 | 
64 | if __name__ == "__main__":
65 | 	print "Reading csv.."
66 | 	otrain = pd.read_csv("./train_features3.csv")
67 | 	otest = pd.read_csv("./test_features3.csv")
68 | 	print otrain.shape, otest.shape
69 | 
70 | 	print "Getting DV.."
71 | 	train_y = np.array( otrain.DV.values )
72 | 	train_id = np.array( otrain.Client_ID.values )
73 | 	test_id = np.array( otest.Client_ID.values )
74 | 
75 | 	print "Dropping.."
76 | 	otrain = otrain.drop(['DV'], axis=1)
77 |         otest = otest.drop(["DV"], axis=1)
78 | 
79 | 	use_cols = ['Client_ID', 'TimeSinceLastTrans', 'NumberOfTrans', 'MeanEMI', 'MeanVar1', 'MeanVar2', 'MeanVar3', 'MeanTransactionAmount', 'MeanPurchasedInSale', 'LastPurchasedInSale', 'LastEMI', 'LastStoreID', 'LastVar1', 'LastVar2', 'Gender', 'LastReferredFriend', 'LastSECat', 'LastSEID', 'LastLeadSource', 'LastPayMode', 'LastProdCat', 'LastTransAmt'] 
80 | 	train = otrain[use_cols]
81 | 	test = otest[use_cols]
82 | 
83 | 	feat_names = list(train.columns)
84 |         print "Converting to array.."
85 |         train = np.array(train).astype('float')
86 |         test = np.array(test).astype('float')
87 |         print train.shape, test.shape
88 | 
89 | 	assert train.shape[1] == test.shape[1]
90 | 	print "Final Model.."
91 | 	preds = runXGB(train, train_y, test, seed_val=0, round_val=1200)
92 | 
93 | 	out_df = pd.DataFrame({"Client_ID":test_id})
94 | 	out_df["Cross_Sell"] = preds
95 | 	out_df.to_csv("submission.csv", index=False)
96 | 	
97 | 
98 | 


--------------------------------------------------------------------------------
/AV_TheSeersAccuracy/readme.md:
--------------------------------------------------------------------------------
 1 | Code for the Hackathon - [The Seers Accuracy](http://datahack.analyticsvidhya.com/contest/the-seers-accuracy) by [Analytics Vidhya](http://www.analyticsvidhya.com/)
 2 | 
 3 | ####Objective
 4 | The objective of the competition is to predict whether the customer will come back in the next one year or not.
 5 | 
 6 | ####Approach
 7 | We had transaction data of all the customers from Jan 2003 to Dec 2006. The idea is to predict whether the customer will come back in 2007 or not. 
 8 | 
 9 | 1. The first step was to create a proper validation framework since there was no "target" variable
10 | 2. I have used transaction data from 2003 to 2005 to create the features. People who came back in 2006 were tagged as 1 and others were tagged as 0, thereby getting the target column
11 | 3. Feature selection, models tuning were done using this validation sample.
12 | 4. For the final model, features were created using all the given data (2003 to 2006) and prediction was done for 2007.
13 | 5. People were using different types of approaches as well. [Vopani](https://github.com/rohanrao91/AnalyticsVidhya_SeersAccuracy) followed a two stage validation approach using both 2005 and 2006 as validation samples.
14 | 
15 | ####Codes 
16 | ######splitDevVal.py
17 | Code to split the data into development(2003 to 2005 data) and validation sample(2006 data) 
18 | 
19 | ######createFeatures.py
20 | Code to create the features from the given input dataset for both validation and final model
21 | 
22 | ######finalModel.py
23 | Code to get the final submission file
24 | 


--------------------------------------------------------------------------------
/AV_TheSeersAccuracy/splitDevVal.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | from datetime import datetime
 3 | 
 4 | with open("../Data/Train_seers_accuracy.csv") as train_file:
 5 |         dev_file = open("../Data/dev.csv","w")
 6 |         val_file = open("../Data/val.csv","w")
 7 | 
 8 |         dev_writer = csv.writer(dev_file)
 9 |         val_writer = csv.writer(val_file)
10 | 
11 |         reader = csv.reader(train_file)
12 |         header = reader.next()
13 |         dev_writer.writerow(header)
14 |         val_writer.writerow(header)
15 |         date_index = header.index("Transaction_Date")
16 | 
17 |         dev_counter = 0
18 |         val_counter = 0
19 |         total_counter = 0
20 |         for row in reader:
21 |                 #print row
22 |                 date_val = datetime.strptime(row[date_index], "%d-%b-%y")
23 |                 if date_val.year == 2006:
24 |                         val_writer.writerow(row)
25 |                         val_counter += 1
26 |                 else:
27 |                         dev_writer.writerow(row)
28 |                         dev_counter += 1
29 |                 total_counter += 1
30 |                 if total_counter % 10000 == 0:
31 |                         print total_counter, dev_counter, val_counter
32 | 
33 |         dev_file.close()
34 |         val_file.close()
35 | 
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 SudalaiRajkumar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Machine Learning
2 | 
3 | Codes related to various Machine Learning Hackathons.
4 | 
5 | 
6 | 


--------------------------------------------------------------------------------