├── Allstate Claims Severity ├── MyScript.py ├── script0.py ├── script1.py ├── script2.py ├── script3.py ├── script4.py └── script_keras.py ├── README.md └── Santander Product Recommendation ├── Others ├── Rule_main.py ├── code.py └── xgb_v1.py ├── ensemble.py ├── feature_combine.py ├── feature_extract_v1.py ├── feature_extract_v2.py ├── prepro.py ├── xgb_fast.py └── xgb_script.py /Allstate Claims Severity/MyScript.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import datetime 5 | import itertools 6 | from scipy.stats import boxcox 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.cross_validation import KFold 9 | from sklearn.metrics import mean_absolute_error 10 | from sklearn import preprocessing 11 | 12 | pd.options.mode.chained_assignment = None 13 | 14 | multi_corr = [79, 80, 81, 87, 89, 90, 101, 103, 111] 15 | two_corr = [2, 3, 9, 10, 11, 12, 13, 23, 36, 57, 72] 16 | multi_cat_diff = [90, 92, 96, 99, 101, 102, 103, 106, 109, 110, 113, 114, 116] 17 | skewed_num = [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 18 | cat2corr = [(29, 30), (40, 41), (43, 45), (55, 56), (8, 65), (8, 66), (104, 106)] 19 | two_avg1 = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 16, 23, 24, 25, 26, 27, 28, 36, 38, 40, 44, 50, 53, 57, 72, 73, 20 | 76, 79, 80, 81, 82, 87, 89, 90, 103, 111] 21 | 22 | 23 | def logregobj(preds, dtrain): 24 | labels = dtrain.get_label() 25 | con = 2 26 | x = preds - labels 27 | grad = con * x / (np.abs(x) + con) 28 | hess = con ** 2 / (np.abs(x) + con) ** 2 29 | return grad, hess 30 | 31 | 32 | def evalerror(preds, dtrain): 33 | labels = dtrain.get_label() 34 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 35 | 36 | 37 | def encode(charcode): 38 | r = 0 39 | ln = len(str(charcode)) 40 | for i in range(ln): 41 | r += (ord(str(charcode)[i]) - ord('A')) 42 | return r + 1 43 | 44 | 45 | def prepro(train, test, cont_feature): 46 | joined = pd.concat((train, test)).reset_index(drop=True) 47 | skewed_feats = ['cont' + str(i) for i in skewed_num] 48 | for feats in skewed_feats: 49 | joined[feats] = joined[feats] + 1 50 | joined[feats], lam = boxcox(joined[feats]) 51 | 52 | multi_diff_feats = ['cat' + str(i) for i in multi_cat_diff] 53 | for column in multi_diff_feats: 54 | set_train = set(train[column].unique()) 55 | set_test = set(test[column].unique()) 56 | remove_train = set_train - set_test 57 | remove_test = set_test - set_train 58 | remove = remove_train.union(remove_test) 59 | 60 | def filter_cat(x): 61 | if x in remove: 62 | return np.nan 63 | return x 64 | 65 | joined[column] = joined[column].apply(lambda x: filter_cat(x), 1) 66 | 67 | ss = StandardScaler() 68 | joined[cont_feature] = ss.fit_transform(joined[cont_feature].values) 69 | del train, test 70 | return joined 71 | 72 | 73 | def feature_extract(joined, cont_feature): 74 | features = pd.DataFrame() 75 | features['id'] = joined['id'] 76 | features['loss'] = np.log(joined['loss'] + 200) 77 | 78 | cat_sel = [n for n in joined.columns if n.startswith('cat')] 79 | for column in cat_sel: 80 | features[column] = pd.factorize(joined[column].values, sort=True)[0] + 1 81 | 82 | for column in cont_feature: 83 | features[column] = joined[column] 84 | 85 | features['cont_avg'] = joined[cont_feature].mean(axis=1) 86 | features['cont_min'] = joined[cont_feature].min(axis=1) 87 | features['cont_max'] = joined[cont_feature].max(axis=1) 88 | 89 | for i in [20, 40, 73]: 90 | cat_feats = ['cat' + 
str(i) for i in range(1, i)] 91 | idx = 'cat_' + 'sum_' + str(i) 92 | features[idx + '_A'] = joined[cat_feats].apply(lambda x: sum(x == 'A'), axis=1) 93 | features[idx + '_B'] = joined[cat_feats].apply(lambda x: sum(x == 'B'), axis=1) 94 | 95 | cat2_feats = [('cat' + str(i), 'cat' + str(j)) for (i, j) in cat2corr] 96 | for feat1, feat2 in cat2_feats: 97 | feat_comb = feat1 + '_' + feat2 98 | features[feat_comb] = joined[feat1] + joined[feat2] 99 | features[feat_comb] = features[feat_comb].apply(encode) 100 | 101 | cat2avg_feats = ['cat' + str(i) for i in two_avg1] 102 | for feat1, feat2 in itertools.combinations(cat2avg_feats, 2): 103 | feat_comb = feat1 + '_' + feat2 104 | features[feat_comb] = joined[feat1] + joined[feat2] 105 | features[feat_comb] = features[feat_comb].apply(encode) 106 | 107 | train = features[features['loss'].notnull()] 108 | test = features[features['loss'].isnull()] 109 | del features, joined 110 | return train, test 111 | 112 | 113 | def ceate_feature_map(features): 114 | outfile = open('xgb.fmap', 'w') 115 | i = 0 116 | for feat in features: 117 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 118 | i = i + 1 119 | outfile.close() 120 | 121 | 122 | def feature_select(train, test): 123 | import operator 124 | params = { 125 | 'min_child_weight': 100, 126 | 'eta': 0.02, 127 | 'colsample_bytree': 0.7, 128 | 'max_depth': 12, 129 | 'subsample': 0.7, 130 | 'alpha': 1, 131 | 'gamma': 1, 132 | 'silent': 1, 133 | 'objective': 'reg:linear', 134 | 'verbose_eval': True, 135 | 'seed': 12 136 | } 137 | rounds = 300 138 | y = train['loss'] 139 | X = train.drop(['loss', 'id'], 1) 140 | 141 | xgtrain = xgb.DMatrix(X, label=y) 142 | bst = xgb.train(params, xgtrain, num_boost_round=rounds) 143 | 144 | feats = [x for x in train.columns if x not in ['id', 'loss']] 145 | print len(feats) 146 | outfile = open('xgb.fmap', 'w') 147 | i = 0 148 | for feat in feats: 149 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 150 | i = i + 1 151 | outfile.close() 152 | 153 | importance = bst.get_fscore(fmap='xgb.fmap') 154 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 155 | feats = [a for (a, b) in importance] 156 | feats = feats[:450] 157 | print len(feats) 158 | df = pd.DataFrame(importance, columns=['feature', 'fscore']) 159 | df['fscore'] = df['fscore'] / df['fscore'].sum() 160 | df.to_csv("../input/feat_sel/feat_importance.csv", index=False) 161 | 162 | train1 = train[['id', 'loss'] + feats] 163 | test1 = test[['id'] + feats] 164 | return train1, test1 165 | 166 | 167 | def runXGB(train, test, index, RANDOM_STATE): 168 | train_index, test_index = index 169 | y = train['loss'] 170 | X = train.drop(['loss', 'id'], 1) 171 | X_test = test.drop(['id'], 1) 172 | del train, test 173 | X_train, X_val = X.iloc[train_index], X.iloc[test_index] 174 | y_train, y_val = y.iloc[train_index], y.iloc[test_index] 175 | 176 | xgtrain = xgb.DMatrix(X_train, label=y_train) 177 | xgval = xgb.DMatrix(X_val, label=y_val) 178 | xgtest = xgb.DMatrix(X_test) 179 | X_val = xgb.DMatrix(X_val) 180 | 181 | params = { 182 | 'min_child_weight': 10, 183 | 'eta': 0.01, 184 | 'colsample_bytree': 0.7, 185 | 'max_depth': 12, 186 | 'subsample': 0.7, 187 | 'alpha': 1, 188 | 'gamma': 1, 189 | 'silent': 1, 190 | 'verbose_eval': True, 191 | 'seed': RANDOM_STATE 192 | } 193 | rounds = 3000 194 | 195 | watchlist = [(xgtrain, 'train'), (xgval, 'eval')] 196 | model = xgb.train(params, xgtrain, rounds, watchlist, obj=logregobj, feval=evalerror, early_stopping_rounds=100) 197 | 198 | cv_score = 
mean_absolute_error(np.exp(model.predict(X_val)) - 200, np.exp(y_val) - 200) 199 | predict = np.exp(model.predict(xgtest)) - 200 200 | print "iteration = %d" % (model.best_iteration) 201 | return predict, cv_score 202 | 203 | 204 | if __name__ == '__main__': 205 | 206 | Generate_or_read = 0 # 0 generate 207 | feat_sel = 1 # 1 select 208 | start_time = datetime.datetime.now() 209 | if Generate_or_read == 0: 210 | print "generate features..." 211 | train = pd.read_csv('../input/train.csv') 212 | test = pd.read_csv('../input/test.csv') 213 | test['loss'] = np.nan 214 | cont_feature = [n for n in train.columns if n.startswith('cont')] 215 | joined = prepro(train, test, cont_feature) 216 | train, test = feature_extract(joined, cont_feature) 217 | print train.shape, test.shape 218 | print datetime.datetime.now() - start_time 219 | if feat_sel == 1: 220 | print "feature select..." 221 | train, test = feature_select(train, test) 222 | train.to_csv("../input/feature/train.csv", index=False) 223 | test.to_csv("../input/feature/test.csv", index=False) 224 | print train.shape, test.shape 225 | print datetime.datetime.now() - start_time 226 | 227 | else: 228 | print "read features..." 229 | train = pd.read_csv("../input/feature/train.csv") 230 | test = pd.read_csv("../input/feature/test.csv") 231 | print train.shape, test.shape 232 | 233 | print "run model..." 234 | nfolds = 10 235 | RANDOM_STATE = 113 236 | ids = test['id'] 237 | predicts = np.zeros(ids.shape) 238 | kf = KFold(train.shape[0], n_folds=nfolds, shuffle=True, random_state=RANDOM_STATE) 239 | for i, index in enumerate(kf): 240 | print('Fold %d' % (i + 1)) 241 | predict, cv_score = runXGB(train, test, index, RANDOM_STATE) 242 | print cv_score 243 | predicts += predict 244 | 245 | print datetime.datetime.now() - start_time 246 | predicts = predicts / nfolds 247 | submission = pd.DataFrame() 248 | submission['id'] = ids 249 | submission['loss'] = predicts 250 | submission.to_csv('../submit/submit_xgb.csv', index=False) 251 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.metrics import mean_absolute_error 6 | 7 | SHIFT = 200 8 | 9 | 10 | def df_cleaner(df_train, df_test): 11 | cont_list = ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 12 | 'cont11', 'cont12', 'cont13', 'cont14'] 13 | 14 | ntrain = df_train.shape[0] 15 | df = pd.concat([df_train, df_test]).reset_index(drop=True) 16 | 17 | df_cat = pd.get_dummies(df.filter(regex="^cat")) 18 | scale = StandardScaler() 19 | df[cont_list] = scale.fit_transform(df[cont_list].values) 20 | 21 | df = pd.concat([df[['id', 'loss'] + cont_list], df_cat], axis=1) 22 | df_out_train = df.iloc[:ntrain, :] 23 | df_out_test = df.iloc[ntrain:, :] 24 | 25 | df_out_columns = df_out_train.loc[:, (df_out_train != 0).any(axis=0)].columns 26 | data_columns = list(df_out_columns) 27 | data_columns.remove('id') 28 | data_columns.remove('loss') 29 | return df_out_train, df_out_test, data_columns 30 | 31 | 32 | def evalerror(preds, dtrain): 33 | labels = dtrain.get_label() 34 | return 'mae', mean_absolute_error(np.exp(preds) - SHIFT, np.exp(labels) - SHIFT) 35 | 36 | 37 | if __name__ == '__main__': 38 | df_train = pd.read_csv('../input/train.csv') 39 | df_test = 
pd.read_csv('../input/test.csv') 40 | df_cleaner(df_train, df_test) 41 | train, test, features = df_cleaner(df_train, df_test) 42 | del df_train 43 | del df_test 44 | 45 | x_test = test[:][features] 46 | train['loss_logshift'] = np.log(train['loss'] + SHIFT) 47 | 48 | number_of_bagging_iterations = 10 49 | max_number_of_rounds = 1500 50 | early_stopping_rounds = 20 51 | 52 | work_dataframe = test[['id']] 53 | 54 | for i in xrange(number_of_bagging_iterations): 55 | train_slice = train[train.id % number_of_bagging_iterations != i] 56 | val_slice = train[train.id % number_of_bagging_iterations == i] 57 | 58 | x_train = train_slice[features] 59 | y_train = train_slice['loss_logshift'] 60 | 61 | x_val = val_slice[features] 62 | y_val = val_slice['loss_logshift'] 63 | 64 | model = xgb.XGBRegressor(max_depth=12, colsample_bytree=0.5, min_child_weight=1, subsample=0.8, gamma=1, 65 | n_estimators=max_number_of_rounds, learning_rate=0.1) 66 | 67 | model.fit(x_train, y_train, early_stopping_rounds=early_stopping_rounds, 68 | eval_set=[(x_train, y_train), (x_val, y_val)], eval_metric=evalerror) 69 | 70 | this_iteration_predictions = model.predict(x_test).astype(float) 71 | 72 | temp_series = pd.Series(np.exp(this_iteration_predictions) - SHIFT) 73 | work_dataframe['round' + str(i)] = temp_series.values 74 | 75 | work_dataframe['mean_values'] = work_dataframe.filter(regex="^round").mean(axis=1) 76 | work_dataframe[['id', 'mean_values']].to_csv('../input/submit_claim.csv', index=False, 77 | float_format='%.2f', header=['id', 'loss']) 78 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | 5 | from sklearn.metrics import mean_absolute_error 6 | from sklearn.cross_validation import KFold 7 | from scipy.stats import skew, boxcox 8 | from sklearn.preprocessing import StandardScaler 9 | import itertools 10 | 11 | shift = 200 12 | fair_constant = 2 13 | # COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,'\ 14 | # 'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \ 15 | # 'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \ 16 | # 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 17 | COMB_FEATURE = 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 18 | 19 | 20 | def encode(charcode): 21 | r = 0 22 | ln = len(str(charcode)) 23 | for i in range(ln): 24 | r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1) 25 | return r 26 | 27 | 28 | def fair_obj(preds, dtrain): 29 | labels = dtrain.get_label() 30 | x = (preds - labels) 31 | den = abs(x) + fair_constant 32 | grad = fair_constant * x / (den) 33 | hess = fair_constant * fair_constant / (den * den) 34 | return grad, hess 35 | 36 | 37 | def xg_eval_mae(yhat, dtrain): 38 | y = dtrain.get_label() 39 | return 'mae', mean_absolute_error(np.exp(y) - shift, np.exp(yhat) - shift) 40 | 41 | 42 | def mungeskewed(train, test, numeric_feats): 43 | ntrain = train.shape[0] 44 | test['loss'] = 0 45 | train_test = pd.concat((train, test)).reset_index(drop=True) 46 | skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 47 | skewed_feats = skewed_feats[skewed_feats > 0.25] 48 | skewed_feats = skewed_feats.index 49 | 50 | print skewed_feats 51 | for feats in skewed_feats: 52 | train_test[feats] = train_test[feats] + 1 53 | train_test[feats], lam = boxcox(train_test[feats]) 54 | return 
train_test, ntrain 55 | 56 | 57 | if __name__ == "__main__": 58 | 59 | print('Started') 60 | train = pd.read_csv('../input/train.csv') 61 | test = pd.read_csv('../input/test.csv') 62 | 63 | numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x] 64 | categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x] 65 | train_test, ntrain = mungeskewed(train, test, numeric_feats) 66 | 67 | print('') 68 | for comb in itertools.combinations(COMB_FEATURE, 2): 69 | feat = comb[0] + "_" + comb[1] 70 | train_test[feat] = train_test[comb[0]] + train_test[comb[1]] 71 | # train_test[feat] = train_test[feat].apply(encode) 72 | print('Analyzing Columns:', feat) 73 | 74 | categorical_feats = [x for x in train_test.columns[1:] if 'cat' in x] 75 | 76 | print('') 77 | for col in categorical_feats: 78 | print('Analyzing Column:', col) 79 | train_test[col] = train_test[col].apply(encode) 80 | 81 | print(train_test[categorical_feats]) 82 | 83 | ss = StandardScaler() 84 | train_test[numeric_feats] = \ 85 | ss.fit_transform(train_test[numeric_feats].values) 86 | 87 | train = train_test.iloc[:ntrain, :].copy() 88 | test = train_test.iloc[ntrain:, :].copy() 89 | 90 | print('\nMedian Loss:', train.loss.median()) 91 | print('Mean Loss:', train.loss.mean()) 92 | 93 | ids = pd.read_csv('input/test.csv')['id'] 94 | train_y = np.log(train['loss'] + shift) 95 | train_x = train.drop(['loss', 'id'], axis=1) 96 | test_x = test.drop(['loss', 'id'], axis=1) 97 | 98 | n_folds = 10 99 | cv_sum = 0 100 | early_stopping = 100 101 | fpred = [] 102 | xgb_rounds = [] 103 | 104 | d_train_full = xgb.DMatrix(train_x, label=train_y) 105 | d_test = xgb.DMatrix(test_x) 106 | 107 | kf = KFold(train.shape[0], n_folds=n_folds) 108 | for i, (train_index, test_index) in enumerate(kf): 109 | print('\n Fold %d' % (i + 1)) 110 | X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index] 111 | y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index] 112 | 113 | rand_state = 2016 114 | 115 | params = { 116 | 'seed': 0, 117 | 'colsample_bytree': 0.7, 118 | 'silent': 1, 119 | 'subsample': 0.7, 120 | 'learning_rate': 0.03, 121 | 'objective': 'reg:linear', 122 | 'max_depth': 12, 123 | 'min_child_weight': 100, 124 | 'booster': 'gbtree'} 125 | 126 | d_train = xgb.DMatrix(X_train, label=y_train) 127 | d_valid = xgb.DMatrix(X_val, label=y_val) 128 | watchlist = [(d_train, 'train'), (d_valid, 'eval')] 129 | 130 | clf = xgb.train(params, 131 | d_train, 132 | 100000, 133 | watchlist, 134 | early_stopping_rounds=50, 135 | obj=fair_obj, 136 | feval=xg_eval_mae) 137 | 138 | xgb_rounds.append(clf.best_iteration) 139 | scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit) 140 | cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val)) 141 | print('eval-MAE: %.6f' % cv_score) 142 | y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift 143 | 144 | if i > 0: 145 | fpred = pred + y_pred 146 | else: 147 | fpred = y_pred 148 | pred = fpred 149 | cv_sum = cv_sum + cv_score 150 | 151 | mpred = pred / n_folds 152 | score = cv_sum / n_folds 153 | print('Average eval-MAE: %.6f' % score) 154 | n_rounds = int(np.mean(xgb_rounds)) 155 | 156 | print("Writing results") 157 | result = pd.DataFrame(mpred, columns=['loss']) 158 | result["id"] = ids 159 | result = result.set_index("id") 160 | 161 | print("Writing submission:") 162 | result.to_csv('../submit/submit.csv', index=True, index_label='id') 163 | -------------------------------------------------------------------------------- /Allstate Claims 
Severity/script2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.cross_validation import KFold 5 | from sklearn.metrics import mean_absolute_error 6 | 7 | train = pd.read_csv('../input/train.csv') 8 | test = pd.read_csv('../input/test.csv') 9 | 10 | test['loss'] = np.nan 11 | joined = pd.concat([train, test]) 12 | 13 | 14 | def logregobj(preds, dtrain): 15 | labels = dtrain.get_label() 16 | con = 2 17 | x = preds - labels 18 | grad = con * x / (np.abs(x) + con) 19 | hess = con ** 2 / (np.abs(x) + con) ** 2 20 | return grad, hess 21 | 22 | 23 | def evalerror(preds, dtrain): 24 | labels = dtrain.get_label() 25 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 26 | 27 | 28 | cat_feature = [n for n in joined.columns if n.startswith('cat')] 29 | cont_feature = [n for n in joined.columns if n.startswith('cont')] 30 | 31 | if __name__ == '__main__': 32 | 33 | for column in cat_feature: 34 | joined[column] = pd.factorize(joined[column].values, sort=True)[0] 35 | 36 | train = joined[joined['loss'].notnull()] 37 | test = joined[joined['loss'].isnull()] 38 | 39 | shift = 200 40 | y = np.log(train['loss'] + shift) 41 | ids = test['id'] 42 | X = train.drop(['loss', 'id'], 1) 43 | X_test = test.drop(['loss', 'id'], 1) 44 | 45 | n_folds = 5 46 | kf = KFold(X.shape[0], n_folds=n_folds) 47 | prediction = np.zeros(ids.shape) 48 | 49 | # final_fold_prediction = [] 50 | # final_fold_real = [] 51 | 52 | partial_evalutaion = open('temp_scores.txt', 'w') 53 | for i, (train_index, test_index) in enumerate(kf): 54 | print('\n Fold %d' % (i + 1)) 55 | X_train, X_val = X.iloc[train_index], X.iloc[test_index] 56 | y_train, y_val = y.iloc[train_index], y.iloc[test_index] 57 | 58 | RANDOM_STATE = 2016 59 | params = { 60 | 'min_child_weight': 1, 61 | 'eta': 0.001, 62 | 'colsample_bytree': 0.5, 63 | 'max_depth': 12, 64 | 'subsample': 0.8, 65 | 'alpha': 1, 66 | 'gamma': 1, 67 | 'silent': 1, 68 | 'verbose_eval': True, 69 | 'seed': RANDOM_STATE 70 | } 71 | 72 | xgtrain = xgb.DMatrix(X_train, label=y_train) 73 | xgtrain_2 = xgb.DMatrix(X_val, label=y_val) 74 | 75 | xgtest = xgb.DMatrix(X_test) 76 | 77 | watchlist = [(xgtrain, 'train'), (xgtrain_2, 'eval')] 78 | 79 | model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror, early_stopping_rounds=300) 80 | prediction += np.exp(model.predict(xgtest)) - shift 81 | 82 | # X_val = xgb.DMatrix(X_val) 83 | # temp_serises = pd.Series(np.exp(model.predict(X_val)) - shift) 84 | # final_fold_prediction.append(temp_serises) 85 | # temp_serises = np.exp(y_val) - shift 86 | # final_fold_real.append(temp_serises) 87 | # 88 | # temp_cv_score = mean_absolute_error(np.exp(model.predict(X_val)) - shift, np.exp(y_val) - shift) 89 | # 90 | # partial_evalutaion.write('fold ' + str(i) + ' ' + str(temp_cv_score) + '\n') 91 | # partial_evalutaion.flush() 92 | 93 | prediction = prediction / n_folds 94 | submission = pd.DataFrame() 95 | submission['id'] = ids 96 | submission['loss'] = prediction 97 | 98 | submission.to_csv('../submit/submit2.csv', index=False) 99 | 100 | # final_fold_prediction = pd.concat(final_fold_prediction, ignore_index=True) 101 | # final_fold_real = pd.concat(final_fold_real, ignore_index=True) 102 | # 103 | # cv_score = mean_absolute_error(final_fold_prediction, final_fold_real) 104 | # print cv_score 105 | -------------------------------------------------------------------------------- /Allstate Claims 
Severity/script3.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Vladimir Iglovikov' 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import xgboost as xgb 6 | 7 | from sklearn.metrics import mean_absolute_error 8 | 9 | train = pd.read_csv('../input/train.csv') 10 | test = pd.read_csv('../input/test.csv') 11 | test['loss'] = np.nan 12 | joined = pd.concat([train, test]) 13 | 14 | 15 | def evalerror(preds, dtrain): 16 | labels = dtrain.get_label() 17 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 18 | 19 | 20 | if __name__ == '__main__': 21 | for column in list(train.select_dtypes(include=['object']).columns): 22 | if train[column].nunique() != test[column].nunique(): 23 | 24 | set_train = set(train[column].unique()) 25 | set_test = set(test[column].unique()) 26 | remove_train = set_train - set_test 27 | remove_test = set_test - set_train 28 | 29 | remove = remove_train.union(remove_test) 30 | 31 | 32 | def filter_cat(x): 33 | if x in remove: 34 | return np.nan 35 | return x 36 | 37 | 38 | joined[column] = joined[column].apply(lambda x: filter_cat(x), 1) 39 | 40 | joined[column] = pd.factorize(joined[column].values, sort=True)[0] 41 | 42 | train = joined[joined['loss'].notnull()] 43 | test = joined[joined['loss'].isnull()] 44 | 45 | shift = 200 46 | y = np.log(train['loss'] + shift) 47 | ids = test['id'] 48 | X = train.drop(['loss', 'id'], 1) 49 | X_test = test.drop(['loss', 'id'], 1) 50 | 51 | RANDOM_STATE = 2016 52 | params = { 53 | 'min_child_weight': 1, 54 | 'eta': 0.01, 55 | 'colsample_bytree': 0.5, 56 | 'max_depth': 12, 57 | 'subsample': 0.8, 58 | 'alpha': 1, 59 | 'gamma': 1, 60 | 'silent': 1, 61 | 'verbose_eval': True, 62 | 'seed': RANDOM_STATE 63 | } 64 | 65 | xgtrain = xgb.DMatrix(X, label=y) 66 | xgtest = xgb.DMatrix(X_test) 67 | 68 | model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror) 69 | 70 | prediction = np.exp(model.predict(xgtest)) - shift 71 | 72 | submission = pd.DataFrame() 73 | submission['loss'] = prediction 74 | submission['id'] = ids 75 | submission.to_csv('../submit/sub_v.csv', index=False) 76 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | 5 | from datetime import datetime 6 | from sklearn.metrics import mean_absolute_error 7 | from sklearn.cross_validation import KFold 8 | from scipy.stats import skew, boxcox 9 | from sklearn import preprocessing 10 | from sklearn.preprocessing import StandardScaler 11 | import itertools 12 | 13 | shift = 200 14 | COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \ 15 | 'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \ 16 | 'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \ 17 | 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 18 | 19 | 20 | def encode(charcode): 21 | r = 0 22 | ln = len(str(charcode)) 23 | for i in range(ln): 24 | r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1) 25 | return r 26 | 27 | fair_constant = 0.7 28 | def fair_obj(preds, dtrain): 29 | labels = dtrain.get_label() 30 | x = (preds - labels) 31 | den = abs(x) + fair_constant 32 | grad = fair_constant * x / (den) 33 | hess = fair_constant * fair_constant / (den * den) 34 | return grad, hess 35 | 36 | def xg_eval_mae(yhat, dtrain): 37 | y = dtrain.get_label() 38 | return 'mae', 
mean_absolute_error(np.exp(y) - shift, np.exp(yhat) - shift) 39 | 40 | def mungeskewed(train, test, numeric_feats): 41 | ntrain = train.shape[0] 42 | test['loss'] = 0 43 | train_test = pd.concat((train, test)).reset_index(drop=True) 44 | skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 45 | skewed_feats = skewed_feats[skewed_feats > 0.25] 46 | skewed_feats = skewed_feats.index 47 | 48 | for feats in skewed_feats: 49 | train_test[feats] = train_test[feats] + 1 50 | train_test[feats], lam = boxcox(train_test[feats]) 51 | return train_test, ntrain 52 | 53 | 54 | if __name__ == "__main__": 55 | 56 | print('\nStarted') 57 | directory = '../input/' 58 | train = pd.read_csv(directory + 'train.csv') 59 | test = pd.read_csv(directory + 'test.csv') 60 | 61 | numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x] 62 | categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x] 63 | train_test, ntrain = mungeskewed(train, test, numeric_feats) 64 | 65 | # taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114) 66 | for column in list(train.select_dtypes(include=['object']).columns): 67 | if train[column].nunique() != test[column].nunique(): 68 | set_train = set(train[column].unique()) 69 | set_test = set(test[column].unique()) 70 | remove_train = set_train - set_test 71 | remove_test = set_test - set_train 72 | 73 | remove = remove_train.union(remove_test) 74 | 75 | def filter_cat(x): 76 | if x in remove: 77 | return np.nan 78 | return x 79 | 80 | train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1) 81 | 82 | # taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117) 83 | train_test["cont1"] = np.sqrt(preprocessing.minmax_scale(train_test["cont1"])) 84 | train_test["cont4"] = np.sqrt(preprocessing.minmax_scale(train_test["cont4"])) 85 | train_test["cont5"] = np.sqrt(preprocessing.minmax_scale(train_test["cont5"])) 86 | train_test["cont8"] = np.sqrt(preprocessing.minmax_scale(train_test["cont8"])) 87 | train_test["cont10"] = np.sqrt(preprocessing.minmax_scale(train_test["cont10"])) 88 | train_test["cont11"] = np.sqrt(preprocessing.minmax_scale(train_test["cont11"])) 89 | train_test["cont12"] = np.sqrt(preprocessing.minmax_scale(train_test["cont12"])) 90 | 91 | train_test["cont6"] = np.log(preprocessing.minmax_scale(train_test["cont6"]) + 0000.1) 92 | train_test["cont7"] = np.log(preprocessing.minmax_scale(train_test["cont7"]) + 0000.1) 93 | train_test["cont9"] = np.log(preprocessing.minmax_scale(train_test["cont9"]) + 0000.1) 94 | train_test["cont13"] = np.log(preprocessing.minmax_scale(train_test["cont13"]) + 0000.1) 95 | train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25 96 | 97 | print('') 98 | for comb in itertools.combinations(COMB_FEATURE, 2): 99 | feat = comb[0] + "_" + comb[1] 100 | train_test[feat] = train_test[comb[0]] + train_test[comb[1]] 101 | train_test[feat] = train_test[feat].apply(encode) 102 | print('Combining Columns:', feat) 103 | 104 | print('') 105 | for col in categorical_feats: 106 | print('Analyzing Column:', col) 107 | train_test[col] = train_test[col].apply(encode) 108 | 109 | print(train_test[categorical_feats]) 110 | 111 | ss = StandardScaler() 112 | train_test[numeric_feats] = \ 113 | ss.fit_transform(train_test[numeric_feats].values) 114 | 115 | train = train_test.iloc[:ntrain, :].copy() 116 | test = train_test.iloc[ntrain:, :].copy() 117 | 118 | print('\nMedian Loss:', 
train.loss.median()) 119 | print('Mean Loss:', train.loss.mean()) 120 | 121 | ids = pd.read_csv('../input/test.csv')['id'] 122 | train_y = np.log(train['loss'] + shift) 123 | train_x = train.drop(['loss', 'id'], axis=1) 124 | test_x = test.drop(['loss', 'id'], axis=1) 125 | 126 | n_folds = 10 127 | cv_sum = 0 128 | early_stopping = 100 129 | fpred = [] 130 | xgb_rounds = [] 131 | 132 | d_train_full = xgb.DMatrix(train_x, label=train_y) 133 | d_test = xgb.DMatrix(test_x) 134 | 135 | kf = KFold(train.shape[0], n_folds=n_folds) 136 | for i, (train_index, test_index) in enumerate(kf): 137 | print('\n Fold %d' % (i + 1)) 138 | X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index] 139 | y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index] 140 | 141 | rand_state = 2016 142 | 143 | params = { 144 | 'seed': 0, 145 | 'colsample_bytree': 0.7, 146 | 'silent': 1, 147 | 'subsample': 0.7, 148 | 'learning_rate': 0.03, 149 | 'objective': 'reg:linear', 150 | 'max_depth': 12, 151 | 'min_child_weight': 100, 152 | 'booster': 'gbtree'} 153 | 154 | d_train = xgb.DMatrix(X_train, label=y_train) 155 | d_valid = xgb.DMatrix(X_val, label=y_val) 156 | watchlist = [(d_train, 'train'), (d_valid, 'eval')] 157 | 158 | clf = xgb.train(params, 159 | d_train, 160 | 100000, 161 | watchlist, 162 | early_stopping_rounds=50, 163 | obj=fair_obj, 164 | feval=xg_eval_mae) 165 | 166 | xgb_rounds.append(clf.best_iteration) 167 | scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit) 168 | cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val)) 169 | print('eval-MAE: %.6f' % cv_score) 170 | y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift 171 | 172 | if i > 0: 173 | fpred = pred + y_pred 174 | else: 175 | fpred = y_pred 176 | pred = fpred 177 | cv_sum = cv_sum + cv_score 178 | 179 | mpred = pred / n_folds 180 | score = cv_sum / n_folds 181 | print('Average eval-MAE: %.6f' % score) 182 | n_rounds = int(np.mean(xgb_rounds)) 183 | 184 | print("Writing results") 185 | result = pd.DataFrame(mpred, columns=['loss']) 186 | result["id"] = ids 187 | result = result.set_index("id") 188 | print("%d-fold average prediction:" % n_folds) 189 | 190 | now = datetime.now() 191 | score = str(round((cv_sum / n_folds), 6)) 192 | 193 | result.to_csv('../submit/en_submit.csv', index=True, index_label='id') -------------------------------------------------------------------------------- /Allstate Claims Severity/script_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import subprocess 4 | from scipy.sparse import csr_matrix, hstack 5 | from sklearn.metrics import mean_absolute_error 6 | from sklearn.preprocessing import StandardScaler 7 | from sklearn.cross_validation import KFold 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Dropout, Activation 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.layers.advanced_activations import PReLU 12 | 13 | np.random.seed(123) 14 | 15 | 16 | def batch_generator(X, y, batch_size, shuffle): 17 | number_of_batches = np.ceil(X.shape[0] / batch_size) 18 | counter = 0 19 | sample_index = np.arange(X.shape[0]) 20 | if shuffle: 21 | np.random.shuffle(sample_index) 22 | while True: 23 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 24 | X_batch = X[batch_index, :].toarray() 25 | y_batch = y[batch_index] 26 | counter += 1 27 | yield X_batch, y_batch 28 | if (counter 
== number_of_batches): 29 | if shuffle: 30 | np.random.shuffle(sample_index) 31 | counter = 0 32 | 33 | 34 | def batch_generatorp(X, batch_size, shuffle): 35 | number_of_batches = X.shape[0] / np.ceil(X.shape[0] / batch_size) 36 | counter = 0 37 | sample_index = np.arange(X.shape[0]) 38 | while True: 39 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 40 | X_batch = X[batch_index, :].toarray() 41 | counter += 1 42 | yield X_batch 43 | if (counter == number_of_batches): 44 | counter = 0 45 | 46 | 47 | ## read data 48 | train = pd.read_csv('../input/train.csv') 49 | test = pd.read_csv('../input/test.csv') 50 | 51 | index = list(train.index) 52 | train = train.iloc[index] 53 | 'train = train.iloc[np.random.permutation(len(train))]' 54 | 55 | ## set test loss to NaN 56 | test['loss'] = np.nan 57 | 58 | ## response and IDs 59 | y = np.log(train['loss'].values + 200) 60 | id_train = train['id'].values 61 | id_test = test['id'].values 62 | 63 | ## stack train test 64 | ntrain = train.shape[0] 65 | tr_te = pd.concat((train, test), axis=0) 66 | 67 | ## Preprocessing and transforming to sparse data 68 | sparse_data = [] 69 | 70 | f_cat = [f for f in tr_te.columns if 'cat' in f] 71 | for f in f_cat: 72 | dummy = pd.get_dummies(tr_te[f].astype('category')) 73 | tmp = csr_matrix(dummy) 74 | sparse_data.append(tmp) 75 | 76 | f_num = [f for f in tr_te.columns if 'cont' in f] 77 | scaler = StandardScaler() 78 | tmp = csr_matrix(scaler.fit_transform(tr_te[f_num])) 79 | sparse_data.append(tmp) 80 | 81 | del (tr_te, train, test) 82 | 83 | ## sparse train and test data 84 | xtr_te = hstack(sparse_data, format='csr') 85 | xtrain = xtr_te[:ntrain, :] 86 | xtest = xtr_te[ntrain:, :] 87 | 88 | print('Dim train', xtrain.shape) 89 | print('Dim test', xtest.shape) 90 | 91 | del (xtr_te, sparse_data, tmp) 92 | 93 | 94 | ## neural net 95 | def nn_model(): 96 | model = Sequential() 97 | 98 | model.add(Dense(400, input_dim=xtrain.shape[1], init='he_normal')) 99 | model.add(PReLU()) 100 | model.add(BatchNormalization()) 101 | model.add(Dropout(0.4)) 102 | 103 | model.add(Dense(200, init='he_normal')) 104 | model.add(PReLU()) 105 | model.add(BatchNormalization()) 106 | model.add(Dropout(0.2)) 107 | 108 | model.add(Dense(50, init='he_normal')) 109 | model.add(PReLU()) 110 | model.add(BatchNormalization()) 111 | model.add(Dropout(0.2)) 112 | 113 | model.add(Dense(1, init='he_normal')) 114 | model.compile(loss='mae', optimizer='adadelta') 115 | return (model) 116 | 117 | 118 | ## cv-folds 119 | nfolds = 5 120 | folds = KFold(len(y), n_folds=nfolds, shuffle=True, random_state=111) 121 | 122 | ## train models 123 | i = 0 124 | nbags = 10 125 | nepochs = 55 126 | pred_oob = np.zeros(xtrain.shape[0]) 127 | pred_test = np.zeros(xtest.shape[0]) 128 | 129 | for (inTr, inTe) in folds: 130 | xtr = xtrain[inTr] 131 | ytr = y[inTr] 132 | xte = xtrain[inTe] 133 | yte = y[inTe] 134 | pred = np.zeros(xte.shape[0]) 135 | for j in range(nbags): 136 | model = nn_model() 137 | fit = model.fit_generator(generator=batch_generator(xtr, ytr, 128, True), 138 | nb_epoch=nepochs, 139 | samples_per_epoch=xtr.shape[0], 140 | validation_data=(xte.todense(), yte), 141 | verbose=0) 142 | temp = np.exp( 143 | model.predict_generator(generator=batch_generatorp(xte, 800, False), val_samples=xte.shape[0])[:, 0]) - 200 144 | pred += temp 145 | print( 146 | "Fold val bagging score after", j + 1, "rounds is: ", 147 | mean_absolute_error(np.exp(yte) - 200, pred / (j + 1))) 148 | pred_test += np.exp( 149 | 
model.predict_generator(generator=batch_generatorp(xtest, 800, False), val_samples=xtest.shape[0])[:, 150 | 0]) - 200 151 | pred /= nbags 152 | pred_oob[inTe] = pred 153 | score = mean_absolute_error(np.exp(yte) - 200, pred) 154 | i += 1 155 | print('Fold ', i, '- MAE:', score) 156 | 157 | print('Total - MAE:', mean_absolute_error(np.exp(y) - 200, pred_oob)) 158 | 159 | ## train predictions 160 | df = pd.DataFrame({'id': id_train, 'loss': pred_oob}) 161 | df.to_csv('preds_oob.csv', index=False) 162 | 163 | ## test predictions 164 | pred_test /= (nfolds * nbags) 165 | df = pd.DataFrame({'id': id_test, 'loss': pred_test}) 166 | df.to_csv('submission_keras_shift_perm.csv', index=False) 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kaggle-Solution 2 | This respository contains my code for competition in kaggle. 3 | 4 | - Santander Product Recommendation : [https://www.kaggle.com/c/santander-product-recommendation](https://www.kaggle.com/c/santander-product-recommendation) 5 | - Allstate Claims Severity : [https://www.kaggle.com/c/allstate-claims-severity](https://www.kaggle.com/c/allstate-claims-severity) 6 | 7 | ### Kaggle Top Solutions 8 | - Kaggle Past Solutions : [http://ndres.me/kaggle-past-solutions/](http://ndres.me/kaggle-past-solutions/) 9 | - Kaggle优胜者代码汇总: [http://suanfazu.com/t/kaggle/230](http://suanfazu.com/t/kaggle/230) -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/Rule_main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import defaultdict 4 | 5 | pd.options.mode.chained_assignment = None 6 | 7 | target_col = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1', 'ind_cder_fin_ult1', 8 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1','ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 9 | 'ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1', 'ind_dela_fin_ult1', 10 | 'ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1', 'ind_plan_fin_ult1', 11 | 'ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 12 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 13 | use_cols = ['ncodpers'] + target_col + ['sexo','renta'] 14 | 15 | def get_overbest(df_train): 16 | overbest_dict = {} 17 | for col_name in use_cols[1:25]: 18 | overbest_dict[col_name] = np.sum(df_train[col_name]) 19 | top_products = sorted(overbest_dict, key = overbest_dict.get,reverse = True) 20 | return top_products 21 | 22 | def get_eachbest(df_train): 23 | df_group = df_train[target_col].groupby(df_train['ncodpers']).sum() 24 | eachbest_dict = defaultdict(list) 25 | for ind,row in df_group.iterrows(): 26 | row = row[row != 0].sort_values(ascending=False) 27 | eachbest_dict[ind] = list(row.index) 28 | return eachbest_dict 29 | 30 | def get_lastinstance(last_instance_df): 31 | cust_dict = {} 32 | target_cols = np.array(use_cols[1:25]) 33 | for ind, row in last_instance_df.iterrows(): 34 | cust = row['ncodpers'] 35 | used_products = set(target_cols[np.array(row[1:25] == 1)]) 36 | cust_dict[cust] = used_products 37 | return cust_dict 38 | 39 | def get_similardict(df_train): 40 | df_train['renta'].fillna(0, inplace=True) 41 | df_group1 = df_train[['renta','ind_ahor_fin_ult1']].groupby(df_train['ncodpers']).mean() 42 | mapping = {} 43 | for ind, row in 
df_group1.iterrows(): 44 | if row['renta'] == 0: 45 | mapping[ind] = '0' 46 | elif row['renta'] < 45542.97: 47 | mapping[ind] = '1' 48 | elif row['renta'] < 57629.67: 49 | mapping[ind] = '2' 50 | elif row['renta'] < 68211.78: 51 | mapping[ind] = '3' 52 | elif row['renta'] < 78852.39: 53 | mapping[ind] = '4' 54 | elif row['renta'] < 90461.97: 55 | mapping[ind] = '5' 56 | elif row['renta'] < 103855.23: 57 | mapping[ind] = '6' 58 | elif row['renta'] < 120063.00: 59 | mapping[ind] = '7' 60 | elif row['renta'] < 141347.49: 61 | mapping[ind] = '8' 62 | elif row['renta'] < 173418.12: 63 | mapping[ind] = '9' 64 | elif row['renta'] < 234687.12: 65 | mapping[ind] = '10' 66 | else: 67 | mapping[ind] = '11' 68 | print mapping 69 | df_group2 = df_train[target_col].groupby(df_train['ncodpers']).sum() 70 | df_group3 = df_group2[target_col].groupby(mapping).sum() 71 | 72 | temp_dict = defaultdict(list) 73 | for ind, row in df_group3.iterrows(): 74 | row = row[row != 0].sort_values(ascending=False) 75 | temp_dict[ind] = list(row.index) 76 | 77 | similar_dict = defaultdict(list) 78 | for key in list(df_group1.index): 79 | similar_dict[key] = temp_dict[mapping[key]] 80 | return similar_dict 81 | 82 | def get_kmeansdict(df_train): 83 | df_group1 = df_train[target_col].groupby(df_train['ncodpers']).sum() 84 | df_group1.fillna(0, inplace=True) 85 | 86 | from sklearn.cluster import KMeans 87 | kmeans = KMeans(n_clusters=100) 88 | kmeans.fit(df_group1.values) 89 | 90 | mapping = {} 91 | for key,value in zip(list(df_group1.index),kmeans.labels_): 92 | mapping[key] = value 93 | df_group2 = df_group1[target_col].groupby(mapping).sum() 94 | print mapping 95 | 96 | 97 | temp_dict = defaultdict(list) 98 | for ind, row in df_group2.iterrows(): 99 | row = row[row != 0].sort_values(ascending=False) 100 | temp_dict[ind] = list(row.index) 101 | 102 | kmeans_dict = defaultdict(list) 103 | for key in list(df_group1.index): 104 | kmeans_dict[key] = temp_dict[mapping[key]] 105 | return kmeans_dict 106 | 107 | 108 | if __name__ == "__main__": 109 | print("0") 110 | df_test = pd.read_csv('../input/test_sub_1000.csv', usecols = ['ncodpers'] + target_col) 111 | cust_dict = get_lastinstance(df_test) 112 | del df_test 113 | print("1") 114 | df_train = pd.read_csv('../input/train_sub_1000.csv', usecols = use_cols) 115 | top_products = get_overbest(df_train) 116 | print("2") 117 | eachbest_dict = get_eachbest(df_train) 118 | print("3") 119 | # similar_dict = get_similardict(df_train) 120 | similar_dict = get_kmeansdict(df_train) 121 | print("4") 122 | del df_train 123 | 124 | sub_id = eachbest_dict.keys() 125 | final_preds = [] 126 | 127 | print("Running model") 128 | for ncodper, each_list in eachbest_dict.iteritems(): 129 | used_products = cust_dict.get(ncodper,[]) 130 | similar_product = similar_dict[ncodper] 131 | pred_products = [] 132 | for product in each_list: 133 | if product not in used_products: 134 | pred_products.append(product) 135 | if len(pred_products) == 7: 136 | break 137 | if len(pred_products) < 7: 138 | for product in similar_product: 139 | if (product not in used_products) and (product not in pred_products): 140 | pred_products.append(product) 141 | if len(pred_products) == 7: 142 | break 143 | if len(pred_products) < 7: 144 | for product in top_products: 145 | if (product not in used_products) and (product not in pred_products): 146 | pred_products.append(product) 147 | if len(pred_products) == 7: 148 | break 149 | 150 | final_preds.append(" ".join(pred_products)) 151 | out_df = 
pd.DataFrame({'ncodpers':sub_id,'added_products':final_preds}) 152 | 153 | print("Generate submission...") 154 | sub_92 = pd.read_csv('../input/sample_submission.csv', usecols=['ncodpers']).values[:, 0] 155 | submit = out_df [out_df ['ncodpers'].isin(sub_92)] 156 | submit.loc[:,'ncodpers'] = submit.loc[:,'ncodpers'].astype('int32') 157 | submit.to_csv('../input/submit1.csv', index=False) 158 | -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/code.py: -------------------------------------------------------------------------------- 1 | sub_rf = pd.read_csv('../input/sub_rf.csv', nrows=929615) 2 | sub_reg = pd.read_csv('../input/sub_reg.csv', nrows=929615) 3 | sub_union = pd.DataFrame(np.zeros((sub_rf.shape[0], 2)), columns=['ncodpers', 'added_products']) 4 | sub_union['ncodpers'] = sub_rf['ncodpers'] 5 | 6 | added = [] 7 | for x in range(sub_rf.shape[0]): 8 | rf_str = sub_rf.loc[x]['added_products'].split(' ') 9 | reg_str = sub_reg.loc[x]['added_products'].split(' ')[:7] 10 | 11 | str = [] 12 | for str1 in reg_str: 13 | if str1 in rf_str: 14 | str.append(str1) 15 | for str1 in rf_str: 16 | if str1 not in str: 17 | str.append(str1) 18 | added.append(" ".join(str)) 19 | 20 | sub_union.loc[:, 'added_products'] = added 21 | sub_union.to_csv('submit.csv', index=False) 22 | 23 | 24 | # ====================================================================================================================== 25 | def getAge(row): 26 | age = row['age'].strip() 27 | if age == 'NA' or age == '': 28 | age1 = 2 29 | elif float(age) < 20: 30 | age1 = 0 31 | elif float(age) < 30: 32 | age1 = 1 33 | elif float(age) < 40: 34 | age1 = 2 35 | elif float(age) < 50: 36 | age1 = 3 37 | elif float(age) < 60: 38 | age1 = 4 39 | else: 40 | age1 = 5 41 | return age1 42 | 43 | 44 | def getCustSeniority(row): 45 | cust_seniority = row['antiguedad'].strip() 46 | if cust_seniority == 'NA' or cust_seniority == '': 47 | seniority = 2 48 | elif float(cust_seniority) < 50: 49 | seniority = 0 50 | elif float(cust_seniority) < 100: 51 | seniority = 1 52 | elif float(cust_seniority) < 150: 53 | seniority = 2 54 | elif float(cust_seniority) < 200: 55 | seniority = 3 56 | else: 57 | seniority = 4 58 | return seniority 59 | 60 | 61 | def getRent(row): 62 | rent = row['renta'].strip() 63 | if rent == 'NA' or rent == '': 64 | rent1 = 4 65 | elif float(rent) < 45542.97: 66 | rent1 = 1 67 | elif float(rent) < 57629.67: 68 | rent1 = 2 69 | elif float(rent) < 68211.78: 70 | rent1 = 3 71 | elif float(rent) < 78852.39: 72 | rent1 = 4 73 | elif float(rent) < 90461.97: 74 | rent1 = 5 75 | elif float(rent) < 103855.23: 76 | rent1 = 6 77 | elif float(rent) < 120063.00: 78 | rent1 = 7 79 | elif float(rent) < 141347.49: 80 | rent1 = 8 81 | elif float(rent) < 173418.12: 82 | rent1 = 9 83 | elif float(rent) < 234687.12: 84 | rent1 = 10 85 | else: 86 | rent1 = 11 87 | return rent1 88 | 89 | 90 | # df_user['renta'] = df_user['renta'].fillna(df_user.loc[df_user['renta'].notnull(),'renta'].median()) 91 | # ====================================================================================================================== 92 | for con_attr in ['age', 'antiguedad', 'renta']: 93 | group_feats_1 = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: x.sum()) 94 | group_feats_0 = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: x.count() - x.sum()) 95 | group_feats_r = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: round(x.sum() / 
x.count(), 2)) 96 | group_feats_1.columns = [con_attr + '_1_' + str(i) for i in range(24)] 97 | group_feats_0.columns = [con_attr + '_0_' + str(i) for i in range(24)] 98 | group_feats_r.columns = [con_attr + '_r_' + str(i) for i in range(24)] 99 | lag_feats = pd.merge(lag_feats, group_feats_1, left_on=con_attr, right_index=True, how='left') 100 | lag_feats = pd.merge(lag_feats, group_feats_0, left_on=con_attr, right_index=True, how='left') 101 | lag_feats = pd.merge(lag_feats, group_feats_r, left_on=con_attr, right_index=True, how='left') 102 | 103 | 104 | ##====================================================================================================================== 105 | def get_last_buy(x): 106 | stop = 0 107 | for i in [0, 1, 2, 3, 4]: 108 | if x.values[i] == 1: 109 | stop = 5 - i 110 | break 111 | return stop 112 | 113 | 114 | def get_first_buy(x): 115 | start = 0 116 | for i in [4, 3, 2, 1, 0]: 117 | if x.values[i] == 1: 118 | start = 5 - i 119 | break 120 | return start 121 | 122 | 123 | def get_buy_len(x): 124 | x_value = x.values 125 | if x_value[-1] != 0: 126 | len1 = x_value[-1] - x_value[-2] + 1 127 | else: 128 | len1 = 0 129 | return len1 130 | 131 | 132 | def add_com_features(lag_feats): 133 | for i in range(24): 134 | index_list = [11 + i, 35 + i, 59 + i, 83 + i, 107 + i] 135 | lag_feats['prod_sum_' + str(i)] = lag_feats.iloc[:, index_list].sum(axis=1) 136 | lag_feats['first_buy_' + str(i)] = lag_feats.iloc[:, index_list].apply(lambda x: get_first_buy(x), axis=1) 137 | lag_feats['last_buy_' + str(i)] = lag_feats.iloc[:, index_list].apply(lambda x: get_last_buy(x), axis=1) 138 | lag_feats['leng_buy_' + str(i)] = lag_feats.loc[:, ['first_buy_' + str(i), 'last_buy_' + str(i)]].apply( 139 | lambda x: get_buy_len(x), axis=1) 140 | 141 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 142 | pro_rank_list = ['prod_rank_' + str(i) for i in range(24)] 143 | lag_feats[pro_rank_list] = lag_feats[pro_sum_list].apply(lambda x: x.rank(ascending=False).astype('int'), axis=1) 144 | 145 | import_col = [target_cols[i] for i in [0, 2, 4, 9, 10, 11, 15, 16, 19, 20, 21]] 146 | for i in range(1, 6): 147 | pre_import_col = [str(i) + '_' + col for col in import_col] 148 | lag_feats[str(i) + '_11_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 149 | for col in import_col: 150 | lag_feats['1_im_' + col] = lag_feats['1_' + col] 151 | 152 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21], [16, 19, 20, 21]] 153 | for x in range(4): 154 | import_col = [target_cols[i] for i in com_col[x]] 155 | for i in range(1, 6): 156 | pre_import_col = [str(i) + '_' + col for col in import_col] 157 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 158 | return lag_feats 159 | # ======================================================================================================================= 160 | columns = ['age', 'antiguedad', 'renta', 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext'] 161 | columns1 = [] 162 | target_cols1 = [target_cols[i] for i in [0, 2, 4, 5, 6, 7, 9, 10, 11, 15, 16, 17, 19, 20, 21]] 163 | for i in range(1, 6): 164 | columns1.extend([str(i) + '_' + col for col in target_cols1]) 165 | 166 | train_X = pd.read_csv(data_path + 'train_feats_users.csv', usecols=columns + ['label']) 167 | for col in ['sum', 'renta', 'canel', 'pais', 'com20', 'lagf1', 'sum8']: 168 | if col == 'lagf1': 169 | train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv', usecols=columns1) 170 | else: 171 | 
train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv') 172 | train_X = pd.concat([train_X, train_temp], axis=1) 173 | del train_temp 174 | 175 | test_X = pd.read_csv(data_path + 'test_feats_users.csv', usecols=columns) 176 | for col in ['sum', 'renta', 'canel', 'pais', 'com20', 'lagf1', 'sum8']: 177 | if col == 'lagf1': 178 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv', usecols=columns1) 179 | else: 180 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv') 181 | test_X = pd.concat([test_X, test_temp], axis=1) 182 | del test_temp 183 | -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/xgb_v1.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | import datetime 4 | import numpy as np 5 | import pandas as pd 6 | import xgboost as xgb 7 | from sklearn.metrics import log_loss 8 | 9 | 10 | mapping_dict = { 11 | 'sexo' : {-99:0, 'H':0, 'V':1}, 12 | 'ind_actividad_cliente' : {-99:0, '0.0':0, '0':0,'1.0':1, '1':1}, 13 | 'segmento' : {-99:0, '01 - TOP':0, '03 - UNIVERSITARIO':1, '02 - PARTICULARES':2}, 14 | 'ind_nuevo' : {-99:0, '1.0':0, '1':0, '0.0':1, '0':1 }, 15 | 'tiprel_1mes' : {-99:0, 'P':0, 'R':0, 'N':0, 'I':1, 'A':2}, 16 | 'indext' : {-99:0, 'S':0, 'N':1}, 17 | # 'canal_entrada' : {'KHE':6, 'KAT':5 ,'KFC':4, 'KFA':3, 'KHK':2, 'KHQ':1, -99: 0} 18 | } 19 | target_cols1 = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 20 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 21 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 22 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 23 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 24 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 25 | 26 | cat_cols = list(mapping_dict.keys()) 27 | target_cols = target_cols1[2:] 28 | target_cols.remove('ind_cder_fin_ult1') 29 | NUM_CLASS = 22 30 | 31 | def getIndex(row, col): 32 | val = row[col].strip() 33 | if val not in ['', 'NA']: 34 | ind = mapping_dict[col][val] 35 | else: 36 | ind = mapping_dict[col][-99] 37 | return ind 38 | 39 | 40 | def getAge(row): 41 | age = row['age'].strip() 42 | if age == 'NA' or age == '': 43 | age1 = 2 44 | elif float(age) < 20: 45 | age1 = 0 46 | elif float(age) < 30: 47 | age1 = 1 48 | elif float(age) < 40: 49 | age1 = 2 50 | elif float(age) < 50: 51 | age1 = 3 52 | elif float(age) < 60: 53 | age1 = 4 54 | else: 55 | age1 = 5 56 | return age1 57 | 58 | def getCustSeniority(row): 59 | cust_seniority = row['antiguedad'].strip() 60 | if cust_seniority == 'NA' or cust_seniority == '': 61 | seniority = 2 62 | elif float(cust_seniority) < 50: 63 | seniority = 0 64 | elif float(cust_seniority) < 100: 65 | seniority = 1 66 | elif float(cust_seniority) < 150: 67 | seniority = 2 68 | elif float(cust_seniority) < 200: 69 | seniority = 3 70 | else: 71 | seniority = 4 72 | return seniority 73 | 74 | def getRent(row): 75 | rent = row['renta'].strip() 76 | if rent == 'NA' or rent == '': 77 | rent1 = 4 78 | elif float(rent) < 45542.97: 79 | rent1 = 1 80 | elif float(rent) < 57629.67: 81 | rent1 = 2 82 | elif float(rent) < 68211.78: 83 | rent1 = 3 84 | elif float(rent) < 78852.39: 85 | rent1 = 4 86 | elif float(rent) < 90461.97: 87 | rent1 = 5 88 | elif float(rent) < 103855.23: 89 | rent1 = 6 90 | elif float(rent) < 
120063.00: 91 | rent1 = 7 92 | elif float(rent) < 141347.49: 93 | rent1 = 8 94 | elif float(rent) < 173418.12: 95 | rent1 = 9 96 | elif float(rent) < 234687.12: 97 | rent1 = 10 98 | else: 99 | rent1 = 11 100 | return rent1 101 | 102 | def getTarget(row): 103 | tlist = [] 104 | for col in target_cols: 105 | if row[col].strip() in ['', 'NA']: 106 | target = 0 107 | else: 108 | target = int(float(row[col])) 109 | tlist.append(target) 110 | print len(tlist) 111 | return tlist 112 | 113 | def feature_extract(row, prev_target_list): 114 | analy_index = [0,1,8,9,10,13,14,15,16,18,19,20] 115 | pro_feats = [prev_target_list[i] for i in analy_index] 116 | x_vars = [] 117 | for col in cat_cols: 118 | x_vars.append(getIndex(row, col)) 119 | x_vars.append(getAge(row)) 120 | x_vars.append(getCustSeniority(row)) 121 | x_vars.append(getRent(row)) 122 | x_vars.append(prev_target_list.count(1)) 123 | return x_vars + pro_feats 124 | 125 | def getLagFeature(): 126 | data_path = '../input/divide/train' 127 | use_cols = ['ncodpers'] + target_cols1 128 | train_05 = pd.read_csv(data_path + '2015-05-28.csv',usecols = use_cols) 129 | train_04 = pd.read_csv(data_path + '2015-04-28.csv', usecols=use_cols) 130 | train_03 = pd.read_csv(data_path + '2015-03-28.csv', usecols=use_cols) 131 | train_02 = pd.read_csv(data_path + '2015-02-28.csv', usecols=use_cols) 132 | train_01 = pd.read_csv(data_path + '2015-01-28.csv', usecols=use_cols) 133 | train_lag = pd.merge(train_05,train_04, on = 'ncodpers',how = 'left') 134 | train_lag = pd.merge(train_lag, train_03, on = 'ncodpers', how = 'left') 135 | train_lag = pd.merge(train_lag, train_02, on = 'ncodpers', how = 'left') 136 | train_lag = pd.merge(train_lag, train_01, on = 'ncodpers', how = 'left') 137 | train_lag.fillna(0 ,inplace = True) 138 | train_lag_dict = {} 139 | for ind, row in train_lag.iterrows(): 140 | id = int(row['ncodpers']) 141 | train_lag_dict[id] = list(row.values[1:]) 142 | 143 | train_05 = pd.read_csv(data_path + '2016-05-28.csv', usecols=use_cols) 144 | train_04 = pd.read_csv(data_path + '2016-04-28.csv', usecols=use_cols) 145 | train_03 = pd.read_csv(data_path + '2016-03-28.csv', usecols=use_cols) 146 | train_02 = pd.read_csv(data_path + '2016-02-28.csv', usecols=use_cols) 147 | train_01 = pd.read_csv(data_path + '2016-01-28.csv', usecols=use_cols) 148 | train_lag = pd.merge(train_05, train_04, on='ncodpers', how='left') 149 | train_lag = pd.merge(train_lag, train_03, on='ncodpers', how='left') 150 | train_lag = pd.merge(train_lag, train_02, on='ncodpers', how='left') 151 | train_lag = pd.merge(train_lag, train_01, on='ncodpers', how='left') 152 | train_lag.fillna(0, inplace=True) 153 | test_lag_dict = {} 154 | for ind, row in train_lag.iterrows(): 155 | id = int(row['ncodpers']) 156 | test_lag_dict[id] = list(row.values[1:]) 157 | return train_lag_dict,test_lag_dict 158 | 159 | def getTrainTestSet(): 160 | train_lag_dict, test_lag_dict = getLagFeature() 161 | x_vars_list = [] 162 | y_vars_list = [] 163 | data_path = '../input/divide/train' 164 | 165 | train_file = open(data_path + '2015-05-28.csv') 166 | cust_dict = {} 167 | for row in csv.DictReader(train_file): 168 | cust_id = int(row['ncodpers']) 169 | cust_dict[cust_id] = getTarget(row) 170 | train_file.close() 171 | 172 | train_file = open(data_path + '2015-06-28.csv') 173 | for row in csv.DictReader(train_file): 174 | cust_id = int(row['ncodpers']) 175 | prev_target_list = cust_dict.get(cust_id, [0] * NUM_CLASS) 176 | target_list = getTarget(row) 177 | new_products = [max(x1 - x2, 0) for (x1, x2) 
in zip(target_list, prev_target_list)] 178 | if sum(new_products) > 0: 179 | for ind, prod in enumerate(new_products): 180 | if prod > 0: 181 | x_vars = feature_extract(row, prev_target_list) 182 | x_vars.extend(train_lag_dict.get(cust_id, [0] * 120)) 183 | x_vars_list.append(x_vars) 184 | y_vars_list.append(ind) 185 | train_file.close() 186 | 187 | test_file = open(data_path + '2016-05-28.csv') 188 | cust_dict = {} 189 | for row in csv.DictReader(test_file): 190 | cust_id = int(row['ncodpers']) 191 | cust_dict[cust_id] = getTarget(row) 192 | test_file.close() 193 | 194 | x_test_list = [] 195 | test_file = open('../input/test_ver2.csv') 196 | for row in csv.DictReader(test_file): 197 | cust_id = int(row['ncodpers']) 198 | prev_target_list = cust_dict.get(cust_id, [0] * NUM_CLASS) 199 | x_vars = feature_extract(row, prev_target_list) 200 | x_vars.extend(test_lag_dict.get(cust_id, [0] * 120)) 201 | x_test_list.append(x_vars) 202 | test_file.close() 203 | 204 | train_X = np.array(x_vars_list) 205 | train_y = np.array(y_vars_list) 206 | test_X = np.array(x_test_list) 207 | 208 | print train_X.shape, train_y.shape, test_X.shape 209 | return train_X, train_y, test_X 210 | 211 | def runXGB(xgtrain, seed_val=123): 212 | param = { 213 | 'objective' : 'multi:softprob', 214 | 'eval_metric' : "mlogloss", 215 | 'num_class' : NUM_CLASS, 216 | 'silent' : 1, 217 | 'min_child_weight' : 2, 218 | 'eta': 0.05, 219 | 'max_depth': 6, 220 | 'subsample' : 0.9, 221 | 'colsample_bytree' : 0.8, 222 | 'seed' : seed_val 223 | } 224 | num_rounds = 100 225 | model = xgb.train(param, xgtrain, num_rounds) 226 | return model 227 | 228 | if __name__ == "__main__": 229 | 230 | print "feature extract..." 231 | start_time = datetime.datetime.now() 232 | train_X, train_y, test_X = getTrainTestSet() 233 | 234 | xgtrain = xgb.DMatrix(train_X, label = train_y) 235 | xgtest = xgb.DMatrix(test_X) 236 | xgval = xgb.DMatrix(train_X) 237 | y_true = train_y 238 | del train_X, train_y, test_X 239 | print(datetime.datetime.now() - start_time) 240 | 241 | print "running model..." 242 | model = runXGB(xgtrain, seed_val=123) 243 | y_pred = model.predict(xgval) 244 | print log_loss(y_true, y_pred) 245 | 246 | preds = model.predict(xgtest) 247 | del xgtrain, xgtest 248 | print(datetime.datetime.now() - start_time) 249 | 250 | print "Getting the top products.." 
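# [editor's aside] The lines below turn the multi:softprob output (one probability per
# product class, per customer) into the submission format: np.argsort orders each row's
# class indices by probability ascending, np.fliplr reverses them to descending, and the
# leading columns are mapped through target_cols to product names joined by spaces.
# Note that [:, :8] keeps eight candidates, one more than the seven positions scored by
# the competition's MAP@7 metric. A minimal, self-contained sketch of the same idea
# (illustrative values only, not part of the original script):
#     import numpy as np
#     probs = np.array([[0.10, 0.60, 0.30]])        # one customer, three products
#     order = np.fliplr(np.argsort(probs, axis=1))  # -> [[1, 2, 0]], best product first
#     top2 = order[:, :2]                           # -> [[1, 2]], indices of the two best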
251 | target_cols = np.array(target_cols) 252 | preds = np.argsort(preds, axis=1) 253 | preds = np.fliplr(preds)[:, :8] 254 | test_id = np.array(pd.read_csv("../input/test_ver2.csv", usecols=['ncodpers'])['ncodpers']) 255 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 256 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 257 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 258 | print(datetime.datetime.now() - start_time) -------------------------------------------------------------------------------- /Santander Product Recommendation/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # test_id = pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers']) 5 | # nn_preds = pd.read_csv('../input/ensemble/taozi.csv') 6 | # nn_preds = pd.merge(test_id, nn_preds, on = 'ncodpers', how='left') 7 | # del nn_preds['ncodpers'] 8 | # nn_preds.to_csv('../input/ensemble/nn_preds.csv',index = False) 9 | 10 | nn_preds = pd.read_csv('../input/ensemble/nn_preds.csv') 11 | xgb_preds = pd.read_csv('../input/ensemble/xgb_preds.csv') 12 | 13 | preds = (nn_preds + xgb_preds) / 2 14 | target_cols = preds.columns 15 | del nn_preds, xgb_preds 16 | 17 | # target_cols = np.array(target_cols) 18 | preds = np.argsort(preds, axis=1) 19 | preds = np.fliplr(preds)[:, :7] 20 | test_id = np.array(pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 21 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 22 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 23 | out_df.to_csv('../submit/sub_com.csv', index=False) 24 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_combine.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import itertools 4 | 5 | target_cols = ['ind_cco_fin_ult1', 'ind_cder_fin_ult1', 6 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 7 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 8 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 9 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 10 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 11 | 12 | 13 | def add_com_feats(lag_feats): 14 | com_feats = [target_cols[i] for i in [0, 2, 15, 16, 19, 20, 21]] 15 | for x, com_cols in enumerate(itertools.combinations(com_feats, 4)): 16 | for i in range(1, 6): 17 | com_col = [str(i) + '_' + col for col in com_cols] 18 | lag_feats[str(x) + '_com4_' + str(i)] = lag_feats[com_col].sum(axis=1) 19 | 20 | return lag_feats.iloc[:, -175:] 21 | 22 | 23 | if __name__ == "__main__": 24 | data_path = '../input/feats/' 25 | train_lag5 = pd.read_csv(data_path + 'train_feats_lag5.csv') 26 | train_add5 = add_com_feats(train_lag5) 27 | train_add5.to_csv(data_path + 'train_feats_come175.csv', index=False) 28 | 29 | test_lag5 = pd.read_csv(data_path + 'test_feats_lag5.csv') 30 | test_add5 = add_com_feats(test_lag5) 31 | test_add5.to_csv(data_path + 'test_feats_come175.csv', index=False) 32 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_extract_v1.py: 
-------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | import pandas as pd 4 | import xgboost as xgb 5 | import itertools 6 | 7 | pd.options.mode.chained_assignment = None 8 | 9 | mapping_dict = { 10 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 11 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 12 | 'segmento': {'nan': 0, '01 - TOP': 1, '03 - UNIVERSITARIO': 2, '02 - PARTICULARES': 3}, 13 | 'ind_nuevo': {'nan': 0, '1.0': 1, '1': 1, '0.0': 2, '0': 2}, 14 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 15 | 'indext': {'nan': 0, 'S': 0, 'N': 1}, 16 | 'indresi': {'nan': 0, 'S': 1, 'N': 2}, 17 | 'indfall': {'nan': 0, 'S': 1, 'N': 2}, 18 | 'indrel': {'nan': 1, '1': 0, '99': 1, '1.0': 0, '99.0': 1}, 19 | 'ind_empleado': {'nan': 0, 'N': 1, 'B': 2, 'F': 3, 'A': 4, 'S': 5}, 20 | 'pais_residencia': {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 21 | 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 22 | 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 23 | 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 24 | 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 25 | 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 26 | 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 27 | 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 28 | 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 29 | 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 30 | 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, 'nan': 1, 'LB': 81, 31 | 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 32 | 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 33 | 'QA': 58, 'MZ': 27}, 34 | 'canal_entrada': {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 35 | 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 36 | 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 37 | 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 38 | 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 39 | 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 40 | 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 41 | 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 42 | 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 43 | 'KCI': 65, 44 | 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 45 | 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 46 | 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 47 | 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 48 | 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 
'KEM': 155, 49 | 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 50 | 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 51 | 'KEU': 72, 'KES': 68, 'KEQ': 138, 'nan': 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 52 | 'KFS': 38, 53 | 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 54 | 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 55 | 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11} 56 | } 57 | 58 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 59 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 60 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 61 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 62 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 63 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 64 | 65 | target_cols = target_raw_cols[2:] 66 | 67 | con_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 68 | cat_cols = mapping_dict.keys() 69 | user_cols = con_cols + cat_cols + target_raw_cols 70 | NUM_CLASS = 22 71 | 72 | 73 | def getAge(str_age): 74 | age = str_age.strip() 75 | if age == 'NA' or age == 'nan': 76 | age1 = 2 77 | elif float(age) < 20: 78 | age1 = 0 79 | elif float(age) < 30: 80 | age1 = 1 81 | elif float(age) < 40: 82 | age1 = 2 83 | elif float(age) < 50: 84 | age1 = 3 85 | elif float(age) < 60: 86 | age1 = 4 87 | else: 88 | age1 = 5 89 | return age1 90 | 91 | 92 | def getCustSeniority(str_seniority): 93 | cust_seniority = str_seniority.strip() 94 | if cust_seniority == 'NA' or cust_seniority == 'nan': 95 | seniority = 4 96 | elif float(cust_seniority) < 50: 97 | seniority = 0 98 | elif float(cust_seniority) < 75: 99 | seniority = 1 100 | elif float(cust_seniority) < 100: 101 | seniority = 2 102 | elif float(cust_seniority) < 125: 103 | seniority = 3 104 | elif float(cust_seniority) < 150: 105 | seniority = 4 106 | elif float(cust_seniority) < 175: 107 | seniority = 5 108 | elif float(cust_seniority) < 200: 109 | seniority = 6 110 | elif float(cust_seniority) < 225: 111 | seniority = 7 112 | else: 113 | seniority = 8 114 | return seniority 115 | 116 | 117 | def getRent(str_rent): 118 | rent = str_rent.strip() 119 | if rent == 'NA' or rent == 'nan': 120 | rent1 = 4 121 | elif float(rent) < 45542.97: 122 | rent1 = 1 123 | elif float(rent) < 57629.67: 124 | rent1 = 2 125 | elif float(rent) < 68211.78: 126 | rent1 = 3 127 | elif float(rent) < 78852.39: 128 | rent1 = 4 129 | elif float(rent) < 90461.97: 130 | rent1 = 5 131 | elif float(rent) < 103855.23: 132 | rent1 = 6 133 | elif float(rent) < 120063.00: 134 | rent1 = 7 135 | elif float(rent) < 141347.49: 136 | rent1 = 8 137 | elif float(rent) < 173418.12: 138 | rent1 = 9 139 | elif float(rent) < 234687.12: 140 | rent1 = 10 141 | else: 142 | rent1 = 11 143 | return rent1 144 | 145 | 146 | def add_com_features(lag_feats): 147 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 148 | for x in range(4): 149 | import_col = [target_cols[i] for i in com_col[x]] 150 | for i in range(1, 6): 151 | pre_import_col = [str(i) + '_' + col for col in import_col] 152 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 153 | return 
lag_feats 154 | 155 | 156 | # def add_com_features(lag_feats): 157 | # lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 158 | # for i in range(24): 159 | # index_list = [17+i, 41+i, 65+i, 89+i, 113+i] 160 | # lag_feats['prod_sum_' + str(i)] = lag_feats.iloc[:,index_list].sum(axis = 1) 161 | # 162 | # pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 163 | # group_feats_r = lag_feats[pro_sum_list].groupby(lag_feats['renta' ]).agg(lambda x: round(x.sum() / x.count(), 2)) 164 | # group_feats_r.columns = ['renta_r_' + str(i) for i in range(24)] 165 | # lag_feats = pd.merge(lag_feats, group_feats_r, left_on='renta', right_index=True, how='left') 166 | # return lag_feats 167 | 168 | 169 | def process_train_data(in_file_name, date_list): 170 | this_month = in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 171 | for col in cat_cols: 172 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 173 | for col in target_raw_cols: 174 | this_month[col].fillna(0, inplace=True) 175 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 176 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 177 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 178 | 179 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 180 | del in_file_name 181 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 182 | pre_month_ncodpers = pre_month[['ncodpers']] 183 | pre_month_target = pre_month[target_raw_cols] 184 | pre_month_target = pre_month_target.add_prefix('1_') 185 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 186 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 187 | this_month.fillna(0, inplace=True) 188 | for col in target_cols: 189 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 190 | (this_month[col] - this_month['1_' + col]), 0) 191 | 192 | this_month_target = this_month[target_cols] 193 | this_month = this_month.drop(target_raw_cols, axis=1) 194 | 195 | x_vars_list = [] 196 | y_vars_list = [] 197 | 198 | for i in range(2, len(date_list)): 199 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 200 | tmp = tmp.add_prefix(str(i) + "_") 201 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 202 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 203 | this_month.fillna(0, inplace=True) 204 | del hist_data, tmp 205 | 206 | # this_month = add_com_features(this_month) 207 | # this_month.fillna(0, inplace=True) 208 | 209 | this_month = pd.concat([this_month, this_month_target], axis=1) 210 | for idx, row in this_month.iterrows(): 211 | for i in range(0, NUM_CLASS): 212 | if row[(-NUM_CLASS + i)] > 0: 213 | x_vars_list.append(row[:-NUM_CLASS]) 214 | y_vars_list.append(i) 215 | train_X = np.array(x_vars_list) 216 | return train_X[:, -120:], np.array(y_vars_list) 217 | # return train_X, np.array(y_vars_list) 218 | 219 | 220 | def process_test_data(test_file, hist_file, date_list): 221 | for col in cat_cols: 222 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 223 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 224 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 225 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 226 | 227 | for i in range(0, len(date_list)): 228 | tmp 
= hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 229 | tmp = tmp.add_prefix(str(i + 1) + "_") 230 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 231 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 232 | test_file.fillna(0, inplace=True) 233 | 234 | del hist_file, tmp 235 | # test_file = add_com_features(test_file) 236 | # test_file.fillna(0, inplace=True) 237 | 238 | return test_file.values[:, -120:], test_file.columns[-120:] 239 | # return test_file.values, test_file.columns 240 | 241 | 242 | if __name__ == "__main__": 243 | start_time = datetime.datetime.now() 244 | data_path = '../input/' 245 | print "feature extract..." 246 | 247 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 248 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 249 | usecols=user_cols) 250 | print datetime.datetime.now() - start_time 251 | 252 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 253 | '2015-03-28', '2015-02-28', '2015-01-28']) 254 | # train_X = train_X[:, 2:] 255 | print datetime.datetime.now() - start_time 256 | 257 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 258 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 259 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 260 | 261 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 262 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 263 | usecols=con_cols + cat_cols) 264 | 265 | test_X, feats = process_test_data(test_file, train_file, data_date) 266 | print datetime.datetime.now() - start_time 267 | 268 | del train_file, test_file 269 | # test_X = test_X[:, 2:] 270 | # feats = feats[2:] 271 | print train_X.shape, train_y.shape, test_X.shape 272 | 273 | df_train = pd.DataFrame(train_X, columns=feats) 274 | # df_train['label'] = train_y 275 | df_test = pd.DataFrame(test_X, columns=feats) 276 | 277 | df_train.to_csv(data_path + 'feats/train_feats_lag5.csv', index=False) 278 | df_test.to_csv(data_path + 'feats/test_feats_lag5.csv', index=False) 279 | print datetime.datetime.now() - start_time 280 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_extract_v2.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | import pandas as pd 4 | import xgboost as xgb 5 | import itertools 6 | 7 | pd.options.mode.chained_assignment = None 8 | 9 | mapping_dict = { 10 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 11 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 12 | 'segmento': {'nan': 0, '01 - TOP': 1, '03 - UNIVERSITARIO': 2, '02 - PARTICULARES': 3}, 13 | 'ind_nuevo': {'nan': 0, '1.0': 1, '1': 1, '0.0': 2, '0': 2}, 14 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 15 | 'indext': {'nan': 0, 'S': 0, 'N': 1}, 16 | # 'indresi' : {'nan':0, 'S':1, 'N':2}, 17 | # 'indfall' : {'nan':0, 'S':1, 'N':2}, 18 | # 'indrel' : {'nan':1, '1':0, '99':1, '1.0':0, '99.0':1}, 19 | # 'ind_empleado' : {'nan':0, 'N':1, 'B':2, 'F':3, 'A':4, 'S':5}, 20 | 'pais_residencia': {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 21 | 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 22 | 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 23 | 'HU': 106, 'HK': 
34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 24 | 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 25 | 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 26 | 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 27 | 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 28 | 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 29 | 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 30 | 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, 'nan': 1, 'LB': 81, 31 | 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 32 | 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 33 | 'QA': 58, 'MZ': 27}, 34 | 'canal_entrada': {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 35 | 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 36 | 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 37 | 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 38 | 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 39 | 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 40 | 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 41 | 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 42 | 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 43 | 'KCI': 65, 44 | 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 45 | 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 46 | 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 47 | 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 48 | 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 49 | 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 50 | 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 51 | 'KEU': 72, 'KES': 68, 'KEQ': 138, 'nan': 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 52 | 'KFS': 38, 53 | 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 54 | 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 55 | 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11} 56 | } 57 | 58 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 59 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 60 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 61 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 62 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 63 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 64 | 65 | target_cols = target_raw_cols[2:] 66 | 67 | con_cols = 
['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 68 | cat_cols = mapping_dict.keys() 69 | user_cols = con_cols + cat_cols + target_raw_cols 70 | NUM_CLASS = 22 71 | 72 | 73 | def getAge(str_age): 74 | age = str_age.strip() 75 | if age == 'NA' or age == 'nan': 76 | age1 = 2 77 | elif float(age) < 20: 78 | age1 = 0 79 | elif float(age) < 30: 80 | age1 = 1 81 | elif float(age) < 40: 82 | age1 = 2 83 | elif float(age) < 50: 84 | age1 = 3 85 | elif float(age) < 60: 86 | age1 = 4 87 | else: 88 | age1 = 5 89 | return age1 90 | 91 | 92 | def getCustSeniority(str_seniority): 93 | cust_seniority = str_seniority.strip() 94 | if cust_seniority == 'NA' or cust_seniority == 'nan': 95 | seniority = 4 96 | elif float(cust_seniority) < 50: 97 | seniority = 0 98 | elif float(cust_seniority) < 75: 99 | seniority = 1 100 | elif float(cust_seniority) < 100: 101 | seniority = 2 102 | elif float(cust_seniority) < 125: 103 | seniority = 3 104 | elif float(cust_seniority) < 150: 105 | seniority = 4 106 | elif float(cust_seniority) < 175: 107 | seniority = 5 108 | elif float(cust_seniority) < 200: 109 | seniority = 6 110 | elif float(cust_seniority) < 225: 111 | seniority = 7 112 | else: 113 | seniority = 8 114 | return seniority 115 | 116 | 117 | def getRent(str_rent): 118 | rent = str_rent.strip() 119 | if rent == 'NA' or rent == 'nan': 120 | rent1 = 4 121 | elif float(rent) < 45542.97: 122 | rent1 = 1 123 | elif float(rent) < 57629.67: 124 | rent1 = 2 125 | elif float(rent) < 68211.78: 126 | rent1 = 3 127 | elif float(rent) < 78852.39: 128 | rent1 = 4 129 | elif float(rent) < 90461.97: 130 | rent1 = 5 131 | elif float(rent) < 103855.23: 132 | rent1 = 6 133 | elif float(rent) < 120063.00: 134 | rent1 = 7 135 | elif float(rent) < 141347.49: 136 | rent1 = 8 137 | elif float(rent) < 173418.12: 138 | rent1 = 9 139 | elif float(rent) < 234687.12: 140 | rent1 = 10 141 | else: 142 | rent1 = 11 143 | return rent1 144 | 145 | 146 | def add_com_features(lag_feats): 147 | lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 148 | for i, pre in enumerate(['1_', '2_', '3_', '4_', '5_']): 149 | pre_cols = [pre + col for col in target_raw_cols] 150 | lag_feats['sum_24_' + str(i + 1)] = lag_feats.loc[:, pre_cols].sum(axis=1) 151 | sum_24_list = ['sum_24_' + str(i + 1) for i in range(5)] 152 | lag_feats['sum_24_max'] = lag_feats[sum_24_list].max(axis=1) 153 | lag_feats['sum_24_min'] = lag_feats[sum_24_list].min(axis=1) 154 | lag_feats['sum_24_mean'] = lag_feats[sum_24_list].mean(axis=1) 155 | 156 | for i, col in enumerate(target_raw_cols): 157 | index_list = [pre + col for pre in ['1_', '2_', '3_', '4_', '5_']] 158 | lag_feats['prod_sum_' + str(i)] = lag_feats.loc[:, index_list].sum(axis=1) 159 | 160 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 161 | for gp_col in ['renta', 'pais_residencia', 'canal_entrada']: 162 | group_feats = lag_feats[pro_sum_list].groupby(lag_feats[gp_col]).agg(lambda x: round(x.sum() / x.count(), 2)) 163 | group_feats.columns = [gp_col + str(i) for i in range(24)] 164 | lag_feats = pd.merge(lag_feats, group_feats, left_on=gp_col, right_index=True, how='left') 165 | 166 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 167 | for x in range(4): 168 | import_col = [target_cols[i] for i in com_col[x]] 169 | for i in range(1, 6): 170 | pre_import_col = [str(i) + '_' + col for col in import_col] 171 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 172 | return lag_feats 173 | 174 | 175 | def 
process_train_data(in_file_name, date_list): 176 | this_month = in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 177 | for col in cat_cols: 178 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 179 | for col in target_raw_cols: 180 | this_month[col].fillna(0, inplace=True) 181 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 182 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 183 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 184 | 185 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 186 | del in_file_name 187 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 188 | pre_month_ncodpers = pre_month[['ncodpers']] 189 | pre_month_target = pre_month[target_raw_cols] 190 | pre_month_target = pre_month_target.add_prefix('1_') 191 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 192 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 193 | this_month.fillna(0, inplace=True) 194 | for col in target_cols: 195 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 196 | (this_month[col] - this_month['1_' + col]), 0) 197 | 198 | this_month_target = this_month[target_cols] 199 | this_month = this_month.drop(target_raw_cols, axis=1) 200 | 201 | x_vars_list = [] 202 | y_vars_list = [] 203 | 204 | for i in range(2, len(date_list)): 205 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 206 | tmp = tmp.add_prefix(str(i) + "_") 207 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 208 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 209 | this_month.fillna(0, inplace=True) 210 | del hist_data, tmp 211 | 212 | this_month = add_com_features(this_month) 213 | this_month.fillna(0, inplace=True) 214 | 215 | this_month = pd.concat([this_month, this_month_target], axis=1) 216 | for idx, row in this_month.iterrows(): 217 | for i in range(0, NUM_CLASS): 218 | if row[(-NUM_CLASS + i)] > 0: 219 | x_vars_list.append(row[:-NUM_CLASS]) 220 | y_vars_list.append(i) 221 | train_X = np.array(x_vars_list) 222 | # return train_X[:,-120:], np.array(y_vars_list) 223 | return train_X, np.array(y_vars_list) 224 | 225 | 226 | def process_test_data(test_file, hist_file, date_list): 227 | for col in cat_cols: 228 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 229 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 230 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 231 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 232 | 233 | for i in range(0, len(date_list)): 234 | tmp = hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 235 | tmp = tmp.add_prefix(str(i + 1) + "_") 236 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 237 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 238 | test_file.fillna(0, inplace=True) 239 | 240 | del hist_file, tmp 241 | test_file = add_com_features(test_file) 242 | test_file.fillna(0, inplace=True) 243 | 244 | # return test_file.values[:,-120:], test_file.columns[-120:] 245 | return test_file.values, test_file.columns 246 | 247 | 248 | if __name__ == "__main__": 249 | start_time = datetime.datetime.now() 250 | data_path = '../input/' 251 | print "feature extract..." 
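    # Rough flow of this script, as read from the functions above:
    #   1. process_train_data() takes the 2015-06-28 snapshot, subtracts the
    #      2015-05-28 product flags to label newly added products, and attaches
    #      the 2015-01..2015-05 history as lag columns; a lag column is named
    #      '<k>_<product>', e.g. '1_ind_cco_fin_ult1' is the flag one month back
    #      and '5_ind_cco_fin_ult1' five months back.
    #   2. add_com_features() adds per-month product counts (sum_24_*), per-product
    #      five-month sums (prod_sum_*), group means over renta / pais_residencia /
    #      canal_entrada, and a few hand-picked product-combination sums.
    #   3. process_test_data() repeats the lag and aggregate construction for the
    #      2016-06 prediction month using the 2016-01..2016-05 history.
    # train_ver3.csv / test_ver3.csv are assumed to be reduced files written by an
    # earlier prepro.py run (the committed prepro.py writes train_ver4.csv; its
    # commented-out lines show the test_ver3 export and the 2015/2016 date list).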
252 | 253 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 254 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 255 | usecols=user_cols) 256 | print datetime.datetime.now() - start_time 257 | 258 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 259 | '2015-03-28', '2015-02-28', '2015-01-28']) 260 | train_X = train_X[:, 2:] 261 | print datetime.datetime.now() - start_time 262 | 263 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 264 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 265 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 266 | 267 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 268 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 269 | usecols=con_cols + cat_cols) 270 | 271 | test_X, feats = process_test_data(test_file, train_file, data_date) 272 | print datetime.datetime.now() - start_time 273 | 274 | del train_file, test_file 275 | test_X = test_X[:, 2:] 276 | feats = feats[2:] 277 | print train_X.shape, train_y.shape, test_X.shape 278 | 279 | df_train = pd.DataFrame(train_X, columns=feats) 280 | df_train['label'] = train_y 281 | df_test = pd.DataFrame(test_X, columns=feats) 282 | 283 | df_train.to_csv(data_path + 'feats/train_feats_v2.csv', index=False) 284 | df_test.to_csv(data_path + 'feats/test_feats_v1.csv', index=False) 285 | print datetime.datetime.now() - start_time 286 | -------------------------------------------------------------------------------- /Santander Product Recommendation/prepro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | dtype_dict = \ 5 | {'ncodpers': 'int32', 'age': 'str', 'antiguedad': 'str', 'renta': 'str', 6 | 'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 7 | 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 8 | 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 9 | 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 10 | 'ind_cno_fin_ult1': 'float16', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 11 | 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 12 | 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 13 | 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 14 | 15 | user_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta', 'canal_entrada', 'pais_residencia', 16 | 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext', 'indresi', 17 | 'indfall', 'indrel', 'ind_empleado'] 18 | 19 | pro_cols = \ 20 | ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 21 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 22 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 23 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 24 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 25 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 26 | 27 | # use_date = ['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28', '2015-05-28', '2015-06-28', 28 | # '2016-01-28', 
'2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28'] 29 | use_date = ['2015-07-28', '2015-08-28', '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28', '2016-01-28'] 30 | 31 | df_train = pd.read_csv("../input/train_ver2.csv", dtype=dtype_dict, usecols=user_cols + pro_cols) 32 | 33 | df_train = df_train[df_train['fecha_dato'].isin(use_date)] 34 | 35 | df_train.to_csv('../input/train_ver4.csv', index=False) 36 | # df_test = pd.read_csv("../input/test_ver2.csv", dtype={'ncodpers':'int32'},usecols= user_cols) 37 | # 38 | # df_test.to_csv('../input/test_ver3.csv', index = False) 39 | -------------------------------------------------------------------------------- /Santander Product Recommendation/xgb_fast.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | from sklearn.cross_validation import KFold 5 | 6 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 7 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 8 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 9 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 10 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 11 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 12 | 13 | target_cols = target_raw_cols[2:] 14 | NUM_CLASS = 22 15 | 16 | 17 | def runXGB_CV(train_X, train_y, test_X, index, seed_val): 18 | train_index, test_index = index 19 | X_train = train_X[train_index] 20 | y_train = train_y[train_index] 21 | xgtrain = xgb.DMatrix(X_train, label=y_train) 22 | xgtest = xgb.DMatrix(test_X) 23 | param = { 24 | 'objective': 'multi:softprob', 25 | 'eval_metric': "mlogloss", 26 | 'num_class': NUM_CLASS, 27 | 'silent': 1, 28 | 'min_child_weight': 2, 29 | 'eta': 0.06, 30 | 'max_depth': 6, 31 | 'subsample': 0.9, 32 | 'colsample_bytree': 0.8, 33 | 'seed': seed_val 34 | } 35 | num_rounds = 100 36 | model = xgb.train(param, xgtrain, num_rounds) 37 | pred = model.predict(xgtest) 38 | return pred 39 | 40 | 41 | def runXGB(train_X, train_y, test_X, seed_val=123): 42 | param = { 43 | 'objective': 'multi:softprob', 44 | 'eval_metric': "mlogloss", 45 | 'num_class': NUM_CLASS, 46 | 'silent': 1, 47 | 'min_child_weight': 2, 48 | 'eta': 0.06, 49 | 'max_depth': 8, 50 | 'subsample': 0.9, 51 | 'colsample_bytree': 0.8, 52 | 'seed': seed_val 53 | } 54 | num_rounds = 100 55 | xgtrain = xgb.DMatrix(train_X, label=train_y) 56 | xgtest = xgb.DMatrix(test_X) 57 | model = xgb.train(param, xgtrain, num_rounds) 58 | preds = model.predict(xgtest) 59 | return preds 60 | 61 | 62 | if __name__ == "__main__": 63 | cv_sel = 0 64 | print 'read files...' 
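    # This script skips the raw-data pass entirely and stitches together feature
    # CSVs that are assumed to have been written beforehand (by the
    # feature_extract_* / feature_combine scripts or similar; the exact producers
    # of train_feats_users.csv and the *_sum, *_renta, *_canel, *_lag5, *_com20,
    # *_sum8 blocks are not shown here). Because the blocks are combined with
    # pd.concat(..., axis=1), every file must hold the same rows in the same order
    # as train_feats_users.csv; the 'label' column of that file supplies train_y.
    # With cv_sel = 0 the CV branch is skipped and a single model trained on all
    # rows produces the submission.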
65 | data_path = '../input/feats/' 66 | 67 | columns = ['age', 'antiguedad', 'renta', 68 | 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext'] 69 | train_X = pd.read_csv(data_path + 'train_feats_users.csv', usecols=columns + ['label']) 70 | for col in ['sum', 'renta', 'canel', 'lag5', 'com20', 'sum8']: 71 | train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv') 72 | train_X = pd.concat([train_X, train_temp], axis=1) 73 | del train_temp 74 | 75 | test_X = pd.read_csv(data_path + 'test_feats_users.csv', usecols=columns) 76 | for col in ['sum', 'renta', 'canel', 'lag5', 'com20', 'sum8']: 77 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv') 78 | test_X = pd.concat([test_X, test_temp], axis=1) 79 | del test_temp 80 | 81 | # train_X = pd.read_csv(data_path + 'train_feats_v1.csv') 82 | # test_X = pd.read_csv(data_path + 'test_feats_v1.csv') 83 | 84 | train_y = train_X['label'].values 85 | train_X = train_X.drop('label', axis=1).values 86 | test_X = test_X.values 87 | print train_X.shape, train_y.shape, test_X.shape 88 | 89 | seed_val = 1234 90 | if cv_sel == 1: 91 | print "running model with cv..." 92 | nfolds = 10 93 | kf = KFold(train_X.shape[0], n_folds=nfolds, shuffle=True, random_state=seed_val) 94 | preds = [0] * NUM_CLASS 95 | for i, index in enumerate(kf): 96 | preds += runXGB_CV(train_X, train_y, test_X, index, seed_val) 97 | print 'fold %d' % (i + 1) 98 | preds = preds / nfolds 99 | 100 | else: 101 | print "running model..." 102 | preds = runXGB(train_X, train_y, test_X, seed_val=seed_val) 103 | 104 | print "Getting the top products..." 105 | target_cols = np.array(target_cols) 106 | preds = np.argsort(preds, axis=1) 107 | preds = np.fliplr(preds)[:, :7] 108 | test_id = np.array(pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 109 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 110 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 111 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 112 | -------------------------------------------------------------------------------- /Santander Product Recommendation/xgb_script.py: -------------------------------------------------------------------------------- 1 | ''' 2 | author:TaoZI 3 | date:2016/12/22 4 | ''' 5 | import datetime 6 | import numpy as np 7 | import pandas as pd 8 | import xgboost as xgb 9 | from sklearn.cross_validation import KFold 10 | 11 | pd.options.mode.chained_assignment = None 12 | 13 | mapping_dict = { 14 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 15 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 16 | 'segmento': {'nan': 0, '01 - TOP': 0, '03 - UNIVERSITARIO': 1, '02 - PARTICULARES': 2}, 17 | 'ind_nuevo': {'nan': 0, '1.0': 0, '1': 0, '0.0': 1, '0': 1}, 18 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 19 | 'indext': {'nan': 0, 'S': 0, 'N': 1} 20 | } 21 | 22 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 23 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 24 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 25 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 26 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 27 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 28 | 29 | target_cols = target_raw_cols[2:] 30 | 31 | 
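# End-to-end variant: feature extraction, optional K-fold averaging and the
# submission file are all produced by this one script. Two spots in the
# __main__ block are worth flagging before re-running it:
#   * 'feats = feats[2:]' references a name that is never assigned here, because
#     this copy of process_test_data() returns only test_file.values (the
#     feature_extract scripts also return the column names as a second value).
#   * the CV accumulator starts as a plain list ('preds = [0] * NUM_CLASS'), so
#     'preds += runXGB_CV(...)' extends the list with prediction rows instead of
#     summing fold probabilities; initialising it as a NumPy zeros array of shape
#     (n_test_rows, NUM_CLASS) is the usual way to average the folds.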
con_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 32 | cat_cols = mapping_dict.keys() 33 | user_cols = con_cols + cat_cols + target_raw_cols 34 | NUM_CLASS = 22 35 | 36 | 37 | def getAge(str_age): 38 | age = str_age.strip() 39 | if age == 'NA' or age == 'nan': 40 | age1 = 2 41 | elif float(age) < 20: 42 | age1 = 0 43 | elif float(age) < 30: 44 | age1 = 1 45 | elif float(age) < 40: 46 | age1 = 2 47 | elif float(age) < 50: 48 | age1 = 3 49 | elif float(age) < 60: 50 | age1 = 4 51 | else: 52 | age1 = 5 53 | return age1 54 | 55 | 56 | def getCustSeniority(str_seniority): 57 | cust_seniority = str_seniority.strip() 58 | if cust_seniority == 'NA' or cust_seniority == 'nan': 59 | seniority = 4 60 | elif float(cust_seniority) < 50: 61 | seniority = 0 62 | elif float(cust_seniority) < 75: 63 | seniority = 1 64 | elif float(cust_seniority) < 100: 65 | seniority = 2 66 | elif float(cust_seniority) < 125: 67 | seniority = 3 68 | elif float(cust_seniority) < 150: 69 | seniority = 4 70 | elif float(cust_seniority) < 175: 71 | seniority = 5 72 | elif float(cust_seniority) < 200: 73 | seniority = 6 74 | elif float(cust_seniority) < 225: 75 | seniority = 7 76 | else: 77 | seniority = 8 78 | return seniority 79 | 80 | 81 | def getRent(str_rent): 82 | rent = str_rent.strip() 83 | if rent == 'NA' or rent == 'nan': 84 | rent1 = 4 85 | elif float(rent) < 45542.97: 86 | rent1 = 1 87 | elif float(rent) < 57629.67: 88 | rent1 = 2 89 | elif float(rent) < 68211.78: 90 | rent1 = 3 91 | elif float(rent) < 78852.39: 92 | rent1 = 4 93 | elif float(rent) < 90461.97: 94 | rent1 = 5 95 | elif float(rent) < 103855.23: 96 | rent1 = 6 97 | elif float(rent) < 120063.00: 98 | rent1 = 7 99 | elif float(rent) < 141347.49: 100 | rent1 = 8 101 | elif float(rent) < 173418.12: 102 | rent1 = 9 103 | elif float(rent) < 234687.12: 104 | rent1 = 10 105 | else: 106 | rent1 = 11 107 | return rent1 108 | 109 | 110 | def add_com_features(lag_feats): 111 | lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 112 | 113 | for i, pre in enumerate(['1_', '2_', '3_', '4_', '5_']): 114 | pre_cols = [pre + col for col in target_raw_cols] 115 | lag_feats['sum_24_' + str(i + 1)] = lag_feats.loc[:, pre_cols].sum(axis=1) 116 | sum_24_list = ['sum_24_' + str(i + 1) for i in range(5)] 117 | lag_feats['sum_24_max'] = lag_feats[sum_24_list].max(axis=1) 118 | lag_feats['sum_24_min'] = lag_feats[sum_24_list].min(axis=1) 119 | lag_feats['sum_24_mean'] = lag_feats[sum_24_list].mean(axis=1) 120 | 121 | for i, col in enumerate(target_raw_cols): 122 | index_list = [pre + col for pre in ['1_', '2_', '3_', '4_', '5_']] 123 | lag_feats['prod_sum_' + str(i)] = lag_feats.loc[:, index_list].sum(axis=1) 124 | 125 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 126 | for gp_col in ['renta', 'sexo']: 127 | group_feats = lag_feats[pro_sum_list].groupby(lag_feats[gp_col]).agg(lambda x: round(x.sum() / x.count(), 2)) 128 | group_feats.columns = [gp_col + str(i) for i in range(24)] 129 | lag_feats = pd.merge(lag_feats, group_feats, left_on=gp_col, right_index=True, how='left') 130 | 131 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 132 | for x in range(4): 133 | import_col = [target_cols[i] for i in com_col[x]] 134 | for i in range(1, 6): 135 | pre_import_col = [str(i) + '_' + col for col in import_col] 136 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 137 | return lag_feats 138 | 139 | 140 | def process_train_data(in_file_name, date_list): 141 | this_month = 
in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 142 | for col in cat_cols: 143 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 144 | for col in target_raw_cols: 145 | this_month[col].fillna(0, inplace=True) 146 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 147 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 148 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 149 | 150 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 151 | del in_file_name 152 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 153 | pre_month_ncodpers = pre_month[['ncodpers']] 154 | pre_month_target = pre_month[target_raw_cols] 155 | pre_month_target = pre_month_target.add_prefix('1_') 156 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 157 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 158 | this_month.fillna(0, inplace=True) 159 | for col in target_cols: 160 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 161 | (this_month[col] - this_month['1_' + col]), 0) 162 | 163 | this_month_target = this_month[target_cols] 164 | this_month = this_month.drop(target_raw_cols, axis=1) 165 | 166 | x_vars_list = [] 167 | y_vars_list = [] 168 | 169 | for i in range(2, len(date_list)): 170 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 171 | tmp = tmp.add_prefix(str(i) + "_") 172 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 173 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 174 | this_month.fillna(0, inplace=True) 175 | del hist_data 176 | 177 | this_month = add_com_features(this_month) 178 | this_month.fillna(0, inplace=True) 179 | 180 | this_month = pd.concat([this_month, this_month_target], axis=1) 181 | for idx, row in this_month.iterrows(): 182 | for i in range(0, 22): 183 | if row[(-22 + i)] > 0: 184 | x_vars_list.append(row[:-22]) 185 | y_vars_list.append(i) 186 | 187 | return np.array(x_vars_list), np.array(y_vars_list) 188 | 189 | 190 | def process_test_data(test_file, hist_file, date_list): 191 | for col in cat_cols: 192 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 193 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 194 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 195 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 196 | 197 | for i in range(0, len(date_list)): 198 | tmp = hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 199 | tmp = tmp.add_prefix(str(i + 1) + "_") 200 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 201 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 202 | test_file.fillna(0, inplace=True) 203 | 204 | del hist_file 205 | 206 | test_file = add_com_features(test_file) 207 | test_file.fillna(0, inplace=True) 208 | return test_file.values 209 | 210 | 211 | def runXGB_CV(train_X, train_y, test_X, index, seed_val): 212 | train_index, test_index = index 213 | X_train = train_X[train_index] 214 | y_train = train_y[train_index] 215 | 216 | xgtrain = xgb.DMatrix(X_train, label=y_train) 217 | xgtest = xgb.DMatrix(test_X) 218 | 219 | param = { 220 | 'objective': 'multi:softprob', 221 | 'eval_metric': "mlogloss", 222 | 'num_class': NUM_CLASS, 223 | 
'silent': 1, 224 | 'min_child_weight': 2, 225 | 'eta': 0.05, 226 | 'max_depth': 6, 227 | 'subsample': 0.9, 228 | 'colsample_bytree': 0.8, 229 | 'seed': seed_val 230 | } 231 | num_rounds = 100 232 | model = xgb.train(param, xgtrain, num_rounds) 233 | pred = model.predict(xgtest) 234 | return pred 235 | 236 | 237 | def runXGB(train_X, train_y, test_X, seed_val=123): 238 | param = { 239 | 'objective': 'multi:softprob', 240 | 'eval_metric': "mlogloss", 241 | 'num_class': NUM_CLASS, 242 | 'silent': 1, 243 | 'min_child_weight': 2, 244 | 'eta': 0.05, 245 | 'max_depth': 6, 246 | 'subsample': 0.9, 247 | 'colsample_bytree': 0.8, 248 | 'seed': seed_val 249 | } 250 | num_rounds = 100 251 | xgtrain = xgb.DMatrix(train_X, label=train_y) 252 | xgtest = xgb.DMatrix(test_X) 253 | 254 | model = xgb.train(param, xgtrain, num_rounds) 255 | preds = model.predict(xgtest) 256 | return preds 257 | 258 | 259 | if __name__ == "__main__": 260 | 261 | cv_sel = 1 262 | start_time = datetime.datetime.now() 263 | data_path = '../input/' 264 | 265 | print "feature extract..." 266 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 267 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 268 | usecols=user_cols) 269 | print datetime.datetime.now() - start_time 270 | 271 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 272 | '2015-03-28', '2015-02-28', '2015-01-28']) 273 | train_X = train_X[:, 2:] 274 | print datetime.datetime.now() - start_time 275 | 276 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 277 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 278 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 279 | 280 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 281 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 282 | usecols=con_cols + cat_cols) 283 | 284 | test_X = process_test_data(test_file, train_file, data_date) 285 | print datetime.datetime.now() - start_time 286 | 287 | del train_file, test_file 288 | test_X = test_X[:, 2:] 289 | feats = feats[2:] 290 | print train_X.shape, train_y.shape, test_X.shape 291 | print datetime.datetime.now() - start_time 292 | 293 | seed_val = 123 294 | if cv_sel == 1: 295 | print "running model with cv..." 296 | nfolds = 5 297 | kf = KFold(train_X.shape[0], n_folds=nfolds, shuffle=True, random_state=seed_val) 298 | preds = [0] * NUM_CLASS 299 | for i, index in enumerate(kf): 300 | preds += runXGB_CV(train_X, train_y, test_X, index, seed_val) 301 | print 'fold %d' % (i + 1) 302 | preds = preds / nfolds 303 | 304 | else: 305 | print "running model with feature..." 306 | preds = runXGB(train_X, train_y, test_X, seed_val) 307 | 308 | del train_X, test_X, train_y 309 | 310 | print "Getting the top products.." 311 | target_cols = np.array(target_cols) 312 | preds = np.argsort(preds, axis=1) 313 | preds = np.fliplr(preds)[:, :7] 314 | test_id = np.array(pd.read_csv(data_path + 'test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 315 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 316 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 317 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 318 | --------------------------------------------------------------------------------