├── Allstate Claims Severity ├── MyScript.py ├── script0.py ├── script1.py ├── script2.py ├── script3.py ├── script4.py └── script_keras.py ├── README.md └── Santander Product Recommendation ├── Others ├── Rule_main.py ├── code.py └── xgb_v1.py ├── ensemble.py ├── feature_combine.py ├── feature_extract_v1.py ├── feature_extract_v2.py ├── prepro.py ├── xgb_fast.py └── xgb_script.py /Allstate Claims Severity/MyScript.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | import datetime 5 | import itertools 6 | from scipy.stats import boxcox 7 | from sklearn.preprocessing import StandardScaler 8 | from sklearn.cross_validation import KFold 9 | from sklearn.metrics import mean_absolute_error 10 | from sklearn import preprocessing 11 | 12 | pd.options.mode.chained_assignment = None 13 | 14 | multi_corr = [79, 80, 81, 87, 89, 90, 101, 103, 111] 15 | two_corr = [2, 3, 9, 10, 11, 12, 13, 23, 36, 57, 72] 16 | multi_cat_diff = [90, 92, 96, 99, 101, 102, 103, 106, 109, 110, 113, 114, 116] 17 | skewed_num = [1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 18 | cat2corr = [(29, 30), (40, 41), (43, 45), (55, 56), (8, 65), (8, 66), (104, 106)] 19 | two_avg1 = [1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 16, 23, 24, 25, 26, 27, 28, 36, 38, 40, 44, 50, 53, 57, 72, 73, 20 | 76, 79, 80, 81, 82, 87, 89, 90, 103, 111] 21 | 22 | 23 | def logregobj(preds, dtrain): 24 | labels = dtrain.get_label() 25 | con = 2 26 | x = preds - labels 27 | grad = con * x / (np.abs(x) + con) 28 | hess = con ** 2 / (np.abs(x) + con) ** 2 29 | return grad, hess 30 | 31 | 32 | def evalerror(preds, dtrain): 33 | labels = dtrain.get_label() 34 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 35 | 36 | 37 | def encode(charcode): 38 | r = 0 39 | ln = len(str(charcode)) 40 | for i in range(ln): 41 | r += (ord(str(charcode)[i]) - ord('A')) 42 | return r + 1 43 | 44 | 45 | def prepro(train, test, cont_feature): 46 | joined = pd.concat((train, test)).reset_index(drop=True) 47 | skewed_feats = ['cont' + str(i) for i in skewed_num] 48 | for feats in skewed_feats: 49 | joined[feats] = joined[feats] + 1 50 | joined[feats], lam = boxcox(joined[feats]) 51 | 52 | multi_diff_feats = ['cat' + str(i) for i in multi_cat_diff] 53 | for column in multi_diff_feats: 54 | set_train = set(train[column].unique()) 55 | set_test = set(test[column].unique()) 56 | remove_train = set_train - set_test 57 | remove_test = set_test - set_train 58 | remove = remove_train.union(remove_test) 59 | 60 | def filter_cat(x): 61 | if x in remove: 62 | return np.nan 63 | return x 64 | 65 | joined[column] = joined[column].apply(lambda x: filter_cat(x), 1) 66 | 67 | ss = StandardScaler() 68 | joined[cont_feature] = ss.fit_transform(joined[cont_feature].values) 69 | del train, test 70 | return joined 71 | 72 | 73 | def feature_extract(joined, cont_feature): 74 | features = pd.DataFrame() 75 | features['id'] = joined['id'] 76 | features['loss'] = np.log(joined['loss'] + 200) 77 | 78 | cat_sel = [n for n in joined.columns if n.startswith('cat')] 79 | for column in cat_sel: 80 | features[column] = pd.factorize(joined[column].values, sort=True)[0] + 1 81 | 82 | for column in cont_feature: 83 | features[column] = joined[column] 84 | 85 | features['cont_avg'] = joined[cont_feature].mean(axis=1) 86 | features['cont_min'] = joined[cont_feature].min(axis=1) 87 | features['cont_max'] = joined[cont_feature].max(axis=1) 88 | 89 | for i in [20, 40, 73]: 90 | cat_feats = ['cat' + 
str(i) for i in range(1, i)] 91 | idx = 'cat_' + 'sum_' + str(i) 92 | features[idx + '_A'] = joined[cat_feats].apply(lambda x: sum(x == 'A'), axis=1) 93 | features[idx + '_B'] = joined[cat_feats].apply(lambda x: sum(x == 'B'), axis=1) 94 | 95 | cat2_feats = [('cat' + str(i), 'cat' + str(j)) for (i, j) in cat2corr] 96 | for feat1, feat2 in cat2_feats: 97 | feat_comb = feat1 + '_' + feat2 98 | features[feat_comb] = joined[feat1] + joined[feat2] 99 | features[feat_comb] = features[feat_comb].apply(encode) 100 | 101 | cat2avg_feats = ['cat' + str(i) for i in two_avg1] 102 | for feat1, feat2 in itertools.combinations(cat2avg_feats, 2): 103 | feat_comb = feat1 + '_' + feat2 104 | features[feat_comb] = joined[feat1] + joined[feat2] 105 | features[feat_comb] = features[feat_comb].apply(encode) 106 | 107 | train = features[features['loss'].notnull()] 108 | test = features[features['loss'].isnull()] 109 | del features, joined 110 | return train, test 111 | 112 | 113 | def ceate_feature_map(features): 114 | outfile = open('xgb.fmap', 'w') 115 | i = 0 116 | for feat in features: 117 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 118 | i = i + 1 119 | outfile.close() 120 | 121 | 122 | def feature_select(train, test): 123 | import operator 124 | params = { 125 | 'min_child_weight': 100, 126 | 'eta': 0.02, 127 | 'colsample_bytree': 0.7, 128 | 'max_depth': 12, 129 | 'subsample': 0.7, 130 | 'alpha': 1, 131 | 'gamma': 1, 132 | 'silent': 1, 133 | 'objective': 'reg:linear', 134 | 'verbose_eval': True, 135 | 'seed': 12 136 | } 137 | rounds = 300 138 | y = train['loss'] 139 | X = train.drop(['loss', 'id'], 1) 140 | 141 | xgtrain = xgb.DMatrix(X, label=y) 142 | bst = xgb.train(params, xgtrain, num_boost_round=rounds) 143 | 144 | feats = [x for x in train.columns if x not in ['id', 'loss']] 145 | print len(feats) 146 | outfile = open('xgb.fmap', 'w') 147 | i = 0 148 | for feat in feats: 149 | outfile.write('{0}\t{1}\tq\n'.format(i, feat)) 150 | i = i + 1 151 | outfile.close() 152 | 153 | importance = bst.get_fscore(fmap='xgb.fmap') 154 | importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True) 155 | feats = [a for (a, b) in importance] 156 | feats = feats[:450] 157 | print len(feats) 158 | df = pd.DataFrame(importance, columns=['feature', 'fscore']) 159 | df['fscore'] = df['fscore'] / df['fscore'].sum() 160 | df.to_csv("../input/feat_sel/feat_importance.csv", index=False) 161 | 162 | train1 = train[['id', 'loss'] + feats] 163 | test1 = test[['id'] + feats] 164 | return train1, test1 165 | 166 | 167 | def runXGB(train, test, index, RANDOM_STATE): 168 | train_index, test_index = index 169 | y = train['loss'] 170 | X = train.drop(['loss', 'id'], 1) 171 | X_test = test.drop(['id'], 1) 172 | del train, test 173 | X_train, X_val = X.iloc[train_index], X.iloc[test_index] 174 | y_train, y_val = y.iloc[train_index], y.iloc[test_index] 175 | 176 | xgtrain = xgb.DMatrix(X_train, label=y_train) 177 | xgval = xgb.DMatrix(X_val, label=y_val) 178 | xgtest = xgb.DMatrix(X_test) 179 | X_val = xgb.DMatrix(X_val) 180 | 181 | params = { 182 | 'min_child_weight': 10, 183 | 'eta': 0.01, 184 | 'colsample_bytree': 0.7, 185 | 'max_depth': 12, 186 | 'subsample': 0.7, 187 | 'alpha': 1, 188 | 'gamma': 1, 189 | 'silent': 1, 190 | 'verbose_eval': True, 191 | 'seed': RANDOM_STATE 192 | } 193 | rounds = 3000 194 | 195 | watchlist = [(xgtrain, 'train'), (xgval, 'eval')] 196 | model = xgb.train(params, xgtrain, rounds, watchlist, obj=logregobj, feval=evalerror, early_stopping_rounds=100) 197 | 198 | cv_score = 
mean_absolute_error(np.exp(model.predict(X_val)) - 200, np.exp(y_val) - 200) 199 | predict = np.exp(model.predict(xgtest)) - 200 200 | print "iteration = %d" % (model.best_iteration) 201 | return predict, cv_score 202 | 203 | 204 | if __name__ == '__main__': 205 | 206 | Generate_or_read = 0 # 0 generate 207 | feat_sel = 1 # 1 select 208 | start_time = datetime.datetime.now() 209 | if Generate_or_read == 0: 210 | print "generate features..." 211 | train = pd.read_csv('../input/train.csv') 212 | test = pd.read_csv('../input/test.csv') 213 | test['loss'] = np.nan 214 | cont_feature = [n for n in train.columns if n.startswith('cont')] 215 | joined = prepro(train, test, cont_feature) 216 | train, test = feature_extract(joined, cont_feature) 217 | print train.shape, test.shape 218 | print datetime.datetime.now() - start_time 219 | if feat_sel == 1: 220 | print "feature select..." 221 | train, test = feature_select(train, test) 222 | train.to_csv("../input/feature/train.csv", index=False) 223 | test.to_csv("../input/feature/test.csv", index=False) 224 | print train.shape, test.shape 225 | print datetime.datetime.now() - start_time 226 | 227 | else: 228 | print "read features..." 229 | train = pd.read_csv("../input/feature/train.csv") 230 | test = pd.read_csv("../input/feature/test.csv") 231 | print train.shape, test.shape 232 | 233 | print "run model..." 234 | nfolds = 10 235 | RANDOM_STATE = 113 236 | ids = test['id'] 237 | predicts = np.zeros(ids.shape) 238 | kf = KFold(train.shape[0], n_folds=nfolds, shuffle=True, random_state=RANDOM_STATE) 239 | for i, index in enumerate(kf): 240 | print('Fold %d' % (i + 1)) 241 | predict, cv_score = runXGB(train, test, index, RANDOM_STATE) 242 | print cv_score 243 | predicts += predict 244 | 245 | print datetime.datetime.now() - start_time 246 | predicts = predicts / nfolds 247 | submission = pd.DataFrame() 248 | submission['id'] = ids 249 | submission['loss'] = predicts 250 | submission.to_csv('../submit/submit_xgb.csv', index=False) 251 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script0.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.metrics import mean_absolute_error 6 | 7 | SHIFT = 200 8 | 9 | 10 | def df_cleaner(df_train, df_test): 11 | cont_list = ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 12 | 'cont11', 'cont12', 'cont13', 'cont14'] 13 | 14 | ntrain = df_train.shape[0] 15 | df = pd.concat([df_train, df_test]).reset_index(drop=True) 16 | 17 | df_cat = pd.get_dummies(df.filter(regex="^cat")) 18 | scale = StandardScaler() 19 | df[cont_list] = scale.fit_transform(df[cont_list].values) 20 | 21 | df = pd.concat([df[['id', 'loss'] + cont_list], df_cat], axis=1) 22 | df_out_train = df.iloc[:ntrain, :] 23 | df_out_test = df.iloc[ntrain:, :] 24 | 25 | df_out_columns = df_out_train.loc[:, (df_out_train != 0).any(axis=0)].columns 26 | data_columns = list(df_out_columns) 27 | data_columns.remove('id') 28 | data_columns.remove('loss') 29 | return df_out_train, df_out_test, data_columns 30 | 31 | 32 | def evalerror(preds, dtrain): 33 | labels = dtrain.get_label() 34 | return 'mae', mean_absolute_error(np.exp(preds) - SHIFT, np.exp(labels) - SHIFT) 35 | 36 | 37 | if __name__ == '__main__': 38 | df_train = pd.read_csv('../input/train.csv') 39 | df_test = 
pd.read_csv('../input/test.csv') 40 | df_cleaner(df_train, df_test) 41 | train, test, features = df_cleaner(df_train, df_test) 42 | del df_train 43 | del df_test 44 | 45 | x_test = test[:][features] 46 | train['loss_logshift'] = np.log(train['loss'] + SHIFT) 47 | 48 | number_of_bagging_iterations = 10 49 | max_number_of_rounds = 1500 50 | early_stopping_rounds = 20 51 | 52 | work_dataframe = test[['id']] 53 | 54 | for i in xrange(number_of_bagging_iterations): 55 | train_slice = train[train.id % number_of_bagging_iterations != i] 56 | val_slice = train[train.id % number_of_bagging_iterations == i] 57 | 58 | x_train = train_slice[features] 59 | y_train = train_slice['loss_logshift'] 60 | 61 | x_val = val_slice[features] 62 | y_val = val_slice['loss_logshift'] 63 | 64 | model = xgb.XGBRegressor(max_depth=12, colsample_bytree=0.5, min_child_weight=1, subsample=0.8, gamma=1, 65 | n_estimators=max_number_of_rounds, learning_rate=0.1) 66 | 67 | model.fit(x_train, y_train, early_stopping_rounds=early_stopping_rounds, 68 | eval_set=[(x_train, y_train), (x_val, y_val)], eval_metric=evalerror) 69 | 70 | this_iteration_predictions = model.predict(x_test).astype(float) 71 | 72 | temp_series = pd.Series(np.exp(this_iteration_predictions) - SHIFT) 73 | work_dataframe['round' + str(i)] = temp_series.values 74 | 75 | work_dataframe['mean_values'] = work_dataframe.filter(regex="^round").mean(axis=1) 76 | work_dataframe[['id', 'mean_values']].to_csv('../input/submit_claim.csv', index=False, 77 | float_format='%.2f', header=['id', 'loss']) 78 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script1.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | 5 | from sklearn.metrics import mean_absolute_error 6 | from sklearn.cross_validation import KFold 7 | from scipy.stats import skew, boxcox 8 | from sklearn.preprocessing import StandardScaler 9 | import itertools 10 | 11 | shift = 200 12 | fair_constant = 2 13 | # COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,'\ 14 | # 'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \ 15 | # 'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \ 16 | # 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 17 | COMB_FEATURE = 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 18 | 19 | 20 | def encode(charcode): 21 | r = 0 22 | ln = len(str(charcode)) 23 | for i in range(ln): 24 | r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1) 25 | return r 26 | 27 | 28 | def fair_obj(preds, dtrain): 29 | labels = dtrain.get_label() 30 | x = (preds - labels) 31 | den = abs(x) + fair_constant 32 | grad = fair_constant * x / (den) 33 | hess = fair_constant * fair_constant / (den * den) 34 | return grad, hess 35 | 36 | 37 | def xg_eval_mae(yhat, dtrain): 38 | y = dtrain.get_label() 39 | return 'mae', mean_absolute_error(np.exp(y) - shift, np.exp(yhat) - shift) 40 | 41 | 42 | def mungeskewed(train, test, numeric_feats): 43 | ntrain = train.shape[0] 44 | test['loss'] = 0 45 | train_test = pd.concat((train, test)).reset_index(drop=True) 46 | skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 47 | skewed_feats = skewed_feats[skewed_feats > 0.25] 48 | skewed_feats = skewed_feats.index 49 | 50 | print skewed_feats 51 | for feats in skewed_feats: 52 | train_test[feats] = train_test[feats] + 1 53 | train_test[feats], lam = boxcox(train_test[feats]) 54 | return 
train_test, ntrain 55 | 56 | 57 | if __name__ == "__main__": 58 | 59 | print('Started') 60 | train = pd.read_csv('../input/train.csv') 61 | test = pd.read_csv('../input/test.csv') 62 | 63 | numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x] 64 | categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x] 65 | train_test, ntrain = mungeskewed(train, test, numeric_feats) 66 | 67 | print('') 68 | for comb in itertools.combinations(COMB_FEATURE, 2): 69 | feat = comb[0] + "_" + comb[1] 70 | train_test[feat] = train_test[comb[0]] + train_test[comb[1]] 71 | # train_test[feat] = train_test[feat].apply(encode) 72 | print('Analyzing Columns:', feat) 73 | 74 | categorical_feats = [x for x in train_test.columns[1:] if 'cat' in x] 75 | 76 | print('') 77 | for col in categorical_feats: 78 | print('Analyzing Column:', col) 79 | train_test[col] = train_test[col].apply(encode) 80 | 81 | print(train_test[categorical_feats]) 82 | 83 | ss = StandardScaler() 84 | train_test[numeric_feats] = \ 85 | ss.fit_transform(train_test[numeric_feats].values) 86 | 87 | train = train_test.iloc[:ntrain, :].copy() 88 | test = train_test.iloc[ntrain:, :].copy() 89 | 90 | print('\nMedian Loss:', train.loss.median()) 91 | print('Mean Loss:', train.loss.mean()) 92 | 93 | ids = pd.read_csv('input/test.csv')['id'] 94 | train_y = np.log(train['loss'] + shift) 95 | train_x = train.drop(['loss', 'id'], axis=1) 96 | test_x = test.drop(['loss', 'id'], axis=1) 97 | 98 | n_folds = 10 99 | cv_sum = 0 100 | early_stopping = 100 101 | fpred = [] 102 | xgb_rounds = [] 103 | 104 | d_train_full = xgb.DMatrix(train_x, label=train_y) 105 | d_test = xgb.DMatrix(test_x) 106 | 107 | kf = KFold(train.shape[0], n_folds=n_folds) 108 | for i, (train_index, test_index) in enumerate(kf): 109 | print('\n Fold %d' % (i + 1)) 110 | X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index] 111 | y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index] 112 | 113 | rand_state = 2016 114 | 115 | params = { 116 | 'seed': 0, 117 | 'colsample_bytree': 0.7, 118 | 'silent': 1, 119 | 'subsample': 0.7, 120 | 'learning_rate': 0.03, 121 | 'objective': 'reg:linear', 122 | 'max_depth': 12, 123 | 'min_child_weight': 100, 124 | 'booster': 'gbtree'} 125 | 126 | d_train = xgb.DMatrix(X_train, label=y_train) 127 | d_valid = xgb.DMatrix(X_val, label=y_val) 128 | watchlist = [(d_train, 'train'), (d_valid, 'eval')] 129 | 130 | clf = xgb.train(params, 131 | d_train, 132 | 100000, 133 | watchlist, 134 | early_stopping_rounds=50, 135 | obj=fair_obj, 136 | feval=xg_eval_mae) 137 | 138 | xgb_rounds.append(clf.best_iteration) 139 | scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit) 140 | cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val)) 141 | print('eval-MAE: %.6f' % cv_score) 142 | y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift 143 | 144 | if i > 0: 145 | fpred = pred + y_pred 146 | else: 147 | fpred = y_pred 148 | pred = fpred 149 | cv_sum = cv_sum + cv_score 150 | 151 | mpred = pred / n_folds 152 | score = cv_sum / n_folds 153 | print('Average eval-MAE: %.6f' % score) 154 | n_rounds = int(np.mean(xgb_rounds)) 155 | 156 | print("Writing results") 157 | result = pd.DataFrame(mpred, columns=['loss']) 158 | result["id"] = ids 159 | result = result.set_index("id") 160 | 161 | print("Writing submission:") 162 | result.to_csv('../submit/submit.csv', index=True, index_label='id') 163 | -------------------------------------------------------------------------------- /Allstate Claims 
Severity/script2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import xgboost as xgb 4 | from sklearn.cross_validation import KFold 5 | from sklearn.metrics import mean_absolute_error 6 | 7 | train = pd.read_csv('../input/train.csv') 8 | test = pd.read_csv('../input/test.csv') 9 | 10 | test['loss'] = np.nan 11 | joined = pd.concat([train, test]) 12 | 13 | 14 | def logregobj(preds, dtrain): 15 | labels = dtrain.get_label() 16 | con = 2 17 | x = preds - labels 18 | grad = con * x / (np.abs(x) + con) 19 | hess = con ** 2 / (np.abs(x) + con) ** 2 20 | return grad, hess 21 | 22 | 23 | def evalerror(preds, dtrain): 24 | labels = dtrain.get_label() 25 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 26 | 27 | 28 | cat_feature = [n for n in joined.columns if n.startswith('cat')] 29 | cont_feature = [n for n in joined.columns if n.startswith('cont')] 30 | 31 | if __name__ == '__main__': 32 | 33 | for column in cat_feature: 34 | joined[column] = pd.factorize(joined[column].values, sort=True)[0] 35 | 36 | train = joined[joined['loss'].notnull()] 37 | test = joined[joined['loss'].isnull()] 38 | 39 | shift = 200 40 | y = np.log(train['loss'] + shift) 41 | ids = test['id'] 42 | X = train.drop(['loss', 'id'], 1) 43 | X_test = test.drop(['loss', 'id'], 1) 44 | 45 | n_folds = 5 46 | kf = KFold(X.shape[0], n_folds=n_folds) 47 | prediction = np.zeros(ids.shape) 48 | 49 | # final_fold_prediction = [] 50 | # final_fold_real = [] 51 | 52 | partial_evalutaion = open('temp_scores.txt', 'w') 53 | for i, (train_index, test_index) in enumerate(kf): 54 | print('\n Fold %d' % (i + 1)) 55 | X_train, X_val = X.iloc[train_index], X.iloc[test_index] 56 | y_train, y_val = y.iloc[train_index], y.iloc[test_index] 57 | 58 | RANDOM_STATE = 2016 59 | params = { 60 | 'min_child_weight': 1, 61 | 'eta': 0.001, 62 | 'colsample_bytree': 0.5, 63 | 'max_depth': 12, 64 | 'subsample': 0.8, 65 | 'alpha': 1, 66 | 'gamma': 1, 67 | 'silent': 1, 68 | 'verbose_eval': True, 69 | 'seed': RANDOM_STATE 70 | } 71 | 72 | xgtrain = xgb.DMatrix(X_train, label=y_train) 73 | xgtrain_2 = xgb.DMatrix(X_val, label=y_val) 74 | 75 | xgtest = xgb.DMatrix(X_test) 76 | 77 | watchlist = [(xgtrain, 'train'), (xgtrain_2, 'eval')] 78 | 79 | model = xgb.train(params, xgtrain, 100000, watchlist, obj=logregobj, feval=evalerror, early_stopping_rounds=300) 80 | prediction += np.exp(model.predict(xgtest)) - shift 81 | 82 | # X_val = xgb.DMatrix(X_val) 83 | # temp_serises = pd.Series(np.exp(model.predict(X_val)) - shift) 84 | # final_fold_prediction.append(temp_serises) 85 | # temp_serises = np.exp(y_val) - shift 86 | # final_fold_real.append(temp_serises) 87 | # 88 | # temp_cv_score = mean_absolute_error(np.exp(model.predict(X_val)) - shift, np.exp(y_val) - shift) 89 | # 90 | # partial_evalutaion.write('fold ' + str(i) + ' ' + str(temp_cv_score) + '\n') 91 | # partial_evalutaion.flush() 92 | 93 | prediction = prediction / n_folds 94 | submission = pd.DataFrame() 95 | submission['id'] = ids 96 | submission['loss'] = prediction 97 | 98 | submission.to_csv('../submit/submit2.csv', index=False) 99 | 100 | # final_fold_prediction = pd.concat(final_fold_prediction, ignore_index=True) 101 | # final_fold_real = pd.concat(final_fold_real, ignore_index=True) 102 | # 103 | # cv_score = mean_absolute_error(final_fold_prediction, final_fold_real) 104 | # print cv_score 105 | -------------------------------------------------------------------------------- /Allstate Claims 
Severity/script3.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Vladimir Iglovikov' 2 | 3 | import pandas as pd 4 | import numpy as np 5 | import xgboost as xgb 6 | 7 | from sklearn.metrics import mean_absolute_error 8 | 9 | train = pd.read_csv('../input/train.csv') 10 | test = pd.read_csv('../input/test.csv') 11 | test['loss'] = np.nan 12 | joined = pd.concat([train, test]) 13 | 14 | 15 | def evalerror(preds, dtrain): 16 | labels = dtrain.get_label() 17 | return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)) 18 | 19 | 20 | if __name__ == '__main__': 21 | for column in list(train.select_dtypes(include=['object']).columns): 22 | if train[column].nunique() != test[column].nunique(): 23 | 24 | set_train = set(train[column].unique()) 25 | set_test = set(test[column].unique()) 26 | remove_train = set_train - set_test 27 | remove_test = set_test - set_train 28 | 29 | remove = remove_train.union(remove_test) 30 | 31 | 32 | def filter_cat(x): 33 | if x in remove: 34 | return np.nan 35 | return x 36 | 37 | 38 | joined[column] = joined[column].apply(lambda x: filter_cat(x), 1) 39 | 40 | joined[column] = pd.factorize(joined[column].values, sort=True)[0] 41 | 42 | train = joined[joined['loss'].notnull()] 43 | test = joined[joined['loss'].isnull()] 44 | 45 | shift = 200 46 | y = np.log(train['loss'] + shift) 47 | ids = test['id'] 48 | X = train.drop(['loss', 'id'], 1) 49 | X_test = test.drop(['loss', 'id'], 1) 50 | 51 | RANDOM_STATE = 2016 52 | params = { 53 | 'min_child_weight': 1, 54 | 'eta': 0.01, 55 | 'colsample_bytree': 0.5, 56 | 'max_depth': 12, 57 | 'subsample': 0.8, 58 | 'alpha': 1, 59 | 'gamma': 1, 60 | 'silent': 1, 61 | 'verbose_eval': True, 62 | 'seed': RANDOM_STATE 63 | } 64 | 65 | xgtrain = xgb.DMatrix(X, label=y) 66 | xgtest = xgb.DMatrix(X_test) 67 | 68 | model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror) 69 | 70 | prediction = np.exp(model.predict(xgtest)) - shift 71 | 72 | submission = pd.DataFrame() 73 | submission['loss'] = prediction 74 | submission['id'] = ids 75 | submission.to_csv('../submit/sub_v.csv', index=False) 76 | -------------------------------------------------------------------------------- /Allstate Claims Severity/script4.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | 5 | from datetime import datetime 6 | from sklearn.metrics import mean_absolute_error 7 | from sklearn.cross_validation import KFold 8 | from scipy.stats import skew, boxcox 9 | from sklearn import preprocessing 10 | from sklearn.preprocessing import StandardScaler 11 | import itertools 12 | 13 | shift = 200 14 | COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \ 15 | 'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \ 16 | 'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \ 17 | 'cat4,cat14,cat38,cat24,cat82,cat25'.split(',') 18 | 19 | 20 | def encode(charcode): 21 | r = 0 22 | ln = len(str(charcode)) 23 | for i in range(ln): 24 | r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1) 25 | return r 26 | 27 | fair_constant = 0.7 28 | def fair_obj(preds, dtrain): 29 | labels = dtrain.get_label() 30 | x = (preds - labels) 31 | den = abs(x) + fair_constant 32 | grad = fair_constant * x / (den) 33 | hess = fair_constant * fair_constant / (den * den) 34 | return grad, hess 35 | 36 | def xg_eval_mae(yhat, dtrain): 37 | y = dtrain.get_label() 38 | return 'mae', 
mean_absolute_error(np.exp(y) - shift, np.exp(yhat) - shift) 39 | 40 | def mungeskewed(train, test, numeric_feats): 41 | ntrain = train.shape[0] 42 | test['loss'] = 0 43 | train_test = pd.concat((train, test)).reset_index(drop=True) 44 | skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) 45 | skewed_feats = skewed_feats[skewed_feats > 0.25] 46 | skewed_feats = skewed_feats.index 47 | 48 | for feats in skewed_feats: 49 | train_test[feats] = train_test[feats] + 1 50 | train_test[feats], lam = boxcox(train_test[feats]) 51 | return train_test, ntrain 52 | 53 | 54 | if __name__ == "__main__": 55 | 56 | print('\nStarted') 57 | directory = '../input/' 58 | train = pd.read_csv(directory + 'train.csv') 59 | test = pd.read_csv(directory + 'test.csv') 60 | 61 | numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x] 62 | categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x] 63 | train_test, ntrain = mungeskewed(train, test, numeric_feats) 64 | 65 | # taken from Vladimir's script (https://www.kaggle.com/iglovikov/allstate-claims-severity/xgb-1114) 66 | for column in list(train.select_dtypes(include=['object']).columns): 67 | if train[column].nunique() != test[column].nunique(): 68 | set_train = set(train[column].unique()) 69 | set_test = set(test[column].unique()) 70 | remove_train = set_train - set_test 71 | remove_test = set_test - set_train 72 | 73 | remove = remove_train.union(remove_test) 74 | 75 | def filter_cat(x): 76 | if x in remove: 77 | return np.nan 78 | return x 79 | 80 | train_test[column] = train_test[column].apply(lambda x: filter_cat(x), 1) 81 | 82 | # taken from Ali's script (https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117) 83 | train_test["cont1"] = np.sqrt(preprocessing.minmax_scale(train_test["cont1"])) 84 | train_test["cont4"] = np.sqrt(preprocessing.minmax_scale(train_test["cont4"])) 85 | train_test["cont5"] = np.sqrt(preprocessing.minmax_scale(train_test["cont5"])) 86 | train_test["cont8"] = np.sqrt(preprocessing.minmax_scale(train_test["cont8"])) 87 | train_test["cont10"] = np.sqrt(preprocessing.minmax_scale(train_test["cont10"])) 88 | train_test["cont11"] = np.sqrt(preprocessing.minmax_scale(train_test["cont11"])) 89 | train_test["cont12"] = np.sqrt(preprocessing.minmax_scale(train_test["cont12"])) 90 | 91 | train_test["cont6"] = np.log(preprocessing.minmax_scale(train_test["cont6"]) + 0000.1) 92 | train_test["cont7"] = np.log(preprocessing.minmax_scale(train_test["cont7"]) + 0000.1) 93 | train_test["cont9"] = np.log(preprocessing.minmax_scale(train_test["cont9"]) + 0000.1) 94 | train_test["cont13"] = np.log(preprocessing.minmax_scale(train_test["cont13"]) + 0000.1) 95 | train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25 96 | 97 | print('') 98 | for comb in itertools.combinations(COMB_FEATURE, 2): 99 | feat = comb[0] + "_" + comb[1] 100 | train_test[feat] = train_test[comb[0]] + train_test[comb[1]] 101 | train_test[feat] = train_test[feat].apply(encode) 102 | print('Combining Columns:', feat) 103 | 104 | print('') 105 | for col in categorical_feats: 106 | print('Analyzing Column:', col) 107 | train_test[col] = train_test[col].apply(encode) 108 | 109 | print(train_test[categorical_feats]) 110 | 111 | ss = StandardScaler() 112 | train_test[numeric_feats] = \ 113 | ss.fit_transform(train_test[numeric_feats].values) 114 | 115 | train = train_test.iloc[:ntrain, :].copy() 116 | test = train_test.iloc[ntrain:, :].copy() 117 | 118 | print('\nMedian Loss:', 
train.loss.median()) 119 | print('Mean Loss:', train.loss.mean()) 120 | 121 | ids = pd.read_csv('../input/test.csv')['id'] 122 | train_y = np.log(train['loss'] + shift) 123 | train_x = train.drop(['loss', 'id'], axis=1) 124 | test_x = test.drop(['loss', 'id'], axis=1) 125 | 126 | n_folds = 10 127 | cv_sum = 0 128 | early_stopping = 100 129 | fpred = [] 130 | xgb_rounds = [] 131 | 132 | d_train_full = xgb.DMatrix(train_x, label=train_y) 133 | d_test = xgb.DMatrix(test_x) 134 | 135 | kf = KFold(train.shape[0], n_folds=n_folds) 136 | for i, (train_index, test_index) in enumerate(kf): 137 | print('\n Fold %d' % (i + 1)) 138 | X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index] 139 | y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index] 140 | 141 | rand_state = 2016 142 | 143 | params = { 144 | 'seed': 0, 145 | 'colsample_bytree': 0.7, 146 | 'silent': 1, 147 | 'subsample': 0.7, 148 | 'learning_rate': 0.03, 149 | 'objective': 'reg:linear', 150 | 'max_depth': 12, 151 | 'min_child_weight': 100, 152 | 'booster': 'gbtree'} 153 | 154 | d_train = xgb.DMatrix(X_train, label=y_train) 155 | d_valid = xgb.DMatrix(X_val, label=y_val) 156 | watchlist = [(d_train, 'train'), (d_valid, 'eval')] 157 | 158 | clf = xgb.train(params, 159 | d_train, 160 | 100000, 161 | watchlist, 162 | early_stopping_rounds=50, 163 | obj=fair_obj, 164 | feval=xg_eval_mae) 165 | 166 | xgb_rounds.append(clf.best_iteration) 167 | scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit) 168 | cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val)) 169 | print('eval-MAE: %.6f' % cv_score) 170 | y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift 171 | 172 | if i > 0: 173 | fpred = pred + y_pred 174 | else: 175 | fpred = y_pred 176 | pred = fpred 177 | cv_sum = cv_sum + cv_score 178 | 179 | mpred = pred / n_folds 180 | score = cv_sum / n_folds 181 | print('Average eval-MAE: %.6f' % score) 182 | n_rounds = int(np.mean(xgb_rounds)) 183 | 184 | print("Writing results") 185 | result = pd.DataFrame(mpred, columns=['loss']) 186 | result["id"] = ids 187 | result = result.set_index("id") 188 | print("%d-fold average prediction:" % n_folds) 189 | 190 | now = datetime.now() 191 | score = str(round((cv_sum / n_folds), 6)) 192 | 193 | result.to_csv('../submit/en_submit.csv', index=True, index_label='id') -------------------------------------------------------------------------------- /Allstate Claims Severity/script_keras.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import subprocess 4 | from scipy.sparse import csr_matrix, hstack 5 | from sklearn.metrics import mean_absolute_error 6 | from sklearn.preprocessing import StandardScaler 7 | from sklearn.cross_validation import KFold 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Dropout, Activation 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.layers.advanced_activations import PReLU 12 | 13 | np.random.seed(123) 14 | 15 | 16 | def batch_generator(X, y, batch_size, shuffle): 17 | number_of_batches = np.ceil(X.shape[0] / batch_size) 18 | counter = 0 19 | sample_index = np.arange(X.shape[0]) 20 | if shuffle: 21 | np.random.shuffle(sample_index) 22 | while True: 23 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 24 | X_batch = X[batch_index, :].toarray() 25 | y_batch = y[batch_index] 26 | counter += 1 27 | yield X_batch, y_batch 28 | if (counter 
== number_of_batches): 29 | if shuffle: 30 | np.random.shuffle(sample_index) 31 | counter = 0 32 | 33 | 34 | def batch_generatorp(X, batch_size, shuffle): 35 | number_of_batches = X.shape[0] / np.ceil(X.shape[0] / batch_size) 36 | counter = 0 37 | sample_index = np.arange(X.shape[0]) 38 | while True: 39 | batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)] 40 | X_batch = X[batch_index, :].toarray() 41 | counter += 1 42 | yield X_batch 43 | if (counter == number_of_batches): 44 | counter = 0 45 | 46 | 47 | ## read data 48 | train = pd.read_csv('../input/train.csv') 49 | test = pd.read_csv('../input/test.csv') 50 | 51 | index = list(train.index) 52 | train = train.iloc[index] 53 | 'train = train.iloc[np.random.permutation(len(train))]' 54 | 55 | ## set test loss to NaN 56 | test['loss'] = np.nan 57 | 58 | ## response and IDs 59 | y = np.log(train['loss'].values + 200) 60 | id_train = train['id'].values 61 | id_test = test['id'].values 62 | 63 | ## stack train test 64 | ntrain = train.shape[0] 65 | tr_te = pd.concat((train, test), axis=0) 66 | 67 | ## Preprocessing and transforming to sparse data 68 | sparse_data = [] 69 | 70 | f_cat = [f for f in tr_te.columns if 'cat' in f] 71 | for f in f_cat: 72 | dummy = pd.get_dummies(tr_te[f].astype('category')) 73 | tmp = csr_matrix(dummy) 74 | sparse_data.append(tmp) 75 | 76 | f_num = [f for f in tr_te.columns if 'cont' in f] 77 | scaler = StandardScaler() 78 | tmp = csr_matrix(scaler.fit_transform(tr_te[f_num])) 79 | sparse_data.append(tmp) 80 | 81 | del (tr_te, train, test) 82 | 83 | ## sparse train and test data 84 | xtr_te = hstack(sparse_data, format='csr') 85 | xtrain = xtr_te[:ntrain, :] 86 | xtest = xtr_te[ntrain:, :] 87 | 88 | print('Dim train', xtrain.shape) 89 | print('Dim test', xtest.shape) 90 | 91 | del (xtr_te, sparse_data, tmp) 92 | 93 | 94 | ## neural net 95 | def nn_model(): 96 | model = Sequential() 97 | 98 | model.add(Dense(400, input_dim=xtrain.shape[1], init='he_normal')) 99 | model.add(PReLU()) 100 | model.add(BatchNormalization()) 101 | model.add(Dropout(0.4)) 102 | 103 | model.add(Dense(200, init='he_normal')) 104 | model.add(PReLU()) 105 | model.add(BatchNormalization()) 106 | model.add(Dropout(0.2)) 107 | 108 | model.add(Dense(50, init='he_normal')) 109 | model.add(PReLU()) 110 | model.add(BatchNormalization()) 111 | model.add(Dropout(0.2)) 112 | 113 | model.add(Dense(1, init='he_normal')) 114 | model.compile(loss='mae', optimizer='adadelta') 115 | return (model) 116 | 117 | 118 | ## cv-folds 119 | nfolds = 5 120 | folds = KFold(len(y), n_folds=nfolds, shuffle=True, random_state=111) 121 | 122 | ## train models 123 | i = 0 124 | nbags = 10 125 | nepochs = 55 126 | pred_oob = np.zeros(xtrain.shape[0]) 127 | pred_test = np.zeros(xtest.shape[0]) 128 | 129 | for (inTr, inTe) in folds: 130 | xtr = xtrain[inTr] 131 | ytr = y[inTr] 132 | xte = xtrain[inTe] 133 | yte = y[inTe] 134 | pred = np.zeros(xte.shape[0]) 135 | for j in range(nbags): 136 | model = nn_model() 137 | fit = model.fit_generator(generator=batch_generator(xtr, ytr, 128, True), 138 | nb_epoch=nepochs, 139 | samples_per_epoch=xtr.shape[0], 140 | validation_data=(xte.todense(), yte), 141 | verbose=0) 142 | temp = np.exp( 143 | model.predict_generator(generator=batch_generatorp(xte, 800, False), val_samples=xte.shape[0])[:, 0]) - 200 144 | pred += temp 145 | print( 146 | "Fold val bagging score after", j + 1, "rounds is: ", 147 | mean_absolute_error(np.exp(yte) - 200, pred / (j + 1))) 148 | pred_test += np.exp( 149 | 
model.predict_generator(generator=batch_generatorp(xtest, 800, False), val_samples=xtest.shape[0])[:, 150 | 0]) - 200 151 | pred /= nbags 152 | pred_oob[inTe] = pred 153 | score = mean_absolute_error(np.exp(yte) - 200, pred) 154 | i += 1 155 | print('Fold ', i, '- MAE:', score) 156 | 157 | print('Total - MAE:', mean_absolute_error(np.exp(y) - 200, pred_oob)) 158 | 159 | ## train predictions 160 | df = pd.DataFrame({'id': id_train, 'loss': pred_oob}) 161 | df.to_csv('preds_oob.csv', index=False) 162 | 163 | ## test predictions 164 | pred_test /= (nfolds * nbags) 165 | df = pd.DataFrame({'id': id_test, 'loss': pred_test}) 166 | df.to_csv('submission_keras_shift_perm.csv', index=False) 167 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # kaggle-Solution 2 | This respository contains my code for competition in kaggle. 3 | 4 | - Santander Product Recommendation : [https://www.kaggle.com/c/santander-product-recommendation](https://www.kaggle.com/c/santander-product-recommendation) 5 | - Allstate Claims Severity : [https://www.kaggle.com/c/allstate-claims-severity](https://www.kaggle.com/c/allstate-claims-severity) 6 | 7 | ### Kaggle Top Solutions 8 | - Kaggle Past Solutions : [http://ndres.me/kaggle-past-solutions/](http://ndres.me/kaggle-past-solutions/) 9 | - Kaggle优胜者代码汇总: [http://suanfazu.com/t/kaggle/230](http://suanfazu.com/t/kaggle/230) -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/Rule_main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from collections import defaultdict 4 | 5 | pd.options.mode.chained_assignment = None 6 | 7 | target_col = ['ind_ahor_fin_ult1','ind_aval_fin_ult1','ind_cco_fin_ult1', 'ind_cder_fin_ult1', 8 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1','ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 9 | 'ind_ctpp_fin_ult1','ind_deco_fin_ult1','ind_deme_fin_ult1', 'ind_dela_fin_ult1', 10 | 'ind_ecue_fin_ult1','ind_fond_fin_ult1','ind_hip_fin_ult1', 'ind_plan_fin_ult1', 11 | 'ind_pres_fin_ult1','ind_reca_fin_ult1','ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 12 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 13 | use_cols = ['ncodpers'] + target_col + ['sexo','renta'] 14 | 15 | def get_overbest(df_train): 16 | overbest_dict = {} 17 | for col_name in use_cols[1:25]: 18 | overbest_dict[col_name] = np.sum(df_train[col_name]) 19 | top_products = sorted(overbest_dict, key = overbest_dict.get,reverse = True) 20 | return top_products 21 | 22 | def get_eachbest(df_train): 23 | df_group = df_train[target_col].groupby(df_train['ncodpers']).sum() 24 | eachbest_dict = defaultdict(list) 25 | for ind,row in df_group.iterrows(): 26 | row = row[row != 0].sort_values(ascending=False) 27 | eachbest_dict[ind] = list(row.index) 28 | return eachbest_dict 29 | 30 | def get_lastinstance(last_instance_df): 31 | cust_dict = {} 32 | target_cols = np.array(use_cols[1:25]) 33 | for ind, row in last_instance_df.iterrows(): 34 | cust = row['ncodpers'] 35 | used_products = set(target_cols[np.array(row[1:25] == 1)]) 36 | cust_dict[cust] = used_products 37 | return cust_dict 38 | 39 | def get_similardict(df_train): 40 | df_train['renta'].fillna(0, inplace=True) 41 | df_group1 = df_train[['renta','ind_ahor_fin_ult1']].groupby(df_train['ncodpers']).mean() 42 | mapping = {} 43 | for ind, row in 
df_group1.iterrows(): 44 | if row['renta'] == 0: 45 | mapping[ind] = '0' 46 | elif row['renta'] < 45542.97: 47 | mapping[ind] = '1' 48 | elif row['renta'] < 57629.67: 49 | mapping[ind] = '2' 50 | elif row['renta'] < 68211.78: 51 | mapping[ind] = '3' 52 | elif row['renta'] < 78852.39: 53 | mapping[ind] = '4' 54 | elif row['renta'] < 90461.97: 55 | mapping[ind] = '5' 56 | elif row['renta'] < 103855.23: 57 | mapping[ind] = '6' 58 | elif row['renta'] < 120063.00: 59 | mapping[ind] = '7' 60 | elif row['renta'] < 141347.49: 61 | mapping[ind] = '8' 62 | elif row['renta'] < 173418.12: 63 | mapping[ind] = '9' 64 | elif row['renta'] < 234687.12: 65 | mapping[ind] = '10' 66 | else: 67 | mapping[ind] = '11' 68 | print mapping 69 | df_group2 = df_train[target_col].groupby(df_train['ncodpers']).sum() 70 | df_group3 = df_group2[target_col].groupby(mapping).sum() 71 | 72 | temp_dict = defaultdict(list) 73 | for ind, row in df_group3.iterrows(): 74 | row = row[row != 0].sort_values(ascending=False) 75 | temp_dict[ind] = list(row.index) 76 | 77 | similar_dict = defaultdict(list) 78 | for key in list(df_group1.index): 79 | similar_dict[key] = temp_dict[mapping[key]] 80 | return similar_dict 81 | 82 | def get_kmeansdict(df_train): 83 | df_group1 = df_train[target_col].groupby(df_train['ncodpers']).sum() 84 | df_group1.fillna(0, inplace=True) 85 | 86 | from sklearn.cluster import KMeans 87 | kmeans = KMeans(n_clusters=100) 88 | kmeans.fit(df_group1.values) 89 | 90 | mapping = {} 91 | for key,value in zip(list(df_group1.index),kmeans.labels_): 92 | mapping[key] = value 93 | df_group2 = df_group1[target_col].groupby(mapping).sum() 94 | print mapping 95 | 96 | 97 | temp_dict = defaultdict(list) 98 | for ind, row in df_group2.iterrows(): 99 | row = row[row != 0].sort_values(ascending=False) 100 | temp_dict[ind] = list(row.index) 101 | 102 | kmeans_dict = defaultdict(list) 103 | for key in list(df_group1.index): 104 | kmeans_dict[key] = temp_dict[mapping[key]] 105 | return kmeans_dict 106 | 107 | 108 | if __name__ == "__main__": 109 | print("0") 110 | df_test = pd.read_csv('../input/test_sub_1000.csv', usecols = ['ncodpers'] + target_col) 111 | cust_dict = get_lastinstance(df_test) 112 | del df_test 113 | print("1") 114 | df_train = pd.read_csv('../input/train_sub_1000.csv', usecols = use_cols) 115 | top_products = get_overbest(df_train) 116 | print("2") 117 | eachbest_dict = get_eachbest(df_train) 118 | print("3") 119 | # similar_dict = get_similardict(df_train) 120 | similar_dict = get_kmeansdict(df_train) 121 | print("4") 122 | del df_train 123 | 124 | sub_id = eachbest_dict.keys() 125 | final_preds = [] 126 | 127 | print("Running model") 128 | for ncodper, each_list in eachbest_dict.iteritems(): 129 | used_products = cust_dict.get(ncodper,[]) 130 | similar_product = similar_dict[ncodper] 131 | pred_products = [] 132 | for product in each_list: 133 | if product not in used_products: 134 | pred_products.append(product) 135 | if len(pred_products) == 7: 136 | break 137 | if len(pred_products) < 7: 138 | for product in similar_product: 139 | if (product not in used_products) and (product not in pred_products): 140 | pred_products.append(product) 141 | if len(pred_products) == 7: 142 | break 143 | if len(pred_products) < 7: 144 | for product in top_products: 145 | if (product not in used_products) and (product not in pred_products): 146 | pred_products.append(product) 147 | if len(pred_products) == 7: 148 | break 149 | 150 | final_preds.append(" ".join(pred_products)) 151 | out_df = 
pd.DataFrame({'ncodpers':sub_id,'added_products':final_preds}) 152 | 153 | print("Generate submission...") 154 | sub_92 = pd.read_csv('../input/sample_submission.csv', usecols=['ncodpers']).values[:, 0] 155 | submit = out_df [out_df ['ncodpers'].isin(sub_92)] 156 | submit.loc[:,'ncodpers'] = submit.loc[:,'ncodpers'].astype('int32') 157 | submit.to_csv('../input/submit1.csv', index=False) 158 | -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/code.py: -------------------------------------------------------------------------------- 1 | sub_rf = pd.read_csv('../input/sub_rf.csv', nrows=929615) 2 | sub_reg = pd.read_csv('../input/sub_reg.csv', nrows=929615) 3 | sub_union = pd.DataFrame(np.zeros((sub_rf.shape[0], 2)), columns=['ncodpers', 'added_products']) 4 | sub_union['ncodpers'] = sub_rf['ncodpers'] 5 | 6 | added = [] 7 | for x in range(sub_rf.shape[0]): 8 | rf_str = sub_rf.loc[x]['added_products'].split(' ') 9 | reg_str = sub_reg.loc[x]['added_products'].split(' ')[:7] 10 | 11 | str = [] 12 | for str1 in reg_str: 13 | if str1 in rf_str: 14 | str.append(str1) 15 | for str1 in rf_str: 16 | if str1 not in str: 17 | str.append(str1) 18 | added.append(" ".join(str)) 19 | 20 | sub_union.loc[:, 'added_products'] = added 21 | sub_union.to_csv('submit.csv', index=False) 22 | 23 | 24 | # ====================================================================================================================== 25 | def getAge(row): 26 | age = row['age'].strip() 27 | if age == 'NA' or age == '': 28 | age1 = 2 29 | elif float(age) < 20: 30 | age1 = 0 31 | elif float(age) < 30: 32 | age1 = 1 33 | elif float(age) < 40: 34 | age1 = 2 35 | elif float(age) < 50: 36 | age1 = 3 37 | elif float(age) < 60: 38 | age1 = 4 39 | else: 40 | age1 = 5 41 | return age1 42 | 43 | 44 | def getCustSeniority(row): 45 | cust_seniority = row['antiguedad'].strip() 46 | if cust_seniority == 'NA' or cust_seniority == '': 47 | seniority = 2 48 | elif float(cust_seniority) < 50: 49 | seniority = 0 50 | elif float(cust_seniority) < 100: 51 | seniority = 1 52 | elif float(cust_seniority) < 150: 53 | seniority = 2 54 | elif float(cust_seniority) < 200: 55 | seniority = 3 56 | else: 57 | seniority = 4 58 | return seniority 59 | 60 | 61 | def getRent(row): 62 | rent = row['renta'].strip() 63 | if rent == 'NA' or rent == '': 64 | rent1 = 4 65 | elif float(rent) < 45542.97: 66 | rent1 = 1 67 | elif float(rent) < 57629.67: 68 | rent1 = 2 69 | elif float(rent) < 68211.78: 70 | rent1 = 3 71 | elif float(rent) < 78852.39: 72 | rent1 = 4 73 | elif float(rent) < 90461.97: 74 | rent1 = 5 75 | elif float(rent) < 103855.23: 76 | rent1 = 6 77 | elif float(rent) < 120063.00: 78 | rent1 = 7 79 | elif float(rent) < 141347.49: 80 | rent1 = 8 81 | elif float(rent) < 173418.12: 82 | rent1 = 9 83 | elif float(rent) < 234687.12: 84 | rent1 = 10 85 | else: 86 | rent1 = 11 87 | return rent1 88 | 89 | 90 | # df_user['renta'] = df_user['renta'].fillna(df_user.loc[df_user['renta'].notnull(),'renta'].median()) 91 | # ====================================================================================================================== 92 | for con_attr in ['age', 'antiguedad', 'renta']: 93 | group_feats_1 = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: x.sum()) 94 | group_feats_0 = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: x.count() - x.sum()) 95 | group_feats_r = lag_feats[pro_sum_list].groupby(lag_feats[con_attr]).agg(lambda x: round(x.sum() / 
x.count(), 2)) 96 | group_feats_1.columns = [con_attr + '_1_' + str(i) for i in range(24)] 97 | group_feats_0.columns = [con_attr + '_0_' + str(i) for i in range(24)] 98 | group_feats_r.columns = [con_attr + '_r_' + str(i) for i in range(24)] 99 | lag_feats = pd.merge(lag_feats, group_feats_1, left_on=con_attr, right_index=True, how='left') 100 | lag_feats = pd.merge(lag_feats, group_feats_0, left_on=con_attr, right_index=True, how='left') 101 | lag_feats = pd.merge(lag_feats, group_feats_r, left_on=con_attr, right_index=True, how='left') 102 | 103 | 104 | ##====================================================================================================================== 105 | def get_last_buy(x): 106 | stop = 0 107 | for i in [0, 1, 2, 3, 4]: 108 | if x.values[i] == 1: 109 | stop = 5 - i 110 | break 111 | return stop 112 | 113 | 114 | def get_first_buy(x): 115 | start = 0 116 | for i in [4, 3, 2, 1, 0]: 117 | if x.values[i] == 1: 118 | start = 5 - i 119 | break 120 | return start 121 | 122 | 123 | def get_buy_len(x): 124 | x_value = x.values 125 | if x_value[-1] != 0: 126 | len1 = x_value[-1] - x_value[-2] + 1 127 | else: 128 | len1 = 0 129 | return len1 130 | 131 | 132 | def add_com_features(lag_feats): 133 | for i in range(24): 134 | index_list = [11 + i, 35 + i, 59 + i, 83 + i, 107 + i] 135 | lag_feats['prod_sum_' + str(i)] = lag_feats.iloc[:, index_list].sum(axis=1) 136 | lag_feats['first_buy_' + str(i)] = lag_feats.iloc[:, index_list].apply(lambda x: get_first_buy(x), axis=1) 137 | lag_feats['last_buy_' + str(i)] = lag_feats.iloc[:, index_list].apply(lambda x: get_last_buy(x), axis=1) 138 | lag_feats['leng_buy_' + str(i)] = lag_feats.loc[:, ['first_buy_' + str(i), 'last_buy_' + str(i)]].apply( 139 | lambda x: get_buy_len(x), axis=1) 140 | 141 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 142 | pro_rank_list = ['prod_rank_' + str(i) for i in range(24)] 143 | lag_feats[pro_rank_list] = lag_feats[pro_sum_list].apply(lambda x: x.rank(ascending=False).astype('int'), axis=1) 144 | 145 | import_col = [target_cols[i] for i in [0, 2, 4, 9, 10, 11, 15, 16, 19, 20, 21]] 146 | for i in range(1, 6): 147 | pre_import_col = [str(i) + '_' + col for col in import_col] 148 | lag_feats[str(i) + '_11_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 149 | for col in import_col: 150 | lag_feats['1_im_' + col] = lag_feats['1_' + col] 151 | 152 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21], [16, 19, 20, 21]] 153 | for x in range(4): 154 | import_col = [target_cols[i] for i in com_col[x]] 155 | for i in range(1, 6): 156 | pre_import_col = [str(i) + '_' + col for col in import_col] 157 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 158 | return lag_feats 159 | # ======================================================================================================================= 160 | columns = ['age', 'antiguedad', 'renta', 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext'] 161 | columns1 = [] 162 | target_cols1 = [target_cols[i] for i in [0, 2, 4, 5, 6, 7, 9, 10, 11, 15, 16, 17, 19, 20, 21]] 163 | for i in range(1, 6): 164 | columns1.extend([str(i) + '_' + col for col in target_cols1]) 165 | 166 | train_X = pd.read_csv(data_path + 'train_feats_users.csv', usecols=columns + ['label']) 167 | for col in ['sum', 'renta', 'canel', 'pais', 'com20', 'lagf1', 'sum8']: 168 | if col == 'lagf1': 169 | train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv', usecols=columns1) 170 | else: 171 | 
train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv') 172 | train_X = pd.concat([train_X, train_temp], axis=1) 173 | del train_temp 174 | 175 | test_X = pd.read_csv(data_path + 'test_feats_users.csv', usecols=columns) 176 | for col in ['sum', 'renta', 'canel', 'pais', 'com20', 'lagf1', 'sum8']: 177 | if col == 'lagf1': 178 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv', usecols=columns1) 179 | else: 180 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv') 181 | test_X = pd.concat([test_X, test_temp], axis=1) 182 | del test_temp 183 | -------------------------------------------------------------------------------- /Santander Product Recommendation/Others/xgb_v1.py: -------------------------------------------------------------------------------- 1 | 2 | import csv 3 | import datetime 4 | import numpy as np 5 | import pandas as pd 6 | import xgboost as xgb 7 | from sklearn.metrics import log_loss 8 | 9 | 10 | mapping_dict = { 11 | 'sexo' : {-99:0, 'H':0, 'V':1}, 12 | 'ind_actividad_cliente' : {-99:0, '0.0':0, '0':0,'1.0':1, '1':1}, 13 | 'segmento' : {-99:0, '01 - TOP':0, '03 - UNIVERSITARIO':1, '02 - PARTICULARES':2}, 14 | 'ind_nuevo' : {-99:0, '1.0':0, '1':0, '0.0':1, '0':1 }, 15 | 'tiprel_1mes' : {-99:0, 'P':0, 'R':0, 'N':0, 'I':1, 'A':2}, 16 | 'indext' : {-99:0, 'S':0, 'N':1}, 17 | # 'canal_entrada' : {'KHE':6, 'KAT':5 ,'KFC':4, 'KFA':3, 'KHK':2, 'KHQ':1, -99: 0} 18 | } 19 | target_cols1 = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 20 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 21 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 22 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 23 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 24 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 25 | 26 | cat_cols = list(mapping_dict.keys()) 27 | target_cols = target_cols1[2:] 28 | target_cols.remove('ind_cder_fin_ult1') 29 | NUM_CLASS = 22 30 | 31 | def getIndex(row, col): 32 | val = row[col].strip() 33 | if val not in ['', 'NA']: 34 | ind = mapping_dict[col][val] 35 | else: 36 | ind = mapping_dict[col][-99] 37 | return ind 38 | 39 | 40 | def getAge(row): 41 | age = row['age'].strip() 42 | if age == 'NA' or age == '': 43 | age1 = 2 44 | elif float(age) < 20: 45 | age1 = 0 46 | elif float(age) < 30: 47 | age1 = 1 48 | elif float(age) < 40: 49 | age1 = 2 50 | elif float(age) < 50: 51 | age1 = 3 52 | elif float(age) < 60: 53 | age1 = 4 54 | else: 55 | age1 = 5 56 | return age1 57 | 58 | def getCustSeniority(row): 59 | cust_seniority = row['antiguedad'].strip() 60 | if cust_seniority == 'NA' or cust_seniority == '': 61 | seniority = 2 62 | elif float(cust_seniority) < 50: 63 | seniority = 0 64 | elif float(cust_seniority) < 100: 65 | seniority = 1 66 | elif float(cust_seniority) < 150: 67 | seniority = 2 68 | elif float(cust_seniority) < 200: 69 | seniority = 3 70 | else: 71 | seniority = 4 72 | return seniority 73 | 74 | def getRent(row): 75 | rent = row['renta'].strip() 76 | if rent == 'NA' or rent == '': 77 | rent1 = 4 78 | elif float(rent) < 45542.97: 79 | rent1 = 1 80 | elif float(rent) < 57629.67: 81 | rent1 = 2 82 | elif float(rent) < 68211.78: 83 | rent1 = 3 84 | elif float(rent) < 78852.39: 85 | rent1 = 4 86 | elif float(rent) < 90461.97: 87 | rent1 = 5 88 | elif float(rent) < 103855.23: 89 | rent1 = 6 90 | elif float(rent) < 
120063.00: 91 | rent1 = 7 92 | elif float(rent) < 141347.49: 93 | rent1 = 8 94 | elif float(rent) < 173418.12: 95 | rent1 = 9 96 | elif float(rent) < 234687.12: 97 | rent1 = 10 98 | else: 99 | rent1 = 11 100 | return rent1 101 | 102 | def getTarget(row): 103 | tlist = [] 104 | for col in target_cols: 105 | if row[col].strip() in ['', 'NA']: 106 | target = 0 107 | else: 108 | target = int(float(row[col])) 109 | tlist.append(target) 110 | print len(tlist) 111 | return tlist 112 | 113 | def feature_extract(row, prev_target_list): 114 | analy_index = [0,1,8,9,10,13,14,15,16,18,19,20] 115 | pro_feats = [prev_target_list[i] for i in analy_index] 116 | x_vars = [] 117 | for col in cat_cols: 118 | x_vars.append(getIndex(row, col)) 119 | x_vars.append(getAge(row)) 120 | x_vars.append(getCustSeniority(row)) 121 | x_vars.append(getRent(row)) 122 | x_vars.append(prev_target_list.count(1)) 123 | return x_vars + pro_feats 124 | 125 | def getLagFeature(): 126 | data_path = '../input/divide/train' 127 | use_cols = ['ncodpers'] + target_cols1 128 | train_05 = pd.read_csv(data_path + '2015-05-28.csv',usecols = use_cols) 129 | train_04 = pd.read_csv(data_path + '2015-04-28.csv', usecols=use_cols) 130 | train_03 = pd.read_csv(data_path + '2015-03-28.csv', usecols=use_cols) 131 | train_02 = pd.read_csv(data_path + '2015-02-28.csv', usecols=use_cols) 132 | train_01 = pd.read_csv(data_path + '2015-01-28.csv', usecols=use_cols) 133 | train_lag = pd.merge(train_05,train_04, on = 'ncodpers',how = 'left') 134 | train_lag = pd.merge(train_lag, train_03, on = 'ncodpers', how = 'left') 135 | train_lag = pd.merge(train_lag, train_02, on = 'ncodpers', how = 'left') 136 | train_lag = pd.merge(train_lag, train_01, on = 'ncodpers', how = 'left') 137 | train_lag.fillna(0 ,inplace = True) 138 | train_lag_dict = {} 139 | for ind, row in train_lag.iterrows(): 140 | id = int(row['ncodpers']) 141 | train_lag_dict[id] = list(row.values[1:]) 142 | 143 | train_05 = pd.read_csv(data_path + '2016-05-28.csv', usecols=use_cols) 144 | train_04 = pd.read_csv(data_path + '2016-04-28.csv', usecols=use_cols) 145 | train_03 = pd.read_csv(data_path + '2016-03-28.csv', usecols=use_cols) 146 | train_02 = pd.read_csv(data_path + '2016-02-28.csv', usecols=use_cols) 147 | train_01 = pd.read_csv(data_path + '2016-01-28.csv', usecols=use_cols) 148 | train_lag = pd.merge(train_05, train_04, on='ncodpers', how='left') 149 | train_lag = pd.merge(train_lag, train_03, on='ncodpers', how='left') 150 | train_lag = pd.merge(train_lag, train_02, on='ncodpers', how='left') 151 | train_lag = pd.merge(train_lag, train_01, on='ncodpers', how='left') 152 | train_lag.fillna(0, inplace=True) 153 | test_lag_dict = {} 154 | for ind, row in train_lag.iterrows(): 155 | id = int(row['ncodpers']) 156 | test_lag_dict[id] = list(row.values[1:]) 157 | return train_lag_dict,test_lag_dict 158 | 159 | def getTrainTestSet(): 160 | train_lag_dict, test_lag_dict = getLagFeature() 161 | x_vars_list = [] 162 | y_vars_list = [] 163 | data_path = '../input/divide/train' 164 | 165 | train_file = open(data_path + '2015-05-28.csv') 166 | cust_dict = {} 167 | for row in csv.DictReader(train_file): 168 | cust_id = int(row['ncodpers']) 169 | cust_dict[cust_id] = getTarget(row) 170 | train_file.close() 171 | 172 | train_file = open(data_path + '2015-06-28.csv') 173 | for row in csv.DictReader(train_file): 174 | cust_id = int(row['ncodpers']) 175 | prev_target_list = cust_dict.get(cust_id, [0] * NUM_CLASS) 176 | target_list = getTarget(row) 177 | new_products = [max(x1 - x2, 0) for (x1, x2) 
in zip(target_list, prev_target_list)] 178 | if sum(new_products) > 0: 179 | for ind, prod in enumerate(new_products): 180 | if prod > 0: 181 | x_vars = feature_extract(row, prev_target_list) 182 | x_vars.extend(train_lag_dict.get(cust_id, [0] * 120)) 183 | x_vars_list.append(x_vars) 184 | y_vars_list.append(ind) 185 | train_file.close() 186 | 187 | test_file = open(data_path + '2016-05-28.csv') 188 | cust_dict = {} 189 | for row in csv.DictReader(test_file): 190 | cust_id = int(row['ncodpers']) 191 | cust_dict[cust_id] = getTarget(row) 192 | test_file.close() 193 | 194 | x_test_list = [] 195 | test_file = open('../input/test_ver2.csv') 196 | for row in csv.DictReader(test_file): 197 | cust_id = int(row['ncodpers']) 198 | prev_target_list = cust_dict.get(cust_id, [0] * NUM_CLASS) 199 | x_vars = feature_extract(row, prev_target_list) 200 | x_vars.extend(test_lag_dict.get(cust_id, [0] * 120)) 201 | x_test_list.append(x_vars) 202 | test_file.close() 203 | 204 | train_X = np.array(x_vars_list) 205 | train_y = np.array(y_vars_list) 206 | test_X = np.array(x_test_list) 207 | 208 | print train_X.shape, train_y.shape, test_X.shape 209 | return train_X, train_y, test_X 210 | 211 | def runXGB(xgtrain, seed_val=123): 212 | param = { 213 | 'objective' : 'multi:softprob', 214 | 'eval_metric' : "mlogloss", 215 | 'num_class' : NUM_CLASS, 216 | 'silent' : 1, 217 | 'min_child_weight' : 2, 218 | 'eta': 0.05, 219 | 'max_depth': 6, 220 | 'subsample' : 0.9, 221 | 'colsample_bytree' : 0.8, 222 | 'seed' : seed_val 223 | } 224 | num_rounds = 100 225 | model = xgb.train(param, xgtrain, num_rounds) 226 | return model 227 | 228 | if __name__ == "__main__": 229 | 230 | print "feature extract..." 231 | start_time = datetime.datetime.now() 232 | train_X, train_y, test_X = getTrainTestSet() 233 | 234 | xgtrain = xgb.DMatrix(train_X, label = train_y) 235 | xgtest = xgb.DMatrix(test_X) 236 | xgval = xgb.DMatrix(train_X) 237 | y_true = train_y 238 | del train_X, train_y, test_X 239 | print(datetime.datetime.now() - start_time) 240 | 241 | print "running model..." 242 | model = runXGB(xgtrain, seed_val=123) 243 | y_pred = model.predict(xgval) 244 | print log_loss(y_true, y_pred) 245 | 246 | preds = model.predict(xgtest) 247 | del xgtrain, xgtest 248 | print(datetime.datetime.now() - start_time) 249 | 250 | print "Getting the top products.." 
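# [editor's aside] The lines below turn the multi:softprob output (one probability per
# product class, per customer) into the submission format: np.argsort orders each row's
# class indices by probability ascending, np.fliplr reverses them to descending, and the
# leading columns are mapped through target_cols to product names joined by spaces.
# Note that [:, :8] keeps eight candidates, one more than the seven positions scored by
# the competition's MAP@7 metric. A minimal, self-contained sketch of the same idea
# (illustrative values only, not part of the original script):
#     import numpy as np
#     probs = np.array([[0.10, 0.60, 0.30]])        # one customer, three products
#     order = np.fliplr(np.argsort(probs, axis=1))  # -> [[1, 2, 0]], best product first
#     top2 = order[:, :2]                           # -> [[1, 2]], indices of the two best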
251 | target_cols = np.array(target_cols) 252 | preds = np.argsort(preds, axis=1) 253 | preds = np.fliplr(preds)[:, :8] 254 | test_id = np.array(pd.read_csv("../input/test_ver2.csv", usecols=['ncodpers'])['ncodpers']) 255 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 256 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 257 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 258 | print(datetime.datetime.now() - start_time) -------------------------------------------------------------------------------- /Santander Product Recommendation/ensemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # test_id = pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers']) 5 | # nn_preds = pd.read_csv('../input/ensemble/taozi.csv') 6 | # nn_preds = pd.merge(test_id, nn_preds, on = 'ncodpers', how='left') 7 | # del nn_preds['ncodpers'] 8 | # nn_preds.to_csv('../input/ensemble/nn_preds.csv',index = False) 9 | 10 | nn_preds = pd.read_csv('../input/ensemble/nn_preds.csv') 11 | xgb_preds = pd.read_csv('../input/ensemble/xgb_preds.csv') 12 | 13 | preds = (nn_preds + xgb_preds) / 2 14 | target_cols = preds.columns 15 | del nn_preds, xgb_preds 16 | 17 | # target_cols = np.array(target_cols) 18 | preds = np.argsort(preds, axis=1) 19 | preds = np.fliplr(preds)[:, :7] 20 | test_id = np.array(pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 21 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 22 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 23 | out_df.to_csv('../submit/sub_com.csv', index=False) 24 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_combine.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import itertools 4 | 5 | target_cols = ['ind_cco_fin_ult1', 'ind_cder_fin_ult1', 6 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 7 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 8 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 9 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 10 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 11 | 12 | 13 | def add_com_feats(lag_feats): 14 | com_feats = [target_cols[i] for i in [0, 2, 15, 16, 19, 20, 21]] 15 | for x, com_cols in enumerate(itertools.combinations(com_feats, 4)): 16 | for i in range(1, 6): 17 | com_col = [str(i) + '_' + col for col in com_cols] 18 | lag_feats[str(x) + '_com4_' + str(i)] = lag_feats[com_col].sum(axis=1) 19 | 20 | return lag_feats.iloc[:, -175:] 21 | 22 | 23 | if __name__ == "__main__": 24 | data_path = '../input/feats/' 25 | train_lag5 = pd.read_csv(data_path + 'train_feats_lag5.csv') 26 | train_add5 = add_com_feats(train_lag5) 27 | train_add5.to_csv(data_path + 'train_feats_come175.csv', index=False) 28 | 29 | test_lag5 = pd.read_csv(data_path + 'test_feats_lag5.csv') 30 | test_add5 = add_com_feats(test_lag5) 31 | test_add5.to_csv(data_path + 'test_feats_come175.csv', index=False) 32 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_extract_v1.py: 
-------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | import pandas as pd 4 | import xgboost as xgb 5 | import itertools 6 | 7 | pd.options.mode.chained_assignment = None 8 | 9 | mapping_dict = { 10 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 11 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 12 | 'segmento': {'nan': 0, '01 - TOP': 1, '03 - UNIVERSITARIO': 2, '02 - PARTICULARES': 3}, 13 | 'ind_nuevo': {'nan': 0, '1.0': 1, '1': 1, '0.0': 2, '0': 2}, 14 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 15 | 'indext': {'nan': 0, 'S': 0, 'N': 1}, 16 | 'indresi': {'nan': 0, 'S': 1, 'N': 2}, 17 | 'indfall': {'nan': 0, 'S': 1, 'N': 2}, 18 | 'indrel': {'nan': 1, '1': 0, '99': 1, '1.0': 0, '99.0': 1}, 19 | 'ind_empleado': {'nan': 0, 'N': 1, 'B': 2, 'F': 3, 'A': 4, 'S': 5}, 20 | 'pais_residencia': {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 21 | 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 22 | 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 23 | 'HU': 106, 'HK': 34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 24 | 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 25 | 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 26 | 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 27 | 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 28 | 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 29 | 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 30 | 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, 'nan': 1, 'LB': 81, 31 | 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 32 | 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 33 | 'QA': 58, 'MZ': 27}, 34 | 'canal_entrada': {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 35 | 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 36 | 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 37 | 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 38 | 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 39 | 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 40 | 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 41 | 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 42 | 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 43 | 'KCI': 65, 44 | 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 45 | 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 46 | 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 47 | 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 48 | 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 
'KEM': 155, 49 | 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 50 | 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 51 | 'KEU': 72, 'KES': 68, 'KEQ': 138, 'nan': 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 52 | 'KFS': 38, 53 | 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 54 | 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 55 | 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11} 56 | } 57 | 58 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 59 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 60 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 61 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 62 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 63 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 64 | 65 | target_cols = target_raw_cols[2:] 66 | 67 | con_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 68 | cat_cols = mapping_dict.keys() 69 | user_cols = con_cols + cat_cols + target_raw_cols 70 | NUM_CLASS = 22 71 | 72 | 73 | def getAge(str_age): 74 | age = str_age.strip() 75 | if age == 'NA' or age == 'nan': 76 | age1 = 2 77 | elif float(age) < 20: 78 | age1 = 0 79 | elif float(age) < 30: 80 | age1 = 1 81 | elif float(age) < 40: 82 | age1 = 2 83 | elif float(age) < 50: 84 | age1 = 3 85 | elif float(age) < 60: 86 | age1 = 4 87 | else: 88 | age1 = 5 89 | return age1 90 | 91 | 92 | def getCustSeniority(str_seniority): 93 | cust_seniority = str_seniority.strip() 94 | if cust_seniority == 'NA' or cust_seniority == 'nan': 95 | seniority = 4 96 | elif float(cust_seniority) < 50: 97 | seniority = 0 98 | elif float(cust_seniority) < 75: 99 | seniority = 1 100 | elif float(cust_seniority) < 100: 101 | seniority = 2 102 | elif float(cust_seniority) < 125: 103 | seniority = 3 104 | elif float(cust_seniority) < 150: 105 | seniority = 4 106 | elif float(cust_seniority) < 175: 107 | seniority = 5 108 | elif float(cust_seniority) < 200: 109 | seniority = 6 110 | elif float(cust_seniority) < 225: 111 | seniority = 7 112 | else: 113 | seniority = 8 114 | return seniority 115 | 116 | 117 | def getRent(str_rent): 118 | rent = str_rent.strip() 119 | if rent == 'NA' or rent == 'nan': 120 | rent1 = 4 121 | elif float(rent) < 45542.97: 122 | rent1 = 1 123 | elif float(rent) < 57629.67: 124 | rent1 = 2 125 | elif float(rent) < 68211.78: 126 | rent1 = 3 127 | elif float(rent) < 78852.39: 128 | rent1 = 4 129 | elif float(rent) < 90461.97: 130 | rent1 = 5 131 | elif float(rent) < 103855.23: 132 | rent1 = 6 133 | elif float(rent) < 120063.00: 134 | rent1 = 7 135 | elif float(rent) < 141347.49: 136 | rent1 = 8 137 | elif float(rent) < 173418.12: 138 | rent1 = 9 139 | elif float(rent) < 234687.12: 140 | rent1 = 10 141 | else: 142 | rent1 = 11 143 | return rent1 144 | 145 | 146 | def add_com_features(lag_feats): 147 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 148 | for x in range(4): 149 | import_col = [target_cols[i] for i in com_col[x]] 150 | for i in range(1, 6): 151 | pre_import_col = [str(i) + '_' + col for col in import_col] 152 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 153 | return 
lag_feats 154 | 155 | 156 | # def add_com_features(lag_feats): 157 | # lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 158 | # for i in range(24): 159 | # index_list = [17+i, 41+i, 65+i, 89+i, 113+i] 160 | # lag_feats['prod_sum_' + str(i)] = lag_feats.iloc[:,index_list].sum(axis = 1) 161 | # 162 | # pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 163 | # group_feats_r = lag_feats[pro_sum_list].groupby(lag_feats['renta' ]).agg(lambda x: round(x.sum() / x.count(), 2)) 164 | # group_feats_r.columns = ['renta_r_' + str(i) for i in range(24)] 165 | # lag_feats = pd.merge(lag_feats, group_feats_r, left_on='renta', right_index=True, how='left') 166 | # return lag_feats 167 | 168 | 169 | def process_train_data(in_file_name, date_list): 170 | this_month = in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 171 | for col in cat_cols: 172 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 173 | for col in target_raw_cols: 174 | this_month[col].fillna(0, inplace=True) 175 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 176 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 177 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 178 | 179 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 180 | del in_file_name 181 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 182 | pre_month_ncodpers = pre_month[['ncodpers']] 183 | pre_month_target = pre_month[target_raw_cols] 184 | pre_month_target = pre_month_target.add_prefix('1_') 185 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 186 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 187 | this_month.fillna(0, inplace=True) 188 | for col in target_cols: 189 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 190 | (this_month[col] - this_month['1_' + col]), 0) 191 | 192 | this_month_target = this_month[target_cols] 193 | this_month = this_month.drop(target_raw_cols, axis=1) 194 | 195 | x_vars_list = [] 196 | y_vars_list = [] 197 | 198 | for i in range(2, len(date_list)): 199 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 200 | tmp = tmp.add_prefix(str(i) + "_") 201 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 202 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 203 | this_month.fillna(0, inplace=True) 204 | del hist_data, tmp 205 | 206 | # this_month = add_com_features(this_month) 207 | # this_month.fillna(0, inplace=True) 208 | 209 | this_month = pd.concat([this_month, this_month_target], axis=1) 210 | for idx, row in this_month.iterrows(): 211 | for i in range(0, NUM_CLASS): 212 | if row[(-NUM_CLASS + i)] > 0: 213 | x_vars_list.append(row[:-NUM_CLASS]) 214 | y_vars_list.append(i) 215 | train_X = np.array(x_vars_list) 216 | return train_X[:, -120:], np.array(y_vars_list) 217 | # return train_X, np.array(y_vars_list) 218 | 219 | 220 | def process_test_data(test_file, hist_file, date_list): 221 | for col in cat_cols: 222 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 223 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 224 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 225 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 226 | 227 | for i in range(0, len(date_list)): 228 | tmp 
= hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 229 | tmp = tmp.add_prefix(str(i + 1) + "_") 230 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 231 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 232 | test_file.fillna(0, inplace=True) 233 | 234 | del hist_file, tmp 235 | # test_file = add_com_features(test_file) 236 | # test_file.fillna(0, inplace=True) 237 | 238 | return test_file.values[:, -120:], test_file.columns[-120:] 239 | # return test_file.values, test_file.columns 240 | 241 | 242 | if __name__ == "__main__": 243 | start_time = datetime.datetime.now() 244 | data_path = '../input/' 245 | print "feature extract..." 246 | 247 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 248 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 249 | usecols=user_cols) 250 | print datetime.datetime.now() - start_time 251 | 252 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 253 | '2015-03-28', '2015-02-28', '2015-01-28']) 254 | # train_X = train_X[:, 2:] 255 | print datetime.datetime.now() - start_time 256 | 257 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 258 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 259 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 260 | 261 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 262 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 263 | usecols=con_cols + cat_cols) 264 | 265 | test_X, feats = process_test_data(test_file, train_file, data_date) 266 | print datetime.datetime.now() - start_time 267 | 268 | del train_file, test_file 269 | # test_X = test_X[:, 2:] 270 | # feats = feats[2:] 271 | print train_X.shape, train_y.shape, test_X.shape 272 | 273 | df_train = pd.DataFrame(train_X, columns=feats) 274 | # df_train['label'] = train_y 275 | df_test = pd.DataFrame(test_X, columns=feats) 276 | 277 | df_train.to_csv(data_path + 'feats/train_feats_lag5.csv', index=False) 278 | df_test.to_csv(data_path + 'feats/test_feats_lag5.csv', index=False) 279 | print datetime.datetime.now() - start_time 280 | -------------------------------------------------------------------------------- /Santander Product Recommendation/feature_extract_v2.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import numpy as np 3 | import pandas as pd 4 | import xgboost as xgb 5 | import itertools 6 | 7 | pd.options.mode.chained_assignment = None 8 | 9 | mapping_dict = { 10 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 11 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 12 | 'segmento': {'nan': 0, '01 - TOP': 1, '03 - UNIVERSITARIO': 2, '02 - PARTICULARES': 3}, 13 | 'ind_nuevo': {'nan': 0, '1.0': 1, '1': 1, '0.0': 2, '0': 2}, 14 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 15 | 'indext': {'nan': 0, 'S': 0, 'N': 1}, 16 | # 'indresi' : {'nan':0, 'S':1, 'N':2}, 17 | # 'indfall' : {'nan':0, 'S':1, 'N':2}, 18 | # 'indrel' : {'nan':1, '1':0, '99':1, '1.0':0, '99.0':1}, 19 | # 'ind_empleado' : {'nan':0, 'N':1, 'B':2, 'F':3, 'A':4, 'S':5}, 20 | 'pais_residencia': {'LV': 102, 'BE': 12, 'BG': 50, 'BA': 61, 'BM': 117, 'BO': 62, 'JP': 82, 'JM': 116, 'BR': 17, 21 | 'BY': 64, 'BZ': 113, 'RU': 43, 'RS': 89, 'RO': 41, 'GW': 99, 'GT': 44, 'GR': 39, 'GQ': 73, 22 | 'GE': 78, 'GB': 9, 'GA': 45, 'GN': 98, 'GM': 110, 'GI': 96, 'GH': 88, 'OM': 100, 'HR': 67, 23 | 'HU': 106, 'HK': 
34, 'HN': 22, 'AD': 35, 'PR': 40, 'PT': 26, 'PY': 51, 'PA': 60, 'PE': 20, 24 | 'PK': 84, 'PH': 91, 'PL': 30, 'EE': 52, 'EG': 74, 'ZA': 75, 'EC': 19, 'AL': 25, 'VN': 90, 25 | 'ET': 54, 'ZW': 114, 'ES': 0, 'MD': 68, 'UY': 77, 'MM': 94, 'ML': 104, 'US': 15, 'MT': 118, 26 | 'MR': 48, 'UA': 49, 'MX': 16, 'IL': 42, 'FR': 8, 'MA': 38, 'FI': 23, 'NI': 33, 'NL': 7, 27 | 'NO': 46, 'NG': 83, 'NZ': 93, 'CI': 57, 'CH': 3, 'CO': 21, 'CN': 28, 'CM': 55, 'CL': 4, 28 | 'CA': 2, 'CG': 101, 'CF': 109, 'CD': 112, 'CZ': 36, 'CR': 32, 'CU': 72, 'KE': 65, 'KH': 95, 29 | 'SV': 53, 'SK': 69, 'KR': 87, 'KW': 92, 'SN': 47, 'SL': 97, 'KZ': 111, 'SA': 56, 'SG': 66, 30 | 'SE': 24, 'DO': 11, 'DJ': 115, 'DK': 76, 'DE': 10, 'DZ': 80, 'MK': 105, 'nan': 1, 'LB': 81, 31 | 'TW': 29, 'TR': 70, 'TN': 85, 'LT': 103, 'LU': 59, 'TH': 79, 'TG': 86, 'LY': 108, 'AE': 37, 32 | 'VE': 14, 'IS': 107, 'IT': 18, 'AO': 71, 'AR': 13, 'AU': 63, 'AT': 6, 'IN': 31, 'IE': 5, 33 | 'QA': 58, 'MZ': 27}, 34 | 'canal_entrada': {'013': 49, 'KHP': 160, 'KHQ': 157, 'KHR': 161, 'KHS': 162, 'KHK': 10, 'KHL': 0, 'KHM': 12, 35 | 'KHN': 21, 'KHO': 13, 'KHA': 22, 'KHC': 9, 'KHD': 2, 'KHE': 1, 'KHF': 19, '025': 159, 'KAC': 57, 36 | 'KAB': 28, 'KAA': 39, 'KAG': 26, 'KAF': 23, 'KAE': 30, 'KAD': 16, 'KAK': 51, 'KAJ': 41, 37 | 'KAI': 35, 'KAH': 31, 'KAO': 94, 'KAN': 110, 'KAM': 107, 'KAL': 74, 'KAS': 70, 'KAR': 32, 38 | 'KAQ': 37, 'KAP': 46, 'KAW': 76, 'KAV': 139, 'KAU': 142, 'KAT': 5, 'KAZ': 7, 'KAY': 54, 39 | 'KBJ': 133, 'KBH': 90, 'KBN': 122, 'KBO': 64, 'KBL': 88, 'KBM': 135, 'KBB': 131, 'KBF': 102, 40 | 'KBG': 17, 'KBD': 109, 'KBE': 119, 'KBZ': 67, 'KBX': 116, 'KBY': 111, 'KBR': 101, 'KBS': 118, 41 | 'KBP': 121, 'KBQ': 62, 'KBV': 100, 'KBW': 114, 'KBU': 55, 'KCE': 86, 'KCD': 85, 'KCG': 59, 42 | 'KCF': 105, 'KCA': 73, 'KCC': 29, 'KCB': 78, 'KCM': 82, 'KCL': 53, 'KCO': 104, 'KCN': 81, 43 | 'KCI': 65, 44 | 'KCH': 84, 'KCK': 52, 'KCJ': 156, 'KCU': 115, 'KCT': 112, 'KCV': 106, 'KCQ': 154, 'KCP': 129, 45 | 'KCS': 77, 'KCR': 153, 'KCX': 120, 'RED': 8, 'KDL': 158, 'KDM': 130, 'KDN': 151, 'KDO': 60, 46 | 'KDH': 14, 'KDI': 150, 'KDD': 113, 'KDE': 47, 'KDF': 127, 'KDG': 126, 'KDA': 63, 'KDB': 117, 47 | 'KDC': 75, 'KDX': 69, 'KDY': 61, 'KDZ': 99, 'KDT': 58, 'KDU': 79, 'KDV': 91, 'KDW': 132, 48 | 'KDP': 103, 'KDQ': 80, 'KDR': 56, 'KDS': 124, 'K00': 50, 'KEO': 96, 'KEN': 137, 'KEM': 155, 49 | 'KEL': 125, 'KEK': 145, 'KEJ': 95, 'KEI': 97, 'KEH': 15, 'KEG': 136, 'KEF': 128, 'KEE': 152, 50 | 'KED': 143, 'KEC': 66, 'KEB': 123, 'KEA': 89, 'KEZ': 108, 'KEY': 93, 'KEW': 98, 'KEV': 87, 51 | 'KEU': 72, 'KES': 68, 'KEQ': 138, 'nan': 6, 'KFV': 48, 'KFT': 92, 'KFU': 36, 'KFR': 144, 52 | 'KFS': 38, 53 | 'KFP': 40, 'KFF': 45, 'KFG': 27, 'KFD': 25, 'KFE': 148, 'KFB': 146, 'KFC': 4, 'KFA': 3, 'KFN': 42, 54 | 'KFL': 34, 'KFM': 141, 'KFJ': 33, 'KFK': 20, 'KFH': 140, 'KFI': 134, '007': 71, '004': 83, 55 | 'KGU': 149, 'KGW': 147, 'KGV': 43, 'KGY': 44, 'KGX': 24, 'KGC': 18, 'KGN': 11} 56 | } 57 | 58 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 59 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 60 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 61 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 62 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 63 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 64 | 65 | target_cols = target_raw_cols[2:] 66 | 67 | con_cols = 
['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 68 | cat_cols = mapping_dict.keys() 69 | user_cols = con_cols + cat_cols + target_raw_cols 70 | NUM_CLASS = 22 71 | 72 | 73 | def getAge(str_age): 74 | age = str_age.strip() 75 | if age == 'NA' or age == 'nan': 76 | age1 = 2 77 | elif float(age) < 20: 78 | age1 = 0 79 | elif float(age) < 30: 80 | age1 = 1 81 | elif float(age) < 40: 82 | age1 = 2 83 | elif float(age) < 50: 84 | age1 = 3 85 | elif float(age) < 60: 86 | age1 = 4 87 | else: 88 | age1 = 5 89 | return age1 90 | 91 | 92 | def getCustSeniority(str_seniority): 93 | cust_seniority = str_seniority.strip() 94 | if cust_seniority == 'NA' or cust_seniority == 'nan': 95 | seniority = 4 96 | elif float(cust_seniority) < 50: 97 | seniority = 0 98 | elif float(cust_seniority) < 75: 99 | seniority = 1 100 | elif float(cust_seniority) < 100: 101 | seniority = 2 102 | elif float(cust_seniority) < 125: 103 | seniority = 3 104 | elif float(cust_seniority) < 150: 105 | seniority = 4 106 | elif float(cust_seniority) < 175: 107 | seniority = 5 108 | elif float(cust_seniority) < 200: 109 | seniority = 6 110 | elif float(cust_seniority) < 225: 111 | seniority = 7 112 | else: 113 | seniority = 8 114 | return seniority 115 | 116 | 117 | def getRent(str_rent): 118 | rent = str_rent.strip() 119 | if rent == 'NA' or rent == 'nan': 120 | rent1 = 4 121 | elif float(rent) < 45542.97: 122 | rent1 = 1 123 | elif float(rent) < 57629.67: 124 | rent1 = 2 125 | elif float(rent) < 68211.78: 126 | rent1 = 3 127 | elif float(rent) < 78852.39: 128 | rent1 = 4 129 | elif float(rent) < 90461.97: 130 | rent1 = 5 131 | elif float(rent) < 103855.23: 132 | rent1 = 6 133 | elif float(rent) < 120063.00: 134 | rent1 = 7 135 | elif float(rent) < 141347.49: 136 | rent1 = 8 137 | elif float(rent) < 173418.12: 138 | rent1 = 9 139 | elif float(rent) < 234687.12: 140 | rent1 = 10 141 | else: 142 | rent1 = 11 143 | return rent1 144 | 145 | 146 | def add_com_features(lag_feats): 147 | lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 148 | for i, pre in enumerate(['1_', '2_', '3_', '4_', '5_']): 149 | pre_cols = [pre + col for col in target_raw_cols] 150 | lag_feats['sum_24_' + str(i + 1)] = lag_feats.loc[:, pre_cols].sum(axis=1) 151 | sum_24_list = ['sum_24_' + str(i + 1) for i in range(5)] 152 | lag_feats['sum_24_max'] = lag_feats[sum_24_list].max(axis=1) 153 | lag_feats['sum_24_min'] = lag_feats[sum_24_list].min(axis=1) 154 | lag_feats['sum_24_mean'] = lag_feats[sum_24_list].mean(axis=1) 155 | 156 | for i, col in enumerate(target_raw_cols): 157 | index_list = [pre + col for pre in ['1_', '2_', '3_', '4_', '5_']] 158 | lag_feats['prod_sum_' + str(i)] = lag_feats.loc[:, index_list].sum(axis=1) 159 | 160 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 161 | for gp_col in ['renta', 'pais_residencia', 'canal_entrada']: 162 | group_feats = lag_feats[pro_sum_list].groupby(lag_feats[gp_col]).agg(lambda x: round(x.sum() / x.count(), 2)) 163 | group_feats.columns = [gp_col + str(i) for i in range(24)] 164 | lag_feats = pd.merge(lag_feats, group_feats, left_on=gp_col, right_index=True, how='left') 165 | 166 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 167 | for x in range(4): 168 | import_col = [target_cols[i] for i in com_col[x]] 169 | for i in range(1, 6): 170 | pre_import_col = [str(i) + '_' + col for col in import_col] 171 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 172 | return lag_feats 173 | 174 | 175 | def 
process_train_data(in_file_name, date_list): 176 | this_month = in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 177 | for col in cat_cols: 178 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 179 | for col in target_raw_cols: 180 | this_month[col].fillna(0, inplace=True) 181 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 182 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 183 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 184 | 185 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 186 | del in_file_name 187 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 188 | pre_month_ncodpers = pre_month[['ncodpers']] 189 | pre_month_target = pre_month[target_raw_cols] 190 | pre_month_target = pre_month_target.add_prefix('1_') 191 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 192 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 193 | this_month.fillna(0, inplace=True) 194 | for col in target_cols: 195 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 196 | (this_month[col] - this_month['1_' + col]), 0) 197 | 198 | this_month_target = this_month[target_cols] 199 | this_month = this_month.drop(target_raw_cols, axis=1) 200 | 201 | x_vars_list = [] 202 | y_vars_list = [] 203 | 204 | for i in range(2, len(date_list)): 205 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 206 | tmp = tmp.add_prefix(str(i) + "_") 207 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 208 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 209 | this_month.fillna(0, inplace=True) 210 | del hist_data, tmp 211 | 212 | this_month = add_com_features(this_month) 213 | this_month.fillna(0, inplace=True) 214 | 215 | this_month = pd.concat([this_month, this_month_target], axis=1) 216 | for idx, row in this_month.iterrows(): 217 | for i in range(0, NUM_CLASS): 218 | if row[(-NUM_CLASS + i)] > 0: 219 | x_vars_list.append(row[:-NUM_CLASS]) 220 | y_vars_list.append(i) 221 | train_X = np.array(x_vars_list) 222 | # return train_X[:,-120:], np.array(y_vars_list) 223 | return train_X, np.array(y_vars_list) 224 | 225 | 226 | def process_test_data(test_file, hist_file, date_list): 227 | for col in cat_cols: 228 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 229 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 230 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 231 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 232 | 233 | for i in range(0, len(date_list)): 234 | tmp = hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 235 | tmp = tmp.add_prefix(str(i + 1) + "_") 236 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 237 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 238 | test_file.fillna(0, inplace=True) 239 | 240 | del hist_file, tmp 241 | test_file = add_com_features(test_file) 242 | test_file.fillna(0, inplace=True) 243 | 244 | # return test_file.values[:,-120:], test_file.columns[-120:] 245 | return test_file.values, test_file.columns 246 | 247 | 248 | if __name__ == "__main__": 249 | start_time = datetime.datetime.now() 250 | data_path = '../input/' 251 | print "feature extract..." 
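    # Rough flow of this script, as read from the functions above:
    #   1. process_train_data() takes the 2015-06-28 snapshot, subtracts the
    #      2015-05-28 product flags to label newly added products, and attaches
    #      the 2015-01..2015-05 history as lag columns; a lag column is named
    #      '<k>_<product>', e.g. '1_ind_cco_fin_ult1' is the flag one month back
    #      and '5_ind_cco_fin_ult1' five months back.
    #   2. add_com_features() adds per-month product counts (sum_24_*), per-product
    #      five-month sums (prod_sum_*), group means over renta / pais_residencia /
    #      canal_entrada, and a few hand-picked product-combination sums.
    #   3. process_test_data() repeats the lag and aggregate construction for the
    #      2016-06 prediction month using the 2016-01..2016-05 history.
    # train_ver3.csv / test_ver3.csv are assumed to be reduced files written by an
    # earlier prepro.py run (the committed prepro.py writes train_ver4.csv; its
    # commented-out lines show the test_ver3 export and the 2015/2016 date list).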
252 | 253 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 254 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 255 | usecols=user_cols) 256 | print datetime.datetime.now() - start_time 257 | 258 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 259 | '2015-03-28', '2015-02-28', '2015-01-28']) 260 | train_X = train_X[:, 2:] 261 | print datetime.datetime.now() - start_time 262 | 263 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 264 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 265 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 266 | 267 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 268 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 269 | usecols=con_cols + cat_cols) 270 | 271 | test_X, feats = process_test_data(test_file, train_file, data_date) 272 | print datetime.datetime.now() - start_time 273 | 274 | del train_file, test_file 275 | test_X = test_X[:, 2:] 276 | feats = feats[2:] 277 | print train_X.shape, train_y.shape, test_X.shape 278 | 279 | df_train = pd.DataFrame(train_X, columns=feats) 280 | df_train['label'] = train_y 281 | df_test = pd.DataFrame(test_X, columns=feats) 282 | 283 | df_train.to_csv(data_path + 'feats/train_feats_v2.csv', index=False) 284 | df_test.to_csv(data_path + 'feats/test_feats_v1.csv', index=False) 285 | print datetime.datetime.now() - start_time 286 | -------------------------------------------------------------------------------- /Santander Product Recommendation/prepro.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | dtype_dict = \ 5 | {'ncodpers': 'int32', 'age': 'str', 'antiguedad': 'str', 'renta': 'str', 6 | 'ind_cco_fin_ult1': 'float16', 'ind_deme_fin_ult1': 'float16', 'ind_aval_fin_ult1': 'float16', 7 | 'ind_valo_fin_ult1': 'float16', 'ind_reca_fin_ult1': 'float16', 'ind_ctju_fin_ult1': 'float16', 8 | 'ind_cder_fin_ult1': 'float16', 'ind_plan_fin_ult1': 'float16', 'ind_fond_fin_ult1': 'float16', 9 | 'ind_hip_fin_ult1': 'float16', 'ind_pres_fin_ult1': 'float16', 'ind_nomina_ult1': 'float16', 10 | 'ind_cno_fin_ult1': 'float16', 'ind_ctpp_fin_ult1': 'float16', 'ind_ahor_fin_ult1': 'float16', 11 | 'ind_dela_fin_ult1': 'float16', 'ind_ecue_fin_ult1': 'float16', 'ind_nom_pens_ult1': 'float16', 12 | 'ind_recibo_ult1': 'float16', 'ind_deco_fin_ult1': 'float16', 'ind_tjcr_fin_ult1': 'float16', 13 | 'ind_ctop_fin_ult1': 'float16', 'ind_viv_fin_ult1': 'float16', 'ind_ctma_fin_ult1': 'float16'} 14 | 15 | user_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta', 'canal_entrada', 'pais_residencia', 16 | 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext', 'indresi', 17 | 'indfall', 'indrel', 'ind_empleado'] 18 | 19 | pro_cols = \ 20 | ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 21 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 22 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 23 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 24 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 25 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 26 | 27 | # use_date = ['2015-01-28', '2015-02-28', '2015-03-28', '2015-04-28', '2015-05-28', '2015-06-28', 28 | # '2016-01-28', 
'2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28'] 29 | use_date = ['2015-07-28', '2015-08-28', '2015-09-28', '2015-10-28', '2015-11-28', '2015-12-28', '2016-01-28'] 30 | 31 | df_train = pd.read_csv("../input/train_ver2.csv", dtype=dtype_dict, usecols=user_cols + pro_cols) 32 | 33 | df_train = df_train[df_train['fecha_dato'].isin(use_date)] 34 | 35 | df_train.to_csv('../input/train_ver4.csv', index=False) 36 | # df_test = pd.read_csv("../input/test_ver2.csv", dtype={'ncodpers':'int32'},usecols= user_cols) 37 | # 38 | # df_test.to_csv('../input/test_ver3.csv', index = False) 39 | -------------------------------------------------------------------------------- /Santander Product Recommendation/xgb_fast.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import xgboost as xgb 4 | from sklearn.cross_validation import KFold 5 | 6 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 7 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 8 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 9 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 10 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 11 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 12 | 13 | target_cols = target_raw_cols[2:] 14 | NUM_CLASS = 22 15 | 16 | 17 | def runXGB_CV(train_X, train_y, test_X, index, seed_val): 18 | train_index, test_index = index 19 | X_train = train_X[train_index] 20 | y_train = train_y[train_index] 21 | xgtrain = xgb.DMatrix(X_train, label=y_train) 22 | xgtest = xgb.DMatrix(test_X) 23 | param = { 24 | 'objective': 'multi:softprob', 25 | 'eval_metric': "mlogloss", 26 | 'num_class': NUM_CLASS, 27 | 'silent': 1, 28 | 'min_child_weight': 2, 29 | 'eta': 0.06, 30 | 'max_depth': 6, 31 | 'subsample': 0.9, 32 | 'colsample_bytree': 0.8, 33 | 'seed': seed_val 34 | } 35 | num_rounds = 100 36 | model = xgb.train(param, xgtrain, num_rounds) 37 | pred = model.predict(xgtest) 38 | return pred 39 | 40 | 41 | def runXGB(train_X, train_y, test_X, seed_val=123): 42 | param = { 43 | 'objective': 'multi:softprob', 44 | 'eval_metric': "mlogloss", 45 | 'num_class': NUM_CLASS, 46 | 'silent': 1, 47 | 'min_child_weight': 2, 48 | 'eta': 0.06, 49 | 'max_depth': 8, 50 | 'subsample': 0.9, 51 | 'colsample_bytree': 0.8, 52 | 'seed': seed_val 53 | } 54 | num_rounds = 100 55 | xgtrain = xgb.DMatrix(train_X, label=train_y) 56 | xgtest = xgb.DMatrix(test_X) 57 | model = xgb.train(param, xgtrain, num_rounds) 58 | preds = model.predict(xgtest) 59 | return preds 60 | 61 | 62 | if __name__ == "__main__": 63 | cv_sel = 0 64 | print 'read files...' 
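    # This script skips the raw-data pass entirely and stitches together feature
    # CSVs that are assumed to have been written beforehand (by the
    # feature_extract_* / feature_combine scripts or similar; the exact producers
    # of train_feats_users.csv and the *_sum, *_renta, *_canel, *_lag5, *_com20,
    # *_sum8 blocks are not shown here). Because the blocks are combined with
    # pd.concat(..., axis=1), every file must hold the same rows in the same order
    # as train_feats_users.csv; the 'label' column of that file supplies train_y.
    # With cv_sel = 0 the CV branch is skipped and a single model trained on all
    # rows produces the submission.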
65 | data_path = '../input/feats/' 66 | 67 | columns = ['age', 'antiguedad', 'renta', 68 | 'sexo', 'ind_actividad_cliente', 'segmento', 'ind_nuevo', 'tiprel_1mes', 'indext'] 69 | train_X = pd.read_csv(data_path + 'train_feats_users.csv', usecols=columns + ['label']) 70 | for col in ['sum', 'renta', 'canel', 'lag5', 'com20', 'sum8']: 71 | train_temp = pd.read_csv(data_path + 'train_feats_' + col + '.csv') 72 | train_X = pd.concat([train_X, train_temp], axis=1) 73 | del train_temp 74 | 75 | test_X = pd.read_csv(data_path + 'test_feats_users.csv', usecols=columns) 76 | for col in ['sum', 'renta', 'canel', 'lag5', 'com20', 'sum8']: 77 | test_temp = pd.read_csv(data_path + 'test_feats_' + col + '.csv') 78 | test_X = pd.concat([test_X, test_temp], axis=1) 79 | del test_temp 80 | 81 | # train_X = pd.read_csv(data_path + 'train_feats_v1.csv') 82 | # test_X = pd.read_csv(data_path + 'test_feats_v1.csv') 83 | 84 | train_y = train_X['label'].values 85 | train_X = train_X.drop('label', axis=1).values 86 | test_X = test_X.values 87 | print train_X.shape, train_y.shape, test_X.shape 88 | 89 | seed_val = 1234 90 | if cv_sel == 1: 91 | print "running model with cv..." 92 | nfolds = 10 93 | kf = KFold(train_X.shape[0], n_folds=nfolds, shuffle=True, random_state=seed_val) 94 | preds = [0] * NUM_CLASS 95 | for i, index in enumerate(kf): 96 | preds += runXGB_CV(train_X, train_y, test_X, index, seed_val) 97 | print 'fold %d' % (i + 1) 98 | preds = preds / nfolds 99 | 100 | else: 101 | print "running model..." 102 | preds = runXGB(train_X, train_y, test_X, seed_val=seed_val) 103 | 104 | print "Getting the top products..." 105 | target_cols = np.array(target_cols) 106 | preds = np.argsort(preds, axis=1) 107 | preds = np.fliplr(preds)[:, :7] 108 | test_id = np.array(pd.read_csv('../input/test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 109 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 110 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 111 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 112 | -------------------------------------------------------------------------------- /Santander Product Recommendation/xgb_script.py: -------------------------------------------------------------------------------- 1 | ''' 2 | author:TaoZI 3 | date:2016/12/22 4 | ''' 5 | import datetime 6 | import numpy as np 7 | import pandas as pd 8 | import xgboost as xgb 9 | from sklearn.cross_validation import KFold 10 | 11 | pd.options.mode.chained_assignment = None 12 | 13 | mapping_dict = { 14 | 'sexo': {'nan': 0, 'H': 0, 'V': 1}, 15 | 'ind_actividad_cliente': {'nan': 0, '0.0': 0, '0': 0, '1.0': 1, '1': 1}, 16 | 'segmento': {'nan': 0, '01 - TOP': 0, '03 - UNIVERSITARIO': 1, '02 - PARTICULARES': 2}, 17 | 'ind_nuevo': {'nan': 0, '1.0': 0, '1': 0, '0.0': 1, '0': 1}, 18 | 'tiprel_1mes': {'nan': 0, 'P': 0, 'R': 0, 'N': 0, 'I': 1, 'A': 2}, 19 | 'indext': {'nan': 0, 'S': 0, 'N': 1} 20 | } 21 | 22 | target_raw_cols = ['ind_ahor_fin_ult1', 'ind_aval_fin_ult1', 'ind_cco_fin_ult1', 'ind_cder_fin_ult1', 23 | 'ind_cno_fin_ult1', 'ind_ctju_fin_ult1', 'ind_ctma_fin_ult1', 'ind_ctop_fin_ult1', 24 | 'ind_ctpp_fin_ult1', 'ind_deco_fin_ult1', 'ind_deme_fin_ult1', 'ind_dela_fin_ult1', 25 | 'ind_ecue_fin_ult1', 'ind_fond_fin_ult1', 'ind_hip_fin_ult1', 'ind_plan_fin_ult1', 26 | 'ind_pres_fin_ult1', 'ind_reca_fin_ult1', 'ind_tjcr_fin_ult1', 'ind_valo_fin_ult1', 27 | 'ind_viv_fin_ult1', 'ind_nomina_ult1', 'ind_nom_pens_ult1', 'ind_recibo_ult1'] 28 | 29 | target_cols = target_raw_cols[2:] 30 | 31 | 
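# End-to-end variant: feature extraction, optional K-fold averaging and the
# submission file are all produced by this one script. Two spots in the
# __main__ block are worth flagging before re-running it:
#   * 'feats = feats[2:]' references a name that is never assigned here, because
#     this copy of process_test_data() returns only test_file.values (the
#     feature_extract scripts also return the column names as a second value).
#   * the CV accumulator starts as a plain list ('preds = [0] * NUM_CLASS'), so
#     'preds += runXGB_CV(...)' extends the list with prediction rows instead of
#     summing fold probabilities; initialising it as a NumPy zeros array of shape
#     (n_test_rows, NUM_CLASS) is the usual way to average the folds.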
con_cols = ['ncodpers', 'fecha_dato', 'age', 'antiguedad', 'renta'] 32 | cat_cols = mapping_dict.keys() 33 | user_cols = con_cols + cat_cols + target_raw_cols 34 | NUM_CLASS = 22 35 | 36 | 37 | def getAge(str_age): 38 | age = str_age.strip() 39 | if age == 'NA' or age == 'nan': 40 | age1 = 2 41 | elif float(age) < 20: 42 | age1 = 0 43 | elif float(age) < 30: 44 | age1 = 1 45 | elif float(age) < 40: 46 | age1 = 2 47 | elif float(age) < 50: 48 | age1 = 3 49 | elif float(age) < 60: 50 | age1 = 4 51 | else: 52 | age1 = 5 53 | return age1 54 | 55 | 56 | def getCustSeniority(str_seniority): 57 | cust_seniority = str_seniority.strip() 58 | if cust_seniority == 'NA' or cust_seniority == 'nan': 59 | seniority = 4 60 | elif float(cust_seniority) < 50: 61 | seniority = 0 62 | elif float(cust_seniority) < 75: 63 | seniority = 1 64 | elif float(cust_seniority) < 100: 65 | seniority = 2 66 | elif float(cust_seniority) < 125: 67 | seniority = 3 68 | elif float(cust_seniority) < 150: 69 | seniority = 4 70 | elif float(cust_seniority) < 175: 71 | seniority = 5 72 | elif float(cust_seniority) < 200: 73 | seniority = 6 74 | elif float(cust_seniority) < 225: 75 | seniority = 7 76 | else: 77 | seniority = 8 78 | return seniority 79 | 80 | 81 | def getRent(str_rent): 82 | rent = str_rent.strip() 83 | if rent == 'NA' or rent == 'nan': 84 | rent1 = 4 85 | elif float(rent) < 45542.97: 86 | rent1 = 1 87 | elif float(rent) < 57629.67: 88 | rent1 = 2 89 | elif float(rent) < 68211.78: 90 | rent1 = 3 91 | elif float(rent) < 78852.39: 92 | rent1 = 4 93 | elif float(rent) < 90461.97: 94 | rent1 = 5 95 | elif float(rent) < 103855.23: 96 | rent1 = 6 97 | elif float(rent) < 120063.00: 98 | rent1 = 7 99 | elif float(rent) < 141347.49: 100 | rent1 = 8 101 | elif float(rent) < 173418.12: 102 | rent1 = 9 103 | elif float(rent) < 234687.12: 104 | rent1 = 10 105 | else: 106 | rent1 = 11 107 | return rent1 108 | 109 | 110 | def add_com_features(lag_feats): 111 | lag_feats['prod_sum'] = lag_feats.apply(lambda x: np.sum(x[-120:]), axis=1) 112 | 113 | for i, pre in enumerate(['1_', '2_', '3_', '4_', '5_']): 114 | pre_cols = [pre + col for col in target_raw_cols] 115 | lag_feats['sum_24_' + str(i + 1)] = lag_feats.loc[:, pre_cols].sum(axis=1) 116 | sum_24_list = ['sum_24_' + str(i + 1) for i in range(5)] 117 | lag_feats['sum_24_max'] = lag_feats[sum_24_list].max(axis=1) 118 | lag_feats['sum_24_min'] = lag_feats[sum_24_list].min(axis=1) 119 | lag_feats['sum_24_mean'] = lag_feats[sum_24_list].mean(axis=1) 120 | 121 | for i, col in enumerate(target_raw_cols): 122 | index_list = [pre + col for pre in ['1_', '2_', '3_', '4_', '5_']] 123 | lag_feats['prod_sum_' + str(i)] = lag_feats.loc[:, index_list].sum(axis=1) 124 | 125 | pro_sum_list = ['prod_sum_' + str(i) for i in range(24)] 126 | for gp_col in ['renta', 'sexo']: 127 | group_feats = lag_feats[pro_sum_list].groupby(lag_feats[gp_col]).agg(lambda x: round(x.sum() / x.count(), 2)) 128 | group_feats.columns = [gp_col + str(i) for i in range(24)] 129 | lag_feats = pd.merge(lag_feats, group_feats, left_on=gp_col, right_index=True, how='left') 130 | 131 | com_col = [[0, 2], [7, 8, 9], [9, 10, 11], [19, 20, 21]] 132 | for x in range(4): 133 | import_col = [target_cols[i] for i in com_col[x]] 134 | for i in range(1, 6): 135 | pre_import_col = [str(i) + '_' + col for col in import_col] 136 | lag_feats[str(i) + '_' + str(x + 1) + '_s_sum_import'] = lag_feats[pre_import_col].sum(axis=1) 137 | return lag_feats 138 | 139 | 140 | def process_train_data(in_file_name, date_list): 141 | this_month = 
in_file_name[in_file_name['fecha_dato'].isin([date_list[0]])] 142 | for col in cat_cols: 143 | this_month[col] = this_month[col].apply(lambda x: mapping_dict[col][str(x)]) 144 | for col in target_raw_cols: 145 | this_month[col].fillna(0, inplace=True) 146 | this_month['age'] = this_month['age'].apply(lambda x: getAge(x)) 147 | this_month['antiguedad'] = this_month['antiguedad'].apply(lambda x: getCustSeniority(x)) 148 | this_month['renta'] = this_month['renta'].apply(lambda x: getRent(str(x))) 149 | 150 | hist_data = in_file_name.loc[:, ['ncodpers', 'fecha_dato'] + target_raw_cols] 151 | del in_file_name 152 | pre_month = hist_data[hist_data['fecha_dato'].isin([date_list[1]])] 153 | pre_month_ncodpers = pre_month[['ncodpers']] 154 | pre_month_target = pre_month[target_raw_cols] 155 | pre_month_target = pre_month_target.add_prefix('1_') 156 | pre_month = pd.concat([pre_month_ncodpers, pre_month_target], axis=1) 157 | this_month = pd.merge(this_month, pre_month, on=['ncodpers'], how='left') 158 | this_month.fillna(0, inplace=True) 159 | for col in target_cols: 160 | this_month[col] = np.where(this_month[col] - this_month['1_' + col] > 0, 161 | (this_month[col] - this_month['1_' + col]), 0) 162 | 163 | this_month_target = this_month[target_cols] 164 | this_month = this_month.drop(target_raw_cols, axis=1) 165 | 166 | x_vars_list = [] 167 | y_vars_list = [] 168 | 169 | for i in range(2, len(date_list)): 170 | tmp = hist_data[hist_data['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 171 | tmp = tmp.add_prefix(str(i) + "_") 172 | tmp.rename(columns={str(i) + '_ncodpers': 'ncodpers'}, inplace=True) 173 | this_month = pd.merge(this_month, tmp, on=['ncodpers'], how='left') 174 | this_month.fillna(0, inplace=True) 175 | del hist_data 176 | 177 | this_month = add_com_features(this_month) 178 | this_month.fillna(0, inplace=True) 179 | 180 | this_month = pd.concat([this_month, this_month_target], axis=1) 181 | for idx, row in this_month.iterrows(): 182 | for i in range(0, 22): 183 | if row[(-22 + i)] > 0: 184 | x_vars_list.append(row[:-22]) 185 | y_vars_list.append(i) 186 | 187 | return np.array(x_vars_list), np.array(y_vars_list) 188 | 189 | 190 | def process_test_data(test_file, hist_file, date_list): 191 | for col in cat_cols: 192 | test_file[col] = test_file[col].apply(lambda x: mapping_dict[col][str(x)]) 193 | test_file['age'] = test_file['age'].apply(lambda x: getAge(x)) 194 | test_file['antiguedad'] = test_file['antiguedad'].apply(lambda x: getCustSeniority(x)) 195 | test_file['renta'] = test_file['renta'].apply(lambda x: getRent(x)) 196 | 197 | for i in range(0, len(date_list)): 198 | tmp = hist_file[hist_file['fecha_dato'].isin([date_list[i]])].loc[:, ['ncodpers'] + target_raw_cols] 199 | tmp = tmp.add_prefix(str(i + 1) + "_") 200 | tmp.rename(columns={str(i + 1) + '_ncodpers': 'ncodpers'}, inplace=True) 201 | test_file = pd.merge(test_file, tmp, on=['ncodpers'], how='left') 202 | test_file.fillna(0, inplace=True) 203 | 204 | del hist_file 205 | 206 | test_file = add_com_features(test_file) 207 | test_file.fillna(0, inplace=True) 208 | return test_file.values 209 | 210 | 211 | def runXGB_CV(train_X, train_y, test_X, index, seed_val): 212 | train_index, test_index = index 213 | X_train = train_X[train_index] 214 | y_train = train_y[train_index] 215 | 216 | xgtrain = xgb.DMatrix(X_train, label=y_train) 217 | xgtest = xgb.DMatrix(test_X) 218 | 219 | param = { 220 | 'objective': 'multi:softprob', 221 | 'eval_metric': "mlogloss", 222 | 'num_class': NUM_CLASS, 223 | 
'silent': 1, 224 | 'min_child_weight': 2, 225 | 'eta': 0.05, 226 | 'max_depth': 6, 227 | 'subsample': 0.9, 228 | 'colsample_bytree': 0.8, 229 | 'seed': seed_val 230 | } 231 | num_rounds = 100 232 | model = xgb.train(param, xgtrain, num_rounds) 233 | pred = model.predict(xgtest) 234 | return pred 235 | 236 | 237 | def runXGB(train_X, train_y, test_X, seed_val=123): 238 | param = { 239 | 'objective': 'multi:softprob', 240 | 'eval_metric': "mlogloss", 241 | 'num_class': NUM_CLASS, 242 | 'silent': 1, 243 | 'min_child_weight': 2, 244 | 'eta': 0.05, 245 | 'max_depth': 6, 246 | 'subsample': 0.9, 247 | 'colsample_bytree': 0.8, 248 | 'seed': seed_val 249 | } 250 | num_rounds = 100 251 | xgtrain = xgb.DMatrix(train_X, label=train_y) 252 | xgtest = xgb.DMatrix(test_X) 253 | 254 | model = xgb.train(param, xgtrain, num_rounds) 255 | preds = model.predict(xgtest) 256 | return preds 257 | 258 | 259 | if __name__ == "__main__": 260 | 261 | cv_sel = 1 262 | start_time = datetime.datetime.now() 263 | data_path = '../input/' 264 | 265 | print "feature extract..." 266 | train_file = pd.read_csv(data_path + 'train_ver3.csv', 267 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 268 | usecols=user_cols) 269 | print datetime.datetime.now() - start_time 270 | 271 | train_X, train_y = process_train_data(train_file, ['2015-06-28', '2015-05-28', '2015-04-28', 272 | '2015-03-28', '2015-02-28', '2015-01-28']) 273 | train_X = train_X[:, 2:] 274 | print datetime.datetime.now() - start_time 275 | 276 | data_date = ['2016-05-28', '2016-04-28', '2016-03-28', '2016-02-28', '2016-01-28'] 277 | train_file = train_file[train_file['fecha_dato'].isin(data_date)].loc[:, 278 | ['ncodpers', 'fecha_dato'] + target_raw_cols] 279 | 280 | test_file = pd.read_csv(data_path + 'test_ver3.csv', 281 | dtype={'age': 'str', 'antiguedad': 'str', 'renta': 'str'}, 282 | usecols=con_cols + cat_cols) 283 | 284 | test_X = process_test_data(test_file, train_file, data_date) 285 | print datetime.datetime.now() - start_time 286 | 287 | del train_file, test_file 288 | test_X = test_X[:, 2:] 289 | feats = feats[2:] 290 | print train_X.shape, train_y.shape, test_X.shape 291 | print datetime.datetime.now() - start_time 292 | 293 | seed_val = 123 294 | if cv_sel == 1: 295 | print "running model with cv..." 296 | nfolds = 5 297 | kf = KFold(train_X.shape[0], n_folds=nfolds, shuffle=True, random_state=seed_val) 298 | preds = [0] * NUM_CLASS 299 | for i, index in enumerate(kf): 300 | preds += runXGB_CV(train_X, train_y, test_X, index, seed_val) 301 | print 'fold %d' % (i + 1) 302 | preds = preds / nfolds 303 | 304 | else: 305 | print "running model with feature..." 306 | preds = runXGB(train_X, train_y, test_X, seed_val) 307 | 308 | del train_X, test_X, train_y 309 | 310 | print "Getting the top products.." 311 | target_cols = np.array(target_cols) 312 | preds = np.argsort(preds, axis=1) 313 | preds = np.fliplr(preds)[:, :7] 314 | test_id = np.array(pd.read_csv(data_path + 'test_ver2.csv', usecols=['ncodpers'])['ncodpers']) 315 | final_preds = [" ".join(list(target_cols[pred])) for pred in preds] 316 | out_df = pd.DataFrame({'ncodpers': test_id, 'added_products': final_preds}) 317 | out_df.to_csv('../submit/sub_xgb.csv', index=False) 318 | --------------------------------------------------------------------------------