├── .gitignore ├── README.md ├── input └── .gitignore └── src ├── .gitignore ├── autoencoder_baseline.py ├── feature_importance.py ├── find_correlation.py ├── lightgbm_all_features.py ├── lightgbm_feaeture_importance_.png ├── lightgbm_feaeture_importance_all_time.png ├── lightgbm_features.py ├── process_features_userlog_all.py ├── process_features_userlog_feb_mar.py ├── process_userlog_all.py ├── process_userlog_feb.py ├── process_userlog_mar.py ├── weight_AveragingEnsemble.py ├── xgboost_features.py └── xgboost_gridsearch.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WSDM-KKBox-s-Churn-Prediction-Challenge
2 | The 11th ACM International Conference on Web Search and Data Mining (WSDM 2018) is challenging you to build an algorithm that predicts whether a subscription user will churn using a donated dataset from KKBOX.
3 |
4 | # Final: rank 43/575
5 |
6 | userlog_features are built from two angles: features over the entire log history | features over recent time windows<br>
7 | 8 | process_userlog_feb.py extracts features for the February training data<br>
9 | process_userlog_mar.py extracts features for the March test data<br>
10 | process_userlog_all.py extracts features over the entire log history<br>
11 | 12 | process_features_userlog_feb_mar.py derives cross features for the most recent month<br>
13 | process_features_userlog_all.py derives cross features over the entire log history<br>
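Below is a minimal sketch of the per-chunk aggregation these scripts share (the helper name is illustrative; column names follow user_logs.csv, and the actual scripts also build weekly and semimonthly splits before re-summing the chunks):<br>

```python
import pandas as pd

def aggregate_user_log(df: pd.DataFrame) -> pd.DataFrame:
    # Per user: count of logged days plus summed play counts and listening seconds.
    func = {'date': 'count', 'num_25': 'sum', 'num_50': 'sum', 'num_75': 'sum',
            'num_985': 'sum', 'num_100': 'sum', 'num_unq': 'sum', 'total_secs': 'sum'}
    out = df.groupby('msno', sort=False).agg(func).reset_index()
    return out.rename(columns={'date': 'log_day_monthly'})

# Usage: stream the very large log file in chunks, aggregate each chunk,
# then re-sum the concatenated partial results per user.
# chunks = pd.read_csv('input/user_logs.csv', chunksize=int(4e7))
# parts = pd.concat(aggregate_user_log(c) for c in chunks)
# monthly = parts.groupby('msno', sort=False).sum().reset_index()
```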
14 | -------------------------------------------------------------------------------- /input/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/autoencoder_baseline.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from keras import optimizers 6 | from keras.callbacks import ModelCheckpoint, TensorBoard 7 | from keras.layers import Dense, Dropout 8 | from keras.models import Sequential 9 | from numpy import random as rm 10 | from sklearn import preprocessing 11 | from sklearn.model_selection import train_test_split 12 | 13 | gc.enable() 14 | 15 | # transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0) 16 | # transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0) 17 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0) 18 | 19 | members = pd.read_csv('../input/members_v3.csv') 20 | 21 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv') 22 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv') 23 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv') 24 | 25 | train = pd.read_csv('../input/train.csv') 26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True) 27 | 28 | test = pd.read_csv('../input/sample_submission_v2.csv') 29 | 30 | # Merge Data 31 | 32 | # train = pd.merge(train, transactions_train, how='left', on='msno') 33 | # test = pd.merge(test, transactions_test, how='left', on='msno') 34 | 35 | train = pd.merge(train, transactions, how='left', on='msno') 36 | test = pd.merge(test, transactions, how='left', on='msno') 37 | 38 | train = pd.merge(train, user_log_train, how='left', on='msno') 39 | test = pd.merge(test, user_log_test, how='left', on='msno') 40 | 41 | train = pd.merge(train, user_log_all, how='left', on='msno') 42 | test = pd.merge(test, user_log_all, how='left', on='msno') 43 | 44 | train = pd.merge(train, members, how='left', on='msno') 45 | test = pd.merge(test, members, how='left', on='msno') 46 | 47 | del transactions, members, user_log_train, user_log_test 48 | gc.collect() 49 | 50 | # Drop duplicates first 51 | test = test.drop_duplicates('msno') 52 | 53 | gender = {'male': 1, 'female': 2} 54 | train['gender'] = train['gender'].map(gender) 55 | test['gender'] = test['gender'].map(gender) 56 | 57 | train['bd'] = train['bd'].replace(0, train['bd'].mode()) 58 | test['bd'] = test['bd'].replace(0, test['bd'].mode()) 59 | 60 | train['gender'] = train['gender'].replace(0, train['gender'].mean()) 61 | test['gender'] = test['gender'].replace(0, test['gender'].mean()) 62 | 63 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 64 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 65 | 66 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) == (train.is_cancel == 0)).astype(np.int8) 67 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) == (test.is_cancel == 0)).astype(np.int8) 68 | 69 | train['notAutorenew_&_cancel'] = 
((train.is_auto_renew == 0) == (train.is_cancel == 1)).astype(np.int8) 70 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) == (test.is_cancel == 1)).astype(np.int8) 71 | 72 | train = train.replace([np.inf, -np.inf], np.nan) 73 | 74 | train = train.fillna(0) 75 | test = test.fillna(0) 76 | 77 | train_0 = train[train['is_churn'] == 0] 78 | train_1 = train[train['is_churn'] == 1] 79 | 80 | ''' 81 | # Enlarge train_1 for 17 times 82 | train_append = train_1 83 | 84 | for _ in range(17): 85 | train_append = train_append.append(train_1) 86 | 87 | train = train_0.append(train_append) 88 | ''' 89 | 90 | 91 | # train1 random sample 1/17 92 | def rand_rows(df, num_rows=5): 93 | subset = rm.choice(df.index.values, size=num_rows) 94 | return df.loc[subset] 95 | 96 | 97 | train_0 = rand_rows(train_0, len(train_1)) 98 | train = train_0.append(train_1) 99 | 100 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']] 101 | 102 | # Add Normalize 103 | min_max_scaler = preprocessing.MinMaxScaler() 104 | train[cols] = min_max_scaler.fit_transform(train[cols]) 105 | 106 | X_train, X_test = train_test_split(train, test_size=0.2, random_state=47, shuffle=True) 107 | y_train = X_train['is_churn'] 108 | X_train = X_train.drop(['msno', 'is_churn'], axis=1) 109 | 110 | y_test = X_test['is_churn'] 111 | X_test = X_test.drop(['msno', 'is_churn'], axis=1) 112 | 113 | X_train = X_train.values 114 | X_test = X_test.values 115 | 116 | input_dim = X_train.shape[1] 117 | 118 | autoencoder = Sequential() 119 | autoencoder.add(Dense(input_dim, input_dim=input_dim)) 120 | 121 | input_dim = int(input_dim / 2) 122 | autoencoder.add(Dense(input_dim, activation='relu')) 123 | autoencoder.add(Dropout(0.5)) 124 | 125 | input_dim = int(input_dim / 2) 126 | autoencoder.add(Dense(input_dim, activation='relu')) 127 | autoencoder.add(Dropout(0.5)) 128 | 129 | input_dim = int(input_dim / 2) 130 | autoencoder.add(Dense(input_dim, activation='relu')) 131 | autoencoder.add(Dropout(0.5)) 132 | 133 | autoencoder.add(Dense(1, activation='sigmoid')) 134 | 135 | autoencoder.summary() 136 | 137 | nb_epoch = 50 138 | batch_size = 32 139 | 140 | sgd = optimizers.SGD(lr=0.002, decay=1e-6, momentum=0.9, nesterov=True) 141 | 142 | autoencoder.compile(optimizer=sgd, 143 | loss='binary_crossentropy', 144 | metrics=['accuracy']) 145 | 146 | checkpointer = ModelCheckpoint(filepath="model.h5", 147 | verbose=1, 148 | save_best_only=True) 149 | 150 | tensorboard = TensorBoard(log_dir='./log', 151 | histogram_freq=0, 152 | write_graph=True, 153 | write_images=True) 154 | 155 | print(X_train.shape) 156 | 157 | history = autoencoder.fit(X_train, y_train, 158 | epochs=nb_epoch, 159 | batch_size=batch_size, 160 | shuffle=True, 161 | validation_data=(X_test, y_test), 162 | verbose=1, 163 | callbacks=[checkpointer, tensorboard]).history 164 | 165 | # autoencoder = load_model('model.h5') 166 | 167 | predictions = autoencoder.predict(test.drop(['msno', 'is_churn'], axis=1).values) 168 | 169 | test['is_churn'] = predictions 170 | test = test[['msno', 'is_churn']] 171 | 172 | test.to_csv('submission_autoencoder_baseline_sgd_0.002_50_32_Dec_15.csv', index=False) 173 | -------------------------------------------------------------------------------- /src/feature_importance.py: -------------------------------------------------------------------------------- 1 | 2 | import pickle 3 | import operator 4 | 5 | 6 | def plot(model): 7 | from xgboost import plot_importance 8 | from matplotlib import pyplot as plt 9 | plot_importance(model) 10 | plt.show() 
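# The main block below loads a pickled xgboost model and prints its features sorted by importance
# (descending); note that pickle files should be opened in binary mode on Python 3, i.e. open(filename, 'rb').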
11 | 12 | 13 | if __name__ == '__main__': 14 | 15 | filename = 'model/xgb_depth_7_round_1800_fold_2_eta_0.002.pkl' 16 | 17 | model = pickle.load(open(filename)) 18 | 19 | importance = model.get_fscore() 20 | importance = sorted(importance.items(), key=operator.itemgetter(1)) 21 | 22 | importance = importance[::-1] 23 | print(importance) 24 | plot(model) 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/find_correlation.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import pandas as pd 6 | import seaborn as sns 7 | 8 | gc.enable() 9 | 10 | # transactions = pd.read_csv('../input/processed_transaction_all.csv') 11 | 12 | members_v1 = pd.read_csv('../input/members.csv') 13 | members_v2 = pd.read_csv('../input/members_v2.csv') 14 | members = members_v1.append(members_v2, ignore_index=True) 15 | 16 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv') 17 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv') 18 | 19 | train_v1 = pd.read_csv('../input/train.csv') 20 | train_v2 = pd.read_csv('../input/train_v2.csv') 21 | train = train_v1.append(train_v2, ignore_index=True) 22 | 23 | test = pd.read_csv('../input/sample_submission_v2.csv') 24 | 25 | # Merge Data 26 | 27 | # train = pd.merge(train, transactions, how='left', on='msno') 28 | # test = pd.merge(test, transactions, how='left', on='msno') 29 | 30 | train = pd.merge(train, user_log_train, how='left', on='msno') 31 | test = pd.merge(test, user_log_test, how='left', on='msno') 32 | 33 | train = pd.merge(train, members, how='left', on='msno') 34 | test = pd.merge(test, members, how='left', on='msno') 35 | 36 | # Drop duplicates first 37 | test = test.drop_duplicates('msno') 38 | 39 | gender = {'male': 1, 'female': 2} 40 | train['gender'] = train['gender'].map(gender) 41 | test['gender'] = test['gender'].map(gender) 42 | 43 | train = train.fillna(0) 44 | test = test.fillna(0) 45 | 46 | # Delete date for now 47 | # train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1) 48 | # test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1) 49 | 50 | corr = train.corr() 51 | 52 | # print('Train Data Set Correlation:') 53 | # print(corr) 54 | 55 | corr.to_csv('user_log_features_without_transaction_corr.csv', index=False) 56 | 57 | # Generate a mask for the upper triangle 58 | mask = np.zeros_like(corr, dtype=np.bool) 59 | mask[np.triu_indices_from(mask)] = True 60 | 61 | # Set up the matplotlib figure 62 | f, ax = plt.subplots(figsize=(11, 9)) 63 | 64 | # Generate a custom diverging colormap 65 | cmap = sns.diverging_palette(220, 10, as_cmap=True) 66 | 67 | # Draw the heatmap with the mask and correct aspect ratio 68 | headmap = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0, 69 | square=True, linewidths=.5, cbar_kws={"shrink": .5}) 70 | fig = headmap.get_figure() 71 | fig.savefig('Features_Correlation_Heatmap_user_log') 72 | -------------------------------------------------------------------------------- /src/lightgbm_all_features.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import lightgbm as lgb 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import ShuffleSplit 8 | 9 | 
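# lightgbm_all_features.py: merge transaction, member, all-history and month-specific user-log
# features onto the train/test ids, train a LightGBM model (with 5-fold CV on logloss), and write
# a submission CSV plus feature-importance outputs.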
gc.enable() 10 | 11 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0) 12 | 13 | members = pd.read_csv('../input/members_v3.csv') 14 | 15 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv') 16 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv') 17 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv') 18 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv') 19 | 20 | train = pd.read_csv('../input/train.csv') 21 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True) 22 | 23 | test = pd.read_csv('../input/sample_submission_v2.csv') 24 | 25 | # Merge Data 26 | 27 | train = pd.merge(train, transactions, how='left', on='msno') 28 | test = pd.merge(test, transactions, how='left', on='msno') 29 | 30 | train = pd.merge(train, user_log_all, how='left', on='msno') 31 | test = pd.merge(test, user_log_all, how='left', on='msno') 32 | 33 | train = pd.merge(train, user_log_feb, how='left', on='msno') 34 | test = pd.merge(test, user_log_mar, how='left', on='msno') 35 | 36 | train = pd.merge(train, members, how='left', on='msno') 37 | test = pd.merge(test, members, how='left', on='msno') 38 | 39 | del transactions, members 40 | gc.collect() 41 | 42 | # Drop duplicates first 43 | test = test.drop_duplicates('msno') 44 | 45 | gender = {'male': 1, 'female': 2} 46 | train['gender'] = train['gender'].map(gender) 47 | test['gender'] = test['gender'].map(gender) 48 | 49 | train['bd'] = train['bd'].replace(0, train['bd'].mode()) 50 | test['bd'] = test['bd'].replace(0, test['bd'].mode()) 51 | 52 | train['gender'] = train['gender'].replace(0, train['gender'].mean()) 53 | test['gender'] = test['gender'].replace(0, test['gender'].mean()) 54 | 55 | # train = train.fillna(0) 56 | # test = test.fillna(0) 57 | 58 | # Delete date for now 59 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 60 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 61 | 62 | # Create 4 new features 63 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) == (train.is_cancel == 0)).astype(np.int8) 64 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) == (test.is_cancel == 0)).astype(np.int8) 65 | 66 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) == (train.is_cancel == 1)).astype(np.int8) 67 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) == (test.is_cancel == 1)).astype(np.int8) 68 | 69 | train = train.drop(['payment_method_id2', 70 | 'payment_method_id3', 71 | 'payment_method_id4', 72 | 'payment_method_id5', 73 | 'payment_method_id6', 74 | 'payment_method_id8', 75 | 'payment_method_id10', 76 | 'payment_method_id11', 77 | 'payment_method_id12', 78 | 'payment_method_id13', 79 | 'payment_method_id14', 80 | 'payment_method_id16', 81 | 'payment_method_id17', 82 | 'payment_method_id18', 83 | 'payment_method_id19', 84 | 'payment_method_id20', 85 | 'payment_method_id21', 86 | 'payment_method_id22', 87 | 'payment_method_id23', 88 | 'payment_method_id24', 89 | 'payment_method_id25', 90 | 'payment_method_id27', 91 | 'payment_method_id28', 92 | 'payment_method_id31', 93 | 'payment_method_id33', 94 | 'payment_method_id34', 95 | 'transaction_date_day', 96 | 'membership_expire_date_day'], axis=1) 97 | 98 | test = test.drop(['payment_method_id2', 99 | 'payment_method_id3', 100 | 'payment_method_id4', 101 | 'payment_method_id5', 102 | 
'payment_method_id6', 103 | 'payment_method_id8', 104 | 'payment_method_id10', 105 | 'payment_method_id11', 106 | 'payment_method_id12', 107 | 'payment_method_id13', 108 | 'payment_method_id14', 109 | 'payment_method_id16', 110 | 'payment_method_id17', 111 | 'payment_method_id18', 112 | 'payment_method_id19', 113 | 'payment_method_id20', 114 | 'payment_method_id21', 115 | 'payment_method_id22', 116 | 'payment_method_id23', 117 | 'payment_method_id24', 118 | 'payment_method_id25', 119 | 'payment_method_id27', 120 | 'payment_method_id28', 121 | 'payment_method_id31', 122 | 'payment_method_id33', 123 | 'payment_method_id34', 124 | 'transaction_date_day', 125 | 'membership_expire_date_day'], axis=1) 126 | 127 | feature_list = [ 128 | # raw data 129 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 130 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn', 131 | # advanced features 132 | # user_log 133 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum', 134 | 'total_secs_sum', 135 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime', 136 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio', 137 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum', 138 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio', 139 | # transactions 140 | 'discount', 'amt_per_day', 'is_discount', 'membership_days', 141 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day', 142 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day' 143 | # members 144 | ] 145 | 146 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']] 147 | 148 | print(cols) 149 | 150 | params = { 151 | 'objective': 'binary', 152 | 'metric': 'binary_logloss', 153 | 'boosting': 'gbdt', 154 | 'learning_rate': 0.002, # small learn rate, large number of iterations 155 | 'verbose': 0, 156 | 'num_leaves': 108, 157 | 'bagging_fraction': 0.95, 158 | 'bagging_freq': 1, 159 | 'bagging_seed': 1, 160 | 'feature_fraction': 0.9, 161 | 'feature_fraction_seed': 1, 162 | 'max_bin': 128, 163 | 'max_depth': 7, 164 | 'reg_alpha': 1, 165 | 'reg_lambda': 0, 166 | 'min_split_gain': 0.5, 167 | 'min_child_weight': 1, 168 | 'min_child_samples': 10, 169 | 'scale_pos_weight': 1 170 | } 171 | 172 | bst = None 173 | 174 | cv_results = lgb.cv( 175 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False, 176 | shuffle=True, 177 | metrics='binary_logloss', 178 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0) 179 | 180 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train): 181 | train_data = lgb.Dataset(train[cols].loc[train_indices, :], 182 | label=train.loc[train_indices, 'is_churn']) 183 | val_data = lgb.Dataset(train[cols].loc[val_indices, :], 184 | label=train.loc[val_indices, 'is_churn']) 185 | 186 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50) 187 | 188 | predictions = bst.predict(test[cols]) 189 | test['is_churn'] = predictions 190 | test = test[['msno', 'is_churn']] 191 | test.to_csv('submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv', 192 | index=False) 193 | 194 | print('Plot feature importances...') 195 
| ax = lgb.plot_importance(bst) 196 | importance = bst.feature_importance() 197 | # importance = sorted(importance., key=operator.itemgetter(1)) 198 | 199 | # importance = importance[::-1] 200 | # print(cols) 201 | # print(type(importance)) 202 | a = pd.DataFrame({'feature': cols, 'importance': importance}) 203 | # print(a) 204 | a.to_csv('feature_importance_features_selection.csv') 205 | # plt.show() 206 | plt.savefig('lightgbm_feaeture_importance_all_time') 207 | -------------------------------------------------------------------------------- /src/lightgbm_feaeture_importance_.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_.png -------------------------------------------------------------------------------- /src/lightgbm_feaeture_importance_all_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_all_time.png -------------------------------------------------------------------------------- /src/lightgbm_features.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import lightgbm as lgb 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.model_selection import ShuffleSplit 8 | 9 | gc.enable() 10 | 11 | transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0) 12 | transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0) 13 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0) 14 | 15 | transactions = transactions[ 16 | ['msno', 'discount', 'amt_per_day', 'is_discount', 'membership_days', 'transaction_date_year', 17 | 'transaction_date_month', 18 | 'transaction_date_day', 'membership_expire_date_year', 'membership_expire_date_month', 19 | 'membership_expire_date_day']] 20 | 21 | members = pd.read_csv('../input/members_v3.csv') 22 | 23 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv') 24 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv') 25 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv') 26 | 27 | train = pd.read_csv('../input/train_v2.csv') 28 | 29 | test = pd.read_csv('../input/sample_submission_v2.csv') 30 | 31 | # Merge Data 32 | 33 | train = pd.merge(train, transactions_train, how='left', on='msno') 34 | test = pd.merge(test, transactions_test, how='left', on='msno') 35 | 36 | train = pd.merge(train, transactions, how='left', on='msno') 37 | test = pd.merge(test, transactions, how='left', on='msno') 38 | 39 | train = pd.merge(train, user_log_train, how='left', on='msno') 40 | test = pd.merge(test, user_log_test, how='left', on='msno') 41 | 42 | train = pd.merge(train, user_log_all, how='left', on='msno') 43 | test = pd.merge(test, user_log_all, how='left', on='msno') 44 | 45 | train = pd.merge(train, members, how='left', on='msno') 46 | test = pd.merge(test, members, how='left', on='msno') 47 | 48 | del transactions, members, user_log_train, user_log_test 49 | gc.collect() 50 | 51 | # Drop duplicates first 52 | test = test.drop_duplicates('msno') 53 | 54 | gender = {'male': 1, 'female': 2} 55 | train['gender'] = 
train['gender'].map(gender) 56 | test['gender'] = test['gender'].map(gender) 57 | 58 | train['bd'] = train['bd'].replace(0, train['bd'].mode()) 59 | test['bd'] = test['bd'].replace(0, test['bd'].mode()) 60 | 61 | train['gender'] = train['gender'].replace(0, train['gender'].mean()) 62 | test['gender'] = test['gender'].replace(0, test['gender'].mean()) 63 | 64 | train = train.fillna(0) 65 | test = test.fillna(0) 66 | 67 | # Delete date for now 68 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 69 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1) 70 | 71 | # Remove Features with 0 feature importance 72 | train = train.drop( 73 | ['payment_method_id14', 74 | 'payment_method_id18', 75 | 'payment_method_id21', 76 | 'payment_method_id26', 77 | 'payment_method_id35', 78 | 'transaction_date_month_x', 79 | 'transaction_date_day_x', 80 | 'membership_expire_date_year_x', 81 | 'membership_expire_date_month_x', 82 | 'membership_expire_date_day_x', 83 | 'transaction_date_day_y', 84 | 'membership_expire_date_day_y'], axis=1) 85 | test = test.drop( 86 | ['payment_method_id14', 87 | 'payment_method_id18', 88 | 'payment_method_id21', 89 | 'payment_method_id26', 90 | 'payment_method_id35', 91 | 'transaction_date_month_x', 92 | 'transaction_date_day_x', 93 | 'membership_expire_date_year_x', 94 | 'membership_expire_date_month_x', 95 | 'membership_expire_date_day_x', 96 | 'transaction_date_day_y', 97 | 'membership_expire_date_day_y'], axis=1) 98 | 99 | # Remove Features with feature importance less than 100 100 | train = train.drop( 101 | ['payment_method_id16', 102 | 'payment_method_id17', 103 | 'payment_method_id19', 104 | 'payment_method_id23', 105 | 'payment_method_id27', 106 | 'payment_method_id28', 107 | 'payment_method_id31', 108 | 'payment_method_id33', 109 | 'payment_method_id34', 110 | 'payment_method_id39', 111 | 'is_discount_x', 112 | 'transaction_date_year_x'], axis=1) 113 | test = test.drop( 114 | ['payment_method_id16', 115 | 'payment_method_id17', 116 | 'payment_method_id19', 117 | 'payment_method_id23', 118 | 'payment_method_id27', 119 | 'payment_method_id28', 120 | 'payment_method_id31', 121 | 'payment_method_id33', 122 | 'payment_method_id34', 123 | 'payment_method_id39', 124 | 'is_discount_x', 125 | 'transaction_date_year_x'], axis=1) 126 | 127 | # Create 4 new features 128 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) == (train.is_cancel == 0)).astype(np.int8) 129 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) == (test.is_cancel == 0)).astype(np.int8) 130 | 131 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) == (train.is_cancel == 1)).astype(np.int8) 132 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) == (test.is_cancel == 1)).astype(np.int8) 133 | 134 | feature_list = [ 135 | # raw data 136 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 137 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn', 138 | # advanced features 139 | # user_log 140 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum', 141 | 'total_secs_sum', 142 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime', 143 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio', 144 | 'one_semimonth_sum', 
'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum', 145 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio', 146 | # transactions 147 | 'discount', 'amt_per_day', 'is_discount', 'membership_days', 148 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day', 149 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day' 150 | # members 151 | ] 152 | 153 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']] 154 | 155 | print(cols) 156 | 157 | params = { 158 | 'objective': 'binary', 159 | 'metric': 'binary_logloss', 160 | 'boosting': 'gbdt', 161 | 'learning_rate': 0.002, # small learn rate, large number of iterations 162 | 'verbose': 0, 163 | 'num_leaves': 108, 164 | 'bagging_fraction': 0.95, 165 | 'bagging_freq': 1, 166 | 'bagging_seed': 1, 167 | 'feature_fraction': 0.9, 168 | 'feature_fraction_seed': 1, 169 | 'max_bin': 128, 170 | 'max_depth': 7, 171 | 'reg_alpha': 1, 172 | 'reg_lambda': 0, 173 | 'min_split_gain': 0.5, 174 | 'min_child_weight': 1, 175 | 'min_child_samples': 10, 176 | 'scale_pos_weight': 1 177 | } 178 | 179 | bst = None 180 | 181 | cv_results = lgb.cv( 182 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False, 183 | shuffle=True, 184 | metrics='binary_logloss', 185 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0) 186 | 187 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train): 188 | train_data = lgb.Dataset(train[cols].loc[train_indices, :], 189 | label=train.loc[train_indices, 'is_churn']) 190 | val_data = lgb.Dataset(train[cols].loc[val_indices, :], 191 | label=train.loc[val_indices, 'is_churn']) 192 | 193 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50) 194 | 195 | predictions = bst.predict(test[cols]) 196 | test['is_churn'] = predictions 197 | test = test[['msno', 'is_churn']] 198 | test.to_csv('submission_lightgbm_features_features_selection_best_parameter_eta_0.002_round_2000_Dec_15.csv', 199 | index=False) 200 | 201 | print('Plot feature importances...') 202 | ax = lgb.plot_importance(bst) 203 | importance = bst.feature_importance() 204 | # importance = sorted(importance., key=operator.itemgetter(1)) 205 | 206 | # importance = importance[::-1] 207 | # print(cols) 208 | # print(type(importance)) 209 | a = pd.DataFrame({'feature': cols, 'importance': importance}) 210 | # print(a) 211 | a.to_csv('feature_importance_all.csv') 212 | # plt.show() 213 | plt.savefig('lightgbm_feaeture_importance_') 214 | -------------------------------------------------------------------------------- /src/process_features_userlog_all.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def process_user_log_together(df): 6 | """ 7 | After union all chunk file, do sum again. 
8 | :param df: 9 | :return: 10 | """ 11 | 12 | df = df.fillna(0) 13 | 14 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 15 | func = {'log_day_monthly': ['sum'], 16 | 'total_25_sum_monthly': ['sum'], 17 | 'total_50_sum_monthly': ['sum'], 18 | 'total_75_sum_monthly': ['sum'], 19 | 'total_985_sum_monthly': ['sum'], 20 | 'total_100_sum_monthly': ['sum'], 21 | 'total_unq_sum_monthly': ['sum'], 22 | 'total_secs_sum_monthly': ['sum'] 23 | } 24 | user_log_all = grouped_object.agg(func).reset_index() 25 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values] 26 | user_log_all.rename(columns={'msno_': 'msno', 27 | 'log_day_monthly_sum': 'log_day_monthly', 28 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly', 29 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly', 30 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly', 31 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly', 32 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly', 33 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly', 34 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly', 35 | }, inplace=True) 36 | 37 | return user_log_all 38 | 39 | 40 | def calculate_user_log_features(train): 41 | """ 42 | Calculate the user log features. 43 | :param train: 44 | :return: 45 | """ 46 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[ 47 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly'] 48 | 49 | # Monthly Habit for listening to music 50 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum'] 51 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum'] 52 | 53 | # 听歌是循环播放还是试听,每首歌播放次数 54 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly'] 55 | 56 | # 听歌每首歌平均播放时间 57 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum'] 58 | 59 | # 平均每天听歌数量 60 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly'] 61 | 62 | # 平均每天听歌时间 63 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly'] 64 | 65 | train.replace(np.inf, 0, inplace=True) 66 | train = train.fillna(0) 67 | 68 | return train 69 | 70 | 71 | train = pd.read_csv('../input/processed_user_log_mid_all.csv') 72 | user_log_test = pd.read_csv('../input/processed_user_log_mid_all.csv') 73 | user_log_test = user_log_test[['msno', 74 | 'log_day_monthly', 75 | 'total_25_sum_monthly', 76 | 'total_50_sum_monthly', 77 | 'total_75_sum_monthly', 78 | 'total_985_sum_monthly', 79 | 'total_100_sum_monthly', 80 | 'total_unq_sum_monthly', 81 | 'total_secs_sum_monthly']] 82 | 83 | print(train.columns) 84 | print(user_log_test.columns) 85 | 86 | train = train.append(user_log_test) 87 | 88 | train = process_user_log_together(train) 89 | 90 | train = calculate_user_log_features(train) 91 | 92 | print(len(train)) 93 | 94 | train.to_csv('../input/processed_features_user_log_all_time_including_mar.csv', index=False) 95 | -------------------------------------------------------------------------------- /src/process_features_userlog_feb_mar.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | def calculate_user_log_features(train): 8 | """ 9 | Calculate the user log features. 
10 | :param train: 11 | :return: 12 | """ 13 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[ 14 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly'] 15 | 16 | # Monthly Habit for listening to music 17 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum'] 18 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum'] 19 | 20 | # 听歌是循环播放还是试听,每首歌播放次数 21 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly'] 22 | 23 | # 听歌每首歌平均播放时间 24 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum'] 25 | 26 | # 平均每天听歌数量 27 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly'] 28 | 29 | # 平均每天听歌时间 30 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly'] 31 | 32 | train['one_week_sum'] = train['one_week_total_25_sum'] + train['one_week_total_50_sum'] + train[ 33 | 'one_week_total_75_sum'] + train['one_week_total_985_sum'] + train['one_week_total_100_sum'] 34 | 35 | train['two_week_sum'] = train['two_week_total_25_sum'] + train['two_week_total_50_sum'] + train[ 36 | 'two_week_total_75_sum'] + train['two_week_total_985_sum'] + train['two_week_total_100_sum'] 37 | 38 | # 第四周听歌时间与第三周比较 39 | train['week_secs_sum_ratio'] = train['two_week_total_secs_sum'] / train['one_week_total_secs_sum'] 40 | # 第四周听歌数与第三周比较 41 | train['week_sum_ratio'] = train['two_week_sum'] / train['one_week_sum'] 42 | 43 | train['one_semimonth_sum'] = train['one_semimonth_total_25_sum'] + train['one_semimonth_total_50_sum'] \ 44 | + train['one_semimonth_total_75_sum'] + train[ 45 | 'one_semimonth_total_985_sum'] + train['one_semimonth_total_100_sum'] 46 | 47 | train['two_semimonth_sum'] = train['two_semimonth_total_25_sum'] + train['two_semimonth_total_50_sum'] \ 48 | + train['two_semimonth_total_75_sum'] + train[ 49 | 'two_semimonth_total_985_sum'] + train['two_semimonth_total_100_sum'] 50 | 51 | # 第二个半月听歌时间与第一个半月比较 52 | train['semimonth_secs_sum_ratio'] = train['two_semimonth_total_secs_sum'] / train['one_semimonth_total_secs_sum'] 53 | # 第二个半月听歌数与第一个半月比较 54 | train['semimonth_sum_ratio'] = train['two_semimonth_sum'] / train['one_semimonth_sum'] 55 | 56 | train.replace(np.inf, 0, inplace=True) 57 | train = train.fillna(0) 58 | train = train.drop(['log_day_monthly', 59 | 'total_25_sum_monthly', 60 | 'total_50_sum_monthly', 61 | 'total_75_sum_monthly', 62 | 'total_985_sum_monthly', 63 | 'total_100_sum_monthly', 64 | 'total_unq_sum_monthly', 65 | 'total_secs_sum_monthly', 66 | 'one_week_log_day', 67 | 'one_week_total_25_sum', 68 | 'one_week_total_50_sum', 69 | 'one_week_total_75_sum', 70 | 'one_week_total_985_sum', 71 | 'one_week_total_100_sum', 72 | 'one_week_total_unq_sum', 73 | 'one_week_total_secs_sum', 74 | 'two_week_log_day', 75 | 'two_week_total_25_sum', 76 | 'two_week_total_50_sum', 77 | 'two_week_total_75_sum', 78 | 'two_week_total_985_sum', 79 | 'two_week_total_100_sum', 80 | 'two_week_total_unq_sum', 81 | 'two_week_total_secs_sum', 82 | 'one_semimonth_log_day', 83 | 'one_semimonth_total_25_sum', 84 | 'one_semimonth_total_50_sum', 85 | 'one_semimonth_total_75_sum', 86 | 'one_semimonth_total_985_sum', 87 | 'one_semimonth_total_100_sum', 88 | 'one_semimonth_total_unq_sum', 89 | 'one_semimonth_total_secs_sum', 90 | 'two_semimonth_log_day', 91 | 'two_semimonth_total_25_sum', 92 | 'two_semimonth_total_50_sum', 93 | 'two_semimonth_total_75_sum', 94 | 
'two_semimonth_total_985_sum', 95 | 'two_semimonth_total_100_sum', 96 | 'two_semimonth_total_unq_sum', 97 | 'two_semimonth_total_secs_sum'], axis=1) 98 | 99 | return train 100 | 101 | 102 | train = pd.read_csv('../input/processed_user_log_feb.csv') 103 | 104 | train = calculate_user_log_features(train) 105 | 106 | train.to_csv('../input/processed_features_user_log_feb.csv', index=False) 107 | 108 | del train 109 | gc.collect() 110 | 111 | test = pd.read_csv('../input/processed_user_log_mar.csv') 112 | 113 | test = calculate_user_log_features(test) 114 | 115 | test.to_csv('../input/processed_features_user_log_mar.csv', index=False) 116 | -------------------------------------------------------------------------------- /src/process_userlog_all.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | 4 | import pandas as pd 5 | 6 | 7 | def process_user_log(df): 8 | """ 9 | Only do simple sum. mean operation. 10 | :param df: chunk dataframe from very large file. 11 | :return: processed dataframe 12 | """ 13 | 14 | # Divided DataFrame by date 15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)] 16 | 17 | # Stage 1: One Month Total Data 18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 19 | func = {'date': ['count'], 20 | 'num_25': ['sum'], 'num_50': ['sum'], 21 | 'num_75': ['sum'], 'num_985': ['sum'], 22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']} 23 | one_month = grouped_object.agg(func).reset_index() 24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values] 25 | one_month.rename(columns={'msno_': 'msno', 26 | 'date_count': 'log_day_monthly', 27 | 'num_25_sum': 'total_25_sum_monthly', 28 | 'num_50_sum': 'total_50_sum_monthly', 29 | 'num_75_sum': 'total_75_sum_monthly', 30 | 'num_985_sum': 'total_985_sum_monthly', 31 | 'num_100_sum': 'total_100_sum_monthly', 32 | 'num_unq_sum': 'total_unq_sum_monthly', 33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True) 34 | 35 | return one_month 36 | 37 | 38 | def process_user_log_together(df): 39 | """ 40 | After union all chunk file, do sum again. 
41 | :param df: 42 | :return: 43 | """ 44 | 45 | df = df.fillna(0) 46 | 47 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 48 | func = {'log_day_monthly': ['sum'], 49 | 'total_25_sum_monthly': ['sum'], 50 | 'total_50_sum_monthly': ['sum'], 51 | 'total_75_sum_monthly': ['sum'], 52 | 'total_985_sum_monthly': ['sum'], 53 | 'total_100_sum_monthly': ['sum'], 54 | 'total_unq_sum_monthly': ['sum'], 55 | 'total_secs_sum_monthly': ['sum'] 56 | } 57 | user_log_all = grouped_object.agg(func).reset_index() 58 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values] 59 | user_log_all.rename(columns={'msno_': 'msno', 60 | 'log_day_monthly_sum': 'log_day_monthly', 61 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly', 62 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly', 63 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly', 64 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly', 65 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly', 66 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly', 67 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly', 68 | }, inplace=True) 69 | 70 | return user_log_all 71 | 72 | 73 | gc.enable() 74 | 75 | size = 4e7 # 40 million 76 | reader = pd.read_csv('../input/user_logs.csv', chunksize=size) 77 | start_time = time.time() 78 | for i in range(10): 79 | user_log_chunk = next(reader) 80 | if i == 0: 81 | user_log_feb = process_user_log(user_log_chunk) 82 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 83 | else: 84 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk)) 85 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 86 | del user_log_chunk 87 | 88 | user_log_feb = process_user_log_together(user_log_feb) 89 | 90 | print(len(user_log_feb)) 91 | 92 | user_log_feb.to_csv("../input/processed_user_log_mid_all.csv", index=False) 93 | 94 | print('Done') 95 | -------------------------------------------------------------------------------- /src/process_userlog_feb.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | 4 | import pandas as pd 5 | 6 | 7 | def process_user_log(df): 8 | """ 9 | Only do simple sum. mean operation. 10 | :param df: chunk dataframe from very large file. 
11 | :return: processed dataframe 12 | """ 13 | 14 | # Divided DataFrame by date 15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)] 16 | 17 | # Stage 1: One Month Total Data 18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 19 | func = {'date': ['count'], 20 | 'num_25': ['sum'], 'num_50': ['sum'], 21 | 'num_75': ['sum'], 'num_985': ['sum'], 22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']} 23 | one_month = grouped_object.agg(func).reset_index() 24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values] 25 | one_month.rename(columns={'msno_': 'msno', 26 | 'date_count': 'log_day_monthly', 27 | 'num_25_sum': 'total_25_sum_monthly', 28 | 'num_50_sum': 'total_50_sum_monthly', 29 | 'num_75_sum': 'total_75_sum_monthly', 30 | 'num_985_sum': 'total_985_sum_monthly', 31 | 'num_100_sum': 'total_100_sum_monthly', 32 | 'num_unq_sum': 'total_unq_sum_monthly', 33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True) 34 | 35 | # Stage 2: Week Total Data 36 | # Divided DataFrame by Two Week 37 | one_week = df[(df['date'] < 20170220) & (df['date'] > 20170212)] 38 | 39 | grouped_object = one_week.groupby('msno', sort=False) 40 | one_week = grouped_object.agg(func).reset_index() 41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values] 42 | one_week.rename(columns={'msno_': 'msno', 43 | 'date_count': 'one_week_log_day', 44 | 'num_25_sum': 'one_week_total_25_sum', 45 | 'num_50_sum': 'one_week_total_50_sum', 46 | 'num_75_sum': 'one_week_total_75_sum', 47 | 'num_985_sum': 'one_week_total_985_sum', 48 | 'num_100_sum': 'one_week_total_100_sum', 49 | 'num_unq_sum': 'one_week_total_unq_sum', 50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True) 51 | 52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left') 53 | 54 | del one_week 55 | gc.collect() 56 | 57 | two_week = df[(df['date'] < 20170227) & (df['date'] > 20170219)] 58 | 59 | grouped_object = two_week.groupby('msno', sort=False) 60 | two_week = grouped_object.agg(func).reset_index() 61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values] 62 | two_week.rename(columns={'msno_': 'msno', 63 | 'date_count': 'two_week_log_day', 64 | 'num_25_sum': 'two_week_total_25_sum', 65 | 'num_50_sum': 'two_week_total_50_sum', 66 | 'num_75_sum': 'two_week_total_75_sum', 67 | 'num_985_sum': 'two_week_total_985_sum', 68 | 'num_100_sum': 'two_week_total_100_sum', 69 | 'num_unq_sum': 'two_week_total_unq_sum', 70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True) 71 | 72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left') 73 | 74 | del two_week 75 | gc.collect() 76 | 77 | # Stage 3: Semimonth Total Data 78 | one_semimonth = df[(df['date'] < 20170215) & (df['date'] > 20170131)] 79 | 80 | grouped_object = one_semimonth.groupby('msno', sort=False) 81 | one_semimonth = grouped_object.agg(func).reset_index() 82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values] 83 | one_semimonth.rename(columns={'msno_': 'msno', 84 | 'date_count': 'one_semimonth_log_day', 85 | 'num_25_sum': 'one_semimonth_total_25_sum', 86 | 'num_50_sum': 'one_semimonth_total_50_sum', 87 | 'num_75_sum': 'one_semimonth_total_75_sum', 88 | 'num_985_sum': 'one_semimonth_total_985_sum', 89 | 'num_100_sum': 'one_semimonth_total_100_sum', 90 | 'num_unq_sum': 'one_semimonth_total_unq_sum', 91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True) 92 | 93 | 
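# Attach the first-half-of-February (Feb 1-14) aggregates to the monthly frame.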
one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left') 94 | 95 | del one_semimonth 96 | gc.collect() 97 | 98 | two_semimonth = df[(df['date'] < 20170301) & (df['date'] > 20170214)] 99 | 100 | grouped_object = two_semimonth.groupby('msno', sort=False) 101 | two_semimonth = grouped_object.agg(func).reset_index() 102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values] 103 | two_semimonth.rename(columns={'msno_': 'msno', 104 | 'date_count': 'two_semimonth_log_day', 105 | 'num_25_sum': 'two_semimonth_total_25_sum', 106 | 'num_50_sum': 'two_semimonth_total_50_sum', 107 | 'num_75_sum': 'two_semimonth_total_75_sum', 108 | 'num_985_sum': 'two_semimonth_total_985_sum', 109 | 'num_100_sum': 'two_semimonth_total_100_sum', 110 | 'num_unq_sum': 'two_semimonth_total_unq_sum', 111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True) 112 | 113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left') 114 | 115 | del two_semimonth 116 | gc.collect() 117 | 118 | return one_month 119 | 120 | 121 | def process_user_log_together(df): 122 | """ 123 | After union all chunk file, do sum again. 124 | :param df: 125 | :return: 126 | """ 127 | 128 | df = df.fillna(0) 129 | 130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 131 | func = {'log_day_monthly': ['sum'], 132 | 'total_25_sum_monthly': ['sum'], 133 | 'total_50_sum_monthly': ['sum'], 134 | 'total_75_sum_monthly': ['sum'], 135 | 'total_985_sum_monthly': ['sum'], 136 | 'total_100_sum_monthly': ['sum'], 137 | 'total_unq_sum_monthly': ['sum'], 138 | 'total_secs_sum_monthly': ['sum'], 139 | 'one_week_log_day': ['sum'], 140 | 'one_week_total_25_sum': ['sum'], 141 | 'one_week_total_50_sum': ['sum'], 142 | 'one_week_total_75_sum': ['sum'], 143 | 'one_week_total_985_sum': ['sum'], 144 | 'one_week_total_100_sum': ['sum'], 145 | 'one_week_total_unq_sum': ['sum'], 146 | 'one_week_total_secs_sum': ['sum'], 147 | 'two_week_log_day': ['sum'], 148 | 'two_week_total_25_sum': ['sum'], 149 | 'two_week_total_50_sum': ['sum'], 150 | 'two_week_total_75_sum': ['sum'], 151 | 'two_week_total_985_sum': ['sum'], 152 | 'two_week_total_100_sum': ['sum'], 153 | 'two_week_total_unq_sum': ['sum'], 154 | 'two_week_total_secs_sum': ['sum'], 155 | 'one_semimonth_log_day': ['sum'], 156 | 'one_semimonth_total_25_sum': ['sum'], 157 | 'one_semimonth_total_50_sum': ['sum'], 158 | 'one_semimonth_total_75_sum': ['sum'], 159 | 'one_semimonth_total_985_sum': ['sum'], 160 | 'one_semimonth_total_100_sum': ['sum'], 161 | 'one_semimonth_total_unq_sum': ['sum'], 162 | 'one_semimonth_total_secs_sum': ['sum'], 163 | 'two_semimonth_log_day': ['sum'], 164 | 'two_semimonth_total_25_sum': ['sum'], 165 | 'two_semimonth_total_50_sum': ['sum'], 166 | 'two_semimonth_total_75_sum': ['sum'], 167 | 'two_semimonth_total_985_sum': ['sum'], 168 | 'two_semimonth_total_100_sum': ['sum'], 169 | 'two_semimonth_total_unq_sum': ['sum'], 170 | 'two_semimonth_total_secs_sum': ['sum'] 171 | } 172 | user_log_all = grouped_object.agg(func).reset_index() 173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values] 174 | user_log_all.rename(columns={'msno_': 'msno', 175 | 'log_day_monthly_sum': 'log_day_monthly', 176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly', 177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly', 178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly', 179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly', 180 | 'total_100_sum_monthly_sum': 
'total_100_sum_monthly', 181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly', 182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly', 183 | 'one_week_log_day_sum': 'one_week_log_day', 184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum', 185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum', 186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum', 187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum', 188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum', 189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum', 190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum', 191 | 'two_week_log_day_sum': 'two_week_log_day', 192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum', 193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum', 194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum', 195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum', 196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum', 197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum', 198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum', 199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day', 200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum', 201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum', 202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum', 203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum', 204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum', 205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum', 206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum', 207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day', 208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum', 209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum', 210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum', 211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum', 212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum', 213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum', 214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum' 215 | }, inplace=True) 216 | 217 | return user_log_all 218 | 219 | 220 | gc.enable() 221 | 222 | size = 1e6 223 | reader = pd.read_csv('../input/user_log_feb.csv', chunksize=size) 224 | start_time = time.time() 225 | for i in range(17): # 17 226 | user_log_chunk = next(reader) 227 | if i == 0: 228 | user_log_feb = process_user_log(user_log_chunk) 229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 230 | else: 231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk)) 232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 233 | del user_log_chunk 234 | 235 | user_log_feb = process_user_log_together(user_log_feb) 236 | 237 | user_log_feb.to_csv("../input/processed_user_log_feb.csv", index=False) 238 | 239 | print('Done') 240 | -------------------------------------------------------------------------------- /src/process_userlog_mar.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import time 3 | 4 | import pandas as pd 5 | 6 | 7 | def process_user_log(df): 8 | """ 9 | Only do simple sum. mean operation. 10 | :param df: chunk dataframe from very large file. 
11 | :return: processed dataframe 12 | """ 13 | 14 | # Divided DataFrame by date 15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)] 16 | 17 | # Stage 1: One Month Total Data 18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 19 | func = {'date': ['count'], 20 | 'num_25': ['sum'], 'num_50': ['sum'], 21 | 'num_75': ['sum'], 'num_985': ['sum'], 22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']} 23 | one_month = grouped_object.agg(func).reset_index() 24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values] 25 | one_month.rename(columns={'msno_': 'msno', 26 | 'date_count': 'log_day_monthly', 27 | 'num_25_sum': 'total_25_sum_monthly', 28 | 'num_50_sum': 'total_50_sum_monthly', 29 | 'num_75_sum': 'total_75_sum_monthly', 30 | 'num_985_sum': 'total_985_sum_monthly', 31 | 'num_100_sum': 'total_100_sum_monthly', 32 | 'num_unq_sum': 'total_unq_sum_monthly', 33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True) 34 | 35 | # Stage 2: Week Total Data 36 | # Divided DataFrame by Two Week 37 | one_week = df[(df['date'] < 20170320) & (df['date'] > 20170312)] 38 | 39 | grouped_object = one_week.groupby('msno', sort=False) 40 | one_week = grouped_object.agg(func).reset_index() 41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values] 42 | one_week.rename(columns={'msno_': 'msno', 43 | 'date_count': 'one_week_log_day', 44 | 'num_25_sum': 'one_week_total_25_sum', 45 | 'num_50_sum': 'one_week_total_50_sum', 46 | 'num_75_sum': 'one_week_total_75_sum', 47 | 'num_985_sum': 'one_week_total_985_sum', 48 | 'num_100_sum': 'one_week_total_100_sum', 49 | 'num_unq_sum': 'one_week_total_unq_sum', 50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True) 51 | 52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left') 53 | 54 | del one_week 55 | gc.collect() 56 | 57 | two_week = df[(df['date'] < 20170327) & (df['date'] > 20170319)] 58 | 59 | grouped_object = two_week.groupby('msno', sort=False) 60 | two_week = grouped_object.agg(func).reset_index() 61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values] 62 | two_week.rename(columns={'msno_': 'msno', 63 | 'date_count': 'two_week_log_day', 64 | 'num_25_sum': 'two_week_total_25_sum', 65 | 'num_50_sum': 'two_week_total_50_sum', 66 | 'num_75_sum': 'two_week_total_75_sum', 67 | 'num_985_sum': 'two_week_total_985_sum', 68 | 'num_100_sum': 'two_week_total_100_sum', 69 | 'num_unq_sum': 'two_week_total_unq_sum', 70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True) 71 | 72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left') 73 | 74 | del two_week 75 | gc.collect() 76 | 77 | # Stage 3: Semimonth Total Data 78 | one_semimonth = df[(df['date'] < 20170315) & (df['date'] > 20170228)] 79 | 80 | grouped_object = one_semimonth.groupby('msno', sort=False) 81 | one_semimonth = grouped_object.agg(func).reset_index() 82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values] 83 | one_semimonth.rename(columns={'msno_': 'msno', 84 | 'date_count': 'one_semimonth_log_day', 85 | 'num_25_sum': 'one_semimonth_total_25_sum', 86 | 'num_50_sum': 'one_semimonth_total_50_sum', 87 | 'num_75_sum': 'one_semimonth_total_75_sum', 88 | 'num_985_sum': 'one_semimonth_total_985_sum', 89 | 'num_100_sum': 'one_semimonth_total_100_sum', 90 | 'num_unq_sum': 'one_semimonth_total_unq_sum', 91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True) 92 | 93 | 
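# Attach the first-half-of-March (Mar 1-14) aggregates to the monthly frame.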
one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left') 94 | 95 | del one_semimonth 96 | gc.collect() 97 | 98 | two_semimonth = df[(df['date'] < 20170329) & (df['date'] > 20170314)] 99 | 100 | grouped_object = two_semimonth.groupby('msno', sort=False) 101 | two_semimonth = grouped_object.agg(func).reset_index() 102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values] 103 | two_semimonth.rename(columns={'msno_': 'msno', 104 | 'date_count': 'two_semimonth_log_day', 105 | 'num_25_sum': 'two_semimonth_total_25_sum', 106 | 'num_50_sum': 'two_semimonth_total_50_sum', 107 | 'num_75_sum': 'two_semimonth_total_75_sum', 108 | 'num_985_sum': 'two_semimonth_total_985_sum', 109 | 'num_100_sum': 'two_semimonth_total_100_sum', 110 | 'num_unq_sum': 'two_semimonth_total_unq_sum', 111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True) 112 | 113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left') 114 | 115 | del two_semimonth 116 | gc.collect() 117 | 118 | return one_month 119 | 120 | 121 | def process_user_log_together(df): 122 | """ 123 | After union all chunk file, do sum again. 124 | :param df: 125 | :return: 126 | """ 127 | 128 | df = df.fillna(0) 129 | 130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup 131 | func = {'log_day_monthly': ['sum'], 132 | 'total_25_sum_monthly': ['sum'], 133 | 'total_50_sum_monthly': ['sum'], 134 | 'total_75_sum_monthly': ['sum'], 135 | 'total_985_sum_monthly': ['sum'], 136 | 'total_100_sum_monthly': ['sum'], 137 | 'total_unq_sum_monthly': ['sum'], 138 | 'total_secs_sum_monthly': ['sum'], 139 | 'one_week_log_day': ['sum'], 140 | 'one_week_total_25_sum': ['sum'], 141 | 'one_week_total_50_sum': ['sum'], 142 | 'one_week_total_75_sum': ['sum'], 143 | 'one_week_total_985_sum': ['sum'], 144 | 'one_week_total_100_sum': ['sum'], 145 | 'one_week_total_unq_sum': ['sum'], 146 | 'one_week_total_secs_sum': ['sum'], 147 | 'two_week_log_day': ['sum'], 148 | 'two_week_total_25_sum': ['sum'], 149 | 'two_week_total_50_sum': ['sum'], 150 | 'two_week_total_75_sum': ['sum'], 151 | 'two_week_total_985_sum': ['sum'], 152 | 'two_week_total_100_sum': ['sum'], 153 | 'two_week_total_unq_sum': ['sum'], 154 | 'two_week_total_secs_sum': ['sum'], 155 | 'one_semimonth_log_day': ['sum'], 156 | 'one_semimonth_total_25_sum': ['sum'], 157 | 'one_semimonth_total_50_sum': ['sum'], 158 | 'one_semimonth_total_75_sum': ['sum'], 159 | 'one_semimonth_total_985_sum': ['sum'], 160 | 'one_semimonth_total_100_sum': ['sum'], 161 | 'one_semimonth_total_unq_sum': ['sum'], 162 | 'one_semimonth_total_secs_sum': ['sum'], 163 | 'two_semimonth_log_day': ['sum'], 164 | 'two_semimonth_total_25_sum': ['sum'], 165 | 'two_semimonth_total_50_sum': ['sum'], 166 | 'two_semimonth_total_75_sum': ['sum'], 167 | 'two_semimonth_total_985_sum': ['sum'], 168 | 'two_semimonth_total_100_sum': ['sum'], 169 | 'two_semimonth_total_unq_sum': ['sum'], 170 | 'two_semimonth_total_secs_sum': ['sum'] 171 | } 172 | user_log_all = grouped_object.agg(func).reset_index() 173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values] 174 | user_log_all.rename(columns={'msno_': 'msno', 175 | 'log_day_monthly_sum': 'log_day_monthly', 176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly', 177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly', 178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly', 179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly', 180 | 'total_100_sum_monthly_sum': 
'total_100_sum_monthly', 181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly', 182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly', 183 | 'one_week_log_day_sum': 'one_week_log_day', 184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum', 185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum', 186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum', 187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum', 188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum', 189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum', 190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum', 191 | 'two_week_log_day_sum': 'two_week_log_day', 192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum', 193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum', 194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum', 195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum', 196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum', 197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum', 198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum', 199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day', 200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum', 201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum', 202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum', 203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum', 204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum', 205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum', 206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum', 207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day', 208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum', 209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum', 210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum', 211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum', 212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum', 213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum', 214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum' 215 | }, inplace=True) 216 | 217 | return user_log_all 218 | 219 | 220 | gc.enable() 221 | 222 | size = 1e6 223 | reader = pd.read_csv('../input/user_logs_v2.csv', chunksize=size) 224 | start_time = time.time() 225 | for i in range(18): # 17 226 | user_log_chunk = next(reader) 227 | if i == 0: 228 | user_log_feb = process_user_log(user_log_chunk) 229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 230 | else: 231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk)) 232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time)) 233 | del user_log_chunk 234 | 235 | user_log_feb = process_user_log_together(user_log_feb) 236 | 237 | user_log_feb.to_csv("../input/processed_user_log_mar.csv", index=False) 238 | 239 | print('Done') 240 | -------------------------------------------------------------------------------- /src/weight_AveragingEnsemble.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | ''' 4 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781 5 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv') 6 | weight1 = 0.30 7 | 8 | # LB 0.12383 CV 0.127227 9 | file2 = 
pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv') 10 | weight2 = 0.30 11 | 12 | # LB 0.12323 Train LogLoss 0.0966805 13 | file3 = pd.read_csv('result/submission_lightgbm_features_all_eta_0.002_round_2000_Dec_13.csv') 14 | weight3 = 0.2 15 | 16 | # LB 0.12705 CV 0.136615 Train LogLoss 0.094903 17 | file4 = pd.read_csv('result/submission_xgboost_user_log_transaction_features_eta_0.002_round_2500_Dec_11.csv') 18 | weight4 = 0.2 19 | 20 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \ 21 | file3['is_churn'] * weight3 + file4['is_churn'] * weight4 22 | 23 | file1.to_csv('submission_weight_avg_4_0.3_0.3_0.2_0.2.csv', index=False) 24 | ''' 25 | 26 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781 27 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv') 28 | weight1 = 0.28 29 | 30 | # LB 0.12383 CV 0.127227 31 | file2 = pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv') 32 | weight2 = 0.28 33 | 34 | # LB 0.12393 CV 0.122639 Train LogLoss 0.102916 35 | file3 = pd.read_csv('result/submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv') 36 | weight3 = 0.44 37 | 38 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \ 39 | file3['is_churn'] * weight3 40 | 41 | file1.to_csv('submission_weight_avg_0.44_0.28_0.28.csv', index=False) 42 | -------------------------------------------------------------------------------- /src/xgboost_features.py: -------------------------------------------------------------------------------- 1 | import gc 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import sklearn 6 | import xgboost as xgb 7 | 8 | 9 | def xgb_score(preds, dtrain): 10 | labels = dtrain.get_label() 11 | return 'log_loss', sklearn.metrics.log_loss(labels, preds) 12 | 13 | 14 | gc.enable() 15 | 16 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0) 17 | 18 | members = pd.read_csv('../input/members_v3.csv') 19 | 20 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv') 21 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv') 22 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv') 23 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv') 24 | 25 | train = pd.read_csv('../input/train.csv') 26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True) 27 | 28 | test = pd.read_csv('../input/sample_submission_v2.csv') 29 | 30 | # Merge Data 31 | 32 | train = pd.merge(train, transactions, how='left', on='msno') 33 | test = pd.merge(test, transactions, how='left', on='msno') 34 | 35 | train = pd.merge(train, user_log_all, how='left', on='msno') 36 | test = pd.merge(test, user_log_all, how='left', on='msno') 37 | 38 | train = pd.merge(train, user_log_feb, how='left', on='msno') 39 | test = pd.merge(test, user_log_mar, how='left', on='msno') 40 | 41 | train = pd.merge(train, members, how='left', on='msno') 42 | test = pd.merge(test, members, how='left', on='msno') 43 | 44 | del transactions, members 45 | gc.collect() 46 | 47 | # Drop duplicates first 48 | test = test.drop_duplicates('msno') 49 | 50 | gender = {'male': 1, 'female': 2} 51 | train['gender'] = train['gender'].map(gender) 52 | test['gender'] = test['gender'].map(gender) 53 | 54 | train['bd'] = 
train['bd'].replace(0, train['bd'].mode()[0])  # mode() returns a Series; use the scalar mode
55 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
56 |
57 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
58 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
59 |
60 | # train = train.fillna(0)
61 | # test = test.fillna(0)
62 |
63 | # Delete date columns for now
64 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
65 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
66 |
67 | # Create two interaction features
68 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)  # logical AND; '==' here would also flag the opposite case
69 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
70 |
71 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
72 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
73 |
74 | train = train.drop(['payment_method_id2',
75 | 'payment_method_id3',
76 | 'payment_method_id4',
77 | 'payment_method_id5',
78 | 'payment_method_id6',
79 | 'payment_method_id8',
80 | 'payment_method_id10',
81 | 'payment_method_id11',
82 | 'payment_method_id12',
83 | 'payment_method_id13',
84 | 'payment_method_id14',
85 | 'payment_method_id16',
86 | 'payment_method_id17',
87 | 'payment_method_id18',
88 | 'payment_method_id19',
89 | 'payment_method_id20',
90 | 'payment_method_id21',
91 | 'payment_method_id22',
92 | 'payment_method_id23',
93 | 'payment_method_id24',
94 | 'payment_method_id25',
95 | 'payment_method_id27',
96 | 'payment_method_id28',
97 | 'payment_method_id31',
98 | 'payment_method_id33',
99 | 'payment_method_id34',
100 | 'transaction_date_day',
101 | 'membership_expire_date_day'], axis=1)
102 |
103 | test = test.drop(['payment_method_id2',
104 | 'payment_method_id3',
105 | 'payment_method_id4',
106 | 'payment_method_id5',
107 | 'payment_method_id6',
108 | 'payment_method_id8',
109 | 'payment_method_id10',
110 | 'payment_method_id11',
111 | 'payment_method_id12',
112 | 'payment_method_id13',
113 | 'payment_method_id14',
114 | 'payment_method_id16',
115 | 'payment_method_id17',
116 | 'payment_method_id18',
117 | 'payment_method_id19',
118 | 'payment_method_id20',
119 | 'payment_method_id21',
120 | 'payment_method_id22',
121 | 'payment_method_id23',
122 | 'payment_method_id24',
123 | 'payment_method_id25',
124 | 'payment_method_id27',
125 | 'payment_method_id28',
126 | 'payment_method_id31',
127 | 'payment_method_id33',
128 | 'payment_method_id34',
129 | 'transaction_date_day',
130 | 'membership_expire_date_day'], axis=1)
131 |
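# Editor's aside (illustrative only, not part of the original script): the two identical drop
# lists above could be built once and reused for train and test; `sparse_payment_cols` is a
# hypothetical name introduced here for that shared list.
# sparse_payment_cols = ['payment_method_id%d' % i
#                        for i in [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20,
#                                  21, 22, 23, 24, 25, 27, 28, 31, 33, 34]]
# sparse_payment_cols += ['transaction_date_day', 'membership_expire_date_day']
# train = train.drop(sparse_payment_cols, axis=1)
# test = test.drop(sparse_payment_cols, axis=1)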
132 | feature_list = [  # reference list of candidate features (not used directly below)
133 | # raw data
134 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
135 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
136 | # advanced features
137 | # user_log
138 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
139 | 'total_secs_sum',
140 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
141 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
142 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
143 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
144 | # transactions
145 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
146 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
147 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
148 | # members
149 | ]
150 |
151 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
152 |
153 | params = {
154 | 'base_score': 0.5,
155 | 'eta': 0.002,
156 | 'max_depth': 6,
157 | 'booster': 'gbtree',
158 | 'colsample_bylevel': 1,
159 | 'colsample_bytree': 1.0,
160 | 'gamma': 1,
161 | 'min_child_weight': 5,  # renamed from 'max_child_weight', which is not an XGBoost parameter and would be ignored
162 | 'n_estimators': 600,  # not used by xgb.train; the boosting rounds are passed explicitly below
163 | 'reg_alpha': 0,
164 | 'reg_lambda': 1,
165 | 'scale_pos_weight': 1,
166 | 'objective': 'binary:logistic',
167 | 'eval_metric': 'logloss',
168 | 'seed': 2017,
169 | 'silent': True
170 | }
171 | x1, x2, y1, y2 = sklearn.model_selection.train_test_split(train[cols], train['is_churn'], test_size=0.3,
172 | random_state=2017)
173 | watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
174 | cv_output = xgb.cv(params, xgb.DMatrix(x1, y1), num_boost_round=1500, early_stopping_rounds=20, verbose_eval=50,
175 | show_stdv=False)  # CV estimate of a reasonable number of rounds (not used below)
176 | model = xgb.train(params, xgb.DMatrix(x1, y1), 2500, watchlist, feval=xgb_score, maximize=False, verbose_eval=50,
177 | early_stopping_rounds=50)
178 |
179 | pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
180 |
181 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
182 | print(len(test))
183 | test[['msno', 'is_churn']].to_csv('submission_xgboost_all_features_selection_eta_0.002_round_2500_Dec_15.csv',
184 | index=False)
185 |
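The script above monitors overfitting through the `valid` entry of `watchlist`. A minimal sketch (not part of the repository) of also checking the hold-out log loss explicitly before writing the submission, assuming the script has just been run so that `model`, `x2` and `y2` are still in memory:

from sklearn.metrics import log_loss

# score the 30% hold-out split at the booster's best iteration
valid_pred = model.predict(xgb.DMatrix(x2), ntree_limit=model.best_ntree_limit)
print('hold-out log loss: %.6f' % log_loss(y2, valid_pred.clip(0.0000001, 0.999999)))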
-------------------------------------------------------------------------------- /src/xgboost_gridsearch.py: --------------------------------------------------------------------------------
1 | import gc
2 | import warnings
3 | from datetime import datetime
4 |
5 | import pandas as pd
6 | import sklearn
7 | import xgboost as xgb
8 | from sklearn.model_selection import GridSearchCV
9 | from sklearn.model_selection import RandomizedSearchCV
10 | from sklearn.model_selection import StratifiedKFold
11 |
12 |
13 | def xgb_score(preds, dtrain):
14 |     labels = dtrain.get_label()
15 |     return 'log_loss', sklearn.metrics.log_loss(labels, preds)
16 |
17 |
18 | def timer(start_time=None):
19 |     if not start_time:
20 |         start_time = datetime.now()
21 |         return start_time
22 |     elif start_time:
23 |         thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
24 |         tmin, tsec = divmod(temp_sec, 60)
25 |         print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
26 |
27 |
28 | gc.enable()
29 | warnings.filterwarnings('ignore')
30 |
31 | transactions = pd.read_csv('../input/processed_transaction_all.csv')
32 |
33 | members_v1 = pd.read_csv('../input/members.csv')
34 | members_v2 = pd.read_csv('../input/members_v2.csv')
35 | members = members_v1.append(members_v2, ignore_index=True)
36 |
37 | user_log = pd.read_csv('../input/processed_user_log_all.csv')
38 |
39 | train_v1 = pd.read_csv('../input/train.csv')
40 | train_v2 = pd.read_csv('../input/train_v2.csv')
41 | train = train_v1.append(train_v2, ignore_index=True)
42 |
43 | test = pd.read_csv('../input/sample_submission_v2.csv')
44 |
45 | # Merge Data
46 |
47 | train = pd.merge(train, transactions, how='left', on='msno')
48 | test = pd.merge(test, transactions, how='left', on='msno')
49 |
50 | train = pd.merge(train, user_log, how='left', on='msno')
51 | test = pd.merge(test, user_log, how='left', on='msno')
52 |
53 | train = pd.merge(train, members, how='left', on='msno')
54 | test = pd.merge(test, members, how='left', on='msno')
55 |
56 | # Drop duplicates first
57 | test = test.drop_duplicates('msno')
58 |
59 | gender = {'male': 1, 'female': 2}
60 | train['gender'] = train['gender'].map(gender)
61 | test['gender'] = test['gender'].map(gender)
62 |
63 | train = train.fillna(0)
64 | test = test.fillna(0)
65 |
66 | # Delete date for now
67 | train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
68 | test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
69 | # Delete date for now
70 |
71 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
72 |
73 | Y = train['is_churn'].values
74 | X = train[cols]
75 |
76 | # A parameter grid for XGBoost
77 | params = {
78 | 'min_child_weight': [1, 5, 10],
79 | 'gamma': [0.5, 1, 1.5, 2, 5],
80 | 'colsample_bytree': [0.6, 0.8, 1.0],
81 | 'max_depth': [3, 4, 5, 6, 7],
82 | 'subsample': [0.7, 0.75, 0.8]
83 | }
84 |
85 | model = xgb.XGBClassifier(learning_rate=0.002, n_estimators=600, objective='binary:logistic',
86 | silent=True, nthread=1)
87 |
88 | folds = 3
89 | param_comb = 5
90 |
91 | skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
92 |
93 | random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='neg_log_loss', n_jobs=4,
94 | cv=skf.split(X, Y), verbose=3, random_state=1001)
95 |
96 | # Here we go
97 | start_time = timer(None)  # timing starts from this point for "start_time" variable
98 | random_search.fit(X, Y)
99 | timer(start_time)  # timing ends here for "start_time" variable
100 |
101 | print('\n All results:')
102 | print(random_search.cv_results_)
103 | print('\n Best estimator:')
104 | print(random_search.best_estimator_)
105 | print('\n Best neg_log_loss score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
106 | print(random_search.best_score_)  # the '* 2 - 1' gini transform only applies to AUC-type scores
107 | print('\n Best hyperparameters:')
108 | print(random_search.best_params_)
109 | results = pd.DataFrame(random_search.cv_results_)
110 | results.to_csv('xgboost_random_grid_search_results_01.csv', index=False)
111 |
112 | pred = random_search.predict_proba(test[cols])[:, 1]  # sklearn API takes a DataFrame, not a DMatrix; keep the churn-probability column
113 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
114 | print(len(test))
115 | test[['msno', 'is_churn']].to_csv('submission_xgboost_random_search_best_param.csv', index=False)
116 |
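# Editor's aside (illustrative only, not part of the original script): the exhaustive grid below
# covers 3 * 5 * 3 * 5 * 3 = 675 parameter combinations, i.e. 2,025 fits with 3-fold CV.
# A hedged alternative is to narrow the grid around the random-search winner first;
# `narrow_params` is a hypothetical name introduced here.
# best = random_search.best_params_
# narrow_params = {'max_depth': [best['max_depth'] - 1, best['max_depth'], best['max_depth'] + 1],
#                  'min_child_weight': [best['min_child_weight']],
#                  'gamma': [best['gamma']],
#                  'colsample_bytree': [best['colsample_bytree']],
#                  'subsample': [best['subsample']]}
# grid = GridSearchCV(estimator=model, param_grid=narrow_params, scoring='neg_log_loss',
#                     n_jobs=4, cv=folds, verbose=3)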
117 | grid = GridSearchCV(estimator=model, param_grid=params, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X, Y), verbose=3)
118 | grid.fit(X, Y)
119 | print('\n All results:')
120 | print(grid.cv_results_)
121 | print('\n Best estimator:')
122 | print(grid.best_estimator_)
123 | print('\n Best neg_log_loss score:')
124 | print(grid.best_score_)
125 | print('\n Best parameters:')
126 | print(grid.best_params_)
127 | results = pd.DataFrame(grid.cv_results_)
128 | results.to_csv('xgboost_grid_search_results_01.csv', index=False)
129 |
130 | pred = grid.best_estimator_.predict_proba(test[cols])[:, 1]  # pass the DataFrame itself and keep the positive-class column
131 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
132 | print(len(test))
133 | test[['msno', 'is_churn']].to_csv('submission_xgboost_grid_search_best_param.csv', index=False)
134 |
--------------------------------------------------------------------------------
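A minimal follow-up sketch (not part of the repository) of feeding the tuned values back into the native `xgb.train` pipeline used in `xgboost_features.py`. It assumes `random_search` has already been fit as in xgboost_gridsearch.py, that `X` and `Y` are still in memory, and that `merged_params` is a hypothetical name:

from sklearn.model_selection import train_test_split

# start from the fixed settings used elsewhere in the repo, then overlay the searched values
merged_params = {'eta': 0.002, 'objective': 'binary:logistic', 'eval_metric': 'logloss', 'seed': 2017}
merged_params.update(random_search.best_params_)  # max_depth, min_child_weight, gamma, subsample, colsample_bytree

x1, x2, y1, y2 = train_test_split(X, Y, test_size=0.3, random_state=2017)
watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
model = xgb.train(merged_params, xgb.DMatrix(x1, y1), 2500, watchlist, verbose_eval=50, early_stopping_rounds=50)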