├── .gitignore
├── README.md
├── input
└── .gitignore
└── src
├── .gitignore
├── autoencoder_baseline.py
├── feature_importance.py
├── find_correlation.py
├── lightgbm_all_features.py
├── lightgbm_feaeture_importance_.png
├── lightgbm_feaeture_importance_all_time.png
├── lightgbm_features.py
├── process_features_userlog_all.py
├── process_features_userlog_feb_mar.py
├── process_userlog_all.py
├── process_userlog_feb.py
├── process_userlog_mar.py
├── weight_AveragingEnsemble.py
├── xgboost_features.py
└── xgboost_gridsearch.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WSDM-KKBox-s-Churn-Prediction-Challenge
2 | The 11th ACM International Conference on Web Search and Data Mining (WSDM 2018) challenged participants to build an algorithm that predicts whether a subscription user will churn, using a donated dataset from KKBOX.
3 |
4 | # Final: rank 43/575
5 |
6 | The user-log features are built from two views: features over the user's entire history | features over a recent time window.
7 |
8 | process_userlog_feb.py extracts the February user-log features for the training data
9 | process_userlog_mar.py extracts the March user-log features for the test data
10 | process_userlog_all.py extracts user-log features over the entire history
11 |
12 | process_features_userlog_feb_mar.py derives the cross features for the most recent month
13 | process_features_userlog_all.py derives the cross features over the entire history
14 |
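As a rough sketch of how these outputs are consumed (file names taken from the model scripts in `src/`; this snippet is illustrative, not part of the pipeline, and assumes the processed CSVs already exist under `../input`), both user-log views are joined onto the train/test frames on `msno` before modelling:

```python
import pandas as pd

# All-time view and recent-month view of the user log (outputs of the scripts above)
user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')  # training month
user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')  # test month

train = pd.read_csv('../input/train_v2.csv')
test = pd.read_csv('../input/sample_submission_v2.csv')

# Merge both views so a model sees long-term and recent listening behaviour side by side
train = train.merge(user_log_all, how='left', on='msno').merge(user_log_feb, how='left', on='msno')
test = test.merge(user_log_all, how='left', on='msno').merge(user_log_mar, how='left', on='msno')
```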
--------------------------------------------------------------------------------
/input/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/autoencoder_baseline.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from keras import optimizers
6 | from keras.callbacks import ModelCheckpoint, TensorBoard
7 | from keras.layers import Dense, Dropout
8 | from keras.models import Sequential
9 | from numpy import random as rm
10 | from sklearn import preprocessing
11 | from sklearn.model_selection import train_test_split
12 |
13 | gc.enable()
14 |
15 | # transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0)
16 | # transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0)
17 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
18 |
19 | members = pd.read_csv('../input/members_v3.csv')
20 |
21 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
22 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
23 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
24 |
25 | train = pd.read_csv('../input/train.csv')
26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
27 |
28 | test = pd.read_csv('../input/sample_submission_v2.csv')
29 |
30 | # Merge Data
31 |
32 | # train = pd.merge(train, transactions_train, how='left', on='msno')
33 | # test = pd.merge(test, transactions_test, how='left', on='msno')
34 |
35 | train = pd.merge(train, transactions, how='left', on='msno')
36 | test = pd.merge(test, transactions, how='left', on='msno')
37 |
38 | train = pd.merge(train, user_log_train, how='left', on='msno')
39 | test = pd.merge(test, user_log_test, how='left', on='msno')
40 |
41 | train = pd.merge(train, user_log_all, how='left', on='msno')
42 | test = pd.merge(test, user_log_all, how='left', on='msno')
43 |
44 | train = pd.merge(train, members, how='left', on='msno')
45 | test = pd.merge(test, members, how='left', on='msno')
46 |
47 | del transactions, members, user_log_train, user_log_test
48 | gc.collect()
49 |
50 | # Drop duplicates first
51 | test = test.drop_duplicates('msno')
52 |
53 | gender = {'male': 1, 'female': 2}
54 | train['gender'] = train['gender'].map(gender)
55 | test['gender'] = test['gender'].map(gender)
56 |
57 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
58 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
59 |
60 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
61 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
62 |
63 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
64 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
65 |
66 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
67 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
68 |
69 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
70 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
71 |
72 | train = train.replace([np.inf, -np.inf], np.nan)
73 |
74 | train = train.fillna(0)
75 | test = test.fillna(0)
76 |
77 | train_0 = train[train['is_churn'] == 0]
78 | train_1 = train[train['is_churn'] == 1]
79 |
80 | '''
81 | # Enlarge train_1 for 17 times
82 | train_append = train_1
83 |
84 | for _ in range(17):
85 | train_append = train_append.append(train_1)
86 |
87 | train = train_0.append(train_append)
88 | '''
89 |
90 |
91 | # train1 random sample 1/17
92 | def rand_rows(df, num_rows=5):
93 | subset = rm.choice(df.index.values, size=num_rows)
94 | return df.loc[subset]
95 |
96 |
97 | train_0 = rand_rows(train_0, len(train_1))
98 | train = train_0.append(train_1)
99 |
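# Note: rand_rows() uses numpy.random.choice with its default replace=True, so the
# down-sampled majority class may contain duplicate rows; sampling without replacement
# would require rm.choice(..., replace=False).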
100 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
101 |
102 | # Add Normalize
103 | min_max_scaler = preprocessing.MinMaxScaler()
104 | train[cols] = min_max_scaler.fit_transform(train[cols])
105 |
106 | X_train, X_test = train_test_split(train, test_size=0.2, random_state=47, shuffle=True)
107 | y_train = X_train['is_churn']
108 | X_train = X_train.drop(['msno', 'is_churn'], axis=1)
109 |
110 | y_test = X_test['is_churn']
111 | X_test = X_test.drop(['msno', 'is_churn'], axis=1)
112 |
113 | X_train = X_train.values
114 | X_test = X_test.values
115 |
116 | input_dim = X_train.shape[1]
117 |
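# Note: despite the file and variable names, the network below is a plain feed-forward
# binary classifier (a funnel of Dense/Dropout layers ending in a single sigmoid unit
# trained directly on the churn label), not a reconstruction autoencoder.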
118 | autoencoder = Sequential()
119 | autoencoder.add(Dense(input_dim, input_dim=input_dim))
120 |
121 | input_dim = int(input_dim / 2)
122 | autoencoder.add(Dense(input_dim, activation='relu'))
123 | autoencoder.add(Dropout(0.5))
124 |
125 | input_dim = int(input_dim / 2)
126 | autoencoder.add(Dense(input_dim, activation='relu'))
127 | autoencoder.add(Dropout(0.5))
128 |
129 | input_dim = int(input_dim / 2)
130 | autoencoder.add(Dense(input_dim, activation='relu'))
131 | autoencoder.add(Dropout(0.5))
132 |
133 | autoencoder.add(Dense(1, activation='sigmoid'))
134 |
135 | autoencoder.summary()
136 |
137 | nb_epoch = 50
138 | batch_size = 32
139 |
140 | sgd = optimizers.SGD(lr=0.002, decay=1e-6, momentum=0.9, nesterov=True)
141 |
142 | autoencoder.compile(optimizer=sgd,
143 | loss='binary_crossentropy',
144 | metrics=['accuracy'])
145 |
146 | checkpointer = ModelCheckpoint(filepath="model.h5",
147 | verbose=1,
148 | save_best_only=True)
149 |
150 | tensorboard = TensorBoard(log_dir='./log',
151 | histogram_freq=0,
152 | write_graph=True,
153 | write_images=True)
154 |
155 | print(X_train.shape)
156 |
157 | history = autoencoder.fit(X_train, y_train,
158 | epochs=nb_epoch,
159 | batch_size=batch_size,
160 | shuffle=True,
161 | validation_data=(X_test, y_test),
162 | verbose=1,
163 | callbacks=[checkpointer, tensorboard]).history
164 |
165 | # autoencoder = load_model('model.h5')
166 |
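# Note: train[cols] was scaled with MinMaxScaler above, but the test frame is predicted
# on raw values here; applying the fitted scaler first (e.g. test[cols] =
# min_max_scaler.transform(test[cols])) would keep train and test inputs consistent.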
167 | predictions = autoencoder.predict(test.drop(['msno', 'is_churn'], axis=1).values)
168 |
169 | test['is_churn'] = predictions
170 | test = test[['msno', 'is_churn']]
171 |
172 | test.to_csv('submission_autoencoder_baseline_sgd_0.002_50_32_Dec_15.csv', index=False)
173 |
--------------------------------------------------------------------------------
/src/feature_importance.py:
--------------------------------------------------------------------------------
1 |
2 | import pickle
3 | import operator
4 |
5 |
6 | def plot(model):
7 | from xgboost import plot_importance
8 | from matplotlib import pyplot as plt
9 | plot_importance(model)
10 | plt.show()
11 |
12 |
13 | if __name__ == '__main__':
14 |
15 | filename = 'model/xgb_depth_7_round_1800_fold_2_eta_0.002.pkl'
16 |
17 |     model = pickle.load(open(filename, 'rb'))
18 |
19 | importance = model.get_fscore()
20 | importance = sorted(importance.items(), key=operator.itemgetter(1))
21 |
22 | importance = importance[::-1]
23 | print(importance)
24 | plot(model)
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/find_correlation.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import seaborn as sns
7 |
8 | gc.enable()
9 |
10 | # transactions = pd.read_csv('../input/processed_transaction_all.csv')
11 |
12 | members_v1 = pd.read_csv('../input/members.csv')
13 | members_v2 = pd.read_csv('../input/members_v2.csv')
14 | members = members_v1.append(members_v2, ignore_index=True)
15 |
16 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
17 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
18 |
19 | train_v1 = pd.read_csv('../input/train.csv')
20 | train_v2 = pd.read_csv('../input/train_v2.csv')
21 | train = train_v1.append(train_v2, ignore_index=True)
22 |
23 | test = pd.read_csv('../input/sample_submission_v2.csv')
24 |
25 | # Merge Data
26 |
27 | # train = pd.merge(train, transactions, how='left', on='msno')
28 | # test = pd.merge(test, transactions, how='left', on='msno')
29 |
30 | train = pd.merge(train, user_log_train, how='left', on='msno')
31 | test = pd.merge(test, user_log_test, how='left', on='msno')
32 |
33 | train = pd.merge(train, members, how='left', on='msno')
34 | test = pd.merge(test, members, how='left', on='msno')
35 |
36 | # Drop duplicates first
37 | test = test.drop_duplicates('msno')
38 |
39 | gender = {'male': 1, 'female': 2}
40 | train['gender'] = train['gender'].map(gender)
41 | test['gender'] = test['gender'].map(gender)
42 |
43 | train = train.fillna(0)
44 | test = test.fillna(0)
45 |
46 | # Delete date for now
47 | # train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
48 | # test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
49 |
50 | corr = train.corr()
51 |
52 | # print('Train Data Set Correlation:')
53 | # print(corr)
54 |
55 | corr.to_csv('user_log_features_without_transaction_corr.csv', index=False)
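# Note: index=False drops the row labels, so the saved matrix keeps column names but
# loses row names; writing with index=True keeps the CSV readable as a square matrix.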
56 |
57 | # Generate a mask for the upper triangle
58 | mask = np.zeros_like(corr, dtype=bool)
59 | mask[np.triu_indices_from(mask)] = True
60 |
61 | # Set up the matplotlib figure
62 | f, ax = plt.subplots(figsize=(11, 9))
63 |
64 | # Generate a custom diverging colormap
65 | cmap = sns.diverging_palette(220, 10, as_cmap=True)
66 |
67 | # Draw the heatmap with the mask and correct aspect ratio
68 | heatmap = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
69 | square=True, linewidths=.5, cbar_kws={"shrink": .5})
70 | fig = heatmap.get_figure()
71 | fig.savefig('Features_Correlation_Heatmap_user_log')
72 |
--------------------------------------------------------------------------------
/src/lightgbm_all_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import lightgbm as lgb
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import ShuffleSplit
8 |
9 | gc.enable()
10 |
11 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
12 |
13 | members = pd.read_csv('../input/members_v3.csv')
14 |
15 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
16 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv')
17 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')
18 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')
19 |
20 | train = pd.read_csv('../input/train.csv')
21 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
22 |
23 | test = pd.read_csv('../input/sample_submission_v2.csv')
24 |
25 | # Merge Data
26 |
27 | train = pd.merge(train, transactions, how='left', on='msno')
28 | test = pd.merge(test, transactions, how='left', on='msno')
29 |
30 | train = pd.merge(train, user_log_all, how='left', on='msno')
31 | test = pd.merge(test, user_log_all, how='left', on='msno')
32 |
33 | train = pd.merge(train, user_log_feb, how='left', on='msno')
34 | test = pd.merge(test, user_log_mar, how='left', on='msno')
35 |
36 | train = pd.merge(train, members, how='left', on='msno')
37 | test = pd.merge(test, members, how='left', on='msno')
38 |
39 | del transactions, members
40 | gc.collect()
41 |
42 | # Drop duplicates first
43 | test = test.drop_duplicates('msno')
44 |
45 | gender = {'male': 1, 'female': 2}
46 | train['gender'] = train['gender'].map(gender)
47 | test['gender'] = test['gender'].map(gender)
48 |
49 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
50 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
51 |
52 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
53 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
54 |
55 | # train = train.fillna(0)
56 | # test = test.fillna(0)
57 |
58 | # Delete date for now
59 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
60 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
61 |
62 | # Create two combined flags from is_auto_renew and is_cancel
63 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
64 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
65 |
66 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
67 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
68 |
69 | train = train.drop(['payment_method_id2',
70 | 'payment_method_id3',
71 | 'payment_method_id4',
72 | 'payment_method_id5',
73 | 'payment_method_id6',
74 | 'payment_method_id8',
75 | 'payment_method_id10',
76 | 'payment_method_id11',
77 | 'payment_method_id12',
78 | 'payment_method_id13',
79 | 'payment_method_id14',
80 | 'payment_method_id16',
81 | 'payment_method_id17',
82 | 'payment_method_id18',
83 | 'payment_method_id19',
84 | 'payment_method_id20',
85 | 'payment_method_id21',
86 | 'payment_method_id22',
87 | 'payment_method_id23',
88 | 'payment_method_id24',
89 | 'payment_method_id25',
90 | 'payment_method_id27',
91 | 'payment_method_id28',
92 | 'payment_method_id31',
93 | 'payment_method_id33',
94 | 'payment_method_id34',
95 | 'transaction_date_day',
96 | 'membership_expire_date_day'], axis=1)
97 |
98 | test = test.drop(['payment_method_id2',
99 | 'payment_method_id3',
100 | 'payment_method_id4',
101 | 'payment_method_id5',
102 | 'payment_method_id6',
103 | 'payment_method_id8',
104 | 'payment_method_id10',
105 | 'payment_method_id11',
106 | 'payment_method_id12',
107 | 'payment_method_id13',
108 | 'payment_method_id14',
109 | 'payment_method_id16',
110 | 'payment_method_id17',
111 | 'payment_method_id18',
112 | 'payment_method_id19',
113 | 'payment_method_id20',
114 | 'payment_method_id21',
115 | 'payment_method_id22',
116 | 'payment_method_id23',
117 | 'payment_method_id24',
118 | 'payment_method_id25',
119 | 'payment_method_id27',
120 | 'payment_method_id28',
121 | 'payment_method_id31',
122 | 'payment_method_id33',
123 | 'payment_method_id34',
124 | 'transaction_date_day',
125 | 'membership_expire_date_day'], axis=1)
126 |
127 | feature_list = [
128 | # raw data
129 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
130 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
131 | # advanced features
132 | # user_log
133 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
134 | 'total_secs_sum',
135 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
136 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
137 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
138 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
139 | # transactions
140 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
141 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
142 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
143 | # members
144 | ]
145 |
146 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
147 |
148 | print(cols)
149 |
150 | params = {
151 | 'objective': 'binary',
152 | 'metric': 'binary_logloss',
153 | 'boosting': 'gbdt',
154 | 'learning_rate': 0.002, # small learn rate, large number of iterations
155 | 'verbose': 0,
156 | 'num_leaves': 108,
157 | 'bagging_fraction': 0.95,
158 | 'bagging_freq': 1,
159 | 'bagging_seed': 1,
160 | 'feature_fraction': 0.9,
161 | 'feature_fraction_seed': 1,
162 | 'max_bin': 128,
163 | 'max_depth': 7,
164 | 'reg_alpha': 1,
165 | 'reg_lambda': 0,
166 | 'min_split_gain': 0.5,
167 | 'min_child_weight': 1,
168 | 'min_child_samples': 10,
169 | 'scale_pos_weight': 1
170 | }
171 |
172 | bst = None
173 |
174 | cv_results = lgb.cv(
175 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False,
176 | shuffle=True,
177 | metrics='binary_logloss',
178 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
179 |
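# Note: cv_results above is computed but never fed back into training. With this LightGBM
# API the number of completed boosting rounds, e.g. len(cv_results['binary_logloss-mean']),
# could be passed to lgb.train below instead of the hard-coded 2500 (this assumes the
# dict-of-lists return format of lgb.cv in the LightGBM version used here).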
180 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train):
181 | train_data = lgb.Dataset(train[cols].loc[train_indices, :],
182 | label=train.loc[train_indices, 'is_churn'])
183 | val_data = lgb.Dataset(train[cols].loc[val_indices, :],
184 | label=train.loc[val_indices, 'is_churn'])
185 |
186 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50)
187 |
188 | predictions = bst.predict(test[cols])
189 | test['is_churn'] = predictions
190 | test = test[['msno', 'is_churn']]
191 | test.to_csv('submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv',
192 | index=False)
193 |
194 | print('Plot feature importances...')
195 | ax = lgb.plot_importance(bst)
196 | importance = bst.feature_importance()
197 | # importance = sorted(importance., key=operator.itemgetter(1))
198 |
199 | # importance = importance[::-1]
200 | # print(cols)
201 | # print(type(importance))
202 | a = pd.DataFrame({'feature': cols, 'importance': importance})
203 | # print(a)
204 | a.to_csv('feature_importance_features_selection.csv')
205 | # plt.show()
206 | plt.savefig('lightgbm_feaeture_importance_all_time')
207 |
--------------------------------------------------------------------------------
/src/lightgbm_feaeture_importance_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_.png
--------------------------------------------------------------------------------
/src/lightgbm_feaeture_importance_all_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_all_time.png
--------------------------------------------------------------------------------
/src/lightgbm_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import lightgbm as lgb
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import ShuffleSplit
8 |
9 | gc.enable()
10 |
11 | transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0)
12 | transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0)
13 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
14 |
15 | transactions = transactions[
16 | ['msno', 'discount', 'amt_per_day', 'is_discount', 'membership_days', 'transaction_date_year',
17 | 'transaction_date_month',
18 | 'transaction_date_day', 'membership_expire_date_year', 'membership_expire_date_month',
19 | 'membership_expire_date_day']]
20 |
21 | members = pd.read_csv('../input/members_v3.csv')
22 |
23 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
24 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
25 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
26 |
27 | train = pd.read_csv('../input/train_v2.csv')
28 |
29 | test = pd.read_csv('../input/sample_submission_v2.csv')
30 |
31 | # Merge Data
32 |
33 | train = pd.merge(train, transactions_train, how='left', on='msno')
34 | test = pd.merge(test, transactions_test, how='left', on='msno')
35 |
36 | train = pd.merge(train, transactions, how='left', on='msno')
37 | test = pd.merge(test, transactions, how='left', on='msno')
38 |
39 | train = pd.merge(train, user_log_train, how='left', on='msno')
40 | test = pd.merge(test, user_log_test, how='left', on='msno')
41 |
42 | train = pd.merge(train, user_log_all, how='left', on='msno')
43 | test = pd.merge(test, user_log_all, how='left', on='msno')
44 |
45 | train = pd.merge(train, members, how='left', on='msno')
46 | test = pd.merge(test, members, how='left', on='msno')
47 |
48 | del transactions, members, user_log_train, user_log_test
49 | gc.collect()
50 |
51 | # Drop duplicates first
52 | test = test.drop_duplicates('msno')
53 |
54 | gender = {'male': 1, 'female': 2}
55 | train['gender'] = train['gender'].map(gender)
56 | test['gender'] = test['gender'].map(gender)
57 |
58 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
59 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
60 |
61 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
62 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
63 |
64 | train = train.fillna(0)
65 | test = test.fillna(0)
66 |
67 | # Delete date for now
68 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
69 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
70 |
71 | # Remove Features with 0 feature importance
72 | train = train.drop(
73 | ['payment_method_id14',
74 | 'payment_method_id18',
75 | 'payment_method_id21',
76 | 'payment_method_id26',
77 | 'payment_method_id35',
78 | 'transaction_date_month_x',
79 | 'transaction_date_day_x',
80 | 'membership_expire_date_year_x',
81 | 'membership_expire_date_month_x',
82 | 'membership_expire_date_day_x',
83 | 'transaction_date_day_y',
84 | 'membership_expire_date_day_y'], axis=1)
85 | test = test.drop(
86 | ['payment_method_id14',
87 | 'payment_method_id18',
88 | 'payment_method_id21',
89 | 'payment_method_id26',
90 | 'payment_method_id35',
91 | 'transaction_date_month_x',
92 | 'transaction_date_day_x',
93 | 'membership_expire_date_year_x',
94 | 'membership_expire_date_month_x',
95 | 'membership_expire_date_day_x',
96 | 'transaction_date_day_y',
97 | 'membership_expire_date_day_y'], axis=1)
98 |
99 | # Remove Features with feature importance less than 100
100 | train = train.drop(
101 | ['payment_method_id16',
102 | 'payment_method_id17',
103 | 'payment_method_id19',
104 | 'payment_method_id23',
105 | 'payment_method_id27',
106 | 'payment_method_id28',
107 | 'payment_method_id31',
108 | 'payment_method_id33',
109 | 'payment_method_id34',
110 | 'payment_method_id39',
111 | 'is_discount_x',
112 | 'transaction_date_year_x'], axis=1)
113 | test = test.drop(
114 | ['payment_method_id16',
115 | 'payment_method_id17',
116 | 'payment_method_id19',
117 | 'payment_method_id23',
118 | 'payment_method_id27',
119 | 'payment_method_id28',
120 | 'payment_method_id31',
121 | 'payment_method_id33',
122 | 'payment_method_id34',
123 | 'payment_method_id39',
124 | 'is_discount_x',
125 | 'transaction_date_year_x'], axis=1)
126 |
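# Note on the _x/_y suffixes dropped above: they come from merging the month-specific
# transaction features first and the all-time transaction features second, so pandas
# tags the overlapping columns with _x (month-specific) and _y (all-time).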
127 | # Create two combined flags from is_auto_renew and is_cancel
128 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
129 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
130 |
131 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
132 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
133 |
134 | feature_list = [
135 | # raw data
136 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
137 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
138 | # advanced features
139 | # user_log
140 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
141 | 'total_secs_sum',
142 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
143 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
144 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
145 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
146 | # transactions
147 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
148 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
149 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
150 | # members
151 | ]
152 |
153 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
154 |
155 | print(cols)
156 |
157 | params = {
158 | 'objective': 'binary',
159 | 'metric': 'binary_logloss',
160 | 'boosting': 'gbdt',
161 | 'learning_rate': 0.002, # small learn rate, large number of iterations
162 | 'verbose': 0,
163 | 'num_leaves': 108,
164 | 'bagging_fraction': 0.95,
165 | 'bagging_freq': 1,
166 | 'bagging_seed': 1,
167 | 'feature_fraction': 0.9,
168 | 'feature_fraction_seed': 1,
169 | 'max_bin': 128,
170 | 'max_depth': 7,
171 | 'reg_alpha': 1,
172 | 'reg_lambda': 0,
173 | 'min_split_gain': 0.5,
174 | 'min_child_weight': 1,
175 | 'min_child_samples': 10,
176 | 'scale_pos_weight': 1
177 | }
178 |
179 | bst = None
180 |
181 | cv_results = lgb.cv(
182 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False,
183 | shuffle=True,
184 | metrics='binary_logloss',
185 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
186 |
187 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train):
188 | train_data = lgb.Dataset(train[cols].loc[train_indices, :],
189 | label=train.loc[train_indices, 'is_churn'])
190 | val_data = lgb.Dataset(train[cols].loc[val_indices, :],
191 | label=train.loc[val_indices, 'is_churn'])
192 |
193 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50)
194 |
195 | predictions = bst.predict(test[cols])
196 | test['is_churn'] = predictions
197 | test = test[['msno', 'is_churn']]
198 | test.to_csv('submission_lightgbm_features_features_selection_best_parameter_eta_0.002_round_2000_Dec_15.csv',
199 | index=False)
200 |
201 | print('Plot feature importances...')
202 | ax = lgb.plot_importance(bst)
203 | importance = bst.feature_importance()
204 | # importance = sorted(importance., key=operator.itemgetter(1))
205 |
206 | # importance = importance[::-1]
207 | # print(cols)
208 | # print(type(importance))
209 | a = pd.DataFrame({'feature': cols, 'importance': importance})
210 | # print(a)
211 | a.to_csv('feature_importance_all.csv')
212 | # plt.show()
213 | plt.savefig('lightgbm_feaeture_importance_')
214 |
--------------------------------------------------------------------------------
/src/process_features_userlog_all.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def process_user_log_together(df):
6 | """
7 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
8 |     :param df: concatenated per-chunk aggregates
9 |     :return: DataFrame with one row per msno
10 | """
11 |
12 | df = df.fillna(0)
13 |
14 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
15 | func = {'log_day_monthly': ['sum'],
16 | 'total_25_sum_monthly': ['sum'],
17 | 'total_50_sum_monthly': ['sum'],
18 | 'total_75_sum_monthly': ['sum'],
19 | 'total_985_sum_monthly': ['sum'],
20 | 'total_100_sum_monthly': ['sum'],
21 | 'total_unq_sum_monthly': ['sum'],
22 | 'total_secs_sum_monthly': ['sum']
23 | }
24 | user_log_all = grouped_object.agg(func).reset_index()
25 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
26 | user_log_all.rename(columns={'msno_': 'msno',
27 | 'log_day_monthly_sum': 'log_day_monthly',
28 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
29 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
30 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
31 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
32 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
33 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
34 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
35 | }, inplace=True)
36 |
37 | return user_log_all
38 |
39 |
40 | def calculate_user_log_features(train):
41 | """
42 | Calculate the user log features.
43 | :param train:
44 | :return:
45 | """
46 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[
47 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly']
48 |
49 | # Monthly Habit for listening to music
50 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum']
51 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum']
52 |
53 |     # Repeat listening vs. sampling: average number of plays per unique song
54 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly']
55 |
56 |     # Average listening time per song played
57 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum']
58 |
59 |     # Average number of songs played per active day
60 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly']
61 |
62 |     # Average listening time per active day
63 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly']
64 |
65 | train.replace(np.inf, 0, inplace=True)
66 | train = train.fillna(0)
67 |
68 | return train
69 |
70 |
71 | train = pd.read_csv('../input/processed_user_log_mid_all.csv')
72 | user_log_test = pd.read_csv('../input/processed_user_log_mid_all.csv')
73 | user_log_test = user_log_test[['msno',
74 | 'log_day_monthly',
75 | 'total_25_sum_monthly',
76 | 'total_50_sum_monthly',
77 | 'total_75_sum_monthly',
78 | 'total_985_sum_monthly',
79 | 'total_100_sum_monthly',
80 | 'total_unq_sum_monthly',
81 | 'total_secs_sum_monthly']]
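# NOTE: both reads above load the same file (processed_user_log_mid_all.csv), so after the
# append and re-sum below every per-user total is doubled; presumably the second read was
# meant to point at a different aggregate (e.g. the March portion), but the intended file
# is not clear from the repository.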
82 |
83 | print(train.columns)
84 | print(user_log_test.columns)
85 |
86 | train = train.append(user_log_test)
87 |
88 | train = process_user_log_together(train)
89 |
90 | train = calculate_user_log_features(train)
91 |
92 | print(len(train))
93 |
94 | train.to_csv('../input/processed_features_user_log_all_time_including_mar.csv', index=False)
95 |
--------------------------------------------------------------------------------
/src/process_features_userlog_feb_mar.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | def calculate_user_log_features(train):
8 | """
9 | Calculate the user log features.
10 | :param train:
11 | :return:
12 | """
13 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[
14 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly']
15 |
16 | # Monthly Habit for listening to music
17 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum']
18 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum']
19 |
20 |     # Repeat listening vs. sampling: average number of plays per unique song
21 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly']
22 |
23 |     # Average listening time per song played
24 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum']
25 |
26 |     # Average number of songs played per active day
27 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly']
28 |
29 |     # Average listening time per active day
30 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly']
31 |
32 | train['one_week_sum'] = train['one_week_total_25_sum'] + train['one_week_total_50_sum'] + train[
33 | 'one_week_total_75_sum'] + train['one_week_total_985_sum'] + train['one_week_total_100_sum']
34 |
35 | train['two_week_sum'] = train['two_week_total_25_sum'] + train['two_week_total_50_sum'] + train[
36 | 'two_week_total_75_sum'] + train['two_week_total_985_sum'] + train['two_week_total_100_sum']
37 |
38 |     # Listening time in week 4 of the month compared with week 3
39 | train['week_secs_sum_ratio'] = train['two_week_total_secs_sum'] / train['one_week_total_secs_sum']
40 |     # Play count in week 4 of the month compared with week 3
41 | train['week_sum_ratio'] = train['two_week_sum'] / train['one_week_sum']
42 |
43 | train['one_semimonth_sum'] = train['one_semimonth_total_25_sum'] + train['one_semimonth_total_50_sum'] \
44 | + train['one_semimonth_total_75_sum'] + train[
45 | 'one_semimonth_total_985_sum'] + train['one_semimonth_total_100_sum']
46 |
47 | train['two_semimonth_sum'] = train['two_semimonth_total_25_sum'] + train['two_semimonth_total_50_sum'] \
48 | + train['two_semimonth_total_75_sum'] + train[
49 | 'two_semimonth_total_985_sum'] + train['two_semimonth_total_100_sum']
50 |
51 |     # Listening time in the second half of the month compared with the first half
52 | train['semimonth_secs_sum_ratio'] = train['two_semimonth_total_secs_sum'] / train['one_semimonth_total_secs_sum']
53 |     # Play count in the second half of the month compared with the first half
54 | train['semimonth_sum_ratio'] = train['two_semimonth_sum'] / train['one_semimonth_sum']
55 |
56 | train.replace(np.inf, 0, inplace=True)
57 | train = train.fillna(0)
58 | train = train.drop(['log_day_monthly',
59 | 'total_25_sum_monthly',
60 | 'total_50_sum_monthly',
61 | 'total_75_sum_monthly',
62 | 'total_985_sum_monthly',
63 | 'total_100_sum_monthly',
64 | 'total_unq_sum_monthly',
65 | 'total_secs_sum_monthly',
66 | 'one_week_log_day',
67 | 'one_week_total_25_sum',
68 | 'one_week_total_50_sum',
69 | 'one_week_total_75_sum',
70 | 'one_week_total_985_sum',
71 | 'one_week_total_100_sum',
72 | 'one_week_total_unq_sum',
73 | 'one_week_total_secs_sum',
74 | 'two_week_log_day',
75 | 'two_week_total_25_sum',
76 | 'two_week_total_50_sum',
77 | 'two_week_total_75_sum',
78 | 'two_week_total_985_sum',
79 | 'two_week_total_100_sum',
80 | 'two_week_total_unq_sum',
81 | 'two_week_total_secs_sum',
82 | 'one_semimonth_log_day',
83 | 'one_semimonth_total_25_sum',
84 | 'one_semimonth_total_50_sum',
85 | 'one_semimonth_total_75_sum',
86 | 'one_semimonth_total_985_sum',
87 | 'one_semimonth_total_100_sum',
88 | 'one_semimonth_total_unq_sum',
89 | 'one_semimonth_total_secs_sum',
90 | 'two_semimonth_log_day',
91 | 'two_semimonth_total_25_sum',
92 | 'two_semimonth_total_50_sum',
93 | 'two_semimonth_total_75_sum',
94 | 'two_semimonth_total_985_sum',
95 | 'two_semimonth_total_100_sum',
96 | 'two_semimonth_total_unq_sum',
97 | 'two_semimonth_total_secs_sum'], axis=1)
98 |
99 | return train
100 |
101 |
102 | train = pd.read_csv('../input/processed_user_log_feb.csv')
103 |
104 | train = calculate_user_log_features(train)
105 |
106 | train.to_csv('../input/processed_features_user_log_feb.csv', index=False)
107 |
108 | del train
109 | gc.collect()
110 |
111 | test = pd.read_csv('../input/processed_user_log_mar.csv')
112 |
113 | test = calculate_user_log_features(test)
114 |
115 | test.to_csv('../input/processed_features_user_log_mar.csv', index=False)
116 |
--------------------------------------------------------------------------------
/src/process_userlog_all.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: per-user sums plus a count of logged days.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 | return one_month
36 |
37 |
38 | def process_user_log_together(df):
39 | """
40 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
41 |     :param df: concatenated per-chunk aggregates
42 |     :return: DataFrame with one row per msno
43 | """
44 |
45 | df = df.fillna(0)
46 |
47 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
48 | func = {'log_day_monthly': ['sum'],
49 | 'total_25_sum_monthly': ['sum'],
50 | 'total_50_sum_monthly': ['sum'],
51 | 'total_75_sum_monthly': ['sum'],
52 | 'total_985_sum_monthly': ['sum'],
53 | 'total_100_sum_monthly': ['sum'],
54 | 'total_unq_sum_monthly': ['sum'],
55 | 'total_secs_sum_monthly': ['sum']
56 | }
57 | user_log_all = grouped_object.agg(func).reset_index()
58 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
59 | user_log_all.rename(columns={'msno_': 'msno',
60 | 'log_day_monthly_sum': 'log_day_monthly',
61 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
62 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
63 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
64 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
65 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
66 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
67 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
68 | }, inplace=True)
69 |
70 | return user_log_all
71 |
72 |
73 | gc.enable()
74 |
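# Out-of-core aggregation: user_logs.csv is too large to load at once, so each chunk is
# grouped per msno on its own, the partial results are appended, and
# process_user_log_together() re-sums them so each msno ends up with a single row.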
75 | size = int(4e7)  # 40 million rows per chunk
76 | reader = pd.read_csv('../input/user_logs.csv', chunksize=size)
77 | start_time = time.time()
78 | for i in range(10):  # processes 10 fixed chunks; next() raises StopIteration if fewer remain
79 | user_log_chunk = next(reader)
80 | if i == 0:
81 | user_log_feb = process_user_log(user_log_chunk)
82 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
83 | else:
84 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
85 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
86 | del user_log_chunk
87 |
88 | user_log_feb = process_user_log_together(user_log_feb)
89 |
90 | print(len(user_log_feb))
91 |
92 | user_log_feb.to_csv("../input/processed_user_log_mid_all.csv", index=False)
93 |
94 | print('Done')
95 |
--------------------------------------------------------------------------------
/src/process_userlog_feb.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: monthly, weekly and semi-monthly per-user sums.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 |     # Stage 2: Weekly Totals
36 |     # Split the month into its third week (Feb 13-19) and fourth week (Feb 20-26)
37 | one_week = df[(df['date'] < 20170220) & (df['date'] > 20170212)]
38 |
39 | grouped_object = one_week.groupby('msno', sort=False)
40 | one_week = grouped_object.agg(func).reset_index()
41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values]
42 | one_week.rename(columns={'msno_': 'msno',
43 | 'date_count': 'one_week_log_day',
44 | 'num_25_sum': 'one_week_total_25_sum',
45 | 'num_50_sum': 'one_week_total_50_sum',
46 | 'num_75_sum': 'one_week_total_75_sum',
47 | 'num_985_sum': 'one_week_total_985_sum',
48 | 'num_100_sum': 'one_week_total_100_sum',
49 | 'num_unq_sum': 'one_week_total_unq_sum',
50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True)
51 |
52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left')
53 |
54 | del one_week
55 | gc.collect()
56 |
57 | two_week = df[(df['date'] < 20170227) & (df['date'] > 20170219)]
58 |
59 | grouped_object = two_week.groupby('msno', sort=False)
60 | two_week = grouped_object.agg(func).reset_index()
61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values]
62 | two_week.rename(columns={'msno_': 'msno',
63 | 'date_count': 'two_week_log_day',
64 | 'num_25_sum': 'two_week_total_25_sum',
65 | 'num_50_sum': 'two_week_total_50_sum',
66 | 'num_75_sum': 'two_week_total_75_sum',
67 | 'num_985_sum': 'two_week_total_985_sum',
68 | 'num_100_sum': 'two_week_total_100_sum',
69 | 'num_unq_sum': 'two_week_total_unq_sum',
70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True)
71 |
72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left')
73 |
74 | del two_week
75 | gc.collect()
76 |
77 | # Stage 3: Semimonth Total Data
78 | one_semimonth = df[(df['date'] < 20170215) & (df['date'] > 20170131)]
79 |
80 | grouped_object = one_semimonth.groupby('msno', sort=False)
81 | one_semimonth = grouped_object.agg(func).reset_index()
82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values]
83 | one_semimonth.rename(columns={'msno_': 'msno',
84 | 'date_count': 'one_semimonth_log_day',
85 | 'num_25_sum': 'one_semimonth_total_25_sum',
86 | 'num_50_sum': 'one_semimonth_total_50_sum',
87 | 'num_75_sum': 'one_semimonth_total_75_sum',
88 | 'num_985_sum': 'one_semimonth_total_985_sum',
89 | 'num_100_sum': 'one_semimonth_total_100_sum',
90 | 'num_unq_sum': 'one_semimonth_total_unq_sum',
91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True)
92 |
93 | one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left')
94 |
95 | del one_semimonth
96 | gc.collect()
97 |
98 | two_semimonth = df[(df['date'] < 20170301) & (df['date'] > 20170214)]
99 |
100 | grouped_object = two_semimonth.groupby('msno', sort=False)
101 | two_semimonth = grouped_object.agg(func).reset_index()
102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values]
103 | two_semimonth.rename(columns={'msno_': 'msno',
104 | 'date_count': 'two_semimonth_log_day',
105 | 'num_25_sum': 'two_semimonth_total_25_sum',
106 | 'num_50_sum': 'two_semimonth_total_50_sum',
107 | 'num_75_sum': 'two_semimonth_total_75_sum',
108 | 'num_985_sum': 'two_semimonth_total_985_sum',
109 | 'num_100_sum': 'two_semimonth_total_100_sum',
110 | 'num_unq_sum': 'two_semimonth_total_unq_sum',
111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True)
112 |
113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left')
114 |
115 | del two_semimonth
116 | gc.collect()
117 |
118 | return one_month
119 |
120 |
121 | def process_user_log_together(df):
122 | """
123 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
124 |     :param df: concatenated per-chunk aggregates
125 |     :return: DataFrame with one row per msno
126 | """
127 |
128 | df = df.fillna(0)
129 |
130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
131 | func = {'log_day_monthly': ['sum'],
132 | 'total_25_sum_monthly': ['sum'],
133 | 'total_50_sum_monthly': ['sum'],
134 | 'total_75_sum_monthly': ['sum'],
135 | 'total_985_sum_monthly': ['sum'],
136 | 'total_100_sum_monthly': ['sum'],
137 | 'total_unq_sum_monthly': ['sum'],
138 | 'total_secs_sum_monthly': ['sum'],
139 | 'one_week_log_day': ['sum'],
140 | 'one_week_total_25_sum': ['sum'],
141 | 'one_week_total_50_sum': ['sum'],
142 | 'one_week_total_75_sum': ['sum'],
143 | 'one_week_total_985_sum': ['sum'],
144 | 'one_week_total_100_sum': ['sum'],
145 | 'one_week_total_unq_sum': ['sum'],
146 | 'one_week_total_secs_sum': ['sum'],
147 | 'two_week_log_day': ['sum'],
148 | 'two_week_total_25_sum': ['sum'],
149 | 'two_week_total_50_sum': ['sum'],
150 | 'two_week_total_75_sum': ['sum'],
151 | 'two_week_total_985_sum': ['sum'],
152 | 'two_week_total_100_sum': ['sum'],
153 | 'two_week_total_unq_sum': ['sum'],
154 | 'two_week_total_secs_sum': ['sum'],
155 | 'one_semimonth_log_day': ['sum'],
156 | 'one_semimonth_total_25_sum': ['sum'],
157 | 'one_semimonth_total_50_sum': ['sum'],
158 | 'one_semimonth_total_75_sum': ['sum'],
159 | 'one_semimonth_total_985_sum': ['sum'],
160 | 'one_semimonth_total_100_sum': ['sum'],
161 | 'one_semimonth_total_unq_sum': ['sum'],
162 | 'one_semimonth_total_secs_sum': ['sum'],
163 | 'two_semimonth_log_day': ['sum'],
164 | 'two_semimonth_total_25_sum': ['sum'],
165 | 'two_semimonth_total_50_sum': ['sum'],
166 | 'two_semimonth_total_75_sum': ['sum'],
167 | 'two_semimonth_total_985_sum': ['sum'],
168 | 'two_semimonth_total_100_sum': ['sum'],
169 | 'two_semimonth_total_unq_sum': ['sum'],
170 | 'two_semimonth_total_secs_sum': ['sum']
171 | }
172 | user_log_all = grouped_object.agg(func).reset_index()
173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
174 | user_log_all.rename(columns={'msno_': 'msno',
175 | 'log_day_monthly_sum': 'log_day_monthly',
176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
180 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
183 | 'one_week_log_day_sum': 'one_week_log_day',
184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum',
185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum',
186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum',
187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum',
188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum',
189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum',
190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum',
191 | 'two_week_log_day_sum': 'two_week_log_day',
192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum',
193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum',
194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum',
195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum',
196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum',
197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum',
198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum',
199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day',
200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum',
201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum',
202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum',
203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum',
204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum',
205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum',
206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum',
207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day',
208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum',
209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum',
210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum',
211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum',
212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum',
213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum',
214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum'
215 | }, inplace=True)
216 |
217 | return user_log_all
218 |
219 |
220 | gc.enable()
221 |
222 | size = int(1e6)  # 1 million rows per chunk
223 | reader = pd.read_csv('../input/user_log_feb.csv', chunksize=size)
224 | start_time = time.time()
225 | for i in range(17):  # processes 17 fixed chunks; next() raises StopIteration if fewer remain
226 | user_log_chunk = next(reader)
227 | if i == 0:
228 | user_log_feb = process_user_log(user_log_chunk)
229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
230 | else:
231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
233 | del user_log_chunk
234 |
235 | user_log_feb = process_user_log_together(user_log_feb)
236 |
237 | user_log_feb.to_csv("../input/processed_user_log_feb.csv", index=False)
238 |
239 | print('Done')
240 |
--------------------------------------------------------------------------------
/src/process_userlog_mar.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: monthly, weekly and semi-monthly per-user sums.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 |     # Stage 2: Weekly Totals
36 |     # Split the month into its third week (Mar 13-19) and fourth week (Mar 20-26)
37 | one_week = df[(df['date'] < 20170320) & (df['date'] > 20170312)]
38 |
39 | grouped_object = one_week.groupby('msno', sort=False)
40 | one_week = grouped_object.agg(func).reset_index()
41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values]
42 | one_week.rename(columns={'msno_': 'msno',
43 | 'date_count': 'one_week_log_day',
44 | 'num_25_sum': 'one_week_total_25_sum',
45 | 'num_50_sum': 'one_week_total_50_sum',
46 | 'num_75_sum': 'one_week_total_75_sum',
47 | 'num_985_sum': 'one_week_total_985_sum',
48 | 'num_100_sum': 'one_week_total_100_sum',
49 | 'num_unq_sum': 'one_week_total_unq_sum',
50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True)
51 |
52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left')
53 |
54 | del one_week
55 | gc.collect()
56 |
57 | two_week = df[(df['date'] < 20170327) & (df['date'] > 20170319)]
58 |
59 | grouped_object = two_week.groupby('msno', sort=False)
60 | two_week = grouped_object.agg(func).reset_index()
61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values]
62 | two_week.rename(columns={'msno_': 'msno',
63 | 'date_count': 'two_week_log_day',
64 | 'num_25_sum': 'two_week_total_25_sum',
65 | 'num_50_sum': 'two_week_total_50_sum',
66 | 'num_75_sum': 'two_week_total_75_sum',
67 | 'num_985_sum': 'two_week_total_985_sum',
68 | 'num_100_sum': 'two_week_total_100_sum',
69 | 'num_unq_sum': 'two_week_total_unq_sum',
70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True)
71 |
72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left')
73 |
74 | del two_week
75 | gc.collect()
76 |
77 | # Stage 3: Semimonth Total Data
78 | one_semimonth = df[(df['date'] < 20170315) & (df['date'] > 20170228)]
79 |
80 | grouped_object = one_semimonth.groupby('msno', sort=False)
81 | one_semimonth = grouped_object.agg(func).reset_index()
82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values]
83 | one_semimonth.rename(columns={'msno_': 'msno',
84 | 'date_count': 'one_semimonth_log_day',
85 | 'num_25_sum': 'one_semimonth_total_25_sum',
86 | 'num_50_sum': 'one_semimonth_total_50_sum',
87 | 'num_75_sum': 'one_semimonth_total_75_sum',
88 | 'num_985_sum': 'one_semimonth_total_985_sum',
89 | 'num_100_sum': 'one_semimonth_total_100_sum',
90 | 'num_unq_sum': 'one_semimonth_total_unq_sum',
91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True)
92 |
93 | one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left')
94 |
95 | del one_semimonth
96 | gc.collect()
97 |
98 | two_semimonth = df[(df['date'] < 20170329) & (df['date'] > 20170314)]
99 |
100 | grouped_object = two_semimonth.groupby('msno', sort=False)
101 | two_semimonth = grouped_object.agg(func).reset_index()
102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values]
103 | two_semimonth.rename(columns={'msno_': 'msno',
104 | 'date_count': 'two_semimonth_log_day',
105 | 'num_25_sum': 'two_semimonth_total_25_sum',
106 | 'num_50_sum': 'two_semimonth_total_50_sum',
107 | 'num_75_sum': 'two_semimonth_total_75_sum',
108 | 'num_985_sum': 'two_semimonth_total_985_sum',
109 | 'num_100_sum': 'two_semimonth_total_100_sum',
110 | 'num_unq_sum': 'two_semimonth_total_unq_sum',
111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True)
112 |
113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left')
114 |
115 | del two_semimonth
116 | gc.collect()
117 |
118 | return one_month
119 |
120 |
121 | def process_user_log_together(df):
122 | """
123 | After concatenating the per-chunk results, sum the aggregates again so each msno appears only once.
124 | :param df: concatenation of the per-chunk feature frames
125 | :return: one row of summed window features per msno
126 | """
127 |
128 | df = df.fillna(0)  # users absent from a sub-window carry NaN after the left merges; treat as zero activity
129 |
130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
131 | func = {'log_day_monthly': ['sum'],
132 | 'total_25_sum_monthly': ['sum'],
133 | 'total_50_sum_monthly': ['sum'],
134 | 'total_75_sum_monthly': ['sum'],
135 | 'total_985_sum_monthly': ['sum'],
136 | 'total_100_sum_monthly': ['sum'],
137 | 'total_unq_sum_monthly': ['sum'],
138 | 'total_secs_sum_monthly': ['sum'],
139 | 'one_week_log_day': ['sum'],
140 | 'one_week_total_25_sum': ['sum'],
141 | 'one_week_total_50_sum': ['sum'],
142 | 'one_week_total_75_sum': ['sum'],
143 | 'one_week_total_985_sum': ['sum'],
144 | 'one_week_total_100_sum': ['sum'],
145 | 'one_week_total_unq_sum': ['sum'],
146 | 'one_week_total_secs_sum': ['sum'],
147 | 'two_week_log_day': ['sum'],
148 | 'two_week_total_25_sum': ['sum'],
149 | 'two_week_total_50_sum': ['sum'],
150 | 'two_week_total_75_sum': ['sum'],
151 | 'two_week_total_985_sum': ['sum'],
152 | 'two_week_total_100_sum': ['sum'],
153 | 'two_week_total_unq_sum': ['sum'],
154 | 'two_week_total_secs_sum': ['sum'],
155 | 'one_semimonth_log_day': ['sum'],
156 | 'one_semimonth_total_25_sum': ['sum'],
157 | 'one_semimonth_total_50_sum': ['sum'],
158 | 'one_semimonth_total_75_sum': ['sum'],
159 | 'one_semimonth_total_985_sum': ['sum'],
160 | 'one_semimonth_total_100_sum': ['sum'],
161 | 'one_semimonth_total_unq_sum': ['sum'],
162 | 'one_semimonth_total_secs_sum': ['sum'],
163 | 'two_semimonth_log_day': ['sum'],
164 | 'two_semimonth_total_25_sum': ['sum'],
165 | 'two_semimonth_total_50_sum': ['sum'],
166 | 'two_semimonth_total_75_sum': ['sum'],
167 | 'two_semimonth_total_985_sum': ['sum'],
168 | 'two_semimonth_total_100_sum': ['sum'],
169 | 'two_semimonth_total_unq_sum': ['sum'],
170 | 'two_semimonth_total_secs_sum': ['sum']
171 | }
172 | user_log_all = grouped_object.agg(func).reset_index()
173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
174 | user_log_all.rename(columns={'msno_': 'msno',
175 | 'log_day_monthly_sum': 'log_day_monthly',
176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
180 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
183 | 'one_week_log_day_sum': 'one_week_log_day',
184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum',
185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum',
186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum',
187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum',
188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum',
189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum',
190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum',
191 | 'two_week_log_day_sum': 'two_week_log_day',
192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum',
193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum',
194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum',
195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum',
196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum',
197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum',
198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum',
199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day',
200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum',
201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum',
202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum',
203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum',
204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum',
205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum',
206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum',
207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day',
208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum',
209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum',
210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum',
211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum',
212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum',
213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum',
214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum'
215 | }, inplace=True)
216 |
217 | return user_log_all
218 |
219 |
220 | gc.enable()
221 |
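# Stream user_logs_v2.csv in fixed-size chunks: extract the monthly / weekly / semimonthly
# aggregates for each chunk with process_user_log(), stack the partial frames, then collapse
# everything back to one row per msno with process_user_log_together().
# A more defensive variant (hypothetical, not used here) would iterate the reader directly
# instead of assuming a fixed number of chunks:
#
#   parts = [process_user_log(chunk) for chunk in reader]
#   user_log = process_user_log_together(pd.concat(parts, ignore_index=True))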
222 | size = 10 ** 6  # rows per chunk; read_csv expects an integer chunksize
223 | reader = pd.read_csv('../input/user_logs_v2.csv', chunksize=size)
224 | start_time = time.time()
225 | for i in range(18):  # read the first 18 one-million-row chunks of user_logs_v2.csv
226 | user_log_chunk = next(reader)
227 | if i == 0:
228 | user_log_feb = process_user_log(user_log_chunk)
229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
230 | else:
231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
233 | del user_log_chunk
234 |
235 | user_log_feb = process_user_log_together(user_log_feb)
236 |
237 | user_log_feb.to_csv("../input/processed_user_log_mar.csv", index=False)
238 |
239 | print('Done')
240 |
--------------------------------------------------------------------------------
/src/weight_AveragingEnsemble.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
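# Weighted-average blending of submission files: each model's is_churn column is scaled by a
# hand-picked weight (the weights sum to 1.0) and the scaled columns are added together.
# The same idea as a generic sketch, assuming a list of (path, weight) pairs:
#
#   def blend(pairs):
#       base = pd.read_csv(pairs[0][0])
#       base['is_churn'] = sum(pd.read_csv(path)['is_churn'] * weight for path, weight in pairs)
#       return base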
3 | '''
4 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781
5 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv')
6 | weight1 = 0.30
7 |
8 | # LB 0.12383 CV 0.127227
9 | file2 = pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv')
10 | weight2 = 0.30
11 |
12 | # LB 0.12323 Train LogLoss 0.0966805
13 | file3 = pd.read_csv('result/submission_lightgbm_features_all_eta_0.002_round_2000_Dec_13.csv')
14 | weight3 = 0.2
15 |
16 | # LB 0.12705 CV 0.136615 Train LogLoss 0.094903
17 | file4 = pd.read_csv('result/submission_xgboost_user_log_transaction_features_eta_0.002_round_2500_Dec_11.csv')
18 | weight4 = 0.2
19 |
20 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \
21 | file3['is_churn'] * weight3 + file4['is_churn'] * weight4
22 |
23 | file1.to_csv('submission_weight_avg_4_0.3_0.3_0.2_0.2.csv', index=False)
24 | '''
25 |
26 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781
27 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv')
28 | weight1 = 0.28
29 |
30 | # LB 0.12383 CV 0.127227
31 | file2 = pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv')
32 | weight2 = 0.28
33 |
34 | # LB 0.12393 CV 0.122639 Train LogLoss 0.102916
35 | file3 = pd.read_csv('result/submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv')
36 | weight3 = 0.44
37 |
38 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \
39 | file3['is_churn'] * weight3
40 |
41 | file1.to_csv('submission_weight_avg_0.44_0.28_0.28.csv', index=False)
42 |
--------------------------------------------------------------------------------
/src/xgboost_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import sklearn.metrics, sklearn.model_selection  # bare "import sklearn" does not expose these submodules
6 | import xgboost as xgb
7 |
8 |
9 | def xgb_score(preds, dtrain):
10 | labels = dtrain.get_label()
11 | return 'log_loss', sklearn.metrics.log_loss(labels, preds)
12 |
13 |
14 | gc.enable()
15 |
16 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
17 |
18 | members = pd.read_csv('../input/members_v3.csv')
19 |
20 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
21 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv')
22 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')
23 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')
24 |
25 | train = pd.read_csv('../input/train.csv')
26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
27 |
28 | test = pd.read_csv('../input/sample_submission_v2.csv')
29 |
30 | # Merge Data
31 |
32 | train = pd.merge(train, transactions, how='left', on='msno')
33 | test = pd.merge(test, transactions, how='left', on='msno')
34 |
35 | train = pd.merge(train, user_log_all, how='left', on='msno')
36 | test = pd.merge(test, user_log_all, how='left', on='msno')
37 |
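# Month-matched user-log features: training rows get the February-window file,
# test rows get the March-window file (this mirrors the feb/mar split described in the README).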
38 | train = pd.merge(train, user_log_feb, how='left', on='msno')
39 | test = pd.merge(test, user_log_mar, how='left', on='msno')
40 |
41 | train = pd.merge(train, members, how='left', on='msno')
42 | test = pd.merge(test, members, how='left', on='msno')
43 |
44 | del transactions, members
45 | gc.collect()
46 |
47 | # Drop duplicates first
48 | test = test.drop_duplicates('msno')
49 |
50 | gender = {'male': 1, 'female': 2}
51 | train['gender'] = train['gender'].map(gender)
52 | test['gender'] = test['gender'].map(gender)
53 |
54 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])  # .mode() returns a Series; take its first value
55 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
56 |
57 | train['gender'] = train['gender'].fillna(train['gender'].mean())  # missing gender is NaN after the mapping, never 0
58 | test['gender'] = test['gender'].fillna(test['gender'].mean())
59 |
60 | # train = train.fillna(0)
61 | # test = test.fillna(0)
62 |
63 | # Delete date for now
64 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
65 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
66 |
67 | # Create 2 interaction features from the auto-renew / cancel flags
68 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
69 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
70 |
71 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
72 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
73 |
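# Drop most of the one-hot payment_method_id columns and the day-of-month date parts that the
# transaction preprocessing produced; the same columns are removed from train and test.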
74 | train = train.drop(['payment_method_id2',
75 | 'payment_method_id3',
76 | 'payment_method_id4',
77 | 'payment_method_id5',
78 | 'payment_method_id6',
79 | 'payment_method_id8',
80 | 'payment_method_id10',
81 | 'payment_method_id11',
82 | 'payment_method_id12',
83 | 'payment_method_id13',
84 | 'payment_method_id14',
85 | 'payment_method_id16',
86 | 'payment_method_id17',
87 | 'payment_method_id18',
88 | 'payment_method_id19',
89 | 'payment_method_id20',
90 | 'payment_method_id21',
91 | 'payment_method_id22',
92 | 'payment_method_id23',
93 | 'payment_method_id24',
94 | 'payment_method_id25',
95 | 'payment_method_id27',
96 | 'payment_method_id28',
97 | 'payment_method_id31',
98 | 'payment_method_id33',
99 | 'payment_method_id34',
100 | 'transaction_date_day',
101 | 'membership_expire_date_day'], axis=1)
102 |
103 | test = test.drop(['payment_method_id2',
104 | 'payment_method_id3',
105 | 'payment_method_id4',
106 | 'payment_method_id5',
107 | 'payment_method_id6',
108 | 'payment_method_id8',
109 | 'payment_method_id10',
110 | 'payment_method_id11',
111 | 'payment_method_id12',
112 | 'payment_method_id13',
113 | 'payment_method_id14',
114 | 'payment_method_id16',
115 | 'payment_method_id17',
116 | 'payment_method_id18',
117 | 'payment_method_id19',
118 | 'payment_method_id20',
119 | 'payment_method_id21',
120 | 'payment_method_id22',
121 | 'payment_method_id23',
122 | 'payment_method_id24',
123 | 'payment_method_id25',
124 | 'payment_method_id27',
125 | 'payment_method_id28',
126 | 'payment_method_id31',
127 | 'payment_method_id33',
128 | 'payment_method_id34',
129 | 'transaction_date_day',
130 | 'membership_expire_date_day'], axis=1)
131 |
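# feature_list below is only a reference inventory of the raw and engineered feature names;
# it is not used for training (the model columns are taken from train.columns further down).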
132 | feature_list = [
133 | # raw data
134 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
135 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
136 | # advanced features
137 | # user_log
138 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
139 | 'total_secs_sum',
140 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
141 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
142 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
143 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
144 | # transactions
145 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
146 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
147 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
148 | # members
149 | ]
150 |
151 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
152 |
153 | params = {
154 | 'base_score': 0.5,
155 | 'eta': 0.002,
156 | 'max_depth': 6,
157 | 'booster': 'gbtree',
158 | 'colsample_bylevel': 1,
159 | 'colsample_bytree': 1.0,
160 | 'gamma': 1,
161 | 'min_child_weight': 5,
162 | 'n_estimators': 600,  # not used by xgb.train; the number of rounds is passed explicitly below
163 | 'reg_alpha': '0',
164 | 'reg_lambda': '1',
165 | 'scale_pos_weight': 1,
166 | 'objective': 'binary:logistic',
167 | 'eval_metric': 'logloss',
168 | 'seed': 2017,
169 | 'silent': True
170 | }
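# Training flow: 70/30 split of the training frame, a quick xgb.cv pass as a sanity check on
# the boosting-round budget (its return value is not reused), then xgb.train with early
# stopping on the held-out 30% and prediction at the best iteration via best_ntree_limit.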
171 | x1, x2, y1, y2 = sklearn.model_selection.train_test_split(train[cols], train['is_churn'], test_size=0.3,
172 | random_state=2017)
173 | watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
174 | cv_output = xgb.cv(params, xgb.DMatrix(x1, y1), num_boost_round=1500, early_stopping_rounds=20, verbose_eval=50,
175 | show_stdv=False)
176 | model = xgb.train(params, xgb.DMatrix(x1, y1), 2500, watchlist, feval=xgb_score, maximize=False, verbose_eval=50,
177 | early_stopping_rounds=50)
178 |
179 | pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
180 |
181 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
182 | print(len(test))
183 | test[['msno', 'is_churn']].to_csv('submission_xgboost_all_features_selection_eta_0.002_round_2500_Dec_15.csv',
184 | index=False)
185 |
--------------------------------------------------------------------------------
/src/xgboost_gridsearch.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import warnings
3 | from datetime import datetime
4 |
5 | import pandas as pd
6 | import sklearn.metrics  # xgb_score below relies on sklearn.metrics.log_loss
7 | import xgboost as xgb
8 | from sklearn.model_selection import GridSearchCV
9 | from sklearn.model_selection import RandomizedSearchCV
10 | from sklearn.model_selection import StratifiedKFold
11 |
12 |
13 | def xgb_score(preds, dtrain):
14 | labels = dtrain.get_label()
15 | return 'log_loss', sklearn.metrics.log_loss(labels, preds)
16 |
17 |
18 | def timer(start_time=None):
19 | if not start_time:
20 | start_time = datetime.now()
21 | return start_time
22 | elif start_time:
23 | thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
24 | tmin, tsec = divmod(temp_sec, 60)
25 | print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
26 |
27 |
28 | gc.enable()
29 | warnings.filterwarnings('ignore')
30 |
31 | transactions = pd.read_csv('../input/processed_transaction_all.csv')
32 |
33 | members_v1 = pd.read_csv('../input/members.csv')
34 | members_v2 = pd.read_csv('../input/members_v2.csv')
35 | members = members_v1.append(members_v2, ignore_index=True)
36 |
37 | user_log = pd.read_csv('../input/processed_user_log_all.csv')
38 |
39 | train_v1 = pd.read_csv('../input/train.csv')
40 | train_v2 = pd.read_csv('../input/train_v2.csv')
41 | train = train_v1.append(train_v2, ignore_index=True)
42 |
43 | test = pd.read_csv('../input/sample_submission_v2.csv')
44 |
45 | # Merge Data
46 |
47 | train = pd.merge(train, transactions, how='left', on='msno')
48 | test = pd.merge(test, transactions, how='left', on='msno')
49 |
50 | train = pd.merge(train, user_log, how='left', on='msno')
51 | test = pd.merge(test, user_log, how='left', on='msno')
52 |
53 | train = pd.merge(train, members, how='left', on='msno')
54 | test = pd.merge(test, members, how='left', on='msno')
55 |
56 | # Drop duplicates first
57 | test = test.drop_duplicates('msno')
58 |
59 | gender = {'male': 1, 'female': 2}
60 | train['gender'] = train['gender'].map(gender)
61 | test['gender'] = test['gender'].map(gender)
62 |
63 | train = train.fillna(0)
64 | test = test.fillna(0)
65 |
66 | # Delete date for now
67 | train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
68 | test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
69 | # Delete date for now
70 |
71 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
72 |
73 | Y = train['is_churn'].values
74 | X = train[cols]
75 |
76 | # A parameter grid for XGBoost
77 | params = {
78 | 'min_child_weight': [1, 5, 10],
79 | 'gamma': [0.5, 1, 1.5, 2, 5],
80 | 'colsample_bytree': [0.6, 0.8, 1.0],
81 | 'max_depth': [3, 4, 5, 6, 7],
82 | 'subsample': [0.7, 0.75, 0.8]
83 | }
84 |
85 | model = xgb.XGBClassifier(learning_rate=0.002, n_estimators=600, objective='binary:logistic',
86 | silent=True, nthread=1)
87 |
88 | folds = 3
89 | param_comb = 5
90 |
91 | skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
92 |
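# Search strategy: RandomizedSearchCV samples param_comb combinations from the grid above with
# 3-fold stratified CV scored by neg_log_loss; the exhaustive GridSearchCV over the same grid
# follows below, and both searches write their CV tables plus a submission file.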
93 | random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='neg_log_loss', n_jobs=4,
94 | cv=skf.split(X, Y), verbose=3, random_state=1001)
95 |
96 | # Here we go
97 | start_time = timer(None) # timing starts from this point for "start_time" variable
98 | random_search.fit(X, Y)
99 | timer(start_time) # timing ends here for "start_time" variable
100 |
101 | print('\n All results:')
102 | print(random_search.cv_results_)
103 | print('\n Best estimator:')
104 | print(random_search.best_estimator_)
105 | print('\n Best neg_log_loss score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
106 | print(random_search.best_score_)
107 | print('\n Best hyperparameters:')
108 | print(random_search.best_params_)
109 | results = pd.DataFrame(random_search.cv_results_)
110 | results.to_csv('xgboost_random_grid_search_results_01.csv', index=False)
111 |
112 | pred = random_search.predict_proba(test[cols])[:, 1]  # the sklearn wrapper takes a DataFrame, not a DMatrix; keep the churn-probability column
113 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
114 | print(len(test))
115 | test[['msno', 'is_churn']].to_csv('submission_xgboost_random_serach_best_param.csv', index=False)
116 |
117 | grid = GridSearchCV(estimator=model, param_grid=params, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X, Y), verbose=3)
118 | grid.fit(X, Y)
119 | print('\n All results:')
120 | print(grid.cv_results_)
121 | print('\n Best estimator:')
122 | print(grid.best_estimator_)
123 | print('\n Best score:')
124 | print(grid.best_score_)  # mean neg_log_loss across the CV folds
125 | print('\n Best parameters:')
126 | print(grid.best_params_)
127 | results = pd.DataFrame(grid.cv_results_)
128 | results.to_csv('xgboost_grid_search_results_01.csv', index=False)
129 |
130 | pred = grid.best_estimator_.predict_proba(test[cols])[:, 1]
131 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
132 | print(len(test))
133 | test[['msno', 'is_churn']].to_csv('submission_xgboost_grid_search_best_param.csv', index=False)
134 |
--------------------------------------------------------------------------------