├── .gitignore
├── README.md
├── input
└── .gitignore
└── src
├── .gitignore
├── autoencoder_baseline.py
├── feature_importance.py
├── find_correlation.py
├── lightgbm_all_features.py
├── lightgbm_feaeture_importance_.png
├── lightgbm_feaeture_importance_all_time.png
├── lightgbm_features.py
├── process_features_userlog_all.py
├── process_features_userlog_feb_mar.py
├── process_userlog_all.py
├── process_userlog_feb.py
├── process_userlog_mar.py
├── weight_AveragingEnsemble.py
├── xgboost_features.py
└── xgboost_gridsearch.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WSDM-KKBox-s-Churn-Prediction-Challenge
2 | The 11th ACM International Conference on Web Search and Data Mining (WSDM 2018) challenged participants to build an algorithm that predicts whether a subscription user will churn, using a donated dataset from KKBOX.
3 |
4 | # Final: rank 43/575
5 |
6 | The user-log features are built from two views: features over the user's entire history | features over a recent time window.
7 |
8 | process_userlog_feb.py extracts the February user-log features for the training data
9 | process_userlog_mar.py extracts the March user-log features for the test data
10 | process_userlog_all.py extracts user-log features over the entire history
11 |
12 | process_features_userlog_feb_mar.py derives the cross features for the most recent month
13 | process_features_userlog_all.py derives the cross features over the entire history
14 |
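As a rough sketch of how these outputs are consumed (file names taken from the model scripts in `src/`; this snippet is illustrative, not part of the pipeline, and assumes the processed CSVs already exist under `../input`), both user-log views are joined onto the train/test frames on `msno` before modelling:

```python
import pandas as pd

# All-time view and recent-month view of the user log (outputs of the scripts above)
user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')  # training month
user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')  # test month

train = pd.read_csv('../input/train_v2.csv')
test = pd.read_csv('../input/sample_submission_v2.csv')

# Merge both views so a model sees long-term and recent listening behaviour side by side
train = train.merge(user_log_all, how='left', on='msno').merge(user_log_feb, how='left', on='msno')
test = test.merge(user_log_all, how='left', on='msno').merge(user_log_mar, how='left', on='msno')
```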
--------------------------------------------------------------------------------
/input/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/.gitignore:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/autoencoder_baseline.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from keras import optimizers
6 | from keras.callbacks import ModelCheckpoint, TensorBoard
7 | from keras.layers import Dense, Dropout
8 | from keras.models import Sequential
9 | from numpy import random as rm
10 | from sklearn import preprocessing
11 | from sklearn.model_selection import train_test_split
12 |
13 | gc.enable()
14 |
15 | # transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0)
16 | # transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0)
17 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
18 |
19 | members = pd.read_csv('../input/members_v3.csv')
20 |
21 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
22 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
23 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
24 |
25 | train = pd.read_csv('../input/train.csv')
26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
27 |
28 | test = pd.read_csv('../input/sample_submission_v2.csv')
29 |
30 | # Merge Data
31 |
32 | # train = pd.merge(train, transactions_train, how='left', on='msno')
33 | # test = pd.merge(test, transactions_test, how='left', on='msno')
34 |
35 | train = pd.merge(train, transactions, how='left', on='msno')
36 | test = pd.merge(test, transactions, how='left', on='msno')
37 |
38 | train = pd.merge(train, user_log_train, how='left', on='msno')
39 | test = pd.merge(test, user_log_test, how='left', on='msno')
40 |
41 | train = pd.merge(train, user_log_all, how='left', on='msno')
42 | test = pd.merge(test, user_log_all, how='left', on='msno')
43 |
44 | train = pd.merge(train, members, how='left', on='msno')
45 | test = pd.merge(test, members, how='left', on='msno')
46 |
47 | del transactions, members, user_log_train, user_log_test
48 | gc.collect()
49 |
50 | # Drop duplicates first
51 | test = test.drop_duplicates('msno')
52 |
53 | gender = {'male': 1, 'female': 2}
54 | train['gender'] = train['gender'].map(gender)
55 | test['gender'] = test['gender'].map(gender)
56 |
57 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
58 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
59 |
60 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
61 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
62 |
63 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
64 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
65 |
66 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
67 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
68 |
69 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
70 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
71 |
72 | train = train.replace([np.inf, -np.inf], np.nan)
73 |
74 | train = train.fillna(0)
75 | test = test.fillna(0)
76 |
77 | train_0 = train[train['is_churn'] == 0]
78 | train_1 = train[train['is_churn'] == 1]
79 |
80 | '''
81 | # Enlarge train_1 for 17 times
82 | train_append = train_1
83 |
84 | for _ in range(17):
85 | train_append = train_append.append(train_1)
86 |
87 | train = train_0.append(train_append)
88 | '''
89 |
90 |
91 | # train1 random sample 1/17
92 | def rand_rows(df, num_rows=5):
93 | subset = rm.choice(df.index.values, size=num_rows)
94 | return df.loc[subset]
95 |
96 |
97 | train_0 = rand_rows(train_0, len(train_1))
98 | train = train_0.append(train_1)
99 |
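# Note: rand_rows() uses numpy.random.choice with its default replace=True, so the
# down-sampled majority class may contain duplicate rows; sampling without replacement
# would require rm.choice(..., replace=False).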
100 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
101 |
102 | # Add Normalize
103 | min_max_scaler = preprocessing.MinMaxScaler()
104 | train[cols] = min_max_scaler.fit_transform(train[cols])
105 |
106 | X_train, X_test = train_test_split(train, test_size=0.2, random_state=47, shuffle=True)
107 | y_train = X_train['is_churn']
108 | X_train = X_train.drop(['msno', 'is_churn'], axis=1)
109 |
110 | y_test = X_test['is_churn']
111 | X_test = X_test.drop(['msno', 'is_churn'], axis=1)
112 |
113 | X_train = X_train.values
114 | X_test = X_test.values
115 |
116 | input_dim = X_train.shape[1]
117 |
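# Note: despite the file and variable names, the network below is a plain feed-forward
# binary classifier (a funnel of Dense/Dropout layers ending in a single sigmoid unit
# trained directly on the churn label), not a reconstruction autoencoder.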
118 | autoencoder = Sequential()
119 | autoencoder.add(Dense(input_dim, input_dim=input_dim))
120 |
121 | input_dim = int(input_dim / 2)
122 | autoencoder.add(Dense(input_dim, activation='relu'))
123 | autoencoder.add(Dropout(0.5))
124 |
125 | input_dim = int(input_dim / 2)
126 | autoencoder.add(Dense(input_dim, activation='relu'))
127 | autoencoder.add(Dropout(0.5))
128 |
129 | input_dim = int(input_dim / 2)
130 | autoencoder.add(Dense(input_dim, activation='relu'))
131 | autoencoder.add(Dropout(0.5))
132 |
133 | autoencoder.add(Dense(1, activation='sigmoid'))
134 |
135 | autoencoder.summary()
136 |
137 | nb_epoch = 50
138 | batch_size = 32
139 |
140 | sgd = optimizers.SGD(lr=0.002, decay=1e-6, momentum=0.9, nesterov=True)
141 |
142 | autoencoder.compile(optimizer=sgd,
143 | loss='binary_crossentropy',
144 | metrics=['accuracy'])
145 |
146 | checkpointer = ModelCheckpoint(filepath="model.h5",
147 | verbose=1,
148 | save_best_only=True)
149 |
150 | tensorboard = TensorBoard(log_dir='./log',
151 | histogram_freq=0,
152 | write_graph=True,
153 | write_images=True)
154 |
155 | print(X_train.shape)
156 |
157 | history = autoencoder.fit(X_train, y_train,
158 | epochs=nb_epoch,
159 | batch_size=batch_size,
160 | shuffle=True,
161 | validation_data=(X_test, y_test),
162 | verbose=1,
163 | callbacks=[checkpointer, tensorboard]).history
164 |
165 | # autoencoder = load_model('model.h5')
166 |
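# Note: train[cols] was scaled with MinMaxScaler above, but the test frame is predicted
# on raw values here; applying the fitted scaler first (e.g. test[cols] =
# min_max_scaler.transform(test[cols])) would keep train and test inputs consistent.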
167 | predictions = autoencoder.predict(test.drop(['msno', 'is_churn'], axis=1).values)
168 |
169 | test['is_churn'] = predictions
170 | test = test[['msno', 'is_churn']]
171 |
172 | test.to_csv('submission_autoencoder_baseline_sgd_0.002_50_32_Dec_15.csv', index=False)
173 |
--------------------------------------------------------------------------------
/src/feature_importance.py:
--------------------------------------------------------------------------------
1 |
2 | import pickle
3 | import operator
4 |
5 |
6 | def plot(model):
7 | from xgboost import plot_importance
8 | from matplotlib import pyplot as plt
9 | plot_importance(model)
10 | plt.show()
11 |
12 |
13 | if __name__ == '__main__':
14 |
15 | filename = 'model/xgb_depth_7_round_1800_fold_2_eta_0.002.pkl'
16 |
17 |     model = pickle.load(open(filename, 'rb'))
18 |
19 | importance = model.get_fscore()
20 | importance = sorted(importance.items(), key=operator.itemgetter(1))
21 |
22 | importance = importance[::-1]
23 | print(importance)
24 | plot(model)
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/find_correlation.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import matplotlib.pyplot as plt
4 | import numpy as np
5 | import pandas as pd
6 | import seaborn as sns
7 |
8 | gc.enable()
9 |
10 | # transactions = pd.read_csv('../input/processed_transaction_all.csv')
11 |
12 | members_v1 = pd.read_csv('../input/members.csv')
13 | members_v2 = pd.read_csv('../input/members_v2.csv')
14 | members = members_v1.append(members_v2, ignore_index=True)
15 |
16 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
17 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
18 |
19 | train_v1 = pd.read_csv('../input/train.csv')
20 | train_v2 = pd.read_csv('../input/train_v2.csv')
21 | train = train_v1.append(train_v2, ignore_index=True)
22 |
23 | test = pd.read_csv('../input/sample_submission_v2.csv')
24 |
25 | # Merge Data
26 |
27 | # train = pd.merge(train, transactions, how='left', on='msno')
28 | # test = pd.merge(test, transactions, how='left', on='msno')
29 |
30 | train = pd.merge(train, user_log_train, how='left', on='msno')
31 | test = pd.merge(test, user_log_test, how='left', on='msno')
32 |
33 | train = pd.merge(train, members, how='left', on='msno')
34 | test = pd.merge(test, members, how='left', on='msno')
35 |
36 | # Drop duplicates first
37 | test = test.drop_duplicates('msno')
38 |
39 | gender = {'male': 1, 'female': 2}
40 | train['gender'] = train['gender'].map(gender)
41 | test['gender'] = test['gender'].map(gender)
42 |
43 | train = train.fillna(0)
44 | test = test.fillna(0)
45 |
46 | # Delete date for now
47 | # train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
48 | # test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
49 |
50 | corr = train.corr()
51 |
52 | # print('Train Data Set Correlation:')
53 | # print(corr)
54 |
55 | corr.to_csv('user_log_features_without_transaction_corr.csv', index=False)
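# Note: index=False drops the row labels, so the saved matrix keeps column names but
# loses row names; writing with index=True keeps the CSV readable as a square matrix.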
56 |
57 | # Generate a mask for the upper triangle
58 | mask = np.zeros_like(corr, dtype=bool)
59 | mask[np.triu_indices_from(mask)] = True
60 |
61 | # Set up the matplotlib figure
62 | f, ax = plt.subplots(figsize=(11, 9))
63 |
64 | # Generate a custom diverging colormap
65 | cmap = sns.diverging_palette(220, 10, as_cmap=True)
66 |
67 | # Draw the heatmap with the mask and correct aspect ratio
68 | heatmap = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
69 | square=True, linewidths=.5, cbar_kws={"shrink": .5})
70 | fig = heatmap.get_figure()
71 | fig.savefig('Features_Correlation_Heatmap_user_log')
72 |
--------------------------------------------------------------------------------
/src/lightgbm_all_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import lightgbm as lgb
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import ShuffleSplit
8 |
9 | gc.enable()
10 |
11 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
12 |
13 | members = pd.read_csv('../input/members_v3.csv')
14 |
15 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
16 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv')
17 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')
18 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')
19 |
20 | train = pd.read_csv('../input/train.csv')
21 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
22 |
23 | test = pd.read_csv('../input/sample_submission_v2.csv')
24 |
25 | # Merge Data
26 |
27 | train = pd.merge(train, transactions, how='left', on='msno')
28 | test = pd.merge(test, transactions, how='left', on='msno')
29 |
30 | train = pd.merge(train, user_log_all, how='left', on='msno')
31 | test = pd.merge(test, user_log_all, how='left', on='msno')
32 |
33 | train = pd.merge(train, user_log_feb, how='left', on='msno')
34 | test = pd.merge(test, user_log_mar, how='left', on='msno')
35 |
36 | train = pd.merge(train, members, how='left', on='msno')
37 | test = pd.merge(test, members, how='left', on='msno')
38 |
39 | del transactions, members
40 | gc.collect()
41 |
42 | # Drop duplicates first
43 | test = test.drop_duplicates('msno')
44 |
45 | gender = {'male': 1, 'female': 2}
46 | train['gender'] = train['gender'].map(gender)
47 | test['gender'] = test['gender'].map(gender)
48 |
49 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
50 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
51 |
52 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
53 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
54 |
55 | # train = train.fillna(0)
56 | # test = test.fillna(0)
57 |
58 | # Delete date for now
59 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
60 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
61 |
62 | # Create two combined flags from is_auto_renew and is_cancel
63 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
64 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
65 |
66 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
67 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
68 |
69 | train = train.drop(['payment_method_id2',
70 | 'payment_method_id3',
71 | 'payment_method_id4',
72 | 'payment_method_id5',
73 | 'payment_method_id6',
74 | 'payment_method_id8',
75 | 'payment_method_id10',
76 | 'payment_method_id11',
77 | 'payment_method_id12',
78 | 'payment_method_id13',
79 | 'payment_method_id14',
80 | 'payment_method_id16',
81 | 'payment_method_id17',
82 | 'payment_method_id18',
83 | 'payment_method_id19',
84 | 'payment_method_id20',
85 | 'payment_method_id21',
86 | 'payment_method_id22',
87 | 'payment_method_id23',
88 | 'payment_method_id24',
89 | 'payment_method_id25',
90 | 'payment_method_id27',
91 | 'payment_method_id28',
92 | 'payment_method_id31',
93 | 'payment_method_id33',
94 | 'payment_method_id34',
95 | 'transaction_date_day',
96 | 'membership_expire_date_day'], axis=1)
97 |
98 | test = test.drop(['payment_method_id2',
99 | 'payment_method_id3',
100 | 'payment_method_id4',
101 | 'payment_method_id5',
102 | 'payment_method_id6',
103 | 'payment_method_id8',
104 | 'payment_method_id10',
105 | 'payment_method_id11',
106 | 'payment_method_id12',
107 | 'payment_method_id13',
108 | 'payment_method_id14',
109 | 'payment_method_id16',
110 | 'payment_method_id17',
111 | 'payment_method_id18',
112 | 'payment_method_id19',
113 | 'payment_method_id20',
114 | 'payment_method_id21',
115 | 'payment_method_id22',
116 | 'payment_method_id23',
117 | 'payment_method_id24',
118 | 'payment_method_id25',
119 | 'payment_method_id27',
120 | 'payment_method_id28',
121 | 'payment_method_id31',
122 | 'payment_method_id33',
123 | 'payment_method_id34',
124 | 'transaction_date_day',
125 | 'membership_expire_date_day'], axis=1)
126 |
127 | feature_list = [
128 | # raw data
129 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
130 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
131 | # advanced features
132 | # user_log
133 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
134 | 'total_secs_sum',
135 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
136 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
137 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
138 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
139 | # transactions
140 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
141 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
142 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
143 | # members
144 | ]
145 |
146 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
147 |
148 | print(cols)
149 |
150 | params = {
151 | 'objective': 'binary',
152 | 'metric': 'binary_logloss',
153 | 'boosting': 'gbdt',
154 | 'learning_rate': 0.002, # small learn rate, large number of iterations
155 | 'verbose': 0,
156 | 'num_leaves': 108,
157 | 'bagging_fraction': 0.95,
158 | 'bagging_freq': 1,
159 | 'bagging_seed': 1,
160 | 'feature_fraction': 0.9,
161 | 'feature_fraction_seed': 1,
162 | 'max_bin': 128,
163 | 'max_depth': 7,
164 | 'reg_alpha': 1,
165 | 'reg_lambda': 0,
166 | 'min_split_gain': 0.5,
167 | 'min_child_weight': 1,
168 | 'min_child_samples': 10,
169 | 'scale_pos_weight': 1
170 | }
171 |
172 | bst = None
173 |
174 | cv_results = lgb.cv(
175 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False,
176 | shuffle=True,
177 | metrics='binary_logloss',
178 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
179 |
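# Note: cv_results above is computed but never fed back into training. With this LightGBM
# API the number of completed boosting rounds, e.g. len(cv_results['binary_logloss-mean']),
# could be passed to lgb.train below instead of the hard-coded 2500 (this assumes the
# dict-of-lists return format of lgb.cv in the LightGBM version used here).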
180 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train):
181 | train_data = lgb.Dataset(train[cols].loc[train_indices, :],
182 | label=train.loc[train_indices, 'is_churn'])
183 | val_data = lgb.Dataset(train[cols].loc[val_indices, :],
184 | label=train.loc[val_indices, 'is_churn'])
185 |
186 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50)
187 |
188 | predictions = bst.predict(test[cols])
189 | test['is_churn'] = predictions
190 | test = test[['msno', 'is_churn']]
191 | test.to_csv('submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv',
192 | index=False)
193 |
194 | print('Plot feature importances...')
195 | ax = lgb.plot_importance(bst)
196 | importance = bst.feature_importance()
197 | # importance = sorted(importance., key=operator.itemgetter(1))
198 |
199 | # importance = importance[::-1]
200 | # print(cols)
201 | # print(type(importance))
202 | a = pd.DataFrame({'feature': cols, 'importance': importance})
203 | # print(a)
204 | a.to_csv('feature_importance_features_selection.csv')
205 | # plt.show()
206 | plt.savefig('lightgbm_feaeture_importance_all_time')
207 |
--------------------------------------------------------------------------------
/src/lightgbm_feaeture_importance_.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_.png
--------------------------------------------------------------------------------
/src/lightgbm_feaeture_importance_all_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jason-learn/WSDM-KKBoxs-Churn-Prediction-Challenge/8ab255eef73d883b3351b1e5a1703b7a4e79ee36/src/lightgbm_feaeture_importance_all_time.png
--------------------------------------------------------------------------------
/src/lightgbm_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import lightgbm as lgb
4 | import matplotlib.pyplot as plt
5 | import numpy as np
6 | import pandas as pd
7 | from sklearn.model_selection import ShuffleSplit
8 |
9 | gc.enable()
10 |
11 | transactions_train = pd.read_csv('../input/processed_transaction_features_feb.csv', index_col=0)
12 | transactions_test = pd.read_csv('../input/processed_transaction_features_mar.csv', index_col=0)
13 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
14 |
15 | transactions = transactions[
16 | ['msno', 'discount', 'amt_per_day', 'is_discount', 'membership_days', 'transaction_date_year',
17 | 'transaction_date_month',
18 | 'transaction_date_day', 'membership_expire_date_year', 'membership_expire_date_month',
19 | 'membership_expire_date_day']]
20 |
21 | members = pd.read_csv('../input/members_v3.csv')
22 |
23 | user_log_train = pd.read_csv('../input/processed_features_user_log_feb.csv')
24 | user_log_test = pd.read_csv('../input/processed_features_user_log_mar.csv')
25 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
26 |
27 | train = pd.read_csv('../input/train_v2.csv')
28 |
29 | test = pd.read_csv('../input/sample_submission_v2.csv')
30 |
31 | # Merge Data
32 |
33 | train = pd.merge(train, transactions_train, how='left', on='msno')
34 | test = pd.merge(test, transactions_test, how='left', on='msno')
35 |
36 | train = pd.merge(train, transactions, how='left', on='msno')
37 | test = pd.merge(test, transactions, how='left', on='msno')
38 |
39 | train = pd.merge(train, user_log_train, how='left', on='msno')
40 | test = pd.merge(test, user_log_test, how='left', on='msno')
41 |
42 | train = pd.merge(train, user_log_all, how='left', on='msno')
43 | test = pd.merge(test, user_log_all, how='left', on='msno')
44 |
45 | train = pd.merge(train, members, how='left', on='msno')
46 | test = pd.merge(test, members, how='left', on='msno')
47 |
48 | del transactions, members, user_log_train, user_log_test
49 | gc.collect()
50 |
51 | # Drop duplicates first
52 | test = test.drop_duplicates('msno')
53 |
54 | gender = {'male': 1, 'female': 2}
55 | train['gender'] = train['gender'].map(gender)
56 | test['gender'] = test['gender'].map(gender)
57 |
58 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])
59 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
60 |
61 | train['gender'] = train['gender'].replace(0, train['gender'].mean())
62 | test['gender'] = test['gender'].replace(0, test['gender'].mean())
63 |
64 | train = train.fillna(0)
65 | test = test.fillna(0)
66 |
67 | # Delete date for now
68 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
69 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
70 |
71 | # Remove Features with 0 feature importance
72 | train = train.drop(
73 | ['payment_method_id14',
74 | 'payment_method_id18',
75 | 'payment_method_id21',
76 | 'payment_method_id26',
77 | 'payment_method_id35',
78 | 'transaction_date_month_x',
79 | 'transaction_date_day_x',
80 | 'membership_expire_date_year_x',
81 | 'membership_expire_date_month_x',
82 | 'membership_expire_date_day_x',
83 | 'transaction_date_day_y',
84 | 'membership_expire_date_day_y'], axis=1)
85 | test = test.drop(
86 | ['payment_method_id14',
87 | 'payment_method_id18',
88 | 'payment_method_id21',
89 | 'payment_method_id26',
90 | 'payment_method_id35',
91 | 'transaction_date_month_x',
92 | 'transaction_date_day_x',
93 | 'membership_expire_date_year_x',
94 | 'membership_expire_date_month_x',
95 | 'membership_expire_date_day_x',
96 | 'transaction_date_day_y',
97 | 'membership_expire_date_day_y'], axis=1)
98 |
99 | # Remove Features with feature importance less than 100
100 | train = train.drop(
101 | ['payment_method_id16',
102 | 'payment_method_id17',
103 | 'payment_method_id19',
104 | 'payment_method_id23',
105 | 'payment_method_id27',
106 | 'payment_method_id28',
107 | 'payment_method_id31',
108 | 'payment_method_id33',
109 | 'payment_method_id34',
110 | 'payment_method_id39',
111 | 'is_discount_x',
112 | 'transaction_date_year_x'], axis=1)
113 | test = test.drop(
114 | ['payment_method_id16',
115 | 'payment_method_id17',
116 | 'payment_method_id19',
117 | 'payment_method_id23',
118 | 'payment_method_id27',
119 | 'payment_method_id28',
120 | 'payment_method_id31',
121 | 'payment_method_id33',
122 | 'payment_method_id34',
123 | 'payment_method_id39',
124 | 'is_discount_x',
125 | 'transaction_date_year_x'], axis=1)
126 |
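# Note on the _x/_y suffixes dropped above: they come from merging the month-specific
# transaction features first and the all-time transaction features second, so pandas
# tags the overlapping columns with _x (month-specific) and _y (all-time).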
127 | # Create two combined flags from is_auto_renew and is_cancel
128 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
129 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
130 |
131 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
132 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
133 |
134 | feature_list = [
135 | # raw data
136 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
137 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
138 | # advanced features
139 | # user_log
140 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
141 | 'total_secs_sum',
142 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
143 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
144 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
145 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
146 | # transactions
147 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
148 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
149 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
150 | # members
151 | ]
152 |
153 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
154 |
155 | print(cols)
156 |
157 | params = {
158 | 'objective': 'binary',
159 | 'metric': 'binary_logloss',
160 | 'boosting': 'gbdt',
161 | 'learning_rate': 0.002, # small learn rate, large number of iterations
162 | 'verbose': 0,
163 | 'num_leaves': 108,
164 | 'bagging_fraction': 0.95,
165 | 'bagging_freq': 1,
166 | 'bagging_seed': 1,
167 | 'feature_fraction': 0.9,
168 | 'feature_fraction_seed': 1,
169 | 'max_bin': 128,
170 | 'max_depth': 7,
171 | 'reg_alpha': 1,
172 | 'reg_lambda': 0,
173 | 'min_split_gain': 0.5,
174 | 'min_child_weight': 1,
175 | 'min_child_samples': 10,
176 | 'scale_pos_weight': 1
177 | }
178 |
179 | bst = None
180 |
181 | cv_results = lgb.cv(
182 | params, lgb.Dataset(train[cols], label=train['is_churn']), num_boost_round=1500, nfold=5, stratified=False,
183 | shuffle=True,
184 | metrics='binary_logloss',
185 | early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
186 |
187 | for train_indices, val_indices in ShuffleSplit(n_splits=1, test_size=0.1, train_size=0.4).split(train):
188 | train_data = lgb.Dataset(train[cols].loc[train_indices, :],
189 | label=train.loc[train_indices, 'is_churn'])
190 | val_data = lgb.Dataset(train[cols].loc[val_indices, :],
191 | label=train.loc[val_indices, 'is_churn'])
192 |
193 | bst = lgb.train(params, train_data, 2500, valid_sets=[val_data], early_stopping_rounds=50)
194 |
195 | predictions = bst.predict(test[cols])
196 | test['is_churn'] = predictions
197 | test = test[['msno', 'is_churn']]
198 | test.to_csv('submission_lightgbm_features_features_selection_best_parameter_eta_0.002_round_2000_Dec_15.csv',
199 | index=False)
200 |
201 | print('Plot feature importances...')
202 | ax = lgb.plot_importance(bst)
203 | importance = bst.feature_importance()
204 | # importance = sorted(importance., key=operator.itemgetter(1))
205 |
206 | # importance = importance[::-1]
207 | # print(cols)
208 | # print(type(importance))
209 | a = pd.DataFrame({'feature': cols, 'importance': importance})
210 | # print(a)
211 | a.to_csv('feature_importance_all.csv')
212 | # plt.show()
213 | plt.savefig('lightgbm_feaeture_importance_')
214 |
--------------------------------------------------------------------------------
/src/process_features_userlog_all.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def process_user_log_together(df):
6 | """
7 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
8 |     :param df: concatenated per-chunk aggregates
9 |     :return: DataFrame with one row per msno
10 | """
11 |
12 | df = df.fillna(0)
13 |
14 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
15 | func = {'log_day_monthly': ['sum'],
16 | 'total_25_sum_monthly': ['sum'],
17 | 'total_50_sum_monthly': ['sum'],
18 | 'total_75_sum_monthly': ['sum'],
19 | 'total_985_sum_monthly': ['sum'],
20 | 'total_100_sum_monthly': ['sum'],
21 | 'total_unq_sum_monthly': ['sum'],
22 | 'total_secs_sum_monthly': ['sum']
23 | }
24 | user_log_all = grouped_object.agg(func).reset_index()
25 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
26 | user_log_all.rename(columns={'msno_': 'msno',
27 | 'log_day_monthly_sum': 'log_day_monthly',
28 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
29 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
30 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
31 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
32 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
33 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
34 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
35 | }, inplace=True)
36 |
37 | return user_log_all
38 |
39 |
40 | def calculate_user_log_features(train):
41 | """
42 | Calculate the user log features.
43 | :param train:
44 | :return:
45 | """
46 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[
47 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly']
48 |
49 | # Monthly Habit for listening to music
50 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum']
51 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum']
52 |
53 |     # Repeat listening vs. sampling: average number of plays per unique song
54 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly']
55 |
56 |     # Average listening time per song played
57 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum']
58 |
59 |     # Average number of songs played per active day
60 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly']
61 |
62 |     # Average listening time per active day
63 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly']
64 |
65 | train.replace(np.inf, 0, inplace=True)
66 | train = train.fillna(0)
67 |
68 | return train
69 |
70 |
71 | train = pd.read_csv('../input/processed_user_log_mid_all.csv')
72 | user_log_test = pd.read_csv('../input/processed_user_log_mid_all.csv')
73 | user_log_test = user_log_test[['msno',
74 | 'log_day_monthly',
75 | 'total_25_sum_monthly',
76 | 'total_50_sum_monthly',
77 | 'total_75_sum_monthly',
78 | 'total_985_sum_monthly',
79 | 'total_100_sum_monthly',
80 | 'total_unq_sum_monthly',
81 | 'total_secs_sum_monthly']]
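# NOTE: both reads above load the same file (processed_user_log_mid_all.csv), so after the
# append and re-sum below every per-user total is doubled; presumably the second read was
# meant to point at a different aggregate (e.g. the March portion), but the intended file
# is not clear from the repository.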
82 |
83 | print(train.columns)
84 | print(user_log_test.columns)
85 |
86 | train = train.append(user_log_test)
87 |
88 | train = process_user_log_together(train)
89 |
90 | train = calculate_user_log_features(train)
91 |
92 | print(len(train))
93 |
94 | train.to_csv('../input/processed_features_user_log_all_time_including_mar.csv', index=False)
95 |
--------------------------------------------------------------------------------
/src/process_features_userlog_feb_mar.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 |
6 |
7 | def calculate_user_log_features(train):
8 | """
9 | Calculate the user log features.
10 | :param train:
11 | :return:
12 | """
13 | train['total_monthly_sum'] = train['total_25_sum_monthly'] + train['total_50_sum_monthly'] + train[
14 | 'total_75_sum_monthly'] + train['total_985_sum_monthly'] + train['total_100_sum_monthly']
15 |
16 | # Monthly Habit for listening to music
17 | train['total_25_ratio'] = train['total_25_sum_monthly'] / train['total_monthly_sum']
18 | train['total_100_ratio'] = train['total_100_sum_monthly'] / train['total_monthly_sum']
19 |
20 |     # Repeat listening vs. sampling: average number of plays per unique song
21 | train['persong_play'] = train['total_monthly_sum'] / train['total_unq_sum_monthly']
22 |
23 |     # Average listening time per song played
24 | train['persong_time'] = train['total_secs_sum_monthly'] / train['total_monthly_sum']
25 |
26 |     # Average number of songs played per active day
27 | train['daily_play'] = train['total_monthly_sum'] / train['log_day_monthly']
28 |
29 |     # Average listening time per active day
30 | train['daily_listentime'] = train['total_secs_sum_monthly'] / train['log_day_monthly']
31 |
32 | train['one_week_sum'] = train['one_week_total_25_sum'] + train['one_week_total_50_sum'] + train[
33 | 'one_week_total_75_sum'] + train['one_week_total_985_sum'] + train['one_week_total_100_sum']
34 |
35 | train['two_week_sum'] = train['two_week_total_25_sum'] + train['two_week_total_50_sum'] + train[
36 | 'two_week_total_75_sum'] + train['two_week_total_985_sum'] + train['two_week_total_100_sum']
37 |
38 |     # Listening time in week 4 of the month compared with week 3
39 | train['week_secs_sum_ratio'] = train['two_week_total_secs_sum'] / train['one_week_total_secs_sum']
40 |     # Play count in week 4 of the month compared with week 3
41 | train['week_sum_ratio'] = train['two_week_sum'] / train['one_week_sum']
42 |
43 | train['one_semimonth_sum'] = train['one_semimonth_total_25_sum'] + train['one_semimonth_total_50_sum'] \
44 | + train['one_semimonth_total_75_sum'] + train[
45 | 'one_semimonth_total_985_sum'] + train['one_semimonth_total_100_sum']
46 |
47 | train['two_semimonth_sum'] = train['two_semimonth_total_25_sum'] + train['two_semimonth_total_50_sum'] \
48 | + train['two_semimonth_total_75_sum'] + train[
49 | 'two_semimonth_total_985_sum'] + train['two_semimonth_total_100_sum']
50 |
51 |     # Listening time in the second half of the month compared with the first half
52 | train['semimonth_secs_sum_ratio'] = train['two_semimonth_total_secs_sum'] / train['one_semimonth_total_secs_sum']
53 |     # Play count in the second half of the month compared with the first half
54 | train['semimonth_sum_ratio'] = train['two_semimonth_sum'] / train['one_semimonth_sum']
55 |
56 | train.replace(np.inf, 0, inplace=True)
57 | train = train.fillna(0)
58 | train = train.drop(['log_day_monthly',
59 | 'total_25_sum_monthly',
60 | 'total_50_sum_monthly',
61 | 'total_75_sum_monthly',
62 | 'total_985_sum_monthly',
63 | 'total_100_sum_monthly',
64 | 'total_unq_sum_monthly',
65 | 'total_secs_sum_monthly',
66 | 'one_week_log_day',
67 | 'one_week_total_25_sum',
68 | 'one_week_total_50_sum',
69 | 'one_week_total_75_sum',
70 | 'one_week_total_985_sum',
71 | 'one_week_total_100_sum',
72 | 'one_week_total_unq_sum',
73 | 'one_week_total_secs_sum',
74 | 'two_week_log_day',
75 | 'two_week_total_25_sum',
76 | 'two_week_total_50_sum',
77 | 'two_week_total_75_sum',
78 | 'two_week_total_985_sum',
79 | 'two_week_total_100_sum',
80 | 'two_week_total_unq_sum',
81 | 'two_week_total_secs_sum',
82 | 'one_semimonth_log_day',
83 | 'one_semimonth_total_25_sum',
84 | 'one_semimonth_total_50_sum',
85 | 'one_semimonth_total_75_sum',
86 | 'one_semimonth_total_985_sum',
87 | 'one_semimonth_total_100_sum',
88 | 'one_semimonth_total_unq_sum',
89 | 'one_semimonth_total_secs_sum',
90 | 'two_semimonth_log_day',
91 | 'two_semimonth_total_25_sum',
92 | 'two_semimonth_total_50_sum',
93 | 'two_semimonth_total_75_sum',
94 | 'two_semimonth_total_985_sum',
95 | 'two_semimonth_total_100_sum',
96 | 'two_semimonth_total_unq_sum',
97 | 'two_semimonth_total_secs_sum'], axis=1)
98 |
99 | return train
100 |
101 |
102 | train = pd.read_csv('../input/processed_user_log_feb.csv')
103 |
104 | train = calculate_user_log_features(train)
105 |
106 | train.to_csv('../input/processed_features_user_log_feb.csv', index=False)
107 |
108 | del train
109 | gc.collect()
110 |
111 | test = pd.read_csv('../input/processed_user_log_mar.csv')
112 |
113 | test = calculate_user_log_features(test)
114 |
115 | test.to_csv('../input/processed_features_user_log_mar.csv', index=False)
116 |
--------------------------------------------------------------------------------
/src/process_userlog_all.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: per-user sums plus a count of logged days.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 | return one_month
36 |
37 |
38 | def process_user_log_together(df):
39 | """
40 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
41 |     :param df: concatenated per-chunk aggregates
42 |     :return: DataFrame with one row per msno
43 | """
44 |
45 | df = df.fillna(0)
46 |
47 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
48 | func = {'log_day_monthly': ['sum'],
49 | 'total_25_sum_monthly': ['sum'],
50 | 'total_50_sum_monthly': ['sum'],
51 | 'total_75_sum_monthly': ['sum'],
52 | 'total_985_sum_monthly': ['sum'],
53 | 'total_100_sum_monthly': ['sum'],
54 | 'total_unq_sum_monthly': ['sum'],
55 | 'total_secs_sum_monthly': ['sum']
56 | }
57 | user_log_all = grouped_object.agg(func).reset_index()
58 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
59 | user_log_all.rename(columns={'msno_': 'msno',
60 | 'log_day_monthly_sum': 'log_day_monthly',
61 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
62 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
63 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
64 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
65 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
66 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
67 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
68 | }, inplace=True)
69 |
70 | return user_log_all
71 |
72 |
73 | gc.enable()
74 |
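# Out-of-core aggregation: user_logs.csv is too large to load at once, so each chunk is
# grouped per msno on its own, the partial results are appended, and
# process_user_log_together() re-sums them so each msno ends up with a single row.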
75 | size = int(4e7)  # 40 million rows per chunk
76 | reader = pd.read_csv('../input/user_logs.csv', chunksize=size)
77 | start_time = time.time()
78 | for i in range(10):  # processes 10 fixed chunks; next() raises StopIteration if fewer remain
79 | user_log_chunk = next(reader)
80 | if i == 0:
81 | user_log_feb = process_user_log(user_log_chunk)
82 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
83 | else:
84 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
85 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
86 | del user_log_chunk
87 |
88 | user_log_feb = process_user_log_together(user_log_feb)
89 |
90 | print(len(user_log_feb))
91 |
92 | user_log_feb.to_csv("../input/processed_user_log_mid_all.csv", index=False)
93 |
94 | print('Done')
95 |
--------------------------------------------------------------------------------
/src/process_userlog_feb.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: monthly, weekly and semi-monthly per-user sums.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 |     # Stage 2: Weekly Totals
36 |     # Split the month into its third week (Feb 13-19) and fourth week (Feb 20-26)
37 | one_week = df[(df['date'] < 20170220) & (df['date'] > 20170212)]
38 |
39 | grouped_object = one_week.groupby('msno', sort=False)
40 | one_week = grouped_object.agg(func).reset_index()
41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values]
42 | one_week.rename(columns={'msno_': 'msno',
43 | 'date_count': 'one_week_log_day',
44 | 'num_25_sum': 'one_week_total_25_sum',
45 | 'num_50_sum': 'one_week_total_50_sum',
46 | 'num_75_sum': 'one_week_total_75_sum',
47 | 'num_985_sum': 'one_week_total_985_sum',
48 | 'num_100_sum': 'one_week_total_100_sum',
49 | 'num_unq_sum': 'one_week_total_unq_sum',
50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True)
51 |
52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left')
53 |
54 | del one_week
55 | gc.collect()
56 |
57 | two_week = df[(df['date'] < 20170227) & (df['date'] > 20170219)]
58 |
59 | grouped_object = two_week.groupby('msno', sort=False)
60 | two_week = grouped_object.agg(func).reset_index()
61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values]
62 | two_week.rename(columns={'msno_': 'msno',
63 | 'date_count': 'two_week_log_day',
64 | 'num_25_sum': 'two_week_total_25_sum',
65 | 'num_50_sum': 'two_week_total_50_sum',
66 | 'num_75_sum': 'two_week_total_75_sum',
67 | 'num_985_sum': 'two_week_total_985_sum',
68 | 'num_100_sum': 'two_week_total_100_sum',
69 | 'num_unq_sum': 'two_week_total_unq_sum',
70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True)
71 |
72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left')
73 |
74 | del two_week
75 | gc.collect()
76 |
77 | # Stage 3: Semimonth Total Data
78 | one_semimonth = df[(df['date'] < 20170215) & (df['date'] > 20170131)]
79 |
80 | grouped_object = one_semimonth.groupby('msno', sort=False)
81 | one_semimonth = grouped_object.agg(func).reset_index()
82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values]
83 | one_semimonth.rename(columns={'msno_': 'msno',
84 | 'date_count': 'one_semimonth_log_day',
85 | 'num_25_sum': 'one_semimonth_total_25_sum',
86 | 'num_50_sum': 'one_semimonth_total_50_sum',
87 | 'num_75_sum': 'one_semimonth_total_75_sum',
88 | 'num_985_sum': 'one_semimonth_total_985_sum',
89 | 'num_100_sum': 'one_semimonth_total_100_sum',
90 | 'num_unq_sum': 'one_semimonth_total_unq_sum',
91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True)
92 |
93 | one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left')
94 |
95 | del one_semimonth
96 | gc.collect()
97 |
98 | two_semimonth = df[(df['date'] < 20170301) & (df['date'] > 20170214)]
99 |
100 | grouped_object = two_semimonth.groupby('msno', sort=False)
101 | two_semimonth = grouped_object.agg(func).reset_index()
102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values]
103 | two_semimonth.rename(columns={'msno_': 'msno',
104 | 'date_count': 'two_semimonth_log_day',
105 | 'num_25_sum': 'two_semimonth_total_25_sum',
106 | 'num_50_sum': 'two_semimonth_total_50_sum',
107 | 'num_75_sum': 'two_semimonth_total_75_sum',
108 | 'num_985_sum': 'two_semimonth_total_985_sum',
109 | 'num_100_sum': 'two_semimonth_total_100_sum',
110 | 'num_unq_sum': 'two_semimonth_total_unq_sum',
111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True)
112 |
113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left')
114 |
115 | del two_semimonth
116 | gc.collect()
117 |
118 | return one_month
119 |
120 |
121 | def process_user_log_together(df):
122 | """
123 |     Re-aggregate after concatenating the partial per-chunk results, so each msno keeps a single row.
124 |     :param df: concatenated per-chunk aggregates
125 |     :return: DataFrame with one row per msno
126 | """
127 |
128 | df = df.fillna(0)
129 |
130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
131 | func = {'log_day_monthly': ['sum'],
132 | 'total_25_sum_monthly': ['sum'],
133 | 'total_50_sum_monthly': ['sum'],
134 | 'total_75_sum_monthly': ['sum'],
135 | 'total_985_sum_monthly': ['sum'],
136 | 'total_100_sum_monthly': ['sum'],
137 | 'total_unq_sum_monthly': ['sum'],
138 | 'total_secs_sum_monthly': ['sum'],
139 | 'one_week_log_day': ['sum'],
140 | 'one_week_total_25_sum': ['sum'],
141 | 'one_week_total_50_sum': ['sum'],
142 | 'one_week_total_75_sum': ['sum'],
143 | 'one_week_total_985_sum': ['sum'],
144 | 'one_week_total_100_sum': ['sum'],
145 | 'one_week_total_unq_sum': ['sum'],
146 | 'one_week_total_secs_sum': ['sum'],
147 | 'two_week_log_day': ['sum'],
148 | 'two_week_total_25_sum': ['sum'],
149 | 'two_week_total_50_sum': ['sum'],
150 | 'two_week_total_75_sum': ['sum'],
151 | 'two_week_total_985_sum': ['sum'],
152 | 'two_week_total_100_sum': ['sum'],
153 | 'two_week_total_unq_sum': ['sum'],
154 | 'two_week_total_secs_sum': ['sum'],
155 | 'one_semimonth_log_day': ['sum'],
156 | 'one_semimonth_total_25_sum': ['sum'],
157 | 'one_semimonth_total_50_sum': ['sum'],
158 | 'one_semimonth_total_75_sum': ['sum'],
159 | 'one_semimonth_total_985_sum': ['sum'],
160 | 'one_semimonth_total_100_sum': ['sum'],
161 | 'one_semimonth_total_unq_sum': ['sum'],
162 | 'one_semimonth_total_secs_sum': ['sum'],
163 | 'two_semimonth_log_day': ['sum'],
164 | 'two_semimonth_total_25_sum': ['sum'],
165 | 'two_semimonth_total_50_sum': ['sum'],
166 | 'two_semimonth_total_75_sum': ['sum'],
167 | 'two_semimonth_total_985_sum': ['sum'],
168 | 'two_semimonth_total_100_sum': ['sum'],
169 | 'two_semimonth_total_unq_sum': ['sum'],
170 | 'two_semimonth_total_secs_sum': ['sum']
171 | }
172 | user_log_all = grouped_object.agg(func).reset_index()
173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
174 | user_log_all.rename(columns={'msno_': 'msno',
175 | 'log_day_monthly_sum': 'log_day_monthly',
176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
180 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
183 | 'one_week_log_day_sum': 'one_week_log_day',
184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum',
185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum',
186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum',
187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum',
188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum',
189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum',
190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum',
191 | 'two_week_log_day_sum': 'two_week_log_day',
192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum',
193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum',
194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum',
195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum',
196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum',
197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum',
198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum',
199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day',
200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum',
201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum',
202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum',
203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum',
204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum',
205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum',
206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum',
207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day',
208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum',
209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum',
210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum',
211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum',
212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum',
213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum',
214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum'
215 | }, inplace=True)
216 |
217 | return user_log_all
218 |
219 |
220 | gc.enable()
221 |
222 | size = int(1e6)  # 1 million rows per chunk
223 | reader = pd.read_csv('../input/user_log_feb.csv', chunksize=size)
224 | start_time = time.time()
225 | for i in range(17):  # processes 17 fixed chunks; next() raises StopIteration if fewer remain
226 | user_log_chunk = next(reader)
227 | if i == 0:
228 | user_log_feb = process_user_log(user_log_chunk)
229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
230 | else:
231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
233 | del user_log_chunk
234 |
235 | user_log_feb = process_user_log_together(user_log_feb)
236 |
237 | user_log_feb.to_csv("../input/processed_user_log_feb.csv", index=False)
238 |
239 | print('Done')
240 |
--------------------------------------------------------------------------------
/src/process_userlog_mar.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import time
3 |
4 | import pandas as pd
5 |
6 |
7 | def process_user_log(df):
8 | """
9 |     Aggregate one chunk of the raw user log: monthly, weekly and semi-monthly per-user sums.
10 |     :param df: chunk DataFrame read from the very large user_logs file.
11 |     :return: processed DataFrame with one row per msno in the chunk
12 | """
13 |
14 | # Divided DataFrame by date
15 | # train = train[(train['date'] < 20170301) & (train['date'] > 20170131)]
16 |
17 | # Stage 1: One Month Total Data
18 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
19 | func = {'date': ['count'],
20 | 'num_25': ['sum'], 'num_50': ['sum'],
21 | 'num_75': ['sum'], 'num_985': ['sum'],
22 | 'num_100': ['sum'], 'num_unq': ['sum'], 'total_secs': ['sum']}
23 | one_month = grouped_object.agg(func).reset_index()
24 | one_month.columns = ['_'.join(col).strip() for col in one_month.columns.values]
25 | one_month.rename(columns={'msno_': 'msno',
26 | 'date_count': 'log_day_monthly',
27 | 'num_25_sum': 'total_25_sum_monthly',
28 | 'num_50_sum': 'total_50_sum_monthly',
29 | 'num_75_sum': 'total_75_sum_monthly',
30 | 'num_985_sum': 'total_985_sum_monthly',
31 | 'num_100_sum': 'total_100_sum_monthly',
32 | 'num_unq_sum': 'total_unq_sum_monthly',
33 | 'total_secs_sum': 'total_secs_sum_monthly'}, inplace=True)
34 |
35 |     # Stage 2: Weekly Totals
36 |     # Split the month into its third week (Mar 13-19) and fourth week (Mar 20-26)
37 | one_week = df[(df['date'] < 20170320) & (df['date'] > 20170312)]
38 |
39 | grouped_object = one_week.groupby('msno', sort=False)
40 | one_week = grouped_object.agg(func).reset_index()
41 | one_week.columns = ['_'.join(col).strip() for col in one_week.columns.values]
42 | one_week.rename(columns={'msno_': 'msno',
43 | 'date_count': 'one_week_log_day',
44 | 'num_25_sum': 'one_week_total_25_sum',
45 | 'num_50_sum': 'one_week_total_50_sum',
46 | 'num_75_sum': 'one_week_total_75_sum',
47 | 'num_985_sum': 'one_week_total_985_sum',
48 | 'num_100_sum': 'one_week_total_100_sum',
49 | 'num_unq_sum': 'one_week_total_unq_sum',
50 | 'total_secs_sum': 'one_week_total_secs_sum'}, inplace=True)
51 |
52 | one_month = pd.merge(one_month, one_week, on=['msno'], how='left')
53 |
54 | del one_week
55 | gc.collect()
56 |
57 | two_week = df[(df['date'] < 20170327) & (df['date'] > 20170319)]
58 |
59 | grouped_object = two_week.groupby('msno', sort=False)
60 | two_week = grouped_object.agg(func).reset_index()
61 | two_week.columns = ['_'.join(col).strip() for col in two_week.columns.values]
62 | two_week.rename(columns={'msno_': 'msno',
63 | 'date_count': 'two_week_log_day',
64 | 'num_25_sum': 'two_week_total_25_sum',
65 | 'num_50_sum': 'two_week_total_50_sum',
66 | 'num_75_sum': 'two_week_total_75_sum',
67 | 'num_985_sum': 'two_week_total_985_sum',
68 | 'num_100_sum': 'two_week_total_100_sum',
69 | 'num_unq_sum': 'two_week_total_unq_sum',
70 | 'total_secs_sum': 'two_week_total_secs_sum'}, inplace=True)
71 |
72 | one_month = pd.merge(one_month, two_week, on=['msno'], how='left')
73 |
74 | del two_week
75 | gc.collect()
76 |
77 | # Stage 3: Semimonth Total Data
78 | one_semimonth = df[(df['date'] < 20170315) & (df['date'] > 20170228)]
79 |
80 | grouped_object = one_semimonth.groupby('msno', sort=False)
81 | one_semimonth = grouped_object.agg(func).reset_index()
82 | one_semimonth.columns = ['_'.join(col).strip() for col in one_semimonth.columns.values]
83 | one_semimonth.rename(columns={'msno_': 'msno',
84 | 'date_count': 'one_semimonth_log_day',
85 | 'num_25_sum': 'one_semimonth_total_25_sum',
86 | 'num_50_sum': 'one_semimonth_total_50_sum',
87 | 'num_75_sum': 'one_semimonth_total_75_sum',
88 | 'num_985_sum': 'one_semimonth_total_985_sum',
89 | 'num_100_sum': 'one_semimonth_total_100_sum',
90 | 'num_unq_sum': 'one_semimonth_total_unq_sum',
91 | 'total_secs_sum': 'one_semimonth_total_secs_sum'}, inplace=True)
92 |
93 | one_month = pd.merge(one_month, one_semimonth, on=['msno'], how='left')
94 |
95 | del one_semimonth
96 | gc.collect()
97 |
98 | two_semimonth = df[(df['date'] < 20170329) & (df['date'] > 20170314)]
99 |
100 | grouped_object = two_semimonth.groupby('msno', sort=False)
101 | two_semimonth = grouped_object.agg(func).reset_index()
102 | two_semimonth.columns = ['_'.join(col).strip() for col in two_semimonth.columns.values]
103 | two_semimonth.rename(columns={'msno_': 'msno',
104 | 'date_count': 'two_semimonth_log_day',
105 | 'num_25_sum': 'two_semimonth_total_25_sum',
106 | 'num_50_sum': 'two_semimonth_total_50_sum',
107 | 'num_75_sum': 'two_semimonth_total_75_sum',
108 | 'num_985_sum': 'two_semimonth_total_985_sum',
109 | 'num_100_sum': 'two_semimonth_total_100_sum',
110 | 'num_unq_sum': 'two_semimonth_total_unq_sum',
111 | 'total_secs_sum': 'two_semimonth_total_secs_sum'}, inplace=True)
112 |
113 | one_month = pd.merge(one_month, two_semimonth, on=['msno'], how='left')
114 |
115 | del two_semimonth
116 | gc.collect()
117 |
118 | return one_month
119 |
120 |
121 | def process_user_log_together(df):
122 | """
123 | After concatenating the per-chunk results, sum the aggregates again so each msno appears only once.
124 | :param df: concatenation of the per-chunk feature frames
125 | :return: one row of summed window features per msno
126 | """
127 |
128 | df = df.fillna(0)  # users absent from a sub-window carry NaN after the left merges; treat as zero activity
129 |
130 | grouped_object = df.groupby('msno', sort=False) # not sorting results in a minor speedup
131 | func = {'log_day_monthly': ['sum'],
132 | 'total_25_sum_monthly': ['sum'],
133 | 'total_50_sum_monthly': ['sum'],
134 | 'total_75_sum_monthly': ['sum'],
135 | 'total_985_sum_monthly': ['sum'],
136 | 'total_100_sum_monthly': ['sum'],
137 | 'total_unq_sum_monthly': ['sum'],
138 | 'total_secs_sum_monthly': ['sum'],
139 | 'one_week_log_day': ['sum'],
140 | 'one_week_total_25_sum': ['sum'],
141 | 'one_week_total_50_sum': ['sum'],
142 | 'one_week_total_75_sum': ['sum'],
143 | 'one_week_total_985_sum': ['sum'],
144 | 'one_week_total_100_sum': ['sum'],
145 | 'one_week_total_unq_sum': ['sum'],
146 | 'one_week_total_secs_sum': ['sum'],
147 | 'two_week_log_day': ['sum'],
148 | 'two_week_total_25_sum': ['sum'],
149 | 'two_week_total_50_sum': ['sum'],
150 | 'two_week_total_75_sum': ['sum'],
151 | 'two_week_total_985_sum': ['sum'],
152 | 'two_week_total_100_sum': ['sum'],
153 | 'two_week_total_unq_sum': ['sum'],
154 | 'two_week_total_secs_sum': ['sum'],
155 | 'one_semimonth_log_day': ['sum'],
156 | 'one_semimonth_total_25_sum': ['sum'],
157 | 'one_semimonth_total_50_sum': ['sum'],
158 | 'one_semimonth_total_75_sum': ['sum'],
159 | 'one_semimonth_total_985_sum': ['sum'],
160 | 'one_semimonth_total_100_sum': ['sum'],
161 | 'one_semimonth_total_unq_sum': ['sum'],
162 | 'one_semimonth_total_secs_sum': ['sum'],
163 | 'two_semimonth_log_day': ['sum'],
164 | 'two_semimonth_total_25_sum': ['sum'],
165 | 'two_semimonth_total_50_sum': ['sum'],
166 | 'two_semimonth_total_75_sum': ['sum'],
167 | 'two_semimonth_total_985_sum': ['sum'],
168 | 'two_semimonth_total_100_sum': ['sum'],
169 | 'two_semimonth_total_unq_sum': ['sum'],
170 | 'two_semimonth_total_secs_sum': ['sum']
171 | }
172 | user_log_all = grouped_object.agg(func).reset_index()
173 | user_log_all.columns = ['_'.join(col).strip() for col in user_log_all.columns.values]
174 | user_log_all.rename(columns={'msno_': 'msno',
175 | 'log_day_monthly_sum': 'log_day_monthly',
176 | 'total_25_sum_monthly_sum': 'total_25_sum_monthly',
177 | 'total_50_sum_monthly_sum': 'total_50_sum_monthly',
178 | 'total_75_sum_monthly_sum': 'total_75_sum_monthly',
179 | 'total_985_sum_monthly_sum': 'total_985_sum_monthly',
180 | 'total_100_sum_monthly_sum': 'total_100_sum_monthly',
181 | 'total_unq_sum_monthly_sum': 'total_unq_sum_monthly',
182 | 'total_secs_sum_monthly_sum': 'total_secs_sum_monthly',
183 | 'one_week_log_day_sum': 'one_week_log_day',
184 | 'one_week_total_25_sum_sum': 'one_week_total_25_sum',
185 | 'one_week_total_50_sum_sum': 'one_week_total_50_sum',
186 | 'one_week_total_75_sum_sum': 'one_week_total_75_sum',
187 | 'one_week_total_985_sum_sum': 'one_week_total_985_sum',
188 | 'one_week_total_100_sum_sum': 'one_week_total_100_sum',
189 | 'one_week_total_unq_sum_sum': 'one_week_total_unq_sum',
190 | 'one_week_total_secs_sum_sum': 'one_week_total_secs_sum',
191 | 'two_week_log_day_sum': 'two_week_log_day',
192 | 'two_week_total_25_sum_sum': 'two_week_total_25_sum',
193 | 'two_week_total_50_sum_sum': 'two_week_total_50_sum',
194 | 'two_week_total_75_sum_sum': 'two_week_total_75_sum',
195 | 'two_week_total_985_sum_sum': 'two_week_total_985_sum',
196 | 'two_week_total_100_sum_sum': 'two_week_total_100_sum',
197 | 'two_week_total_unq_sum_sum': 'two_week_total_unq_sum',
198 | 'two_week_total_secs_sum_sum': 'two_week_total_secs_sum',
199 | 'one_semimonth_log_day_sum': 'one_semimonth_log_day',
200 | 'one_semimonth_total_25_sum_sum': 'one_semimonth_total_25_sum',
201 | 'one_semimonth_total_50_sum_sum': 'one_semimonth_total_50_sum',
202 | 'one_semimonth_total_75_sum_sum': 'one_semimonth_total_75_sum',
203 | 'one_semimonth_total_985_sum_sum': 'one_semimonth_total_985_sum',
204 | 'one_semimonth_total_100_sum_sum': 'one_semimonth_total_100_sum',
205 | 'one_semimonth_total_unq_sum_sum': 'one_semimonth_total_unq_sum',
206 | 'one_semimonth_total_secs_sum_sum': 'one_semimonth_total_secs_sum',
207 | 'two_semimonth_log_day_sum': 'two_semimonth_log_day',
208 | 'two_semimonth_total_25_sum_sum': 'two_semimonth_total_25_sum',
209 | 'two_semimonth_total_50_sum_sum': 'two_semimonth_total_50_sum',
210 | 'two_semimonth_total_75_sum_sum': 'two_semimonth_total_75_sum',
211 | 'two_semimonth_total_985_sum_sum': 'two_semimonth_total_985_sum',
212 | 'two_semimonth_total_100_sum_sum': 'two_semimonth_total_100_sum',
213 | 'two_semimonth_total_unq_sum_sum': 'two_semimonth_total_unq_sum',
214 | 'two_semimonth_total_secs_sum_sum': 'two_semimonth_total_secs_sum'
215 | }, inplace=True)
216 |
217 | return user_log_all
218 |
219 |
220 | gc.enable()
221 |
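# Stream user_logs_v2.csv in fixed-size chunks: extract the monthly / weekly / semimonthly
# aggregates for each chunk with process_user_log(), stack the partial frames, then collapse
# everything back to one row per msno with process_user_log_together().
# A more defensive variant (hypothetical, not used here) would iterate the reader directly
# instead of assuming a fixed number of chunks:
#
#   parts = [process_user_log(chunk) for chunk in reader]
#   user_log = process_user_log_together(pd.concat(parts, ignore_index=True))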
222 | size = 10 ** 6  # rows per chunk; read_csv expects an integer chunksize
223 | reader = pd.read_csv('../input/user_logs_v2.csv', chunksize=size)
224 | start_time = time.time()
225 | for i in range(18):  # read the first 18 one-million-row chunks of user_logs_v2.csv
226 | user_log_chunk = next(reader)
227 | if i == 0:
228 | user_log_feb = process_user_log(user_log_chunk)
229 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
230 | else:
231 | user_log_feb = user_log_feb.append(process_user_log(user_log_chunk))
232 | print("Loop ", i, "took %s seconds" % (time.time() - start_time))
233 | del user_log_chunk
234 |
235 | user_log_feb = process_user_log_together(user_log_feb)
236 |
237 | user_log_feb.to_csv("../input/processed_user_log_mar.csv", index=False)
238 |
239 | print('Done')
240 |
--------------------------------------------------------------------------------
/src/weight_AveragingEnsemble.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
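# Weighted-average blending of submission files: each model's is_churn column is scaled by a
# hand-picked weight (the weights sum to 1.0) and the scaled columns are added together.
# The same idea as a generic sketch, assuming a list of (path, weight) pairs:
#
#   def blend(pairs):
#       base = pd.read_csv(pairs[0][0])
#       base['is_churn'] = sum(pd.read_csv(path)['is_churn'] * weight for path, weight in pairs)
#       return base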
3 | '''
4 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781
5 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv')
6 | weight1 = 0.30
7 |
8 | # LB 0.12383 CV 0.127227
9 | file2 = pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv')
10 | weight2 = 0.30
11 |
12 | # LB 0.12323 Train LogLoss 0.0966805
13 | file3 = pd.read_csv('result/submission_lightgbm_features_all_eta_0.002_round_2000_Dec_13.csv')
14 | weight3 = 0.2
15 |
16 | # LB 0.12705 CV 0.136615 Train LogLoss 0.094903
17 | file4 = pd.read_csv('result/submission_xgboost_user_log_transaction_features_eta_0.002_round_2500_Dec_11.csv')
18 | weight4 = 0.2
19 |
20 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \
21 | file3['is_churn'] * weight3 + file4['is_churn'] * weight4
22 |
23 | file1.to_csv('submission_weight_avg_4_0.3_0.3_0.2_0.2.csv', index=False)
24 | '''
25 |
26 | # LB 0.12432 CV 0.122651 Train LogLoss 0.103781
27 | file1 = pd.read_csv('result/submission_lightgbm_all_time_feaetures_origin_version_eta_0.002_round_2500_Dec_16.csv')
28 | weight1 = 0.28
29 |
30 | # LB 0.12383 CV 0.127227
31 | file2 = pd.read_csv('result/submission_lightgbm_features_trans_user_log_split_by_month_eta_0.002_round_2500_Dec_15.csv')
32 | weight2 = 0.28
33 |
34 | # LB 0.12393 CV 0.122639 Train LogLoss 0.102916
35 | file3 = pd.read_csv('result/submission_lightgbm_features_selection_origin_version_eta_0.002_round_2500_Dec_17.csv')
36 | weight3 = 0.44
37 |
38 | file1['is_churn'] = file1['is_churn'] * weight1 + file2['is_churn'] * weight2 + \
39 | file3['is_churn'] * weight3
40 |
41 | file1.to_csv('submission_weight_avg_0.44_0.28_0.28.csv', index=False)
42 |
--------------------------------------------------------------------------------
/src/xgboost_features.py:
--------------------------------------------------------------------------------
1 | import gc
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import sklearn.metrics, sklearn.model_selection  # bare "import sklearn" does not expose these submodules
6 | import xgboost as xgb
7 |
8 |
9 | def xgb_score(preds, dtrain):
10 | labels = dtrain.get_label()
11 | return 'log_loss', sklearn.metrics.log_loss(labels, preds)
12 |
13 |
14 | gc.enable()
15 |
16 | transactions = pd.read_csv('../input/processed_transaction_features.csv', index_col=0)
17 |
18 | members = pd.read_csv('../input/members_v3.csv')
19 |
20 | user_log_all = pd.read_csv('../input/processed_user_log_all.csv')
21 | # user_log_test = pd.read_csv('../input/processed_features_user_log_all_time_including_mar.csv')
22 | user_log_feb = pd.read_csv('../input/processed_features_user_log_feb.csv')
23 | user_log_mar = pd.read_csv('../input/processed_features_user_log_mar.csv')
24 |
25 | train = pd.read_csv('../input/train.csv')
26 | train = train.append(pd.read_csv('../input/train_v2.csv'), ignore_index=True)
27 |
28 | test = pd.read_csv('../input/sample_submission_v2.csv')
29 |
30 | # Merge Data
31 |
32 | train = pd.merge(train, transactions, how='left', on='msno')
33 | test = pd.merge(test, transactions, how='left', on='msno')
34 |
35 | train = pd.merge(train, user_log_all, how='left', on='msno')
36 | test = pd.merge(test, user_log_all, how='left', on='msno')
37 |
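# Month-matched user-log features: training rows get the February-window file,
# test rows get the March-window file (this mirrors the feb/mar split described in the README).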
38 | train = pd.merge(train, user_log_feb, how='left', on='msno')
39 | test = pd.merge(test, user_log_mar, how='left', on='msno')
40 |
41 | train = pd.merge(train, members, how='left', on='msno')
42 | test = pd.merge(test, members, how='left', on='msno')
43 |
44 | del transactions, members
45 | gc.collect()
46 |
47 | # Drop duplicates first
48 | test = test.drop_duplicates('msno')
49 |
50 | gender = {'male': 1, 'female': 2}
51 | train['gender'] = train['gender'].map(gender)
52 | test['gender'] = test['gender'].map(gender)
53 |
54 | train['bd'] = train['bd'].replace(0, train['bd'].mode()[0])  # .mode() returns a Series; take its first value
55 | test['bd'] = test['bd'].replace(0, test['bd'].mode()[0])
56 |
57 | train['gender'] = train['gender'].fillna(train['gender'].mean())  # missing gender is NaN after the mapping, never 0
58 | test['gender'] = test['gender'].fillna(test['gender'].mean())
59 |
60 | # train = train.fillna(0)
61 | # test = test.fillna(0)
62 |
63 | # Delete date for now
64 | train = train.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
65 | test = test.drop(['transaction_date', 'membership_expire_date', 'registration_init_time'], axis=1)
66 |
67 | # Create 2 interaction features from the auto-renew / cancel flags
68 | train['autorenew_&_not_cancel'] = ((train.is_auto_renew == 1) & (train.is_cancel == 0)).astype(np.int8)
69 | test['autorenew_&_not_cancel'] = ((test.is_auto_renew == 1) & (test.is_cancel == 0)).astype(np.int8)
70 |
71 | train['notAutorenew_&_cancel'] = ((train.is_auto_renew == 0) & (train.is_cancel == 1)).astype(np.int8)
72 | test['notAutorenew_&_cancel'] = ((test.is_auto_renew == 0) & (test.is_cancel == 1)).astype(np.int8)
73 |
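# Drop most of the one-hot payment_method_id columns and the day-of-month date parts that the
# transaction preprocessing produced; the same columns are removed from train and test.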
74 | train = train.drop(['payment_method_id2',
75 | 'payment_method_id3',
76 | 'payment_method_id4',
77 | 'payment_method_id5',
78 | 'payment_method_id6',
79 | 'payment_method_id8',
80 | 'payment_method_id10',
81 | 'payment_method_id11',
82 | 'payment_method_id12',
83 | 'payment_method_id13',
84 | 'payment_method_id14',
85 | 'payment_method_id16',
86 | 'payment_method_id17',
87 | 'payment_method_id18',
88 | 'payment_method_id19',
89 | 'payment_method_id20',
90 | 'payment_method_id21',
91 | 'payment_method_id22',
92 | 'payment_method_id23',
93 | 'payment_method_id24',
94 | 'payment_method_id25',
95 | 'payment_method_id27',
96 | 'payment_method_id28',
97 | 'payment_method_id31',
98 | 'payment_method_id33',
99 | 'payment_method_id34',
100 | 'transaction_date_day',
101 | 'membership_expire_date_day'], axis=1)
102 |
103 | test = test.drop(['payment_method_id2',
104 | 'payment_method_id3',
105 | 'payment_method_id4',
106 | 'payment_method_id5',
107 | 'payment_method_id6',
108 | 'payment_method_id8',
109 | 'payment_method_id10',
110 | 'payment_method_id11',
111 | 'payment_method_id12',
112 | 'payment_method_id13',
113 | 'payment_method_id14',
114 | 'payment_method_id16',
115 | 'payment_method_id17',
116 | 'payment_method_id18',
117 | 'payment_method_id19',
118 | 'payment_method_id20',
119 | 'payment_method_id21',
120 | 'payment_method_id22',
121 | 'payment_method_id23',
122 | 'payment_method_id24',
123 | 'payment_method_id25',
124 | 'payment_method_id27',
125 | 'payment_method_id28',
126 | 'payment_method_id31',
127 | 'payment_method_id33',
128 | 'payment_method_id34',
129 | 'transaction_date_day',
130 | 'membership_expire_date_day'], axis=1)
131 |
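# feature_list below is only a reference inventory of the raw and engineered feature names;
# it is not used for training (the model columns are taken from train.columns further down).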
132 | feature_list = [
133 | # raw data
134 | 'msno', 'payment_method_id', 'payment_plan_days', 'plan_list_price', 'actual_amount_paid', 'is_auto_renew',
135 | 'is_cancel', 'city', 'bd', 'gender', 'registered_via', 'is_churn',
136 | # advanced features
137 | # user_log
138 | 'log_day', 'total_25_sum', 'total_50_sum', 'total_75_sum', 'total_985_sum', 'total_100_sum', 'total_unq_sum',
139 | 'total_secs_sum',
140 | 'total_sum', 'total_25ratio', 'total_100ratio', 'persong_play', 'persong_time', 'daily_play', 'daily_listentime',
141 | 'one_week_sum', 'two_week_sum', 'one_week_secs_sum', 'two_week_secs_sum', 'week_secs_sum_ratio', 'week_sum_ratio',
142 | 'one_semimonth_sum', 'two_semimonth_sum', 'one_semimonth_secs_sum', 'two_semimonth_secs_sum',
143 | 'semimonth_secs_sum_ratio', 'semimonth_sum_ratio',
144 | # transactions
145 | 'discount', 'amt_per_day', 'is_discount', 'membership_days',
146 | 'transaction_date_year', 'transaction_date_month', 'transaction_date_day',
147 | 'membership_expire_date_year', 'membership_expire_date_month', 'membership_expire_date_day'
148 | # members
149 | ]
150 |
151 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
152 |
153 | params = {
154 | 'base_score': 0.5,
155 | 'eta': 0.002,
156 | 'max_depth': 6,
157 | 'booster': 'gbtree',
158 | 'colsample_bylevel': 1,
159 | 'colsample_bytree': 1.0,
160 | 'gamma': 1,
161 | 'min_child_weight': 5,
162 | 'n_estimators': 600,  # not used by xgb.train; the number of rounds is passed explicitly below
163 | 'reg_alpha': '0',
164 | 'reg_lambda': '1',
165 | 'scale_pos_weight': 1,
166 | 'objective': 'binary:logistic',
167 | 'eval_metric': 'logloss',
168 | 'seed': 2017,
169 | 'silent': True
170 | }
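# Training flow: 70/30 split of the training frame, a quick xgb.cv pass as a sanity check on
# the boosting-round budget (its return value is not reused), then xgb.train with early
# stopping on the held-out 30% and prediction at the best iteration via best_ntree_limit.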
171 | x1, x2, y1, y2 = sklearn.model_selection.train_test_split(train[cols], train['is_churn'], test_size=0.3,
172 | random_state=2017)
173 | watchlist = [(xgb.DMatrix(x1, y1), 'train'), (xgb.DMatrix(x2, y2), 'valid')]
174 | cv_output = xgb.cv(params, xgb.DMatrix(x1, y1), num_boost_round=1500, early_stopping_rounds=20, verbose_eval=50,
175 | show_stdv=False)
176 | model = xgb.train(params, xgb.DMatrix(x1, y1), 2500, watchlist, feval=xgb_score, maximize=False, verbose_eval=50,
177 | early_stopping_rounds=50)
178 |
179 | pred = model.predict(xgb.DMatrix(test[cols]), ntree_limit=model.best_ntree_limit)
180 |
181 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
182 | print(len(test))
183 | test[['msno', 'is_churn']].to_csv('submission_xgboost_all_features_selection_eta_0.002_round_2500_Dec_15.csv',
184 | index=False)
185 |
--------------------------------------------------------------------------------
/src/xgboost_gridsearch.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import warnings
3 | from datetime import datetime
4 |
5 | import pandas as pd
6 | import sklearn.metrics  # xgb_score below relies on sklearn.metrics.log_loss
7 | import xgboost as xgb
8 | from sklearn.model_selection import GridSearchCV
9 | from sklearn.model_selection import RandomizedSearchCV
10 | from sklearn.model_selection import StratifiedKFold
11 |
12 |
13 | def xgb_score(preds, dtrain):
14 | labels = dtrain.get_label()
15 | return 'log_loss', sklearn.metrics.log_loss(labels, preds)
16 |
17 |
18 | def timer(start_time=None):
19 | if not start_time:
20 | start_time = datetime.now()
21 | return start_time
22 | elif start_time:
23 | thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
24 | tmin, tsec = divmod(temp_sec, 60)
25 | print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
26 |
27 |
28 | gc.enable()
29 | warnings.filterwarnings('ignore')
30 |
31 | transactions = pd.read_csv('../input/processed_transaction_all.csv')
32 |
33 | members_v1 = pd.read_csv('../input/members.csv')
34 | members_v2 = pd.read_csv('../input/members_v2.csv')
35 | members = members_v1.append(members_v2, ignore_index=True)
36 |
37 | user_log = pd.read_csv('../input/processed_user_log_all.csv')
38 |
39 | train_v1 = pd.read_csv('../input/train.csv')
40 | train_v2 = pd.read_csv('../input/train_v2.csv')
41 | train = train_v1.append(train_v2, ignore_index=True)
42 |
43 | test = pd.read_csv('../input/sample_submission_v2.csv')
44 |
45 | # Merge Data
46 |
47 | train = pd.merge(train, transactions, how='left', on='msno')
48 | test = pd.merge(test, transactions, how='left', on='msno')
49 |
50 | train = pd.merge(train, user_log, how='left', on='msno')
51 | test = pd.merge(test, user_log, how='left', on='msno')
52 |
53 | train = pd.merge(train, members, how='left', on='msno')
54 | test = pd.merge(test, members, how='left', on='msno')
55 |
56 | # Drop duplicates first
57 | test = test.drop_duplicates('msno')
58 |
59 | gender = {'male': 1, 'female': 2}
60 | train['gender'] = train['gender'].map(gender)
61 | test['gender'] = test['gender'].map(gender)
62 |
63 | train = train.fillna(0)
64 | test = test.fillna(0)
65 |
66 | # Delete date for now
67 | train = train.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
68 | test = test.drop(['transaction_date', 'membership_expire_date', 'expiration_date', 'registration_init_time'], axis=1)
69 | # Delete date for now
70 |
71 | cols = [c for c in train.columns if c not in ['is_churn', 'msno']]
72 |
73 | Y = train['is_churn'].values
74 | X = train[cols]
75 |
76 | # A parameter grid for XGBoost
77 | params = {
78 | 'min_child_weight': [1, 5, 10],
79 | 'gamma': [0.5, 1, 1.5, 2, 5],
80 | 'colsample_bytree': [0.6, 0.8, 1.0],
81 | 'max_depth': [3, 4, 5, 6, 7],
82 | 'subsample': [0.7, 0.75, 0.8]
83 | }
84 |
85 | model = xgb.XGBClassifier(learning_rate=0.002, n_estimators=600, objective='binary:logistic',
86 | silent=True, nthread=1)
87 |
88 | folds = 3
89 | param_comb = 5
90 |
91 | skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
92 |
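# Search strategy: RandomizedSearchCV samples param_comb combinations from the grid above with
# 3-fold stratified CV scored by neg_log_loss; the exhaustive GridSearchCV over the same grid
# follows below, and both searches write their CV tables plus a submission file.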
93 | random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='neg_log_loss', n_jobs=4,
94 | cv=skf.split(X, Y), verbose=3, random_state=1001)
95 |
96 | # Here we go
97 | start_time = timer(None) # timing starts from this point for "start_time" variable
98 | random_search.fit(X, Y)
99 | timer(start_time) # timing ends here for "start_time" variable
100 |
101 | print('\n All results:')
102 | print(random_search.cv_results_)
103 | print('\n Best estimator:')
104 | print(random_search.best_estimator_)
105 | print('\n Best neg_log_loss score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
106 | print(random_search.best_score_)
107 | print('\n Best hyperparameters:')
108 | print(random_search.best_params_)
109 | results = pd.DataFrame(random_search.cv_results_)
110 | results.to_csv('xgboost_random_grid_search_results_01.csv', index=False)
111 |
112 | pred = random_search.predict_proba(test[cols])[:, 1]  # the sklearn wrapper takes a DataFrame, not a DMatrix; keep the churn-probability column
113 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
114 | print(len(test))
115 | test[['msno', 'is_churn']].to_csv('submission_xgboost_random_serach_best_param.csv', index=False)
116 |
117 | grid = GridSearchCV(estimator=model, param_grid=params, scoring='neg_log_loss', n_jobs=4, cv=skf.split(X, Y), verbose=3)
118 | grid.fit(X, Y)
119 | print('\n All results:')
120 | print(grid.cv_results_)
121 | print('\n Best estimator:')
122 | print(grid.best_estimator_)
123 | print('\n Best score:')
124 | print(grid.best_score_)  # mean neg_log_loss across the CV folds
125 | print('\n Best parameters:')
126 | print(grid.best_params_)
127 | results = pd.DataFrame(grid.cv_results_)
128 | results.to_csv('xgboost_grid_search_results_01.csv', index=False)
129 |
130 | pred = grid.best_estimator_.predict_proba(test[cols])[:, 1]
131 | test['is_churn'] = pred.clip(0.0000001, 0.999999)
132 | print(len(test))
133 | test[['msno', 'is_churn']].to_csv('submission_xgboost_grid_search_best_param.csv', index=False)
134 |
--------------------------------------------------------------------------------