├── README.md
├── code
│   ├── .ipynb_checkpoints
│   │   ├── Baseline-checkpoint.ipynb
│   │   └── Baseline_bagging_version-checkpoint.ipynb
│   ├── Baseline.ipynb
│   └── Baseline_bagging_version.ipynb
└── input
    ├── submit_example.csv
    ├── test_dataset.csv
    └── train_dataset.csv
/README.md:
--------------------------------------------------------------------------------
1 | # Credit-Scoring-Regression by YourVenn@Kaggle
2 | - Open-source baseline for the 消费者人群画像—信用智能评分 (Consumer Persona: Intelligent Credit Scoring) competition
3 | - Put the raw competition data in input/
4 | - Baseline: 6379+ on the online leaderboard; Baseline_bagging_version: 6388+ online
5 | - If you find this useful, an upvote/star would be much appreciated. Thanks!
6 |
--------------------------------------------------------------------------------
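The "6379+" and "6388+" figures in the README quote the digits of the competition metric, which the notebooks estimate locally as 1 / (1 + MAE) averaged over the validation folds. A quick check of that arithmetic, using the five fold MAEs recorded in the Baseline training log (a minimal sketch by the editor, not part of the original code):

```python
# Fold MAEs copied from the Baseline notebook's training output (rounded in the log).
fold_mae = [14.7519, 14.6775, 14.728, 14.46, 14.7818]

mean_mae = sum(fold_mae) / len(fold_mae)   # ~14.680
cv_score = 1 / (1 + mean_mae)              # the "cv score for valid" formula used in the notebooks

print(cv_score)                            # ~0.063776
```

This agrees with the "cv score for valid is: 0.06377613710442855" line printed at the end of the Baseline notebook (up to rounding of the logged MAEs); the slightly higher numbers quoted in the README are the corresponding online leaderboard scores.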
/code/.ipynb_checkpoints/Baseline-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 14,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#过年没时间做了,专心搞kaggle去了\n",
19 | "#kaggle玩家欢迎和我交流,ID是YourVenn"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "import time\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import seaborn as sns\n",
43 | "import numpy as np\n",
44 | "import pandas as pd\n",
45 | "import lightgbm as lgb\n",
46 | "from sklearn.model_selection import StratifiedKFold\n",
47 | "from sklearn.preprocessing import LabelEncoder"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Input data"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "data_path = '../input/'\n",
66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### Pre-processing"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "
\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " 用户编码 | \n",
105 | " 用户实名制是否通过核实 | \n",
106 | " 用户年龄 | \n",
107 | " 是否大学生客户 | \n",
108 | " 是否黑名单客户 | \n",
109 | " 是否4G不健康客户 | \n",
110 | " 用户网龄(月) | \n",
111 | " 用户最近一次缴费距今时长(月) | \n",
112 | " 缴费用户最近一次缴费金额(元) | \n",
113 | " 用户近6个月平均消费值(元) | \n",
114 | " ... | \n",
115 | " 当月是否景点游览 | \n",
116 | " 当月是否体育场馆消费 | \n",
117 | " 当月网购类应用使用次数 | \n",
118 | " 当月物流快递类应用使用次数 | \n",
119 | " 当月金融理财类应用使用总次数 | \n",
120 | " 当月视频播放类应用使用次数 | \n",
121 | " 当月飞机类应用使用次数 | \n",
122 | " 当月火车类应用使用次数 | \n",
123 | " 当月旅游资讯类应用使用次数 | \n",
124 | " 信用分 | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " 0 | \n",
130 | " a4651f98c82948b186bdcdc8108381b4 | \n",
131 | " 1 | \n",
132 | " 44 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 186 | \n",
137 | " 1 | \n",
138 | " 99.8 | \n",
139 | " 163.86 | \n",
140 | " ... | \n",
141 | " 1 | \n",
142 | " 1 | \n",
143 | " 713 | \n",
144 | " 0 | \n",
145 | " 2740 | \n",
146 | " 7145 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 30 | \n",
150 | " 664 | \n",
151 | "
\n",
152 | " \n",
153 | "
\n",
154 | "
1 rows × 30 columns
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
160 | "\n",
161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
162 | "0 0 186 1 99.8 163.86 ... \n",
163 | "\n",
164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
165 | "0 1 1 713 0 2740 \n",
166 | "\n",
167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
168 | "0 7145 0 0 30 664 \n",
169 | "\n",
170 | "[1 rows x 30 columns]"
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "train_data.head(1)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
197 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
198 | " dtype='object')\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "#all chinese name- -\n",
204 | "#rename one by one\n",
205 | "print(train_data.columns)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
223 | " 'tour_app_count','score']\n",
224 | "test_data.columns = train_data.columns[:-1]"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Feature Engineering"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 6,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stderr",
241 | "output_type": "stream",
242 | "text": [
243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
244 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
245 | "\n",
246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
247 | " \n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
254 | "\n",
255 | "def produce_offline_feat(train_data):\n",
256 | " train_data['top_up_amount_offline'] = 0\n",
257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
258 | " train_data['top_up_amount'] != 0] = 1\n",
259 | " return train_data\n",
260 | "\n",
261 | "train_data = produce_offline_feat(train_data)\n",
262 | "test_data = produce_offline_feat(test_data)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 7,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "def produce_fee_rate(train_data):\n",
274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
275 | " train_data['current_fee_stability'] = \\\n",
276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
277 | " \n",
278 | " #当月话费/当月账户余额\n",
279 | " train_data['use_left_rate'] = \\\n",
280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
281 | " return train_data\n",
282 | "\n",
283 | "train_data = produce_fee_rate(train_data)\n",
284 | "test_data = produce_fee_rate(test_data)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### Training"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "def display_importances(feature_importance_df_):\n",
303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
305 | " plt.figure(figsize=(8, 10))\n",
306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
307 | " plt.title('LightGBM Features (avg over folds)')\n",
308 | " plt.tight_layout()\n",
309 | " plt.show()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "#para\n",
321 | "params = {\n",
322 | " 'learning_rate': 0.01,\n",
323 | " 'boosting_type': 'gbdt',\n",
324 | " 'objective': 'regression_l1',\n",
325 | " 'metric': 'mae',\n",
326 | " 'feature_fraction': 0.6,\n",
327 | " 'bagging_fraction': 0.8,\n",
328 | " 'bagging_freq': 2,\n",
329 | " 'num_leaves': 31,\n",
330 | " 'verbose': -1,\n",
331 | " 'max_depth': 5,\n",
332 | " 'lambda_l2': 5, 'lambda_l1': 0\n",
333 | "}"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 10,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "NFOLDS = 5\n",
345 | "train_label = train_data['score']\n",
346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n",
347 | "kf = kfold.split(train_data, train_label)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 11,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n",
359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n",
360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n",
361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n",
362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n",
363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n",
364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n",
365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n",
366 | " 'score', 'top_up_amount_offline', 'current_fee_stability',\n",
367 | " 'use_left_rate'],\n",
368 | " dtype='object')"
369 | ]
370 | },
371 | "execution_count": 11,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": [
377 | "train_data.columns"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 12,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 13,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stderr",
399 | "output_type": "stream",
400 | "text": [
401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
402 | " % (min_groups, self.n_splits)), Warning)\n"
403 | ]
404 | },
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "fold: 0 training\n",
410 | "Training until validation scores don't improve for 50 rounds.\n",
411 | "Early stopping, best iteration is:\n",
412 | "[2834]\tvalid_0's l1: 14.7519\n",
413 | "fold: 1 training\n",
414 | "Training until validation scores don't improve for 50 rounds.\n",
415 | "Early stopping, best iteration is:\n",
416 | "[2780]\tvalid_0's l1: 14.6775\n",
417 | "fold: 2 training\n",
418 | "Training until validation scores don't improve for 50 rounds.\n",
419 | "Early stopping, best iteration is:\n",
420 | "[3745]\tvalid_0's l1: 14.728\n",
421 | "fold: 3 training\n",
422 | "Training until validation scores don't improve for 50 rounds.\n",
423 | "Early stopping, best iteration is:\n",
424 | "[3009]\tvalid_0's l1: 14.46\n",
425 | "fold: 4 training\n",
426 | "Training until validation scores don't improve for 50 rounds.\n",
427 | "Early stopping, best iteration is:\n",
428 | "[2544]\tvalid_0's l1: 14.7818\n",
429 | "cv score for valid is: 0.06377613710442855\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "cv_pred = np.zeros(test_data.shape[0])\n",
435 | "valid_best_l2_all = 0\n",
436 | "\n",
437 | "feature_importance_df = pd.DataFrame()\n",
438 | "count = 0\n",
439 | "for i, (train_fold, validate) in enumerate(kf):\n",
440 | " print('fold: ',i, ' training')\n",
441 | " X_train, X_validate, label_train, label_validate = \\\n",
442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
443 | " train_label[train_fold], train_label[validate]\n",
444 | " dtrain = lgb.Dataset(X_train, label_train)\n",
445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
449 | "\n",
450 | " fold_importance_df = pd.DataFrame()\n",
451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n",
452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n",
453 | " fold_importance_df[\"fold\"] = count + 1\n",
454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
455 | " count += 1\n",
456 | "\n",
457 | "cv_pred /= NFOLDS\n",
458 | "valid_best_l2_all /= NFOLDS\n",
459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 15,
465 | "metadata": {
466 | "collapsed": true
467 | },
468 | "outputs": [],
469 | "source": [
470 | "display_importances(feature_importance_df)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": [
481 | "baseline\n",
482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
483 | " \n",
484 | "#充值金额是否为整数\n",
485 | "cv score for valid is: 0.06343660584697094\n",
486 | "#当月话费/半年话费\n",
487 | "cv score for valid is: 0.06349188259250227\n",
488 | "#当月话费/余额\n",
489 | "cv score for valid is: 0.06350638782547711\n",
490 | " \n",
491 | "#leaves 31\n",
492 | "cv score for valid is: 0.06354362406472286\n",
493 | "#remove l1, l2 = 5\n",
494 | "cv score for valid is: 0.06358730556250403\n",
495 | "#feature fraction 0.7\n",
496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
497 | "max_depth 5, objective l1\n",
498 | "cv score for valid is: 0.06367445081783887\n",
499 | "feature fraction 0.6\n",
500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
501 | "remove blk flag\n",
502 | "cv score for valid is: 0.06377613710442855"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "### Submit"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 39,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stderr",
519 | "output_type": "stream",
520 | "text": [
521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
523 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
524 | "\n",
525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
526 | " \n"
527 | ]
528 | }
529 | ],
530 | "source": [
531 | "test_data_sub = test_data[['uid']]\n",
532 | "test_data_sub['score'] = cv_pred\n",
533 | "test_data_sub.columns = ['id','score']"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 40,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stderr",
543 | "output_type": "stream",
544 | "text": [
545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
547 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
548 | "\n",
549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
550 | " \"\"\"Entry point for launching an IPython kernel.\n"
551 | ]
552 | }
553 | ],
554 | "source": [
555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 41,
561 | "metadata": {
562 | "collapsed": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {
573 | "collapsed": true
574 | },
575 | "outputs": [],
576 | "source": []
577 | }
578 | ],
579 | "metadata": {
580 | "kernelspec": {
581 | "display_name": "Python 3",
582 | "language": "python",
583 | "name": "python3"
584 | },
585 | "language_info": {
586 | "codemirror_mode": {
587 | "name": "ipython",
588 | "version": 3
589 | },
590 | "file_extension": ".py",
591 | "mimetype": "text/x-python",
592 | "name": "python",
593 | "nbconvert_exporter": "python",
594 | "pygments_lexer": "ipython3",
595 | "version": "3.6.1"
596 | }
597 | },
598 | "nbformat": 4,
599 | "nbformat_minor": 2
600 | }
601 |
--------------------------------------------------------------------------------
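The SettingWithCopyWarning captured in the outputs above comes from the chained indexing inside produce_offline_feat (column selection followed by boolean-mask assignment). A minimal warning-free sketch of the same feature, using a boolean mask and an explicit copy (an editor's illustration, not code from the notebook):

```python
import pandas as pd


def produce_offline_feat(df: pd.DataFrame) -> pd.DataFrame:
    """Flag top-ups whose amount is a nonzero multiple of 10.

    Same idea as the notebook: round amounts presumably come from a
    different (offline) top-up channel than amounts with decimals.
    """
    df = df.copy()  # work on a copy so we never assign into a view
    is_offline = (df['top_up_amount'] % 10 == 0) & (df['top_up_amount'] != 0)
    df['top_up_amount_offline'] = is_offline.astype(int)
    return df
```

Because the assignment targets a fresh copy, pandas no longer has to guess whether a view or a copy is being modified, which is what triggers the warning in the original cell.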
/code/.ipynb_checkpoints/Baseline_bagging_version-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
20 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
21 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
22 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
23 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import time\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "import seaborn as sns\n",
31 | "import numpy as np\n",
32 | "import pandas as pd\n",
33 | "import lightgbm as lgb\n",
34 | "from sklearn.model_selection import StratifiedKFold\n",
35 | "from sklearn.preprocessing import LabelEncoder"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Input data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "data_path = '../input/'\n",
54 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
55 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
56 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Pre-processing"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/html": [
74 | "\n",
75 | "\n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | " | \n",
92 | " 用户编码 | \n",
93 | " 用户实名制是否通过核实 | \n",
94 | " 用户年龄 | \n",
95 | " 是否大学生客户 | \n",
96 | " 是否黑名单客户 | \n",
97 | " 是否4G不健康客户 | \n",
98 | " 用户网龄(月) | \n",
99 | " 用户最近一次缴费距今时长(月) | \n",
100 | " 缴费用户最近一次缴费金额(元) | \n",
101 | " 用户近6个月平均消费值(元) | \n",
102 | " ... | \n",
103 | " 当月是否景点游览 | \n",
104 | " 当月是否体育场馆消费 | \n",
105 | " 当月网购类应用使用次数 | \n",
106 | " 当月物流快递类应用使用次数 | \n",
107 | " 当月金融理财类应用使用总次数 | \n",
108 | " 当月视频播放类应用使用次数 | \n",
109 | " 当月飞机类应用使用次数 | \n",
110 | " 当月火车类应用使用次数 | \n",
111 | " 当月旅游资讯类应用使用次数 | \n",
112 | " 信用分 | \n",
113 | "
\n",
114 | " \n",
115 | " \n",
116 | " \n",
117 | " 0 | \n",
118 | " a4651f98c82948b186bdcdc8108381b4 | \n",
119 | " 1 | \n",
120 | " 44 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " 0 | \n",
124 | " 186 | \n",
125 | " 1 | \n",
126 | " 99.8 | \n",
127 | " 163.86 | \n",
128 | " ... | \n",
129 | " 1 | \n",
130 | " 1 | \n",
131 | " 713 | \n",
132 | " 0 | \n",
133 | " 2740 | \n",
134 | " 7145 | \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 30 | \n",
138 | " 664 | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
1 rows × 30 columns
\n",
143 | "
"
144 | ],
145 | "text/plain": [
146 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
147 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
148 | "\n",
149 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
150 | "0 0 186 1 99.8 163.86 ... \n",
151 | "\n",
152 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
153 | "0 1 1 713 0 2740 \n",
154 | "\n",
155 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
156 | "0 7145 0 0 30 664 \n",
157 | "\n",
158 | "[1 rows x 30 columns]"
159 | ]
160 | },
161 | "execution_count": 3,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "train_data.head(1)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 4,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
180 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
181 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
182 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
183 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
184 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
185 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
186 | " dtype='object')\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "#all chinese name- -\n",
192 | "#rename one by one\n",
193 | "print(train_data.columns)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 5,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
205 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
206 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
207 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
208 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
209 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
210 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
211 | " 'tour_app_count','score']\n",
212 | "test_data.columns = train_data.columns[:-1]"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": true
220 | },
221 | "outputs": [],
222 | "source": []
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "### Feature Engineering"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 6,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
241 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
242 | "\n",
243 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
244 | " # This is added back by InteractiveShellApp.init_path()\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
250 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n",
251 | "#充值金额/余额 --- useless\n",
252 | "#当月话费/最近充值金额 --- useless\n",
253 | "#六个月均值/充值金额 --- useless\n",
254 | "\n",
255 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
256 | "\n",
257 | "def produce_offline_feat(train_data):\n",
258 | " train_data['top_up_amount_offline'] = 0\n",
259 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
260 | " train_data['top_up_amount'] != 0] = 1\n",
261 | " return train_data\n",
262 | "\n",
263 | "train_data = produce_offline_feat(train_data)\n",
264 | "test_data = produce_offline_feat(test_data)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 7,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "def produce_fee_rate(train_data):\n",
276 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
277 | " train_data['current_fee_stability'] = \\\n",
278 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
279 | " \n",
280 | " #当月话费/当月账户余额\n",
281 | " train_data['use_left_rate'] = \\\n",
282 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
283 | " return train_data\n",
284 | "\n",
285 | "train_data = produce_fee_rate(train_data)\n",
286 | "test_data = produce_fee_rate(test_data)"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "### Training"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 8,
299 | "metadata": {
300 | "collapsed": true
301 | },
302 | "outputs": [],
303 | "source": [
304 | "def display_importances(feature_importance_df_):\n",
305 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
306 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
307 | " plt.figure(figsize=(8, 10))\n",
308 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
309 | " plt.title('LightGBM Features (avg over folds)')\n",
310 | " plt.tight_layout()\n",
311 | " plt.show()"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 9,
317 | "metadata": {
318 | "collapsed": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "#para\n",
323 | "params = {\n",
324 | " 'learning_rate': 0.01,\n",
325 | " 'boosting_type': 'gbdt',\n",
326 | " 'objective': 'regression_l1',\n",
327 | " 'metric': 'mae',\n",
328 | " 'feature_fraction': 0.6,\n",
329 | " 'bagging_fraction': 0.8,\n",
330 | " 'bagging_freq': 2,\n",
331 | " 'num_leaves': 31,\n",
332 | " 'verbose': -1,\n",
333 | " 'max_depth': 5,\n",
334 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n",
335 | "}"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 10,
341 | "metadata": {
342 | "collapsed": true
343 | },
344 | "outputs": [],
345 | "source": [
346 | "#para\n",
347 | "params2 = {\n",
348 | " 'learning_rate': 0.01,\n",
349 | " 'boosting_type': 'gbdt',\n",
350 | " 'objective': 'regression_l2',\n",
351 | " 'metric': 'mae',\n",
352 | " 'feature_fraction': 0.6,\n",
353 | " 'bagging_fraction': 0.8,\n",
354 | " 'bagging_freq': 2,\n",
355 | " 'num_leaves': 31,\n",
356 | " 'verbose': -1,\n",
357 | " 'max_depth': 5,\n",
358 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n",
359 | " 'seed': 89\n",
360 | "}"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 11,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stderr",
370 | "output_type": "stream",
371 | "text": [
372 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
373 | " % (min_groups, self.n_splits)), Warning)\n"
374 | ]
375 | },
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "fold: 0 training\n",
381 | "Training until validation scores don't improve for 50 rounds.\n",
382 | "Early stopping, best iteration is:\n",
383 | "[2433]\tvalid_0's l1: 14.7441\n",
384 | "fold: 1 training\n",
385 | "Training until validation scores don't improve for 50 rounds.\n",
386 | "Early stopping, best iteration is:\n",
387 | "[1876]\tvalid_0's l1: 14.8595\n",
388 | "fold: 2 training\n",
389 | "Training until validation scores don't improve for 50 rounds.\n",
390 | "Early stopping, best iteration is:\n",
391 | "[2459]\tvalid_0's l1: 14.7082\n",
392 | "fold: 3 training\n",
393 | "Training until validation scores don't improve for 50 rounds.\n",
394 | "Early stopping, best iteration is:\n",
395 | "[2468]\tvalid_0's l1: 14.6564\n",
396 | "fold: 4 training\n",
397 | "Training until validation scores don't improve for 50 rounds.\n",
398 | "Early stopping, best iteration is:\n",
399 | "[2599]\tvalid_0's l1: 14.5114\n",
400 | "fold: 0 training\n",
401 | "Training until validation scores don't improve for 50 rounds.\n",
402 | "Early stopping, best iteration is:\n",
403 | "[3313]\tvalid_0's l1: 14.743\n",
404 | "fold: 1 training\n",
405 | "Training until validation scores don't improve for 50 rounds.\n",
406 | "Early stopping, best iteration is:\n",
407 | "[2590]\tvalid_0's l1: 14.8562\n",
408 | "fold: 2 training\n",
409 | "Training until validation scores don't improve for 50 rounds.\n",
410 | "Early stopping, best iteration is:\n",
411 | "[2523]\tvalid_0's l1: 14.5752\n",
412 | "fold: 3 training\n",
413 | "Training until validation scores don't improve for 50 rounds.\n",
414 | "Early stopping, best iteration is:\n",
415 | "[3564]\tvalid_0's l1: 14.6125\n",
416 | "fold: 4 training\n",
417 | "Training until validation scores don't improve for 50 rounds.\n",
418 | "Early stopping, best iteration is:\n",
419 | "[1853]\tvalid_0's l1: 14.6333\n",
420 | "fold: 0 training\n",
421 | "Training until validation scores don't improve for 50 rounds.\n",
422 | "Early stopping, best iteration is:\n",
423 | "[2851]\tvalid_0's l1: 14.9587\n",
424 | "fold: 1 training\n",
425 | "Training until validation scores don't improve for 50 rounds.\n",
426 | "Early stopping, best iteration is:\n",
427 | "[1875]\tvalid_0's l1: 14.7808\n",
428 | "fold: 2 training\n",
429 | "Training until validation scores don't improve for 50 rounds.\n",
430 | "Early stopping, best iteration is:\n",
431 | "[2957]\tvalid_0's l1: 14.5525\n",
432 | "fold: 3 training\n",
433 | "Training until validation scores don't improve for 50 rounds.\n",
434 | "Early stopping, best iteration is:\n",
435 | "[2723]\tvalid_0's l1: 14.4804\n",
436 | "fold: 4 training\n",
437 | "Training until validation scores don't improve for 50 rounds.\n",
438 | "Early stopping, best iteration is:\n",
439 | "[3311]\tvalid_0's l1: 14.6854\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "cv_pred_all = 0\n",
445 | "en_amount = 3\n",
446 | "for seed in range(en_amount):\n",
447 | " NFOLDS = 5\n",
448 | " train_label = train_data['score']\n",
449 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n",
450 | " kf = kfold.split(train_data, train_label)\n",
451 | "\n",
452 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
453 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
454 | "\n",
455 | "\n",
456 | " cv_pred = np.zeros(test_data.shape[0])\n",
457 | " valid_best_l2_all = 0\n",
458 | "\n",
459 | " feature_importance_df = pd.DataFrame()\n",
460 | " count = 0\n",
461 | " for i, (train_fold, validate) in enumerate(kf):\n",
462 | " print('fold: ',i, ' training')\n",
463 | " X_train, X_validate, label_train, label_validate = \\\n",
464 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
465 | " train_label[train_fold], train_label[validate]\n",
466 | " dtrain = lgb.Dataset(X_train, label_train)\n",
467 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
468 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
469 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
470 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
471 | "\n",
472 | "# fold_importance_df = pd.DataFrame()\n",
473 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
474 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
475 | "# fold_importance_df[\"fold\"] = count + 1\n",
476 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
477 | " count += 1\n",
478 | "\n",
479 | " cv_pred /= NFOLDS\n",
480 | " valid_best_l2_all /= NFOLDS\n",
481 | " \n",
482 | " cv_pred_all += cv_pred\n",
483 | "cv_pred_all /= en_amount\n",
484 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 12,
490 | "metadata": {},
491 | "outputs": [
492 | {
493 | "name": "stderr",
494 | "output_type": "stream",
495 | "text": [
496 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
497 | " % (min_groups, self.n_splits)), Warning)\n"
498 | ]
499 | },
500 | {
501 | "name": "stdout",
502 | "output_type": "stream",
503 | "text": [
504 | "fold: 0 training\n",
505 | "Training until validation scores don't improve for 50 rounds.\n",
506 | "Early stopping, best iteration is:\n",
507 | "[2457]\tvalid_0's l1: 14.7871\n",
508 | "fold: 1 training\n",
509 | "Training until validation scores don't improve for 50 rounds.\n",
510 | "Early stopping, best iteration is:\n",
511 | "[2365]\tvalid_0's l1: 14.6983\n",
512 | "fold: 2 training\n",
513 | "Training until validation scores don't improve for 50 rounds.\n",
514 | "Early stopping, best iteration is:\n",
515 | "[2082]\tvalid_0's l1: 14.7999\n",
516 | "fold: 3 training\n",
517 | "Training until validation scores don't improve for 50 rounds.\n",
518 | "Early stopping, best iteration is:\n",
519 | "[2266]\tvalid_0's l1: 14.483\n",
520 | "fold: 4 training\n",
521 | "Training until validation scores don't improve for 50 rounds.\n",
522 | "Early stopping, best iteration is:\n",
523 | "[2046]\tvalid_0's l1: 14.7681\n",
524 | "fold: 0 training\n",
525 | "Training until validation scores don't improve for 50 rounds.\n",
526 | "Early stopping, best iteration is:\n",
527 | "[2436]\tvalid_0's l1: 14.7728\n",
528 | "fold: 1 training\n",
529 | "Training until validation scores don't improve for 50 rounds.\n",
530 | "Early stopping, best iteration is:\n",
531 | "[2053]\tvalid_0's l1: 14.8066\n",
532 | "fold: 2 training\n",
533 | "Training until validation scores don't improve for 50 rounds.\n",
534 | "Early stopping, best iteration is:\n",
535 | "[2221]\tvalid_0's l1: 14.5464\n",
536 | "fold: 3 training\n",
537 | "Training until validation scores don't improve for 50 rounds.\n",
538 | "Early stopping, best iteration is:\n",
539 | "[2348]\tvalid_0's l1: 14.5198\n",
540 | "fold: 4 training\n",
541 | "Training until validation scores don't improve for 50 rounds.\n",
542 | "Early stopping, best iteration is:\n",
543 | "[2207]\tvalid_0's l1: 14.8169\n",
544 | "fold: 0 training\n",
545 | "Training until validation scores don't improve for 50 rounds.\n",
546 | "Early stopping, best iteration is:\n",
547 | "[2110]\tvalid_0's l1: 14.5323\n",
548 | "fold: 1 training\n",
549 | "Training until validation scores don't improve for 50 rounds.\n",
550 | "Early stopping, best iteration is:\n",
551 | "[2627]\tvalid_0's l1: 14.8493\n",
552 | "fold: 2 training\n",
553 | "Training until validation scores don't improve for 50 rounds.\n",
554 | "Early stopping, best iteration is:\n",
555 | "[2040]\tvalid_0's l1: 14.8335\n",
556 | "fold: 3 training\n",
557 | "Training until validation scores don't improve for 50 rounds.\n",
558 | "Early stopping, best iteration is:\n",
559 | "[2241]\tvalid_0's l1: 14.6379\n",
560 | "fold: 4 training\n",
561 | "Training until validation scores don't improve for 50 rounds.\n",
562 | "Early stopping, best iteration is:\n",
563 | "[2424]\tvalid_0's l1: 14.6794\n"
564 | ]
565 | }
566 | ],
567 | "source": [
568 | "cv_pred_all2 = 0\n",
569 | "en_amount = 3\n",
570 | "for seed in range(en_amount):\n",
571 | " NFOLDS = 5\n",
572 | " train_label = train_data['score']\n",
573 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n",
574 | " kf = kfold.split(train_data, train_label)\n",
575 | "\n",
576 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
577 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
578 | "\n",
579 | "\n",
580 | " cv_pred = np.zeros(test_data.shape[0])\n",
581 | " valid_best_l2_all = 0\n",
582 | "\n",
583 | " feature_importance_df = pd.DataFrame()\n",
584 | " count = 0\n",
585 | " for i, (train_fold, validate) in enumerate(kf):\n",
586 | " print('fold: ',i, ' training')\n",
587 | " X_train, X_validate, label_train, label_validate = \\\n",
588 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
589 | " train_label[train_fold], train_label[validate]\n",
590 | " dtrain = lgb.Dataset(X_train, label_train)\n",
591 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
592 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
593 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
594 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
595 | "\n",
596 | "# fold_importance_df = pd.DataFrame()\n",
597 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
598 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
599 | "# fold_importance_df[\"fold\"] = count + 1\n",
600 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
601 | " count += 1\n",
602 | "\n",
603 | " cv_pred /= NFOLDS\n",
604 | " valid_best_l2_all /= NFOLDS\n",
605 | " \n",
606 | " cv_pred_all2 += cv_pred\n",
607 | " \n",
608 | "cv_pred_all2 /= en_amount\n",
609 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 13,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": [
620 | "# display_importances(feature_importance_df)"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {
627 | "collapsed": true
628 | },
629 | "outputs": [],
630 | "source": [
631 | "baseline\n",
632 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
633 | " \n",
634 | "#充值金额是否为整数\n",
635 | "cv score for valid is: 0.06343660584697094\n",
636 | "#当月话费/半年话费\n",
637 | "cv score for valid is: 0.06349188259250227\n",
638 | "#当月话费/余额\n",
639 | "cv score for valid is: 0.06350638782547711\n",
640 | " \n",
641 | "#leaves 31\n",
642 | "cv score for valid is: 0.06354362406472286\n",
643 | "#remove l1, l2 = 5\n",
644 | "cv score for valid is: 0.06358730556250403\n",
645 | "#feature fraction 0.7\n",
646 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
647 | "max_depth 5, objective l1\n",
648 | "cv score for valid is: 0.06367445081783887\n",
649 | "feature fraction 0.6\n",
650 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
651 | "10 fold\n",
652 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n",
653 | "remove blk list flag\n",
654 | "cv score for valid is: 0.06377613710442855"
655 | ]
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "metadata": {},
660 | "source": [
661 | "### Submit"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 14,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stderr",
671 | "output_type": "stream",
672 | "text": [
673 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
674 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
675 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
676 | "\n",
677 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
678 | " \n",
679 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
680 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
681 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
682 | "\n",
683 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
684 | " after removing the cwd from sys.path.\n",
685 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
686 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
687 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
688 | "\n",
689 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
690 | " \"\"\"\n"
691 | ]
692 | }
693 | ],
694 | "source": [
695 | "test_data_sub = test_data[['uid']]\n",
696 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n",
697 | "test_data_sub.columns = ['id','score']\n",
698 | "test_data_sub['score1'] = cv_pred_all\n",
699 | "test_data_sub['score2'] = cv_pred_all2"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": 17,
705 | "metadata": {},
706 | "outputs": [
707 | {
708 | "name": "stderr",
709 | "output_type": "stream",
710 | "text": [
711 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
712 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
713 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
714 | "\n",
715 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
716 | " \"\"\"Entry point for launching an IPython kernel.\n"
717 | ]
718 | }
719 | ],
720 | "source": [
721 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
722 | ]
723 | },
724 | {
725 | "cell_type": "code",
726 | "execution_count": 18,
727 | "metadata": {
728 | "collapsed": true
729 | },
730 | "outputs": [],
731 | "source": [
732 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 31,
738 | "metadata": {},
739 | "outputs": [
740 | {
741 | "data": {
742 | "text/plain": [
743 | "617.8386873193765"
744 | ]
745 | },
746 | "execution_count": 31,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n",
753 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": null,
759 | "metadata": {
760 | "collapsed": true
761 | },
762 | "outputs": [],
763 | "source": []
764 | }
765 | ],
766 | "metadata": {
767 | "kernelspec": {
768 | "display_name": "Python 3",
769 | "language": "python",
770 | "name": "python3"
771 | },
772 | "language_info": {
773 | "codemirror_mode": {
774 | "name": "ipython",
775 | "version": 3
776 | },
777 | "file_extension": ".py",
778 | "mimetype": "text/x-python",
779 | "name": "python",
780 | "nbconvert_exporter": "python",
781 | "pygments_lexer": "ipython3",
782 | "version": "3.6.1"
783 | }
784 | },
785 | "nbformat": 4,
786 | "nbformat_minor": 2
787 | }
788 |
--------------------------------------------------------------------------------
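Both training cells in Baseline_bagging_version repeat the same pattern: 5-fold cross-validation wrapped in a loop over three StratifiedKFold seeds, with test predictions summed over folds and seeds, and the two runs (the regression_l1 params and the regression_l2 params2) blended 50/50 into the submission. A condensed sketch of that pattern as one reusable helper (hypothetical function name; the LightGBM calls mirror the notebook's LightGBM 2.x-era API):

```python
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold


def seed_bagged_predict(params, X, y, X_test, seeds, n_folds=5):
    """Average LightGBM test predictions over several CV seeds (editor's sketch)."""
    test_pred = np.zeros(len(X_test))
    for seed in seeds:
        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        for train_idx, valid_idx in kfold.split(X, y):
            dtrain = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
            dvalid = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx], reference=dtrain)
            # verbose_eval / early_stopping_rounds follow the signature used in the notebook
            bst = lgb.train(params, dtrain, num_boost_round=10000,
                            valid_sets=dvalid, verbose_eval=-1,
                            early_stopping_rounds=50)
            test_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
    # mean over every (seed, fold) model
    return test_pred / (len(seeds) * n_folds)


# Usage corresponding to the notebook's final blend (params / params2 as defined in the notebook):
# pred_mae = seed_bagged_predict(params,  train_data_use, train_label, test_data_use, seeds=[0, 1, 2])
# pred_mse = seed_bagged_predict(params2, train_data_use, train_label, test_data_use, seeds=[2019, 2020, 2021])
# blended  = (pred_mae + pred_mse) / 2
```

The only substantive difference between the two loops in the notebook is the objective (MAE vs. MSE) and the seed offset; the folds, early stopping, and prediction averaging are identical.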
/code/Baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 14,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#过年没时间做了,专心搞kaggle去了\n",
19 | "#kaggle玩家欢迎和我交流,ID是YourVenn"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "import time\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import seaborn as sns\n",
43 | "import numpy as np\n",
44 | "import pandas as pd\n",
45 | "import lightgbm as lgb\n",
46 | "from sklearn.model_selection import StratifiedKFold\n",
47 | "from sklearn.preprocessing import LabelEncoder"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Input data"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "data_path = '../input/'\n",
66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### Pre-processing"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " 用户编码 | \n",
105 | " 用户实名制是否通过核实 | \n",
106 | " 用户年龄 | \n",
107 | " 是否大学生客户 | \n",
108 | " 是否黑名单客户 | \n",
109 | " 是否4G不健康客户 | \n",
110 | " 用户网龄(月) | \n",
111 | " 用户最近一次缴费距今时长(月) | \n",
112 | " 缴费用户最近一次缴费金额(元) | \n",
113 | " 用户近6个月平均消费值(元) | \n",
114 | " ... | \n",
115 | " 当月是否景点游览 | \n",
116 | " 当月是否体育场馆消费 | \n",
117 | " 当月网购类应用使用次数 | \n",
118 | " 当月物流快递类应用使用次数 | \n",
119 | " 当月金融理财类应用使用总次数 | \n",
120 | " 当月视频播放类应用使用次数 | \n",
121 | " 当月飞机类应用使用次数 | \n",
122 | " 当月火车类应用使用次数 | \n",
123 | " 当月旅游资讯类应用使用次数 | \n",
124 | " 信用分 | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " 0 | \n",
130 | " a4651f98c82948b186bdcdc8108381b4 | \n",
131 | " 1 | \n",
132 | " 44 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 186 | \n",
137 | " 1 | \n",
138 | " 99.8 | \n",
139 | " 163.86 | \n",
140 | " ... | \n",
141 | " 1 | \n",
142 | " 1 | \n",
143 | " 713 | \n",
144 | " 0 | \n",
145 | " 2740 | \n",
146 | " 7145 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 30 | \n",
150 | " 664 | \n",
151 | "
\n",
152 | " \n",
153 | "
\n",
154 | "
1 rows × 30 columns
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
160 | "\n",
161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
162 | "0 0 186 1 99.8 163.86 ... \n",
163 | "\n",
164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
165 | "0 1 1 713 0 2740 \n",
166 | "\n",
167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
168 | "0 7145 0 0 30 664 \n",
169 | "\n",
170 | "[1 rows x 30 columns]"
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "train_data.head(1)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
197 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
198 | " dtype='object')\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "#all chinese name- -\n",
204 | "#rename one by one\n",
205 | "print(train_data.columns)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
223 | " 'tour_app_count','score']\n",
224 | "test_data.columns = train_data.columns[:-1]"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Feature Engineering"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 6,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stderr",
241 | "output_type": "stream",
242 | "text": [
243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
244 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
245 | "\n",
246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
247 | " \n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
254 | "\n",
255 | "def produce_offline_feat(train_data):\n",
256 | " train_data['top_up_amount_offline'] = 0\n",
257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
258 | " train_data['top_up_amount'] != 0] = 1\n",
259 | " return train_data\n",
260 | "\n",
261 | "train_data = produce_offline_feat(train_data)\n",
262 | "test_data = produce_offline_feat(test_data)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 7,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "def produce_fee_rate(train_data):\n",
274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
275 | " train_data['current_fee_stability'] = \\\n",
276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
277 | " \n",
278 | " #当月话费/当月账户余额\n",
279 | " train_data['use_left_rate'] = \\\n",
280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
281 | " return train_data\n",
282 | "\n",
283 | "train_data = produce_fee_rate(train_data)\n",
284 | "test_data = produce_fee_rate(test_data)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### Training"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "def display_importances(feature_importance_df_):\n",
303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
305 | " plt.figure(figsize=(8, 10))\n",
306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
307 | " plt.title('LightGBM Features (avg over folds)')\n",
308 | " plt.tight_layout()\n",
309 | " plt.show()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "#para\n",
321 | "params = {\n",
322 | " 'learning_rate': 0.01,\n",
323 | " 'boosting_type': 'gbdt',\n",
324 | " 'objective': 'regression_l1',\n",
325 | " 'metric': 'mae',\n",
326 | " 'feature_fraction': 0.6,\n",
327 | " 'bagging_fraction': 0.8,\n",
328 | " 'bagging_freq': 2,\n",
329 | " 'num_leaves': 31,\n",
330 | " 'verbose': -1,\n",
331 | " 'max_depth': 5,\n",
332 | " 'lambda_l2': 5, 'lambda_l1': 0\n",
333 | "}"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 10,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "NFOLDS = 5\n",
345 | "train_label = train_data['score']\n",
346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n",
347 | "kf = kfold.split(train_data, train_label)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 11,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n",
359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n",
360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n",
361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n",
362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n",
363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n",
364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n",
365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n",
366 | " 'score', 'top_up_amount_offline', 'current_fee_stability',\n",
367 | " 'use_left_rate'],\n",
368 | " dtype='object')"
369 | ]
370 | },
371 | "execution_count": 11,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": [
377 | "train_data.columns"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 12,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 13,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stderr",
399 | "output_type": "stream",
400 | "text": [
401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
402 | " % (min_groups, self.n_splits)), Warning)\n"
403 | ]
404 | },
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "fold: 0 training\n",
410 | "Training until validation scores don't improve for 50 rounds.\n",
411 | "Early stopping, best iteration is:\n",
412 | "[2834]\tvalid_0's l1: 14.7519\n",
413 | "fold: 1 training\n",
414 | "Training until validation scores don't improve for 50 rounds.\n",
415 | "Early stopping, best iteration is:\n",
416 | "[2780]\tvalid_0's l1: 14.6775\n",
417 | "fold: 2 training\n",
418 | "Training until validation scores don't improve for 50 rounds.\n",
419 | "Early stopping, best iteration is:\n",
420 | "[3745]\tvalid_0's l1: 14.728\n",
421 | "fold: 3 training\n",
422 | "Training until validation scores don't improve for 50 rounds.\n",
423 | "Early stopping, best iteration is:\n",
424 | "[3009]\tvalid_0's l1: 14.46\n",
425 | "fold: 4 training\n",
426 | "Training until validation scores don't improve for 50 rounds.\n",
427 | "Early stopping, best iteration is:\n",
428 | "[2544]\tvalid_0's l1: 14.7818\n",
429 | "cv score for valid is: 0.06377613710442855\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "cv_pred = np.zeros(test_data.shape[0])\n",
435 | "valid_best_l2_all = 0\n",
436 | "\n",
437 | "feature_importance_df = pd.DataFrame()\n",
438 | "count = 0\n",
439 | "for i, (train_fold, validate) in enumerate(kf):\n",
440 | " print('fold: ',i, ' training')\n",
441 | " X_train, X_validate, label_train, label_validate = \\\n",
442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
443 | " train_label[train_fold], train_label[validate]\n",
444 | " dtrain = lgb.Dataset(X_train, label_train)\n",
445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
449 | "\n",
450 | " fold_importance_df = pd.DataFrame()\n",
451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n",
452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n",
453 | " fold_importance_df[\"fold\"] = count + 1\n",
454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
455 | " count += 1\n",
456 | "\n",
457 | "cv_pred /= NFOLDS\n",
458 | "valid_best_l2_all /= NFOLDS\n",
459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 15,
465 | "metadata": {
466 | "collapsed": true
467 | },
468 | "outputs": [],
469 | "source": [
470 | "display_importances(feature_importance_df)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": [
481 | "baseline\n",
482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
483 | " \n",
484 | "#充值金额是否为整数\n",
485 | "cv score for valid is: 0.06343660584697094\n",
486 | "#当月话费/半年话费\n",
487 | "cv score for valid is: 0.06349188259250227\n",
488 | "#当月话费/余额\n",
489 | "cv score for valid is: 0.06350638782547711\n",
490 | " \n",
491 | "#leaves 31\n",
492 | "cv score for valid is: 0.06354362406472286\n",
493 | "#remove l1, l2 = 5\n",
494 | "cv score for valid is: 0.06358730556250403\n",
495 | "#feature fraction 0.7\n",
496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
497 | "max_depth 5, objective l1\n",
498 | "cv score for valid is: 0.06367445081783887\n",
499 | "feature fraction 0.6\n",
500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
501 | "remove blk flag\n",
502 | "cv score for valid is: 0.06377613710442855"
503 | ]
504 | },
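{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sketch added for clarity (not part of the original run): the 'cv score' values logged above\n",
"#are 1/(1 + mean validation MAE), as printed in the training cell, so they can be inverted\n",
"#back to the average MAE of the five folds.\n",
"cv_score = 0.06377613710442855   #final local cv score from the log above\n",
"mae = 1 / cv_score - 1\n",
"print(round(mae, 4))   #14.6798, consistent with the per-fold valid_0 l1 values"
]
},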
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "### Submit"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 39,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stderr",
519 | "output_type": "stream",
520 | "text": [
521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
523 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
524 | "\n",
525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
526 | " \n"
527 | ]
528 | }
529 | ],
530 | "source": [
531 | "test_data_sub = test_data[['uid']]\n",
532 | "test_data_sub['score'] = cv_pred\n",
533 | "test_data_sub.columns = ['id','score']"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 40,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stderr",
543 | "output_type": "stream",
544 | "text": [
545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
547 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
548 | "\n",
549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
550 | " \"\"\"Entry point for launching an IPython kernel.\n"
551 | ]
552 | }
553 | ],
554 | "source": [
555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 41,
561 | "metadata": {
562 | "collapsed": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {
573 | "collapsed": true
574 | },
575 | "outputs": [],
576 | "source": []
577 | }
578 | ],
579 | "metadata": {
580 | "kernelspec": {
581 | "display_name": "Python 3",
582 | "language": "python",
583 | "name": "python3"
584 | },
585 | "language_info": {
586 | "codemirror_mode": {
587 | "name": "ipython",
588 | "version": 3
589 | },
590 | "file_extension": ".py",
591 | "mimetype": "text/x-python",
592 | "name": "python",
593 | "nbconvert_exporter": "python",
594 | "pygments_lexer": "ipython3",
595 | "version": "3.6.1"
596 | }
597 | },
598 | "nbformat": 4,
599 | "nbformat_minor": 2
600 | }
601 |
--------------------------------------------------------------------------------
/code/Baseline_bagging_version.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#在local cv 6377的基础上,加上MSE优化,MAE & MAE各自用N个seed 最终加权平均"
19 | ]
20 | },
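{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sketch added for clarity (dummy values, not the real predictions): the scheme below trains the\n",
"#same LightGBM setup under an MAE objective and an MSE objective, each over several CV seeds,\n",
"#averages the test predictions within each objective, then blends the two bags with equal weights.\n",
"import numpy as np\n",
"pred_mae_bag = np.array([610.0, 655.0])   #placeholder: seed-averaged predictions from the MAE runs\n",
"pred_mse_bag = np.array([612.0, 651.0])   #placeholder: seed-averaged predictions from the MSE runs\n",
"blended = (pred_mae_bag + pred_mse_bag) / 2\n",
"print(blended)   #[611. 653.]"
]
},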
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
31 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
32 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
33 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
34 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import time\n",
40 | "import matplotlib.pyplot as plt\n",
41 | "import seaborn as sns\n",
42 | "import numpy as np\n",
43 | "import pandas as pd\n",
44 | "import lightgbm as lgb\n",
45 | "from sklearn.model_selection import StratifiedKFold\n",
46 | "from sklearn.preprocessing import LabelEncoder"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "### Input data"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "data_path = '../input/'\n",
65 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
66 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
67 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Pre-processing"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " 用户编码 | \n",
104 | " 用户实名制是否通过核实 | \n",
105 | " 用户年龄 | \n",
106 | " 是否大学生客户 | \n",
107 | " 是否黑名单客户 | \n",
108 | " 是否4G不健康客户 | \n",
109 | " 用户网龄(月) | \n",
110 | " 用户最近一次缴费距今时长(月) | \n",
111 | " 缴费用户最近一次缴费金额(元) | \n",
112 | " 用户近6个月平均消费值(元) | \n",
113 | " ... | \n",
114 | " 当月是否景点游览 | \n",
115 | " 当月是否体育场馆消费 | \n",
116 | " 当月网购类应用使用次数 | \n",
117 | " 当月物流快递类应用使用次数 | \n",
118 | " 当月金融理财类应用使用总次数 | \n",
119 | " 当月视频播放类应用使用次数 | \n",
120 | " 当月飞机类应用使用次数 | \n",
121 | " 当月火车类应用使用次数 | \n",
122 | " 当月旅游资讯类应用使用次数 | \n",
123 | " 信用分 | \n",
124 | "
\n",
125 | " \n",
126 | " \n",
127 | " \n",
128 | " 0 | \n",
129 | " a4651f98c82948b186bdcdc8108381b4 | \n",
130 | " 1 | \n",
131 | " 44 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 186 | \n",
136 | " 1 | \n",
137 | " 99.8 | \n",
138 | " 163.86 | \n",
139 | " ... | \n",
140 | " 1 | \n",
141 | " 1 | \n",
142 | " 713 | \n",
143 | " 0 | \n",
144 | " 2740 | \n",
145 | " 7145 | \n",
146 | " 0 | \n",
147 | " 0 | \n",
148 | " 30 | \n",
149 | " 664 | \n",
150 | "
\n",
151 | " \n",
152 | "
\n",
153 | "
1 rows × 30 columns
\n",
154 | "
"
155 | ],
156 | "text/plain": [
157 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
158 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
159 | "\n",
160 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
161 | "0 0 186 1 99.8 163.86 ... \n",
162 | "\n",
163 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
164 | "0 1 1 713 0 2740 \n",
165 | "\n",
166 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
167 | "0 7145 0 0 30 664 \n",
168 | "\n",
169 | "[1 rows x 30 columns]"
170 | ]
171 | },
172 | "execution_count": 3,
173 | "metadata": {},
174 | "output_type": "execute_result"
175 | }
176 | ],
177 | "source": [
178 | "train_data.head(1)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 4,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
191 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
192 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
193 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
194 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
195 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
196 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
197 | " dtype='object')\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "#all chinese name- -\n",
203 | "#rename one by one\n",
204 | "print(train_data.columns)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 5,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
216 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
217 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
218 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
219 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
220 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
221 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
222 | " 'tour_app_count','score']\n",
223 | "test_data.columns = train_data.columns[:-1]"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "collapsed": true
231 | },
232 | "outputs": [],
233 | "source": []
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Feature Engineering"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 6,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stderr",
249 | "output_type": "stream",
250 | "text": [
251 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
252 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
253 | "\n",
254 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
255 | " # This is added back by InteractiveShellApp.init_path()\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
261 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n",
262 | "#充值金额/余额 --- useless\n",
263 | "#当月话费/最近充值金额 --- useless\n",
264 | "#六个月均值/充值金额 --- useless\n",
265 | "\n",
266 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
267 | "\n",
268 | "def produce_offline_feat(train_data):\n",
269 | " train_data['top_up_amount_offline'] = 0\n",
270 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0) &\\\n",
271 | " (train_data['top_up_amount'] != 0)] = 1\n",
272 | " return train_data\n",
273 | "\n",
274 | "train_data = produce_offline_feat(train_data)\n",
275 | "test_data = produce_offline_feat(test_data)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 7,
281 | "metadata": {
282 | "collapsed": true
283 | },
284 | "outputs": [],
285 | "source": [
286 | "def produce_fee_rate(train_data):\n",
287 | " #current-month fee and the 6-month average fee both rank high in feature importance; their ratio --> spending stability\n",
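" # the +1 in each denominator guards against division by zero\n",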
288 | " train_data['current_fee_stability'] = \\\n",
289 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
290 | " \n",
291 | " #current-month fee / current-month account balance\n",
292 | " train_data['use_left_rate'] = \\\n",
293 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
294 | " return train_data\n",
295 | "\n",
296 | "train_data = produce_fee_rate(train_data)\n",
297 | "test_data = produce_fee_rate(test_data)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### Training"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 8,
310 | "metadata": {
311 | "collapsed": true
312 | },
313 | "outputs": [],
314 | "source": [
315 | "def display_importances(feature_importance_df_):\n",
316 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
317 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
318 | " plt.figure(figsize=(8, 10))\n",
319 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
320 | " plt.title('LightGBM Features (avg over folds)')\n",
321 | " plt.tight_layout()\n",
322 | " plt.show()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 9,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "#para\n",
334 | "params = {\n",
335 | " 'learning_rate': 0.01,\n",
336 | " 'boosting_type': 'gbdt',\n",
337 | " 'objective': 'regression_l1',\n",
338 | " 'metric': 'mae',\n",
339 | " 'feature_fraction': 0.6,\n",
340 | " 'bagging_fraction': 0.8,\n",
341 | " 'bagging_freq': 2,\n",
342 | " 'num_leaves': 31,\n",
343 | " 'verbose': -1,\n",
344 | " 'max_depth': 5,\n",
345 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n",
346 | "}"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 10,
352 | "metadata": {
353 | "collapsed": true
354 | },
355 | "outputs": [],
356 | "source": [
357 | "#para\n",
358 | "params2 = {\n",
359 | " 'learning_rate': 0.01,\n",
360 | " 'boosting_type': 'gbdt',\n",
361 | " 'objective': 'regression_l2',\n",
362 | " 'metric': 'mae',\n",
363 | " 'feature_fraction': 0.6,\n",
364 | " 'bagging_fraction': 0.8,\n",
365 | " 'bagging_freq': 2,\n",
366 | " 'num_leaves': 31,\n",
367 | " 'verbose': -1,\n",
368 | " 'max_depth': 5,\n",
369 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n",
370 | " 'seed': 89\n",
371 | "}"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 11,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "name": "stderr",
381 | "output_type": "stream",
382 | "text": [
383 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
384 | " % (min_groups, self.n_splits)), Warning)\n"
385 | ]
386 | },
387 | {
388 | "name": "stdout",
389 | "output_type": "stream",
390 | "text": [
391 | "fold: 0 training\n",
392 | "Training until validation scores don't improve for 50 rounds.\n",
393 | "Early stopping, best iteration is:\n",
394 | "[2433]\tvalid_0's l1: 14.7441\n",
395 | "fold: 1 training\n",
396 | "Training until validation scores don't improve for 50 rounds.\n",
397 | "Early stopping, best iteration is:\n",
398 | "[1876]\tvalid_0's l1: 14.8595\n",
399 | "fold: 2 training\n",
400 | "Training until validation scores don't improve for 50 rounds.\n",
401 | "Early stopping, best iteration is:\n",
402 | "[2459]\tvalid_0's l1: 14.7082\n",
403 | "fold: 3 training\n",
404 | "Training until validation scores don't improve for 50 rounds.\n",
405 | "Early stopping, best iteration is:\n",
406 | "[2468]\tvalid_0's l1: 14.6564\n",
407 | "fold: 4 training\n",
408 | "Training until validation scores don't improve for 50 rounds.\n",
409 | "Early stopping, best iteration is:\n",
410 | "[2599]\tvalid_0's l1: 14.5114\n",
411 | "fold: 0 training\n",
412 | "Training until validation scores don't improve for 50 rounds.\n",
413 | "Early stopping, best iteration is:\n",
414 | "[3313]\tvalid_0's l1: 14.743\n",
415 | "fold: 1 training\n",
416 | "Training until validation scores don't improve for 50 rounds.\n",
417 | "Early stopping, best iteration is:\n",
418 | "[2590]\tvalid_0's l1: 14.8562\n",
419 | "fold: 2 training\n",
420 | "Training until validation scores don't improve for 50 rounds.\n",
421 | "Early stopping, best iteration is:\n",
422 | "[2523]\tvalid_0's l1: 14.5752\n",
423 | "fold: 3 training\n",
424 | "Training until validation scores don't improve for 50 rounds.\n",
425 | "Early stopping, best iteration is:\n",
426 | "[3564]\tvalid_0's l1: 14.6125\n",
427 | "fold: 4 training\n",
428 | "Training until validation scores don't improve for 50 rounds.\n",
429 | "Early stopping, best iteration is:\n",
430 | "[1853]\tvalid_0's l1: 14.6333\n",
431 | "fold: 0 training\n",
432 | "Training until validation scores don't improve for 50 rounds.\n",
433 | "Early stopping, best iteration is:\n",
434 | "[2851]\tvalid_0's l1: 14.9587\n",
435 | "fold: 1 training\n",
436 | "Training until validation scores don't improve for 50 rounds.\n",
437 | "Early stopping, best iteration is:\n",
438 | "[1875]\tvalid_0's l1: 14.7808\n",
439 | "fold: 2 training\n",
440 | "Training until validation scores don't improve for 50 rounds.\n",
441 | "Early stopping, best iteration is:\n",
442 | "[2957]\tvalid_0's l1: 14.5525\n",
443 | "fold: 3 training\n",
444 | "Training until validation scores don't improve for 50 rounds.\n",
445 | "Early stopping, best iteration is:\n",
446 | "[2723]\tvalid_0's l1: 14.4804\n",
447 | "fold: 4 training\n",
448 | "Training until validation scores don't improve for 50 rounds.\n",
449 | "Early stopping, best iteration is:\n",
450 | "[3311]\tvalid_0's l1: 14.6854\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "cv_pred_all = 0\n",
456 | "en_amount = 3\n",
457 | "for seed in range(en_amount):\n",
458 | " NFOLDS = 5\n",
459 | " train_label = train_data['score']\n",
460 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n",
461 | " kf = kfold.split(train_data, train_label)\n",
462 | "\n",
463 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
464 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
465 | "\n",
466 | "\n",
467 | " cv_pred = np.zeros(test_data.shape[0])\n",
468 | " valid_best_l2_all = 0\n",
469 | "\n",
470 | " feature_importance_df = pd.DataFrame()\n",
471 | " count = 0\n",
472 | " for i, (train_fold, validate) in enumerate(kf):\n",
473 | " print('fold: ',i, ' training')\n",
474 | " X_train, X_validate, label_train, label_validate = \\\n",
475 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
476 | " train_label[train_fold], train_label[validate]\n",
477 | " dtrain = lgb.Dataset(X_train, label_train)\n",
478 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
479 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
480 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
481 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
482 | "\n",
483 | "# fold_importance_df = pd.DataFrame()\n",
484 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
485 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
486 | "# fold_importance_df[\"fold\"] = count + 1\n",
487 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
488 | " count += 1\n",
489 | "\n",
490 | " cv_pred /= NFOLDS\n",
491 | " valid_best_l2_all /= NFOLDS\n",
492 | " \n",
493 | " cv_pred_all += cv_pred\n",
494 | "cv_pred_all /= en_amount\n",
495 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 12,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stderr",
505 | "output_type": "stream",
506 | "text": [
507 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
508 | " % (min_groups, self.n_splits)), Warning)\n"
509 | ]
510 | },
511 | {
512 | "name": "stdout",
513 | "output_type": "stream",
514 | "text": [
515 | "fold: 0 training\n",
516 | "Training until validation scores don't improve for 50 rounds.\n",
517 | "Early stopping, best iteration is:\n",
518 | "[2457]\tvalid_0's l1: 14.7871\n",
519 | "fold: 1 training\n",
520 | "Training until validation scores don't improve for 50 rounds.\n",
521 | "Early stopping, best iteration is:\n",
522 | "[2365]\tvalid_0's l1: 14.6983\n",
523 | "fold: 2 training\n",
524 | "Training until validation scores don't improve for 50 rounds.\n",
525 | "Early stopping, best iteration is:\n",
526 | "[2082]\tvalid_0's l1: 14.7999\n",
527 | "fold: 3 training\n",
528 | "Training until validation scores don't improve for 50 rounds.\n",
529 | "Early stopping, best iteration is:\n",
530 | "[2266]\tvalid_0's l1: 14.483\n",
531 | "fold: 4 training\n",
532 | "Training until validation scores don't improve for 50 rounds.\n",
533 | "Early stopping, best iteration is:\n",
534 | "[2046]\tvalid_0's l1: 14.7681\n",
535 | "fold: 0 training\n",
536 | "Training until validation scores don't improve for 50 rounds.\n",
537 | "Early stopping, best iteration is:\n",
538 | "[2436]\tvalid_0's l1: 14.7728\n",
539 | "fold: 1 training\n",
540 | "Training until validation scores don't improve for 50 rounds.\n",
541 | "Early stopping, best iteration is:\n",
542 | "[2053]\tvalid_0's l1: 14.8066\n",
543 | "fold: 2 training\n",
544 | "Training until validation scores don't improve for 50 rounds.\n",
545 | "Early stopping, best iteration is:\n",
546 | "[2221]\tvalid_0's l1: 14.5464\n",
547 | "fold: 3 training\n",
548 | "Training until validation scores don't improve for 50 rounds.\n",
549 | "Early stopping, best iteration is:\n",
550 | "[2348]\tvalid_0's l1: 14.5198\n",
551 | "fold: 4 training\n",
552 | "Training until validation scores don't improve for 50 rounds.\n",
553 | "Early stopping, best iteration is:\n",
554 | "[2207]\tvalid_0's l1: 14.8169\n",
555 | "fold: 0 training\n",
556 | "Training until validation scores don't improve for 50 rounds.\n",
557 | "Early stopping, best iteration is:\n",
558 | "[2110]\tvalid_0's l1: 14.5323\n",
559 | "fold: 1 training\n",
560 | "Training until validation scores don't improve for 50 rounds.\n",
561 | "Early stopping, best iteration is:\n",
562 | "[2627]\tvalid_0's l1: 14.8493\n",
563 | "fold: 2 training\n",
564 | "Training until validation scores don't improve for 50 rounds.\n",
565 | "Early stopping, best iteration is:\n",
566 | "[2040]\tvalid_0's l1: 14.8335\n",
567 | "fold: 3 training\n",
568 | "Training until validation scores don't improve for 50 rounds.\n",
569 | "Early stopping, best iteration is:\n",
570 | "[2241]\tvalid_0's l1: 14.6379\n",
571 | "fold: 4 training\n",
572 | "Training until validation scores don't improve for 50 rounds.\n",
573 | "Early stopping, best iteration is:\n",
574 | "[2424]\tvalid_0's l1: 14.6794\n"
575 | ]
576 | }
577 | ],
578 | "source": [
579 | "cv_pred_all2 = 0\n",
580 | "en_amount = 3\n",
581 | "for seed in range(en_amount):\n",
582 | " NFOLDS = 5\n",
583 | " train_label = train_data['score']\n",
584 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n",
585 | " kf = kfold.split(train_data, train_label)\n",
586 | "\n",
587 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
588 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
589 | "\n",
590 | "\n",
591 | " cv_pred = np.zeros(test_data.shape[0])\n",
592 | " valid_best_l2_all = 0\n",
593 | "\n",
594 | " feature_importance_df = pd.DataFrame()\n",
595 | " count = 0\n",
596 | " for i, (train_fold, validate) in enumerate(kf):\n",
597 | " print('fold: ',i, ' training')\n",
598 | " X_train, X_validate, label_train, label_validate = \\\n",
599 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
600 | " train_label[train_fold], train_label[validate]\n",
601 | " dtrain = lgb.Dataset(X_train, label_train)\n",
602 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
603 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
604 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
605 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
606 | "\n",
607 | "# fold_importance_df = pd.DataFrame()\n",
608 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
609 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
610 | "# fold_importance_df[\"fold\"] = count + 1\n",
611 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
612 | " count += 1\n",
613 | "\n",
614 | " cv_pred /= NFOLDS\n",
615 | " valid_best_l2_all /= NFOLDS\n",
616 | " \n",
617 | " cv_pred_all2 += cv_pred\n",
618 | " \n",
619 | "cv_pred_all2 /= en_amount\n",
620 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 13,
626 | "metadata": {
627 | "collapsed": true
628 | },
629 | "outputs": [],
630 | "source": [
631 | "# display_importances(feature_importance_df)"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {
638 | "collapsed": true
639 | },
640 | "outputs": [],
641 | "source": [
642 | "baseline\n",
643 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
644 | " \n",
645 | "#充值金额是否为整数\n",
646 | "cv score for valid is: 0.06343660584697094\n",
647 | "#当月话费/半年话费\n",
648 | "cv score for valid is: 0.06349188259250227\n",
649 | "#当月话费/余额\n",
650 | "cv score for valid is: 0.06350638782547711\n",
651 | " \n",
652 | "#leaves 31\n",
653 | "cv score for valid is: 0.06354362406472286\n",
654 | "#remove l1, l2 = 5\n",
655 | "cv score for valid is: 0.06358730556250403\n",
656 | "#feature fraction 0.7\n",
657 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
658 | "max_depth 5, objective l1\n",
659 | "cv score for valid is: 0.06367445081783887\n",
660 | "feature fraction 0.6\n",
661 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
662 | "10 fold\n",
663 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n",
664 | "remove blk list flag\n",
665 | "cv score for valid is: 0.06377613710442855"
666 | ]
667 | },
668 | {
669 | "cell_type": "markdown",
670 | "metadata": {},
671 | "source": [
672 | "### Submit"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": 14,
678 | "metadata": {},
679 | "outputs": [
680 | {
681 | "name": "stderr",
682 | "output_type": "stream",
683 | "text": [
684 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
685 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
686 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
687 | "\n",
688 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
689 | " \n",
690 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
691 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
692 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
693 | "\n",
694 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
695 | " after removing the cwd from sys.path.\n",
696 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
697 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
698 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
699 | "\n",
700 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
701 | " \"\"\"\n"
702 | ]
703 | }
704 | ],
705 | "source": [
706 | "test_data_sub = test_data[['uid']]\n",
707 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n",
708 | "test_data_sub.columns = ['id','score']\n",
709 | "test_data_sub['score1'] = cv_pred_all\n",
710 | "test_data_sub['score2'] = cv_pred_all2"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 17,
716 | "metadata": {},
717 | "outputs": [
718 | {
719 | "name": "stderr",
720 | "output_type": "stream",
721 | "text": [
722 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
723 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
724 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
725 | "\n",
726 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
727 | " \"\"\"Entry point for launching an IPython kernel.\n"
728 | ]
729 | }
730 | ],
731 | "source": [
732 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 18,
738 | "metadata": {
739 | "collapsed": true
740 | },
741 | "outputs": [],
742 | "source": [
743 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)"
744 | ]
745 | },
746 | {
747 | "cell_type": "code",
748 | "execution_count": 31,
749 | "metadata": {},
750 | "outputs": [
751 | {
752 | "data": {
753 | "text/plain": [
754 | "617.8386873193765"
755 | ]
756 | },
757 | "execution_count": 31,
758 | "metadata": {},
759 | "output_type": "execute_result"
760 | }
761 | ],
762 | "source": [
763 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n",
764 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725"
765 | ]
766 | },
767 | {
768 | "cell_type": "code",
769 | "execution_count": null,
770 | "metadata": {
771 | "collapsed": true
772 | },
773 | "outputs": [],
774 | "source": []
775 | }
776 | ],
777 | "metadata": {
778 | "kernelspec": {
779 | "display_name": "Python 3",
780 | "language": "python",
781 | "name": "python3"
782 | },
783 | "language_info": {
784 | "codemirror_mode": {
785 | "name": "ipython",
786 | "version": 3
787 | },
788 | "file_extension": ".py",
789 | "mimetype": "text/x-python",
790 | "name": "python",
791 | "nbconvert_exporter": "python",
792 | "pygments_lexer": "ipython3",
793 | "version": "3.6.1"
794 | }
795 | },
796 | "nbformat": 4,
797 | "nbformat_minor": 2
798 | }
799 |
--------------------------------------------------------------------------------