├── README.md
├── code
│   ├── .ipynb_checkpoints
│   │   ├── Baseline-checkpoint.ipynb
│   │   └── Baseline_bagging_version-checkpoint.ipynb
│   ├── Baseline.ipynb
│   └── Baseline_bagging_version.ipynb
└── input
    ├── submit_example.csv
    ├── test_dataset.csv
    └── train_dataset.csv
/README.md:
--------------------------------------------------------------------------------
1 | # Credit-Scoring-Regression by YourVenn@Kaggle
2 | - Open-source baseline for the 消费者人群画像—信用智能评分 (Consumer Persona: Intelligent Credit Scoring) competition
3 | - Put the raw competition data in input/
4 | - Baseline: 6379+ on the online leaderboard; Baseline_bagging_version: 6388+ online
5 | - If you find this useful, an upvote/star would be much appreciated. Thanks!
6 |
--------------------------------------------------------------------------------
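The "6379+" and "6388+" figures in the README quote the digits of the competition metric, which the notebooks estimate locally as 1 / (1 + MAE) averaged over the validation folds. A quick check of that arithmetic, using the five fold MAEs recorded in the Baseline training log (a minimal sketch by the editor, not part of the original code):

```python
# Fold MAEs copied from the Baseline notebook's training output (rounded in the log).
fold_mae = [14.7519, 14.6775, 14.728, 14.46, 14.7818]

mean_mae = sum(fold_mae) / len(fold_mae)   # ~14.680
cv_score = 1 / (1 + mean_mae)              # the "cv score for valid" formula used in the notebooks

print(cv_score)                            # ~0.063776
```

This agrees with the "cv score for valid is: 0.06377613710442855" line printed at the end of the Baseline notebook (up to rounding of the logged MAEs); the slightly higher numbers quoted in the README are the corresponding online leaderboard scores.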
/code/.ipynb_checkpoints/Baseline-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 14,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#过年没时间做了,专心搞kaggle去了\n",
19 | "#kaggle玩家欢迎和我交流,ID是YourVenn"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "import time\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import seaborn as sns\n",
43 | "import numpy as np\n",
44 | "import pandas as pd\n",
45 | "import lightgbm as lgb\n",
46 | "from sklearn.model_selection import StratifiedKFold\n",
47 | "from sklearn.preprocessing import LabelEncoder"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Input data"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "data_path = '../input/'\n",
66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### Pre-processing"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "
\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " 用户编码 | \n",
105 | " 用户实名制是否通过核实 | \n",
106 | " 用户年龄 | \n",
107 | " 是否大学生客户 | \n",
108 | " 是否黑名单客户 | \n",
109 | " 是否4G不健康客户 | \n",
110 | " 用户网龄(月) | \n",
111 | " 用户最近一次缴费距今时长(月) | \n",
112 | " 缴费用户最近一次缴费金额(元) | \n",
113 | " 用户近6个月平均消费值(元) | \n",
114 | " ... | \n",
115 | " 当月是否景点游览 | \n",
116 | " 当月是否体育场馆消费 | \n",
117 | " 当月网购类应用使用次数 | \n",
118 | " 当月物流快递类应用使用次数 | \n",
119 | " 当月金融理财类应用使用总次数 | \n",
120 | " 当月视频播放类应用使用次数 | \n",
121 | " 当月飞机类应用使用次数 | \n",
122 | " 当月火车类应用使用次数 | \n",
123 | " 当月旅游资讯类应用使用次数 | \n",
124 | " 信用分 | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " 0 | \n",
130 | " a4651f98c82948b186bdcdc8108381b4 | \n",
131 | " 1 | \n",
132 | " 44 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 186 | \n",
137 | " 1 | \n",
138 | " 99.8 | \n",
139 | " 163.86 | \n",
140 | " ... | \n",
141 | " 1 | \n",
142 | " 1 | \n",
143 | " 713 | \n",
144 | " 0 | \n",
145 | " 2740 | \n",
146 | " 7145 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 30 | \n",
150 | " 664 | \n",
151 | "
\n",
152 | " \n",
153 | "
\n",
154 | "
1 rows × 30 columns
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
160 | "\n",
161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
162 | "0 0 186 1 99.8 163.86 ... \n",
163 | "\n",
164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
165 | "0 1 1 713 0 2740 \n",
166 | "\n",
167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
168 | "0 7145 0 0 30 664 \n",
169 | "\n",
170 | "[1 rows x 30 columns]"
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "train_data.head(1)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
197 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
198 | " dtype='object')\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "#all chinese name- -\n",
204 | "#rename one by one\n",
205 | "print(train_data.columns)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
223 | " 'tour_app_count','score']\n",
224 | "test_data.columns = train_data.columns[:-1]"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Feature Engineering"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 6,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stderr",
241 | "output_type": "stream",
242 | "text": [
243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
244 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
245 | "\n",
246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
247 | " \n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
254 | "\n",
255 | "def produce_offline_feat(train_data):\n",
256 | " train_data['top_up_amount_offline'] = 0\n",
257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
258 | " train_data['top_up_amount'] != 0] = 1\n",
259 | " return train_data\n",
260 | "\n",
261 | "train_data = produce_offline_feat(train_data)\n",
262 | "test_data = produce_offline_feat(test_data)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 7,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "def produce_fee_rate(train_data):\n",
274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
275 | " train_data['current_fee_stability'] = \\\n",
276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
277 | " \n",
278 | " #当月话费/当月账户余额\n",
279 | " train_data['use_left_rate'] = \\\n",
280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
281 | " return train_data\n",
282 | "\n",
283 | "train_data = produce_fee_rate(train_data)\n",
284 | "test_data = produce_fee_rate(test_data)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### Training"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "def display_importances(feature_importance_df_):\n",
303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
305 | " plt.figure(figsize=(8, 10))\n",
306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
307 | " plt.title('LightGBM Features (avg over folds)')\n",
308 | " plt.tight_layout()\n",
309 | " plt.show()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "#para\n",
321 | "params = {\n",
322 | " 'learning_rate': 0.01,\n",
323 | " 'boosting_type': 'gbdt',\n",
324 | " 'objective': 'regression_l1',\n",
325 | " 'metric': 'mae',\n",
326 | " 'feature_fraction': 0.6,\n",
327 | " 'bagging_fraction': 0.8,\n",
328 | " 'bagging_freq': 2,\n",
329 | " 'num_leaves': 31,\n",
330 | " 'verbose': -1,\n",
331 | " 'max_depth': 5,\n",
332 | " 'lambda_l2': 5, 'lambda_l1': 0\n",
333 | "}"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 10,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "NFOLDS = 5\n",
345 | "train_label = train_data['score']\n",
346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n",
347 | "kf = kfold.split(train_data, train_label)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 11,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n",
359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n",
360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n",
361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n",
362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n",
363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n",
364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n",
365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n",
366 | " 'score', 'top_up_amount_offline', 'current_fee_stability',\n",
367 | " 'use_left_rate'],\n",
368 | " dtype='object')"
369 | ]
370 | },
371 | "execution_count": 11,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": [
377 | "train_data.columns"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 12,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 13,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stderr",
399 | "output_type": "stream",
400 | "text": [
401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
402 | " % (min_groups, self.n_splits)), Warning)\n"
403 | ]
404 | },
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "fold: 0 training\n",
410 | "Training until validation scores don't improve for 50 rounds.\n",
411 | "Early stopping, best iteration is:\n",
412 | "[2834]\tvalid_0's l1: 14.7519\n",
413 | "fold: 1 training\n",
414 | "Training until validation scores don't improve for 50 rounds.\n",
415 | "Early stopping, best iteration is:\n",
416 | "[2780]\tvalid_0's l1: 14.6775\n",
417 | "fold: 2 training\n",
418 | "Training until validation scores don't improve for 50 rounds.\n",
419 | "Early stopping, best iteration is:\n",
420 | "[3745]\tvalid_0's l1: 14.728\n",
421 | "fold: 3 training\n",
422 | "Training until validation scores don't improve for 50 rounds.\n",
423 | "Early stopping, best iteration is:\n",
424 | "[3009]\tvalid_0's l1: 14.46\n",
425 | "fold: 4 training\n",
426 | "Training until validation scores don't improve for 50 rounds.\n",
427 | "Early stopping, best iteration is:\n",
428 | "[2544]\tvalid_0's l1: 14.7818\n",
429 | "cv score for valid is: 0.06377613710442855\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "cv_pred = np.zeros(test_data.shape[0])\n",
435 | "valid_best_l2_all = 0\n",
436 | "\n",
437 | "feature_importance_df = pd.DataFrame()\n",
438 | "count = 0\n",
439 | "for i, (train_fold, validate) in enumerate(kf):\n",
440 | " print('fold: ',i, ' training')\n",
441 | " X_train, X_validate, label_train, label_validate = \\\n",
442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
443 | " train_label[train_fold], train_label[validate]\n",
444 | " dtrain = lgb.Dataset(X_train, label_train)\n",
445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
449 | "\n",
450 | " fold_importance_df = pd.DataFrame()\n",
451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n",
452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n",
453 | " fold_importance_df[\"fold\"] = count + 1\n",
454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
455 | " count += 1\n",
456 | "\n",
457 | "cv_pred /= NFOLDS\n",
458 | "valid_best_l2_all /= NFOLDS\n",
459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 15,
465 | "metadata": {
466 | "collapsed": true
467 | },
468 | "outputs": [],
469 | "source": [
470 | "display_importances(feature_importance_df)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": [
481 | "baseline\n",
482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
483 | " \n",
484 | "#充值金额是否为整数\n",
485 | "cv score for valid is: 0.06343660584697094\n",
486 | "#当月话费/半年话费\n",
487 | "cv score for valid is: 0.06349188259250227\n",
488 | "#当月话费/余额\n",
489 | "cv score for valid is: 0.06350638782547711\n",
490 | " \n",
491 | "#leaves 31\n",
492 | "cv score for valid is: 0.06354362406472286\n",
493 | "#remove l1, l2 = 5\n",
494 | "cv score for valid is: 0.06358730556250403\n",
495 | "#feature fraction 0.7\n",
496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
497 | "max_depth 5, objective l1\n",
498 | "cv score for valid is: 0.06367445081783887\n",
499 | "feature fraction 0.6\n",
500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
501 | "remove blk flag\n",
502 | "cv score for valid is: 0.06377613710442855"
503 | ]
504 | },
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "### Submit"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 39,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stderr",
519 | "output_type": "stream",
520 | "text": [
521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
523 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
524 | "\n",
525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
526 | " \n"
527 | ]
528 | }
529 | ],
530 | "source": [
531 | "test_data_sub = test_data[['uid']]\n",
532 | "test_data_sub['score'] = cv_pred\n",
533 | "test_data_sub.columns = ['id','score']"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 40,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stderr",
543 | "output_type": "stream",
544 | "text": [
545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
547 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
548 | "\n",
549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
550 | " \"\"\"Entry point for launching an IPython kernel.\n"
551 | ]
552 | }
553 | ],
554 | "source": [
555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 41,
561 | "metadata": {
562 | "collapsed": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {
573 | "collapsed": true
574 | },
575 | "outputs": [],
576 | "source": []
577 | }
578 | ],
579 | "metadata": {
580 | "kernelspec": {
581 | "display_name": "Python 3",
582 | "language": "python",
583 | "name": "python3"
584 | },
585 | "language_info": {
586 | "codemirror_mode": {
587 | "name": "ipython",
588 | "version": 3
589 | },
590 | "file_extension": ".py",
591 | "mimetype": "text/x-python",
592 | "name": "python",
593 | "nbconvert_exporter": "python",
594 | "pygments_lexer": "ipython3",
595 | "version": "3.6.1"
596 | }
597 | },
598 | "nbformat": 4,
599 | "nbformat_minor": 2
600 | }
601 |
--------------------------------------------------------------------------------
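The SettingWithCopyWarning captured in the outputs above comes from the chained indexing inside produce_offline_feat (column selection followed by boolean-mask assignment). A minimal warning-free sketch of the same feature, using a boolean mask and an explicit copy (an editor's illustration, not code from the notebook):

```python
import pandas as pd


def produce_offline_feat(df: pd.DataFrame) -> pd.DataFrame:
    """Flag top-ups whose amount is a nonzero multiple of 10.

    Same idea as the notebook: round amounts presumably come from a
    different (offline) top-up channel than amounts with decimals.
    """
    df = df.copy()  # work on a copy so we never assign into a view
    is_offline = (df['top_up_amount'] % 10 == 0) & (df['top_up_amount'] != 0)
    df['top_up_amount_offline'] = is_offline.astype(int)
    return df
```

Because the assignment targets a fresh copy, pandas no longer has to guess whether a view or a copy is being modified, which is what triggers the warning in the original cell.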
/code/.ipynb_checkpoints/Baseline_bagging_version-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stderr",
17 | "output_type": "stream",
18 | "text": [
19 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
20 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
21 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
22 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
23 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "import time\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "import seaborn as sns\n",
31 | "import numpy as np\n",
32 | "import pandas as pd\n",
33 | "import lightgbm as lgb\n",
34 | "from sklearn.model_selection import StratifiedKFold\n",
35 | "from sklearn.preprocessing import LabelEncoder"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "### Input data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {
49 | "collapsed": true
50 | },
51 | "outputs": [],
52 | "source": [
53 | "data_path = '../input/'\n",
54 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
55 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
56 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
57 | ]
58 | },
59 | {
60 | "cell_type": "markdown",
61 | "metadata": {},
62 | "source": [
63 | "### Pre-processing"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 3,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/html": [
74 | "\n",
75 | "\n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | " | \n",
92 | " 用户编码 | \n",
93 | " 用户实名制是否通过核实 | \n",
94 | " 用户年龄 | \n",
95 | " 是否大学生客户 | \n",
96 | " 是否黑名单客户 | \n",
97 | " 是否4G不健康客户 | \n",
98 | " 用户网龄(月) | \n",
99 | " 用户最近一次缴费距今时长(月) | \n",
100 | " 缴费用户最近一次缴费金额(元) | \n",
101 | " 用户近6个月平均消费值(元) | \n",
102 | " ... | \n",
103 | " 当月是否景点游览 | \n",
104 | " 当月是否体育场馆消费 | \n",
105 | " 当月网购类应用使用次数 | \n",
106 | " 当月物流快递类应用使用次数 | \n",
107 | " 当月金融理财类应用使用总次数 | \n",
108 | " 当月视频播放类应用使用次数 | \n",
109 | " 当月飞机类应用使用次数 | \n",
110 | " 当月火车类应用使用次数 | \n",
111 | " 当月旅游资讯类应用使用次数 | \n",
112 | " 信用分 | \n",
113 | "
\n",
114 | " \n",
115 | " \n",
116 | " \n",
117 | " 0 | \n",
118 | " a4651f98c82948b186bdcdc8108381b4 | \n",
119 | " 1 | \n",
120 | " 44 | \n",
121 | " 0 | \n",
122 | " 0 | \n",
123 | " 0 | \n",
124 | " 186 | \n",
125 | " 1 | \n",
126 | " 99.8 | \n",
127 | " 163.86 | \n",
128 | " ... | \n",
129 | " 1 | \n",
130 | " 1 | \n",
131 | " 713 | \n",
132 | " 0 | \n",
133 | " 2740 | \n",
134 | " 7145 | \n",
135 | " 0 | \n",
136 | " 0 | \n",
137 | " 30 | \n",
138 | " 664 | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
1 rows × 30 columns
\n",
143 | "
"
144 | ],
145 | "text/plain": [
146 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
147 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
148 | "\n",
149 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
150 | "0 0 186 1 99.8 163.86 ... \n",
151 | "\n",
152 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
153 | "0 1 1 713 0 2740 \n",
154 | "\n",
155 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
156 | "0 7145 0 0 30 664 \n",
157 | "\n",
158 | "[1 rows x 30 columns]"
159 | ]
160 | },
161 | "execution_count": 3,
162 | "metadata": {},
163 | "output_type": "execute_result"
164 | }
165 | ],
166 | "source": [
167 | "train_data.head(1)"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 4,
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "name": "stdout",
177 | "output_type": "stream",
178 | "text": [
179 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
180 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
181 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
182 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
183 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
184 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
185 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
186 | " dtype='object')\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "#all chinese name- -\n",
192 | "#rename one by one\n",
193 | "print(train_data.columns)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 5,
199 | "metadata": {
200 | "collapsed": true
201 | },
202 | "outputs": [],
203 | "source": [
204 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
205 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
206 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
207 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
208 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
209 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
210 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
211 | " 'tour_app_count','score']\n",
212 | "test_data.columns = train_data.columns[:-1]"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": true
220 | },
221 | "outputs": [],
222 | "source": []
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "metadata": {},
227 | "source": [
228 | "### Feature Engineering"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 6,
234 | "metadata": {},
235 | "outputs": [
236 | {
237 | "name": "stderr",
238 | "output_type": "stream",
239 | "text": [
240 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
241 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
242 | "\n",
243 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
244 | " # This is added back by InteractiveShellApp.init_path()\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
250 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n",
251 | "#充值金额/余额 --- useless\n",
252 | "#当月话费/最近充值金额 --- useless\n",
253 | "#六个月均值/充值金额 --- useless\n",
254 | "\n",
255 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
256 | "\n",
257 | "def produce_offline_feat(train_data):\n",
258 | " train_data['top_up_amount_offline'] = 0\n",
259 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
260 | " train_data['top_up_amount'] != 0] = 1\n",
261 | " return train_data\n",
262 | "\n",
263 | "train_data = produce_offline_feat(train_data)\n",
264 | "test_data = produce_offline_feat(test_data)"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 7,
270 | "metadata": {
271 | "collapsed": true
272 | },
273 | "outputs": [],
274 | "source": [
275 | "def produce_fee_rate(train_data):\n",
276 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
277 | " train_data['current_fee_stability'] = \\\n",
278 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
279 | " \n",
280 | " #当月话费/当月账户余额\n",
281 | " train_data['use_left_rate'] = \\\n",
282 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
283 | " return train_data\n",
284 | "\n",
285 | "train_data = produce_fee_rate(train_data)\n",
286 | "test_data = produce_fee_rate(test_data)"
287 | ]
288 | },
289 | {
290 | "cell_type": "markdown",
291 | "metadata": {},
292 | "source": [
293 | "### Training"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 8,
299 | "metadata": {
300 | "collapsed": true
301 | },
302 | "outputs": [],
303 | "source": [
304 | "def display_importances(feature_importance_df_):\n",
305 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
306 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
307 | " plt.figure(figsize=(8, 10))\n",
308 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
309 | " plt.title('LightGBM Features (avg over folds)')\n",
310 | " plt.tight_layout()\n",
311 | " plt.show()"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": 9,
317 | "metadata": {
318 | "collapsed": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "#para\n",
323 | "params = {\n",
324 | " 'learning_rate': 0.01,\n",
325 | " 'boosting_type': 'gbdt',\n",
326 | " 'objective': 'regression_l1',\n",
327 | " 'metric': 'mae',\n",
328 | " 'feature_fraction': 0.6,\n",
329 | " 'bagging_fraction': 0.8,\n",
330 | " 'bagging_freq': 2,\n",
331 | " 'num_leaves': 31,\n",
332 | " 'verbose': -1,\n",
333 | " 'max_depth': 5,\n",
334 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n",
335 | "}"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 10,
341 | "metadata": {
342 | "collapsed": true
343 | },
344 | "outputs": [],
345 | "source": [
346 | "#para\n",
347 | "params2 = {\n",
348 | " 'learning_rate': 0.01,\n",
349 | " 'boosting_type': 'gbdt',\n",
350 | " 'objective': 'regression_l2',\n",
351 | " 'metric': 'mae',\n",
352 | " 'feature_fraction': 0.6,\n",
353 | " 'bagging_fraction': 0.8,\n",
354 | " 'bagging_freq': 2,\n",
355 | " 'num_leaves': 31,\n",
356 | " 'verbose': -1,\n",
357 | " 'max_depth': 5,\n",
358 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n",
359 | " 'seed': 89\n",
360 | "}"
361 | ]
362 | },
363 | {
364 | "cell_type": "code",
365 | "execution_count": 11,
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stderr",
370 | "output_type": "stream",
371 | "text": [
372 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
373 | " % (min_groups, self.n_splits)), Warning)\n"
374 | ]
375 | },
376 | {
377 | "name": "stdout",
378 | "output_type": "stream",
379 | "text": [
380 | "fold: 0 training\n",
381 | "Training until validation scores don't improve for 50 rounds.\n",
382 | "Early stopping, best iteration is:\n",
383 | "[2433]\tvalid_0's l1: 14.7441\n",
384 | "fold: 1 training\n",
385 | "Training until validation scores don't improve for 50 rounds.\n",
386 | "Early stopping, best iteration is:\n",
387 | "[1876]\tvalid_0's l1: 14.8595\n",
388 | "fold: 2 training\n",
389 | "Training until validation scores don't improve for 50 rounds.\n",
390 | "Early stopping, best iteration is:\n",
391 | "[2459]\tvalid_0's l1: 14.7082\n",
392 | "fold: 3 training\n",
393 | "Training until validation scores don't improve for 50 rounds.\n",
394 | "Early stopping, best iteration is:\n",
395 | "[2468]\tvalid_0's l1: 14.6564\n",
396 | "fold: 4 training\n",
397 | "Training until validation scores don't improve for 50 rounds.\n",
398 | "Early stopping, best iteration is:\n",
399 | "[2599]\tvalid_0's l1: 14.5114\n",
400 | "fold: 0 training\n",
401 | "Training until validation scores don't improve for 50 rounds.\n",
402 | "Early stopping, best iteration is:\n",
403 | "[3313]\tvalid_0's l1: 14.743\n",
404 | "fold: 1 training\n",
405 | "Training until validation scores don't improve for 50 rounds.\n",
406 | "Early stopping, best iteration is:\n",
407 | "[2590]\tvalid_0's l1: 14.8562\n",
408 | "fold: 2 training\n",
409 | "Training until validation scores don't improve for 50 rounds.\n",
410 | "Early stopping, best iteration is:\n",
411 | "[2523]\tvalid_0's l1: 14.5752\n",
412 | "fold: 3 training\n",
413 | "Training until validation scores don't improve for 50 rounds.\n",
414 | "Early stopping, best iteration is:\n",
415 | "[3564]\tvalid_0's l1: 14.6125\n",
416 | "fold: 4 training\n",
417 | "Training until validation scores don't improve for 50 rounds.\n",
418 | "Early stopping, best iteration is:\n",
419 | "[1853]\tvalid_0's l1: 14.6333\n",
420 | "fold: 0 training\n",
421 | "Training until validation scores don't improve for 50 rounds.\n",
422 | "Early stopping, best iteration is:\n",
423 | "[2851]\tvalid_0's l1: 14.9587\n",
424 | "fold: 1 training\n",
425 | "Training until validation scores don't improve for 50 rounds.\n",
426 | "Early stopping, best iteration is:\n",
427 | "[1875]\tvalid_0's l1: 14.7808\n",
428 | "fold: 2 training\n",
429 | "Training until validation scores don't improve for 50 rounds.\n",
430 | "Early stopping, best iteration is:\n",
431 | "[2957]\tvalid_0's l1: 14.5525\n",
432 | "fold: 3 training\n",
433 | "Training until validation scores don't improve for 50 rounds.\n",
434 | "Early stopping, best iteration is:\n",
435 | "[2723]\tvalid_0's l1: 14.4804\n",
436 | "fold: 4 training\n",
437 | "Training until validation scores don't improve for 50 rounds.\n",
438 | "Early stopping, best iteration is:\n",
439 | "[3311]\tvalid_0's l1: 14.6854\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "cv_pred_all = 0\n",
445 | "en_amount = 3\n",
446 | "for seed in range(en_amount):\n",
447 | " NFOLDS = 5\n",
448 | " train_label = train_data['score']\n",
449 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n",
450 | " kf = kfold.split(train_data, train_label)\n",
451 | "\n",
452 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
453 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
454 | "\n",
455 | "\n",
456 | " cv_pred = np.zeros(test_data.shape[0])\n",
457 | " valid_best_l2_all = 0\n",
458 | "\n",
459 | " feature_importance_df = pd.DataFrame()\n",
460 | " count = 0\n",
461 | " for i, (train_fold, validate) in enumerate(kf):\n",
462 | " print('fold: ',i, ' training')\n",
463 | " X_train, X_validate, label_train, label_validate = \\\n",
464 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
465 | " train_label[train_fold], train_label[validate]\n",
466 | " dtrain = lgb.Dataset(X_train, label_train)\n",
467 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
468 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
469 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
470 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
471 | "\n",
472 | "# fold_importance_df = pd.DataFrame()\n",
473 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
474 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
475 | "# fold_importance_df[\"fold\"] = count + 1\n",
476 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
477 | " count += 1\n",
478 | "\n",
479 | " cv_pred /= NFOLDS\n",
480 | " valid_best_l2_all /= NFOLDS\n",
481 | " \n",
482 | " cv_pred_all += cv_pred\n",
483 | "cv_pred_all /= en_amount\n",
484 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
485 | ]
486 | },
487 | {
488 | "cell_type": "code",
489 | "execution_count": 12,
490 | "metadata": {},
491 | "outputs": [
492 | {
493 | "name": "stderr",
494 | "output_type": "stream",
495 | "text": [
496 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
497 | " % (min_groups, self.n_splits)), Warning)\n"
498 | ]
499 | },
500 | {
501 | "name": "stdout",
502 | "output_type": "stream",
503 | "text": [
504 | "fold: 0 training\n",
505 | "Training until validation scores don't improve for 50 rounds.\n",
506 | "Early stopping, best iteration is:\n",
507 | "[2457]\tvalid_0's l1: 14.7871\n",
508 | "fold: 1 training\n",
509 | "Training until validation scores don't improve for 50 rounds.\n",
510 | "Early stopping, best iteration is:\n",
511 | "[2365]\tvalid_0's l1: 14.6983\n",
512 | "fold: 2 training\n",
513 | "Training until validation scores don't improve for 50 rounds.\n",
514 | "Early stopping, best iteration is:\n",
515 | "[2082]\tvalid_0's l1: 14.7999\n",
516 | "fold: 3 training\n",
517 | "Training until validation scores don't improve for 50 rounds.\n",
518 | "Early stopping, best iteration is:\n",
519 | "[2266]\tvalid_0's l1: 14.483\n",
520 | "fold: 4 training\n",
521 | "Training until validation scores don't improve for 50 rounds.\n",
522 | "Early stopping, best iteration is:\n",
523 | "[2046]\tvalid_0's l1: 14.7681\n",
524 | "fold: 0 training\n",
525 | "Training until validation scores don't improve for 50 rounds.\n",
526 | "Early stopping, best iteration is:\n",
527 | "[2436]\tvalid_0's l1: 14.7728\n",
528 | "fold: 1 training\n",
529 | "Training until validation scores don't improve for 50 rounds.\n",
530 | "Early stopping, best iteration is:\n",
531 | "[2053]\tvalid_0's l1: 14.8066\n",
532 | "fold: 2 training\n",
533 | "Training until validation scores don't improve for 50 rounds.\n",
534 | "Early stopping, best iteration is:\n",
535 | "[2221]\tvalid_0's l1: 14.5464\n",
536 | "fold: 3 training\n",
537 | "Training until validation scores don't improve for 50 rounds.\n",
538 | "Early stopping, best iteration is:\n",
539 | "[2348]\tvalid_0's l1: 14.5198\n",
540 | "fold: 4 training\n",
541 | "Training until validation scores don't improve for 50 rounds.\n",
542 | "Early stopping, best iteration is:\n",
543 | "[2207]\tvalid_0's l1: 14.8169\n",
544 | "fold: 0 training\n",
545 | "Training until validation scores don't improve for 50 rounds.\n",
546 | "Early stopping, best iteration is:\n",
547 | "[2110]\tvalid_0's l1: 14.5323\n",
548 | "fold: 1 training\n",
549 | "Training until validation scores don't improve for 50 rounds.\n",
550 | "Early stopping, best iteration is:\n",
551 | "[2627]\tvalid_0's l1: 14.8493\n",
552 | "fold: 2 training\n",
553 | "Training until validation scores don't improve for 50 rounds.\n",
554 | "Early stopping, best iteration is:\n",
555 | "[2040]\tvalid_0's l1: 14.8335\n",
556 | "fold: 3 training\n",
557 | "Training until validation scores don't improve for 50 rounds.\n",
558 | "Early stopping, best iteration is:\n",
559 | "[2241]\tvalid_0's l1: 14.6379\n",
560 | "fold: 4 training\n",
561 | "Training until validation scores don't improve for 50 rounds.\n",
562 | "Early stopping, best iteration is:\n",
563 | "[2424]\tvalid_0's l1: 14.6794\n"
564 | ]
565 | }
566 | ],
567 | "source": [
568 | "cv_pred_all2 = 0\n",
569 | "en_amount = 3\n",
570 | "for seed in range(en_amount):\n",
571 | " NFOLDS = 5\n",
572 | " train_label = train_data['score']\n",
573 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n",
574 | " kf = kfold.split(train_data, train_label)\n",
575 | "\n",
576 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
577 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
578 | "\n",
579 | "\n",
580 | " cv_pred = np.zeros(test_data.shape[0])\n",
581 | " valid_best_l2_all = 0\n",
582 | "\n",
583 | " feature_importance_df = pd.DataFrame()\n",
584 | " count = 0\n",
585 | " for i, (train_fold, validate) in enumerate(kf):\n",
586 | " print('fold: ',i, ' training')\n",
587 | " X_train, X_validate, label_train, label_validate = \\\n",
588 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
589 | " train_label[train_fold], train_label[validate]\n",
590 | " dtrain = lgb.Dataset(X_train, label_train)\n",
591 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
592 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
593 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
594 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
595 | "\n",
596 | "# fold_importance_df = pd.DataFrame()\n",
597 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
598 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
599 | "# fold_importance_df[\"fold\"] = count + 1\n",
600 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
601 | " count += 1\n",
602 | "\n",
603 | " cv_pred /= NFOLDS\n",
604 | " valid_best_l2_all /= NFOLDS\n",
605 | " \n",
606 | " cv_pred_all2 += cv_pred\n",
607 | " \n",
608 | "cv_pred_all2 /= en_amount\n",
609 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
610 | ]
611 | },
612 | {
613 | "cell_type": "code",
614 | "execution_count": 13,
615 | "metadata": {
616 | "collapsed": true
617 | },
618 | "outputs": [],
619 | "source": [
620 | "# display_importances(feature_importance_df)"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": null,
626 | "metadata": {
627 | "collapsed": true
628 | },
629 | "outputs": [],
630 | "source": [
631 | "baseline\n",
632 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
633 | " \n",
634 | "#充值金额是否为整数\n",
635 | "cv score for valid is: 0.06343660584697094\n",
636 | "#当月话费/半年话费\n",
637 | "cv score for valid is: 0.06349188259250227\n",
638 | "#当月话费/余额\n",
639 | "cv score for valid is: 0.06350638782547711\n",
640 | " \n",
641 | "#leaves 31\n",
642 | "cv score for valid is: 0.06354362406472286\n",
643 | "#remove l1, l2 = 5\n",
644 | "cv score for valid is: 0.06358730556250403\n",
645 | "#feature fraction 0.7\n",
646 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
647 | "max_depth 5, objective l1\n",
648 | "cv score for valid is: 0.06367445081783887\n",
649 | "feature fraction 0.6\n",
650 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
651 | "10 fold\n",
652 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n",
653 | "remove blk list flag\n",
654 | "cv score for valid is: 0.06377613710442855"
655 | ]
656 | },
657 | {
658 | "cell_type": "markdown",
659 | "metadata": {},
660 | "source": [
661 | "### Submit"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 14,
667 | "metadata": {},
668 | "outputs": [
669 | {
670 | "name": "stderr",
671 | "output_type": "stream",
672 | "text": [
673 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
674 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
675 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
676 | "\n",
677 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
678 | " \n",
679 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
680 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
681 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
682 | "\n",
683 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
684 | " after removing the cwd from sys.path.\n",
685 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
686 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
687 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
688 | "\n",
689 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
690 | " \"\"\"\n"
691 | ]
692 | }
693 | ],
694 | "source": [
695 | "test_data_sub = test_data[['uid']]\n",
696 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n",
697 | "test_data_sub.columns = ['id','score']\n",
698 | "test_data_sub['score1'] = cv_pred_all\n",
699 | "test_data_sub['score2'] = cv_pred_all2"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": 17,
705 | "metadata": {},
706 | "outputs": [
707 | {
708 | "name": "stderr",
709 | "output_type": "stream",
710 | "text": [
711 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
712 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
713 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
714 | "\n",
715 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
716 | " \"\"\"Entry point for launching an IPython kernel.\n"
717 | ]
718 | }
719 | ],
720 | "source": [
721 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
722 | ]
723 | },
724 | {
725 | "cell_type": "code",
726 | "execution_count": 18,
727 | "metadata": {
728 | "collapsed": true
729 | },
730 | "outputs": [],
731 | "source": [
732 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 31,
738 | "metadata": {},
739 | "outputs": [
740 | {
741 | "data": {
742 | "text/plain": [
743 | "617.8386873193765"
744 | ]
745 | },
746 | "execution_count": 31,
747 | "metadata": {},
748 | "output_type": "execute_result"
749 | }
750 | ],
751 | "source": [
752 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n",
753 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725"
754 | ]
755 | },
756 | {
757 | "cell_type": "code",
758 | "execution_count": null,
759 | "metadata": {
760 | "collapsed": true
761 | },
762 | "outputs": [],
763 | "source": []
764 | }
765 | ],
766 | "metadata": {
767 | "kernelspec": {
768 | "display_name": "Python 3",
769 | "language": "python",
770 | "name": "python3"
771 | },
772 | "language_info": {
773 | "codemirror_mode": {
774 | "name": "ipython",
775 | "version": 3
776 | },
777 | "file_extension": ".py",
778 | "mimetype": "text/x-python",
779 | "name": "python",
780 | "nbconvert_exporter": "python",
781 | "pygments_lexer": "ipython3",
782 | "version": "3.6.1"
783 | }
784 | },
785 | "nbformat": 4,
786 | "nbformat_minor": 2
787 | }
788 |
--------------------------------------------------------------------------------
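Both training cells in Baseline_bagging_version repeat the same pattern: 5-fold cross-validation wrapped in a loop over three StratifiedKFold seeds, with test predictions summed over folds and seeds, and the two runs (the regression_l1 params and the regression_l2 params2) blended 50/50 into the submission. A condensed sketch of that pattern as one reusable helper (hypothetical function name; the LightGBM calls mirror the notebook's LightGBM 2.x-era API):

```python
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold


def seed_bagged_predict(params, X, y, X_test, seeds, n_folds=5):
    """Average LightGBM test predictions over several CV seeds (editor's sketch)."""
    test_pred = np.zeros(len(X_test))
    for seed in seeds:
        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        for train_idx, valid_idx in kfold.split(X, y):
            dtrain = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
            dvalid = lgb.Dataset(X.iloc[valid_idx], y.iloc[valid_idx], reference=dtrain)
            # verbose_eval / early_stopping_rounds follow the signature used in the notebook
            bst = lgb.train(params, dtrain, num_boost_round=10000,
                            valid_sets=dvalid, verbose_eval=-1,
                            early_stopping_rounds=50)
            test_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
    # mean over every (seed, fold) model
    return test_pred / (len(seeds) * n_folds)


# Usage corresponding to the notebook's final blend (params / params2 as defined in the notebook):
# pred_mae = seed_bagged_predict(params,  train_data_use, train_label, test_data_use, seeds=[0, 1, 2])
# pred_mse = seed_bagged_predict(params2, train_data_use, train_label, test_data_use, seeds=[2019, 2020, 2021])
# blended  = (pred_mae + pred_mse) / 2
```

The only substantive difference between the two loops in the notebook is the objective (MAE vs. MSE) and the seed offset; the folds, early stopping, and prediction averaging are identical.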
/code/Baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 14,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#过年没时间做了,专心搞kaggle去了\n",
19 | "#kaggle玩家欢迎和我交流,ID是YourVenn"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
32 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
33 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
34 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
35 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
36 | ]
37 | }
38 | ],
39 | "source": [
40 | "import time\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import seaborn as sns\n",
43 | "import numpy as np\n",
44 | "import pandas as pd\n",
45 | "import lightgbm as lgb\n",
46 | "from sklearn.model_selection import StratifiedKFold\n",
47 | "from sklearn.preprocessing import LabelEncoder"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 | "### Input data"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 2,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "data_path = '../input/'\n",
66 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
67 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
68 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "### Pre-processing"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " 用户编码 | \n",
105 | " 用户实名制是否通过核实 | \n",
106 | " 用户年龄 | \n",
107 | " 是否大学生客户 | \n",
108 | " 是否黑名单客户 | \n",
109 | " 是否4G不健康客户 | \n",
110 | " 用户网龄(月) | \n",
111 | " 用户最近一次缴费距今时长(月) | \n",
112 | " 缴费用户最近一次缴费金额(元) | \n",
113 | " 用户近6个月平均消费值(元) | \n",
114 | " ... | \n",
115 | " 当月是否景点游览 | \n",
116 | " 当月是否体育场馆消费 | \n",
117 | " 当月网购类应用使用次数 | \n",
118 | " 当月物流快递类应用使用次数 | \n",
119 | " 当月金融理财类应用使用总次数 | \n",
120 | " 当月视频播放类应用使用次数 | \n",
121 | " 当月飞机类应用使用次数 | \n",
122 | " 当月火车类应用使用次数 | \n",
123 | " 当月旅游资讯类应用使用次数 | \n",
124 | " 信用分 | \n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " \n",
129 | " 0 | \n",
130 | " a4651f98c82948b186bdcdc8108381b4 | \n",
131 | " 1 | \n",
132 | " 44 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 0 | \n",
136 | " 186 | \n",
137 | " 1 | \n",
138 | " 99.8 | \n",
139 | " 163.86 | \n",
140 | " ... | \n",
141 | " 1 | \n",
142 | " 1 | \n",
143 | " 713 | \n",
144 | " 0 | \n",
145 | " 2740 | \n",
146 | " 7145 | \n",
147 | " 0 | \n",
148 | " 0 | \n",
149 | " 30 | \n",
150 | " 664 | \n",
151 | "
\n",
152 | " \n",
153 | "
\n",
154 | "
1 rows × 30 columns
\n",
155 | "
"
156 | ],
157 | "text/plain": [
158 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
159 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
160 | "\n",
161 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
162 | "0 0 186 1 99.8 163.86 ... \n",
163 | "\n",
164 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
165 | "0 1 1 713 0 2740 \n",
166 | "\n",
167 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
168 | "0 7145 0 0 30 664 \n",
169 | "\n",
170 | "[1 rows x 30 columns]"
171 | ]
172 | },
173 | "execution_count": 3,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "train_data.head(1)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
192 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
193 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
194 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
195 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
196 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
197 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
198 | " dtype='object')\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "#all chinese name- -\n",
204 | "#rename one by one\n",
205 | "print(train_data.columns)"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 5,
211 | "metadata": {
212 | "collapsed": true
213 | },
214 | "outputs": [],
215 | "source": [
216 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
217 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
218 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
219 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
220 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
221 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
222 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
223 | " 'tour_app_count','score']\n",
224 | "test_data.columns = train_data.columns[:-1]"
225 | ]
226 | },
227 | {
228 | "cell_type": "markdown",
229 | "metadata": {},
230 | "source": [
231 | "### Feature Engineering"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 6,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "name": "stderr",
241 | "output_type": "stream",
242 | "text": [
243 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: \n",
244 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
245 | "\n",
246 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
247 | " \n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
253 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
254 | "\n",
255 | "def produce_offline_feat(train_data):\n",
256 | " train_data['top_up_amount_offline'] = 0\n",
257 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0)&\\\n",
258 | " train_data['top_up_amount'] != 0] = 1\n",
259 | " return train_data\n",
260 | "\n",
261 | "train_data = produce_offline_feat(train_data)\n",
262 | "test_data = produce_offline_feat(test_data)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 7,
268 | "metadata": {
269 | "collapsed": true
270 | },
271 | "outputs": [],
272 | "source": [
273 | "def produce_fee_rate(train_data):\n",
274 | " #看importance,当月话费 和最近半年平均话费都很高,算一下当月/半年 -->稳定性\n",
275 | " train_data['current_fee_stability'] = \\\n",
276 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
277 | " \n",
278 | " #当月话费/当月账户余额\n",
279 | " train_data['use_left_rate'] = \\\n",
280 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
281 | " return train_data\n",
282 | "\n",
283 | "train_data = produce_fee_rate(train_data)\n",
284 | "test_data = produce_fee_rate(test_data)"
285 | ]
286 | },
287 | {
288 | "cell_type": "markdown",
289 | "metadata": {},
290 | "source": [
291 | "### Training"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 8,
297 | "metadata": {
298 | "collapsed": true
299 | },
300 | "outputs": [],
301 | "source": [
302 | "def display_importances(feature_importance_df_):\n",
303 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
304 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
305 | " plt.figure(figsize=(8, 10))\n",
306 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
307 | " plt.title('LightGBM Features (avg over folds)')\n",
308 | " plt.tight_layout()\n",
309 | " plt.show()"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 9,
315 | "metadata": {
316 | "collapsed": true
317 | },
318 | "outputs": [],
319 | "source": [
320 | "#para\n",
321 | "params = {\n",
322 | " 'learning_rate': 0.01,\n",
323 | " 'boosting_type': 'gbdt',\n",
324 | " 'objective': 'regression_l1',\n",
325 | " 'metric': 'mae',\n",
326 | " 'feature_fraction': 0.6,\n",
327 | " 'bagging_fraction': 0.8,\n",
328 | " 'bagging_freq': 2,\n",
329 | " 'num_leaves': 31,\n",
330 | " 'verbose': -1,\n",
331 | " 'max_depth': 5,\n",
332 | " 'lambda_l2': 5, 'lambda_l1': 0\n",
333 | "}"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 10,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "NFOLDS = 5\n",
345 | "train_label = train_data['score']\n",
346 | "kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2019)\n",
347 | "kf = kfold.split(train_data, train_label)"
348 | ]
349 | },
350 | {
351 | "cell_type": "code",
352 | "execution_count": 11,
353 | "metadata": {},
354 | "outputs": [
355 | {
356 | "data": {
357 | "text/plain": [
358 | "Index(['uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',\n",
359 | " '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff',\n",
360 | " 'top_up_amount', 'recent_6month_avg_use', 'total_account_fee',\n",
361 | " 'curr_month_balance', 'curr_overdue_flag', 'cost_sensitivity',\n",
362 | " 'connect_num', 'freq_shopping_flag', 'recent_3month_shopping_count',\n",
363 | " 'wanda_flag', 'sam_flag', 'movie_flag', 'tour_flag', 'sport_flag',\n",
364 | " 'online_shopping_count', 'express_count', 'finance_app_count',\n",
365 | " 'video_app_count', 'flight_count', 'train_count', 'tour_app_count',\n",
366 | " 'score', 'top_up_amount_offline', 'current_fee_stability',\n",
367 | " 'use_left_rate'],\n",
368 | " dtype='object')"
369 | ]
370 | },
371 | "execution_count": 11,
372 | "metadata": {},
373 | "output_type": "execute_result"
374 | }
375 | ],
376 | "source": [
377 | "train_data.columns"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 12,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
389 | "test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 13,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stderr",
399 | "output_type": "stream",
400 | "text": [
401 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
402 | " % (min_groups, self.n_splits)), Warning)\n"
403 | ]
404 | },
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "fold: 0 training\n",
410 | "Training until validation scores don't improve for 50 rounds.\n",
411 | "Early stopping, best iteration is:\n",
412 | "[2834]\tvalid_0's l1: 14.7519\n",
413 | "fold: 1 training\n",
414 | "Training until validation scores don't improve for 50 rounds.\n",
415 | "Early stopping, best iteration is:\n",
416 | "[2780]\tvalid_0's l1: 14.6775\n",
417 | "fold: 2 training\n",
418 | "Training until validation scores don't improve for 50 rounds.\n",
419 | "Early stopping, best iteration is:\n",
420 | "[3745]\tvalid_0's l1: 14.728\n",
421 | "fold: 3 training\n",
422 | "Training until validation scores don't improve for 50 rounds.\n",
423 | "Early stopping, best iteration is:\n",
424 | "[3009]\tvalid_0's l1: 14.46\n",
425 | "fold: 4 training\n",
426 | "Training until validation scores don't improve for 50 rounds.\n",
427 | "Early stopping, best iteration is:\n",
428 | "[2544]\tvalid_0's l1: 14.7818\n",
429 | "cv score for valid is: 0.06377613710442855\n"
430 | ]
431 | }
432 | ],
433 | "source": [
434 | "cv_pred = np.zeros(test_data.shape[0])\n",
435 | "valid_best_l2_all = 0\n",
436 | "\n",
437 | "feature_importance_df = pd.DataFrame()\n",
438 | "count = 0\n",
439 | "for i, (train_fold, validate) in enumerate(kf):\n",
440 | " print('fold: ',i, ' training')\n",
441 | " X_train, X_validate, label_train, label_validate = \\\n",
442 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
443 | " train_label[train_fold], train_label[validate]\n",
444 | " dtrain = lgb.Dataset(X_train, label_train)\n",
445 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
446 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
447 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
448 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
449 | "\n",
450 | " fold_importance_df = pd.DataFrame()\n",
451 | " fold_importance_df[\"feature\"] = list(X_train.columns)\n",
452 | " fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='gain', iteration=bst.best_iteration)\n",
453 | " fold_importance_df[\"fold\"] = count + 1\n",
454 | " feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
455 | " count += 1\n",
456 | "\n",
457 | "cv_pred /= NFOLDS\n",
458 | "valid_best_l2_all /= NFOLDS\n",
459 | "print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 15,
465 | "metadata": {
466 | "collapsed": true
467 | },
468 | "outputs": [],
469 | "source": [
470 | "display_importances(feature_importance_df)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": null,
476 | "metadata": {
477 | "collapsed": true
478 | },
479 | "outputs": [],
480 | "source": [
481 | "baseline\n",
482 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
483 | " \n",
484 | "#充值金额是否为整数\n",
485 | "cv score for valid is: 0.06343660584697094\n",
486 | "#当月话费/半年话费\n",
487 | "cv score for valid is: 0.06349188259250227\n",
488 | "#当月话费/余额\n",
489 | "cv score for valid is: 0.06350638782547711\n",
490 | " \n",
491 | "#leaves 31\n",
492 | "cv score for valid is: 0.06354362406472286\n",
493 | "#remove l1, l2 = 5\n",
494 | "cv score for valid is: 0.06358730556250403\n",
495 | "#feature fraction 0.7\n",
496 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
497 | "max_depth 5, objective l1\n",
498 | "cv score for valid is: 0.06367445081783887\n",
499 | "feature fraction 0.6\n",
500 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
501 | "remove blk flag\n",
502 | "cv score for valid is: 0.06377613710442855"
503 | ]
504 | },
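{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sketch added for clarity (not part of the original run): the 'cv score' values logged above\n",
"#are 1/(1 + mean validation MAE), as printed in the training cell, so they can be inverted\n",
"#back to the average MAE of the five folds.\n",
"cv_score = 0.06377613710442855   #final local cv score from the log above\n",
"mae = 1 / cv_score - 1\n",
"print(round(mae, 4))   #14.6798, consistent with the per-fold valid_0 l1 values"
]
},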
505 | {
506 | "cell_type": "markdown",
507 | "metadata": {},
508 | "source": [
509 | "### Submit"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 39,
515 | "metadata": {},
516 | "outputs": [
517 | {
518 | "name": "stderr",
519 | "output_type": "stream",
520 | "text": [
521 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
522 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
523 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
524 | "\n",
525 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
526 | " \n"
527 | ]
528 | }
529 | ],
530 | "source": [
531 | "test_data_sub = test_data[['uid']]\n",
532 | "test_data_sub['score'] = cv_pred\n",
533 | "test_data_sub.columns = ['id','score']"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 40,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stderr",
543 | "output_type": "stream",
544 | "text": [
545 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
546 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
547 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
548 | "\n",
549 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
550 | " \"\"\"Entry point for launching an IPython kernel.\n"
551 | ]
552 | }
553 | ],
554 | "source": [
555 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 41,
561 | "metadata": {
562 | "collapsed": true
563 | },
564 | "outputs": [],
565 | "source": [
566 | "test_data_sub.to_csv('../output/baseline_63776.csv', index=False)"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": null,
572 | "metadata": {
573 | "collapsed": true
574 | },
575 | "outputs": [],
576 | "source": []
577 | }
578 | ],
579 | "metadata": {
580 | "kernelspec": {
581 | "display_name": "Python 3",
582 | "language": "python",
583 | "name": "python3"
584 | },
585 | "language_info": {
586 | "codemirror_mode": {
587 | "name": "ipython",
588 | "version": 3
589 | },
590 | "file_extension": ".py",
591 | "mimetype": "text/x-python",
592 | "name": "python",
593 | "nbconvert_exporter": "python",
594 | "pygments_lexer": "ipython3",
595 | "version": "3.6.1"
596 | }
597 | },
598 | "nbformat": 4,
599 | "nbformat_minor": 2
600 | }
601 |
--------------------------------------------------------------------------------
/code/Baseline_bagging_version.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### Packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "#在local cv 6377的基础上,加上MSE优化,MAE & MAE各自用N个seed 最终加权平均"
19 | ]
20 | },
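{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Sketch added for clarity (dummy values, not the real predictions): the scheme below trains the\n",
"#same LightGBM setup under an MAE objective and an MSE objective, each over several CV seeds,\n",
"#averages the test predictions within each objective, then blends the two bags with equal weights.\n",
"import numpy as np\n",
"pred_mae_bag = np.array([610.0, 655.0])   #placeholder: seed-averaged predictions from the MAE runs\n",
"pred_mse_bag = np.array([612.0, 651.0])   #placeholder: seed-averaged predictions from the MSE runs\n",
"blended = (pred_mae_bag + pred_mse_bag) / 2\n",
"print(blended)   #[611. 653.]"
]
},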
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stderr",
28 | "output_type": "stream",
29 | "text": [
30 | "/Users/Venn/anaconda/lib/python3.6/site-packages/lightgbm/__init__.py:46: UserWarning: Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_8.3.1) compiler.\n",
31 | "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
32 | "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
33 | "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
34 | " \"You can install the OpenMP library by the following command: ``brew install libomp``.\", UserWarning)\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "import time\n",
40 | "import matplotlib.pyplot as plt\n",
41 | "import seaborn as sns\n",
42 | "import numpy as np\n",
43 | "import pandas as pd\n",
44 | "import lightgbm as lgb\n",
45 | "from sklearn.model_selection import StratifiedKFold\n",
46 | "from sklearn.preprocessing import LabelEncoder"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "### Input data"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "data_path = '../input/'\n",
65 | "train_data = pd.read_csv(data_path + 'train_dataset.csv')\n",
66 | "test_data = pd.read_csv(data_path + 'test_dataset.csv')\n",
67 | "sample_sub = pd.read_csv(data_path + 'submit_example.csv')"
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {},
73 | "source": [
74 | "### Pre-processing"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 3,
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "data": {
84 | "text/html": [
85 | "\n",
86 | "\n",
99 | "
\n",
100 | " \n",
101 | " \n",
102 | " | \n",
103 | " 用户编码 | \n",
104 | " 用户实名制是否通过核实 | \n",
105 | " 用户年龄 | \n",
106 | " 是否大学生客户 | \n",
107 | " 是否黑名单客户 | \n",
108 | " 是否4G不健康客户 | \n",
109 | " 用户网龄(月) | \n",
110 | " 用户最近一次缴费距今时长(月) | \n",
111 | " 缴费用户最近一次缴费金额(元) | \n",
112 | " 用户近6个月平均消费值(元) | \n",
113 | " ... | \n",
114 | " 当月是否景点游览 | \n",
115 | " 当月是否体育场馆消费 | \n",
116 | " 当月网购类应用使用次数 | \n",
117 | " 当月物流快递类应用使用次数 | \n",
118 | " 当月金融理财类应用使用总次数 | \n",
119 | " 当月视频播放类应用使用次数 | \n",
120 | " 当月飞机类应用使用次数 | \n",
121 | " 当月火车类应用使用次数 | \n",
122 | " 当月旅游资讯类应用使用次数 | \n",
123 | " 信用分 | \n",
124 | "
\n",
125 | " \n",
126 | " \n",
127 | " \n",
128 | " 0 | \n",
129 | " a4651f98c82948b186bdcdc8108381b4 | \n",
130 | " 1 | \n",
131 | " 44 | \n",
132 | " 0 | \n",
133 | " 0 | \n",
134 | " 0 | \n",
135 | " 186 | \n",
136 | " 1 | \n",
137 | " 99.8 | \n",
138 | " 163.86 | \n",
139 | " ... | \n",
140 | " 1 | \n",
141 | " 1 | \n",
142 | " 713 | \n",
143 | " 0 | \n",
144 | " 2740 | \n",
145 | " 7145 | \n",
146 | " 0 | \n",
147 | " 0 | \n",
148 | " 30 | \n",
149 | " 664 | \n",
150 | "
\n",
151 | " \n",
152 | "
\n",
153 | "
1 rows × 30 columns
\n",
154 | "
"
155 | ],
156 | "text/plain": [
157 | " 用户编码 用户实名制是否通过核实 用户年龄 是否大学生客户 是否黑名单客户 \\\n",
158 | "0 a4651f98c82948b186bdcdc8108381b4 1 44 0 0 \n",
159 | "\n",
160 | " 是否4G不健康客户 用户网龄(月) 用户最近一次缴费距今时长(月) 缴费用户最近一次缴费金额(元) 用户近6个月平均消费值(元) ... \\\n",
161 | "0 0 186 1 99.8 163.86 ... \n",
162 | "\n",
163 | " 当月是否景点游览 当月是否体育场馆消费 当月网购类应用使用次数 当月物流快递类应用使用次数 当月金融理财类应用使用总次数 \\\n",
164 | "0 1 1 713 0 2740 \n",
165 | "\n",
166 | " 当月视频播放类应用使用次数 当月飞机类应用使用次数 当月火车类应用使用次数 当月旅游资讯类应用使用次数 信用分 \n",
167 | "0 7145 0 0 30 664 \n",
168 | "\n",
169 | "[1 rows x 30 columns]"
170 | ]
171 | },
172 | "execution_count": 3,
173 | "metadata": {},
174 | "output_type": "execute_result"
175 | }
176 | ],
177 | "source": [
178 | "train_data.head(1)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 4,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "Index(['用户编码', '用户实名制是否通过核实', '用户年龄', '是否大学生客户', '是否黑名单客户', '是否4G不健康客户',\n",
191 | " '用户网龄(月)', '用户最近一次缴费距今时长(月)', '缴费用户最近一次缴费金额(元)', '用户近6个月平均消费值(元)',\n",
192 | " '用户账单当月总费用(元)', '用户当月账户余额(元)', '缴费用户当前是否欠费缴费', '用户话费敏感度', '当月通话交往圈人数',\n",
193 | " '是否经常逛商场的人', '近三个月月均商场出现次数', '当月是否逛过福州仓山万达', '当月是否到过福州山姆会员店', '当月是否看电影',\n",
194 | " '当月是否景点游览', '当月是否体育场馆消费', '当月网购类应用使用次数', '当月物流快递类应用使用次数',\n",
195 | " '当月金融理财类应用使用总次数', '当月视频播放类应用使用次数', '当月飞机类应用使用次数', '当月火车类应用使用次数',\n",
196 | " '当月旅游资讯类应用使用次数', '信用分'],\n",
197 | " dtype='object')\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "#all chinese name- -\n",
203 | "#rename one by one\n",
204 | "print(train_data.columns)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 5,
210 | "metadata": {
211 | "collapsed": true
212 | },
213 | "outputs": [],
214 | "source": [
215 | "train_data.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',\\\n",
216 | " '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',\\\n",
217 | " 'recent_6month_avg_use','total_account_fee','curr_month_balance',\\\n",
218 | " 'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',\\\n",
219 | " 'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',\\\n",
220 | " 'tour_flag','sport_flag','online_shopping_count','express_count',\\\n",
221 | " 'finance_app_count','video_app_count','flight_count','train_count',\\\n",
222 | " 'tour_app_count','score']\n",
223 | "test_data.columns = train_data.columns[:-1]"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": null,
229 | "metadata": {
230 | "collapsed": true
231 | },
232 | "outputs": [],
233 | "source": []
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### Feature Engineering"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 6,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stderr",
249 | "output_type": "stream",
250 | "text": [
251 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:11: SettingWithCopyWarning: \n",
252 | "A value is trying to be set on a copy of a slice from a DataFrame\n",
253 | "\n",
254 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
255 | " # This is added back by InteractiveShellApp.init_path()\n"
256 | ]
257 | }
258 | ],
259 | "source": [
260 | "#age and net_age_in_month ---> 入网时的年龄 --- useless\n",
261 | "#先前余额,当前余额 + 当月话费 - 上次缴费 --- useless\n",
262 | "#充值金额/余额 --- useless\n",
263 | "#当月话费/最近充值金额 --- useless\n",
264 | "#六个月均值/充值金额 --- useless\n",
265 | "\n",
266 | "#top up amount, 充值金额是整数,和小数,应该对应不同的充值途径?\n",
267 | "\n",
268 | "def produce_offline_feat(train_data):\n",
269 | " train_data['top_up_amount_offline'] = 0\n",
270 | " train_data['top_up_amount_offline'][(train_data['top_up_amount'] % 10 == 0) &\\\n",
271 | " (train_data['top_up_amount'] != 0)] = 1\n",
272 | " return train_data\n",
273 | "\n",
274 | "train_data = produce_offline_feat(train_data)\n",
275 | "test_data = produce_offline_feat(test_data)"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 7,
281 | "metadata": {
282 | "collapsed": true
283 | },
284 | "outputs": [],
285 | "source": [
286 | "def produce_fee_rate(train_data):\n",
287 | " #current-month fee and the 6-month average fee both rank high in feature importance; their ratio --> spending stability\n",
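" # the +1 in each denominator guards against division by zero\n",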
288 | " train_data['current_fee_stability'] = \\\n",
289 | " train_data['total_account_fee']/(train_data['recent_6month_avg_use'] + 1)\n",
290 | " \n",
291 | " #current-month fee / current-month account balance\n",
292 | " train_data['use_left_rate'] = \\\n",
293 | " train_data['total_account_fee']/(train_data['curr_month_balance'] + 1)\n",
294 | " return train_data\n",
295 | "\n",
296 | "train_data = produce_fee_rate(train_data)\n",
297 | "test_data = produce_fee_rate(test_data)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "metadata": {},
303 | "source": [
304 | "### Training"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 8,
310 | "metadata": {
311 | "collapsed": true
312 | },
313 | "outputs": [],
314 | "source": [
315 | "def display_importances(feature_importance_df_):\n",
316 | " cols = feature_importance_df_[[\"feature\", \"importance\"]].groupby(\"feature\").mean().sort_values(by=\"importance\", ascending=False)[:40].index\n",
317 | " best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]\n",
318 | " plt.figure(figsize=(8, 10))\n",
319 | " sns.barplot(x=\"importance\", y=\"feature\", data=best_features.sort_values(by=\"importance\", ascending=False))\n",
320 | " plt.title('LightGBM Features (avg over folds)')\n",
321 | " plt.tight_layout()\n",
322 | " plt.show()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 9,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "#para\n",
334 | "params = {\n",
335 | " 'learning_rate': 0.01,\n",
336 | " 'boosting_type': 'gbdt',\n",
337 | " 'objective': 'regression_l1',\n",
338 | " 'metric': 'mae',\n",
339 | " 'feature_fraction': 0.6,\n",
340 | " 'bagging_fraction': 0.8,\n",
341 | " 'bagging_freq': 2,\n",
342 | " 'num_leaves': 31,\n",
343 | " 'verbose': -1,\n",
344 | " 'max_depth': 5,\n",
345 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8\n",
346 | "}"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": 10,
352 | "metadata": {
353 | "collapsed": true
354 | },
355 | "outputs": [],
356 | "source": [
357 | "#para\n",
358 | "params2 = {\n",
359 | " 'learning_rate': 0.01,\n",
360 | " 'boosting_type': 'gbdt',\n",
361 | " 'objective': 'regression_l2',\n",
362 | " 'metric': 'mae',\n",
363 | " 'feature_fraction': 0.6,\n",
364 | " 'bagging_fraction': 0.8,\n",
365 | " 'bagging_freq': 2,\n",
366 | " 'num_leaves': 31,\n",
367 | " 'verbose': -1,\n",
368 | " 'max_depth': 5,\n",
369 | " 'lambda_l2': 5, 'lambda_l1': 0,'nthread': 8,\n",
370 | " 'seed': 89\n",
371 | "}"
372 | ]
373 | },
374 | {
375 | "cell_type": "code",
376 | "execution_count": 11,
377 | "metadata": {},
378 | "outputs": [
379 | {
380 | "name": "stderr",
381 | "output_type": "stream",
382 | "text": [
383 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
384 | " % (min_groups, self.n_splits)), Warning)\n"
385 | ]
386 | },
387 | {
388 | "name": "stdout",
389 | "output_type": "stream",
390 | "text": [
391 | "fold: 0 training\n",
392 | "Training until validation scores don't improve for 50 rounds.\n",
393 | "Early stopping, best iteration is:\n",
394 | "[2433]\tvalid_0's l1: 14.7441\n",
395 | "fold: 1 training\n",
396 | "Training until validation scores don't improve for 50 rounds.\n",
397 | "Early stopping, best iteration is:\n",
398 | "[1876]\tvalid_0's l1: 14.8595\n",
399 | "fold: 2 training\n",
400 | "Training until validation scores don't improve for 50 rounds.\n",
401 | "Early stopping, best iteration is:\n",
402 | "[2459]\tvalid_0's l1: 14.7082\n",
403 | "fold: 3 training\n",
404 | "Training until validation scores don't improve for 50 rounds.\n",
405 | "Early stopping, best iteration is:\n",
406 | "[2468]\tvalid_0's l1: 14.6564\n",
407 | "fold: 4 training\n",
408 | "Training until validation scores don't improve for 50 rounds.\n",
409 | "Early stopping, best iteration is:\n",
410 | "[2599]\tvalid_0's l1: 14.5114\n",
411 | "fold: 0 training\n",
412 | "Training until validation scores don't improve for 50 rounds.\n",
413 | "Early stopping, best iteration is:\n",
414 | "[3313]\tvalid_0's l1: 14.743\n",
415 | "fold: 1 training\n",
416 | "Training until validation scores don't improve for 50 rounds.\n",
417 | "Early stopping, best iteration is:\n",
418 | "[2590]\tvalid_0's l1: 14.8562\n",
419 | "fold: 2 training\n",
420 | "Training until validation scores don't improve for 50 rounds.\n",
421 | "Early stopping, best iteration is:\n",
422 | "[2523]\tvalid_0's l1: 14.5752\n",
423 | "fold: 3 training\n",
424 | "Training until validation scores don't improve for 50 rounds.\n",
425 | "Early stopping, best iteration is:\n",
426 | "[3564]\tvalid_0's l1: 14.6125\n",
427 | "fold: 4 training\n",
428 | "Training until validation scores don't improve for 50 rounds.\n",
429 | "Early stopping, best iteration is:\n",
430 | "[1853]\tvalid_0's l1: 14.6333\n",
431 | "fold: 0 training\n",
432 | "Training until validation scores don't improve for 50 rounds.\n",
433 | "Early stopping, best iteration is:\n",
434 | "[2851]\tvalid_0's l1: 14.9587\n",
435 | "fold: 1 training\n",
436 | "Training until validation scores don't improve for 50 rounds.\n",
437 | "Early stopping, best iteration is:\n",
438 | "[1875]\tvalid_0's l1: 14.7808\n",
439 | "fold: 2 training\n",
440 | "Training until validation scores don't improve for 50 rounds.\n",
441 | "Early stopping, best iteration is:\n",
442 | "[2957]\tvalid_0's l1: 14.5525\n",
443 | "fold: 3 training\n",
444 | "Training until validation scores don't improve for 50 rounds.\n",
445 | "Early stopping, best iteration is:\n",
446 | "[2723]\tvalid_0's l1: 14.4804\n",
447 | "fold: 4 training\n",
448 | "Training until validation scores don't improve for 50 rounds.\n",
449 | "Early stopping, best iteration is:\n",
450 | "[3311]\tvalid_0's l1: 14.6854\n"
451 | ]
452 | }
453 | ],
454 | "source": [
455 | "cv_pred_all = 0\n",
456 | "en_amount = 3\n",
457 | "for seed in range(en_amount):\n",
458 | " NFOLDS = 5\n",
459 | " train_label = train_data['score']\n",
460 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed)\n",
461 | " kf = kfold.split(train_data, train_label)\n",
462 | "\n",
463 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
464 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
465 | "\n",
466 | "\n",
467 | " cv_pred = np.zeros(test_data.shape[0])\n",
468 | " valid_best_l2_all = 0\n",
469 | "\n",
470 | " feature_importance_df = pd.DataFrame()\n",
471 | " count = 0\n",
472 | " for i, (train_fold, validate) in enumerate(kf):\n",
473 | " print('fold: ',i, ' training')\n",
474 | " X_train, X_validate, label_train, label_validate = \\\n",
475 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
476 | " train_label[train_fold], train_label[validate]\n",
477 | " dtrain = lgb.Dataset(X_train, label_train)\n",
478 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
479 | " bst = lgb.train(params, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
480 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
481 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
482 | "\n",
483 | "# fold_importance_df = pd.DataFrame()\n",
484 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
485 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
486 | "# fold_importance_df[\"fold\"] = count + 1\n",
487 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
488 | " count += 1\n",
489 | "\n",
490 | " cv_pred /= NFOLDS\n",
491 | " valid_best_l2_all /= NFOLDS\n",
492 | " \n",
493 | " cv_pred_all += cv_pred\n",
494 | "cv_pred_all /= en_amount\n",
495 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 12,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stderr",
505 | "output_type": "stream",
506 | "text": [
507 | "/Users/Venn/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:605: Warning: The least populated class in y has only 1 members, which is too few. The minimum number of members in any class cannot be less than n_splits=5.\n",
508 | " % (min_groups, self.n_splits)), Warning)\n"
509 | ]
510 | },
511 | {
512 | "name": "stdout",
513 | "output_type": "stream",
514 | "text": [
515 | "fold: 0 training\n",
516 | "Training until validation scores don't improve for 50 rounds.\n",
517 | "Early stopping, best iteration is:\n",
518 | "[2457]\tvalid_0's l1: 14.7871\n",
519 | "fold: 1 training\n",
520 | "Training until validation scores don't improve for 50 rounds.\n",
521 | "Early stopping, best iteration is:\n",
522 | "[2365]\tvalid_0's l1: 14.6983\n",
523 | "fold: 2 training\n",
524 | "Training until validation scores don't improve for 50 rounds.\n",
525 | "Early stopping, best iteration is:\n",
526 | "[2082]\tvalid_0's l1: 14.7999\n",
527 | "fold: 3 training\n",
528 | "Training until validation scores don't improve for 50 rounds.\n",
529 | "Early stopping, best iteration is:\n",
530 | "[2266]\tvalid_0's l1: 14.483\n",
531 | "fold: 4 training\n",
532 | "Training until validation scores don't improve for 50 rounds.\n",
533 | "Early stopping, best iteration is:\n",
534 | "[2046]\tvalid_0's l1: 14.7681\n",
535 | "fold: 0 training\n",
536 | "Training until validation scores don't improve for 50 rounds.\n",
537 | "Early stopping, best iteration is:\n",
538 | "[2436]\tvalid_0's l1: 14.7728\n",
539 | "fold: 1 training\n",
540 | "Training until validation scores don't improve for 50 rounds.\n",
541 | "Early stopping, best iteration is:\n",
542 | "[2053]\tvalid_0's l1: 14.8066\n",
543 | "fold: 2 training\n",
544 | "Training until validation scores don't improve for 50 rounds.\n",
545 | "Early stopping, best iteration is:\n",
546 | "[2221]\tvalid_0's l1: 14.5464\n",
547 | "fold: 3 training\n",
548 | "Training until validation scores don't improve for 50 rounds.\n",
549 | "Early stopping, best iteration is:\n",
550 | "[2348]\tvalid_0's l1: 14.5198\n",
551 | "fold: 4 training\n",
552 | "Training until validation scores don't improve for 50 rounds.\n",
553 | "Early stopping, best iteration is:\n",
554 | "[2207]\tvalid_0's l1: 14.8169\n",
555 | "fold: 0 training\n",
556 | "Training until validation scores don't improve for 50 rounds.\n",
557 | "Early stopping, best iteration is:\n",
558 | "[2110]\tvalid_0's l1: 14.5323\n",
559 | "fold: 1 training\n",
560 | "Training until validation scores don't improve for 50 rounds.\n",
561 | "Early stopping, best iteration is:\n",
562 | "[2627]\tvalid_0's l1: 14.8493\n",
563 | "fold: 2 training\n",
564 | "Training until validation scores don't improve for 50 rounds.\n",
565 | "Early stopping, best iteration is:\n",
566 | "[2040]\tvalid_0's l1: 14.8335\n",
567 | "fold: 3 training\n",
568 | "Training until validation scores don't improve for 50 rounds.\n",
569 | "Early stopping, best iteration is:\n",
570 | "[2241]\tvalid_0's l1: 14.6379\n",
571 | "fold: 4 training\n",
572 | "Training until validation scores don't improve for 50 rounds.\n",
573 | "Early stopping, best iteration is:\n",
574 | "[2424]\tvalid_0's l1: 14.6794\n"
575 | ]
576 | }
577 | ],
578 | "source": [
579 | "cv_pred_all2 = 0\n",
580 | "en_amount = 3\n",
581 | "for seed in range(en_amount):\n",
582 | " NFOLDS = 5\n",
583 | " train_label = train_data['score']\n",
584 | " kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=(seed + 2019))\n",
585 | " kf = kfold.split(train_data, train_label)\n",
586 | "\n",
587 | " train_data_use = train_data.drop(['uid','score','blk_list_flag'], axis=1)\n",
588 | " test_data_use = test_data.drop(['uid','blk_list_flag'], axis=1)\n",
589 | "\n",
590 | "\n",
591 | " cv_pred = np.zeros(test_data.shape[0])\n",
592 | " valid_best_l2_all = 0\n",
593 | "\n",
594 | " feature_importance_df = pd.DataFrame()\n",
595 | " count = 0\n",
596 | " for i, (train_fold, validate) in enumerate(kf):\n",
597 | " print('fold: ',i, ' training')\n",
598 | " X_train, X_validate, label_train, label_validate = \\\n",
599 | " train_data_use.iloc[train_fold, :], train_data_use.iloc[validate, :], \\\n",
600 | " train_label[train_fold], train_label[validate]\n",
601 | " dtrain = lgb.Dataset(X_train, label_train)\n",
602 | " dvalid = lgb.Dataset(X_validate, label_validate, reference=dtrain)\n",
603 | " bst = lgb.train(params2, dtrain, num_boost_round=10000, valid_sets=dvalid, verbose_eval=-1,early_stopping_rounds=50)\n",
604 | " cv_pred += bst.predict(test_data_use, num_iteration=bst.best_iteration)\n",
605 | " valid_best_l2_all += bst.best_score['valid_0']['l1']\n",
606 | "\n",
607 | "# fold_importance_df = pd.DataFrame()\n",
608 | "# fold_importance_df[\"feature\"] = list(X_train.columns)\n",
609 | "# fold_importance_df[\"importance\"] = bst.feature_importance(importance_type='split', iteration=bst.best_iteration)\n",
610 | "# fold_importance_df[\"fold\"] = count + 1\n",
611 | "# feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)\n",
612 | " count += 1\n",
613 | "\n",
614 | " cv_pred /= NFOLDS\n",
615 | " valid_best_l2_all /= NFOLDS\n",
616 | " \n",
617 | " cv_pred_all2 += cv_pred\n",
618 | " \n",
619 | "cv_pred_all2 /= en_amount\n",
620 | " #print('cv score for valid is: ', 1/(1+valid_best_l2_all))"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 13,
626 | "metadata": {
627 | "collapsed": true
628 | },
629 | "outputs": [],
630 | "source": [
631 | "# display_importances(feature_importance_df)"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": null,
637 | "metadata": {
638 | "collapsed": true
639 | },
640 | "outputs": [],
641 | "source": [
642 | "baseline\n",
643 | "cv score for valid is: 0.06342856152921912 --- 0.06339265000\n",
644 | " \n",
645 | "#充值金额是否为整数\n",
646 | "cv score for valid is: 0.06343660584697094\n",
647 | "#当月话费/半年话费\n",
648 | "cv score for valid is: 0.06349188259250227\n",
649 | "#当月话费/余额\n",
650 | "cv score for valid is: 0.06350638782547711\n",
651 | " \n",
652 | "#leaves 31\n",
653 | "cv score for valid is: 0.06354362406472286\n",
654 | "#remove l1, l2 = 5\n",
655 | "cv score for valid is: 0.06358730556250403\n",
656 | "#feature fraction 0.7\n",
657 | "cv score for valid is: 0.06361478051326884 --- 0.06355141000\n",
658 | "max_depth 5, objective l1\n",
659 | "cv score for valid is: 0.06367445081783887\n",
660 | "feature fraction 0.6\n",
661 | "cv score for valid is: 0.06377264215140695 --- 0.06379867000\n",
662 | "10 fold\n",
663 | "cv score for valid is: 0.0637915578042461 --- 6378 --- useless\n",
664 | "remove blk list flag\n",
665 | "cv score for valid is: 0.06377613710442855"
666 | ]
667 | },
668 | {
669 | "cell_type": "markdown",
670 | "metadata": {},
671 | "source": [
672 | "### Submit"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": 14,
678 | "metadata": {},
679 | "outputs": [
680 | {
681 | "name": "stderr",
682 | "output_type": "stream",
683 | "text": [
684 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
685 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
686 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
687 | "\n",
688 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
689 | " \n",
690 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n",
691 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
692 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
693 | "\n",
694 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
695 | " after removing the cwd from sys.path.\n",
696 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: \n",
697 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
698 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
699 | "\n",
700 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
701 | " \"\"\"\n"
702 | ]
703 | }
704 | ],
705 | "source": [
706 | "test_data_sub = test_data[['uid']]\n",
707 | "test_data_sub['score'] = (cv_pred_all2 + cv_pred_all)/2\n",
708 | "test_data_sub.columns = ['id','score']\n",
709 | "test_data_sub['score1'] = cv_pred_all\n",
710 | "test_data_sub['score2'] = cv_pred_all2"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 17,
716 | "metadata": {},
717 | "outputs": [
718 | {
719 | "name": "stderr",
720 | "output_type": "stream",
721 | "text": [
722 | "/Users/Venn/anaconda/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
723 | "A value is trying to be set on a copy of a slice from a DataFrame.\n",
724 | "Try using .loc[row_indexer,col_indexer] = value instead\n",
725 | "\n",
726 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
727 | " \"\"\"Entry point for launching an IPython kernel.\n"
728 | ]
729 | }
730 | ],
731 | "source": [
732 | "test_data_sub['score'] = test_data_sub['score'].apply(lambda x: int(np.round(x)))"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 18,
738 | "metadata": {
739 | "collapsed": true
740 | },
741 | "outputs": [],
742 | "source": [
743 | "test_data_sub[['id','score']].to_csv('../output/baseline_6377_mae_mse_mean_6bagging.csv', index=False)"
744 | ]
745 | },
746 | {
747 | "cell_type": "code",
748 | "execution_count": 31,
749 | "metadata": {},
750 | "outputs": [
751 | {
752 | "data": {
753 | "text/plain": [
754 | "617.8386873193765"
755 | ]
756 | },
757 | "execution_count": 31,
758 | "metadata": {},
759 | "output_type": "execute_result"
760 | }
761 | ],
762 | "source": [
763 | "#mean is: 1/(0.00161593) - 1, --- 617.8386873193765\n",
764 | "#std is around: 1/(0.02869282) - 1, --- 33.851924627833725"
765 | ]
766 | },
767 | {
768 | "cell_type": "code",
769 | "execution_count": null,
770 | "metadata": {
771 | "collapsed": true
772 | },
773 | "outputs": [],
774 | "source": []
775 | }
776 | ],
777 | "metadata": {
778 | "kernelspec": {
779 | "display_name": "Python 3",
780 | "language": "python",
781 | "name": "python3"
782 | },
783 | "language_info": {
784 | "codemirror_mode": {
785 | "name": "ipython",
786 | "version": 3
787 | },
788 | "file_extension": ".py",
789 | "mimetype": "text/x-python",
790 | "name": "python",
791 | "nbconvert_exporter": "python",
792 | "pygments_lexer": "ipython3",
793 | "version": "3.6.1"
794 | }
795 | },
796 | "nbformat": 4,
797 | "nbformat_minor": 2
798 | }
799 |
--------------------------------------------------------------------------------