├── 1.feature.ipynb ├── 2.fold_model.ipynb ├── 3.offline_model.ipynb ├── 4.online_model.ipynb └── README.md /1.feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import os\n", 12 | "from sklearn.preprocessing import LabelEncoder\n", 13 | "from tqdm import tqdm\n", 14 | "import lightgbm as lgb\n", 15 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 16 | "import warnings\n", 17 | "from sklearn.metrics import f1_score, classification_report\n", 18 | "import gc\n", 19 | "import xgboost as xgb\n", 20 | "from scipy import stats\n", 21 | "import datetime\n", 22 | "import time\n", 23 | "from scipy.stats import entropy, kurtosis\n", 24 | "import multiprocessing\n", 25 | "from gensim.models.word2vec import LineSentence\n", 26 | "from gensim.corpora import WikiCorpus\n", 27 | "from gensim.models import Word2Vec\n", 28 | "tqdm.pandas()\n", 29 | "\n", 30 | "pd.set_option('display.max_columns', None)\n", 31 | "pd.set_option('display.max_rows', None)\n", 32 | "\n", 33 | "warnings.filterwarnings('ignore')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "df_train = pd.read_csv(os.path.join(current_path, 'raw_data', 'train.csv'))\n", 53 | "df_test = pd.read_csv(os.path.join(current_path, 'raw_data', 'test.csv'))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df_train.head()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df_feature = pd.concat([df_train, df_test], sort=False)\n", 72 | "df_feature = df_feature.sort_values(\n", 73 | " ['deviceid', 'ts']).reset_index().drop('index', axis=1)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "df_feature['newsid'] = df_feature['newsid'].map(lambda x: str(x))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# 时间\n", 92 | "df_feature['ts_datetime'] = df_feature['ts'] + 8 * 60 * 60 * 1000\n", 93 | "df_feature['ts_datetime'] = pd.to_datetime(\n", 94 | " df_feature['ts_datetime'], unit='ms')\n", 95 | "df_feature['day'] = df_feature['ts_datetime'].dt.day\n", 96 | "df_feature['hour'] = df_feature['ts_datetime'].dt.hour\n", 97 | "df_feature['minute'] = df_feature['ts_datetime'].dt.minute\n", 98 | "df_feature['minute10'] = (df_feature['minute'] // 10) * 10\n", 99 | "\n", 100 | "df_feature['hourl'] = df_feature['day'] * 24 + df_feature['hour']\n", 101 | "df_feature['hourl'] = df_feature['hourl'] - df_feature['hourl'].min()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# 基本特征" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "group = df_feature.groupby('deviceid')\n", 118 | "df_feature['ts_before'] = 
group['ts'].shift(0) - group['ts'].shift(1)\n", 119 | "df_feature['ts_before'] = df_feature['ts_before'].fillna(3 * 60 * 1000)\n", 120 | "INDEX = df_feature[df_feature['ts_before'] > (3 * 60 * 1000 - 1)].index\n", 121 | "df_feature['ts_before'] = np.log(df_feature['ts_before'] // 1000 + 1)\n", 122 | "LENGTH = len(INDEX)\n", 123 | "ts_len = []\n", 124 | "group = []\n", 125 | "for i in tqdm(range(1, LENGTH)):\n", 126 | " ts_len += [(INDEX[i] - INDEX[i - 1])] * (INDEX[i] - INDEX[i - 1])\n", 127 | " group += [i] * (INDEX[i] - INDEX[i - 1])\n", 128 | "ts_len += [(len(df_feature) - INDEX[LENGTH - 1])] * \\\n", 129 | " (len(df_feature) - INDEX[LENGTH - 1])\n", 130 | "group += [LENGTH] * (len(df_feature) - INDEX[LENGTH - 1])\n", 131 | "df_feature['ts_before_len'] = ts_len\n", 132 | "df_feature['group'] = group\n", 133 | "\n", 134 | "group = df_feature.groupby('deviceid')\n", 135 | "df_feature['ts_after'] = group['ts'].shift(-1) - group['ts'].shift(0)\n", 136 | "df_feature['ts_after'] = df_feature['ts_after'].fillna(3 * 60 * 1000)\n", 137 | "INDEX = df_feature[df_feature['ts_after'] > (3 * 60 * 1000 - 1)].index\n", 138 | "df_feature['ts_after'] = np.log(df_feature['ts_after'] // 1000 + 1)\n", 139 | "LENGTH = len(INDEX)\n", 140 | "ts_len = [INDEX[0]] * (INDEX[0] + 1)\n", 141 | "for i in tqdm(range(1, LENGTH)):\n", 142 | " ts_len += [(INDEX[i] - INDEX[i - 1])] * (INDEX[i] - INDEX[i - 1])\n", 143 | "df_feature['ts_after_len'] = ts_len" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# 类别交叉特征\n", 153 | "df_feature['devicevendor_osv'] = df_feature['device_vendor'].astype(\n", 154 | " 'str') + '_' + df_feature['osversion'].astype('str')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# 下一次 pos\n", 164 | "df_feature['before_pos'] = df_feature.groupby(['deviceid'])['pos'].shift(1)\n", 165 | "df_feature['next_pos'] = df_feature.groupby(['deviceid'])['pos'].shift(-1)\n", 166 | "df_feature['diff_pos'] = df_feature['next_pos'] - df_feature['pos']\n", 167 | "\n", 168 | "# 距离变化\n", 169 | "df_feature['next_lat'] = df_feature.groupby(['deviceid'])['lat'].shift(-1)\n", 170 | "df_feature['next_lng'] = df_feature.groupby(['deviceid'])['lng'].shift(-1)\n", 171 | "df_feature['dist_diff'] = (df_feature['next_lat'] - df_feature['lat']\n", 172 | " ) ** 2 + (df_feature['lng'] - df_feature['next_lng']) ** 2\n", 173 | "\n", 174 | "del df_feature['next_lat']\n", 175 | "del df_feature['next_lng']\n", 176 | "\n", 177 | "# 下一次 网络\n", 178 | "df_feature['next_netmodel'] = df_feature.groupby(['deviceid'])[\n", 179 | " 'netmodel'].shift(-1)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "df_feature.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "# 历史特征" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## day 为单位 " 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# 对前一天的样本的所有反应时间进行统计量提取\n", 212 | "df_temp = df_feature[df_feature['target'] == 1]\n", 213 | "df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']\n", 214 | "\n", 215 | "col = 'deviceid'\n", 216 | "col2 = 'click_minus'\n", 217 | "\n", 218 | 
"df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({\n", 219 | " 'yesterday_{}_{}_max'.format(col, col2): 'max',\n", 220 | " 'yesterday_{}_{}_mean'.format(col, col2): 'mean',\n", 221 | " 'yesterday_{}_{}_min'.format(col, col2): 'min',\n", 222 | " 'yesterday_{}_{}_std'.format(col, col2): 'std',\n", 223 | " 'yesterday_{}_{}_median'.format(col, col2): 'median',\n", 224 | " 'yesterday_{}_{}_kurt'.format(col, col2): kurtosis,\n", 225 | " 'yesterday_{}_{}_q3'.format(col, col2): lambda x: np.quantile(x, q=0.75),\n", 226 | "})\n", 227 | "df_temp['day'] += 1\n", 228 | "\n", 229 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 230 | "\n", 231 | "del df_temp\n", 232 | "gc.collect()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# 昨日 deviceid 点击次数,点击率\n", 242 | "col = 'deviceid'\n", 243 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 244 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 245 | " 'yesterday_{}_count'.format(col): 'count',\n", 246 | "})\n", 247 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 248 | " / df_temp['yesterday_{}_count'.format(col)]\n", 249 | "df_temp['day'] += 1\n", 250 | "del df_temp['yesterday_{}_count'.format(col)]\n", 251 | "\n", 252 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 253 | "\n", 254 | "del df_temp\n", 255 | "gc.collect()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# 昨日小时点击率\n", 265 | "groups = ['deviceid', 'hour']\n", 266 | "df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({\n", 267 | " 'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',\n", 268 | " 'yesterday_{}_count'.format('_'.join(groups)): 'count',\n", 269 | "})\n", 270 | "\n", 271 | "df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \\\n", 272 | " / df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 273 | "df_temp['day'] += 1\n", 274 | "\n", 275 | "del df_temp['yesterday_{}_click_count'.format('_'.join(groups))]\n", 276 | "del df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 277 | "\n", 278 | "df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')\n", 279 | "\n", 280 | "del df_temp\n", 281 | "gc.collect()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# 昨日曝光 pos 平均值\n", 291 | "col = 'deviceid'\n", 292 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['pos'].agg({\n", 293 | " 'yesterday_{}_pos_mean'.format(col): 'mean',\n", 294 | "})\n", 295 | "df_temp['day'] += 1\n", 296 | "\n", 297 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 298 | "\n", 299 | "del df_temp\n", 300 | "gc.collect()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# 昨日 deviceid netmodel 点击率\n", 310 | "groups = ['deviceid', 'netmodel']\n", 311 | "df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({\n", 312 | " 'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',\n", 313 | " 'yesterday_{}_count'.format('_'.join(groups)): 'count',\n", 314 | "})\n", 315 | "\n", 
316 | "df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \\\n", 317 | " / df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 318 | "\n", 319 | "df_temp['day'] += 1\n", 320 | "\n", 321 | "df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')\n", 322 | "df_feature['yesterday_deviceid_netmodel_click_ratio'] = df_feature['yesterday_deviceid_netmodel_click_count'] / \\\n", 323 | " df_feature['yesterday_deviceid_click_count']\n", 324 | "\n", 325 | "del df_feature['yesterday_{}_click_count'.format('_'.join(groups))]\n", 326 | "del df_feature['yesterday_{}_count'.format('_'.join(groups))]\n", 327 | "\n", 328 | "del df_temp\n", 329 | "gc.collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# 对前一天的 newsid 所有反应时间进行统计量提取\n", 339 | "df_temp = df_feature[df_feature['target'] == 1]\n", 340 | "df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']\n", 341 | "\n", 342 | "col = 'newsid'\n", 343 | "col2 = 'click_minus'\n", 344 | "\n", 345 | "df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({\n", 346 | " 'yesterday_{}_{}_std'.format(col, col2): 'std',\n", 347 | "})\n", 348 | "df_temp['day'] += 1\n", 349 | "\n", 350 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 351 | "\n", 352 | "del df_temp\n", 353 | "gc.collect()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "# 昨日 newsid 点击次数,点击率\n", 363 | "col = 'newsid'\n", 364 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 365 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 366 | " 'yesterday_{}_count'.format(col): 'count',\n", 367 | "})\n", 368 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 369 | " / df_temp['yesterday_{}_count'.format(col)]\n", 370 | "\n", 371 | "df_temp['day'] += 1\n", 372 | "del df_temp['yesterday_{}_count'.format(col)]\n", 373 | "\n", 374 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 375 | "\n", 376 | "del df_temp\n", 377 | "gc.collect()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# 昨日 next_pos 点击率\n", 387 | "col = 'next_pos'\n", 388 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 389 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 390 | " 'yesterday_{}_count'.format(col): 'count',\n", 391 | "})\n", 392 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 393 | " / df_temp['yesterday_{}_count'.format(col)]\n", 394 | "\n", 395 | "df_temp['day'] += 1\n", 396 | "\n", 397 | "del df_temp['yesterday_{}_count'.format(col)]\n", 398 | "del df_temp['yesterday_{}_click_count'.format(col)]\n", 399 | "\n", 400 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 401 | "\n", 402 | "del df_temp\n", 403 | "gc.collect()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "cat_list = tqdm([['deviceid', 'netmodel']])\n", 413 | "for f1, f2 in cat_list:\n", 414 | " df_feature['t_{}_count'.format(f1)] = df_feature.groupby([f1, 'day'])[\n", 415 | " 'id'].transform('count')\n", 416 | " 
df_feature['t_{}_count'.format(f2)] = df_feature.groupby([f2, 'day'])[\n", 417 | " 'id'].transform('count')\n", 418 | " df_feature['t_{}_count'.format('_'.join([f1, f2]))] = df_feature.groupby([\n", 419 | " f1, f2, 'day'])['id'].transform('count')\n", 420 | "\n", 421 | " df_feature['{}_coratio'.format('_'.join([f1, f2]))] = (df_feature['t_{}_count'.format(\n", 422 | " f1)] * df_feature['t_{}_count'.format(f2)]) / df_feature['t_{}_count'.format('_'.join([f1, f2]))]\n", 423 | " df_feature['yesterday_{}_coratio'.format('_'.join([f1, f2]))] = df_feature.groupby(\n", 424 | " [f1, f2, 'day'])['{}_coratio'.format('_'.join([f1, f2]))].shift()\n", 425 | "\n", 426 | " del df_feature['t_{}_count'.format(f1)]\n", 427 | " del df_feature['t_{}_count'.format(f2)]\n", 428 | " del df_feature['t_{}_count'.format('_'.join([f1, f2]))]\n", 429 | " del df_feature['{}_coratio'.format('_'.join([f1, f2]))]\n", 430 | "\n", 431 | " gc.collect()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "df_feature.head()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## 以 hour 为单位" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# 一小时之前 deviceid 点击次数,点击率\n", 457 | "col = 'deviceid'\n", 458 | "df_temp = df_feature.groupby([col, 'hourl'], as_index=False)['id'].agg({\n", 459 | " 'pre_hour_{}_count'.format(col): 'count',\n", 460 | "})\n", 461 | "df_temp['hourl'] += 1\n", 462 | "\n", 463 | "df_feature = df_feature.merge(df_temp, on=[col, 'hourl'], how='left')\n", 464 | "\n", 465 | "del df_temp\n", 466 | "gc.collect()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "df_feature.head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "# 统计特征" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "cat_list = [['deviceid'], ['guid'], ['newsid'], ['deviceid', 'pos'], ['newsid', 'pos'],\n", 492 | " ['deviceid', 'guid', 'newsid'], ['deviceid', 'next_pos']]\n", 493 | "for f in tqdm(cat_list):\n", 494 | " df_feature['{}_day_count'.format('_'.join(f))] = df_feature.groupby([\n", 495 | " 'day'] + f)['id'].transform('count')\n", 496 | "\n", 497 | "cat_list = [['deviceid'], ['guid'], [\n", 498 | " 'deviceid', 'pos'], ['deviceid', 'netmodel']]\n", 499 | "for f in tqdm(cat_list):\n", 500 | " df_feature['{}_minute10_count'.format('_'.join(f))] = df_feature.groupby(\n", 501 | " ['day', 'hour', 'minute10'] + f)['id'].transform('count')\n", 502 | "\n", 503 | "cat_list = [['deviceid', 'netmodel']]\n", 504 | "for f in tqdm(cat_list):\n", 505 | " df_feature['{}_hour_count'.format('_'.join(f))] = df_feature.groupby([\n", 506 | " 'hourl'] + f)['id'].transform('count')\n", 507 | "\n", 508 | "cat_list = [['deviceid', 'group', 'pos']]\n", 509 | "for f in tqdm(cat_list):\n", 510 | " df_feature['{}_count'.format('_'.join(f))] = df_feature.groupby(f)[\n", 511 | " 'id'].transform('count')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "col = 'group'\n", 521 | "df_temp = df_feature.groupby([col], as_index=False)['ts_before'].agg({\n", 522 | " 
'{}_ts_before_mean'.format(col): 'mean',\n", 523 | " '{}_ts_before_std'.format(col): 'std'\n", 524 | "})\n", 525 | "df_feature = df_feature.merge(df_temp, on=col, how='left')\n", 526 | "\n", 527 | "del df_temp\n", 528 | "gc.collect()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "col = 'deviceid'\n", 538 | "df_temp = df_feature.groupby([col], as_index=False)['ts_after'].agg({\n", 539 | " '{}_ts_after_mean'.format('deviceid'): 'mean',\n", 540 | " '{}_ts_after_std'.format('deviceid'): 'std',\n", 541 | " '{}_ts_after_median'.format('deviceid'): 'median',\n", 542 | " '{}_ts_after_skew'.format('deviceid'): 'skew',\n", 543 | "})\n", 544 | "df_feature = df_feature.merge(df_temp, on=col, how='left')\n", 545 | "\n", 546 | "del df_temp\n", 547 | "gc.collect()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df_temp = df_feature.groupby(['deviceid', 'hourl'], as_index=False)[\n", 557 | " 'target'].agg({'hour_count': 'size'})\n", 558 | "df_temp = df_temp.groupby(['deviceid'], as_index=False)['hour_count'].agg({\n", 559 | " '{}_hour_count_mean'.format('deviceid'): 'mean'\n", 560 | "})\n", 561 | "\n", 562 | "df_feature = df_feature.merge(df_temp, how='left')\n", 563 | "\n", 564 | "del df_temp\n", 565 | "gc.collect()" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "df_feature['deviceid_hour_cumsum'] = df_feature.groupby(['deviceid', 'hourl'])[\n", 575 | " 'ts'].cumcount()" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "scrolled": true 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "df_temp = df_feature[['deviceid', 'day', 'deviceid_day_count']].copy(deep=True)\n", 587 | "df_temp.drop_duplicates(inplace=True)\n", 588 | "df_temp['deviceid_day_count_diff_1'] = df_temp.groupby(\n", 589 | " ['deviceid'])['deviceid_day_count'].diff()\n", 590 | "\n", 591 | "del df_temp['deviceid_day_count']\n", 592 | "df_feature = df_feature.merge(df_temp, how='left')\n", 593 | "\n", 594 | "del df_temp\n", 595 | "gc.collect()" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "# 未来一小时 deviceid, netmodel 曝光数量\n", 605 | "cat_list = [['deviceid', 'netmodel']]\n", 606 | "for f in tqdm(cat_list):\n", 607 | " df_feature['temp'] = df_feature.groupby(\n", 608 | " ['hourl'] + f)['id'].transform('count')\n", 609 | " df_feature['next_{}_hour_count'.format('_'.join(f))] = df_feature.groupby(f)[\n", 610 | " 'temp'].shift(-1)\n", 611 | "\n", 612 | " del df_feature['temp']" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "df_feature.head()" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "# ts 相关特征" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 638 | "for f in [['deviceid']]:\n", 639 | " tmp = sort_df.groupby(f)\n", 640 | " # 前x次曝光到当前的时间差\n", 641 | " for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30]):\n", 642 | " 
sort_df['{}_prev{}_exposure_ts_gap'.format(\n", 643 | " '_'.join(f), gap)] = tmp['ts'].shift(0) - tmp['ts'].shift(gap)\n", 644 | " tmp2 = sort_df[\n", 645 | " f + ['ts', '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 646 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 647 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 648 | "\n", 649 | "del tmp2, sort_df, tmp\n", 650 | "gc.collect()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 660 | "for f in [['netmodel', 'deviceid']]:\n", 661 | " tmp = sort_df.groupby(f)\n", 662 | " # 前x次曝光到当前的时间差\n", 663 | " for gap in tqdm([2, 3]):\n", 664 | " sort_df['{}_prev{}_exposure_ts_gap'.format(\n", 665 | " '_'.join(f), gap)] = tmp['ts'].shift(0) - tmp['ts'].shift(gap)\n", 666 | " tmp2 = sort_df[\n", 667 | " f + ['ts', '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 668 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 669 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 670 | "\n", 671 | "del tmp2, sort_df, tmp\n", 672 | "gc.collect()" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 682 | "for f in [['deviceid']]:\n", 683 | " tmp = sort_df.groupby(f)\n", 684 | " # 后x次曝光到当前的时间差\n", 685 | " for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30, 50]):\n", 686 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 687 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 688 | " tmp2 = sort_df[\n", 689 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 690 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 691 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 692 | "\n", 693 | "del tmp2, sort_df, tmp\n", 694 | "gc.collect()" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 704 | "for f in [['pos', 'deviceid']]:\n", 705 | " tmp = sort_df.groupby(f)\n", 706 | " # 后x次曝光到当前的时间差\n", 707 | " for gap in tqdm([1, 2]):\n", 708 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 709 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 710 | " tmp2 = sort_df[\n", 711 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 712 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 713 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 714 | "\n", 715 | "del tmp2, sort_df, tmp\n", 716 | "gc.collect()" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 726 | "for f in [['netmodel', 'deviceid']]:\n", 727 | " tmp = sort_df.groupby(f)\n", 728 | " # 后x次曝光到当前的时间差\n", 729 | " for gap in tqdm([1, 2]):\n", 730 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 731 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 732 | " tmp2 = sort_df[\n", 733 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 734 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 735 | " 
df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 736 | "\n", 737 | "del tmp2, sort_df, tmp\n", 738 | "gc.collect()" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 748 | "for f in [['pos', 'netmodel', 'deviceid']]:\n", 749 | " tmp = sort_df.groupby(f)\n", 750 | " # 后x次曝光到当前的时间差\n", 751 | " for gap in tqdm([1]):\n", 752 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 753 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 754 | " tmp2 = sort_df[\n", 755 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 756 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 757 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 758 | "\n", 759 | "del tmp2, sort_df, tmp\n", 760 | "gc.collect()" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": {}, 767 | "outputs": [], 768 | "source": [ 769 | "df_feature['lng_lat'] = df_feature['lng'].astype(\n", 770 | " 'str') + '_' + df_feature['lat'].astype('str')\n", 771 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 772 | "for f in [['deviceid', 'lng_lat']]:\n", 773 | " tmp = sort_df.groupby(f)\n", 774 | " # 后x次曝光到当前的时间差\n", 775 | " for gap in tqdm([1]):\n", 776 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 777 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 778 | " tmp2 = sort_df[\n", 779 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 780 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 781 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 782 | "\n", 783 | "del tmp2, sort_df, tmp\n", 784 | "gc.collect()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 794 | "for f in [['pos', 'deviceid', 'lng_lat']]:\n", 795 | " tmp = sort_df.groupby(f)\n", 796 | " # 后x次曝光到当前的时间差\n", 797 | " for gap in tqdm([1]):\n", 798 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 799 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 800 | " tmp2 = sort_df[\n", 801 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 802 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 803 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 804 | "\n", 805 | "del tmp2, sort_df, tmp\n", 806 | "gc.collect()" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [ 815 | "for gap in tqdm([2, 3, 4, 5, 6, 7]):\n", 816 | " df_feature['next_pos{}'.format(gap)] = df_feature.groupby(\n", 817 | " ['deviceid'])['pos'].shift(-gap)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "df_feature['next_pos_ts'] = df_feature['next_pos'] * \\\n", 827 | " 100 + df_feature['ts_after']" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "df_feature.head()" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": {}, 842 | "source": [ 843 | "# user 表" 844 | ] 845 | }, 846 | { 847 | 
"cell_type": "code", 848 | "execution_count": null, 849 | "metadata": {}, 850 | "outputs": [], 851 | "source": [ 852 | "df_user = pd.read_csv(os.path.join(current_path, 'raw_data', 'user.csv'))\n", 853 | "df_feature = df_feature.merge(\n", 854 | " df_user[['deviceid', 'guid', 'level']], how='left', on=['deviceid', 'guid'])" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [ 863 | "df_tag = df_user[['deviceid', 'tag']].copy()\n", 864 | "\n", 865 | "node_pairs = []\n", 866 | "for item in tqdm(df_user[['deviceid', 'tag']].values):\n", 867 | " deviceid = str(item[0])\n", 868 | " tags = item[1]\n", 869 | "\n", 870 | " if type(tags) != float:\n", 871 | " tags = tags.split('|')\n", 872 | " for tag in tags:\n", 873 | " try:\n", 874 | " key, value = tag.split(':')\n", 875 | " except Exception:\n", 876 | " pass\n", 877 | " node_pairs.append([deviceid, key, value])\n", 878 | "\n", 879 | "df_tag = pd.DataFrame(node_pairs)\n", 880 | "df_tag.columns = ['deviceid', 'tag', 'score']\n", 881 | "df_tag['score'] = df_tag['score'].astype('float')\n", 882 | "\n", 883 | "df_temp = df_tag.groupby(['deviceid'])['score'].agg({'tag_score_mean': 'mean',\n", 884 | " 'tag_score_std': 'std',\n", 885 | " 'tag_score_count': 'count',\n", 886 | " 'tag_score_q2': lambda x: np.quantile(x, q=0.5),\n", 887 | " 'tag_score_q3': lambda x: np.quantile(x, q=0.75),\n", 888 | " }).reset_index()\n", 889 | "\n", 890 | "df_feature = df_feature.merge(df_temp, how='left')\n", 891 | "\n", 892 | "del df_temp\n", 893 | "del df_tag\n", 894 | "\n", 895 | "gc.collect()" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "# embedding" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [ 911 | "from gensim.models import Word2Vec\n", 912 | "\n", 913 | "\n", 914 | "def emb(df, f1, f2):\n", 915 | " emb_size = 16\n", 916 | " print('====================================== {} {} ======================================'.format(f1, f2))\n", 917 | " tmp = df.groupby(f1, as_index=False)[f2].agg(\n", 918 | " {'{}_{}_list'.format(f1, f2): list})\n", 919 | " sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()\n", 920 | " del tmp['{}_{}_list'.format(f1, f2)]\n", 921 | " for i in range(len(sentences)):\n", 922 | " sentences[i] = [str(x) for x in sentences[i]]\n", 923 | " model = Word2Vec(sentences, size=emb_size, window=5,\n", 924 | " min_count=5, sg=0, hs=1, seed=2019)\n", 925 | " emb_matrix = []\n", 926 | " for seq in sentences:\n", 927 | " vec = []\n", 928 | " for w in seq:\n", 929 | " if w in model:\n", 930 | " vec.append(model[w])\n", 931 | " if len(vec) > 0:\n", 932 | " emb_matrix.append(np.mean(vec, axis=0))\n", 933 | " else:\n", 934 | " emb_matrix.append([0] * emb_size)\n", 935 | "\n", 936 | " df_emb = pd.DataFrame(emb_matrix)\n", 937 | " df_emb.columns = ['{}_{}_emb_{}'.format(\n", 938 | " f1, f2, i) for i in range(emb_size)]\n", 939 | "\n", 940 | " tmp = pd.concat([tmp, df_emb], axis=1)\n", 941 | "\n", 942 | " del model, emb_matrix, sentences\n", 943 | " return tmp" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": null, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "for f1, f2 in [['newsid', 'deviceid'], ['lng_lat', 'deviceid']]:\n", 953 | " df_feature = df_feature.merge(emb(df_feature, f1, f2), on=f1, how='left')" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 
958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "df_feature['o_d'] = df_feature['deviceid'].astype(\n", 963 | " str)+'_'+df_feature['newsid'].astype(str)\n", 964 | "\n", 965 | "sentence = df_feature[['deviceid', 'newsid', 'o_d']].astype(\n", 966 | " str).fillna('-1').astype(str).values\n", 967 | "sentence = sentence.tolist()\n", 968 | "print('training...')\n", 969 | "np.random.seed(2019)\n", 970 | "\n", 971 | "L = 5\n", 972 | "model = Word2Vec(sentence, size=L, window=20, min_count=3,\n", 973 | " workers=multiprocessing.cpu_count(), iter=10)\n", 974 | "print('outputing...')\n", 975 | "\n", 976 | "\n", 977 | "for fea in tqdm(['deviceid', 'newsid', 'o_d']):\n", 978 | " values = df_feature[fea].unique()\n", 979 | " print(len(values))\n", 980 | " w2v = []\n", 981 | " for i in values:\n", 982 | " a = [i]\n", 983 | " if str(i) in model:\n", 984 | " a.extend(model[str(i)])\n", 985 | " else:\n", 986 | " a.extend(np.ones(L) * -10)\n", 987 | " w2v.append(a)\n", 988 | " w2v = pd.DataFrame(w2v)\n", 989 | " w2v.columns = [fea, fea+'_w2v_1', fea+'_w2v_2', fea+'_w2v_3',\n", 990 | " fea+'_w2v_4', fea+'_w2v_5']\n", 991 | " df_feature = df_feature.merge(w2v, on=fea, how='left')" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": null, 997 | "metadata": {}, 998 | "outputs": [], 999 | "source": [ 1000 | "df_feature['o_d1'] = df_feature['lng'].astype(\n", 1001 | " str)+'_'+df_feature['lat'].astype(str)\n", 1002 | "\n", 1003 | "sentence = df_feature[['lng', 'lat', 'o_d1']].astype(\n", 1004 | " str).fillna('-1').astype(str).values\n", 1005 | "sentence = sentence.tolist()\n", 1006 | "print('training...')\n", 1007 | "np.random.seed(2019)\n", 1008 | "\n", 1009 | "L = 5\n", 1010 | "model = Word2Vec(sentence, size=L, window=20, min_count=3,\n", 1011 | " workers=multiprocessing.cpu_count(), iter=10)\n", 1012 | "print('outputing...')\n", 1013 | "\n", 1014 | "for fea in tqdm(['lng', 'lat', 'o_d1']):\n", 1015 | " values = df_feature[fea].unique()\n", 1016 | " print(len(values))\n", 1017 | " w2v = []\n", 1018 | " for i in values:\n", 1019 | " a = [i]\n", 1020 | " if str(i) in model:\n", 1021 | " a.extend(model[str(i)])\n", 1022 | " else:\n", 1023 | " a.extend(np.ones(L) * -10)\n", 1024 | " w2v.append(a)\n", 1025 | " w2v = pd.DataFrame(w2v)\n", 1026 | " w2v.columns = [fea, fea+'_w2v_1', fea+'_w2v_2', fea+'_w2v_3',\n", 1027 | " fea+'_w2v_4', fea+'_w2v_5']\n", 1028 | " df_feature = df_feature.merge(w2v, on=fea, how='left')" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "# 减少内存" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": null, 1041 | "metadata": {}, 1042 | "outputs": [], 1043 | "source": [ 1044 | "# Function to reduce the memory usage\n", 1045 | "def reduce_mem_usage(df, verbose=True):\n", 1046 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", 1047 | " start_mem = df.memory_usage().sum() / 1024**2\n", 1048 | " for col in tqdm(df.columns):\n", 1049 | " col_type = df[col].dtypes\n", 1050 | " if col_type in numerics:\n", 1051 | " c_min = df[col].min()\n", 1052 | " c_max = df[col].max()\n", 1053 | " if str(col_type)[:3] == 'int':\n", 1054 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(\n", 1055 | " np.int8).max:\n", 1056 | " df[col] = df[col].astype(np.int8)\n", 1057 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(\n", 1058 | " np.int16).max:\n", 1059 | " df[col] = df[col].astype(np.int16)\n", 1060 | " 
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(\n", 1061 | " np.int32).max:\n", 1062 | " df[col] = df[col].astype(np.int32)\n", 1063 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(\n", 1064 | " np.int64).max:\n", 1065 | " df[col] = df[col].astype(np.int64)\n", 1066 | " else:\n", 1067 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(\n", 1068 | " np.float16).max:\n", 1069 | " df[col] = df[col].astype(np.float16)\n", 1070 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(\n", 1071 | " np.float32).max:\n", 1072 | " df[col] = df[col].astype(np.float32)\n", 1073 | " else:\n", 1074 | " df[col] = df[col].astype(np.float64)\n", 1075 | " end_mem = df.memory_usage().sum() / 1024**2\n", 1076 | " if verbose:\n", 1077 | " print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(\n", 1078 | " end_mem, 100 * (start_mem - end_mem) / start_mem))\n", 1079 | " return df" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": null, 1085 | "metadata": {}, 1086 | "outputs": [], 1087 | "source": [ 1088 | "df_feature = reduce_mem_usage(df_feature)" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": null, 1094 | "metadata": {}, 1095 | "outputs": [], 1096 | "source": [ 1097 | "df_feature.to_pickle(os.path.join(current_path, 'feature', 'feature.pickle'))" 1098 | ] 1099 | } 1100 | ], 1101 | "metadata": { 1102 | "kernelspec": { 1103 | "display_name": "Python [conda env:dm] *", 1104 | "language": "python", 1105 | "name": "conda-env-dm-py" 1106 | }, 1107 | "language_info": { 1108 | "codemirror_mode": { 1109 | "name": "ipython", 1110 | "version": 3 1111 | }, 1112 | "file_extension": ".py", 1113 | "mimetype": "text/x-python", 1114 | "name": "python", 1115 | "nbconvert_exporter": "python", 1116 | "pygments_lexer": "ipython3", 1117 | "version": "3.6.9" 1118 | } 1119 | }, 1120 | "nbformat": 4, 1121 | "nbformat_minor": 4 1122 | } 1123 | -------------------------------------------------------------------------------- /2.fold_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-01T15:00:19.964638Z", 9 | "start_time": "2020-02-01T15:00:15.809763Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score, roc_auc_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-01T15:00:19.968893Z", 37 | "start_time": "2020-02-01T15:00:19.966513Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019\n", 44 | "n_fold = 5" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-02-01T12:40:36.400426Z", 53 | "start_time": "2020-02-01T12:40:08.871667Z" 54 | }, 55 | "scrolled": true 56 
| }, 57 | "outputs": [], 58 | "source": [ 59 | "df_feature = pd.read_pickle(os.path.join(\n", 60 | " current_path, 'feature', 'feature.pickle'))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-02-01T12:40:36.512167Z", 69 | "start_time": "2020-02-01T12:40:36.401765Z" 70 | }, 71 | "scrolled": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "df_feature.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "ExecuteTime": { 83 | "end_time": "2020-02-01T12:42:29.640354Z", 84 | "start_time": "2020-02-01T12:40:36.513434Z" 85 | } 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 90 | " if f not in ['id']:\n", 91 | " le = LabelEncoder()\n", 92 | " df_feature[f] = le.fit_transform(\n", 93 | " df_feature[f].astype('str')).astype('int')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "ExecuteTime": { 101 | "end_time": "2020-02-01T12:42:43.750204Z", 102 | "start_time": "2020-02-01T12:42:29.642972Z" 103 | } 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df_test = df_feature[df_feature['target'].isnull()].copy()\n", 108 | "df_train = df_feature[df_feature['target'].notnull()].copy()\n", 109 | "\n", 110 | "del df_feature\n", 111 | "gc.collect()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2020-02-01T13:43:58.090439Z", 120 | "start_time": "2020-02-01T12:42:43.751557Z" 121 | }, 122 | "scrolled": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "ycol = 'target'\n", 127 | "feature_names = list(\n", 128 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 129 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 130 | " 'hourl', 'group'],\n", 131 | " df_train.columns))\n", 132 | "\n", 133 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 134 | " max_depth=10,\n", 135 | " learning_rate=0.4,\n", 136 | " n_estimators=10000000,\n", 137 | " subsample=0.8,\n", 138 | " feature_fraction=0.8,\n", 139 | " reg_alpha=0.5,\n", 140 | " reg_lambda=0.5,\n", 141 | " random_state=seed,\n", 142 | " metric='auc'\n", 143 | " )\n", 144 | "\n", 145 | "# model = lgb.LGBMClassifier(\n", 146 | "# learning_rate=0.01,\n", 147 | "# n_estimators=10000000,\n", 148 | "# num_leaves=255,\n", 149 | "# subsample=0.9,\n", 150 | "# colsample_bytree=0.8,\n", 151 | "# random_state=seed,\n", 152 | "# metric='auc'\n", 153 | "# )\n", 154 | "\n", 155 | "oof = []\n", 156 | "prediction = df_test[['id']]\n", 157 | "prediction['target'] = 0\n", 158 | "df_importance_list = []\n", 159 | "\n", 160 | "kfold = StratifiedKFold(n_splits=n_fold, shuffle=False, random_state=seed)\n", 161 | "for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train['day'])):\n", 162 | " X_train = df_train.iloc[trn_idx][feature_names]\n", 163 | " Y_train = df_train.iloc[trn_idx][ycol]\n", 164 | "\n", 165 | " X_val = df_train.iloc[val_idx][feature_names]\n", 166 | " Y_val = df_train.iloc[val_idx][ycol]\n", 167 | "\n", 168 | " print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n", 169 | "\n", 170 | " lgb_model = model.fit(X_train,\n", 171 | " Y_train,\n", 172 | " eval_names=['train', 'valid'],\n", 173 | " eval_set=[(X_train, Y_train), (X_val, 
Y_val)],\n", 174 | " verbose=100,\n", 175 | " eval_metric='auc',\n", 176 | " early_stopping_rounds=50)\n", 177 | "\n", 178 | " pred_val = lgb_model.predict_proba(\n", 179 | " X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n", 180 | " df_oof = df_train.iloc[val_idx][['id', ycol]].copy()\n", 181 | " df_oof['pred'] = pred_val\n", 182 | " oof.append(df_oof)\n", 183 | "\n", 184 | " pred_test = lgb_model.predict_proba(\n", 185 | " df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]\n", 186 | " prediction['target'] += pred_test / n_fold\n", 187 | "\n", 188 | " df_importance = pd.DataFrame({\n", 189 | " 'column': feature_names,\n", 190 | " 'importance': lgb_model.feature_importances_,\n", 191 | " })\n", 192 | " df_importance_list.append(df_importance)\n", 193 | "\n", 194 | " del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n", 195 | " gc.collect()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2020-02-01T13:43:58.129389Z", 204 | "start_time": "2020-02-01T13:43:58.097914Z" 205 | } 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "df_importance = pd.concat(df_importance_list)\n", 210 | "df_importance = df_importance.groupby(['column'])['importance'].agg(\n", 211 | " 'mean').sort_values(ascending=False).reset_index()\n", 212 | "df_importance" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "ExecuteTime": { 220 | "end_time": "2020-02-01T13:44:17.576713Z", 221 | "start_time": "2020-02-01T13:43:58.132193Z" 222 | } 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "df_oof = pd.concat(oof)\n", 227 | "df_oof['pred_bin'] = df_oof['pred'].rank()\n", 228 | "df_oof['pred_bin'] = (df_oof['pred_bin'] >= df_oof.shape[0]\n", 229 | " * 0.8934642948637943).astype(int)\n", 230 | "\n", 231 | "auc = roc_auc_score(df_oof['target'], df_oof['pred_bin'])\n", 232 | "f1 = f1_score(df_oof['target'], df_oof['pred_bin'])\n", 233 | "\n", 234 | "print('f1:', f1)\n", 235 | "print('auc:', auc)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "ExecuteTime": { 243 | "end_time": "2020-02-01T13:44:26.479783Z", 244 | "start_time": "2020-02-01T13:44:17.578394Z" 245 | } 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "sub = prediction.copy(deep=True)\n", 250 | "sub['target'] = sub['target'].rank()\n", 251 | "sub['target'] = (sub['target'] >= sub.shape[0] *\n", 252 | " 0.8934642948637943).astype(int)\n", 253 | "sub.to_csv(os.path.join(current_path, 'sub', '{}.csv'.format(f1)),\n", 254 | " index=False, encoding='utf-8')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2020-02-01T13:44:33.645096Z", 263 | "start_time": "2020-02-01T13:44:32.889239Z" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "oof_train = df_oof[['id', 'pred']]\n", 269 | "oof_test = prediction[['id', 'target']]\n", 270 | "\n", 271 | "oof_train.columns = ['id', 'oof_prob']\n", 272 | "oof_test.columns = ['id', 'oof_prob']\n", 273 | "\n", 274 | "oof = pd.concat([oof_train, oof_test], sort=False)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "ExecuteTime": { 282 | "end_time": "2020-02-01T15:03:28.700849Z", 283 | "start_time": "2020-02-01T15:03:24.089293Z" 284 | } 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | 
"oof.to_pickle(os.path.join(current_path, 'prob', 'oof_lgb_qian.pickle'))" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "Python [conda env:dm] *", 295 | "language": "python", 296 | "name": "conda-env-dm-py" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.6.9" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 2 313 | } 314 | -------------------------------------------------------------------------------- /3.offline_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-04T09:26:16.527236Z", 9 | "start_time": "2020-02-04T09:26:15.261543Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score, roc_auc_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-04T09:26:16.531438Z", 37 | "start_time": "2020-02-04T09:26:16.528868Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "ExecuteTime": { 51 | "end_time": "2020-02-04T09:27:27.235669Z", 52 | "start_time": "2020-02-04T09:26:16.532526Z" 53 | }, 54 | "scrolled": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "df_feature = pd.read_pickle(os.path.join(\n", 59 | " current_path, 'feature', 'feature.pickle'))\n", 60 | "df_feature['id'] = df_feature['id'].astype('str')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-02-04T09:28:00.169320Z", 69 | "start_time": "2020-02-04T09:27:27.236924Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "df_oof_lgb = pd.read_pickle(os.path.join(\n", 75 | " current_path, 'prob', 'oof_lgb_qian.pickle'))\n", 76 | "df_oof_lgb.columns = ['id', 'lgb_oof_prob']\n", 77 | "df_oof_lgb['id'] = df_oof_lgb['id'].astype('str')\n", 78 | "df_feature = df_feature.merge(df_oof_lgb, how='left', on='id')\n", 79 | "print(df_feature['lgb_oof_prob'].isnull().sum())" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2020-02-04T09:28:33.602746Z", 88 | "start_time": "2020-02-04T09:28:33.481456Z" 89 | }, 90 | "scrolled": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "df_feature.head()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "ExecuteTime": { 102 | "end_time": "2020-02-04T09:30:25.513293Z", 103 | "start_time": 
"2020-02-04T09:28:33.604825Z" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 109 | " if f not in ['id']:\n", 110 | " le = LabelEncoder()\n", 111 | " df_feature[f] = le.fit_transform(\n", 112 | " df_feature[f].astype('str')).astype('int')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "ExecuteTime": { 120 | "end_time": "2020-02-04T09:30:33.720366Z", 121 | "start_time": "2020-02-04T09:30:25.514531Z" 122 | } 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "df_val = df_feature[(df_feature['target'].notnull())\n", 127 | " & (df_feature['day'] == 10)]\n", 128 | "df_train = df_feature[df_feature['day'] < 10]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "ExecuteTime": { 136 | "end_time": "2020-02-04T09:33:40.807346Z", 137 | "start_time": "2020-02-04T09:30:33.721845Z" 138 | }, 139 | "scrolled": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "gc.collect()\n", 144 | "\n", 145 | "ycol = 'target'\n", 146 | "feature_names = list(\n", 147 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 148 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 149 | " 'hourl', 'group'],\n", 150 | " df_train.columns))\n", 151 | "\n", 152 | "\n", 153 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 154 | " max_depth=10,\n", 155 | " learning_rate=0.4,\n", 156 | " n_estimators=100000,\n", 157 | " subsample=0.8,\n", 158 | " feature_fraction=0.8,\n", 159 | " reg_alpha=0.5,\n", 160 | " reg_lambda=0.5,\n", 161 | " random_state=seed,\n", 162 | " metric='auc'\n", 163 | " )\n", 164 | "\n", 165 | "loss = 0\n", 166 | "df_importance_list = []\n", 167 | "oof_list = []\n", 168 | "\n", 169 | "X_train = df_train[feature_names]\n", 170 | "Y_train = df_train[ycol]\n", 171 | "\n", 172 | "X_val = df_val[feature_names]\n", 173 | "Y_val = df_val[ycol]\n", 174 | "\n", 175 | "lgb_model = model.fit(X_train,\n", 176 | " Y_train,\n", 177 | " eval_names=['train', 'valid'],\n", 178 | " eval_set=[(X_train, Y_train), (X_val, Y_val)],\n", 179 | " verbose=50,\n", 180 | " eval_metric='auc',\n", 181 | " early_stopping_rounds=50)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "ExecuteTime": { 189 | "end_time": "2020-02-04T09:33:40.865336Z", 190 | "start_time": "2020-02-04T09:33:40.830776Z" 191 | } 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "auc = lgb_model.best_score_['valid']['auc']\n", 196 | "print(auc)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "ExecuteTime": { 204 | "end_time": "2020-02-04T09:33:41.034844Z", 205 | "start_time": "2020-02-04T09:33:40.868212Z" 206 | } 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "df_importance = pd.DataFrame({\n", 211 | " 'feature': feature_names,\n", 212 | " 'importance': lgb_model.feature_importances_,\n", 213 | "})\n", 214 | "df_importance = df_importance.sort_values(by='importance', ascending=False)\n", 215 | "df_importance" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "ExecuteTime": { 223 | "end_time": "2020-02-04T09:33:53.213172Z", 224 | "start_time": "2020-02-04T09:33:41.036538Z" 225 | } 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "val_pred = lgb_model.predict_proba(\n", 230 | " 
X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n", 231 | "df_oof = pd.DataFrame()\n", 232 | "df_oof['lgb_pred'] = val_pred\n", 233 | "df_oof['target'] = Y_val.values\n", 234 | "df_oof['pred_label'] = df_oof['lgb_pred'].rank()\n", 235 | "df_oof['pred_label'] = (df_oof['pred_label'] >=\n", 236 | " df_oof.shape[0] * 0.8934642948637943).astype(int)\n", 237 | "f1 = f1_score(df_oof['target'], df_oof['pred_label'])\n", 238 | "print('f1:', f1)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "ExecuteTime": { 246 | "end_time": "2020-02-04T09:33:53.239449Z", 247 | "start_time": "2020-02-04T09:33:53.214474Z" 248 | } 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "best_iteration = lgb_model.best_iteration_\n", 253 | "\n", 254 | "with open(os.path.join(current_path, 'best_it.txt'), 'w') as f:\n", 255 | " f.write(str(best_iteration)+'\\n')\n", 256 | " f.write(str(f1))" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python [conda env:dm] *", 263 | "language": "python", 264 | "name": "conda-env-dm-py" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.6.9" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 2 281 | } 282 | -------------------------------------------------------------------------------- /4.online_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-11T05:39:50.155898Z", 9 | "start_time": "2020-02-11T05:39:49.510099Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-11T05:39:50.159779Z", 37 | "start_time": "2020-02-11T05:39:50.157281Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "ExecuteTime": { 51 | "end_time": "2020-02-11T05:41:08.245550Z", 52 | "start_time": "2020-02-11T05:39:50.225930Z" 53 | } 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "df_feature = pd.read_pickle(os.path.join(\n", 58 | " current_path, 'feature', 'feature_1.pickle'))\n", 59 | "df_feature['id'] = df_feature['id'].astype('str')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "ExecuteTime": { 67 | "end_time": "2020-02-11T05:41:08.284415Z", 68 | "start_time": "2020-02-11T05:41:08.268086Z" 69 | } 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "df_feature.shape" 74 | ] 
75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2020-02-04T01:26:19.495464Z", 82 | "start_time": "2020-02-04T01:25:47.948238Z" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df_oof_lgb = pd.read_pickle(os.path.join(\n", 88 | " current_path, 'prob', 'oof_lgb_qian.pickle'))\n", 89 | "df_oof_lgb.columns = ['id', 'lgb_oof_prob']\n", 90 | "df_feature = df_feature.merge(df_oof_lgb, how='left', on='id')\n", 91 | "print(df_feature['lgb_oof_prob'].isnull().sum())" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "ExecuteTime": { 99 | "end_time": "2020-02-04T01:26:51.489261Z", 100 | "start_time": "2020-02-04T01:26:19.511612Z" 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "df_oof_cat = pd.read_pickle(os.path.join(\n", 106 | " current_path, 'prob', 'oof_cat.pickle'))\n", 107 | "df_oof_cat.columns = ['id', 'cat_oof_prob']\n", 108 | "df_feature = df_feature.merge(df_oof_cat, how='left', on='id')\n", 109 | "print(df_feature['cat_oof_prob'].isnull().sum())" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "ExecuteTime": { 117 | "end_time": "2020-02-04T01:26:51.662981Z", 118 | "start_time": "2020-02-04T01:26:51.490569Z" 119 | } 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "df_feature.head()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2020-02-04T01:26:51.692673Z", 132 | "start_time": "2020-02-04T01:26:51.664289Z" 133 | } 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "with open(os.path.join(current_path, 'best_it.txt'), 'r') as f:\n", 138 | " lines = f.readlines()\n", 139 | " best_iteration = lines[0]\n", 140 | " f1 = lines[1]\n", 141 | "\n", 142 | "best_iteration = int(best_iteration)\n", 143 | "f1 = float(f1)\n", 144 | "print(best_iteration, f1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2020-02-04T01:26:51.699332Z", 153 | "start_time": "2020-02-04T01:26:51.695196Z" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "bt = int(best_iteration * 1)\n", 159 | "print(bt)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "ExecuteTime": { 167 | "end_time": "2020-02-04T01:28:44.322415Z", 168 | "start_time": "2020-02-04T01:26:51.701108Z" 169 | } 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 174 | " if f not in ['id']:\n", 175 | " le = LabelEncoder()\n", 176 | " df_feature[f] = le.fit_transform(\n", 177 | " df_feature[f].astype('str')).astype('int')" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "ExecuteTime": { 185 | "end_time": "2020-02-04T01:29:16.236420Z", 186 | "start_time": "2020-02-04T01:29:16.221939Z" 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "df_test = df_feature[df_feature['target'].isnull()]\n", 192 | "df_train = df_feature[df_feature['target'].notnull()]\n", 193 | "\n", 194 | "del df_feature, df_oof_lgb, df_oof_cat\n", 195 | "gc.collect()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2020-02-04T01:31:34.791258Z", 204 | "start_time": 
"2020-02-04T01:29:20.948750Z" 205 | }, 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "ycol = 'target'\n", 211 | "feature_names = list(\n", 212 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 213 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 214 | " 'hourl', 'group'],\n", 215 | " df_train.columns))\n", 216 | "\n", 217 | "X_train = df_train[feature_names]\n", 218 | "Y_train = df_train[ycol]\n", 219 | "\n", 220 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 221 | " max_depth=10,\n", 222 | " learning_rate=0.4,\n", 223 | " n_estimators=bt,\n", 224 | " subsample=0.8,\n", 225 | " feature_fraction=0.8,\n", 226 | " reg_alpha=0.5,\n", 227 | " reg_lambda=0.5,\n", 228 | " random_state=seed,\n", 229 | " metric='auc',\n", 230 | " )\n", 231 | "\n", 232 | "lgb_model2 = model.fit(X_train,\n", 233 | " Y_train,\n", 234 | " eval_names=['train', 'valid'],\n", 235 | " eval_set=[(X_train, Y_train)],\n", 236 | " verbose=50,\n", 237 | " eval_metric='auc')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "ExecuteTime": { 245 | "end_time": "2020-02-04T01:31:35.054266Z", 246 | "start_time": "2020-02-04T01:31:34.794101Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "df_importance = pd.DataFrame({\n", 252 | " 'feature': feature_names,\n", 253 | " 'importance': lgb_model2.feature_importances_,\n", 254 | "})\n", 255 | "\n", 256 | "df_importance = df_importance.sort_values(by='importance', ascending=False)\n", 257 | "df_importance" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "ExecuteTime": { 265 | "end_time": "2020-02-04T01:31:52.563920Z", 266 | "start_time": "2020-02-04T01:31:35.057758Z" 267 | } 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "test_pred = lgb_model2.predict_proba(\n", 272 | " df_test[feature_names], num_iteration=bt)[:, 1]\n", 273 | "prediction = df_test[['id']]\n", 274 | "prediction['target'] = test_pred\n", 275 | "np.save(os.path.join(current_path, 'prob',\n", 276 | " 'sub_{}.npy'.format(f1)), prediction.values)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "ExecuteTime": { 284 | "end_time": "2020-02-04T01:31:58.920512Z", 285 | "start_time": "2020-02-04T01:31:52.565295Z" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "sub = prediction.copy(deep=True)\n", 291 | "sub['target'] = sub['target'].rank()\n", 292 | "sub['target'] = (sub['target'] >= sub.shape[0] *\n", 293 | " 0.8934642948637943).astype(int)\n", 294 | "sub.to_csv(os.path.join(current_path, 'sub', '{}.csv'.format(f1)),\n", 295 | " index=False, encoding='utf-8')" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "kernelspec": { 301 | "display_name": "Python [conda env:dm] *", 302 | "language": "python", 303 | "name": "conda-env-dm-py" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.6.9" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 2 320 | } 321 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
tuling-video-click-top3
2 | Third place on the online leaderboard of the TuringTopia video click prediction competition
3 | 
4 | # 2020-TURING-TOPIA-Video-Click-SINGLE-LightGBM-top3
5 | ===============================================================================================================
6 | **TuringTopia video click prediction competition, third place online (single LightGBM model)**
7 | ## Host: TuringTopia (图灵联邦)
8 | ## Track: 2020 Video Click Prediction Competition
9 | 
10 | **Competition link**: https://www.turingtopia.com/competitionnew/detail/e4880352b6ef4f9f8f28e8f98498dbc4/sketch
11 | **Timeline**: *2019.11.11-2020.03.09*
12 | **Team members**: [第一次打比赛](https://github.com/LogicJake), [郑](https://github.com/jackhuntcn), [小兔子乖乖](https://github.com/PandasCute), [Freak](https://github.com/BovenPeng/), [luweihai](https://github.com/luweihai)
13 | **Solution write-up**: [document link](https://www.logicjake.xyz/2020/02/10/%E5%9B%BE%E7%81%B5%E8%81%94%E9%82%A6%E8%A7%86%E9%A2%91%E7%82%B9%E5%87%BB%E9%A2%84%E6%B5%8B%E5%A4%A7%E8%B5%9B-%E8%B5%9B%E5%90%8E%E6%80%BB%E7%BB%93/)
14 | **Baidu Netdisk download**: to guard against data loss, a mirror of the dataset is available at https://pan.baidu.com/s/1YPtg4QyiAdhRAMoxjis_Gw (password: 0a3r)
15 | ## 1. Data Description
16 | **train.csv**
17 | 
18 | | Field | Name | Data type | Description |
19 | |:-------:|:-------:|:-------:|:-------:|
20 | |id| Record ID| VARCHAR2(50)| Index of the record in the dataset, from 1 to 11376681|
21 | |target| Clicked| VARCHAR2(50)| Whether the user clicked the video: 1 = clicked, 0 = not clicked.|
22 | |timestamp| Click timestamp| VARCHAR2(50)| Timestamp at which the user clicked the video; NULL if not clicked.|
23 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
24 | |newsid| Video ID| VARCHAR2(50)| The video's id.|
25 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
26 | |pos| Recommendation position| VARCHAR2(50)| Position at which the video was recommended|
27 | |app_version| App version| VARCHAR2(50)| App version.|
28 | |device_vendor| Device vendor| VARCHAR2(50)| Device vendor|
29 | |netmodel| Network type| VARCHAR2(50)| Network type.|
30 | |osversion| OS version| VARCHAR2(50)| Operating system version.|
31 | |lng| Longitude| VARCHAR2(50)| Longitude.|
32 | |lat| Latitude| VARCHAR2(50)| Latitude.|
33 | |device_version| Device version| VARCHAR2(50)| Device version.|
34 | |ts| Exposure timestamp| VARCHAR2(50)| Timestamp at which the video was exposed (shown) to the user.|
35 | 
36 | **test.csv**
37 | 
38 | | Field | Name | Data type | Description |
39 | |:-------:|:-------:|:-------:|:-------:|
40 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
41 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
42 | |newsid| Video ID| VARCHAR2(50)| The video's id.|
43 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
44 | |pos| Recommendation position| VARCHAR2(50)| Position at which the video was recommended|
45 | |app_version| App version| VARCHAR2(50)| App version.|
46 | |device_vendor| Device vendor| VARCHAR2(50)| Device vendor|
47 | |netmodel| Network type| VARCHAR2(50)| Network type.|
48 | |osversion| OS version| VARCHAR2(50)| Operating system version.|
49 | |lng| Longitude| VARCHAR2(50)| Longitude.|
50 | |lat| Latitude| VARCHAR2(50)| Latitude.|
51 | |device_version| Device version| VARCHAR2(50)| Device version.|
52 | |ts| Exposure timestamp| VARCHAR2(50)| Timestamp at which the video was exposed (shown) to the user.|
53 | 
54 | **app.csv**
55 | 
56 | | Field | Name | Data type | Description |
57 | |:-------:|:-------:|:-------:|:-------:|
58 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
59 | |**deviceid**| Device ID| VARCHAR2(50)| The user's device id|
60 | |applist| App list| VARCHAR2(50)| The apps the user has; app names have been anonymized to the form app_1, app_2, ...|
61 | 
62 | **user.csv**
63 | 
64 | | Field | Name | Data type | Description |
65 | |:-------:|:-------:|:-------:|:-------:|
66 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
67 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
68 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
69 | |outertag| User profile tags| VARCHAR2(50)| Profile tags separated by vertical bars; the number after each colon indicates how well the tag fits the user (higher means a better fit).|
70 | |tag| User profile tags| VARCHAR2(50)| Same format as outertag|
71 | |level| User level| VARCHAR2(50)| User level.|
72 | |personidentification| User quality flag| VARCHAR2(50)| 1 = low-quality user, 0 = normal user.|
73 | |followscore| Apprentice score| VARCHAR2(50)| Apprentice (friend) score.|
74 | |personalscore| Personal score| VARCHAR2(50)| Personal score.|
75 | |gender| Gender| VARCHAR2(50)| Gender|
76 | 
77 | ## 2. Environment and Dependencies
78 | - python3
79 | - scikit-learn
80 | - gensim
81 | - Ubuntu
82 | - LightGBM
83 | - notebook
84 | ## 3. How to Run
85 | Run the notebooks in numerical order,
86 | 1 through 4 (a scripted run is sketched below):
87 | > 1 feature.ipynb: feature engineering
88 | > 2 fold_model.ipynb: k-fold models (produce the out-of-fold probabilities)
89 | > 3 offline_model.ipynb: offline (validation) model
90 | > 4 online_model.ipynb: online (submission) model
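91 | 
92 | For convenience, a minimal sketch of a scripted run (assumes `jupyter` with `nbconvert` is installed and the raw data sits in `raw_data/`; adapt paths as needed):
93 | 
94 | ```python
95 | # Sketch: execute the four notebooks in order, saving results in place.
96 | import subprocess
97 | 
98 | notebooks = ['1.feature.ipynb', '2.fold_model.ipynb',
99 |              '3.offline_model.ipynb', '4.online_model.ipynb']
100 | for nb in notebooks:
101 |     subprocess.run(['jupyter', 'nbconvert', '--to', 'notebook',
102 |                     '--execute', '--inplace', nb], check=True)
103 | ```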
104 | 
105 | ## 4. Feature Engineering
106 | - **Raw features**
107 | - **"Time-travel" (future-information) features**
108 | - **Statistical features**
109 | - **Embedding features**
110 | ## 5. Model Training
111 | Single LightGBM model; final preliminary-round leaderboard: 0.83695, third place online.
112 | 
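113 | The submission metric is F1, so both the offline and online notebooks convert predicted probabilities into hard labels by rank: scores at or above the 0.8934642948637943 quantile of the ranking are marked positive. Below is a minimal, self-contained sketch of that conversion on hypothetical toy scores (only the quantile value comes from the notebooks; everything else is illustrative):
114 | 
115 | ```python
116 | # Sketch: rank-threshold conversion from probabilities to 0/1 labels.
117 | # The top (1 - quantile) share of scores is labelled positive.
118 | import numpy as np
119 | import pandas as pd
120 | 
121 | quantile = 0.8934642948637943            # positive-rate cut used in the notebooks
122 | scores = pd.Series(np.random.rand(100))  # hypothetical predicted probabilities
123 | 
124 | ranks = scores.rank()                    # ranks 1..n, ties averaged
125 | labels = (ranks >= len(scores) * quantile).astype(int)
126 | print(labels.sum(), 'positives out of', len(scores))
127 | ```
128 | 

--------------------------------------------------------------------------------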