├── 1.feature.ipynb ├── 2.fold_model.ipynb ├── 3.offline_model.ipynb ├── 4.online_model.ipynb └── README.md /1.feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import os\n", 12 | "from sklearn.preprocessing import LabelEncoder\n", 13 | "from tqdm import tqdm\n", 14 | "import lightgbm as lgb\n", 15 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 16 | "import warnings\n", 17 | "from sklearn.metrics import f1_score, classification_report\n", 18 | "import gc\n", 19 | "import xgboost as xgb\n", 20 | "from scipy import stats\n", 21 | "import datetime\n", 22 | "import time\n", 23 | "from scipy.stats import entropy, kurtosis\n", 24 | "import multiprocessing\n", 25 | "from gensim.models.word2vec import LineSentence\n", 26 | "from gensim.corpora import WikiCorpus\n", 27 | "from gensim.models import Word2Vec\n", 28 | "tqdm.pandas()\n", 29 | "\n", 30 | "pd.set_option('display.max_columns', None)\n", 31 | "pd.set_option('display.max_rows', None)\n", 32 | "\n", 33 | "warnings.filterwarnings('ignore')" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "df_train = pd.read_csv(os.path.join(current_path, 'raw_data', 'train.csv'))\n", 53 | "df_test = pd.read_csv(os.path.join(current_path, 'raw_data', 'test.csv'))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "df_train.head()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "df_feature = pd.concat([df_train, df_test], sort=False)\n", 72 | "df_feature = df_feature.sort_values(\n", 73 | " ['deviceid', 'ts']).reset_index().drop('index', axis=1)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "df_feature['newsid'] = df_feature['newsid'].map(lambda x: str(x))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# 时间\n", 92 | "df_feature['ts_datetime'] = df_feature['ts'] + 8 * 60 * 60 * 1000\n", 93 | "df_feature['ts_datetime'] = pd.to_datetime(\n", 94 | " df_feature['ts_datetime'], unit='ms')\n", 95 | "df_feature['day'] = df_feature['ts_datetime'].dt.day\n", 96 | "df_feature['hour'] = df_feature['ts_datetime'].dt.hour\n", 97 | "df_feature['minute'] = df_feature['ts_datetime'].dt.minute\n", 98 | "df_feature['minute10'] = (df_feature['minute'] // 10) * 10\n", 99 | "\n", 100 | "df_feature['hourl'] = df_feature['day'] * 24 + df_feature['hour']\n", 101 | "df_feature['hourl'] = df_feature['hourl'] - df_feature['hourl'].min()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "# 基本特征" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "group = df_feature.groupby('deviceid')\n", 118 | "df_feature['ts_before'] = 
group['ts'].shift(0) - group['ts'].shift(1)\n", 119 | "df_feature['ts_before'] = df_feature['ts_before'].fillna(3 * 60 * 1000)\n", 120 | "INDEX = df_feature[df_feature['ts_before'] > (3 * 60 * 1000 - 1)].index\n", 121 | "df_feature['ts_before'] = np.log(df_feature['ts_before'] // 1000 + 1)\n", 122 | "LENGTH = len(INDEX)\n", 123 | "ts_len = []\n", 124 | "group = []\n", 125 | "for i in tqdm(range(1, LENGTH)):\n", 126 | " ts_len += [(INDEX[i] - INDEX[i - 1])] * (INDEX[i] - INDEX[i - 1])\n", 127 | " group += [i] * (INDEX[i] - INDEX[i - 1])\n", 128 | "ts_len += [(len(df_feature) - INDEX[LENGTH - 1])] * \\\n", 129 | " (len(df_feature) - INDEX[LENGTH - 1])\n", 130 | "group += [LENGTH] * (len(df_feature) - INDEX[LENGTH - 1])\n", 131 | "df_feature['ts_before_len'] = ts_len\n", 132 | "df_feature['group'] = group\n", 133 | "\n", 134 | "group = df_feature.groupby('deviceid')\n", 135 | "df_feature['ts_after'] = group['ts'].shift(-1) - group['ts'].shift(0)\n", 136 | "df_feature['ts_after'] = df_feature['ts_after'].fillna(3 * 60 * 1000)\n", 137 | "INDEX = df_feature[df_feature['ts_after'] > (3 * 60 * 1000 - 1)].index\n", 138 | "df_feature['ts_after'] = np.log(df_feature['ts_after'] // 1000 + 1)\n", 139 | "LENGTH = len(INDEX)\n", 140 | "ts_len = [INDEX[0]] * (INDEX[0] + 1)\n", 141 | "for i in tqdm(range(1, LENGTH)):\n", 142 | " ts_len += [(INDEX[i] - INDEX[i - 1])] * (INDEX[i] - INDEX[i - 1])\n", 143 | "df_feature['ts_after_len'] = ts_len" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# 类别交叉特征\n", 153 | "df_feature['devicevendor_osv'] = df_feature['device_vendor'].astype(\n", 154 | " 'str') + '_' + df_feature['osversion'].astype('str')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "# 下一次 pos\n", 164 | "df_feature['before_pos'] = df_feature.groupby(['deviceid'])['pos'].shift(1)\n", 165 | "df_feature['next_pos'] = df_feature.groupby(['deviceid'])['pos'].shift(-1)\n", 166 | "df_feature['diff_pos'] = df_feature['next_pos'] - df_feature['pos']\n", 167 | "\n", 168 | "# 距离变化\n", 169 | "df_feature['next_lat'] = df_feature.groupby(['deviceid'])['lat'].shift(-1)\n", 170 | "df_feature['next_lng'] = df_feature.groupby(['deviceid'])['lng'].shift(-1)\n", 171 | "df_feature['dist_diff'] = (df_feature['next_lat'] - df_feature['lat']\n", 172 | " ) ** 2 + (df_feature['lng'] - df_feature['next_lng']) ** 2\n", 173 | "\n", 174 | "del df_feature['next_lat']\n", 175 | "del df_feature['next_lng']\n", 176 | "\n", 177 | "# 下一次 网络\n", 178 | "df_feature['next_netmodel'] = df_feature.groupby(['deviceid'])[\n", 179 | " 'netmodel'].shift(-1)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "df_feature.head()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "# 历史特征" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## day 为单位 " 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "# 对前一天的样本的所有反应时间进行统计量提取\n", 212 | "df_temp = df_feature[df_feature['target'] == 1]\n", 213 | "df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']\n", 214 | "\n", 215 | "col = 'deviceid'\n", 216 | "col2 = 'click_minus'\n", 217 | "\n", 218 | 
"df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({\n", 219 | " 'yesterday_{}_{}_max'.format(col, col2): 'max',\n", 220 | " 'yesterday_{}_{}_mean'.format(col, col2): 'mean',\n", 221 | " 'yesterday_{}_{}_min'.format(col, col2): 'min',\n", 222 | " 'yesterday_{}_{}_std'.format(col, col2): 'std',\n", 223 | " 'yesterday_{}_{}_median'.format(col, col2): 'median',\n", 224 | " 'yesterday_{}_{}_kurt'.format(col, col2): kurtosis,\n", 225 | " 'yesterday_{}_{}_q3'.format(col, col2): lambda x: np.quantile(x, q=0.75),\n", 226 | "})\n", 227 | "df_temp['day'] += 1\n", 228 | "\n", 229 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 230 | "\n", 231 | "del df_temp\n", 232 | "gc.collect()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "# 昨日 deviceid 点击次数,点击率\n", 242 | "col = 'deviceid'\n", 243 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 244 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 245 | " 'yesterday_{}_count'.format(col): 'count',\n", 246 | "})\n", 247 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 248 | " / df_temp['yesterday_{}_count'.format(col)]\n", 249 | "df_temp['day'] += 1\n", 250 | "del df_temp['yesterday_{}_count'.format(col)]\n", 251 | "\n", 252 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 253 | "\n", 254 | "del df_temp\n", 255 | "gc.collect()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# 昨日小时点击率\n", 265 | "groups = ['deviceid', 'hour']\n", 266 | "df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({\n", 267 | " 'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',\n", 268 | " 'yesterday_{}_count'.format('_'.join(groups)): 'count',\n", 269 | "})\n", 270 | "\n", 271 | "df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \\\n", 272 | " / df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 273 | "df_temp['day'] += 1\n", 274 | "\n", 275 | "del df_temp['yesterday_{}_click_count'.format('_'.join(groups))]\n", 276 | "del df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 277 | "\n", 278 | "df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')\n", 279 | "\n", 280 | "del df_temp\n", 281 | "gc.collect()" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# 昨日曝光 pos 平均值\n", 291 | "col = 'deviceid'\n", 292 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['pos'].agg({\n", 293 | " 'yesterday_{}_pos_mean'.format(col): 'mean',\n", 294 | "})\n", 295 | "df_temp['day'] += 1\n", 296 | "\n", 297 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 298 | "\n", 299 | "del df_temp\n", 300 | "gc.collect()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# 昨日 deviceid netmodel 点击率\n", 310 | "groups = ['deviceid', 'netmodel']\n", 311 | "df_temp = df_feature.groupby(groups + ['day'], as_index=False)['target'].agg({\n", 312 | " 'yesterday_{}_click_count'.format('_'.join(groups)): 'sum',\n", 313 | " 'yesterday_{}_count'.format('_'.join(groups)): 'count',\n", 314 | "})\n", 315 | "\n", 
316 | "df_temp['yesterday_{}_ctr'.format('_'.join(groups))] = df_temp['yesterday_{}_click_count'.format('_'.join(groups))] \\\n", 317 | " / df_temp['yesterday_{}_count'.format('_'.join(groups))]\n", 318 | "\n", 319 | "df_temp['day'] += 1\n", 320 | "\n", 321 | "df_feature = df_feature.merge(df_temp, on=groups + ['day'], how='left')\n", 322 | "df_feature['yesterday_deviceid_netmodel_click_ratio'] = df_feature['yesterday_deviceid_netmodel_click_count'] / \\\n", 323 | " df_feature['yesterday_deviceid_click_count']\n", 324 | "\n", 325 | "del df_feature['yesterday_{}_click_count'.format('_'.join(groups))]\n", 326 | "del df_feature['yesterday_{}_count'.format('_'.join(groups))]\n", 327 | "\n", 328 | "del df_temp\n", 329 | "gc.collect()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "# 对前一天的 newsid 所有反应时间进行统计量提取\n", 339 | "df_temp = df_feature[df_feature['target'] == 1]\n", 340 | "df_temp['click_minus'] = df_temp['timestamp'] - df_temp['ts']\n", 341 | "\n", 342 | "col = 'newsid'\n", 343 | "col2 = 'click_minus'\n", 344 | "\n", 345 | "df_temp = df_temp.groupby([col, 'day'], as_index=False)[col2].agg({\n", 346 | " 'yesterday_{}_{}_std'.format(col, col2): 'std',\n", 347 | "})\n", 348 | "df_temp['day'] += 1\n", 349 | "\n", 350 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 351 | "\n", 352 | "del df_temp\n", 353 | "gc.collect()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "# 昨日 newsid 点击次数,点击率\n", 363 | "col = 'newsid'\n", 364 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 365 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 366 | " 'yesterday_{}_count'.format(col): 'count',\n", 367 | "})\n", 368 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 369 | " / df_temp['yesterday_{}_count'.format(col)]\n", 370 | "\n", 371 | "df_temp['day'] += 1\n", 372 | "del df_temp['yesterday_{}_count'.format(col)]\n", 373 | "\n", 374 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 375 | "\n", 376 | "del df_temp\n", 377 | "gc.collect()" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "# 昨日 next_pos 点击率\n", 387 | "col = 'next_pos'\n", 388 | "df_temp = df_feature.groupby([col, 'day'], as_index=False)['target'].agg({\n", 389 | " 'yesterday_{}_click_count'.format(col): 'sum',\n", 390 | " 'yesterday_{}_count'.format(col): 'count',\n", 391 | "})\n", 392 | "df_temp['yesterday_{}_ctr'.format(col)] = df_temp['yesterday_{}_click_count'.format(col)] \\\n", 393 | " / df_temp['yesterday_{}_count'.format(col)]\n", 394 | "\n", 395 | "df_temp['day'] += 1\n", 396 | "\n", 397 | "del df_temp['yesterday_{}_count'.format(col)]\n", 398 | "del df_temp['yesterday_{}_click_count'.format(col)]\n", 399 | "\n", 400 | "df_feature = df_feature.merge(df_temp, on=[col, 'day'], how='left')\n", 401 | "\n", 402 | "del df_temp\n", 403 | "gc.collect()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "cat_list = tqdm([['deviceid', 'netmodel']])\n", 413 | "for f1, f2 in cat_list:\n", 414 | " df_feature['t_{}_count'.format(f1)] = df_feature.groupby([f1, 'day'])[\n", 415 | " 'id'].transform('count')\n", 416 | " 
df_feature['t_{}_count'.format(f2)] = df_feature.groupby([f2, 'day'])[\n", 417 | " 'id'].transform('count')\n", 418 | " df_feature['t_{}_count'.format('_'.join([f1, f2]))] = df_feature.groupby([\n", 419 | " f1, f2, 'day'])['id'].transform('count')\n", 420 | "\n", 421 | " df_feature['{}_coratio'.format('_'.join([f1, f2]))] = (df_feature['t_{}_count'.format(\n", 422 | " f1)] * df_feature['t_{}_count'.format(f2)]) / df_feature['t_{}_count'.format('_'.join([f1, f2]))]\n", 423 | " df_feature['yesterday_{}_coratio'.format('_'.join([f1, f2]))] = df_feature.groupby(\n", 424 | " [f1, f2, 'day'])['{}_coratio'.format('_'.join([f1, f2]))].shift()\n", 425 | "\n", 426 | " del df_feature['t_{}_count'.format(f1)]\n", 427 | " del df_feature['t_{}_count'.format(f2)]\n", 428 | " del df_feature['t_{}_count'.format('_'.join([f1, f2]))]\n", 429 | " del df_feature['{}_coratio'.format('_'.join([f1, f2]))]\n", 430 | "\n", 431 | " gc.collect()" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": null, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "df_feature.head()" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "## 以 hour 为单位" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "# 一小时之前 deviceid 点击次数,点击率\n", 457 | "col = 'deviceid'\n", 458 | "df_temp = df_feature.groupby([col, 'hourl'], as_index=False)['id'].agg({\n", 459 | " 'pre_hour_{}_count'.format(col): 'count',\n", 460 | "})\n", 461 | "df_temp['hourl'] += 1\n", 462 | "\n", 463 | "df_feature = df_feature.merge(df_temp, on=[col, 'hourl'], how='left')\n", 464 | "\n", 465 | "del df_temp\n", 466 | "gc.collect()" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "df_feature.head()" 476 | ] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": {}, 481 | "source": [ 482 | "# 统计特征" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "cat_list = [['deviceid'], ['guid'], ['newsid'], ['deviceid', 'pos'], ['newsid', 'pos'],\n", 492 | " ['deviceid', 'guid', 'newsid'], ['deviceid', 'next_pos']]\n", 493 | "for f in tqdm(cat_list):\n", 494 | " df_feature['{}_day_count'.format('_'.join(f))] = df_feature.groupby([\n", 495 | " 'day'] + f)['id'].transform('count')\n", 496 | "\n", 497 | "cat_list = [['deviceid'], ['guid'], [\n", 498 | " 'deviceid', 'pos'], ['deviceid', 'netmodel']]\n", 499 | "for f in tqdm(cat_list):\n", 500 | " df_feature['{}_minute10_count'.format('_'.join(f))] = df_feature.groupby(\n", 501 | " ['day', 'hour', 'minute10'] + f)['id'].transform('count')\n", 502 | "\n", 503 | "cat_list = [['deviceid', 'netmodel']]\n", 504 | "for f in tqdm(cat_list):\n", 505 | " df_feature['{}_hour_count'.format('_'.join(f))] = df_feature.groupby([\n", 506 | " 'hourl'] + f)['id'].transform('count')\n", 507 | "\n", 508 | "cat_list = [['deviceid', 'group', 'pos']]\n", 509 | "for f in tqdm(cat_list):\n", 510 | " df_feature['{}_count'.format('_'.join(f))] = df_feature.groupby(f)[\n", 511 | " 'id'].transform('count')" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": null, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "col = 'group'\n", 521 | "df_temp = df_feature.groupby([col], as_index=False)['ts_before'].agg({\n", 522 | " 
'{}_ts_before_mean'.format(col): 'mean',\n", 523 | " '{}_ts_before_std'.format(col): 'std'\n", 524 | "})\n", 525 | "df_feature = df_feature.merge(df_temp, on=col, how='left')\n", 526 | "\n", 527 | "del df_temp\n", 528 | "gc.collect()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "col = 'deviceid'\n", 538 | "df_temp = df_feature.groupby([col], as_index=False)['ts_after'].agg({\n", 539 | " '{}_ts_after_mean'.format('deviceid'): 'mean',\n", 540 | " '{}_ts_after_std'.format('deviceid'): 'std',\n", 541 | " '{}_ts_after_median'.format('deviceid'): 'median',\n", 542 | " '{}_ts_after_skew'.format('deviceid'): 'skew',\n", 543 | "})\n", 544 | "df_feature = df_feature.merge(df_temp, on=col, how='left')\n", 545 | "\n", 546 | "del df_temp\n", 547 | "gc.collect()" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df_temp = df_feature.groupby(['deviceid', 'hourl'], as_index=False)[\n", 557 | " 'target'].agg({'hour_count': 'size'})\n", 558 | "df_temp = df_temp.groupby(['deviceid'], as_index=False)['hour_count'].agg({\n", 559 | " '{}_hour_count_mean'.format('deviceid'): 'mean'\n", 560 | "})\n", 561 | "\n", 562 | "df_feature = df_feature.merge(df_temp, how='left')\n", 563 | "\n", 564 | "del df_temp\n", 565 | "gc.collect()" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": {}, 572 | "outputs": [], 573 | "source": [ 574 | "df_feature['deviceid_hour_cumsum'] = df_feature.groupby(['deviceid', 'hourl'])[\n", 575 | " 'ts'].cumcount()" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": { 582 | "scrolled": true 583 | }, 584 | "outputs": [], 585 | "source": [ 586 | "df_temp = df_feature[['deviceid', 'day', 'deviceid_day_count']].copy(deep=True)\n", 587 | "df_temp.drop_duplicates(inplace=True)\n", 588 | "df_temp['deviceid_day_count_diff_1'] = df_temp.groupby(\n", 589 | " ['deviceid'])['deviceid_day_count'].diff()\n", 590 | "\n", 591 | "del df_temp['deviceid_day_count']\n", 592 | "df_feature = df_feature.merge(df_temp, how='left')\n", 593 | "\n", 594 | "del df_temp\n", 595 | "gc.collect()" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "# 未来一小时 deviceid, netmodel 曝光数量\n", 605 | "cat_list = [['deviceid', 'netmodel']]\n", 606 | "for f in tqdm(cat_list):\n", 607 | " df_feature['temp'] = df_feature.groupby(\n", 608 | " ['hourl'] + f)['id'].transform('count')\n", 609 | " df_feature['next_{}_hour_count'.format('_'.join(f))] = df_feature.groupby(f)[\n", 610 | " 'temp'].shift(-1)\n", 611 | "\n", 612 | " del df_feature['temp']" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "df_feature.head()" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "# ts 相关特征" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": null, 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 638 | "for f in [['deviceid']]:\n", 639 | " tmp = sort_df.groupby(f)\n", 640 | " # 前x次曝光到当前的时间差\n", 641 | " for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30]):\n", 642 | " 
sort_df['{}_prev{}_exposure_ts_gap'.format(\n", 643 | " '_'.join(f), gap)] = tmp['ts'].shift(0) - tmp['ts'].shift(gap)\n", 644 | " tmp2 = sort_df[\n", 645 | " f + ['ts', '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 646 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 647 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 648 | "\n", 649 | "del tmp2, sort_df, tmp\n", 650 | "gc.collect()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 660 | "for f in [['netmodel', 'deviceid']]:\n", 661 | " tmp = sort_df.groupby(f)\n", 662 | " # 前x次曝光到当前的时间差\n", 663 | " for gap in tqdm([2, 3]):\n", 664 | " sort_df['{}_prev{}_exposure_ts_gap'.format(\n", 665 | " '_'.join(f), gap)] = tmp['ts'].shift(0) - tmp['ts'].shift(gap)\n", 666 | " tmp2 = sort_df[\n", 667 | " f + ['ts', '{}_prev{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 668 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 669 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 670 | "\n", 671 | "del tmp2, sort_df, tmp\n", 672 | "gc.collect()" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 682 | "for f in [['deviceid']]:\n", 683 | " tmp = sort_df.groupby(f)\n", 684 | " # 后x次曝光到当前的时间差\n", 685 | " for gap in tqdm([2, 3, 4, 5, 8, 10, 20, 30, 50]):\n", 686 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 687 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 688 | " tmp2 = sort_df[\n", 689 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 690 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 691 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 692 | "\n", 693 | "del tmp2, sort_df, tmp\n", 694 | "gc.collect()" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "metadata": {}, 701 | "outputs": [], 702 | "source": [ 703 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 704 | "for f in [['pos', 'deviceid']]:\n", 705 | " tmp = sort_df.groupby(f)\n", 706 | " # 后x次曝光到当前的时间差\n", 707 | " for gap in tqdm([1, 2]):\n", 708 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 709 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 710 | " tmp2 = sort_df[\n", 711 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 712 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 713 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 714 | "\n", 715 | "del tmp2, sort_df, tmp\n", 716 | "gc.collect()" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 726 | "for f in [['netmodel', 'deviceid']]:\n", 727 | " tmp = sort_df.groupby(f)\n", 728 | " # 后x次曝光到当前的时间差\n", 729 | " for gap in tqdm([1, 2]):\n", 730 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 731 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 732 | " tmp2 = sort_df[\n", 733 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 734 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 735 | " 
df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 736 | "\n", 737 | "del tmp2, sort_df, tmp\n", 738 | "gc.collect()" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 748 | "for f in [['pos', 'netmodel', 'deviceid']]:\n", 749 | " tmp = sort_df.groupby(f)\n", 750 | " # 后x次曝光到当前的时间差\n", 751 | " for gap in tqdm([1]):\n", 752 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 753 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 754 | " tmp2 = sort_df[\n", 755 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 756 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 757 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 758 | "\n", 759 | "del tmp2, sort_df, tmp\n", 760 | "gc.collect()" 761 | ] 762 | }, 763 | { 764 | "cell_type": "code", 765 | "execution_count": null, 766 | "metadata": {}, 767 | "outputs": [], 768 | "source": [ 769 | "df_feature['lng_lat'] = df_feature['lng'].astype(\n", 770 | " 'str') + '_' + df_feature['lat'].astype('str')\n", 771 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 772 | "for f in [['deviceid', 'lng_lat']]:\n", 773 | " tmp = sort_df.groupby(f)\n", 774 | " # 后x次曝光到当前的时间差\n", 775 | " for gap in tqdm([1]):\n", 776 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 777 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 778 | " tmp2 = sort_df[\n", 779 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 780 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 781 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 782 | "\n", 783 | "del tmp2, sort_df, tmp\n", 784 | "gc.collect()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "sort_df = df_feature.sort_values('ts').reset_index(drop=True)\n", 794 | "for f in [['pos', 'deviceid', 'lng_lat']]:\n", 795 | " tmp = sort_df.groupby(f)\n", 796 | " # 后x次曝光到当前的时间差\n", 797 | " for gap in tqdm([1]):\n", 798 | " sort_df['{}_next{}_exposure_ts_gap'.format(\n", 799 | " '_'.join(f), gap)] = tmp['ts'].shift(-gap) - tmp['ts'].shift(0)\n", 800 | " tmp2 = sort_df[\n", 801 | " f + ['ts', '{}_next{}_exposure_ts_gap'.format('_'.join(f), gap)]\n", 802 | " ].drop_duplicates(f + ['ts']).reset_index(drop=True)\n", 803 | " df_feature = df_feature.merge(tmp2, on=f + ['ts'], how='left')\n", 804 | "\n", 805 | "del tmp2, sort_df, tmp\n", 806 | "gc.collect()" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": {}, 813 | "outputs": [], 814 | "source": [ 815 | "for gap in tqdm([2, 3, 4, 5, 6, 7]):\n", 816 | " df_feature['next_pos{}'.format(gap)] = df_feature.groupby(\n", 817 | " ['deviceid'])['pos'].shift(-gap)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": null, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "df_feature['next_pos_ts'] = df_feature['next_pos'] * \\\n", 827 | " 100 + df_feature['ts_after']" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "df_feature.head()" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": {}, 842 | "source": [ 843 | "# user 表" 844 | ] 845 | }, 846 | { 847 | 
"cell_type": "code", 848 | "execution_count": null, 849 | "metadata": {}, 850 | "outputs": [], 851 | "source": [ 852 | "df_user = pd.read_csv(os.path.join(current_path, 'raw_data', 'user.csv'))\n", 853 | "df_feature = df_feature.merge(\n", 854 | " df_user[['deviceid', 'guid', 'level']], how='left', on=['deviceid', 'guid'])" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [ 863 | "df_tag = df_user[['deviceid', 'tag']].copy()\n", 864 | "\n", 865 | "node_pairs = []\n", 866 | "for item in tqdm(df_user[['deviceid', 'tag']].values):\n", 867 | " deviceid = str(item[0])\n", 868 | " tags = item[1]\n", 869 | "\n", 870 | " if type(tags) != float:\n", 871 | " tags = tags.split('|')\n", 872 | " for tag in tags:\n", 873 | " try:\n", 874 | " key, value = tag.split(':')\n", 875 | " except Exception:\n", 876 | " pass\n", 877 | " node_pairs.append([deviceid, key, value])\n", 878 | "\n", 879 | "df_tag = pd.DataFrame(node_pairs)\n", 880 | "df_tag.columns = ['deviceid', 'tag', 'score']\n", 881 | "df_tag['score'] = df_tag['score'].astype('float')\n", 882 | "\n", 883 | "df_temp = df_tag.groupby(['deviceid'])['score'].agg({'tag_score_mean': 'mean',\n", 884 | " 'tag_score_std': 'std',\n", 885 | " 'tag_score_count': 'count',\n", 886 | " 'tag_score_q2': lambda x: np.quantile(x, q=0.5),\n", 887 | " 'tag_score_q3': lambda x: np.quantile(x, q=0.75),\n", 888 | " }).reset_index()\n", 889 | "\n", 890 | "df_feature = df_feature.merge(df_temp, how='left')\n", 891 | "\n", 892 | "del df_temp\n", 893 | "del df_tag\n", 894 | "\n", 895 | "gc.collect()" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "# embedding" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": null, 908 | "metadata": {}, 909 | "outputs": [], 910 | "source": [ 911 | "from gensim.models import Word2Vec\n", 912 | "\n", 913 | "\n", 914 | "def emb(df, f1, f2):\n", 915 | " emb_size = 16\n", 916 | " print('====================================== {} {} ======================================'.format(f1, f2))\n", 917 | " tmp = df.groupby(f1, as_index=False)[f2].agg(\n", 918 | " {'{}_{}_list'.format(f1, f2): list})\n", 919 | " sentences = tmp['{}_{}_list'.format(f1, f2)].values.tolist()\n", 920 | " del tmp['{}_{}_list'.format(f1, f2)]\n", 921 | " for i in range(len(sentences)):\n", 922 | " sentences[i] = [str(x) for x in sentences[i]]\n", 923 | " model = Word2Vec(sentences, size=emb_size, window=5,\n", 924 | " min_count=5, sg=0, hs=1, seed=2019)\n", 925 | " emb_matrix = []\n", 926 | " for seq in sentences:\n", 927 | " vec = []\n", 928 | " for w in seq:\n", 929 | " if w in model:\n", 930 | " vec.append(model[w])\n", 931 | " if len(vec) > 0:\n", 932 | " emb_matrix.append(np.mean(vec, axis=0))\n", 933 | " else:\n", 934 | " emb_matrix.append([0] * emb_size)\n", 935 | "\n", 936 | " df_emb = pd.DataFrame(emb_matrix)\n", 937 | " df_emb.columns = ['{}_{}_emb_{}'.format(\n", 938 | " f1, f2, i) for i in range(emb_size)]\n", 939 | "\n", 940 | " tmp = pd.concat([tmp, df_emb], axis=1)\n", 941 | "\n", 942 | " del model, emb_matrix, sentences\n", 943 | " return tmp" 944 | ] 945 | }, 946 | { 947 | "cell_type": "code", 948 | "execution_count": null, 949 | "metadata": {}, 950 | "outputs": [], 951 | "source": [ 952 | "for f1, f2 in [['newsid', 'deviceid'], ['lng_lat', 'deviceid']]:\n", 953 | " df_feature = df_feature.merge(emb(df_feature, f1, f2), on=f1, how='left')" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 
958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "df_feature['o_d'] = df_feature['deviceid'].astype(\n", 963 | " str)+'_'+df_feature['newsid'].astype(str)\n", 964 | "\n", 965 | "sentence = df_feature[['deviceid', 'newsid', 'o_d']].astype(\n", 966 | " str).fillna('-1').astype(str).values\n", 967 | "sentence = sentence.tolist()\n", 968 | "print('training...')\n", 969 | "np.random.seed(2019)\n", 970 | "\n", 971 | "L = 5\n", 972 | "model = Word2Vec(sentence, size=L, window=20, min_count=3,\n", 973 | " workers=multiprocessing.cpu_count(), iter=10)\n", 974 | "print('outputing...')\n", 975 | "\n", 976 | "\n", 977 | "for fea in tqdm(['deviceid', 'newsid', 'o_d']):\n", 978 | " values = df_feature[fea].unique()\n", 979 | " print(len(values))\n", 980 | " w2v = []\n", 981 | " for i in values:\n", 982 | " a = [i]\n", 983 | " if str(i) in model:\n", 984 | " a.extend(model[str(i)])\n", 985 | " else:\n", 986 | " a.extend(np.ones(L) * -10)\n", 987 | " w2v.append(a)\n", 988 | " w2v = pd.DataFrame(w2v)\n", 989 | " w2v.columns = [fea, fea+'_w2v_1', fea+'_w2v_2', fea+'_w2v_3',\n", 990 | " fea+'_w2v_4', fea+'_w2v_5']\n", 991 | " df_feature = df_feature.merge(w2v, on=fea, how='left')" 992 | ] 993 | }, 994 | { 995 | "cell_type": "code", 996 | "execution_count": null, 997 | "metadata": {}, 998 | "outputs": [], 999 | "source": [ 1000 | "df_feature['o_d1'] = df_feature['lng'].astype(\n", 1001 | " str)+'_'+df_feature['lat'].astype(str)\n", 1002 | "\n", 1003 | "sentence = df_feature[['lng', 'lat', 'o_d1']].astype(\n", 1004 | " str).fillna('-1').astype(str).values\n", 1005 | "sentence = sentence.tolist()\n", 1006 | "print('training...')\n", 1007 | "np.random.seed(2019)\n", 1008 | "\n", 1009 | "L = 5\n", 1010 | "model = Word2Vec(sentence, size=L, window=20, min_count=3,\n", 1011 | " workers=multiprocessing.cpu_count(), iter=10)\n", 1012 | "print('outputing...')\n", 1013 | "\n", 1014 | "for fea in tqdm(['lng', 'lat', 'o_d1']):\n", 1015 | " values = df_feature[fea].unique()\n", 1016 | " print(len(values))\n", 1017 | " w2v = []\n", 1018 | " for i in values:\n", 1019 | " a = [i]\n", 1020 | " if str(i) in model:\n", 1021 | " a.extend(model[str(i)])\n", 1022 | " else:\n", 1023 | " a.extend(np.ones(L) * -10)\n", 1024 | " w2v.append(a)\n", 1025 | " w2v = pd.DataFrame(w2v)\n", 1026 | " w2v.columns = [fea, fea+'_w2v_1', fea+'_w2v_2', fea+'_w2v_3',\n", 1027 | " fea+'_w2v_4', fea+'_w2v_5']\n", 1028 | " df_feature = df_feature.merge(w2v, on=fea, how='left')" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "markdown", 1033 | "metadata": {}, 1034 | "source": [ 1035 | "# 减少内存" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": null, 1041 | "metadata": {}, 1042 | "outputs": [], 1043 | "source": [ 1044 | "# Function to reduce the memory usage\n", 1045 | "def reduce_mem_usage(df, verbose=True):\n", 1046 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", 1047 | " start_mem = df.memory_usage().sum() / 1024**2\n", 1048 | " for col in tqdm(df.columns):\n", 1049 | " col_type = df[col].dtypes\n", 1050 | " if col_type in numerics:\n", 1051 | " c_min = df[col].min()\n", 1052 | " c_max = df[col].max()\n", 1053 | " if str(col_type)[:3] == 'int':\n", 1054 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(\n", 1055 | " np.int8).max:\n", 1056 | " df[col] = df[col].astype(np.int8)\n", 1057 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(\n", 1058 | " np.int16).max:\n", 1059 | " df[col] = df[col].astype(np.int16)\n", 1060 | " 
elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(\n", 1061 | " np.int32).max:\n", 1062 | " df[col] = df[col].astype(np.int32)\n", 1063 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(\n", 1064 | " np.int64).max:\n", 1065 | " df[col] = df[col].astype(np.int64)\n", 1066 | " else:\n", 1067 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(\n", 1068 | " np.float16).max:\n", 1069 | " df[col] = df[col].astype(np.float16)\n", 1070 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(\n", 1071 | " np.float32).max:\n", 1072 | " df[col] = df[col].astype(np.float32)\n", 1073 | " else:\n", 1074 | " df[col] = df[col].astype(np.float64)\n", 1075 | " end_mem = df.memory_usage().sum() / 1024**2\n", 1076 | " if verbose:\n", 1077 | " print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(\n", 1078 | " end_mem, 100 * (start_mem - end_mem) / start_mem))\n", 1079 | " return df" 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": null, 1085 | "metadata": {}, 1086 | "outputs": [], 1087 | "source": [ 1088 | "df_feature = reduce_mem_usage(df_feature)" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": null, 1094 | "metadata": {}, 1095 | "outputs": [], 1096 | "source": [ 1097 | "df_feature.to_pickle(os.path.join(current_path, 'feature', 'feature.pickle'))" 1098 | ] 1099 | } 1100 | ], 1101 | "metadata": { 1102 | "kernelspec": { 1103 | "display_name": "Python [conda env:dm] *", 1104 | "language": "python", 1105 | "name": "conda-env-dm-py" 1106 | }, 1107 | "language_info": { 1108 | "codemirror_mode": { 1109 | "name": "ipython", 1110 | "version": 3 1111 | }, 1112 | "file_extension": ".py", 1113 | "mimetype": "text/x-python", 1114 | "name": "python", 1115 | "nbconvert_exporter": "python", 1116 | "pygments_lexer": "ipython3", 1117 | "version": "3.6.9" 1118 | } 1119 | }, 1120 | "nbformat": 4, 1121 | "nbformat_minor": 4 1122 | } 1123 | -------------------------------------------------------------------------------- /2.fold_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-01T15:00:19.964638Z", 9 | "start_time": "2020-02-01T15:00:15.809763Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score, roc_auc_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-01T15:00:19.968893Z", 37 | "start_time": "2020-02-01T15:00:19.966513Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019\n", 44 | "n_fold = 5" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-02-01T12:40:36.400426Z", 53 | "start_time": "2020-02-01T12:40:08.871667Z" 54 | }, 55 | "scrolled": true 56 
| }, 57 | "outputs": [], 58 | "source": [ 59 | "df_feature = pd.read_pickle(os.path.join(\n", 60 | " current_path, 'feature', 'feature.pickle'))" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-02-01T12:40:36.512167Z", 69 | "start_time": "2020-02-01T12:40:36.401765Z" 70 | }, 71 | "scrolled": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "df_feature.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "ExecuteTime": { 83 | "end_time": "2020-02-01T12:42:29.640354Z", 84 | "start_time": "2020-02-01T12:40:36.513434Z" 85 | } 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 90 | " if f not in ['id']:\n", 91 | " le = LabelEncoder()\n", 92 | " df_feature[f] = le.fit_transform(\n", 93 | " df_feature[f].astype('str')).astype('int')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "ExecuteTime": { 101 | "end_time": "2020-02-01T12:42:43.750204Z", 102 | "start_time": "2020-02-01T12:42:29.642972Z" 103 | } 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df_test = df_feature[df_feature['target'].isnull()].copy()\n", 108 | "df_train = df_feature[df_feature['target'].notnull()].copy()\n", 109 | "\n", 110 | "del df_feature\n", 111 | "gc.collect()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "ExecuteTime": { 119 | "end_time": "2020-02-01T13:43:58.090439Z", 120 | "start_time": "2020-02-01T12:42:43.751557Z" 121 | }, 122 | "scrolled": true 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "ycol = 'target'\n", 127 | "feature_names = list(\n", 128 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 129 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 130 | " 'hourl', 'group'],\n", 131 | " df_train.columns))\n", 132 | "\n", 133 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 134 | " max_depth=10,\n", 135 | " learning_rate=0.4,\n", 136 | " n_estimators=10000000,\n", 137 | " subsample=0.8,\n", 138 | " feature_fraction=0.8,\n", 139 | " reg_alpha=0.5,\n", 140 | " reg_lambda=0.5,\n", 141 | " random_state=seed,\n", 142 | " metric='auc'\n", 143 | " )\n", 144 | "\n", 145 | "# model = lgb.LGBMClassifier(\n", 146 | "# learning_rate=0.01,\n", 147 | "# n_estimators=10000000,\n", 148 | "# num_leaves=255,\n", 149 | "# subsample=0.9,\n", 150 | "# colsample_bytree=0.8,\n", 151 | "# random_state=seed,\n", 152 | "# metric='auc'\n", 153 | "# )\n", 154 | "\n", 155 | "oof = []\n", 156 | "prediction = df_test[['id']]\n", 157 | "prediction['target'] = 0\n", 158 | "df_importance_list = []\n", 159 | "\n", 160 | "kfold = StratifiedKFold(n_splits=n_fold, shuffle=False, random_state=seed)\n", 161 | "for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train['day'])):\n", 162 | " X_train = df_train.iloc[trn_idx][feature_names]\n", 163 | " Y_train = df_train.iloc[trn_idx][ycol]\n", 164 | "\n", 165 | " X_val = df_train.iloc[val_idx][feature_names]\n", 166 | " Y_val = df_train.iloc[val_idx][ycol]\n", 167 | "\n", 168 | " print('\\nFold_{} Training ================================\\n'.format(fold_id+1))\n", 169 | "\n", 170 | " lgb_model = model.fit(X_train,\n", 171 | " Y_train,\n", 172 | " eval_names=['train', 'valid'],\n", 173 | " eval_set=[(X_train, Y_train), (X_val, 
Y_val)],\n", 174 | " verbose=100,\n", 175 | " eval_metric='auc',\n", 176 | " early_stopping_rounds=50)\n", 177 | "\n", 178 | " pred_val = lgb_model.predict_proba(\n", 179 | " X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n", 180 | " df_oof = df_train.iloc[val_idx][['id', ycol]].copy()\n", 181 | " df_oof['pred'] = pred_val\n", 182 | " oof.append(df_oof)\n", 183 | "\n", 184 | " pred_test = lgb_model.predict_proba(\n", 185 | " df_test[feature_names], num_iteration=lgb_model.best_iteration_)[:, 1]\n", 186 | " prediction['target'] += pred_test / n_fold\n", 187 | "\n", 188 | " df_importance = pd.DataFrame({\n", 189 | " 'column': feature_names,\n", 190 | " 'importance': lgb_model.feature_importances_,\n", 191 | " })\n", 192 | " df_importance_list.append(df_importance)\n", 193 | "\n", 194 | " del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val\n", 195 | " gc.collect()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2020-02-01T13:43:58.129389Z", 204 | "start_time": "2020-02-01T13:43:58.097914Z" 205 | } 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "df_importance = pd.concat(df_importance_list)\n", 210 | "df_importance = df_importance.groupby(['column'])['importance'].agg(\n", 211 | " 'mean').sort_values(ascending=False).reset_index()\n", 212 | "df_importance" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "ExecuteTime": { 220 | "end_time": "2020-02-01T13:44:17.576713Z", 221 | "start_time": "2020-02-01T13:43:58.132193Z" 222 | } 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "df_oof = pd.concat(oof)\n", 227 | "df_oof['pred_bin'] = df_oof['pred'].rank()\n", 228 | "df_oof['pred_bin'] = (df_oof['pred_bin'] >= df_oof.shape[0]\n", 229 | " * 0.8934642948637943).astype(int)\n", 230 | "\n", 231 | "auc = roc_auc_score(df_oof['target'], df_oof['pred_bin'])\n", 232 | "f1 = f1_score(df_oof['target'], df_oof['pred_bin'])\n", 233 | "\n", 234 | "print('f1:', f1)\n", 235 | "print('auc:', auc)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "ExecuteTime": { 243 | "end_time": "2020-02-01T13:44:26.479783Z", 244 | "start_time": "2020-02-01T13:44:17.578394Z" 245 | } 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "sub = prediction.copy(deep=True)\n", 250 | "sub['target'] = sub['target'].rank()\n", 251 | "sub['target'] = (sub['target'] >= sub.shape[0] *\n", 252 | " 0.8934642948637943).astype(int)\n", 253 | "sub.to_csv(os.path.join(current_path, 'sub', '{}.csv'.format(f1)),\n", 254 | " index=False, encoding='utf-8')" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": { 261 | "ExecuteTime": { 262 | "end_time": "2020-02-01T13:44:33.645096Z", 263 | "start_time": "2020-02-01T13:44:32.889239Z" 264 | } 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "oof_train = df_oof[['id', 'pred']]\n", 269 | "oof_test = prediction[['id', 'target']]\n", 270 | "\n", 271 | "oof_train.columns = ['id', 'oof_prob']\n", 272 | "oof_test.columns = ['id', 'oof_prob']\n", 273 | "\n", 274 | "oof = pd.concat([oof_train, oof_test], sort=False)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "ExecuteTime": { 282 | "end_time": "2020-02-01T15:03:28.700849Z", 283 | "start_time": "2020-02-01T15:03:24.089293Z" 284 | } 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | 
"oof.to_pickle(os.path.join(current_path, 'prob', 'oof_lgb_qian.pickle'))" 289 | ] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "Python [conda env:dm] *", 295 | "language": "python", 296 | "name": "conda-env-dm-py" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.6.9" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 2 313 | } 314 | -------------------------------------------------------------------------------- /3.offline_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-04T09:26:16.527236Z", 9 | "start_time": "2020-02-04T09:26:15.261543Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score, roc_auc_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-04T09:26:16.531438Z", 37 | "start_time": "2020-02-04T09:26:16.528868Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "ExecuteTime": { 51 | "end_time": "2020-02-04T09:27:27.235669Z", 52 | "start_time": "2020-02-04T09:26:16.532526Z" 53 | }, 54 | "scrolled": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "df_feature = pd.read_pickle(os.path.join(\n", 59 | " current_path, 'feature', 'feature.pickle'))\n", 60 | "df_feature['id'] = df_feature['id'].astype('str')" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2020-02-04T09:28:00.169320Z", 69 | "start_time": "2020-02-04T09:27:27.236924Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "df_oof_lgb = pd.read_pickle(os.path.join(\n", 75 | " current_path, 'prob', 'oof_lgb_qian.pickle'))\n", 76 | "df_oof_lgb.columns = ['id', 'lgb_oof_prob']\n", 77 | "df_oof_lgb['id'] = df_oof_lgb['id'].astype('str')\n", 78 | "df_feature = df_feature.merge(df_oof_lgb, how='left', on='id')\n", 79 | "print(df_feature['lgb_oof_prob'].isnull().sum())" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2020-02-04T09:28:33.602746Z", 88 | "start_time": "2020-02-04T09:28:33.481456Z" 89 | }, 90 | "scrolled": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "df_feature.head()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "ExecuteTime": { 102 | "end_time": "2020-02-04T09:30:25.513293Z", 103 | "start_time": 
"2020-02-04T09:28:33.604825Z" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 109 | " if f not in ['id']:\n", 110 | " le = LabelEncoder()\n", 111 | " df_feature[f] = le.fit_transform(\n", 112 | " df_feature[f].astype('str')).astype('int')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "ExecuteTime": { 120 | "end_time": "2020-02-04T09:30:33.720366Z", 121 | "start_time": "2020-02-04T09:30:25.514531Z" 122 | } 123 | }, 124 | "outputs": [], 125 | "source": [ 126 | "df_val = df_feature[(df_feature['target'].notnull())\n", 127 | " & (df_feature['day'] == 10)]\n", 128 | "df_train = df_feature[df_feature['day'] < 10]" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "ExecuteTime": { 136 | "end_time": "2020-02-04T09:33:40.807346Z", 137 | "start_time": "2020-02-04T09:30:33.721845Z" 138 | }, 139 | "scrolled": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "gc.collect()\n", 144 | "\n", 145 | "ycol = 'target'\n", 146 | "feature_names = list(\n", 147 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 148 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 149 | " 'hourl', 'group'],\n", 150 | " df_train.columns))\n", 151 | "\n", 152 | "\n", 153 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 154 | " max_depth=10,\n", 155 | " learning_rate=0.4,\n", 156 | " n_estimators=100000,\n", 157 | " subsample=0.8,\n", 158 | " feature_fraction=0.8,\n", 159 | " reg_alpha=0.5,\n", 160 | " reg_lambda=0.5,\n", 161 | " random_state=seed,\n", 162 | " metric='auc'\n", 163 | " )\n", 164 | "\n", 165 | "loss = 0\n", 166 | "df_importance_list = []\n", 167 | "oof_list = []\n", 168 | "\n", 169 | "X_train = df_train[feature_names]\n", 170 | "Y_train = df_train[ycol]\n", 171 | "\n", 172 | "X_val = df_val[feature_names]\n", 173 | "Y_val = df_val[ycol]\n", 174 | "\n", 175 | "lgb_model = model.fit(X_train,\n", 176 | " Y_train,\n", 177 | " eval_names=['train', 'valid'],\n", 178 | " eval_set=[(X_train, Y_train), (X_val, Y_val)],\n", 179 | " verbose=50,\n", 180 | " eval_metric='auc',\n", 181 | " early_stopping_rounds=50)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "ExecuteTime": { 189 | "end_time": "2020-02-04T09:33:40.865336Z", 190 | "start_time": "2020-02-04T09:33:40.830776Z" 191 | } 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "auc = lgb_model.best_score_['valid']['auc']\n", 196 | "print(auc)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "ExecuteTime": { 204 | "end_time": "2020-02-04T09:33:41.034844Z", 205 | "start_time": "2020-02-04T09:33:40.868212Z" 206 | } 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "df_importance = pd.DataFrame({\n", 211 | " 'feature': feature_names,\n", 212 | " 'importance': lgb_model.feature_importances_,\n", 213 | "})\n", 214 | "df_importance = df_importance.sort_values(by='importance', ascending=False)\n", 215 | "df_importance" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "ExecuteTime": { 223 | "end_time": "2020-02-04T09:33:53.213172Z", 224 | "start_time": "2020-02-04T09:33:41.036538Z" 225 | } 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "val_pred = lgb_model.predict_proba(\n", 230 | " 
X_val, num_iteration=lgb_model.best_iteration_)[:, 1]\n", 231 | "df_oof = pd.DataFrame()\n", 232 | "df_oof['lgb_pred'] = val_pred\n", 233 | "df_oof['target'] = Y_val.values\n", 234 | "df_oof['pred_label'] = df_oof['lgb_pred'].rank()\n", 235 | "df_oof['pred_label'] = (df_oof['pred_label'] >=\n", 236 | " df_oof.shape[0] * 0.8934642948637943).astype(int)\n", 237 | "f1 = f1_score(df_oof['target'], df_oof['pred_label'])\n", 238 | "print('f1:', f1)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "ExecuteTime": { 246 | "end_time": "2020-02-04T09:33:53.239449Z", 247 | "start_time": "2020-02-04T09:33:53.214474Z" 248 | } 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "best_iteration = lgb_model.best_iteration_\n", 253 | "\n", 254 | "with open(os.path.join(current_path, 'best_it.txt'), 'w') as f:\n", 255 | " f.write(str(best_iteration)+'\\n')\n", 256 | " f.write(str(f1))" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python [conda env:dm] *", 263 | "language": "python", 264 | "name": "conda-env-dm-py" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.6.9" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 2 281 | } 282 | -------------------------------------------------------------------------------- /4.online_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2020-02-11T05:39:50.155898Z", 9 | "start_time": "2020-02-11T05:39:49.510099Z" 10 | } 11 | }, 12 | "outputs": [], 13 | "source": [ 14 | "import pandas as pd\n", 15 | "import numpy as np\n", 16 | "import os\n", 17 | "from sklearn.preprocessing import LabelEncoder\n", 18 | "from tqdm import tqdm\n", 19 | "import lightgbm as lgb\n", 20 | "from sklearn.model_selection import KFold, StratifiedKFold\n", 21 | "import warnings\n", 22 | "from sklearn.metrics import f1_score\n", 23 | "import catboost as cbt\n", 24 | "import gc\n", 25 | "\n", 26 | "pd.set_option('display.max_columns', None)\n", 27 | "pd.set_option('display.max_rows', None)\n", 28 | "warnings.filterwarnings('ignore')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2020-02-11T05:39:50.159779Z", 37 | "start_time": "2020-02-11T05:39:50.157281Z" 38 | } 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "current_path = './'\n", 43 | "seed = 2019" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "ExecuteTime": { 51 | "end_time": "2020-02-11T05:41:08.245550Z", 52 | "start_time": "2020-02-11T05:39:50.225930Z" 53 | } 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "df_feature = pd.read_pickle(os.path.join(\n", 58 | " current_path, 'feature', 'feature_1.pickle'))\n", 59 | "df_feature['id'] = df_feature['id'].astype('str')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "ExecuteTime": { 67 | "end_time": "2020-02-11T05:41:08.284415Z", 68 | "start_time": "2020-02-11T05:41:08.268086Z" 69 | } 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "df_feature.shape" 74 | ] 
75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2020-02-04T01:26:19.495464Z", 82 | "start_time": "2020-02-04T01:25:47.948238Z" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "df_oof_lgb = pd.read_pickle(os.path.join(\n", 88 | " current_path, 'prob', 'oof_lgb_qian.pickle'))\n", 89 | "df_oof_lgb.columns = ['id', 'lgb_oof_prob']\n", 90 | "df_feature = df_feature.merge(df_oof_lgb, how='left', on='id')\n", 91 | "print(df_feature['lgb_oof_prob'].isnull().sum())" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "ExecuteTime": { 99 | "end_time": "2020-02-04T01:26:51.489261Z", 100 | "start_time": "2020-02-04T01:26:19.511612Z" 101 | } 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "df_oof_cat = pd.read_pickle(os.path.join(\n", 106 | " current_path, 'prob', 'oof_cat.pickle'))\n", 107 | "df_oof_cat.columns = ['id', 'cat_oof_prob']\n", 108 | "df_feature = df_feature.merge(df_oof_cat, how='left', on='id')\n", 109 | "print(df_feature['cat_oof_prob'].isnull().sum())" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "ExecuteTime": { 117 | "end_time": "2020-02-04T01:26:51.662981Z", 118 | "start_time": "2020-02-04T01:26:51.490569Z" 119 | } 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "df_feature.head()" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "ExecuteTime": { 131 | "end_time": "2020-02-04T01:26:51.692673Z", 132 | "start_time": "2020-02-04T01:26:51.664289Z" 133 | } 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "with open(os.path.join(current_path, 'best_it.txt'), 'r') as f:\n", 138 | " lines = f.readlines()\n", 139 | " best_iteration = lines[0]\n", 140 | " f1 = lines[1]\n", 141 | "\n", 142 | "best_iteration = int(best_iteration)\n", 143 | "f1 = float(f1)\n", 144 | "print(best_iteration, f1)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2020-02-04T01:26:51.699332Z", 153 | "start_time": "2020-02-04T01:26:51.695196Z" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "bt = int(best_iteration * 1)\n", 159 | "print(bt)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": { 166 | "ExecuteTime": { 167 | "end_time": "2020-02-04T01:28:44.322415Z", 168 | "start_time": "2020-02-04T01:26:51.701108Z" 169 | } 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "for f in tqdm(list(df_feature.select_dtypes('object'))):\n", 174 | " if f not in ['id']:\n", 175 | " le = LabelEncoder()\n", 176 | " df_feature[f] = le.fit_transform(\n", 177 | " df_feature[f].astype('str')).astype('int')" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "ExecuteTime": { 185 | "end_time": "2020-02-04T01:29:16.236420Z", 186 | "start_time": "2020-02-04T01:29:16.221939Z" 187 | } 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "df_test = df_feature[df_feature['target'].isnull()]\n", 192 | "df_train = df_feature[df_feature['target'].notnull()]\n", 193 | "\n", 194 | "del df_feature, df_oof_lgb, df_oof_cat\n", 195 | "gc.collect()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "ExecuteTime": { 203 | "end_time": "2020-02-04T01:31:34.791258Z", 204 | "start_time": 
"2020-02-04T01:29:20.948750Z" 205 | }, 206 | "scrolled": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "ycol = 'target'\n", 211 | "feature_names = list(\n", 212 | " filter(lambda x: x not in [ycol, 'timestamp', 'ts', 'id', 'day', 'hour', 'minute', 'ts_datetime', 'minute10',\n", 213 | " 'personidentification', 'level', 'followscore', 'personalscore', 'gender',\n", 214 | " 'hourl', 'group'],\n", 215 | " df_train.columns))\n", 216 | "\n", 217 | "X_train = df_train[feature_names]\n", 218 | "Y_train = df_train[ycol]\n", 219 | "\n", 220 | "model = lgb.LGBMClassifier(num_leaves=64,\n", 221 | " max_depth=10,\n", 222 | " learning_rate=0.4,\n", 223 | " n_estimators=bt,\n", 224 | " subsample=0.8,\n", 225 | " feature_fraction=0.8,\n", 226 | " reg_alpha=0.5,\n", 227 | " reg_lambda=0.5,\n", 228 | " random_state=seed,\n", 229 | " metric='auc',\n", 230 | " )\n", 231 | "\n", 232 | "lgb_model2 = model.fit(X_train,\n", 233 | " Y_train,\n", 234 | " eval_names=['train', 'valid'],\n", 235 | " eval_set=[(X_train, Y_train)],\n", 236 | " verbose=50,\n", 237 | " eval_metric='auc')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "ExecuteTime": { 245 | "end_time": "2020-02-04T01:31:35.054266Z", 246 | "start_time": "2020-02-04T01:31:34.794101Z" 247 | } 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "df_importance = pd.DataFrame({\n", 252 | " 'feature': feature_names,\n", 253 | " 'importance': lgb_model2.feature_importances_,\n", 254 | "})\n", 255 | "\n", 256 | "df_importance = df_importance.sort_values(by='importance', ascending=False)\n", 257 | "df_importance" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "ExecuteTime": { 265 | "end_time": "2020-02-04T01:31:52.563920Z", 266 | "start_time": "2020-02-04T01:31:35.057758Z" 267 | } 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "test_pred = lgb_model2.predict_proba(\n", 272 | " df_test[feature_names], num_iteration=bt)[:, 1]\n", 273 | "prediction = df_test[['id']]\n", 274 | "prediction['target'] = test_pred\n", 275 | "np.save(os.path.join(current_path, 'prob',\n", 276 | " 'sub_{}.npy'.format(f1)), prediction.values)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": { 283 | "ExecuteTime": { 284 | "end_time": "2020-02-04T01:31:58.920512Z", 285 | "start_time": "2020-02-04T01:31:52.565295Z" 286 | } 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "sub = prediction.copy(deep=True)\n", 291 | "sub['target'] = sub['target'].rank()\n", 292 | "sub['target'] = (sub['target'] >= sub.shape[0] *\n", 293 | " 0.8934642948637943).astype(int)\n", 294 | "sub.to_csv(os.path.join(current_path, 'sub', '{}.csv'.format(f1)),\n", 295 | " index=False, encoding='utf-8')" 296 | ] 297 | } 298 | ], 299 | "metadata": { 300 | "kernelspec": { 301 | "display_name": "Python [conda env:dm] *", 302 | "language": "python", 303 | "name": "conda-env-dm-py" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.6.9" 316 | } 317 | }, 318 | "nbformat": 4, 319 | "nbformat_minor": 2 320 | } 321 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 
tuling-video-click-top3
2 | Third place on the online leaderboard of the TuringTopia video click prediction competition
3 | 
4 | # 2020-TURING-TOPIA-Video-Click-SINGLE-LightGBM-top3
5 | ===============================================================================================================
6 | **TuringTopia video click prediction competition, third place online (single LightGBM model)**
7 | ## Host: TuringTopia (图灵联邦)
8 | ## Track: 2020 Video Click Prediction Competition
9 | 
10 | **Competition link**: https://www.turingtopia.com/competitionnew/detail/e4880352b6ef4f9f8f28e8f98498dbc4/sketch
11 | **Timeline**: *2019.11.11-2020.03.09*
12 | **Team members**: [第一次打比赛](https://github.com/LogicJake), [郑](https://github.com/jackhuntcn), [小兔子乖乖](https://github.com/PandasCute), [Freak](https://github.com/BovenPeng/), [luweihai](https://github.com/luweihai)
13 | **Solution write-up**: [document link](https://www.logicjake.xyz/2020/02/10/%E5%9B%BE%E7%81%B5%E8%81%94%E9%82%A6%E8%A7%86%E9%A2%91%E7%82%B9%E5%87%BB%E9%A2%84%E6%B5%8B%E5%A4%A7%E8%B5%9B-%E8%B5%9B%E5%90%8E%E6%80%BB%E7%BB%93/)
14 | **Baidu Netdisk download**: to guard against data loss, a mirror of the dataset is available at https://pan.baidu.com/s/1YPtg4QyiAdhRAMoxjis_Gw (password: 0a3r)
15 | ## 1. Data Description
16 | **train.csv**
17 | 
18 | | Field | Name | Data type | Description |
19 | |:-------:|:-------:|:-------:|:-------:|
20 | |id| Record ID| VARCHAR2(50)| Index of the record in the dataset, from 1 to 11376681|
21 | |target| Clicked| VARCHAR2(50)| Whether the user clicked the video: 1 = clicked, 0 = not clicked.|
22 | |timestamp| Click timestamp| VARCHAR2(50)| Timestamp at which the user clicked the video; NULL if not clicked.|
23 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
24 | |newsid| Video ID| VARCHAR2(50)| The video's id.|
25 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
26 | |pos| Recommendation position| VARCHAR2(50)| Position at which the video was recommended|
27 | |app_version| App version| VARCHAR2(50)| App version.|
28 | |device_vendor| Device vendor| VARCHAR2(50)| Device vendor|
29 | |netmodel| Network type| VARCHAR2(50)| Network type.|
30 | |osversion| OS version| VARCHAR2(50)| Operating system version.|
31 | |lng| Longitude| VARCHAR2(50)| Longitude.|
32 | |lat| Latitude| VARCHAR2(50)| Latitude.|
33 | |device_version| Device version| VARCHAR2(50)| Device version.|
34 | |ts| Exposure timestamp| VARCHAR2(50)| Timestamp at which the video was exposed (shown) to the user.|
35 | 
36 | **test.csv**
37 | 
38 | | Field | Name | Data type | Description |
39 | |:-------:|:-------:|:-------:|:-------:|
40 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
41 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
42 | |newsid| Video ID| VARCHAR2(50)| The video's id.|
43 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
44 | |pos| Recommendation position| VARCHAR2(50)| Position at which the video was recommended|
45 | |app_version| App version| VARCHAR2(50)| App version.|
46 | |device_vendor| Device vendor| VARCHAR2(50)| Device vendor|
47 | |netmodel| Network type| VARCHAR2(50)| Network type.|
48 | |osversion| OS version| VARCHAR2(50)| Operating system version.|
49 | |lng| Longitude| VARCHAR2(50)| Longitude.|
50 | |lat| Latitude| VARCHAR2(50)| Latitude.|
51 | |device_version| Device version| VARCHAR2(50)| Device version.|
52 | |ts| Exposure timestamp| VARCHAR2(50)| Timestamp at which the video was exposed (shown) to the user.|
53 | 
54 | **app.csv**
55 | 
56 | | Field | Name | Data type | Description |
57 | |:-------:|:-------:|:-------:|:-------:|
58 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
59 | |**deviceid**| Device ID| VARCHAR2(50)| The user's device id|
60 | |applist| App list| VARCHAR2(50)| The apps the user has; app names have been anonymized to the form app_1, app_2, ...|
61 | 
62 | **user.csv**
63 | 
64 | | Field | Name | Data type | Description |
65 | |:-------:|:-------:|:-------:|:-------:|
66 | |id| Record ID| VARCHAR2(50)| test_1 to test_3653592|
67 | |deviceid| Device ID| VARCHAR2(50)| The user's device id|
68 | |guid| Registration ID| VARCHAR2(50)| The user's registration id.|
69 | |outertag| User profile tags| VARCHAR2(50)| Profile tags separated by vertical bars; the number after each colon indicates how well the tag fits the user (higher means a better fit).|
70 | |tag| User profile tags| VARCHAR2(50)| Same format as outertag|
71 | |level| User level| VARCHAR2(50)| User level.|
72 | |personidentification| User quality flag| VARCHAR2(50)| 1 = low-quality user, 0 = normal user.|
73 | |followscore| Apprentice score| VARCHAR2(50)| Apprentice (friend) score.|
74 | |personalscore| Personal score| VARCHAR2(50)| Personal score.|
75 | |gender| Gender| VARCHAR2(50)| Gender|
76 | 
77 | ## 2. Environment and Dependencies
78 | - python3
79 | - scikit-learn
80 | - gensim
81 | - Ubuntu
82 | - LightGBM
83 | - notebook
84 | ## 3. How to Run
85 | Run the notebooks in numerical order,
86 | 1 through 4 (a scripted run is sketched below):
87 | > 1 feature.ipynb: feature engineering
88 | > 2 fold_model.ipynb: k-fold models (produce the out-of-fold probabilities)
89 | > 3 offline_model.ipynb: offline (validation) model
90 | > 4 online_model.ipynb: online (submission) model
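91 | 
92 | For convenience, a minimal sketch of a scripted run (assumes `jupyter` with `nbconvert` is installed and the raw data sits in `raw_data/`; adapt paths as needed):
93 | 
94 | ```python
95 | # Sketch: execute the four notebooks in order, saving results in place.
96 | import subprocess
97 | 
98 | notebooks = ['1.feature.ipynb', '2.fold_model.ipynb',
99 |              '3.offline_model.ipynb', '4.online_model.ipynb']
100 | for nb in notebooks:
101 |     subprocess.run(['jupyter', 'nbconvert', '--to', 'notebook',
102 |                     '--execute', '--inplace', nb], check=True)
103 | ```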
104 | 
105 | ## 4. Feature Engineering
106 | - **Raw features**
107 | - **"Time-travel" (future-information) features**
108 | - **Statistical features**
109 | - **Embedding features**
110 | ## 5. Model Training
111 | Single LightGBM model; final preliminary-round leaderboard: 0.83695, third place online.
112 | 
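113 | The submission metric is F1, so both the offline and online notebooks convert predicted probabilities into hard labels by rank: scores at or above the 0.8934642948637943 quantile of the ranking are marked positive. Below is a minimal, self-contained sketch of that conversion on hypothetical toy scores (only the quantile value comes from the notebooks; everything else is illustrative):
114 | 
115 | ```python
116 | # Sketch: rank-threshold conversion from probabilities to 0/1 labels.
117 | # The top (1 - quantile) share of scores is labelled positive.
118 | import numpy as np
119 | import pandas as pd
120 | 
121 | quantile = 0.8934642948637943            # positive-rate cut used in the notebooks
122 | scores = pd.Series(np.random.rand(100))  # hypothetical predicted probabilities
123 | 
124 | ranks = scores.rank()                    # ranks 1..n, ties averaged
125 | labels = (ranks >= len(scores) * quantile).astype(int)
126 | print(labels.sum(), 'positives out of', len(scores))
127 | ```
128 | 

--------------------------------------------------------------------------------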