├── README.md ├── chizhu_nurbs ├── .ipynb_checkpoints │ ├── LGB特征提取-checkpoint.ipynb │ └── NN模型-V1-checkpoint.ipynb ├── LGB模型.ipynb ├── LGB特征提取.ipynb ├── NN模型-V1.ipynb ├── NN模型-V2.ipynb ├── NN模型-V3.ipynb ├── 提取特征.ipynb └── 生成序列数据.ipynb ├── snake ├── 1. Data Prepare.ipynb ├── 10. xDeepFM Multi Value Model.ipynb ├── 11. LSTM-Attention Model.ipynb ├── 2. Word Vector & Sparse Matrix.ipynb ├── 3. Graph Feature.ipynb ├── 4. Flatten Stat Feature.ipynb ├── 5. W2V Feature.ipynb ├── 6. TFIDF-COUNT Feature.ipynb ├── 7. Meta Active Train (Strong).ipynb ├── 8. Meta Train Usage (Strong).ipynb └── 9. LGB COUNT & TFIDF Model.ipynb └── 开源方案.pdf /README.md: -------------------------------------------------------------------------------- 1 | # HUAWEI-DIGIX-AgeGroup 2 | 2019 HUAWEI DIGIX Nurbs Solutions 3 | 4 | #### Notes: 5 | ###### RAdam,AdamW,Lookahead,CycLearn在/snake/11. lstm-atten中 6 | ###### CTR模型的选择,只需要更换调用的模型,如FibiNet,FGCNN,xDeepFM,调用的是 deepctr 库 7 | ###### Meta Train在/snake/7和8中,参考的是Home Credit 17名和Elo Senkin大佬的Solutions 8 | ###### 原生的adam与Lookahead配合较好,效果比较为adam 0:\n", 182 | " lr *= (1. / (1. + self.decay * K.cast(self.iterations,\n", 183 | " K.dtype(self.decay))))\n", 184 | "\n", 185 | " t = K.cast(self.iterations, K.floatx()) + 1\n", 186 | " lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /\n", 187 | " (1. - K.pow(self.beta_1, t)))\n", 188 | "\n", 189 | " ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]\n", 190 | " vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]\n", 191 | " self.weights = [self.iterations] + ms + vs\n", 192 | "\n", 193 | " for p, g, m, v in zip(params, grads, ms, vs):\n", 194 | " m_t = (self.beta_1 * m) + (1. - self.beta_1) * g\n", 195 | " v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)\n", 196 | " # decoupled weight decay (4/4)\n", 197 | " p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon) - lr * wd * p\n", 198 | "\n", 199 | " self.updates.append(K.update(m, m_t))\n", 200 | " self.updates.append(K.update(v, v_t))\n", 201 | " new_p = p_t\n", 202 | "\n", 203 | " # Apply constraints.\n", 204 | " if getattr(p, 'constraint', None) is not None:\n", 205 | " new_p = p.constraint(new_p)\n", 206 | "\n", 207 | " self.updates.append(K.update(p, new_p))\n", 208 | " return self.updates\n", 209 | "\n", 210 | " def get_config(self):\n", 211 | " config = {'lr': float(K.get_value(self.lr)),\n", 212 | " 'beta_1': float(K.get_value(self.beta_1)),\n", 213 | " 'beta_2': float(K.get_value(self.beta_2)),\n", 214 | " 'decay': float(K.get_value(self.decay)),\n", 215 | " 'weight_decay': float(K.get_value(self.wd)),\n", 216 | " 'epsilon': self.epsilon}\n", 217 | " base_config = super(AdamW, self).get_config()\n", 218 | " return dict(list(base_config.items()) + list(config.items()))\n", 219 | "\n", 220 | "\n", 221 | "from keras.engine.topology import Layer\n", 222 | "class Attention(Layer):\n", 223 | " def __init__(self, step_dim,\n", 224 | " W_regularizer=None, b_regularizer=None,\n", 225 | " W_constraint=None, b_constraint=None,\n", 226 | " bias=True, **kwargs):\n", 227 | " self.supports_masking = True\n", 228 | " self.init = initializers.get('glorot_uniform')\n", 229 | "\n", 230 | " self.W_regularizer = regularizers.get(W_regularizer)\n", 231 | " self.b_regularizer = regularizers.get(b_regularizer)\n", 232 | "\n", 233 | " self.W_constraint = constraints.get(W_constraint)\n", 234 | " self.b_constraint = constraints.get(b_constraint)\n", 235 | "\n", 236 | " self.bias = bias\n", 237 | " self.step_dim = step_dim\n", 238 | " self.features_dim = 0\n", 239 | " 
super(Attention, self).__init__(**kwargs)\n", 240 | "\n", 241 | " def build(self, input_shape):\n", 242 | " assert len(input_shape) == 3\n", 243 | "\n", 244 | " self.W = self.add_weight((input_shape[-1],),\n", 245 | " initializer=self.init,\n", 246 | " name='{}_W'.format(self.name),\n", 247 | " regularizer=self.W_regularizer,\n", 248 | " constraint=self.W_constraint)\n", 249 | " self.features_dim = input_shape[-1]\n", 250 | "\n", 251 | " if self.bias:\n", 252 | " self.b = self.add_weight((input_shape[1],),\n", 253 | " initializer='zero',\n", 254 | " name='{}_b'.format(self.name),\n", 255 | " regularizer=self.b_regularizer,\n", 256 | " constraint=self.b_constraint)\n", 257 | " else:\n", 258 | " self.b = None\n", 259 | "\n", 260 | " self.built = True\n", 261 | "\n", 262 | " def compute_mask(self, input, input_mask=None):\n", 263 | " return None\n", 264 | "\n", 265 | " def call(self, x, mask=None):\n", 266 | " features_dim = self.features_dim\n", 267 | " step_dim = self.step_dim\n", 268 | "\n", 269 | " eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),\n", 270 | " K.reshape(self.W, (features_dim, 1))), (-1, step_dim))\n", 271 | "\n", 272 | " if self.bias:\n", 273 | " eij += self.b\n", 274 | "\n", 275 | " eij = K.tanh(eij)\n", 276 | "\n", 277 | " a = K.exp(eij)\n", 278 | "\n", 279 | " if mask is not None:\n", 280 | " a *= K.cast(mask, K.floatx())\n", 281 | "\n", 282 | " a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())\n", 283 | "\n", 284 | " a = K.expand_dims(a)\n", 285 | " weighted_input = x * a\n", 286 | " return K.sum(weighted_input, axis=1)\n", 287 | "\n", 288 | " def compute_output_shape(self, input_shape):\n", 289 | " return input_shape[0], self.features_dim" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "ExecuteTime": { 297 | "end_time": "2019-08-11T03:17:17.952533Z", 298 | "start_time": "2019-08-11T03:14:38.631052Z" 299 | }, 300 | "scrolled": true 301 | }, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "nan\n", 308 | "nan\n", 309 | "nan\n", 310 | "nan\n", 311 | "nan\n", 312 | "nan\n", 313 | "nan\n", 314 | "nan\n", 315 | "nan\n", 316 | "nan\n", 317 | "nan\n", 318 | "nan\n", 319 | "nan\n", 320 | "nan\n", 321 | "nan\n", 322 | "nan\n", 323 | "nan\n", 324 | "nan\n", 325 | "nan\n", 326 | "nan\n", 327 | "nan\n", 328 | "nan\n", 329 | "nan\n", 330 | "nan\n", 331 | "nan\n", 332 | "nan\n", 333 | "nan\n", 334 | "nan\n", 335 | "nan\n", 336 | "nan\n", 337 | "nan\n", 338 | "nan\n", 339 | "nan\n", 340 | "nan\n", 341 | "nan\n", 342 | "nan\n", 343 | "nan\n", 344 | "nan\n", 345 | "nan\n", 346 | "nan\n", 347 | "nan\n", 348 | "nan\n", 349 | "nan\n", 350 | "nan\n", 351 | "nan\n", 352 | "nan\n", 353 | "nan\n", 354 | "nan\n", 355 | "nan\n", 356 | "nan\n", 357 | "nan\n", 358 | "nan\n", 359 | "nan\n", 360 | "nan\n", 361 | "nan\n", 362 | "nan\n", 363 | "nan\n", 364 | "nan\n", 365 | "nan\n", 366 | "nan\n", 367 | "nan\n", 368 | "nan\n", 369 | "nan\n", 370 | "nan\n", 371 | "nan\n", 372 | "nan\n", 373 | "nan\n", 374 | "nan\n", 375 | "nan\n", 376 | "nan\n", 377 | "nan\n", 378 | "nan\n", 379 | "nan\n", 380 | "nan\n", 381 | "nan\n", 382 | "nan\n", 383 | "nan\n", 384 | "nan\n", 385 | "nan\n", 386 | "nan\n", 387 | "nan\n", 388 | "nan\n", 389 | "nan\n", 390 | "nan\n", 391 | "nan\n", 392 | "nan\n", 393 | "nan\n", 394 | "nan\n", 395 | "nan\n", 396 | "nan\n", 397 | "nan\n", 398 | "nan\n", 399 | "nan\n", 400 | "nan\n", 401 | "nan\n", 402 | "nan\n", 403 | "nan\n", 404 | "nan\n", 405 
| "nan\n", 406 | "nan\n", 407 | "nan\n", 408 | "nan\n", 409 | "nan\n", 410 | "nan\n", 411 | "nan\n", 412 | "nan\n", 413 | "nan\n", 414 | "nan\n", 415 | "nan\n", 416 | "nan\n", 417 | "nan\n", 418 | "nan\n", 419 | "nan\n", 420 | "nan\n", 421 | "nan\n", 422 | "nan\n", 423 | "nan\n", 424 | "nan\n", 425 | "nan\n", 426 | "nan\n", 427 | "nan\n", 428 | "nan\n", 429 | "nan\n", 430 | "nan\n", 431 | "nan\n", 432 | "nan\n", 433 | "nan\n", 434 | "nan\n", 435 | "nan\n", 436 | "nan\n", 437 | "nan\n", 438 | "nan\n", 439 | "nan\n", 440 | "nan\n", 441 | "nan\n", 442 | "nan\n", 443 | "nan\n", 444 | "nan\n", 445 | "nan\n", 446 | "nan\n", 447 | "nan\n", 448 | "nan\n", 449 | "nan\n", 450 | "nan\n", 451 | "nan\n", 452 | "nan\n", 453 | "nan\n", 454 | "nan\n", 455 | "nan\n", 456 | "nan\n", 457 | "nan\n", 458 | "nan\n", 459 | "nan\n", 460 | "nan\n", 461 | "nan\n", 462 | "nan\n", 463 | "nan\n", 464 | "nan\n", 465 | "nan\n", 466 | "nan\n", 467 | "nan\n", 468 | "nan\n", 469 | "nan\n", 470 | "nan\n", 471 | "nan\n", 472 | "nan\n", 473 | "nan\n", 474 | "nan\n", 475 | "nan\n", 476 | "nan\n", 477 | "nan\n", 478 | "nan\n", 479 | "nan\n", 480 | "nan\n", 481 | "nan\n", 482 | "nan\n", 483 | "nan\n", 484 | "nan\n", 485 | "nan\n", 486 | "nan\n", 487 | "nan\n", 488 | "nan\n", 489 | "nan\n", 490 | "nan\n", 491 | "nan\n", 492 | "nan\n", 493 | "nan\n", 494 | "nan\n", 495 | "nan\n", 496 | "nan\n", 497 | "nan\n", 498 | "nan\n", 499 | "nan\n", 500 | "nan\n", 501 | "nan\n", 502 | "nan\n", 503 | "nan\n", 504 | "nan\n", 505 | "nan\n", 506 | "nan\n", 507 | "nan\n", 508 | "nan\n", 509 | "nan\n", 510 | "nan\n", 511 | "nan\n", 512 | "nan\n", 513 | "nan\n", 514 | "nan\n", 515 | "nan\n", 516 | "nan\n", 517 | "nan\n", 518 | "nan\n", 519 | "nan\n", 520 | "nan\n", 521 | "nan\n", 522 | "nan\n", 523 | "nan\n", 524 | "nan\n", 525 | "nan\n", 526 | "nan\n", 527 | "nan\n", 528 | "nan\n", 529 | "nan\n", 530 | "nan\n", 531 | "nan\n", 532 | "nan\n", 533 | "nan\n", 534 | "nan\n", 535 | "nan\n", 536 | "nan\n", 537 | "nan\n", 538 | "nan\n", 539 | "nan\n", 540 | "nan\n", 541 | "nan\n", 542 | "nan\n", 543 | "nan\n", 544 | "nan\n", 545 | "nan\n", 546 | "nan\n", 547 | "nan\n", 548 | "nan\n", 549 | "nan\n", 550 | "nan\n", 551 | "nan\n", 552 | "nan\n", 553 | "nan\n", 554 | "nan\n", 555 | "nan\n", 556 | "nan\n", 557 | "nan\n", 558 | "nan\n", 559 | "nan\n", 560 | "nan\n", 561 | "nan\n", 562 | "nan\n" 563 | ] 564 | } 565 | ], 566 | "source": [ 567 | "### 读入数据(想要五输入)\n", 568 | "id_label = get_age_data()\n", 569 | "active_data = get_user_app_actived()\n", 570 | "data = pd.merge(id_label, active_data, on='uId', how='left')\n", 571 | "data_info = get_app_info()\n", 572 | "cat_dict = dict(zip(data_info['appId'], data_info['category']))\n", 573 | "def get_review_data(row):\n", 574 | " review = ''\n", 575 | " try:\n", 576 | " app_list = row['appId'].split('#')\n", 577 | " except Exception:\n", 578 | " app_list = ['0']\n", 579 | " print(row['appId'])\n", 580 | " for i in app_list:\n", 581 | " try:\n", 582 | " review += cat_dict[i] + ' '\n", 583 | " except Exception:\n", 584 | " review += '未知' + ' ' \n", 585 | " return review\n", 586 | "data['appInfo'] = data.apply(lambda row:get_review_data(row), axis=1)\n", 587 | "time_usage_data = get_time_usage()\n", 588 | "data = pd.merge(data, time_usage_data, on='uId', how='left')\n", 589 | "data = data.fillna('未知')\n", 590 | "data.head(5)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": { 597 | "ExecuteTime": { 598 | "end_time": "2019-08-11T03:17:17.993755Z", 599 | "start_time": 
"2019-08-11T03:17:17.954187Z" 600 | } 601 | }, 602 | "outputs": [], 603 | "source": [ 604 | "del id_label, active_data, data_info, cat_dict" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "ExecuteTime": { 612 | "end_time": "2019-08-11T03:17:18.005700Z", 613 | "start_time": "2019-08-11T03:17:17.995869Z" 614 | } 615 | }, 616 | "outputs": [], 617 | "source": [ 618 | "### Tokenizer 序列化文本\n", 619 | "def set_tokenizer(docs, split_char=' ', max_len=100):\n", 620 | " '''\n", 621 | " 输入\n", 622 | " docs:文本列表\n", 623 | " split_char:按什么字符切割\n", 624 | " max_len:截取的最大长度\n", 625 | " \n", 626 | " 输出\n", 627 | " X:序列化后的数据\n", 628 | " word_index:文本和数字对应的索引\n", 629 | " '''\n", 630 | " tokenizer = Tokenizer(lower=False, char_level=False, split=split_char)\n", 631 | " tokenizer.fit_on_texts(docs)\n", 632 | " X = tokenizer.texts_to_sequences(docs)\n", 633 | " maxlen = max_len\n", 634 | " X = pad_sequences(X, maxlen=maxlen, value=0)\n", 635 | " word_index=tokenizer.word_index\n", 636 | " return X, word_index\n", 637 | "\n", 638 | "### 做embedding 这里采用word2vec 可以换成其他例如(glove词向量)\n", 639 | "def trian_save_word2vec(docs, embed_size=300, save_name='w2v.txt', split_char=' '):\n", 640 | " '''\n", 641 | " 输入\n", 642 | " docs:输入的文本列表\n", 643 | " embed_size:embed长度\n", 644 | " save_name:保存的word2vec位置\n", 645 | " \n", 646 | " 输出\n", 647 | " w2v:返回的模型\n", 648 | " '''\n", 649 | " input_docs = []\n", 650 | " for i in docs:\n", 651 | " input_docs.append(i.split(split_char))\n", 652 | " logging.basicConfig(\n", 653 | " format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)\n", 654 | " w2v = Word2Vec(input_docs, size=embed_size, sg=1, window=8, seed=1017, workers=24, min_count=1, iter=10)\n", 655 | " w2v.wv.save_word2vec_format(save_name)\n", 656 | " print(\"w2v model done\")\n", 657 | " return w2v\n", 658 | "\n", 659 | "# 得到embedding矩阵\n", 660 | "def get_embedding_matrix(word_index, embed_size=300, Emed_path=\"w2v_300.txt\"):\n", 661 | " embeddings_index = gensim.models.KeyedVectors.load_word2vec_format(\n", 662 | " Emed_path, binary=False)\n", 663 | " nb_words = len(word_index)+1\n", 664 | " embedding_matrix = np.zeros((nb_words, embed_size))\n", 665 | " count = 0\n", 666 | " for word, i in tqdm(word_index.items()):\n", 667 | " if i >= nb_words:\n", 668 | " continue\n", 669 | " try:\n", 670 | " embedding_vector = embeddings_index[word]\n", 671 | " except:\n", 672 | " embedding_vector = np.zeros(embed_size)\n", 673 | " count += 1\n", 674 | " if embedding_vector is not None:\n", 675 | " embedding_matrix[i] = embedding_vector \n", 676 | " print(\"null cnt\",count)\n", 677 | " return embedding_matrix" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "metadata": { 684 | "ExecuteTime": { 685 | "end_time": "2019-08-11T03:33:35.679359Z", 686 | "start_time": "2019-08-11T03:17:18.007484Z" 687 | } 688 | }, 689 | "outputs": [], 690 | "source": [ 691 | "# appId 90\n", 692 | "# timesUsage 600\n", 693 | "text_1_list = list(data['appId'])\n", 694 | "text_3_list = list(data['timesUsage'])\n", 695 | "\n", 696 | "del data['appId']\n", 697 | "del data['timesUsage']\n", 698 | "\n", 699 | "print('开始序列化')\n", 700 | "x1, index_1 = set_tokenizer(text_1_list, split_char='#', max_len=90)\n", 701 | "x3, index_3 = set_tokenizer(text_3_list, split_char=' ', max_len=600)\n", 702 | "print('序列化完成')\n", 703 | "gc.collect()\n", 704 | "\n", 705 | "np.save('tmp/x1.npy', x1)\n", 706 | "np.save('tmp/x3.npy', x3)\n", 707 | "np.save('tmp/x1.npy', 
index_1)\n", 708 | "np.save('tmp/x3.npy', index_3)\n", 709 | "\n", 710 | "# 值得提醒的是这个保存方法是采用w2v.wv.save_word2vec_format\n", 711 | "# 因此你如果载入自己训练模型的时候,需要载入后再按照这个函数来保存再在emed_path中输入\n", 712 | "trian_save_word2vec(text_1_list, save_name='w2v_model/cate_w2v_300.txt', split_char='#')\n", 713 | "gc.collect()\n", 714 | "trian_save_word2vec(text_3_list, save_name='w2v_model/w2v_300.txt', split_char=' ')\n", 715 | "gc.collect()\n", 716 | "\n", 717 | "# 得到emb矩阵\n", 718 | "emb1 = get_embedding_matrix(index_1, Emed_path='w2v_model/cate_w2v_300.txt')\n", 719 | "emb3 = get_embedding_matrix(index_3, Emed_path='w2v_model/w2v_300.txt')\n", 720 | "gc.collect()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "metadata": { 727 | "ExecuteTime": { 728 | "end_time": "2019-08-11T03:33:53.477232Z", 729 | "start_time": "2019-08-11T03:33:35.681116Z" 730 | } 731 | }, 732 | "outputs": [], 733 | "source": [ 734 | "# 将feature作为输入进行处理(这个feature一定要替换掉nan与inf)\n", 735 | "# 这个feature的顺序一定要对应之前的feature的顺序,保证每条是对应的\n", 736 | "f1 = pd.read_csv('feature/f1.csv')\n", 737 | "f2 = pd.read_csv('feature/f2.csv')\n", 738 | "f3 = pd.read_csv('feature/f3.csv')\n", 739 | "f4 = pd.read_csv('feature/f4.csv')\n", 740 | "f5 = pd.read_csv('feature/f5.csv')\n", 741 | "\n", 742 | "feature = pd.concat([f1, f2, f3, f4, f5], axis=1, sort=False)\n", 743 | "feature = feature.fillna(-1)\n", 744 | "from sklearn.preprocessing import StandardScaler\n", 745 | "ss=StandardScaler()\n", 746 | "ss.fit(feature)\n", 747 | "hin_feature = ss.transform(feature)\n", 748 | "num_feature_input = hin_feature.shape[1]" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "metadata": { 755 | "ExecuteTime": { 756 | "end_time": "2019-08-11T03:33:53.789323Z", 757 | "start_time": "2019-08-11T03:33:53.478841Z" 758 | } 759 | }, 760 | "outputs": [], 761 | "source": [ 762 | "# 区分开train和valid,test\n", 763 | "# 这里是假设三输入\n", 764 | "train_data = data[data['age_group']!=-1]\n", 765 | "train_input_1 = x1[:len(train_data)]\n", 766 | "test_input_1 = x1[len(train_data):]\n", 767 | "train_input_3 = x3[:len(train_data)]\n", 768 | "test_input_3 = x3[len(train_data):]\n", 769 | "train_input_5 = hin_feature[:len(train_data)]\n", 770 | "test_input_5 = hin_feature[len(train_data):]\n", 771 | "label = to_categorical(train_data['age_group'] - 1)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "ExecuteTime": { 779 | "end_time": "2019-08-11T03:33:54.070279Z", 780 | "start_time": "2019-08-11T03:33:53.790972Z" 781 | } 782 | }, 783 | "outputs": [], 784 | "source": [ 785 | "from keras.initializers import *\n", 786 | "\n", 787 | "def model_conv(emb1, emb3, num_feature_input):\n", 788 | " '''\n", 789 | " 注意这个inputs\n", 790 | " seq1、seq2分别是两个输入\n", 791 | " hin是feature层输入\n", 792 | " 是否做emb可选可不选,\n", 793 | " 这个就是我们之前训练已经得到的用于embedding的(embedding_matrix1, embedding_matrix2)\n", 794 | " '''\n", 795 | " K.clear_session()\n", 796 | "\n", 797 | " emb_layer_1 = Embedding(\n", 798 | " input_dim=emb1.shape[0],\n", 799 | " output_dim=emb1.shape[1],\n", 800 | " weights=[emb1],\n", 801 | " input_length=90,\n", 802 | " trainable=False\n", 803 | " )\n", 804 | " \n", 805 | " emb_layer_3 = Embedding(\n", 806 | " input_dim=emb3.shape[0],\n", 807 | " output_dim=emb3.shape[1],\n", 808 | " weights=[emb3],\n", 809 | " input_length=600,\n", 810 | " trainable=False\n", 811 | " )\n", 812 | " \n", 813 | " \n", 814 | " seq1 = Input(shape=(90,))\n", 815 | " seq3 = Input(shape=(600,)) \n", 816 
| " \n", 817 | " x1 = emb_layer_1(seq1)\n", 818 | " x3 = emb_layer_3(seq3)\n", 819 | " \n", 820 | " sdrop=SpatialDropout1D(rate=0.2)\n", 821 | "\n", 822 | " x1 = sdrop(x1)\n", 823 | " x3 = sdrop(x3)\n", 824 | " \n", 825 | " x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(x1))\n", 826 | " semantic = TimeDistributed(Dense(100, activation=\"tanh\"))(x)\n", 827 | " merged_1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)\n", 828 | " merged_1_avg = Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,))(semantic)\n", 829 | " \n", 830 | " x = Dropout(0.2)(Bidirectional(CuDNNLSTM(200, return_sequences=True))(x3))\n", 831 | " semantic = TimeDistributed(Dense(100, activation=\"tanh\"))(x)\n", 832 | " merged_3 = Lambda(lambda x: K.max(x, axis=1), output_shape=(100,))(semantic)\n", 833 | " merged_3_avg = Lambda(lambda x: K.mean(x, axis=1), output_shape=(100,))(semantic)\n", 834 | " \n", 835 | " hin = Input(shape=(num_feature_input, ))\n", 836 | " htime = Dense(16, activation='relu')(hin)\n", 837 | " \n", 838 | " x = concatenate([merged_1, merged_3, merged_1_avg, merged_3_avg, htime])\n", 839 | " \n", 840 | " x = Dropout(0.2)(Activation(activation=\"relu\")(BatchNormalization()(Dense(1000)(x))))\n", 841 | " x = Activation(activation=\"relu\")(BatchNormalization()(Dense(500)(x)))\n", 842 | " pred = Dense(6, activation='softmax')(x)\n", 843 | " model = Model(inputs=[seq1, seq3, hin], outputs=pred)\n", 844 | " from keras.utils import multi_gpu_model\n", 845 | " model = multi_gpu_model(model, 2)\n", 846 | " model.compile(loss='categorical_crossentropy',\n", 847 | " optimizer=AdamW(lr=0.001,weight_decay=0.08,),metrics=[\"accuracy\"])\n", 848 | " return model\n", 849 | "gc.collect()" 850 | ] 851 | }, 852 | { 853 | "cell_type": "code", 854 | "execution_count": null, 855 | "metadata": { 856 | "ExecuteTime": { 857 | "start_time": "2019-08-11T03:14:39.713Z" 858 | }, 859 | "scrolled": true 860 | }, 861 | "outputs": [], 862 | "source": [ 863 | "skf = StratifiedKFold(n_splits=5, random_state=1017, shuffle=True)\n", 864 | "sub = np.zeros((test_input_5.shape[0], 6))\n", 865 | "oof_pred = np.zeros((train_input_5.shape[0], 6))\n", 866 | "score = []\n", 867 | "count = 0\n", 868 | "if not os.path.exists(\"model\"):\n", 869 | " os.mkdir(\"model\")\n", 870 | "\n", 871 | "for i, (train_index, test_index) in enumerate(skf.split(train_input_5, train_data['age_group'])):\n", 872 | " print(\"FOLD | \", count+1)\n", 873 | " print(\"###\"*35)\n", 874 | " gc.collect()\n", 875 | " filepath = \"model/nn_v1_%d.h5\" % count\n", 876 | " checkpoint = ModelCheckpoint(\n", 877 | " filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)\n", 878 | " reduce_lr = ReduceLROnPlateau(\n", 879 | " monitor='val_acc', factor=0.5, patience=3, min_lr=0.0001, verbose=1)\n", 880 | " earlystopping = EarlyStopping(\n", 881 | " monitor='val_acc', min_delta=0.0001, patience=5, verbose=1, mode='max')\n", 882 | " callbacks = [checkpoint, reduce_lr, earlystopping]\n", 883 | " model_age = model_conv(emb1, emb3, num_feature_input)\n", 884 | " if count==0:model_age.summary()\n", 885 | " x1_tr, x1_va = np.array(train_input_1)[train_index], np.array(train_input_1)[test_index] \n", 886 | " x3_tr, x3_va = np.array(train_input_3)[train_index], np.array(train_input_3)[test_index]\n", 887 | " x5_tr, x5_va = np.array(train_input_5)[train_index], np.array(train_input_5)[test_index]\n", 888 | " y_tr, y_va = label[train_index], label[test_index]\n", 889 | " \n", 890 | " hist = 
model_age.fit([x1_tr, x3_tr, x5_tr],\n", 891 | " y_tr, batch_size=4096, epochs=50, \n", 892 | " validation_data=([x1_va, x3_va, x5_va], y_va),\n", 893 | " callbacks=callbacks, verbose=1, shuffle=True)\n", 894 | "\n", 895 | " model_age.load_weights(filepath)\n", 896 | " oof_pred[test_index] = model_age.predict([x1_va, x3_va, x5_va],batch_size=2048,verbose=1)\n", 897 | " sub += model_age.predict([test_input_1, test_input_3, test_input_5],batch_size=2048,verbose=1)/skf.n_splits\n", 898 | " score.append(np.max(hist.history['val_acc']))\n", 899 | " count += 1\n", 900 | "print('acc:', np.mean(score))" 901 | ] 902 | }, 903 | { 904 | "cell_type": "code", 905 | "execution_count": null, 906 | "metadata": { 907 | "ExecuteTime": { 908 | "start_time": "2019-08-11T03:14:40.012Z" 909 | } 910 | }, 911 | "outputs": [], 912 | "source": [ 913 | "test = data[data['age_group'] == -1]\n", 914 | "submit = test[['uId']]\n", 915 | "submit.columns = ['id']\n", 916 | "submit['label'] = sub.argmax(1)+1\n", 917 | "if not os.path.exists(\"result\"):\n", 918 | " os.mkdir(\"result\")\n", 919 | "submit.to_csv(\"./result/submission.csv\",index=False)" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": null, 925 | "metadata": { 926 | "ExecuteTime": { 927 | "start_time": "2019-08-11T03:14:40.347Z" 928 | } 929 | }, 930 | "outputs": [], 931 | "source": [ 932 | "oof = np.concatenate((oof_pred,sub))\n", 933 | "oof = pd.DataFrame(oof)\n", 934 | "oof.columns = [str(i+1) for i in range(6)]\n", 935 | "oof['id'] = pd.concat([train_data[['uId']],test[['uId']]])['uId'].values\n", 936 | "oof.to_csv(\"./result/v1_test.csv\",index=False)" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": { 943 | "ExecuteTime": { 944 | "start_time": "2019-08-11T03:14:40.612Z" 945 | } 946 | }, 947 | "outputs": [], 948 | "source": [ 949 | "!nvidia-smi" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": null, 955 | "metadata": {}, 956 | "outputs": [], 957 | "source": [] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": null, 962 | "metadata": {}, 963 | "outputs": [], 964 | "source": [] 965 | }, 966 | { 967 | "cell_type": "code", 968 | "execution_count": null, 969 | "metadata": {}, 970 | "outputs": [], 971 | "source": [] 972 | } 973 | ], 974 | "metadata": { 975 | "kernelspec": { 976 | "display_name": "Python 3", 977 | "language": "python", 978 | "name": "python3" 979 | }, 980 | "language_info": { 981 | "codemirror_mode": { 982 | "name": "ipython", 983 | "version": 3 984 | }, 985 | "file_extension": ".py", 986 | "mimetype": "text/x-python", 987 | "name": "python", 988 | "nbconvert_exporter": "python", 989 | "pygments_lexer": "ipython3", 990 | "version": "3.6.4" 991 | } 992 | }, 993 | "nbformat": 4, 994 | "nbformat_minor": 4 995 | } 996 | -------------------------------------------------------------------------------- /chizhu_nurbs/生成序列数据.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from tqdm import *\n", 13 | "from sklearn.decomposition import LatentDirichletAllocation\n", 14 | "from sklearn.metrics import accuracy_score\n", 15 | "import time\n", 16 | "from sklearn.feature_extraction.text import TfidfTransformer\n", 17 | "from sklearn.feature_extraction.text import 
CountVectorizer\n", 18 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 19 | "from scipy.sparse import hstack\n", 20 | "from sklearn.model_selection import StratifiedKFold\n", 21 | "from gensim.models import FastText, Word2Vec\n", 22 | "import re\n", 23 | "import random as rn\n", 24 | "import gc\n", 25 | "import logging\n", 26 | "tqdm.pandas()\n", 27 | "os.environ['PYTHONHASHSEED'] = '0'\n", 28 | "np.random.seed(1017)\n", 29 | "rn.seed(1017)\n", 30 | "path=\"data/\"\n", 31 | "os.listdir(\"data/\")" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# 读入数据(需加速)\n", 41 | "def get_age_data():\n", 42 | " train_data = pd.read_csv(path + 'age_train.csv', header=None)\n", 43 | " test_data = pd.read_csv(path + 'age_test.csv', header=None)\n", 44 | " data = pd.concat([train_data, test_data], axis=0, sort=False).fillna(-1)\n", 45 | " data.columns = ['uId', 'age_group']\n", 46 | " return data\n", 47 | "\n", 48 | "def get_user_app_actived():\n", 49 | " data = pd.read_csv(path + 'user_app_actived.csv', header=None)\n", 50 | " data.columns = ['uId', 'appId']\n", 51 | " return data\n", 52 | "\n", 53 | "def get_user_behavior_info():\n", 54 | " data = pd.read_csv(path + 'user_behavior_info.csv', header=None)\n", 55 | " data.columns = ['uId', 'bootTimes', 'AFuncTimes', 'BFuncTimes', 'CFuncTimes',\n", 56 | " 'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'FFuncSum']\n", 57 | " return data\n", 58 | "\n", 59 | "def get_user_basic_info():\n", 60 | " data = pd.read_csv(path + 'user_basic_info.csv', header=None)\n", 61 | " data.columns = ['uId', 'gender', 'city', 'prodName', 'ramCapacity', \n", 62 | " 'ramLeftRation', 'romCapacity', 'romLeftRation', 'color',\n", 63 | " 'fontSize', 'ct', 'carrier', 'os']\n", 64 | " return data\n", 65 | "\n", 66 | "def get_app_info():\n", 67 | " data = pd.read_csv(path + 'app_info.csv', header=None)\n", 68 | " data.columns = ['appId', 'category']\n", 69 | " return data\n", 70 | "\n", 71 | "# 测试的时候用True\n", 72 | "# 提特征改用False\n", 73 | "def get_user_app_usage(less_data=False):\n", 74 | " if less_data:\n", 75 | " reader = pd.read_csv(path + 'user_app_usage.csv', chunksize=2000000)\n", 76 | " for i in reader:\n", 77 | " data = i\n", 78 | " break\n", 79 | " else:\n", 80 | " data = pd.read_csv(path + 'user_app_usage.csv', header=None)\n", 81 | " data.columns = ['uId', 'appId', 'duration', 'times', 'use_date']\n", 82 | " return data" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 6, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "ename": "NameError", 92 | "evalue": "name 'get_user_app_usage' is not defined", 93 | "output_type": "error", 94 | "traceback": [ 95 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 96 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 97 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_user_app_usage\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mless_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'uId'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'app'\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m'peroid'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'times'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'start'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'peroid'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'start'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mpacktime_all\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpacktime_all\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msort_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'times'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 98 | "\u001b[0;31mNameError\u001b[0m: name 'get_user_app_usage' is not defined" 99 | ] 100 | } 101 | ], 102 | "source": [ 103 | "packtime_all = get_user_app_usage(less_data=False)\n", 104 | "packtime_all.columns = ['uId', 'app', 'peroid', 'times', 'start']\n", 105 | "del packtime_all['peroid']\n", 106 | "packtime_all = packtime_all.sort_values('start')\n", 107 | "packtime_all = packtime_all.sort_values('times', ascending=False)\n", 108 | "\n", 109 | "train_data = pd.read_csv(path + 'age_train.csv', header=None)\n", 110 | "test_data = pd.read_csv(path + 'age_test.csv', header=None)\n", 111 | "del train_data[1]\n", 112 | "train_data.columns = ['uId']\n", 113 | "test_data.columns = ['uId']\n", 114 | "\n", 115 | "d1 = train_data[:500000]\n", 116 | "d2 = train_data[500000:1000000]\n", 117 | "d3 = train_data[1000000:1500000]\n", 118 | "d4 = train_data[1500000:]\n", 119 | "d5 = test_data\n", 120 | "\n", 121 | "def set_first_times(row):\n", 122 | " return ' '.join(list(row['app']))\n", 123 | "\n", 124 | "df_value = []\n", 125 | "for i in tqdm([d1, d2, d3, d4, d5]):\n", 126 | " i = pd.merge(i, packtime_all, on='uId', how='left')\n", 127 | " i =i.fillna(0)\n", 128 | " i['app'] = i['app'].astype(str)\n", 129 | " group_data = i.groupby(['uId', 'start']).progress_apply(lambda row: set_first_times(row)).reset_index()\n", 130 | " group_data.columns = ['uId', 'start', 'app']\n", 131 | " group_data = group_data.groupby('uId').progress_apply(lambda row:set_first_times(row)).reset_index()\n", 132 | " df_value.append(group_data)\n", 133 | "data = pd.concat([df_value[0], df_value[1], df_value[2], df_value[3], df_value[4]], axis=0, sort=False)\n", 134 | "data.to_csv('data/set_series_1.csv', index=False, header=None)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.6.7" 169 | } 170 | }, 
171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /snake/1. Data Prepare.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 18 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import gc\n", 30 | "import time\n", 31 | "import os\n", 32 | "import sys\n", 33 | "import warnings\n", 34 | "warnings.filterwarnings('ignore')" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 44 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 45 | "info = pd.read_csv(\"../data/app_info.csv\",names=['appid','category'])\n", 46 | "active = pd.read_csv(\"../data/user_app_actived.csv\",names=['uid','appid']).sort_values(by=['uid'])\n", 47 | "usage = pd.read_csv(\"../data/user_app_usage.csv\",names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])\n", 48 | "user_basic_info = pd.read_csv(\"../data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 49 | "behavior_info = pd.read_csv(\"../data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 50 | "print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape))#usage.shape,\n" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 8, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "pickle_path = \"../pickle\"\n", 60 | "if not os.path.exists(pickle_path):\n", 61 | " os.mkdir(pickle_path)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 13, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "USAGE TO PICKLE: 109.59347701072693\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "if not os.path.exists(\"{}/user_app_usage.pickle\".format(pickle_path)):\n", 79 | " t1 = time.time()\n", 80 | " usage.to_pickle(\"{}/user_app_usage.pickle\".format(pickle_path))\n", 81 | " print('USAGE TO PICKLE: ',time.time()-t1)\n", 82 | "\n", 83 | "usage_app_seq = usage[['uid','appid']].groupby(['uid'])['appid'].apply(lambda x:list(x)).reset_index()\n", 84 | "usage_app_seq.to_pickle(\"{}/user_app_seq.pickle\".format(pickle_path))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 14, 90 | 
"metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "from tqdm import tqdm\n", 94 | "\n", 95 | "def flatten_active(df): \n", 96 | " u = []\n", 97 | " a = []\n", 98 | " for i in tqdm(range(len(df['appid'].values))):\n", 99 | " u += [df['uid'].values[i]]*df['app_len'].values[i]\n", 100 | " a += list(df['appid'].values[i])\n", 101 | " \n", 102 | " new_df = pd.DataFrame()\n", 103 | " new_df['uid'] = u\n", 104 | " new_df['appid'] = a\n", 105 | " \n", 106 | " return new_df" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "name": "stderr", 116 | "output_type": "stream", 117 | "text": [ 118 | "100%|██████████| 4999341/4999341 [00:51<00:00, 96820.22it/s]\n" 119 | ] 120 | } 121 | ], 122 | "source": [ 123 | "active['appid'] = active['appid'].map(lambda x:x.split('#'))\n", 124 | "active['app_len'] = active['appid'].map(lambda x:len(x))\n", 125 | "active = active.reset_index(drop=True)\n", 126 | "deal_active = flatten_active(active)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 16, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "ACTIVE TO PICKLE: 54.16705322265625\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "if not os.path.exists(\"{}/user_app_active.pickle\".format(pickle_path)):\n", 144 | " t1 = time.time()\n", 145 | " active.to_pickle(\"{}/user_app_active.pickle\".format(pickle_path))\n", 146 | " print('ACTIVE TO PICKLE: ',time.time()-t1)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 17, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "Deal ACTIVE TO PICKLE: 59.198060512542725\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "if not os.path.exists(\"{}/user_app_active_flatten.pickle\".format(pickle_path)):\n", 164 | " t1 = time.time()\n", 165 | " deal_active.to_pickle(\"{}/user_app_active_flatten.pickle\".format(pickle_path))\n", 166 | " print('Deal ACTIVE TO PICKLE: ',time.time()-t1)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 27, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "active = pd.read_pickle(\"../pickle/user_app_active.pickle\")" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 28, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/html": [ 186 | "
\n", 187 | "\n", 200 | "\n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
uidappidapp_len
01000006[a001012, a001036, a001062, a001172, a001275, ...47
11000009[a001012, a001015, a001055, a001062, a00107, a...73
21000010[a001012, a001036, a001050, a001055, a001062, ...96
31000011[a001012, a001063, a002450, a003083, a00326, a...21
41000012[a001036, a001062, a001580, a001583, a003570, ...33
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " uid appid app_len\n", 246 | "0 1000006 [a001012, a001036, a001062, a001172, a001275, ... 47\n", 247 | "1 1000009 [a001012, a001015, a001055, a001062, a00107, a... 73\n", 248 | "2 1000010 [a001012, a001036, a001050, a001055, a001062, ... 96\n", 249 | "3 1000011 [a001012, a001063, a002450, a003083, a00326, a... 21\n", 250 | "4 1000012 [a001036, a001062, a001580, a001583, a003570, ... 33" 251 | ] 252 | }, 253 | "execution_count": 28, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "active.head()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 16, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "all_data = train.append(test)\n", 269 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 29, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "active_train = active.merge(train,how='right',on='uid')\n", 279 | "active_test = active.merge(test,how='right',on='uid')\n", 280 | "active_train.to_pickle(\"../pickle/active_text_train.pickle\")\n", 281 | "active_test.to_pickle(\"../pickle/active_text_test.pickle\")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 30, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "usage_train = usage.merge(train,how='right',on='uid')\n", 291 | "usage_test = usage.merge(test,how='right',on='uid')\n", 292 | "usage_train.to_pickle(\"../pickle/usage_text_train.pickle\")\n", 293 | "usage_test.to_pickle(\"../pickle/usage_text_test.pickle\")" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [] 309 | } 310 | ], 311 | "metadata": { 312 | "kernelspec": { 313 | "display_name": "Python 3", 314 | "language": "python", 315 | "name": "python3" 316 | }, 317 | "language_info": { 318 | "codemirror_mode": { 319 | "name": "ipython", 320 | "version": 3 321 | }, 322 | "file_extension": ".py", 323 | "mimetype": "text/x-python", 324 | "name": "python", 325 | "nbconvert_exporter": "python", 326 | "pygments_lexer": "ipython3", 327 | "version": "3.6.4" 328 | } 329 | }, 330 | "nbformat": 4, 331 | "nbformat_minor": 4 332 | } 333 | -------------------------------------------------------------------------------- /snake/10. 
xDeepFM Multi Value Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 17 | "from sklearn.linear_model import LogisticRegression\n", 18 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 19 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 20 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 21 | "import gc\n", 22 | "import time\n", 23 | "import os\n", 24 | "import sys\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings('ignore')\n", 27 | "\n", 28 | "train = pd.read_csv(\"../data2/data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 29 | "test = pd.read_csv(\"../data2/data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 30 | "info = pd.read_csv(\"../data2/data/app_info.csv\",names=['appid','category'])\n", 31 | "active = pd.read_csv(\"../data2/data/user_app_actived.csv\",names=['uid','appid']).sort_values(by=['uid'])\n", 32 | "usage = pd.read_pickle(\"../data2/user_app_usage.pickle\")#,names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])\n", 33 | "user_basic_info = pd.read_csv(\"../data2/data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 34 | "behavior_info = pd.read_csv(\"../data2/data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 35 | "# (train.shape,test.shape),(info.shape,active.shape,usage.shape,user_basic_info.shape,behavior_info.shape)\n", 36 | "\n", 37 | "all_data = train.append(test).reset_index(drop=True)\n", 38 | "all_data.head()\n", 39 | "\n", 40 | "active['appid'] = active['appid'].map(lambda x:x.split('#'))\n", 41 | "active['app_len'] = active['appid'].map(lambda x:len(x))\n", 42 | "\n", 43 | "def get_category(x):\n", 44 | " col = []\n", 45 | " no_col = 0\n", 46 | " for i in x:\n", 47 | " try:\n", 48 | " col.append(hash_dict[i])\n", 49 | " except:\n", 50 | " no_col+=1\n", 51 | " return col,no_col\n", 52 | "\n", 53 | "hash_dict = dict(info.values)\n", 54 | "active['category'] = active['appid'].map(lambda x:get_category(x))\n", 55 | "active['category_nan'] = active['category'].map(lambda x:x[1])\n", 56 | "active['category'] = active['category'].map(lambda x:x[0])\n", 57 | "active['category_len'] = active['category'].map(lambda x:len(x))\n", 58 | "active['category_nunique'] = active['category'].map(lambda x:len(set(x)))\n", 59 | "active['category_ratio'] = active['category_nunique']/active['category_len']\n", 60 | "del active['category']\n", 61 | "\n", 62 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)\n", 63 | "all_data = all_data.merge(user_basic_info,how='left',on=['uid'])\n", 64 | "all_data = all_data.merge(behavior_info,how='left',on=['uid'])\n", 65 | "all_data = all_data.merge(active[active['app_len']<=150],how='left',on=['uid'])\n", 66 | "lj = active[active['app_len']>150]\n", 67 
| "active = pd.read_csv(\"../data2/data/user_app_actived.csv\",names=['uid','appid']).sort_values(by=['uid'])\n", 68 | "active.reset_index(drop=True,inplace=True)\n", 69 | "\n", 70 | "active.loc[active['uid'].isin(lj['uid'].unique()),'appid'] = '#a00101827'\n", 71 | "all_data = all_data.merge(active.rename(columns={'appid' : 'multi_appid'}),how='left',on='uid')\n", 72 | "\n", 73 | "def split(x):\n", 74 | " key_ans = x.split('#')\n", 75 | " for key in key_ans:\n", 76 | " if key not in key2index:\n", 77 | " # Notice : data value 0 is a special \"padding\",so we do not use 0 to encode valid feature for sequence data\n", 78 | " key2index[key] = len(key2index) + 1\n", 79 | " return list(map(lambda x: key2index[x], key_ans))\n", 80 | "\n", 81 | "all_data['multi_appid'] = all_data['multi_appid'].astype('str')\n", 82 | "from tensorflow.python.keras.preprocessing.sequence import pad_sequences\n", 83 | "\n", 84 | "# 多值特征处理\n", 85 | "from deepctr.models import *\n", 86 | "from deepctr.datas import SparseFeat, VarLenSparseFeat,get_fixlen_feature_names,get_varlen_feature_names,DenseFeat\n", 87 | "key2index = {}\n", 88 | "app_list = list(map(split, all_data['multi_appid'].values))\n", 89 | "app_length = np.array(list(map(len, app_list)))\n", 90 | "app_key = key2index.copy()\n", 91 | "\n", 92 | "max_len_app = max(app_length)\n", 93 | "app_list = pad_sequences(app_list, maxlen=max_len_app, padding='post', )\n", 94 | "print(max_len_app)\n", 95 | "\n", 96 | "uid_seq = pd.read_pickle(\"usage_uid_appid_seq.pickle\")\n", 97 | "uid_seq['appid_len'] = uid_seq['appid'].map(lambda x:len(x))\n", 98 | "uid_seq = uid_seq[uid_seq['appid_len']<=300]\n", 99 | "uid_seq['appid'] = uid_seq['appid'].map(lambda x:\"#\".join(x))\n", 100 | "\n", 101 | "del uid_seq['appid_len']\n", 102 | "\n", 103 | "all_data = all_data.merge(uid_seq.rename(columns={'appid' : 'usage_seq'}),how='left',on='uid')\n", 104 | "\n", 105 | "all_data['usage_seq'] = all_data['usage_seq'].astype('str')\n", 106 | "key2index = {}\n", 107 | "app1_list = list(map(split, all_data['usage_seq'].values))\n", 108 | "app1_length = np.array(list(map(len, app1_list)))\n", 109 | "app1_key = key2index.copy()\n", 110 | "\n", 111 | "max_len_app1 = max(app1_length)\n", 112 | "app1_list = pad_sequences(app1_list, maxlen=max_len_app1, padding='post', )\n", 113 | "print(max_len_app1)\n", 114 | "\n", 115 | "from sklearn.preprocessing import MinMaxScaler,StandardScaler\n", 116 | "from tqdm import tqdm\n", 117 | "\n", 118 | "sparse_features = [i for i in all_data.select_dtypes(object).columns if i not in ['uid','age_group']]\n", 119 | "dense_features = [i for i in all_data.columns if i not in sparse_features+['uid','age_group']]\n", 120 | "target = ['age_group']\n", 121 | "for feat in tqdm(sparse_features):\n", 122 | " lbl = LabelEncoder()\n", 123 | " all_data[feat] = lbl.fit_transform(all_data[feat].astype('str'))\n", 124 | "\n", 125 | "mm = StandardScaler()\n", 126 | "all_data[dense_features] = mm.fit_transform(all_data[dense_features].replace([np.inf,-np.inf],0).fillna(0))\n", 127 | "\n", 128 | "choose = all_data['age_group'].notnull()\n", 129 | "fixlen_feature_columns = [SparseFeat(feat,all_data[feat].nunique()) for feat in sparse_features] + [DenseFeat(feat,1,) for feat in dense_features]\n", 130 | "varlen_feature_columns = [VarLenSparseFeat('app', len(app_key) + 1, max_len_app, 'mean')] + [VarLenSparseFeat('app1', len(app_key) + 1, max_len_app, 'max')] + \\\n", 131 | " [VarLenSparseFeat('usage_app',len(app1_key) + 1, max_len_app1, 'mean')] + 
[VarLenSparseFeat('usage_app1', len(app1_key) + 1, max_len_app1,'max')]\n", 132 | "linear_feature_columns = fixlen_feature_columns + varlen_feature_columns\n", 133 | "dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns\n", 134 | "fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)\n", 135 | "varlen_feature_names = get_varlen_feature_names(linear_feature_columns + dnn_feature_columns)\n", 136 | "len(sparse_features),len(dense_features)\n", 137 | "\n", 138 | "import tensorflow as tf\n", 139 | "from tensorflow.python.keras import backend as K\n", 140 | "from tensorflow.python.keras.initializers import Zeros, glorot_normal\n", 141 | "from tensorflow.python.keras.layers import Layer\n", 142 | "from tensorflow.python.keras.regularizers import l2\n", 143 | "from deepctr.datas import *\n", 144 | "from deepctr.contrib import *\n", 145 | "from deepctr.layers import *\n", 146 | "from deepctr.models import *\n", 147 | "from keras.optimizers import Adam\n", 148 | "from keras.models import Model\n", 149 | "from keras import backend as K\n", 150 | "from keras.engine.topology import Layer\n", 151 | "from keras import initializers, regularizers, constraints, optimizers, layers\n", 152 | "from keras.layers import concatenate\n", 153 | "from keras.callbacks import *\n", 154 | "from keras.constraints import *\n", 155 | "from keras.layers import *\n", 156 | "from keras.models import *\n", 157 | "from keras.initializers import *\n", 158 | "from keras.optimizers import *\n", 159 | "\n", 160 | "label_name = 'age_group'\n", 161 | "all_data[label_name] = all_data[label_name] - 1\n", 162 | "all_data[label_name].value_counts()\n", 163 | "\n", 164 | "# from tf.keras.activations.softplus\n", 165 | "\n", 166 | "def xDeepFM(linear_feature_columns, dnn_feature_columns, embedding_size=8, dnn_hidden_units=(512, 256),\n", 167 | " cin_layer_size=(256, 256,), cin_split_half=True, cin_activation='relu', l2_reg_linear=0.00001,\n", 168 | " l2_reg_embedding=0.00001, l2_reg_dnn=0, l2_reg_cin=0, init_std=0.0001, seed=2019, dnn_dropout=0,\n", 169 | " dnn_activation='relu', dnn_use_bn=True, task='binary'):\n", 170 | " \"\"\"Instantiates the xDeepFM architecture.\n", 171 | " :param linear_feature_columns: An iterable containing all the features used by linear part of the model.\n", 172 | " :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.\n", 173 | " :param embedding_size: positive integer,sparse feature embedding_size\n", 174 | " :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of deep net\n", 175 | " :param cin_layer_size: list,list of positive integer or empty list, the feature maps in each hidden layer of Compressed Interaction Network\n", 176 | " :param cin_split_half: bool.if set to True, half of the feature maps in each hidden will connect to output unit\n", 177 | " :param cin_activation: activation function used on feature maps\n", 178 | " :param l2_reg_linear: float. 
L2 regularizer strength applied to linear part\n", 179 | " :param l2_reg_embedding: L2 regularizer strength applied to embedding vector\n", 180 | " :param l2_reg_dnn: L2 regularizer strength applied to deep net\n", 181 | " :param l2_reg_cin: L2 regularizer strength applied to CIN.\n", 182 | " :param init_std: float,to use as the initialize std of embedding vector\n", 183 | " :param seed: integer ,to use as random seed.\n", 184 | " :param dnn_dropout: float in [0,1), the probability we will drop out a given DNN coordinate.\n", 185 | " :param dnn_activation: Activation function to use in DNN\n", 186 | " :param dnn_use_bn: bool. Whether use BatchNormalization before activation or not in DNN\n", 187 | " :param task: str, ``\"binary\"`` for binary logloss or ``\"regression\"`` for regression loss\n", 188 | " :return: A Keras model instance.\n", 189 | " \"\"\"\n", 190 | "\n", 191 | "\n", 192 | " features = build_data_features(linear_feature_columns + dnn_feature_columns)\n", 193 | "\n", 194 | " datas_list = list(features.values())\n", 195 | "\n", 196 | " sparse_embedding_list, dense_value_list = data_from_feature_columns(features,dnn_feature_columns,\n", 197 | " embedding_size,\n", 198 | " l2_reg_embedding,init_std,\n", 199 | " seed)\n", 200 | "\n", 201 | "# linear_logit \n", 202 | " feature_columns = linear_feature_columns\n", 203 | " prefix = 'linear'\n", 204 | " units = 6\n", 205 | " l2_reg = l2_reg_linear\n", 206 | " linear_emb_list = [data_from_feature_columns(features,feature_columns,1,l2_reg,init_std,seed,prefix=prefix+str(i))[0] for i in range(units)]\n", 207 | " _, dense_data_list = data_from_feature_columns(features,feature_columns,1,l2_reg,init_std,seed,prefix=prefix)\n", 208 | " \n", 209 | "\n", 210 | " if len(linear_emb_list[0]) > 1:\n", 211 | " linear_term = concat_fun([tf.keras.layers.add(linear_emb) for linear_emb in linear_emb_list])\n", 212 | " elif len(linear_emb_list[0]) == 1:\n", 213 | " linear_term = concat_fun([linear_emb[0] for linear_emb in linear_emb_list])\n", 214 | " else:\n", 215 | " linear_term = None\n", 216 | " \n", 217 | " if len(dense_data_list) > 0:\n", 218 | " dense_data__ = dense_data_list[0] if len(\n", 219 | " dense_data_list) == 1 else tf.keras.layers.Concatenate()(dense_data_list)\n", 220 | " linear_dense_logit = tf.keras.layers.Dense(\n", 221 | " units, activation='softplus', use_bias=True, kernel_regularizer=l2(l2_reg))(dense_data__)\n", 222 | " \n", 223 | " if linear_term is not None:\n", 224 | " linear_term = tf.keras.layers.add([linear_dense_logit, linear_term])\n", 225 | " else:\n", 226 | " linear_term = linear_dense_logit\n", 227 | " \n", 228 | " linear_logit = tf.keras.layers.Flatten()(linear_term)\n", 229 | "\n", 230 | " fm_data = concat_fun(sparse_embedding_list, axis=1)\n", 231 | "\n", 232 | " if len(cin_layer_size) > 0:\n", 233 | " exFM_out = CIN(cin_layer_size, cin_activation,\n", 234 | " cin_split_half, l2_reg_cin, seed)(fm_data)\n", 235 | " exFM_logit = tf.keras.layers.Dense(6, activation='softplus', )(exFM_out)\n", 236 | " exFM_logit_reg = tf.keras.layers.Dense(1, activation='relu')(exFM_out)\n", 237 | "\n", 238 | " dnn_data_1 = combined_dnn_data(sparse_embedding_list,dense_value_list)\n", 239 | " \n", 240 | " deep_out_1 = DNN(dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,\n", 241 | " dnn_use_bn, seed)(dnn_data_1)\n", 242 | " \n", 243 | " deep_logit_1 = tf.keras.layers.Dense(\n", 244 | " 6, use_bias=False, activation='softmax')(deep_out_1)\n", 245 | "\n", 246 | " x = 
tf.keras.layers.average([exFM_logit,linear_logit,deep_logit_1])\n", 247 | " x = tf.keras.layers.concatenate([x,exFM_logit_reg])\n", 248 | " x = tf.keras.layers.BatchNormalization()(x)\n", 249 | " x = tf.keras.layers.Dense(256)(x)\n", 250 | " x = tf.keras.layers.PReLU()(x)\n", 251 | " x = tf.keras.layers.BatchNormalization()(x)\n", 252 | " output = tf.keras.layers.Dense(6,activation='softmax')(x)\n", 253 | " model = tf.keras.models.Model(datas=datas_list, outputs=output)\n", 254 | " return model\n", 255 | "\n", 256 | "import os\n", 257 | "os.environ['CUDA_VISIBLE_DEVICES'] = \"3\"\n", 258 | "\n", 259 | "from keras.utils import np_utils\n", 260 | "\n", 261 | "def make_label(x):\n", 262 | " return np_utils.to_categorical(x)\n", 263 | "\n", 264 | "def make_data(JB,index):\n", 265 | " JB = JB.iloc[index]\n", 266 | " fixlen_data = [JB[name].values for name in fixlen_feature_names]\n", 267 | " v0 = [app_list[index]]\n", 268 | " v1 = [app_list[index]]\n", 269 | " v2 = [app1_list[index]]\n", 270 | " v3 = [app1_list[index]]\n", 271 | " return fixlen_data + v0 + v1 + v2 + v3\n", 272 | "\n", 273 | "random_seed = 2019\n", 274 | "tr_index = choose\n", 275 | "X_train = all_data[tr_index].reset_index(drop=True)\n", 276 | "y = all_data[tr_index]['age_group'].reset_index(drop=True).astype(int)\n", 277 | "X_test = all_data[~tr_index].reset_index(drop=True)\n", 278 | "print(X_train.shape,X_test.shape)\n", 279 | "\n", 280 | "from sklearn.metrics import f1_score,accuracy_score\n", 281 | "\n", 282 | "cv_pred = []\n", 283 | "test_pred = []\n", 284 | "cv_score = []\n", 285 | "cv_model = []\n", 286 | "skf = StratifiedKFold(n_splits=5, random_state=random_seed, shuffle=True)\n", 287 | "for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):\n", 288 | " print(index)\n", 289 | " model = xDeepFM(linear_feature_columns, dnn_feature_columns,embedding_size=8,task='multicalss') # xDeepFM DeepFM AFM NFM\n", 290 | " model.compile(RAdam(lr=0.01),'categorical_crossentropy',\n", 291 | " metrics = ['accuracy',],)\n", 292 | " train_x, test_x, train_y, test_y = X_train.iloc[train_index], X_train.iloc[test_index], y.iloc[train_index], y.iloc[test_index]\n", 293 | " \n", 294 | " train_x = make_data(X_train,train_index)\n", 295 | " train_y = make_label(train_y)\n", 296 | " test_x = make_data(X_train,test_index)\n", 297 | " test_y = make_label(test_y)\n", 298 | " test_data = make_data(all_data,range(2010000,2512500))\n", 299 | " history = model.fit(train_x,train_y,batch_size=512,epochs=1,verbose=1,validation_data=(test_x,test_y))\n", 300 | " cv_model.append(model)\n", 301 | " y_test = model.predict(test_data,batch_size=512)\n", 302 | " y_val = model.predict(test_x,batch_size=512)\n", 303 | " cv_score.append(accuracy_score(y.iloc[test_index],np.argmax(y_val,axis=1)))\n", 304 | " print(cv_score)\n", 305 | " cv_pred.append(y_val)\n", 306 | " test_pred.append(y_test)\n", 307 | "\n", 308 | "cv_pred = np.zeros((X_train.shape[0],6))\n", 309 | "test_pred = np.zeros((X_test.shape[0],6))\n", 310 | "for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):\n", 311 | " print(index)\n", 312 | " train_x, test_x, train_y, test_y = X_train.iloc[train_index], X_train.iloc[test_index], y.iloc[train_index], y.iloc[test_index]\n", 313 | " train_x = make_data(X_train,train_index)\n", 314 | " train_y = make_label(train_y)\n", 315 | " test_x = make_data(X_train,test_index)\n", 316 | " test_y = make_label(test_y)\n", 317 | " test_data = make_data(all_data,range(2010000,2512500))\n", 318 | " y_val = 
cv_model[index].predict(test_x,batch_size=256,verbose=1)\n", 319 | " print(y_val.shape)\n", 320 | " cv_pred[test_index] = y_val\n", 321 | " test_pred += cv_model[index].predict(test_data,batch_size=256,verbose=1) / 5\n", 322 | "\n", 323 | "oof_train = pd.DataFrame(cv_pred)\n", 324 | "oof_train.columns = ['proba_{}'.format(i) for i in range(6)]\n", 325 | "oof_train['uid'] = train['uid']\n", 326 | "\n", 327 | "oof_test = pd.DataFrame(test_pred)\n", 328 | "oof_test.columns = ['proba_{}'.format(i) for i in range(6)]\n", 329 | "oof_test['uid'] = test['uid']\n", 330 | "\n", 331 | "oof_train.to_hdf(\"xDeepFM_cv_6449.hdf\",\"train\")\n", 332 | "oof_test.to_hdf(\"xDeepFM_cv_6449.hdf\",\"test\")\n", 333 | "# 5855 5993" 334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.6.4" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 2 358 | } 359 | -------------------------------------------------------------------------------- /snake/2. Word Vector & Sparse Matrix.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 18 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 21 | "\n", 22 | "import gc\n", 23 | "import time\n", 24 | "import os\n", 25 | "import sys\n", 26 | "import warnings\n", 27 | "warnings.filterwarnings('ignore')" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "pickle_path = \"../pickle\"\n", 37 | "active = pd.read_pickle(\"{}/user_app_active.pickle\".format(pickle_path))\n", 38 | "usage = pd.read_pickle(\"{}/user_app_seq.pickle\".format(pickle_path))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 5, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "vector_path = \"../vector\"\n", 48 | "from glove import *\n", 49 | "\n", 50 | "for i,df in enumerate([active,usage]): # 预计耗时 222 + 2100s\n", 51 | " t1 = time.time()\n", 52 | " c = Corpus()\n", 53 | " c.fit(df['appid'].values)\n", 54 | " glove = Glove(no_components=300, learning_rate=0.05) \n", 55 | " glove.fit(c.matrix,epochs=12,no_threads=30,verbose=1)\n", 56 | " glove.add_dictionary(c.dictionary)\n", 57 | " glove.save(\"{}/{}_glove300.model\".format(vector_path,i))\n", 58 | " print(time.time()-t1)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | 
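As a hedged aside from the editor (not part of the original notebook): once the loop above has written `0_glove300.model` for the active table, the vectors could be reloaded and mean-pooled per user roughly as below, assuming the same glove_python package imported via `from glove import *`.

```python
# Editor's sketch, not repo code: reload the saved GloVe model and build a
# per-user embedding by averaging the vectors of apps found in its dictionary.
import numpy as np
from glove import Glove  # glove_python, as used in the training cell above

glove = Glove.load("../vector/0_glove300.model")  # model trained on the active table

def user_glove_vector(app_seq, dim=300):
    vecs = [glove.word_vectors[glove.dictionary[a]] for a in app_seq if a in glove.dictionary]
    return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

# usage: user_glove_vector(active['appid'].iloc[0]) for the first user's app list
```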
"output_type": "stream", 69 | "text": [ 70 | "1511.6713757514954\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# Gen W2V Vector\n", 76 | "from gensim import models\n", 77 | "\n", 78 | "for i,df in enumerate([active,usage]): # 预计耗时 1511 + 5874\n", 79 | " t1 = time.time()\n", 80 | " w2v = models.Word2Vec(df['appid'].values, size=300, window=20, workers=40,hs=1) # 设置sg的话 变成skip-gram方法 我们测试效果差不多\n", 81 | " w2v.wv.save_word2vec_format(\"{}/{}_w2v300.model\".format(vector_path,i))\n", 82 | " print(time.time()-t1)\n" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "t = []\n", 92 | "c = []\n", 93 | "for i,df in enumerate([active,usage]): # 预计耗时 200 + 750s\n", 94 | " t1 = time.time()\n", 95 | " tfidf = TfidfVectorizer(analyzer='word',token_pattern=u\"(?u)\\\\b\\\\w+\\\\b\",min_df=1,ngram_range=(1,1))\n", 96 | " t.append(tfidf.fit_transform(df['appid'].map(lambda x:' '.join(x)).values))\n", 97 | " cv = CountVectorizer(analyzer='word',token_pattern=u\"(?u)\\\\b\\\\w+\\\\b\",min_df=1,ngram_range=(1,1))\n", 98 | " c.append(cv.fit_transform(df['appid'].map(lambda x:' '.join(x)).values))\n", 99 | " \n", 100 | " print(time.time()-t1)\n", 101 | "\n", 102 | "from scipy import sparse\n", 103 | "if not os.path.exists(\"{}/Sparse_Matrix\".format(vector_path)):\n", 104 | " os.mkdir(\"{}/Sparse_Matrix\".format(vector_path))\n", 105 | "sparse.save_npz('{}/Sparse_Matrix/active_tfidf.npz'.format(vector_path), t[0])\n", 106 | "sparse.save_npz('{}/Sparse_Matrix/usage_tfidf.npz'.format(vector_path), t[1])\n", 107 | "\n", 108 | "sparse.save_npz('{}/Sparse_Matrix/active_count.npz'.format(vector_path), c[0])\n", 109 | "sparse.save_npz('{}/Sparse_Matrix/usage_count.npz'.format(vector_path), c[1])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [] 125 | } 126 | ], 127 | "metadata": { 128 | "kernelspec": { 129 | "display_name": "Python 3", 130 | "language": "python", 131 | "name": "python3" 132 | }, 133 | "language_info": { 134 | "codemirror_mode": { 135 | "name": "ipython", 136 | "version": 3 137 | }, 138 | "file_extension": ".py", 139 | "mimetype": "text/x-python", 140 | "name": "python", 141 | "nbconvert_exporter": "python", 142 | "pygments_lexer": "ipython3", 143 | "version": "3.6.4" 144 | } 145 | }, 146 | "nbformat": 4, 147 | "nbformat_minor": 4 148 | } 149 | -------------------------------------------------------------------------------- /snake/3. 
Graph Feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "((173596669, 2), (840560515, 5))" 12 | ] 13 | }, 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "output_type": "execute_result" 17 | } 18 | ], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import pandas as pd\n", 22 | "import seaborn as sns\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "import lightgbm as lgb\n", 25 | "import xgboost as xgb\n", 26 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 27 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 28 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 29 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 30 | "from sklearn.linear_model import LogisticRegression\n", 31 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 32 | "\n", 33 | "import gc\n", 34 | "import time\n", 35 | "import os\n", 36 | "import sys\n", 37 | "import warnings\n", 38 | "warnings.filterwarnings('ignore')\n", 39 | "\n", 40 | "pickle_path = \"../pickle\"\n", 41 | "active = pd.read_pickle(\"{}/user_app_active_flatten.pickle\".format(pickle_path))\n", 42 | "usage = pd.read_pickle(\"{}/user_app_usage.pickle\".format(pickle_path))\n", 43 | "active.shape,usage.shape" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# Graph Feature\n", 53 | "import scipy.sparse\n", 54 | "from scipy import linalg\n", 55 | "from scipy.special import iv\n", 56 | "import scipy.sparse as sp\n", 57 | "\n", 58 | "from sklearn import preprocessing\n", 59 | "from sklearn.utils.extmath import randomized_svd\n", 60 | "from sklearn.decomposition import TruncatedSVD\n", 61 | "\n", 62 | "import argparse\n", 63 | "import time\n", 64 | "\n", 65 | "class ProNE():\n", 66 | " def __init__(self, G, emb_size=128, step=10, theta=0.5, mu=0.2, n_iter=5, random_state=2019):\n", 67 | " self.G = G\n", 68 | " self.emb_size = emb_size\n", 69 | " self.G = self.G.to_undirected()\n", 70 | " self.node_number = self.G.number_of_nodes()\n", 71 | " self.random_state = random_state\n", 72 | " self.step = step\n", 73 | " self.theta = theta\n", 74 | " self.mu = mu\n", 75 | " self.n_iter = n_iter\n", 76 | " \n", 77 | " mat = scipy.sparse.lil_matrix((self.node_number, self.node_number))\n", 78 | "\n", 79 | " for e in tqdm(self.G.edges()):\n", 80 | " if e[0] != e[1]:\n", 81 | " mat[int(e[0]), int(e[1])] = 1\n", 82 | " mat[int(e[1]), int(e[0])] = 1\n", 83 | " self.mat = scipy.sparse.csr_matrix(mat)\n", 84 | " print(mat.shape)\n", 85 | "\n", 86 | " def get_embedding_rand(self, matrix):\n", 87 | " # Sparse randomized tSVD for fast embedding\n", 88 | " t1 = time.time()\n", 89 | " l = matrix.shape[0]\n", 90 | " smat = scipy.sparse.csc_matrix(matrix) # convert to sparse CSC format\n", 91 | " print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)\n", 92 | " U, Sigma, VT = randomized_svd(smat, n_components=self.emb_size, n_iter=self.n_iter, random_state=self.random_state)\n", 93 | " U = U * np.sqrt(Sigma)\n", 94 | " U = preprocessing.normalize(U, \"l2\")\n", 95 | " print('sparsesvd time', time.time() - t1)\n", 96 | " return U\n", 97 | "\n", 98 | " def get_embedding_dense(self, matrix, emb_size):\n", 99 | " # get dense embedding via SVD\n", 100 | " t1 
= time.time()\n", 101 | " U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)\n", 102 | " U = np.array(U)\n", 103 | " U = U[:, :emb_size]\n", 104 | " s = s[:emb_size]\n", 105 | " s = np.sqrt(s)\n", 106 | " U = U * s\n", 107 | " U = preprocessing.normalize(U, \"l2\")\n", 108 | " print('densesvd time', time.time() - t1)\n", 109 | " return U\n", 110 | "\n", 111 | " def fit(self, tran, mask):\n", 112 | " # Network Embedding as Sparse Matrix Factorization\n", 113 | " t1 = time.time()\n", 114 | " l1 = 0.75\n", 115 | " C1 = preprocessing.normalize(tran, \"l1\")\n", 116 | " neg = np.array(C1.sum(axis=0))[0] ** l1\n", 117 | "\n", 118 | " neg = neg / neg.sum()\n", 119 | "\n", 120 | " neg = scipy.sparse.diags(neg, format=\"csr\")\n", 121 | " neg = mask.dot(neg)\n", 122 | " print(\"neg\", time.time() - t1)\n", 123 | "\n", 124 | " C1.data[C1.data <= 0] = 1\n", 125 | " neg.data[neg.data <= 0] = 1\n", 126 | "\n", 127 | " C1.data = np.log(C1.data)\n", 128 | " neg.data = np.log(neg.data)\n", 129 | "\n", 130 | " C1 -= neg\n", 131 | " F = C1\n", 132 | " features_matrix = self.get_embedding_rand(F)\n", 133 | " return features_matrix\n", 134 | "\n", 135 | " def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):\n", 136 | " # NE Enhancement via Spectral Propagation\n", 137 | " print('Chebyshev Series -----------------')\n", 138 | " t1 = time.time()\n", 139 | "\n", 140 | " if order == 1:\n", 141 | " return a\n", 142 | "\n", 143 | " A = sp.eye(self.node_number) + A\n", 144 | " DA = preprocessing.normalize(A, norm='l1')\n", 145 | " L = sp.eye(self.node_number) - DA\n", 146 | "\n", 147 | " M = L - mu * sp.eye(self.node_number)\n", 148 | "\n", 149 | " Lx0 = a\n", 150 | " Lx1 = M.dot(a)\n", 151 | " Lx1 = 0.5 * M.dot(Lx1) - a\n", 152 | "\n", 153 | " conv = iv(0, s) * Lx0\n", 154 | " conv -= 2 * iv(1, s) * Lx1\n", 155 | " for i in range(2, order):\n", 156 | " Lx2 = M.dot(Lx1)\n", 157 | " Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0\n", 158 | " # Lx2 = 2*L.dot(Lx1) - Lx0\n", 159 | " if i % 2 == 0:\n", 160 | " conv += 2 * iv(i, s) * Lx2\n", 161 | " else:\n", 162 | " conv -= 2 * iv(i, s) * Lx2\n", 163 | " Lx0 = Lx1\n", 164 | " Lx1 = Lx2\n", 165 | " del Lx2\n", 166 | " print('Bessell time', i, time.time() - t1)\n", 167 | " mm = A.dot(a - conv)\n", 168 | " self.embeddings = self.get_embedding_dense(mm, self.emb_size)\n", 169 | " return self.embeddings\n", 170 | " \n", 171 | " def transform(self):\n", 172 | " if self.embeddings is None:\n", 173 | " print(\"Embedding is not train\")\n", 174 | " return {}\n", 175 | " self.embeddings = pd.DataFrame(self.embeddings)\n", 176 | " self.embeddings.columns = ['ProNE_Emb_{}'.format(i) for i in range(len(self.embeddings.columns))]\n", 177 | " self.embeddings = self.embeddings.reset_index().rename(columns={'index' : 'nodes'}).sort_values(by=['nodes'],ascending=True).reset_index(drop=True)\n", 178 | "\n", 179 | " return self.embeddings" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from tqdm import tqdm\n", 189 | "import networkx as nx\n", 190 | "import igraph as ig\n", 191 | "\n", 192 | "def get_graph_embedding(df,prefix):\n", 193 | " \n", 194 | " uid_lbl,appid_lbl = LabelEncoder(),LabelEncoder()\n", 195 | " df['new_uid'] = uid_lbl.fit_transform(df['uid'])\n", 196 | " df['new_appid'] = appid_lbl.fit_transform(df['appid'])\n", 197 | " df['new_appid'] += df['new_uid'].max() + 1\n", 198 | " \n", 199 | " print(\"Encoder Finished...\")\n", 200 | " \n", 201 | 
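A toy illustration from the editor (not repo code) of the id-offset trick used a few lines above: shifting app ids past the largest user id puts both entity types in one node-id space, so the uid-appid pairs can be fed straight to `ig.Graph` or `nx.Graph` as edges of a bipartite graph.

```python
# Hypothetical toy data; the column names mirror get_graph_embedding above.
import pandas as pd
from sklearn.preprocessing import LabelEncoder

toy = pd.DataFrame({"uid": ["u1", "u1", "u2"], "appid": ["a1", "a2", "a1"]})
uid_lbl, app_lbl = LabelEncoder(), LabelEncoder()
toy["new_uid"] = uid_lbl.fit_transform(toy["uid"])      # 0 .. n_users-1
toy["new_appid"] = app_lbl.fit_transform(toy["appid"])  # 0 .. n_apps-1
toy["new_appid"] += toy["new_uid"].max() + 1            # apps now start after the user range
print(toy[["new_uid", "new_appid"]].values)             # collision-free edge list
```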
" G = ig.Graph()\n", 202 | " G.add_vertices(df['new_appid'].max()+1)\n", 203 | " G.add_edges(df[['new_uid','new_appid']].values)\n", 204 | " print(\"Build Graph Finished...\")\n", 205 | " evcent = G.evcent() # 计算图中节点的向量中心性\n", 206 | " shell_index = G.shell_index() # 计算图中节点度至少为K的最大子图\n", 207 | " degree = G.degree() # 总度数\n", 208 | " pagerank = G.pagerank() # pagerank\n", 209 | " # 以下4个计算的很慢,效果还不错,可以注释掉,观察evcent的效果\n", 210 | " closeness = G.closeness() # 计算节点与网络中其他所有节点的距离的平均值 \n", 211 | " betweenness = G.betweenness() # 计算节点的介值\n", 212 | " constraint = G.constraint()\n", 213 | " eccentricity = G.eccentricity() # 计算给定节点到图中其他节点的最短距离的最大值。\n", 214 | " \n", 215 | " G_stat = pd.DataFrame()\n", 216 | " G_stat['evcent'] = evcent\n", 217 | " G_stat['shell_index'] = shell_index\n", 218 | " G_stat['degree'] = degree\n", 219 | " G_stat['pagerank'] = pagerank\n", 220 | " print(\"PR Finished...\")\n", 221 | " G_stat['closeness'] = closeness\n", 222 | " G_stat['betweenness'] = betweenness\n", 223 | " G_stat['constraint'] = constraint\n", 224 | " G_stat['eccentricity'] = eccentricity\n", 225 | " G_stat = G_stat.reset_index()\n", 226 | " G_stat = G_stat[G_stat['index'].isin(df['new_uid'])]\n", 227 | " G_stat['index'] = uid_lbl.inverse_transform(G_stat['index'])\n", 228 | " \n", 229 | " print(\"Graph Stat Finished...\")\n", 230 | " G_stat.to_pickle(\"../pickle/Graph_Stat_{}.pickle\".format(prefix))\n", 231 | " \n", 232 | " del G\n", 233 | " \n", 234 | " import gc\n", 235 | " gc.collect()\n", 236 | " \n", 237 | " G = nx.Graph()\n", 238 | " G.add_edges_from(df[['new_uid','new_appid']].values)\n", 239 | " model = ProNE(G,emb_size=32,n_iter=6,step=12)\n", 240 | " features_matrix = model.fit(model.mat, model.mat)\n", 241 | " model.chebyshev_gaussian(model.mat, features_matrix, model.step, model.mu, model.theta)\n", 242 | " emb = model.transform()\n", 243 | " fea = emb[emb['nodes'].isin(df['new_uid'])]\n", 244 | " fea['nodes'] = uid_lbl.inverse_transform(fea['nodes'])\n", 245 | " fea.rename(columns={'nodes' : 'uid'},inplace=True)\n", 246 | " del G\n", 247 | " gc.collect()\n", 248 | " print(\"Embedding Finished...\")\n", 249 | " fea.to_pickle(\"../pickle/Graph_Bi_{}.pickle\".format(prefix))\n", 250 | " \n", 251 | " return fea,G_stat" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "Encoder Finished...\n", 264 | "PR Finished...\n", 265 | "Graph Stat Finished...\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "fea1,stat1 = get_graph_embedding(active,'active').set_index('uid').add_prefix(\"active_\").reset_index()\n", 271 | "fea0,stat0 = get_graph_embedding(usage,'usage').set_index('uid').add_prefix(\"usage_\").reset_index()\n", 272 | "\n", 273 | "# fea0.to_pickle(\"../pickle/usage_bi_graph_ProNE.pickle\")\n", 274 | "# fea1.to_pickle(\"../pickle/active_bi_graph_ProNE.pickle\")\n", 275 | "# stat0.to_pickle(\"../pickle/usage_graph_stat.pickle\")\n", 276 | "# stat1.to_pickle(\"../pickle/active_graph_stat.pickle\")" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "Python 3", 297 | "language": "python", 298 | "name": "python3" 299 | }, 300 | 
"language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.6.4" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 4 315 | } 316 | -------------------------------------------------------------------------------- /snake/5. W2V Feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "((4000000, 2), (1000000, 1)) ((12460, 2), (4999341, 3), (5000000, 13), (5000000, 9), (840560515, 5))\n", 13 | "(5000000, 2)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "import seaborn as sns\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "import lightgbm as lgb\n", 23 | "import xgboost as xgb\n", 24 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 25 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 26 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 27 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 28 | "from sklearn.linear_model import LogisticRegression\n", 29 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 30 | "\n", 31 | "import gc\n", 32 | "import time\n", 33 | "import os\n", 34 | "import sys\n", 35 | "import warnings\n", 36 | "warnings.filterwarnings('ignore')\n", 37 | "\n", 38 | "pickle_path = \"../pickle\"\n", 39 | "\n", 40 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 41 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 42 | "info = pd.read_csv(\"../data/app_info.csv\",names=['appid','category'])\n", 43 | "active = pd.read_pickle(\"{}/user_app_active.pickle\".format(pickle_path))\n", 44 | "usage = pd.read_pickle(\"{}/user_app_usage.pickle\".format(pickle_path))\n", 45 | "user_basic_info = pd.read_csv(\"../data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 46 | "behavior_info = pd.read_csv(\"../data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 47 | "print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape,usage.shape))\n", 48 | "\n", 49 | "all_data = train.append(test)\n", 50 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)\n", 51 | "print(all_data.shape)\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "from gensim import corpora, models, similarities\n", 61 | "from gensim.models.doc2vec import TaggedDocument\n", 62 | "from glove import *\n", 63 | "\n", 64 | "def get_w2c_feature(df,load_model,model,prefix):\n", 65 | " w2c_arr = []\n", 66 | " vocab = load_model.vocab.keys()\n", 67 | " \n", 68 | " for v in vocab :\n", 69 | " w2c_arr.append(list(load_model.wv[v]))\n", 70 | "\n", 71 | " # w2v Stat\n", 72 | " df_w2c = pd.DataFrame()\n", 73 | " df_w2c['word_id'] = vocab\n", 74 | " df_w2c = 
pd.concat([df_w2c, pd.DataFrame(w2c_arr)], axis=1)\n", 75 | " df_w2c.columns = ['appid'] + ['appid_{}'.format(model) + '_embedding_' + str(i) for i in range(size)]\n", 76 | " df_w2c_feat = df[['uid', 'appid']].merge(df_w2c, on='appid', how='left')\n", 77 | "\n", 78 | " agg = {}\n", 79 | " for l in ['appid_{}'.format(model) + '_embedding_' + str(i) for i in range(size)] :\n", 80 | " agg[l] = ['mean', 'std', 'max', 'min']\n", 81 | "\n", 82 | " df_agg = df_w2c_feat.groupby('uid').agg(agg)\n", 83 | " df_agg.columns = pd.Index(['{}_uid_'.format(model) + prefix + e[0] + \"_\" + e[1].upper() for e in df_agg.columns.tolist()])\n", 84 | " df_agg = df_agg.reset_index().sort_values(by=['uid'],ascending=True)\n", 85 | " return df_agg\n", 86 | "\n", 87 | "def get_gensim_feature(now=None,model='word2vec',size=5,window=10,prefix='active'):\n", 88 | "\n", 89 | " df = now.copy()\n", 90 | " if os.path.exists(\"../pickle/{}_{}_emb.pickle\".format(prefix,model)):\n", 91 | " return pd.read_pickle(\"../pickle/{}_{}_emb.pickle\".format(prefix,model))\n", 92 | " else:\n", 93 | " dictionary = corpora.Dictionary(df['appid'].values)\n", 94 | " corpus = [dictionary.doc2bow(text) for text in df['appid'].values]\n", 95 | " if model=='word2vec':\n", 96 | " if os.path.exists(\"../vector/w2v.model\"):\n", 97 | " w2v = models.KeyedVectors.load_word2vec_format(\"../vector/w2v.model\", binary=False)\n", 98 | " else:\n", 99 | " w2v = models.Word2Vec(df['appid'].values, size=size, window=window, workers=40)\n", 100 | " w2v.wv.save_word2vec_format(\"../vector/w2v.model\")\n", 101 | " vocab = list(w2v.wv.vocab.keys())\n", 102 | "\n", 103 | " # Sentence Embedding\n", 104 | "\n", 105 | " w2v_feature = np.zeros((df.shape[0],size))\n", 106 | " w2v_feature_avg = np.zeros((df.shape[0],size))\n", 107 | "\n", 108 | " for i,line in tqdm(enumerate(df['appid'].values.tolist())):\n", 109 | " num = 0\n", 110 | " if line == '':\n", 111 | " w2v_feature_avg[i,:] = np.zeros(size)\n", 112 | " else:\n", 113 | " for word in line:\n", 114 | " num += 1\n", 115 | " vec = w2v[word] if word in vocab else np.zeros(size)\n", 116 | " w2v_feature[i,:] += vec\n", 117 | " w2v_feature_avg[i,:] = w2v_feature[i,:] / num\n", 118 | " w2v_avg = pd.DataFrame(w2v_feature_avg)\n", 119 | " w2v_avg = w2v_avg.add_prefix(\"W2V_AVG_{}_\".format(prefix))\n", 120 | " w2v_avg['uid'] = df['uid']\n", 121 | " df_agg = w2v_avg\n", 122 | "\n", 123 | " elif model=='lda':\n", 124 | " lda = models.LdaModel(corpus=corpus, id2word=dictionary, num_topics=size)\n", 125 | " col = np.zeros((df.shape[0],20))\n", 126 | " ans = lda.get_document_topics(corpus)\n", 127 | " for i in tqdm(range(df.shape[0])):\n", 128 | " for j in ans[i]:\n", 129 | " col[i][j[0]] = j[1]\n", 130 | "\n", 131 | " df_agg = pd.DataFrame(col)\n", 132 | " df_agg = df_agg.add_prefix(\"LDA_TOPIC_{}_\".format(prefix))\n", 133 | " df_agg['uid'] = df['uid']\n", 134 | "\n", 135 | " elif model=='fasttext':\n", 136 | " if os.path.exists(\"../vector/fasttext.model\"):\n", 137 | " fasttext = models.KeyedVectors.load_word2vec_format(\"../vector/fasttext.model\", binary=False)\n", 138 | " else:\n", 139 | " fasttext = models.FastText(df['appid'].values, size=size, window=window, workers=40)\n", 140 | " fasttext.wv.save_word2vec_format(\"../vector/fasttext.model\")\n", 141 | " vocab = list(fasttext.wv.vocab.keys())\n", 142 | "\n", 143 | " fasttext_feature = np.zeros((df.shape[0],size))\n", 144 | " fasttext_feature_avg = np.zeros((df.shape[0],size))\n", 145 | "\n", 146 | " for i,line in tqdm(enumerate(df['appid'].values.tolist())):\n", 
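# Editor's note (hedged, not notebook code): like the word2vec branch above, this
# loop is plain mean pooling; a user's vector is the average over their app list,
# with out-of-vocabulary apps contributing zero vectors to the same divisor.
# Equivalent helper, assuming `kv` maps word -> vector (e.g. gensim KeyedVectors):
#   def mean_pool(words, kv, size):
#       vecs = [kv[w] if w in kv else np.zeros(size) for w in words]
#       return np.mean(vecs, axis=0) if vecs else np.zeros(size)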
147 | " num = 0\n", 148 | " if line == '':\n", 149 | " fasttext_feature_avg[i,:] = np.zeros(size)\n", 150 | " else:\n", 151 | " for word in line:\n", 152 | " num += 1\n", 153 | " vec = fasttext[word] if word in vocab else np.zeros(size)\n", 154 | " fasttext_feature[i,:] += vec\n", 155 | " fasttext_feature_avg[i,:] = fasttext_feature[i,:] / num\n", 156 | " fasttext_avg = pd.DataFrame(fasttext_feature_avg)\n", 157 | " fasttext_avg = fasttext_avg.add_prefix(\"FASTTEXT_AVG_{}\".format(prefix))\n", 158 | " fasttext_avg['uid'] = df['uid']\n", 159 | " df_agg = fasttext_avg\n", 160 | "\n", 161 | " elif model=='doc2vec':\n", 162 | " if os.path.exists(\"../vector/d2v.model\"):\n", 163 | " d2v = models.KeyedVectors.load_word2vec_format(\"../vector/d2v.model\", binary=False)\n", 164 | " else: \n", 165 | " docs = [TaggedDocument(words=i[1],tags=[str(i[0])]) for i in df[['uid','appid']].values]\n", 166 | " d2v = models.Doc2Vec(docs,size=size,window=window,workers=40)\n", 167 | " d2v.wv.save_word2vec_format(\"../vector/d2v.model\")\n", 168 | " vocab = list(d2v.wv.vocab.keys())\n", 169 | " \n", 170 | " d2v_avg = []\n", 171 | " for i in tqdm(df['appid'].values):\n", 172 | " line = []\n", 173 | " for j in i:\n", 174 | " line.append(d2v[j] if j in vocab else 0)\n", 175 | " d2v_avg.append(np.mean(line,axis=0))\n", 176 | " d2v_avg = pd.DataFrame(d2v_avg)\n", 177 | " d2v_avg = d2v_avg.add_prefix(\"d2v_AVG_{}\".format(prefix))\n", 178 | " d2v_avg['uid'] = df['uid']\n", 179 | " df_agg = d2v_avg\n", 180 | "\n", 181 | " elif model=='lsi':\n", 182 | " lsi = models.LsiModel(corpus=corpus, id2word=dictionary, num_topics=size)\n", 183 | " df_agg = []\n", 184 | " for i in tqdm(df['appid'].values):\n", 185 | " lsi_ = lsi[dictionary.doc2bow(i)]\n", 186 | " df_agg.append([tmp[1] for tmp in lsi[lsi_]])\n", 187 | "\n", 188 | " df_agg = pd.DataFrame(df_agg)\n", 189 | " df_agg = df_agg.add_prefix(\"LSI_TOPIC_{}_\".format(prefix))\n", 190 | " df_agg['uid'] = df['uid']\n", 191 | " \n", 192 | " elif model=='glove':\n", 193 | " matrix = Corpus()\n", 194 | " matrix.fit(df['appid'].values)\n", 195 | " glove = Glove(no_components=size, learning_rate=0.05)\n", 196 | " glove.fit(matrix.matrix,epochs=10,no_threads=30,verbose=1)\n", 197 | " glove.add_dictionary(matrix.dictionary)\n", 198 | " ans = []\n", 199 | " for i in tqdm(df['appid'].values):\n", 200 | " line = []\n", 201 | " for j in i:\n", 202 | " line.append(glove.word_vectors[glove.dictionary[j]])\n", 203 | " ans.append(np.mean(line,axis=0))\n", 204 | " df_agg = pd.DataFrame(ans)\n", 205 | " df_agg = df_agg.add_prefix(\"Glove_AVG_{}\".format(prefix))\n", 206 | " df_agg['uid'] = df['uid']\n", 207 | " \n", 208 | " df_agg.to_pickle(\"../pickle/{}_{}_emb.pickle\".format(prefix,model))\n", 209 | " \n", 210 | " return df_agg" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "name": "stderr", 220 | "output_type": "stream", 221 | "text": [ 222 | " 99%|█████████▉| 4942930/4999341 [50:48<00:25, 2244.35it/s] " 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "from tqdm import tqdm\n", 228 | "lsi = get_gensim_feature(active,'lsi',32,10,'active')\n", 229 | "w2v = get_gensim_feature(active,'word2vec',64,10,'active')\n", 230 | "fasttext = get_gensim_feature(active,'fasttext',64,10,'active')\n", 231 | "d2v = get_gensim_feature(active,'doc2vec',64,10,'active')\n", 232 | "lda = get_gensim_feature(active,'lda',20,10,'active')\n", 233 | "\n", 234 | "lsi_1 = get_gensim_feature(usage,'lsi',32,10,'usage')\n", 235 | 
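A hedged aside from the editor (not repo code): each `get_gensim_feature` call caches its result at `../pickle/{prefix}_{model}_emb.pickle` with a shared `uid` column, so the per-user embedding tables can later be folded into one feature frame, for example:

```python
from functools import reduce
import pandas as pd

names = ["lsi", "word2vec", "fasttext", "doc2vec", "lda"]
frames = [pd.read_pickle("../pickle/active_{}_emb.pickle".format(m)) for m in names]
embed_features = reduce(lambda a, b: a.merge(b, how="left", on="uid"), frames)
```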
"w2v_1 = get_gensim_feature(usage,'word2vec',64,10,'usage')\n", 236 | "fasttext_1 = get_gensim_feature(usage,'fasttext',64,10,'usage')\n", 237 | "d2v_1 = get_gensim_feature(usage,'doc2vec',64,10,'usage')\n", 238 | "lda_1 = get_gensim_feature(usage,'lda',20,10,'usage')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.6.7" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 4 277 | } 278 | -------------------------------------------------------------------------------- /snake/6. TFIDF-COUNT Feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 18 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 21 | "\n", 22 | "import gc\n", 23 | "import time\n", 24 | "import os\n", 25 | "import sys\n", 26 | "import warnings\n", 27 | "warnings.filterwarnings('ignore')\n", 28 | "\n", 29 | "pickle_path = \"../pickle\"\n", 30 | "\n", 31 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 32 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 33 | "active = pd.read_pickle(\"{}/user_app_active.pickle\".format(pickle_path))\n", 34 | "usage_appid_seq = pd.read_pickle(\"{}/user_app_seq.pickle\".format(pickle_path))\n", 35 | "# print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape,usage.shape))\n", 36 | "\n", 37 | "all_data = train.append(test)\n", 38 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)\n", 39 | "print(all_data.shape)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | " uid appid app_len \\\n", 52 | "0 1000006 a001012 a001036 a001062 a001172 a001275 a00135... 47.0 \n", 53 | "1 1000009 a001012 a001015 a001055 a001062 a00107 a001072... 73.0 \n", 54 | "2 1000010 a001012 a001036 a001050 a001055 a001062 a00107... 96.0 \n", 55 | "3 1000011 a001012 a001063 a002450 a003083 a00326 a003987... 21.0 \n", 56 | "4 1000012 a001036 a001062 a001580 a001583 a003570 a00365... 
33.0 \n", 57 | "\n", 58 | " age_group \n", 59 | "0 4.0 \n", 60 | "1 4.0 \n", 61 | "2 5.0 \n", 62 | "3 NaN \n", 63 | "4 5.0 \n", 64 | "TFIDF & COUNT FINISHED...\n", 65 | "0\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 71 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 72 | "from sklearn.linear_model import LogisticRegression,BayesianRidge,SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier\n", 73 | "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n", 74 | "from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier\n", 75 | "from sklearn.neural_network import MLPClassifier\n", 76 | "from sklearn.svm import LinearSVC,NuSVC,SVC\n", 77 | "from sklearn.metrics import roc_auc_score,accuracy_score\n", 78 | "from sklearn.model_selection import KFold,StratifiedKFold,TimeSeriesSplit\n", 79 | "from scipy import sparse\n", 80 | "import xgboost as xgb\n", 81 | "import lightgbm as lgb\n", 82 | "import catboost as cbt\n", 83 | "\n", 84 | "def get_sklearn_embedding(now,n_splits=5,ngram=1,prefix=None):\n", 85 | " \n", 86 | " if os.path.exists(\"../pickle/{}_tfidf_count_emb_all.pickle\".format(prefix)):\n", 87 | " return pd.read_pickle(\"../pickle/{}_tfidf_count_emb_all.pickle\".format(prefix))\n", 88 | " else:\n", 89 | " df = now.copy()\n", 90 | " df['appid'] = df['appid'].map(lambda x:\" \".join(x))\n", 91 | " df = df.merge(all_data,how='right',on='uid')\n", 92 | " print(df.head())\n", 93 | " tfidf = TfidfVectorizer(ngram_range=(1,ngram))\n", 94 | " tf = tfidf.fit_transform(df['appid'].fillna(\"##\").values)\n", 95 | " count = CountVectorizer(ngram_range=(1,ngram))\n", 96 | " cv = count.fit_transform(df['appid'].fillna(\"##\").values)\n", 97 | " all_ = sparse.csr_matrix(sparse.hstack([tf, cv]))\n", 98 | " print(\"TFIDF & COUNT FINISHED...\")\n", 99 | " tr = df['age_group'].notnull()\n", 100 | " te = df['age_group'].isnull()\n", 101 | " y = df[tr]['age_group']-1\n", 102 | " X_train = all_[df[tr].index]\n", 103 | " X_test = all_[df[te].index]\n", 104 | "\n", 105 | " random_seed = 2019\n", 106 | " model_zoo = [SGDClassifier(n_jobs=10,verbose=1),SGDClassifier(loss='log',n_jobs=10,verbose=1),\n", 107 | " SGDClassifier(loss='modified_huber',n_jobs=10,verbose=1),\n", 108 | " PassiveAggressiveClassifier(n_jobs=10,verbose=1),LogisticRegression(C=10),\n", 109 | " RidgeClassifier(solver='lsqr',fit_intercept=False),LinearSVC(verbose=1,max_iter=500),\n", 110 | " BernoulliNB(),MultinomialNB()]\n", 111 | "\n", 112 | " columns = ['SGD_HINGE','SGD_LOG','SGD_HUBER','PAC','LR','RIDGE','LSVC','BNB','MNB']\n", 113 | "\n", 114 | " oof = []\n", 115 | " count = 0\n", 116 | "\n", 117 | " for model in model_zoo:\n", 118 | " t1 = time.time()\n", 119 | " cv_pred_stack = np.zeros((X_train.shape[0],num_classes))\n", 120 | " test_pred_stack = np.zeros((X_test.shape[0],num_classes))\n", 121 | " skf = KFold(n_splits=n_splits,random_state=random_seed)\n", 122 | " if os.path.exists(\"../pickle/{}_TFIDF_COUNT_{}.pickle\".format(prefix,columns[count])):\n", 123 | " tmp = pd.read_pickle(\"../pickle/{}_TFIDF_COUNT_{}.pickle\".format(prefix,columns[count]))\n", 124 | " else:\n", 125 | " for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):\n", 126 | " print(index,model)\n", 127 | " train_x, test_x, train_y, test_y = X_train[train_index], X_train[test_index], y.iloc[train_index], y.iloc[test_index]\n", 128 | " model.fit(train_x,train_y)\n", 129 | " try:\n", 130 | " y_val = 
model._predict_proba_lr(test_x)\n", 131 | " except:\n", 132 | " y_val = model.predict_proba(test_x)\n", 133 | " cv_pred_stack[test_index] = y_val\n", 134 | " print(y_val.shape)\n", 135 | " try:\n", 136 | " test_pred_stack += model._predict_proba_lr(X_test) / n_splits\n", 137 | " except:\n", 138 | " test_pred_stack += model.predict_proba(X_test) / n_splits\n", 139 | " print(model,'score:',accuracy_score(y,np.argmax(cv_pred_stack,axis=1)))\n", 140 | " print(time.time()-t1)\n", 141 | " a = pd.DataFrame(cv_pred_stack).add_prefix(columns[count]+\"_\")\n", 142 | " a['uid'] = df[tr]['uid'].values\n", 143 | " b = pd.DataFrame(test_pred_stack).add_prefix(columns[count]+\"_\")\n", 144 | " b['uid'] = df[te]['uid'].values\n", 145 | " tmp = a.append(b).sort_values(by=['uid']).reset_index(drop=True)\n", 146 | " tmp.to_pickle(\"../pickle/{}_TFIDF_COUNT_{}.pickle\".format(prefix,columns[count]))\n", 147 | " \n", 148 | " count += 1\n", 149 | " oof.append(tmp)\n", 150 | " \n", 151 | " df_agg = pd.DataFrame()\n", 152 | " for i in tqdm(oof):\n", 153 | " df_agg[i.columns] = i\n", 154 | " df_agg = df_agg.sort_values(by=['uid'],ascending=True)\n", 155 | " df_agg.to_pickle(\"../pickle/{}_tfidf_count_emb_all.pickle\".format(prefix))\n", 156 | " \n", 157 | " return df_agg\n", 158 | "\n", 159 | "num_classes = 6\n", 160 | "prob_active = get_sklearn_embedding(active,n_splits=5,ngram=1,prefix='active')\n", 161 | "prob_usage = get_sklearn_embedding(usage_appid_seq,n_splits=5,ngram=1,prefix='usage')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.6.7" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 4 200 | } 201 | -------------------------------------------------------------------------------- /snake/7. 
Meta Active Train (Strong).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 18 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 21 | "from sklearn.metrics import mean_squared_error\n", 22 | "\n", 23 | "import gc\n", 24 | "import time\n", 25 | "import os\n", 26 | "import sys\n", 27 | "import warning\n", 28 | "\n", 29 | "pickle_path = \"../pickle\"\n", 30 | "\n", 31 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 32 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 33 | "info = pd.read_csv(\"../data/app_info.csv\",names=['appid','category'])\n", 34 | "active = pd.read_pickle(\"{}/user_app_active_flatten.pickle\".format(pickle_path))\n", 35 | "# usage = pd.read_pickle(\"{}/user_app_usage.pickle\".format(pickle_path))\n", 36 | "user_basic_info = pd.read_csv(\"../data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 37 | "behavior_info = pd.read_csv(\"../data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 38 | "print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape,usage.shape))\n", 39 | "\n", 40 | "all_data = train.append(test)\n", 41 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)\n", 42 | "print(all_data.shape)\n", 43 | "\n", 44 | "for i in tqdm(user_basic_info.select_dtypes('object').columns):\n", 45 | " lbl = LabelEncoder()\n", 46 | " user_basic_info[i] = lbl.fit_transform(user_basic_info[i].astype('str')) \n", 47 | "\n", 48 | "appid = LabelEncoder()\n", 49 | "active['appid'] = appid.fit_transform(active['appid'])\n", 50 | "\n", 51 | "active = active.merge(behavior_info,how='left',on='uid').merge(user_basic_info,how='left',on='uid')\n", 52 | "print(active.head())\n", 53 | "\n", 54 | "active['uid_appid_count'] = active[['uid','appid']].groupby(['uid'])['appid'].transform('count')\n", 55 | "active['uid_appid_std'] = active[['uid','appid']].groupby(['uid'])['appid'].transform('std')\n", 56 | "\n", 57 | "active['appid_uid_count'] = active[['uid','appid']].groupby(['appid'])['uid'].transform('count')\n", 58 | "active['appid_uid_std'] = active[['uid','appid']].groupby(['appid'])['uid'].transform('std')\n", 59 | "\n", 60 | "active['uid_fontsize_std'] = active[['uid','fontsize']].groupby(['uid'])['fontsize'].transform('std')\n", 61 | "active['uid_fontsize_mean'] = active[['uid','fontsize']].groupby(['uid'])['fontsize'].transform('mean')\n", 62 | "\n", 63 | "def reduce_mem_usage(df, verbose=True):\n", 64 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 
'float64']\n", 65 | " start_mem = df.memory_usage().sum() / 1024**2 \n", 66 | " for col in df.columns:\n", 67 | " col_type = df[col].dtypes\n", 68 | " if col_type in numerics:\n", 69 | " c_min = df[col].min()\n", 70 | " c_max = df[col].max()\n", 71 | " if str(col_type)[:3] == 'int':\n", 72 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", 73 | " df[col] = df[col].astype(np.int8)\n", 74 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", 75 | " df[col] = df[col].astype(np.int16)\n", 76 | " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", 77 | " df[col] = df[col].astype(np.int32)\n", 78 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", 79 | " df[col] = df[col].astype(np.int64) \n", 80 | " else:\n", 81 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", 82 | " df[col] = df[col].astype(np.float16)\n", 83 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", 84 | " df[col] = df[col].astype(np.float32)\n", 85 | " else:\n", 86 | " df[col] = df[col].astype(np.float64) \n", 87 | " end_mem = df.memory_usage().sum() / 1024**2\n", 88 | " if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n", 89 | " return df\n", 90 | "\n", 91 | "active = reduce_mem_usage(active)\n", 92 | "\n", 93 | "from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedKFold\n", 94 | "\n", 95 | "def rmse(y_true, y_pred):\n", 96 | " return (mean_squared_error(y_true, y_pred))** .5\n", 97 | "\n", 98 | "# use df_hist_train df_new_train df_hist_new_train to train 3 models\n", 99 | "train_df = active[active['age_group'].notnull()]\n", 100 | "test_df = active[active['age_group'].isnull()]\n", 101 | "\n", 102 | "drop_features = ['age_group', 'uid',]\n", 103 | "cat_features = ['appid'] \n", 104 | "\n", 105 | "feats = [f for f in active.columns if f not in drop_features]\n", 106 | "\n", 107 | "n_splits= 3\n", 108 | "folds = GroupKFold(n_splits=n_splits)\n", 109 | "oof_preds = np.zeros(train_df.shape[0])\n", 110 | "sub_preds = np.zeros(test_df.shape[0])\n", 111 | "\n", 112 | "print ('feats:' + str(len(feats)))\n", 113 | "\n", 114 | "for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['age_group'],groups=train_df['uid'])):\n", 115 | " train_x, train_y = train_df[feats].iloc[train_idx], train_df['age_group'].iloc[train_idx]\n", 116 | " valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['age_group'].iloc[valid_idx] \n", 117 | " \n", 118 | " print(\"Train Index:\",train_idx,\",Val Index:\",valid_idx)\n", 119 | "\n", 120 | " params = {\n", 121 | " \"objective\" : \"regression\", \n", 122 | " \"boosting\" : \"gbdt\", \n", 123 | " \"metric\" : \"rmse\", \n", 124 | " \"max_depth\": 7, \n", 125 | " \"num_leaves\" : 31, \n", 126 | " \"max_bin\" : 255, \n", 127 | " \"learning_rate\" : 0.1, \n", 128 | " \"subsample\" : 0.8,\n", 129 | " \"colsample_bytree\" : 0.8, \n", 130 | " \"verbosity\": -1,\n", 131 | " \"num_threads\" : 40,\n", 132 | " }\n", 133 | " \n", 134 | "\n", 135 | " if n_fold >= 0:\n", 136 | " evals_result = {}\n", 137 | " dtrain = lgb.Dataset(\n", 138 | " train_x, label=train_y,categorical_feature=cat_features)\n", 139 | " dval = lgb.Dataset(\n", 140 | " valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)\n", 141 | " bst = lgb.train(\n", 142 | " params, dtrain, num_boost_round=30000,\n", 143 | " 
valid_sets=[dval], early_stopping_rounds=100, verbose_eval=20,)#feval = evalerror\n", 144 | " \n", 145 | " new_list = sorted(zip(feats, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:]\n", 146 | " for item in new_list:\n", 147 | " print (item) \n", 148 | "\n", 149 | " oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)\n", 150 | "\n", 151 | " sub_preds += bst.predict(test_df[feats], num_iteration=bst.best_iteration) / folds.n_splits # test_df_new\n", 152 | "\n", 153 | "cv = rmse(train_df['age_group'], oof_preds)\n", 154 | "print('Full OOF RMSE %.6f' % cv) \n", 155 | "\n", 156 | "a = train_df[['uid']]\n", 157 | "b = test_df[['uid']]\n", 158 | "\n", 159 | "a['age_pred'] = oof_preds\n", 160 | "b['age_pred'] = sub_preds\n", 161 | "\n", 162 | "a1 = a.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median','skew'])\n", 163 | "b1 = b.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median','skew'])\n", 164 | "\n", 165 | "a1.append(b1).add_prefix(\"active_GROUPKFOLD_agg_pred_\").reset_index().sort_values(by=['uid']).reset_index(drop=True).to_pickle(\"../pickle/Meta_active_GROUP_Regeress.pickle\")" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.6.4" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 2 190 | } 191 | -------------------------------------------------------------------------------- /snake/8. 
Meta Train Usage (Strong).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 17 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 18 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 19 | "from sklearn.linear_model import LogisticRegression\n", 20 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 21 | "from sklearn.metrics import mean_squared_error\n", 22 | "\n", 23 | "import gc\n", 24 | "import time\n", 25 | "import os\n", 26 | "import sys\n", 27 | "import warning\n", 28 | "\n", 29 | "pickle_path = \"../pickle\"\n", 30 | "\n", 31 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 32 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 33 | "info = pd.read_csv(\"../data/app_info.csv\",names=['appid','category'])\n", 34 | "usage = pd.read_pickle(\"{}/user_app_usage.pickle\".format(pickle_path))\n", 35 | "user_basic_info = pd.read_csv(\"../data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 36 | "behavior_info = pd.read_csv(\"../data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 37 | "print((train.shape,test.shape),(info.shape,active.shape,user_basic_info.shape,behavior_info.shape,usage.shape))\n", 38 | "\n", 39 | "all_data = train.append(test)\n", 40 | "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)\n", 41 | "print(all_data.shape)\n", 42 | "\n", 43 | "for i in tqdm(user_basic_info.select_dtypes('object').columns):\n", 44 | " lbl = LabelEncoder()\n", 45 | " user_basic_info[i] = lbl.fit_transform(user_basic_info[i].astype('str')) \n", 46 | "\n", 47 | "appid = LabelEncoder()\n", 48 | "usage['appid'] = appid.fit_transform(usage['appid'])\n", 49 | "\n", 50 | "usage = usage.merge(behavior_info,how='left',on='uid').merge(user_basic_info,how='left',on='uid')\n", 51 | "print(usage.head())\n", 52 | "\n", 53 | "usage['uid_appid_count'] = usage[['uid','appid']].groupby(['uid'])['appid'].transform('count')\n", 54 | "usage['appid_uid_count'] = usage[['uid','appid']].groupby(['appid'])['uid'].transform('count')\n", 55 | "\n", 56 | "usage['uid_fontsize_std'] = usage[['uid','fontsize']].groupby(['uid'])['fontsize'].transform('std')\n", 57 | "usage['uid_fontsize_mean'] = usage[['uid','fontsize']].groupby(['uid'])['fontsize'].transform('mean')\n", 58 | "\n", 59 | "usage['woy'] = usage['use_date'].dt.weekofyear\n", 60 | "usage['doy'] = usage['use_date'].dt.dayofyear\n", 61 | "usage['wday'] = usage['use_date'].dt.dayofweek\n", 62 | "usage['weekend'] = (usage.use_date.dt.weekday >=5).astype(int)\n", 63 | "usage['day'] = usage['use_date'].dt.day\n", 64 | "\n", 65 | "for i in tqdm(['duration','times']):\n", 66 | " usage['appid_{}_mean'.format(i)] = 
usage[['appid',i]].groupby(['appid'])[i].transform('mean')\n", 67 | " usage['appid_{}_mean'.format(i)] = usage[['appid',i]].groupby(['appid'])[i].transform('std')\n", 68 | " \n", 69 | "def reduce_mem_usage(df, verbose=True):\n", 70 | " numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n", 71 | " start_mem = df.memory_usage().sum() / 1024**2 \n", 72 | " for col in df.columns:\n", 73 | " col_type = df[col].dtypes\n", 74 | " if col_type in numerics:\n", 75 | " c_min = df[col].min()\n", 76 | " c_max = df[col].max()\n", 77 | " if str(col_type)[:3] == 'int':\n", 78 | " if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n", 79 | " df[col] = df[col].astype(np.int8)\n", 80 | " elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n", 81 | " df[col] = df[col].astype(np.int16)\n", 82 | " elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n", 83 | " df[col] = df[col].astype(np.int32)\n", 84 | " elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n", 85 | " df[col] = df[col].astype(np.int64) \n", 86 | " else:\n", 87 | " if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n", 88 | " df[col] = df[col].astype(np.float16)\n", 89 | " elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n", 90 | " df[col] = df[col].astype(np.float32)\n", 91 | " else:\n", 92 | " df[col] = df[col].astype(np.float64) \n", 93 | " end_mem = df.memory_usage().sum() / 1024**2\n", 94 | " if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n", 95 | " return df\n", 96 | "\n", 97 | "usage = reduce_mem_usage(usage)\n", 98 | "usage['use_date'] = usage['use_date'].astype('int') * 1e-16\n", 99 | "usage = usage.merge(train,how='left',on='uid')\n", 100 | "\n", 101 | "from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedKFold\n", 102 | "\n", 103 | "def rmse(y_true, y_pred):\n", 104 | " return (mean_squared_error(y_true, y_pred))** .5\n", 105 | "\n", 106 | "# use df_hist_train df_new_train df_hist_new_train to train 3 models\n", 107 | "train_df = usage[usage['age_group'].notnull()]\n", 108 | "test_df = usage[usage['age_group'].isnull()]\n", 109 | "\n", 110 | "drop_features = ['age_group', 'uid',]\n", 111 | "cat_features = ['appid'] \n", 112 | "\n", 113 | "feats = [f for f in usage.columns if f not in drop_features]\n", 114 | "\n", 115 | "n_splits= 3\n", 116 | "folds = GroupKFold(n_splits=n_splits)\n", 117 | "oof_preds = np.zeros(train_df.shape[0])\n", 118 | "sub_preds = np.zeros(test_df.shape[0])\n", 119 | "\n", 120 | "print ('feats:' + str(len(feats)))\n", 121 | "\n", 122 | "for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['age_group'],groups=train_df['uid'])):\n", 123 | " train_x, train_y = train_df[feats].iloc[train_idx], train_df['age_group'].iloc[train_idx]\n", 124 | " valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['age_group'].iloc[valid_idx] \n", 125 | " \n", 126 | " print(\"Train Index:\",train_idx,\",Val Index:\",valid_idx)\n", 127 | "\n", 128 | " params = {\n", 129 | " \"objective\" : \"regression\", \n", 130 | " \"boosting\" : \"gbdt\", \n", 131 | " \"metric\" : \"rmse\", \n", 132 | " \"max_depth\": 7, \n", 133 | " \"num_leaves\" : 31, \n", 134 | " \"max_bin\" : 255, \n", 135 | " \"learning_rate\" : 0.1, \n", 136 | " \"subsample\" : 0.8,\n", 137 | " \"colsample_bytree\" : 0.8, \n", 138 | " \"verbosity\": -1,\n", 139 | " \"num_threads\" : 
40,\n", 140 | " }\n", 141 | " \n", 142 | "\n", 143 | " if n_fold >= 0:\n", 144 | " evals_result = {}\n", 145 | " dtrain = lgb.Dataset(\n", 146 | " train_x, label=train_y,categorical_feature=cat_features)\n", 147 | " dval = lgb.Dataset(\n", 148 | " valid_x, label=valid_y, reference=dtrain,categorical_feature=cat_features)\n", 149 | " bst = lgb.train(\n", 150 | " params, dtrain, num_boost_round=30000,\n", 151 | " valid_sets=[dval], early_stopping_rounds=100, verbose_eval=20,)#feval = evalerror\n", 152 | " \n", 153 | " new_list = sorted(zip(feats, bst.feature_importance('gain')),key=lambda x: x[1], reverse=True)[:]\n", 154 | " for item in new_list:\n", 155 | " print (item) \n", 156 | "\n", 157 | " oof_preds[valid_idx] = bst.predict(valid_x, num_iteration=bst.best_iteration)\n", 158 | "\n", 159 | " sub_preds += bst.predict(test_df[feats], num_iteration=bst.best_iteration) / folds.n_splits # test_df_new\n", 160 | "\n", 161 | "cv = rmse(train_df['age_group'], oof_preds)\n", 162 | "print('Full OOF RMSE %.6f' % cv) \n", 163 | "\n", 164 | "a = train_df[['uid']]\n", 165 | "b = test_df[['uid']]\n", 166 | "\n", 167 | "a['age_pred'] = oof_preds\n", 168 | "b['age_pred'] = sub_preds\n", 169 | "\n", 170 | "a1 = a.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median',])\n", 171 | "b1 = b.groupby(['uid'])['age_pred'].agg(['mean','std','min','max','median',])\n", 172 | "\n", 173 | "a1.append(b1).add_prefix(\"usage_GROUPKFOLD_agg_pred_\").reset_index().sort_values(by=['uid']).reset_index(drop=True).to_pickle(\"../pickle/Meta_usage_GROUP_Regeress.pickle\")" 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "kernelspec": { 179 | "display_name": "Python 3", 180 | "language": "python", 181 | "name": "python3" 182 | }, 183 | "language_info": { 184 | "codemirror_mode": { 185 | "name": "ipython", 186 | "version": 3 187 | }, 188 | "file_extension": ".py", 189 | "mimetype": "text/x-python", 190 | "name": "python", 191 | "nbconvert_exporter": "python", 192 | "pygments_lexer": "ipython3", 193 | "version": "3.6.4" 194 | } 195 | }, 196 | "nbformat": 4, 197 | "nbformat_minor": 2 198 | } 199 | -------------------------------------------------------------------------------- /snake/9. 
LGB COUNT & TFIDF Model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import seaborn as sns\n", 12 | "import matplotlib.pyplot as plt\n", 13 | "import lightgbm as lgb\n", 14 | "import xgboost as xgb\n", 15 | "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n", 16 | "from sklearn.model_selection import KFold,StratifiedKFold\n", 17 | "from sklearn.linear_model import LogisticRegression\n", 18 | "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score\n", 19 | "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n", 20 | "from sklearn.decomposition import TruncatedSVD,SparsePCA\n", 21 | "import gc\n", 22 | "import time\n", 23 | "import os\n", 24 | "import sys\n", 25 | "import warnings\n", 26 | "warnings.filterwarnings('ignore')\n", 27 | "\n", 28 | "train = pd.read_csv(\"../data/age_train.csv\",names=['uid','age_group']).sort_values(by=['uid'])\n", 29 | "test = pd.read_csv(\"../data/age_test.csv\",names=['uid']).sort_values(by=['uid'])\n", 30 | "info = pd.read_csv(\"../data/app_info.csv\",names=['appid','category'])\n", 31 | "active = pd.read_pickle(\"../pickle/user_app_active.pickle\")\n", 32 | "# usage = pd.read_pickle(\"../input2/user_app_usage.pickle\")#,names=['uid','appid','duration','times','use_date'],parse_dates=['use_date'])\n", 33 | "user_basic_info = pd.read_csv(\"../data/user_basic_info.csv\",names=['uid','gender','city','prodname','ramcapacity','ramleftration','romcapacity','romleftration','color','fontsize','ct','carrier','os']).sort_values(by=['uid'])\n", 34 | "behavior_info = pd.read_csv(\"../data/user_behavior_info.csv\",names=['uid','boottimes','a','b','c','d','e','f','g']).sort_values(by=['uid'])\n", 35 | "# (train.shape,test.shape),(info.shape,active.shape,usage.shape,user_basic_info.shape,behavior_info.shape)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": [ 46 | "
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | "
age_groupuid
04.01000006
14.01000009
25.01000010
35.01000012
44.01000027
\n", 96 | "
" 97 | ], 98 | "text/plain": [ 99 | " age_group uid\n", 100 | "0 4.0 1000006\n", 101 | "1 4.0 1000009\n", 102 | "2 5.0 1000010\n", 103 | "3 5.0 1000012\n", 104 | "4 4.0 1000027" 105 | ] 106 | }, 107 | "execution_count": 3, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "all_data = train.append(test).reset_index(drop=True)\n", 114 | "all_data.head()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def get_category(x):\n", 124 | " col = []\n", 125 | " no_col = 0\n", 126 | " for i in x:\n", 127 | " try:\n", 128 | " col.append(hash_dict[i])\n", 129 | " except:\n", 130 | " no_col+=1\n", 131 | " return col,no_col\n", 132 | "\n", 133 | "hash_dict = dict(info.values)\n", 134 | "active['category'] = active['appid'].map(lambda x:get_category(x))\n", 135 | "active['category_nan'] = active['category'].map(lambda x:x[1])\n", 136 | "active['category'] = active['category'].map(lambda x:x[0])\n", 137 | "active['category_len'] = active['category'].map(lambda x:len(x))\n", 138 | "active['category_nunique'] = active['category'].map(lambda x:len(set(x)))\n", 139 | "active['category_ratio'] = active['category_nunique']/active['category_len']\n", 140 | "del active['category']" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "((5000000, 28), 5000000)" 152 | ] 153 | }, 154 | "execution_count": 6, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "all_data = all_data.merge(user_basic_info,how='left',on=['uid'])\n", 161 | "all_data = all_data.merge(behavior_info,how='left',on=['uid'])\n", 162 | "all_data = all_data.merge(active,how='left',on=['uid'])\n", 163 | "feature_name = [i for i in all_data.columns if i not in ['uid','age_group']]\n", 164 | "all_data.shape,len(all_data)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "all_data['city_count_user'] = all_data.groupby(['city'])['uid'].transform('count')\n", 174 | "all_data['romleftration_count_user'] = all_data.groupby(['romleftration'])['uid'].transform('count')\n", 175 | "all_data['prodname_count_user'] = all_data.groupby(['prodname'])['uid'].transform('count')\n", 176 | "all_data['color_count_user'] = all_data.groupby(['color'])['uid'].transform('count')\n", 177 | "all_data['ct_count_user'] = all_data.groupby(['ct'])['uid'].transform('count')\n", 178 | "all_data['carrier_count_user'] = all_data.groupby(['carrier'])['uid'].transform('count')\n", 179 | "\n", 180 | "all_data['city_nunique_user'] = all_data.groupby(['city'])['uid'].transform('nunique')\n", 181 | "all_data['romleftration_nunique_user'] = all_data.groupby(['romleftration'])['uid'].transform('nunique')\n", 182 | "all_data['prodname_nunique_user'] = all_data.groupby(['prodname'])['uid'].transform('nunique')\n", 183 | "all_data['ct_nunique_user'] = all_data.groupby(['ct'])['uid'].transform('nunique')\n", 184 | "all_data['carrier_nunique_user'] = all_data.groupby(['carrier'])['uid'].transform('nunique')" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 9, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/html": [ 195 | "
\n", 196 | "\n", 209 | "\n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | "
age_groupuidgendercityprodnameramcapacityramleftrationromcapacityromleftrationcolor...romleftration_count_userprodname_count_usercolor_count_userct_count_usercarrier_count_usercity_nunique_userromleftration_nunique_userprodname_nunique_userct_nunique_usercarrier_nunique_user
04.010000061c00253p00548.0NaN128.0NaN翡冷翠...NaN71149536442514879.0246351777071.0NaN711492514879.02463517
14.010000090c0043p00188.00.22256.00.49渐变黑...54080.022495132832514879.02463517213066.054080.0224952514879.02463517
25.010000100c00284p00548.00.38128.00.04翡冷翠...49470.071149536442514879.024635174897.049470.0711492514879.02463517
35.010000120c0087p00594.00.3464.00.21香槟金...48572.0963973061582077881.0112691430551.048572.0963972077881.01126914
44.010000270c00206p0016.00.26137.00.79海鸥灰...53404.0147835581082077881.01126914117881.053404.01478352077881.01126914
\n", 359 | "

5 rows × 39 columns

\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " age_group uid gender city prodname ramcapacity ramleftration \\\n", 364 | "0 4.0 1000006 1 c00253 p0054 8.0 NaN \n", 365 | "1 4.0 1000009 0 c0043 p0018 8.0 0.22 \n", 366 | "2 5.0 1000010 0 c00284 p0054 8.0 0.38 \n", 367 | "3 5.0 1000012 0 c0087 p0059 4.0 0.34 \n", 368 | "4 4.0 1000027 0 c00206 p001 6.0 0.26 \n", 369 | "\n", 370 | " romcapacity romleftration color ... romleftration_count_user \\\n", 371 | "0 128.0 NaN 翡冷翠 ... NaN \n", 372 | "1 256.0 0.49 渐变黑 ... 54080.0 \n", 373 | "2 128.0 0.04 翡冷翠 ... 49470.0 \n", 374 | "3 64.0 0.21 香槟金 ... 48572.0 \n", 375 | "4 137.0 0.79 海鸥灰 ... 53404.0 \n", 376 | "\n", 377 | " prodname_count_user color_count_user ct_count_user carrier_count_user \\\n", 378 | "0 71149 53644 2514879.0 2463517 \n", 379 | "1 22495 13283 2514879.0 2463517 \n", 380 | "2 71149 53644 2514879.0 2463517 \n", 381 | "3 96397 306158 2077881.0 1126914 \n", 382 | "4 147835 58108 2077881.0 1126914 \n", 383 | "\n", 384 | " city_nunique_user romleftration_nunique_user prodname_nunique_user \\\n", 385 | "0 77071.0 NaN 71149 \n", 386 | "1 213066.0 54080.0 22495 \n", 387 | "2 4897.0 49470.0 71149 \n", 388 | "3 30551.0 48572.0 96397 \n", 389 | "4 117881.0 53404.0 147835 \n", 390 | "\n", 391 | " ct_nunique_user carrier_nunique_user \n", 392 | "0 2514879.0 2463517 \n", 393 | "1 2514879.0 2463517 \n", 394 | "2 2514879.0 2463517 \n", 395 | "3 2077881.0 1126914 \n", 396 | "4 2077881.0 1126914 \n", 397 | "\n", 398 | "[5 rows x 39 columns]" 399 | ] 400 | }, 401 | "execution_count": 9, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "all_data.head()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 10, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "feature_name = [i for i in all_data.columns if i not in ['uid','age_group']]\n", 417 | "cat_col = [col for col in all_data.columns if all_data[col].dtype == np.object]\n", 418 | "num_col = [i for i in feature_name if i not in cat_col]" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 31, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "label_name = ['age_group']\n", 428 | "from tqdm import tqdm\n", 429 | "from scipy import sparse\n", 430 | "vector_feature = ['appid']\n", 431 | "onehot_feature = [i for i in cat_col if i not in vector_feature]" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 12, 437 | "metadata": {}, 438 | "outputs": [], 439 | "source": [ 440 | "tr = None\n", 441 | "train_ix = list(range(train.shape[0]))\n", 442 | "test_ix = list(range(train.shape[0],all_data.shape[0]))" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "print('onehot...')\n", 452 | "enc = OneHotEncoder(handle_unknown='ignore')\n", 453 | "for feature in tqdm(onehot_feature):\n", 454 | " lbl = LabelEncoder()\n", 455 | " all_data[feature] = lbl.fit_transform(all_data[feature].astype('str').fillna('0').values.reshape(-1, 1))" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 22, 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "from scipy import sparse\n", 465 | "\n", 466 | "c1 = sparse.load_npz(\"../vector/Sparse_Matrix/active_count.npz\")\n", 467 | "c2 = sparse.load_npz(\"../vector/Sparse_Matrix/active_tfidf.npz\")" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 29, 473 | "metadata": {}, 
474 | "outputs": [], 475 | "source": [ 476 | "svd = TruncatedSVD(n_components=100,n_iter=10,random_state=2019)\n", 477 | "c1_svd = svd.fit_transform(c1)\n", 478 | "c2_svd = svd.fit_transform(c2)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 32, 484 | "metadata": {}, 485 | "outputs": [ 486 | { 487 | "name": "stderr", 488 | "output_type": "stream", 489 | "text": [ 490 | "\n", 491 | "\n", 492 | " 0%| | 0/1 [00:00