├── Feature.ipynb ├── arcii.py ├── cnn-dssm.py ├── images ├── Code.png ├── arcii.png ├── cnn-dssm.png ├── lstm-dssm.png ├── mvlstm.png ├── params.png └── textcnn.png ├── lstm-dssm.py ├── mvlstm.py ├── readme.md └── textcnn.py /Feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "DA3BF7A274C9420984E6233A09460048", 7 | "mdEditEnable": false 8 | }, 9 | "source": [ 10 | "# 导包" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 5, 16 | "metadata": { 17 | "id": "96EE9D8C953142D7AEE81B639EA3DF92", 18 | "collapsed": false, 19 | "scrolled": true 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "100000000 ./predictions/final_submission_20190811.csv\n\n1,1,0.53125741515603\n1,2,0.4392272420196072\n1,3,0.5879643281299144\n1,4,0.576160912203239\n1,5,0.5052100835619331\n1,6,0.4324311875604571\n1,7,0.4066308458541816\n2,1,0.5768602948272425\n2,2,0.6554118653967363\n2,3,0.5325662217303531\n1,128 770 122 1192,1,770 36 1192 8 33 10048 122 193 469 31 37\t\n1,128 770 122 1192,2,354 770 1192 40 9315 15 3545 14 522 3159 122 1645 4626 31 37\t\n1,128 770 122 1192,3,770 69380 1845 34 644 115 10320 11843 12348\t\n1,128 770 122 1192,4,770 770 4241 2131 7036 122 1192 851 2507\t\n1,128 770 122 1192,5,770 14692 3933 3194 27 10320 128 4346 1192\t\n1,128 770 122 1192,6,770 36 39 1192 266 108 5229 192 122 26831 11\t\n1,128 770 122 1192,7,770 36 10048 122 13 449 1192 33 708 30 37\t\n2,66 64 123 848,1,3589 458 848 66 26 123 848 388 17779 29794 27 1282 2435 389\t\n2,66 64 123 848,2,3589 458 848 66 26 123 848 388 17779 328 274 82424\t\n2,66 64 123 848,3,5218 1257 31 4712 6030 27 47 267 659 1392 3589 66 64 123 27 5692 848 819\t\n\n10318082,8,0.26414565025898734\n10318082,9,0.42693966127920707\n10318082,10,0.28341074642672387\n10318082,11,0.2947948388734799\n10318082,12,0.26474696779822326\n10318082,13,0.32344879648630276\n10318082,14,0.34928164192293326\n10318082,15,0.2033738233624376\n10318082,16,0.055942568163945425\n10318082,17,0.19353918772727668\n10318082,794 39322,8,14703 82 16 21 258 13958 15 39322 27 15488 794 91154 1075 39322 148 4633\t\n10318082,794 39322,9,19352 1424 9240 203 11529 27 6526 4452 19398 4813 733\t\n10318082,794 39322,10,23860 68 9 220 235 161 16 794 39410 1151 2448\t\n10318082,794 39322,11,37314 80 180 11 794 20 915 102 1424 9560 35 9240 31420 936 36 10939 236 2947\t\n10318082,794 39322,12,9683 2165 817 3541 1424 39322 35 55495 15 69\t\n10318082,794 39322,13,49 118 1424 39322 3909 1351 27 196 151 1585 15 52\t\n10318082,794 39322,14,8633 18020 83 262 11 1383 1424 154 679 5760 27 2860 2251 281\t\n10318082,794 39322,15,8633 25659 1121 1424 9240 10773 27 1851 24 47965 23\t\n10318082,794 39322,16,8633 18866 98 3526 624 85 527 31 27 124 2150 281\t\n10318082,794 39322,17,16001 145 794 75 2597 1424 35 10960 39 38276 2030\t\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "!wc -l ./predictions/final_submission_20190811.csv\n", 32 | "print()\n", 33 | "!head ./predictions/final_submission_20190811.csv\n", 34 | "!head /home/kesci/input/bytedance/bytedance_contest.final_2.csv\n", 35 | "print()\n", 36 | "!tail ./predictions/final_submission_20190811.csv\n", 37 | "!tail /home/kesci/input/bytedance/bytedance_contest.final_2.csv" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 13, 43 | "metadata": { 44 | "id": "24CDE881EEC648168F273287B4921514", 45 | "collapsed": false, 46 | "scrolled": true 47 
| }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Collecting fuzzywuzzy\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl\nInstalling collected packages: fuzzywuzzy\nSuccessfully installed fuzzywuzzy-0.17.0\nCollecting jellyfish\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/3f/80/bcacc7affb47be7279d7d35225e1a932416ed051b315a7f9df20acf04cbe/jellyfish-0.7.2.tar.gz (133kB)\n\u001b[K 100% |████████████████████████████████| 143kB 7.6MB/s eta 0:00:01\n\u001b[?25hBuilding wheels for collected packages: jellyfish\n Running setup.py bdist_wheel for jellyfish ... \u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/bc/b7/78/6736d761d7635d2af9579e040f342b5482850d856d26cbefa3\nSuccessfully built jellyfish\nInstalling collected packages: jellyfish\nSuccessfully installed jellyfish-0.7.2\nCollecting pyemd\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c0/c5/7fea8e7a71cd026b30ed3c40e4c5ea13a173e28f8855da17e25271e8f545/pyemd-0.5.1.tar.gz (91kB)\n\u001b[K 100% |████████████████████████████████| 92kB 7.5MB/s eta 0:00:01\n\u001b[?25hRequirement already satisfied: numpy<2.0.0,>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from pyemd)\nBuilding wheels for collected packages: pyemd\n Running setup.py bdist_wheel for pyemd ... \u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/94/20/c1/ccdf0e9878f5c76def850603e62a572746036829f8353804bd\nSuccessfully built pyemd\nInstalling collected packages: pyemd\nSuccessfully installed pyemd-0.5.1\nCollecting python-levenshtein\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)\n\u001b[K 100% |████████████████████████████████| 51kB 5.9MB/s eta 0:00:01\n\u001b[?25hRequirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from python-levenshtein)\nBuilding wheels for collected packages: python-levenshtein\n Running setup.py bdist_wheel for python-levenshtein ... 
\u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/ef/af/8f/b3250804480b8d14ca55d436129a2fb53798a0ae9287b686c0\nSuccessfully built python-levenshtein\nInstalling collected packages: python-levenshtein\nSuccessfully installed python-levenshtein-0.12.0\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# !pip install fuzzywuzzy -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 59 | "# !pip install jellyfish -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 60 | "# !pip install pyemd -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 61 | "# !pip install python-levenshtein -i https://pypi.tuna.tsinghua.edu.cn/simple" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 1, 67 | "metadata": { 68 | "cell_type": "code", 69 | "id": "A5B4D140EABC45748C3C9FDBD11352E9", 70 | "collapsed": false, 71 | "scrolled": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "CPU times: user 1.3 s, sys: 596 ms, total: 1.9 s\nWall time: 1.45 s\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%%time\r\n", 84 | "\r\n", 85 | "import os\r\n", 86 | "import sys\r\n", 87 | "import math\r\n", 88 | "import pickle\r\n", 89 | "import collections\r\n", 90 | "import gc\r\n", 91 | "import joblib\r\n", 92 | "import gzip\r\n", 93 | "import xgboost as xgb\r\n", 94 | "import seaborn as sns\r\n", 95 | "import matplotlib.pyplot as plt\r\n", 96 | "import pandas as pd\r\n", 97 | "import numpy as np\r\n", 98 | "import lightgbm as lgb\r\n", 99 | "from tqdm import tqdm\r\n", 100 | "# from fuzzywuzzy import fuzz\r\n", 101 | "from datetime import datetime\r\n", 102 | "from multiprocessing import Pool\r\n", 103 | "# from jellyfish import jaro_distance, jaro_winkler\r\n", 104 | "from scipy.spatial.distance import cosine, euclidean, cityblock\r\n", 105 | "\r\n", 106 | "import gensim\r\n", 107 | "from gensim.corpora import Dictionary\r\n", 108 | "from gensim.models import TfidfModel, FastText, KeyedVectors\r\n", 109 | "from gensim.models.word2vec import Word2Vec, PathLineSentences, LineSentence\r\n", 110 | "\r\n", 111 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\r\n", 112 | "from sklearn.metrics import roc_auc_score, precision_score\r\n", 113 | "from sklearn.ensemble import RandomForestClassifier\r\n", 114 | "from sklearn.externals import joblib\r\n", 115 | "\r\n", 116 | "import networkx as nx\r\n", 117 | "from networkx.readwrite.gpickle import write_gpickle, read_gpickle\r\n", 118 | "\r\n", 119 | "import warnings\r\n", 120 | "warnings.filterwarnings('ignore', category=Warning)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "CB3D344A4EC943C0A0C15B9A2A12B5D2", 127 | "mdEditEnable": false 128 | }, 129 | "source": [ 130 | "# 常量初始化" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 2, 136 | "metadata": { 137 | "id": "3F0E11F980C4407BBE75220A36839EB2", 138 | "collapsed": false, 139 | "scrolled": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\nWall time: 5.72 µs\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "%%time\r\n", 152 | "test_data_path = '/home/kesci/input/bytedance/test_final_part1.csv'\r\n", 153 | "train_data_path = './split/10kw.csv'\r\n", 154 | "final_data_path = '/home/kesci/input/bytedance/bytedance_contest.final_2.csv'" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": 
"A79292BD4C6743DB87694C0BF6317C01", 161 | "mdEditEnable": false 162 | }, 163 | "source": [ 164 | "# 数据预处理" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "2AC4F400337042CD9995DFAA8F54ED87", 171 | "mdEditEnable": false 172 | }, 173 | "source": [ 174 | "## 获取query和title的set集合——train" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 16, 180 | "metadata": { 181 | "id": "0F3B4DE0DBCE4646A79321325D56F5B3", 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\nWall time: 10.3 µs\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "%%time\r\n", 195 | "def get_unique(filepath):\r\n", 196 | " print('-----------------reading {}-----------------'.format(filepath))\r\n", 197 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 198 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 199 | " sentences = set()\r\n", 200 | " query = set()\r\n", 201 | " title = set()\r\n", 202 | " print('------------------parse data------------------')\r\n", 203 | " i = 0\r\n", 204 | " for chunk in data:\r\n", 205 | " if i%10 == 0: print(i)\r\n", 206 | " i+=1\r\n", 207 | " query |= set(chunk[1].unique().tolist())\r\n", 208 | " title |= set(chunk[3].unique().tolist())\r\n", 209 | " sentences = query | title\r\n", 210 | " \r\n", 211 | " print('unique query size', query.__len__())\r\n", 212 | " print('unique title size', title.__len__())\r\n", 213 | " print('unique sentences size', sentences.__len__())\r\n", 214 | " # print('---------------Writing to csv file---------------')\r\n", 215 | "\r\n", 216 | " # with open(handled_path + 'sentences.csv', 'w', encoding='utf-8') as f:\r\n", 217 | " # for item in sentences:\r\n", 218 | " # f.write(item)\r\n", 219 | " # f.write('\\n')\r\n", 220 | " return query, title, sentences\r\n", 221 | "test_query, test_title, test_sentence = get_unique(test_data_path)\r\n", 222 | "train_query, train_title, train_sentence = get_unique(train_data_path)\r\n", 223 | "final_query, final_title, final_sentence = get_unique(final_data_path)\r\n", 224 | "\r\n", 225 | "sentence = test_sentence | train_sentence | final_sentence\r\n", 226 | "print(sentence.__len__())\r\n", 227 | "with open('./handled_data/all_sentences.csv', 'w', encoding='utf-8') as f:\r\n", 228 | " for item in sentence:\r\n", 229 | " f.write(item)\r\n", 230 | " f.write('\\n')" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "id": "F4AA3A285E81414588A75871F76BADC1", 237 | "mdEditEnable": false 238 | }, 239 | "source": [ 240 | "## wordcount" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "id": "702619C5323F48EB86F7B69F60054BB0" 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "word_counter = collections.Counter()\r\n", 252 | "\r\n", 253 | "def word_count(filepath):\r\n", 254 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 255 | " for chunk in tqdm(data):\r\n", 256 | " for item in chunk[1].unique():\r\n", 257 | " for word in item.split():\r\n", 258 | " word_counter[word] += 1\r\n", 259 | " for item in chunk[3].tolist():\r\n", 260 | " for word in item.split():\r\n", 261 | " word_counter[word] += 1\r\n", 262 | " \r\n", 263 | "def get_weight(count, eps=10000, min_count=2):\r\n", 264 | " return 0 if count < min_count else 1/(count + eps)\r\n", 265 | " \r\n", 266 | 
"word_count(test_data_path)\r\n", 267 | "word_count(train_data_path)\r\n", 268 | "word_count(final_data_path)\r\n", 269 | "\r\n", 270 | "weights = {word : get_weight(count) for word, count in word_counter.items()}" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "id": "816FB6F2013E4F7187CD3AC4DCECFC2A" 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "def word_shares(row):\r\n", 282 | " q1_list = str(row['question1']).lower().split()\r\n", 283 | " q1 = set(q1_list)\r\n", 284 | " q1words = q1.difference(stops)\r\n", 285 | " if len(q1words) == 0:\r\n", 286 | " return '0:0:0:0:0:0:0:0'\r\n", 287 | "\r\n", 288 | " q2_list = str(row['question2']).lower().split()\r\n", 289 | " q2 = set(q2_list)\r\n", 290 | " q2words = q2.difference(stops)\r\n", 291 | " if len(q2words) == 0:\r\n", 292 | " return '0:0:0:0:0:0:0:0'\r\n", 293 | "\r\n", 294 | " words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))\r\n", 295 | "\r\n", 296 | " q1stops = q1.intersection(stops)\r\n", 297 | " q2stops = q2.intersection(stops)\r\n", 298 | "\r\n", 299 | " q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])\r\n", 300 | " q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])\r\n", 301 | "\r\n", 302 | " shared_2gram = q1_2gram.intersection(q2_2gram)\r\n", 303 | "\r\n", 304 | " shared_words = q1words.intersection(q2words)\r\n", 305 | " shared_weights = [weights.get(w, 0) for w in shared_words]\r\n", 306 | " q1_weights = [weights.get(w, 0) for w in q1words]\r\n", 307 | " q2_weights = [weights.get(w, 0) for w in q2words]\r\n", 308 | " total_weights = q1_weights + q1_weights\r\n", 309 | "\r\n", 310 | " R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share\r\n", 311 | " R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share\r\n", 312 | " R31 = len(q1stops) / len(q1words) #stops in q1\r\n", 313 | " R32 = len(q2stops) / len(q2words) #stops in q2\r\n", 314 | " Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))\r\n", 315 | " Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator\r\n", 316 | " if len(q1_2gram) + len(q2_2gram) == 0:\r\n", 317 | " R2gram = 0\r\n", 318 | " else:\r\n", 319 | " R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))\r\n", 320 | " return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "id": "58E84E29DB9A4779B04D1D0608625D1C", 327 | "mdEditEnable": false 328 | }, 329 | "source": [ 330 | "# 生成无向图" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "id": "48483DE7E283464194017A7FC7AA040D", 337 | "mdEditEnable": false 338 | }, 339 | "source": [ 340 | "## 准备数据——无向图生成" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 1, 346 | "metadata": { 347 | "id": "EFC4238523B94860B94B2C0B1E78E2A4", 348 | "collapsed": false, 349 | "scrolled": false 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "start reading seq2id\nend reading seq2id\nseq2id with length ---> 87907776\nCPU times: user 1min 31s, sys: 14.5 s, total: 1min 45s\nWall time: 2min 4s\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "%%time\n", 362 | "print('start reading seq2id')\n", 363 | "seq2id = {}\n", 364 | "with open('./handled_data/all_sentences.csv', 'r', encoding='utf-8') as 
f:\n", 365 | " for i, sentence in enumerate(f.readlines()):\n", 366 | " seq2id[sentence.strip()] = i\n", 367 | "print('end reading seq2id')\n", 368 | "print('seq2id with length --->',len(seq2id))" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": { 374 | "id": "7377F073719A4D9E85A8E2AC1BF9A80B", 375 | "mdEditEnable": false 376 | }, 377 | "source": [ 378 | "## 文本转ID" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 26, 384 | "metadata": { 385 | "id": "CE5D7FCDC3474DD5B96D9BE64D271CD3", 386 | "collapsed": false, 387 | "scrolled": false 388 | }, 389 | "outputs": [ 390 | { 391 | "name": "stdout", 392 | "output_type": "stream", 393 | "text": [ 394 | "2019-08-04 18:25:18.316136\nstart transform data\n0\n10\n 0 1\ncount 2.000000e+07 2.000000e+07\nmean 5.122025e+07 6.027587e+07\nstd 2.944711e+07 2.793264e+07\nmin 1.060000e+02 0.000000e+00\n25% 2.575488e+07 3.862078e+07\n50% 5.130830e+07 6.404053e+07\n75% 7.676274e+07 8.443098e+07\nmax 1.019757e+08 1.019757e+08\nCPU times: user 1min 18s, sys: 1.86 s, total: 1min 20s\nWall time: 1min 28s\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "%%time\n", 400 | "print(datetime.now())\n", 401 | "def transform(filepath):\n", 402 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\n", 403 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 404 | " result = []\n", 405 | " print('start transform data')\n", 406 | " idx = 0\n", 407 | " for chunk in data:\n", 408 | " if idx%10==0: print(idx)\n", 409 | " idx += 1\n", 410 | " result.append(pd.DataFrame({0:chunk[1].apply(lambda x : seq2id[x.strip()]), \n", 411 | " 1:chunk[3].apply(lambda x : seq2id[x.strip()])}))\n", 412 | " return result\n", 413 | "# id = pd.concat(transform(test_data_path))\n", 414 | "print(id.describe())\n", 415 | "id.to_csv('./handled_data/id_test.csv', header=None, index=False)\n", 416 | "!wc -l ./handled_data/id_test.csv" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": { 422 | "id": "AA79B229CB7A4F4E9E91B07BFE51D406", 423 | "mdEditEnable": false 424 | }, 425 | "source": [ 426 | "## 生成无向图" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 3, 432 | "metadata": { 433 | "id": "5A8E5EEB63F44D558A48483A78E43597", 434 | "collapsed": false, 435 | "scrolled": true 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "2019-08-04 20:09:48.426697\ngenerate train graph\n2019-08-04 20:10:11.280335 ---> 0\n2019-08-04 20:10:16.641768 ---> 1\n2019-08-04 20:10:21.355543 ---> 2\n2019-08-04 20:10:27.104319 ---> 3\n2019-08-04 20:10:31.788189 ---> 4\n2019-08-04 20:10:36.931844 ---> 5\n2019-08-04 20:10:40.341101 ---> 6\n2019-08-04 20:10:46.926877 ---> 7\n2019-08-04 20:10:53.250733 ---> 8\n2019-08-04 20:10:56.573122 ---> 9\n2019-08-04 20:10:59.920231 ---> 10\n2019-08-04 20:11:07.224468 ---> 11\n2019-08-04 20:11:10.635839 ---> 12\n2019-08-04 20:11:14.068377 ---> 13\n2019-08-04 20:11:24.505138 ---> 14\n2019-08-04 20:11:27.838067 ---> 15\n2019-08-04 20:11:31.123212 ---> 16\n2019-08-04 20:11:34.418728 ---> 17\n2019-08-04 20:11:44.396497 ---> 18\n2019-08-04 20:11:47.722042 ---> 19\n2019-08-04 20:11:51.068181 ---> 20\n2019-08-04 20:11:54.431964 ---> 21\n2019-08-04 20:11:57.809854 ---> 22\n2019-08-04 20:12:10.010839 ---> 23\n2019-08-04 20:12:13.399495 ---> 24\n2019-08-04 20:12:16.806213 ---> 25\n2019-08-04 20:12:20.229955 ---> 26\n2019-08-04 20:12:23.646422 ---> 27\n2019-08-04 20:12:27.061730 ---> 28\n2019-08-04 
20:12:30.500476 ---> 29\nfinish generate train graph, temporaray graph size -> 21236030 29997376\ngenerate final data graph\n[... per-chunk timestamp progress lines (chunks 0-94) elided ...]\n2019-08-04 
20:20:20.098426 ---> 95\n2019-08-04 20:20:23.469628 ---> 96\n2019-08-04 20:20:26.833314 ---> 97\n2019-08-04 20:20:30.207836 ---> 98\n2019-08-04 20:20:33.582591 ---> 99\nfinish generate final graph, temporaray graph size -> 59028158 129984026\nsave graph\nCPU times: user 13min 57s, sys: 51.8 s, total: 14min 49s\nWall time: 15min 5s\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "%%time\n", 448 | "print(datetime.now())\n", 449 | "G = nx.Graph()\n", 450 | " \n", 451 | "def make_graph_parallel():\n", 452 | " CHUNK_SIZE = 1000000\n", 453 | " # test_data = pd.read_csv('./handled_data/id_test.csv', chunksize=CHUNK_SIZE, header=None)\n", 454 | " # test_data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 455 | " \n", 456 | " # print('generate test graph')\n", 457 | " # idx = 0\n", 458 | " # print(datetime.now())\n", 459 | " # for chunk in test_data:\n", 460 | " # print(datetime.now(),'--->',idx)\n", 461 | " # idx += 1 \n", 462 | " # ziped = list(zip(chunk[0], chunk[1]))\n", 463 | " # G.add_edges_from(ziped)\n", 464 | " \n", 465 | " # del chunk\n", 466 | " # del ziped\n", 467 | " # print('finish generate test graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 468 | " # del test_data\n", 469 | " # gc.collect()\n", 470 | " \n", 471 | " data = pd.read_csv('./handled_data/id_train.csv', chunksize=CHUNK_SIZE, header=None)\n", 472 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 473 | "\n", 474 | " print('generate train graph')\n", 475 | " idx = 0\n", 476 | " # for chunk in data:\n", 477 | " for i in range(100):\n", 478 | " chunk = data.get_chunk()\n", 479 | " if i < 70: continue\n", 480 | " print(datetime.now(), '--->', idx)\n", 481 | " idx += 1\n", 482 | " ziped = list(zip(chunk[0], chunk[1]))\n", 483 | " G.add_edges_from(ziped)\n", 484 | " \n", 485 | " del chunk\n", 486 | " del ziped\n", 487 | " del data\n", 488 | " gc.collect()\n", 489 | " print('finish generate train graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 490 | " \n", 491 | " data = pd.read_csv('./handled_data/id_final.csv', chunksize=CHUNK_SIZE, header=None)\n", 492 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 493 | " print('generate final data graph')\n", 494 | " idx = 0\n", 495 | " for chunk in data:\n", 496 | " print(datetime.now(), '--->', idx)\n", 497 | " idx += 1\n", 498 | " ziped = list(zip(chunk[0], chunk[1]))\n", 499 | " G.add_edges_from(ziped)\n", 500 | " \n", 501 | " del chunk\n", 502 | " del ziped\n", 503 | " print('finish generate final graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 504 | " \n", 505 | " del data\n", 506 | " gc.collect()\n", 507 | " print('save graph')\n", 508 | " write_gpickle(G, './handled_data/final_graph.pkl')\n", 509 | " \n", 510 | "make_graph_parallel()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": { 516 | "id": "8D8EA63E6CA2463C9485424F7D635E46", 517 | "mdEditEnable": false 518 | }, 519 | "source": [ 520 | "## 生成pagerank模型" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 5, 526 | "metadata": { 527 | "id": "07291A2657994E6095D81705A142B993", 528 | "collapsed": false, 529 | "scrolled": false 530 | }, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "2019-08-04 20:26:20.708101\nend generate pagerank value\nCPU times: user 21min, sys: 51.1 s, total: 21min 52s\nWall time: 21min 51s\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 
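make_graph_parallel above streams the id files in 1M-row chunks and adds each (query_id, title_id) pair as an edge of one undirected graph. A minimal sketch of the same construction:

import pandas as pd
import networkx as nx

def build_pair_graph(paths, chunksize=1_000_000):
    """Build one undirected graph whose nodes are sentence ids and whose
    edges are the (query_id, title_id) pairs found in the id files."""
    G = nx.Graph()
    for path in paths:
        reader = pd.read_csv(path, header=None, chunksize=chunksize)
        for chunk in reader:
            G.add_edges_from(zip(chunk[0], chunk[1]))
    return G

# Hypothetical usage mirroring the cell above:
# G = build_pair_graph(['./handled_data/id_train.csv', './handled_data/id_final.csv'])
# nx.write_gpickle(G, './handled_data/final_graph.pkl')   # networkx < 3.0, as in the notebook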
541 | "%%time\r\n", 542 | "print(datetime.now())\r\n", 543 | "graph = read_gpickle('./handled_data/final_graph.pkl')\r\n", 544 | "print(datetime.now(),'---> end load graph')\r\n", 545 | "page_rank = nx.pagerank_scipy(graph)\r\n", 546 | "print('end generate pagerank value')\r\n", 547 | "pickle.dump(page_rank, open('./handled_data/pagerank_final.model', 'wb'))" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "id": "66D2DD90D6744BDF9ADD4D6E6588003C", 554 | "mdEditEnable": false 555 | }, 556 | "source": [ 557 | "## 生成HITS模型" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 1, 563 | "metadata": { 564 | "id": "A671B600F9154D908E108D9B14317892", 565 | "collapsed": false 566 | }, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "2019-07-27 03:39:19.758518\nend load graph\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "%%time\r\n", 578 | "print(datetime.datetime.now())\r\n", 579 | "graph = joblib.load('./models/un_direct_graph.model')\r\n", 580 | "print('end load graph')\r\n", 581 | "hits_h, hits_a = nx.hits(graph)\r\n", 582 | "print('end generate HITS value')\r\n", 583 | "joblib.dump(hits_h, './models/hub_value.model')\r\n", 584 | "joblib.dump(hits_a, './models/authority_value.model')" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "id": "0B5EB347F584409BB654A641F66B1B2A", 591 | "mdEditEnable": false 592 | }, 593 | "source": [ 594 | "## 数据准备——特征生成" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 14, 600 | "metadata": { 601 | "id": "D9427953B24748D5892B3709E233DC89", 602 | "collapsed": false, 603 | "scrolled": false 604 | }, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "CPU times: user 52.5 s, sys: 12.3 s, total: 1min 4s\nWall time: 1min 5s\n" 611 | ] 612 | } 613 | ], 614 | "source": [ 615 | "%%time\r\n", 616 | "# graph = read_gpickle('./handled_data/final/final_graph.pkl')\r\n", 617 | "# print('end load graph')\r\n", 618 | "# seq2id = {}\r\n", 619 | "# with open('./handled_data/all_sentences_verseion_2.csv', 'r', encoding='utf-8') as f:\r\n", 620 | "# for i, sentence in enumerate(f.readlines()):\r\n", 621 | "# seq2id[sentence.strip()] = i\r\n", 622 | "# print('end seq2id')\r\n", 623 | "def gen_degrees():\r\n", 624 | " max_degrees = {}\r\n", 625 | " edges = graph.edges()\r\n", 626 | " for edge in edges:\r\n", 627 | " for n in edge:\r\n", 628 | " max_degrees[n] = max_degrees.get(n, 0) + 1\r\n", 629 | " return max_degrees\r\n", 630 | "\r\n", 631 | "def gen_components():\r\n", 632 | " max_components = {}\r\n", 633 | " components = nx.connected_components(graph)\r\n", 634 | " for component in components:\r\n", 635 | " for n in component:\r\n", 636 | " max_components[n] = max(max_components.get(n, 0), len(component))\r\n", 637 | " return max_components\r\n", 638 | "\r\n", 639 | "def gen_hits():\r\n", 640 | " hits_h, hits_a = nx.hits(graph, max_iter=500)\r\n", 641 | " return hits_h, hits_a\r\n", 642 | "\r\n", 643 | "# max_degrees = gen_degrees()\r\n", 644 | "# pickle.dump(max_degrees, open('./handled_data/max_degree_final', 'wb'))\r\n", 645 | "# print('end degree')\r\n", 646 | "# max_components = gen_components()\r\n", 647 | "# pickle.dump(max_components, open('./handled_data/max_components_final', 'wb'))\r\n", 648 | "# print('end components')\r\n", 649 | "# del graph\r\n", 650 | "# gc.collect()\r\n", 651 | "\r\n", 652 | "max_degrees = 
pickle.load(open('./handled_data/final/max_degree_final', 'rb'))\r\n", 653 | "max_components = pickle.load(open('./handled_data/final/max_components_final', 'rb'))\r\n", 654 | "page_rank = pickle.load(open('./handled_data/final/pagerank_final.model', 'rb'))\r\n", 655 | "# print('end pagerank')\r\n", 656 | "# hits_h, hits_a = gen_hits()\r\n", 657 | "\r\n", 658 | "# max_degree max_components\r\n", 659 | "def calculate_statistics(row):\r\n", 660 | " return [max_degrees[row[0]], max_degrees[row[1]], max_components[row[0]]] \r\n", 661 | " \r\n", 662 | "# PageRank \r\n", 663 | "def calculate_pagerank(row):\r\n", 664 | " return [page_rank[row[0]] * 1e6, page_rank[row[1]] * 1e6] \r\n", 665 | "\r\n", 666 | "# Hits\r\n", 667 | "def calculate_hits(row):\r\n", 668 | " hits_h_1 = hits_h[row[0]] * 1e6\r\n", 669 | " hits_a_1 = hits_a[row[0]] * 1e6\r\n", 670 | " hits_h_2 = hits_h[row[1]] * 1e6\r\n", 671 | " hits_a_2 = hits_a[row[1]] * 1e6\r\n", 672 | " return [hits_h_1, hits_a_1, hits_h_2, hits_a_2]\r\n", 673 | "\r\n", 674 | "# ShortestPath\r\n", 675 | "def calculate_shortestpath(row):\r\n", 676 | " graph.remove_edge(row[0], row[1])\r\n", 677 | " if nx.has_path(graph, row[0], row[1]):\r\n", 678 | " shortest_path = len(nx.shortest_path(graph, row[0], row[1]))\r\n", 679 | " else:\r\n", 680 | " shortest_path = -1\r\n", 681 | " graph.add_edge(row[0], row[1])\r\n", 682 | " return [shortest_path]\r\n", 683 | "\r\n", 684 | "# Neighbour\r\n", 685 | "def calculate_neighbour(row):\r\n", 686 | " neighbor_1 = set(graph.neighbors(row[0]))\r\n", 687 | " neighbor_2 = set(graph.neighbors(row[1]))\r\n", 688 | " return [len(neighbor_1), len(neighbor_2), len(neighbor_1 | neighbor_2)]" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": { 694 | "id": "049F004A503C405A85B47CBD208D9567", 695 | "mdEditEnable": false 696 | }, 697 | "source": [ 698 | "## 生成图特征" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 17, 704 | "metadata": { 705 | "id": "D36B363E31CD476C870BEDF3958CCD4A", 706 | "collapsed": false, 707 | "scrolled": true 708 | }, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "start generate feature\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n(30000000, 5)\n 0 1 2 0 1\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07\nmean 1.274293e+02 1.106532e+01 5.754290e+07 1.234310e+00 2.669189e-02\nstd 3.819820e+03 3.443744e+02 5.868692e+06 3.824809e+01 3.425827e+00\nmin 1.000000e+00 1.000000e+00 2.000000e+00 2.574202e-03 2.541272e-03\n25% 7.000000e+00 1.000000e+00 5.814144e+07 4.419803e-02 3.981151e-03\n50% 1.600000e+01 3.000000e+00 5.814144e+07 8.919203e-02 6.508193e-03\n75% 2.000000e+01 7.000000e+00 5.814144e+07 1.496774e-01 1.367823e-02\nmax 1.289980e+05 1.289980e+05 5.814144e+07 1.291623e+03 1.291623e+03\nCPU times: user 6min 29s, sys: 21.1 s, total: 6min 50s\nWall time: 7min 1s\n" 715 | ] 716 | }, 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "14" 721 | ] 722 | }, 723 | "execution_count": 17, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "%%time\r\n", 730 | "gc.collect()\r\n", 731 | "CHUNKSIZE = 1000000\r\n", 732 | "data = pd.read_csv('./handled_data/id/id_train.csv', chunksize=CHUNKSIZE, header=None)\r\n", 733 | "data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 734 | "pool = Pool()\r\n", 735 | "res = []\r\n", 736 | "idx = 0\r\n", 737 | "for i in 
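The calculate_* helpers above turn those node statistics into per-pair features; the shortest-path one temporarily removes the direct query-title edge so the feature measures an indirect connection. A consolidated sketch (the dictionaries are the ones loaded above; the has_edge guard is an addition so the function also works for pairs that are not edges):

import networkx as nx

def pair_graph_features(G, u, v, degrees, comp_size, pagerank):
    """Per-pair features: endpoint degrees, component size, scaled PageRank,
    indirect shortest-path length, and neighbourhood sizes."""
    feats = [degrees.get(u, 0), degrees.get(v, 0), comp_size.get(u, 0),
             pagerank.get(u, 0.0) * 1e6, pagerank.get(v, 0.0) * 1e6]

    had_edge = G.has_edge(u, v)
    if had_edge:                       # drop the direct edge, measure, restore
        G.remove_edge(u, v)
    try:
        # len(shortest_path) counts nodes on the path, as in calculate_shortestpath
        sp = len(nx.shortest_path(G, u, v)) if nx.has_path(G, u, v) else -1
    finally:
        if had_edge:
            G.add_edge(u, v)
    feats.append(sp)

    n_u, n_v = set(G.neighbors(u)), set(G.neighbors(v))
    feats += [len(n_u), len(n_v), len(n_u | n_v)]
    return feats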
range(70):\r\n", 738 | " data.get_chunk()\r\n", 739 | "print('start generate feature')\r\n", 740 | "# for chunk in data:\r\n", 741 | "for i in range(30):\r\n", 742 | " chunk = data.get_chunk()\r\n", 743 | " print(idx)\r\n", 744 | " idx += 1\r\n", 745 | " df = []\r\n", 746 | " ziped = list(zip(chunk[0], chunk[1]))\r\n", 747 | " # ziped = list(zip(chunk[1].apply(lambda x:seq2id[x.strip()]).tolist(), \r\n", 748 | " # chunk[3].apply(lambda x:seq2id[x.strip()]).tolist()))\r\n", 749 | " \r\n", 750 | " df.append(pd.DataFrame(pool.map(calculate_statistics, ziped)))\r\n", 751 | " df.append(pd.DataFrame(pool.map(calculate_pagerank, ziped)))\r\n", 752 | " \r\n", 753 | " # df.append(pd.DataFrame(pool.map(calculate_hits, ziped)))\r\n", 754 | " # df.append(pd.DataFrame(pool.map(calculate_neighbour, ziped)))\r\n", 755 | " \r\n", 756 | " res.append(pd.concat(df, axis=1))\r\n", 757 | " del ziped\r\n", 758 | " del chunk\r\n", 759 | " del df\r\n", 760 | " gc.collect()\r\n", 761 | " \r\n", 762 | "graph_features = pd.concat(res)\r\n", 763 | "graph_features.to_csv('./handled_data/train_feature/train_feature_graph_final.csv', header=None, index=False)\r\n", 764 | "print(graph_features.shape)\r\n", 765 | "print(graph_features.describe())\r\n", 766 | "del graph_features\r\n", 767 | "gc.collect()" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": { 773 | "id": "2BF16F8FC7694CAB9237103C777228F4", 774 | "collapsed": false, 775 | "mdEditEnable": false 776 | }, 777 | "source": [ 778 | "# 一般特征" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": { 784 | "id": "779E3B5AEB4A4C78893F3321105A4A8C", 785 | "mdEditEnable": false 786 | }, 787 | "source": [ 788 | "## 准备数据" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 5, 794 | "metadata": { 795 | "id": "A34605E7A9914661966CE15A3D173E34", 796 | "collapsed": false, 797 | "scrolled": false 798 | }, 799 | "outputs": [ 800 | { 801 | "name": "stdout", 802 | "output_type": "stream", 803 | "text": [ 804 | "loaded dict\nloaded word2vec\nloaded TF-IDF\n----------------initialized models----------------\nCPU times: user 9.07 s, sys: 4.41 s, total: 13.5 s\nWall time: 27.4 s\n" 805 | ] 806 | } 807 | ], 808 | "source": [ 809 | "%%time\r\n", 810 | "dictionary = Dictionary.load('./models/word_dict.dict') # gensim Dictionary\r\n", 811 | "print('loaded dict')\r\n", 812 | "w2v_model = KeyedVectors.load('/home/kesci/work/word_vectors/w2v-300d-new.txt')\r\n", 813 | "print('loaded word2vec')\r\n", 814 | "tfidf_model = TfidfModel.load('./models/tfidf.model')\r\n", 815 | "print('loaded TF-IDF')\r\n", 816 | "fasttext_model = FastText.load('./word_vectors/fast-300d-new.txt')\r\n", 817 | "# tfidf = joblib.load(model_path + 'scikit_tfidf_model.m')\r\n", 818 | "print('----------------initialized models----------------')\r\n", 819 | "\r\n", 820 | "def init_prob():\r\n", 821 | " CHUNK_SIZE = 1000000\r\n", 822 | " reader = pd.read_csv('./split/10kw.csv',\r\n", 823 | " chunksize=CHUNK_SIZE,\r\n", 824 | " header=None,\r\n", 825 | " names=['query_id','query','query_title_id','title','label'])\r\n", 826 | " \r\n", 827 | " totalCounter = collections.Counter()\r\n", 828 | " posCounter = collections.Counter()\r\n", 829 | " \r\n", 830 | " idx = 0\r\n", 831 | " for chunk in reader:\r\n", 832 | " # if idx == 10: break\r\n", 833 | " print(idx)\r\n", 834 | " idx += 1\r\n", 835 | " query = chunk['query'].apply(lambda x : x.split()).values.tolist()\r\n", 836 | " title = chunk['title'].apply(lambda x : x.split()).values.tolist()\r\n", 837 | " label 
= chunk['label'].values.tolist()\r\n", 838 | " for i in range(CHUNK_SIZE):\r\n", 839 | " for word in query[i]:\r\n", 840 | " totalCounter[word] += 1\r\n", 841 | " if label[i] == 1:\r\n", 842 | " posCounter[word] += 1\r\n", 843 | " for word in title[i]:\r\n", 844 | " totalCounter[word] += 1\r\n", 845 | " if label[i] == 1:\r\n", 846 | " posCounter[word] += 1\r\n", 847 | " prob = {}\r\n", 848 | " for key, value in posCounter.items():\r\n", 849 | " prob[key] = value / totalCounter[key]\r\n", 850 | " return prob \r\n", 851 | "# pos_prob = init_prob() # 需要计算出这个单词出现之后label为1的概率\r\n", 852 | "# file = open(handled_path + 'pos_10kw.pkl', 'wb+')\r\n", 853 | "# pickle.dump(pos_prob, file)\r\n", 854 | "\r\n", 855 | "# file = open('./handled_data/pos_10kw.pkl', 'rb+')\r\n", 856 | "# pos_prob = pickle.load(file)\r\n", 857 | "# print('----------------initialized prob----------------')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": { 863 | "id": "37F857D3CDA0457187A536D291796771", 864 | "mdEditEnable": false 865 | }, 866 | "source": [ 867 | "## powerful word" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 3, 873 | "metadata": { 874 | "id": "9A946F0C3D1F408C851722616112EFFA", 875 | "collapsed": false, 876 | "scrolled": false 877 | }, 878 | "outputs": [ 879 | { 880 | "name": "stdout", 881 | "output_type": "stream", 882 | "text": [ 883 | "CPU times: user 2.1 s, sys: 204 ms, total: 2.3 s\nWall time: 2.3 s\n" 884 | ] 885 | } 886 | ], 887 | "source": [ 888 | "%%time\r\n", 889 | "# words_power = {}\r\n", 890 | "# def parse(row):\r\n", 891 | "# label = int(row[2])\r\n", 892 | "# q1_words = row[0]\r\n", 893 | "# q2_words = row[1]\r\n", 894 | "# q1_words = set(q1_words)\r\n", 895 | "# q2_words = set(q2_words)\r\n", 896 | "# all_words = q1_words |q2_words\r\n", 897 | "# for word in all_words:\r\n", 898 | "# if word not in words_power:\r\n", 899 | "# words_power[word] = [0. 
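init_prob above estimates, for each token, the fraction of its occurrences that come from rows labelled 1 (later multiplied together in sameWord). A compact version; note it iterates the actual chunk rows, whereas the cell's range(CHUNK_SIZE) loop would over-run if the last chunk is shorter.

import collections
import pandas as pd

def positive_word_prob(path, chunksize=1_000_000):
    """P(label == 1 | token occurs), counted over both query and title tokens."""
    total, pos = collections.Counter(), collections.Counter()
    cols = ['query_id', 'query', 'query_title_id', 'title', 'label']
    for chunk in pd.read_csv(path, header=None, names=cols, chunksize=chunksize):
        for q, t, y in zip(chunk['query'], chunk['title'], chunk['label']):
            for w in q.split() + t.split():
                total[w] += 1
                if y == 1:
                    pos[w] += 1
    return {w: pos[w] / c for w, c in total.items()}

# pos_prob = positive_word_prob('./split/10kw.csv')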
for i in range(5)]\r\n", 900 | "# words_power[word][0] += 1.\r\n", 901 | "\r\n", 902 | "# if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)):\r\n", 903 | "# words_power[word][1] += 1.\r\n", 904 | "# if 1 == label:\r\n", 905 | "# words_power[word][2] += 1.\r\n", 906 | "# if (word in q1_words) and (word in q2_words):\r\n", 907 | "# words_power[word][3] += 1.\r\n", 908 | "# if 1 == label:\r\n", 909 | "# words_power[word][4] += 1.\r\n", 910 | "# def generate_powerful_word(filepath):\r\n", 911 | "# data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 912 | "\r\n", 913 | "# for chunk in tqdm(data):\r\n", 914 | "# ziped = list(zip(chunk[1].apply(lambda x:x.split()).tolist(),\r\n", 915 | "# chunk[3].apply(lambda x:x.split()).tolist(),\r\n", 916 | "# chunk[4]))\r\n", 917 | "# for row in ziped:\r\n", 918 | "# parse(row)\r\n", 919 | "# generate_powerful_word('/home/kesci/input/bytedance/train_final.csv')\r\n", 920 | "# for word in tqdm(words_power):\r\n", 921 | "# if words_power[word][1] > 1e-6:\r\n", 922 | "# words_power[word][2] /= words_power[word][1]\r\n", 923 | "# words_power[word][1] /= words_power[word][0]\r\n", 924 | "# if words_power[word][3] > 1e-6:\r\n", 925 | "# words_power[word][4] /= words_power[word][3]\r\n", 926 | "# words_power[word][3] /= words_power[word][0]\r\n", 927 | "# pickle.dump(words_power, open('./handled_data/powerful_words', 'wb'))\r\n", 928 | "\r\n", 929 | "\r\n", 930 | "words_power = pickle.load(open('./handled_data/powerful_words', 'rb'))" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": { 936 | "id": "7DB87679506D455689343D2DA8A6BD78", 937 | "mdEditEnable": false 938 | }, 939 | "source": [ 940 | "## 并行统计特征" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 17, 946 | "metadata": { 947 | "id": "9984EF24EE164E1E9DBAC075C899F2DF", 948 | "collapsed": false, 949 | "scrolled": false 950 | }, 951 | "outputs": [], 952 | "source": [ 953 | "# 获取 sentence 的tfidf值\r\n", 954 | "def getTFIDF(words):\r\n", 955 | " tfidf = tfidf_model[dictionary.doc2bow(words)]\r\n", 956 | " dic = {word:0 for word in words}\r\n", 957 | " sum_weight = 1e-9\r\n", 958 | " for idx, val in tfidf:\r\n", 959 | " dic[dictionary[idx]] = val\r\n", 960 | " sum_weight += val\r\n", 961 | " \r\n", 962 | " return dic, sum_weight\r\n", 963 | "\r\n", 964 | "# query与title都出现过的word的词数除以总词数\r\n", 965 | "def concurrence(data):\r\n", 966 | " query = data[0]\r\n", 967 | " title = data[1]\r\n", 968 | " query_words = {}\r\n", 969 | " title_words = {}\r\n", 970 | " for word in query:\r\n", 971 | " query_words[word] = query_words.get(word, 0) + 1\r\n", 972 | " for word in title:\r\n", 973 | " title_words[word] = title_words.get(word, 0) + 1\r\n", 974 | "\r\n", 975 | " shared_query_word = sum([query_words[w] for w in query if w in title])\r\n", 976 | " shared_titel_word = sum([title_words[w] for w in title if w in query])\r\n", 977 | " total = sum(query_words.values()) + sum(title_words.values())\r\n", 978 | "\r\n", 979 | " if 1e-6 > total:\r\n", 980 | " return [0]\r\n", 981 | " else:\r\n", 982 | " return [1.0 * (shared_titel_word + shared_query_word) / total]\r\n", 983 | "\r\n", 984 | "# 编辑距离,词粒度\r\n", 985 | "def levenshteinDistance(data):\r\n", 986 | " query = data[0]\r\n", 987 | " title = data[1]\r\n", 988 | " \r\n", 989 | " len_query = len(query) + 1\r\n", 990 | " len_title = len(title) + 1\r\n", 991 | " dp = [[0] * len_title] * len_query\r\n", 992 | "\r\n", 993 | " for i in range(1, len_query):\r\n", 994 | " 
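The levenshteinDistance helper in this cell builds its table with dp = [[0] * len_title] * len_query, which makes every row the same list object, and it never fills the first row and column, so the returned value is not a true edit distance. A corrected word-level version that keeps the 1 - distance / longer-length normalisation:

def levenshtein_similarity(query_tokens, title_tokens):
    """Word-level edit distance, returned as 1 - dist / max_len."""
    n, m = len(query_tokens), len(title_tokens)
    if max(n, m) == 0:
        return 1.0
    # Each row must be its own list; [[0] * (m + 1)] * (n + 1) would alias rows.
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if query_tokens[i - 1] == title_tokens[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return 1 - dp[n][m] / max(n, m)

# levenshtein_similarity("a b c".split(), "a x c".split())  # -> 1 - 1/3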
for j in range(1, len_title):\r\n", 995 | " deletion = dp[i-1][j] + 1\r\n", 996 | " insertion = dp[i][j-1] + 1\r\n", 997 | " substitution = dp[i-1][j-1]\r\n", 998 | " if query[i-1] != title[j-1]:\r\n", 999 | " substitution += 1\r\n", 1000 | " dp[i][j] = min(deletion, insertion, substitution)\r\n", 1001 | " return [1 - dp[-1][-1] / max(len_title, len_query)]\r\n", 1002 | "\r\n", 1003 | "# sorensen距离\r\n", 1004 | "def sorensenDistance(data):\r\n", 1005 | " query = data[0]\r\n", 1006 | " title = data[1]\r\n", 1007 | " words = set(query) | set(title)\r\n", 1008 | "\r\n", 1009 | " query_dict = {}\r\n", 1010 | " for word in query:\r\n", 1011 | " query_dict[word] = query_dict.get(word, 0) + 1\r\n", 1012 | "\r\n", 1013 | " title_dict = {}\r\n", 1014 | " for word in title:\r\n", 1015 | " title_dict[word] = title_dict.get(word, 0) + 1\r\n", 1016 | "\r\n", 1017 | " total = sum(query_dict.values()) + sum(title_dict.values())\r\n", 1018 | "\r\n", 1019 | " diff = 0\r\n", 1020 | " for word in words:\r\n", 1021 | " diff += abs(query_dict.get(word, 0) - title_dict.get(word, 0))\r\n", 1022 | "\r\n", 1023 | " return [diff / total]\r\n", 1024 | " \r\n", 1025 | "# query和title中的共现词 \r\n", 1026 | "# 计算某个词出现的时候label为1的概率,然后求所有词的概率乘积\r\n", 1027 | "def sameWord(data):\r\n", 1028 | " unique_words = set(data[0]) | set(data[1])\r\n", 1029 | " prob = 1\r\n", 1030 | " for word in unique_words:\r\n", 1031 | " prob *= pos_prob.get(word, 1)\r\n", 1032 | " return [len(unique_words), prob]\r\n", 1033 | "\r\n", 1034 | "# Dice Ochi\r\n", 1035 | "# 1 - 交集除以并集jaccard\r\n", 1036 | "def distance(data):\r\n", 1037 | " query = data[0]\r\n", 1038 | " title = data[1]\r\n", 1039 | " intersection = len([x for x in query if x in title])\r\n", 1040 | " l = len(query) + len(title)\r\n", 1041 | " return [2*intersection / l, intersection / np.sqrt(l),\r\n", 1042 | " 1- (intersection / (l - intersection))]\r\n", 1043 | " \r\n", 1044 | "def fuzzyDistance(data):\r\n", 1045 | " return [\r\n", 1046 | " fuzz.ratio(data[0], data[1]),\r\n", 1047 | " fuzz.partial_ratio(data[0], data[1]),\r\n", 1048 | " fuzz.token_sort_ratio(data[0], data[1]),\r\n", 1049 | " fuzz.token_set_ratio(data[0], data[1]),\r\n", 1050 | " fuzz.partial_token_sort_ratio(data[0], data[1]),\r\n", 1051 | " w2v_model.wmdistance(data[0], data[1]),\r\n", 1052 | " fasttext_model.wmdistance(data[0], data[1])\r\n", 1053 | " ]\r\n", 1054 | "\r\n", 1055 | "def powerfuleWord(data):\r\n", 1056 | " rate_single = 1.0\r\n", 1057 | " rate_double = 1.0\r\n", 1058 | " query = set(data[0])\r\n", 1059 | " title = set(data[1])\r\n", 1060 | " \r\n", 1061 | " share_words = query.intersection(title)\r\n", 1062 | " all_diff = set(query.difference(title) | title.difference(query))\r\n", 1063 | " for word in share_words:\r\n", 1064 | " if word in words_power:\r\n", 1065 | " rate_double *= (1.0 - words_power[word][4])\r\n", 1066 | " for word in all_diff:\r\n", 1067 | " if word in words_power:\r\n", 1068 | " rate_single *= (1.0 - words_power[word][2])\r\n", 1069 | " \r\n", 1070 | " return [1-rate_single, 1-rate_double]" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": { 1076 | "id": "3199EF41D500495FB28DFC5C113BDE38", 1077 | "mdEditEnable": false 1078 | }, 1079 | "source": [ 1080 | "## 并行词嵌入特征" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": 18, 1086 | "metadata": { 1087 | "id": "E3A82A356CF04F0986A888EDE46FA47B", 1088 | "collapsed": false, 1089 | "scrolled": false 1090 | }, 1091 | "outputs": [], 1092 | "source": [ 1093 | "# 词向量距离\r\n", 1094 | 
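For reference, textbook set-based forms of the three coefficients the distance helper above approximates: the cell counts the intersection with query-side multiplicity and normalises its "Ochiai" term by sqrt(len(query) + len(title)) rather than the usual sqrt(len(query) * len(title)).

import math

def set_overlap_features(query_tokens, title_tokens):
    """Dice coefficient, Ochiai coefficient and Jaccard distance on token sets."""
    q, t = set(query_tokens), set(title_tokens)
    if not q or not t:
        return [0.0, 0.0, 1.0]
    inter = len(q & t)
    dice = 2 * inter / (len(q) + len(t))
    ochiai = inter / math.sqrt(len(q) * len(t))
    jaccard_distance = 1 - inter / len(q | t)
    return [dice, ochiai, jaccard_distance]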
"def word2vecDistance(data):\r\n", 1095 | " query = data[0]\r\n", 1096 | " title = data[1]\r\n", 1097 | " vec_query = np.array([w2v_model[x] if x in w2v_model else [1e-9]*300 for x in query]).mean(axis=0)\r\n", 1098 | " vec_title = np.array([w2v_model[x] if x in w2v_model else [1e-9]*300 for x in title]).mean(axis=0)\r\n", 1099 | "\r\n", 1100 | " cos_dis = 1 - cosine(vec_title, vec_query)\r\n", 1101 | " euclidean_dis = 1 - euclidean(vec_title, vec_query)\r\n", 1102 | " manhattan_dis = 1 - cityblock(vec_title, vec_query)\r\n", 1103 | " \r\n", 1104 | " vec_query = np.array([fasttext_model[x] if x in fasttext_model else [1e-9]*300 for x in query]).mean(axis=0)\r\n", 1105 | " vec_title = np.array([fasttext_model[x] if x in fasttext_model else [1e-9]*300 for x in title]).mean(axis=0)\r\n", 1106 | "\r\n", 1107 | " cos_dis_fasttext = 1 - cosine(vec_title, vec_query)\r\n", 1108 | " euclidean_dis_fasttext = 1 - euclidean(vec_title, vec_query)\r\n", 1109 | " manhattan_dis_fasttext = 1 - cityblock(vec_title, vec_query)\r\n", 1110 | "\r\n", 1111 | " return [cos_dis, euclidean_dis, manhattan_dis, \r\n", 1112 | " cos_dis_fasttext, euclidean_dis_fasttext, manhattan_dis_fasttext] \r\n", 1113 | "\r\n", 1114 | "# 词向量距离,考虑TFIDF距离\r\n", 1115 | "# 获取重要程度在前topN的词的向量,求距离\r\n", 1116 | "# 只考虑tfidf权重的值的距离,cos、总和、平均\r\n", 1117 | "def w2vWeightDistance(data):\r\n", 1118 | " topN = 3\r\n", 1119 | " query = data[0]\r\n", 1120 | " title = data[1]\r\n", 1121 | " query_tfidf, query_weight = getTFIDF(query)\r\n", 1122 | " title_tfidf, title_weight = getTFIDF(title)\r\n", 1123 | " \r\n", 1124 | " # word2vec距离,考虑TFIDF\r\n", 1125 | " vec_query = np.sum(np.array([w2v_model[key] * value if key in w2v_model else [1e-9]*300 for key, value in query_tfidf.items()]), axis=0) / query_weight\r\n", 1126 | " vec_title = np.sum(np.array([w2v_model[key] * value if key in w2v_model else [1e-9]*300 for key, value in title_tfidf.items()]), axis=0) / title_weight\r\n", 1127 | "\r\n", 1128 | " cos_dis = 1 - cosine(vec_query, vec_title)\r\n", 1129 | " if np.isnan(cos_dis):\r\n", 1130 | " cos_dis = 1\r\n", 1131 | " euclidean_dis = 1 - euclidean(vec_query, vec_title)\r\n", 1132 | " manhattan_dis = 1- cityblock(vec_query, vec_title)\r\n", 1133 | " \r\n", 1134 | " # fasttext距离,考虑TFIDF\r\n", 1135 | " vec_query = np.sum(np.array([fasttext_model[key] * value if key in fasttext_model else [1e-9]*300 for key, value in query_tfidf.items()]), axis=0) / query_weight\r\n", 1136 | " vec_title = np.sum(np.array([fasttext_model[key] * value if key in fasttext_model else [1e-9]*300 for key, value in title_tfidf.items()]), axis=0) / title_weight\r\n", 1137 | "\r\n", 1138 | " cos_dis_fasttext = 1 - cosine(vec_query, vec_title)\r\n", 1139 | " if np.isnan(cos_dis_fasttext):\r\n", 1140 | " cos_dis_fasttext = 1\r\n", 1141 | " euclidean_dis_fasttext = 1 - euclidean(vec_query, vec_title)\r\n", 1142 | " manhattan_dis_fasttext = 1- cityblock(vec_query, vec_title)\r\n", 1143 | " \r\n", 1144 | " # 获取重要程度在前topN的词\r\n", 1145 | " query_keywords = sorted(zip(query_tfidf.keys(), query_tfidf.values()), key=lambda x : -x[1])[0 : min(topN, len(query_tfidf))]\r\n", 1146 | " title_keywords = sorted(zip(title_tfidf.keys(), title_tfidf.values()), key=lambda x : -x[1])[0: min(topN, len(title_tfidf))]\r\n", 1147 | " \r\n", 1148 | " # 重要词的word2vec距离\r\n", 1149 | " query_vector = np.mean(np.array([w2v_model[word] if word in w2v_model else [1e-9]*300 for word, value in query_keywords]), axis=0)\r\n", 1150 | " title_vector = np.mean(np.array([w2v_model[word] if word in w2v_model else 
[1e-9]*300 for word, value in title_keywords]), axis=0)\r\n", 1151 | "\r\n", 1152 | " cos_dis_keyword = 1 - cosine(query_vector, title_vector)\r\n", 1153 | " \r\n", 1154 | " euclidean_dis_keyword = 1 - euclidean(query_vector, title_vector)\r\n", 1155 | " manhattan_dis_keyword = 1 - cityblock(query_vector, title_vector)\r\n", 1156 | " \r\n", 1157 | " # 重要词的fasttext距离\r\n", 1158 | " query_vector = np.mean(np.array([fasttext_model[word] if word in fasttext_model else [1e-9]*300 for word, value in query_keywords]), axis=0)\r\n", 1159 | " title_vector = np.mean(np.array([fasttext_model[word] if word in fasttext_model else [1e-9]*300 for word, value in title_keywords]), axis=0)\r\n", 1160 | "\r\n", 1161 | " cos_dis_keyword_fasttext = 1 - cosine(query_vector, title_vector)\r\n", 1162 | " euclidean_dis_keyword_fasttext = 1 - euclidean(query_vector, title_vector)\r\n", 1163 | " manhattan_dis_keyword_fasttext = 1 - cityblock(query_vector, title_vector)\r\n", 1164 | " \r\n", 1165 | " # 只考虑tfidf权重的值的距离,cosine、sum、avg\r\n", 1166 | " len1 = query_tfidf.__len__()\r\n", 1167 | " len2 = title_tfidf.__len__()\r\n", 1168 | " \r\n", 1169 | " query_tfidf_value = np.array(list(query_tfidf.values()) + [1e-9]*max(len2-len1, 0))\r\n", 1170 | " title_tfidf_value = np.array(list(title_tfidf.values()) + [1e-9]*max(len1-len2, 0))\r\n", 1171 | " \r\n", 1172 | " tfidf_distance = 1 - cosine(query_tfidf_value, title_tfidf_value)\r\n", 1173 | " if np.isnan(tfidf_distance):\r\n", 1174 | " tfidf_distance = 1\r\n", 1175 | " query_tfidf_sum = np.sum(query_tfidf_value)\r\n", 1176 | " title_tfidf_sum = np.sum(title_tfidf_value)\r\n", 1177 | " query_tfidf_mean = np.mean(query_tfidf_value)\r\n", 1178 | " title_tfidf_mean = np.mean(title_tfidf_value)\r\n", 1179 | "\r\n", 1180 | " return [cos_dis, euclidean_dis, manhattan_dis, \r\n", 1181 | " cos_dis_keyword, euclidean_dis_keyword, manhattan_dis_keyword,\r\n", 1182 | " tfidf_distance, query_tfidf_sum, title_tfidf_sum, \r\n", 1183 | " query_tfidf_mean, title_tfidf_mean, \r\n", 1184 | " cos_dis_fasttext, euclidean_dis_fasttext, manhattan_dis_fasttext,\r\n", 1185 | " cos_dis_keyword_fasttext, euclidean_dis_keyword_fasttext, manhattan_dis_keyword_fasttext]\r\n", 1186 | "\r\n" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": { 1192 | "id": "6A7CD35225FD4AF5AC8C590334845120", 1193 | "mdEditEnable": false 1194 | }, 1195 | "source": [ 1196 | "## 并行生成特征" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": 16, 1202 | "metadata": { 1203 | "id": "D74C4E247697433E8555DFB34B7CE243", 1204 | "collapsed": false, 1205 | "scrolled": false 1206 | }, 1207 | "outputs": [ 1208 | { 1209 | "name": "stderr", 1210 | "output_type": "stream", 1211 | "text": [ 1212 | "30it [02:33, 5.08s/it]\n" 1213 | ] 1214 | }, 1215 | { 1216 | "name": "stdout", 1217 | "output_type": "stream", 1218 | "text": [ 1219 | " 0 1\ncount 3.000000e+07 3.000000e+07\nmean 6.411826e-01 6.411826e-01\nstd 7.225124e-02 7.225124e-02\nmin 0.000000e+00 0.000000e+00\n25% 5.931235e-01 5.931235e-01\n50% 6.340817e-01 6.340817e-01\n75% 6.841270e-01 6.841270e-01\nmax 1.000000e+00 1.000000e+00\n" 1220 | ] 1221 | } 1222 | ], 1223 | "source": [ 1224 | "%%time\r\n", 1225 | "pool = Pool(8)\r\n", 1226 | "CHUNK_SIZE = 1000000\r\n", 1227 | "data = pd.read_csv('./split/3kw.csv', chunksize=CHUNK_SIZE, header=None)\r\n", 1228 | "data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 1229 | "\r\n", 1230 | "result = []\r\n", 1231 | "i = 0\r\n", 1232 | "for chunk in tqdm(data):\r\n", 1233 
| " i += 1\r\n", 1234 | " # if i>10:break\r\n", 1235 | " \r\n", 1236 | " df = []\r\n", 1237 | " # query = chunk[1].apply(lambda x : x.split()).tolist()\r\n", 1238 | " # title = chunk[3].apply(lambda x : x.split()).tolist()\r\n", 1239 | " \r\n", 1240 | " query = chunk[1].tolist()\r\n", 1241 | " title = chunk[3].tolist()\r\n", 1242 | " d = list(zip(query, title))\r\n", 1243 | " \r\n", 1244 | " # df.append(pd.DataFrame(pool.map(concurrence, d)))\r\n", 1245 | " # print(1)\r\n", 1246 | " # df.append(pd.DataFrame(pool.map(levenshteinDistance, d)))\r\n", 1247 | " # print(2)\r\n", 1248 | " # df.append(pd.DataFrame(pool.map(sorensenDistance, d)))\r\n", 1249 | " # print(3)\r\n", 1250 | " # df.append(pd.DataFrame(pool.map(sameWord, d)))\r\n", 1251 | " # print(4)\r\n", 1252 | " # df.append(pd.DataFrame(pool.map(distance, d)))\r\n", 1253 | " # print(5)\r\n", 1254 | " # df.append(pd.DataFrame(pool.map(word2vecDistance, d)))\r\n", 1255 | " # print(6)\r\n", 1256 | " # df.append(pd.DataFrame(pool.map(w2vWeightDistance, d)))\r\n", 1257 | " # print(7)\r\n", 1258 | " # df.append(pd.DataFrame(pool.map(fuzzyDistance, d)))\r\n", 1259 | " # print(8)\r\n", 1260 | " # df.append(pd.DataFrame(pool.map(powerfuleWord, d)))\r\n", 1261 | " # print(9)\r\n", 1262 | " \r\n", 1263 | " result.append(pd.concat(df, axis=1))\r\n", 1264 | "\r\n", 1265 | " del df\r\n", 1266 | " del query\r\n", 1267 | " del title\r\n", 1268 | " del d\r\n", 1269 | " del chunk\r\n", 1270 | " gc.collect()\r\n", 1271 | "\r\n", 1272 | "del data\r\n", 1273 | "result = pd.concat(result)\r\n", 1274 | "print(result.describe())\r\n", 1275 | "result.to_csv('./handled_data/train_feature/train_feature_jellyfish.csv', header=None, index=False)\r\n", 1276 | "# 释放资源\r\n", 1277 | "pool.close()\r\n", 1278 | "pool.terminate()\r\n", 1279 | "pool.join()\r\n", 1280 | "gc.collect()" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "markdown", 1285 | "metadata": { 1286 | "id": "D23CD913318A48C9A86AD2CC28FB1F08", 1287 | "mdEditEnable": false 1288 | }, 1289 | "source": [ 1290 | "# 特征相关性分析" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "markdown", 1295 | "metadata": { 1296 | "id": "55A6DF5EEEDE4EB388356B43F0B03CC2", 1297 | "mdEditEnable": false 1298 | }, 1299 | "source": [ 1300 | "## 相关系数" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "code", 1305 | "execution_count": 21, 1306 | "metadata": { 1307 | "id": "AC1339E852414CAC9D6C44CA2B2B9C9B", 1308 | "collapsed": false, 1309 | "scrolled": true 1310 | }, 1311 | "outputs": [ 1312 | { 1313 | "name": "stdout", 1314 | "output_type": "stream", 1315 | "text": [ 1316 | "0.9776392403538859 sorensenDistance jaccardDistance\n0.9191483643445837 diceDistance ochiaiDistance\n0.9191483643445837 ochiaiDistance diceDistance\n0.9776392403538859 jaccardDistance sorensenDistance\n0.9790698526721849 word2vecDistance_1 fasttextDistance_1\n0.9013797584783118 word2vecDistance_1 w2vWeightDistance_1\n0.9982673082219394 word2vecDistance_2 word2vecDistance_3\n0.971311990796078 word2vecDistance_2 fasttextDistance_2\n0.9694247650601884 word2vecDistance_2 fasttextDistance_3\n0.9982673082219394 word2vecDistance_3 word2vecDistance_2\n0.9696246774668986 word2vecDistance_3 fasttextDistance_2\n0.9677387984890892 word2vecDistance_3 fasttextDistance_3\n0.9790698526721849 fasttextDistance_1 word2vecDistance_1\n0.971311990796078 fasttextDistance_2 word2vecDistance_2\n0.9696246774668986 fasttextDistance_2 word2vecDistance_3\n0.9980665751664842 fasttextDistance_2 fasttextDistance_3\n0.9694247650601884 fasttextDistance_3 word2vecDistance_2\n0.9677387984890892 
fasttextDistance_3 word2vecDistance_3\n0.9980665751664842 fasttextDistance_3 fasttextDistance_2\n0.9013797584783118 w2vWeightDistance_1 word2vecDistance_1\n0.9517133422175729 w2vWeightDistance_1 fasttextWeightDistance_1\n0.9920940986091317 w2vWeightDistance_2 w2vWeightDistance_3\n0.9920940986091317 w2vWeightDistance_3 w2vWeightDistance_2\n0.9822766542630752 w2vKeywordDistance_1 fasttextKeywordDistance_1\n0.9980499767731188 w2vKeywordDistance_2 w2vKeywordDistance_3\n0.9704524215324315 w2vKeywordDistance_2 fasttextKeywordDistance_2\n0.9684116753038846 w2vKeywordDistance_2 fasttextKeywordDistance_3\n0.9980499767731188 w2vKeywordDistance_3 w2vKeywordDistance_2\n0.9685512758429773 w2vKeywordDistance_3 fasttextKeywordDistance_2\n0.9665154030013232 w2vKeywordDistance_3 fasttextKeywordDistance_3\n0.9517133422175729 fasttextWeightDistance_1 w2vWeightDistance_1\n0.9977510526794808 fasttextWeightDistance_2 fasttextWeightDistance_3\n0.9977510526794808 fasttextWeightDistance_3 fasttextWeightDistance_2\n0.9822766542630752 fasttextKeywordDistance_1 w2vKeywordDistance_1\n0.9041843257841926 fasttextKeywordDistance_1 fasttextKeywordDistance_2\n0.9023746325133278 fasttextKeywordDistance_1 fasttextKeywordDistance_3\n0.9704524215324315 fasttextKeywordDistance_2 w2vKeywordDistance_2\n0.9685512758429773 fasttextKeywordDistance_2 w2vKeywordDistance_3\n0.9041843257841926 fasttextKeywordDistance_2 fasttextKeywordDistance_1\n0.9979225827100741 fasttextKeywordDistance_2 fasttextKeywordDistance_3\n0.9684116753038846 fasttextKeywordDistance_3 w2vKeywordDistance_2\n0.9665154030013232 fasttextKeywordDistance_3 w2vKeywordDistance_3\n0.9023746325133278 fasttextKeywordDistance_3 fasttextKeywordDistance_1\n0.9979225827100741 fasttextKeywordDistance_3 fasttextKeywordDistance_2\n0.9999576749537433 pagerank_feature1 seq2_max_degree\n0.9999999866735351 pagerank_feature1 seq1_neighbor\n0.9953337614918548 pagerank_feature1 total_neighbor\n0.9670942889451213 pagerank+feature2 seq1_max_component\n0.9999995206910431 pagerank+feature2 seq2_neighbor\n0.9999576749537433 seq2_max_degree pagerank_feature1\n0.9999576834960514 seq2_max_degree seq1_neighbor\n0.995232077474844 seq2_max_degree total_neighbor\n0.9670942889451213 seq1_max_component pagerank+feature2\n0.9670951938053971 seq1_max_component seq2_neighbor\n0.9999999866735351 seq1_neighbor pagerank_feature1\n0.9999576834960514 seq1_neighbor seq2_max_degree\n0.9953338453527495 seq1_neighbor total_neighbor\n0.9999995206910431 seq2_neighbor pagerank+feature2\n0.9670951938053971 seq2_neighbor seq1_max_component\n0.9953337614918548 total_neighbor pagerank_feature1\n0.995232077474844 total_neighbor seq2_max_degree\n0.9953338453527495 total_neighbor seq1_neighbor\n" 1317 | ] 1318 | } 1319 | ], 1320 | "source": [ 1321 | "feature_name = ['concurrence', 'levenshteinDistance', 'sorensenDistance', \n", 1322 | " 'sameWord', 'specialConcurrence',\n", 1323 | " 'diceDistance', 'ochiaiDistance', 'jaccardDistance', \n", 1324 | " 'word2vecDistance_1', 'word2vecDistance_2', 'word2vecDistance_3', \n", 1325 | " 'fasttextDistance_1', 'fasttextDistance_2', 'fasttextDistance_3', \n", 1326 | " 'w2vWeightDistance_1', 'w2vWeightDistance_2', 'w2vWeightDistance_3',\n", 1327 | " 'w2vKeywordDistance_1', 'w2vKeywordDistance_2', 'w2vKeywordDistance_3',\n", 1328 | " 'TFIDFDistance_1', 'TFIDFDistance_2', 'TFIDFDistance_3', \n", 1329 | " 'TFIDFDistance_4', 'TFIDFDistance_5', \n", 1330 | " 'fasttextWeightDistance_1', 'fasttextWeightDistance_2', 'fasttextWeightDistance_3',\n", 1331 | " 'fasttextKeywordDistance_1', 
'fasttextKeywordDistance_2', 'fasttextKeywordDistance_3',\n", 1332 | " 'pagerank_feature1', 'pagerank_feature2',\n", 1333 | " 'seq1_max_degree', 'seq2_max_degree', 'seq1_max_component',\n", 1334 | " 'seq1_neighbor', 'seq2_neighbor', 'total_neighbor',\n", 1335 | " 'DSSM_Feature', 'Deep_Model1', 'ARC', 'MVLSTM']\n", 1336 | " \n", 1337 | "with open('./handled_data/correlation', 'r', encoding='utf-8') as f:\n", 1338 | " for idx, line in enumerate(f.readlines()):\n", 1339 | " tmp_data = list(map(lambda x:float(x), line.split(',')))\n", 1340 | " for item in range(len(tmp_data)):\n", 1341 | " if tmp_data[item] > 0.9 and item != idx:\n", 1342 | " print(tmp_data[item], feature_name[idx], feature_name[item])" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "markdown", 1347 | "metadata": { 1348 | "id": "08DA2FEA1043479495F8624EAC6E8336", 1349 | "mdEditEnable": false 1350 | }, 1351 | "source": [ 1352 | "## heatmap" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": 47, 1358 | "metadata": { 1359 | "id": "7C7598F567B7418785EBE3B64A30B1B2", 1360 | "collapsed": false, 1361 | "scrolled": false 1362 | }, 1363 | "outputs": [ 1364 | { 1365 | "data": { 1366 | "text/html": [ 1367 | "" 1368 | ], 1369 | "text/plain": [ 1370 | "
" 1371 | ] 1372 | }, 1373 | "metadata": { 1374 | "needs_background": "light" 1375 | }, 1376 | "output_type": "execute_result" 1377 | }, 1378 | { 1379 | "data": { 1380 | "text/plain": [ 1381 | "
" 1382 | ] 1383 | }, 1384 | "metadata": {}, 1385 | "output_type": "display_data" 1386 | }, 1387 | { 1388 | "data": { 1389 | "text/plain": [ 1390 | "
" 1391 | ] 1392 | }, 1393 | "metadata": {}, 1394 | "output_type": "display_data" 1395 | } 1396 | ], 1397 | "source": [ 1398 | "def plot_corr():\r\n", 1399 | " corr = pd.read_csv('./handled_data/correlation', header=None)\r\n", 1400 | " mask = np.zeros_like(corr, dtype=np.bool)\r\n", 1401 | " mask[np.triu_indices_from(mask)] = True\r\n", 1402 | " cmap = sns.diverging_palette(220, 10, as_cmap=True)\r\n", 1403 | " g = sns.heatmap(corr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')\r\n", 1404 | " plt.figure(dpi=500)\r\n", 1405 | " plt.show()\r\n", 1406 | " plt.savefig('./try_test')\r\n", 1407 | "plot_corr()" 1408 | ] 1409 | }, 1410 | { 1411 | "cell_type": "markdown", 1412 | "metadata": { 1413 | "id": "C4D670551EDD4DF78453E4AEF3437963", 1414 | "mdEditEnable": false 1415 | }, 1416 | "source": [ 1417 | "# RF" 1418 | ] 1419 | }, 1420 | { 1421 | "cell_type": "code", 1422 | "execution_count": 6, 1423 | "metadata": { 1424 | "id": "C143F1CF75C84E6C8CB9A410E4982CC7", 1425 | "collapsed": false, 1426 | "scrolled": false 1427 | }, 1428 | "outputs": [ 1429 | { 1430 | "data": { 1431 | "text/plain": [ 1432 | "7" 1433 | ] 1434 | }, 1435 | "execution_count": 6, 1436 | "metadata": {}, 1437 | "output_type": "execute_result" 1438 | } 1439 | ], 1440 | "source": [ 1441 | "del x_train\n", 1442 | "del y_train\n", 1443 | "gc.collect()" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": 5, 1449 | "metadata": { 1450 | "id": "CA9F515F3FFC4EA488778D87C5268F1C", 1451 | "collapsed": false, 1452 | "scrolled": false 1453 | }, 1454 | "outputs": [ 1455 | { 1456 | "name": "stdout", 1457 | "output_type": "stream", 1458 | "text": [ 1459 | "-------------------start train-------------------\n" 1460 | ] 1461 | }, 1462 | { 1463 | "name": "stderr", 1464 | "output_type": "stream", 1465 | "text": [ 1466 | "[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.\n" 1467 | ] 1468 | } 1469 | ], 1470 | "source": [ 1471 | "rf = RandomForestClassifier(n_jobs=15,\n", 1472 | " n_estimators=150,\n", 1473 | " class_weight={0:1, 1:3},\n", 1474 | " verbose=1)\n", 1475 | "train_x = np.array(x_train.values).reshape(-1, 1).astype('int')\n", 1476 | "train_y = np.array(y_train.values).reshape(-1, 1).astype('int')\n", 1477 | "\n", 1478 | "test_x = np.array(x_test.values).reshape(-1, 1).astype('int')\n", 1479 | "test_y = np.array(y_test.values).reshape(-1, 1).astype('int')\n", 1480 | "print('-------------------start train-------------------')\n", 1481 | "rf.fit(train_x, train_y)\n", 1482 | "pred_rf = rf.predict_proba(test_x)\n", 1483 | "auc_score = roc_auc_score(test_y, pred_rf)\n", 1484 | "acc_score = precision_score(test_y, pred_rf)\n", 1485 | "print('auc_score:', auc_score)\n", 1486 | "print('acc_score:', acc_score)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "markdown", 1491 | "metadata": { 1492 | "id": "00668DC1B0E74253AC74E3DA5E444CC4", 1493 | "mdEditEnable": false 1494 | }, 1495 | "source": [ 1496 | "# XGBoost" 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": 19, 1502 | "metadata": { 1503 | "id": "FD68F203FD1A444486493FB70F4A64D3", 1504 | "collapsed": false, 1505 | "scrolled": false 1506 | }, 1507 | "outputs": [ 1508 | { 1509 | "name": "stdout", 1510 | "output_type": "stream", 1511 | "text": [ 1512 | "[]\n" 1513 | ] 1514 | } 1515 | ], 1516 | "source": [ 1517 | "print(train.get_label())" 1518 | ] 1519 | }, 1520 | { 1521 | "cell_type": "code", 1522 | "execution_count": 18, 1523 | "metadata": { 1524 | "id": "BC7D52A783334956A93B90A77C182F38", 
1525 | "collapsed": false, 1526 | "scrolled": true 1527 | }, 1528 | "outputs": [ 1529 | { 1530 | "name": "stdout", 1531 | "output_type": "stream", 1532 | "text": [ 1533 | "[14:20:06] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.\n" 1534 | ] 1535 | }, 1536 | { 1537 | "ename": "XGBoostError", 1538 | "evalue": "b'[14:20:06] /workspace/src/objective/regression_obj.cu:65: Check failed: info.labels_.Size() != 0U (0 vs. 0) label set cannot be empty\\n\\nStack trace returned 10 entries:\\n[bt] (0) /opt/conda/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7fb9897f85cd]\\n[bt] (1) /opt/conda/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7fb9897f89c8]\\n[bt] (2) /opt/conda/xgboost/libxgboost.so(xgboost::obj::RegLossObj::GetGradient(xgboost::HostDeviceVector const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector >*)+0xcd) [0x7fb9899ee28d]\\n[bt] (3) /opt/conda/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7fb98986f1e2]\\n[bt] (4) /opt/conda/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb9897f0ab5]\\n[bt] (5) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fb9e0c32ec0]\\n[bt] (6) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fb9e0c3287d]\\n[bt] (7) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fb9e0e47dee]\\n[bt] (8) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7fb9e0e48825]\\n[bt] (9) python(_PyObject_FastCallDict+0x8b) [0x55ebba14c1bb]\\n\\n'", 1539 | "traceback": [ 1540 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1541 | "\u001b[0;31mXGBoostError\u001b[0m Traceback (most recent call last)", 1542 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 1543 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0mevals\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mevals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeval\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 216\u001b[0;31m xgb_model=xgb_model, callbacks=callbacks)\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1544 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36m_train_internal\u001b[0;34m(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;31m# Skip the first update if it is a recovery step.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mversion\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 74\u001b[0;31m 
\u001b[0mbst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 75\u001b[0m \u001b[0mbst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_rabit_checkpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0mversion\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1545 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[1;32m 1043\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1044\u001b[0m _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),\n\u001b[0;32m-> 1045\u001b[0;31m dtrain.handle))\n\u001b[0m\u001b[1;32m 1046\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0mpred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1546 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m_check_call\u001b[0;34m(ret)\u001b[0m\n\u001b[1;32m 163\u001b[0m \"\"\"\n\u001b[1;32m 164\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mXGBoostError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXGBGetLastError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1547 | "\u001b[0;31mXGBoostError\u001b[0m: b'[14:20:06] /workspace/src/objective/regression_obj.cu:65: Check failed: info.labels_.Size() != 0U (0 vs. 
0) label set cannot be empty\\n\\nStack trace returned 10 entries:\\n[bt] (0) /opt/conda/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7fb9897f85cd]\\n[bt] (1) /opt/conda/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7fb9897f89c8]\\n[bt] (2) /opt/conda/xgboost/libxgboost.so(xgboost::obj::RegLossObj::GetGradient(xgboost::HostDeviceVector const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector >*)+0xcd) [0x7fb9899ee28d]\\n[bt] (3) /opt/conda/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7fb98986f1e2]\\n[bt] (4) /opt/conda/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb9897f0ab5]\\n[bt] (5) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fb9e0c32ec0]\\n[bt] (6) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fb9e0c3287d]\\n[bt] (7) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fb9e0e47dee]\\n[bt] (8) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7fb9e0e48825]\\n[bt] (9) python(_PyObject_FastCallDict+0x8b) [0x55ebba14c1bb]\\n\\n'" 1548 | ], 1549 | "output_type": "error" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "%%time\n", 1554 | "# train, test = train_test_split(total, test_size=0.1, random_state=20, stratify=total[label_idx])\n", 1555 | "# print('end split')\n", 1556 | "# param = {'objective': 'binary:logistic'}\n", 1557 | "# train = xgb.DMatrix(train)\n", 1558 | "# test = xgb.DMatrix(test)\n", 1559 | "# print(train.num_col())\n", 1560 | "# print(train.num_row())\n", 1561 | "\n", 1562 | "bst = xgb.train(param, train, 2)\n", 1563 | "\n", 1564 | "# pred_train = bst.predict(train)\n", 1565 | "# acc_score = accuracy_score(train.get_label(), pred_train)\n", 1566 | "# print(acc_score)" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "code", 1571 | "execution_count": 12, 1572 | "metadata": { 1573 | "id": "77F7C13F377544128FEE0CC87859FAE9", 1574 | "collapsed": false, 1575 | "scrolled": false 1576 | }, 1577 | "outputs": [ 1578 | { 1579 | "data": { 1580 | "text/plain": [ 1581 | "7" 1582 | ] 1583 | }, 1584 | "execution_count": 12, 1585 | "metadata": {}, 1586 | "output_type": "execute_result" 1587 | } 1588 | ], 1589 | "source": [ 1590 | "del total\n", 1591 | "gc.collect()" 1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": null, 1597 | "metadata": { 1598 | "id": "63DA97D8ADDE475EACA2EB7875FE2A26" 1599 | }, 1600 | "outputs": [], 1601 | "source": [ 1602 | "pred_test = bst.predict(test)\n", 1603 | "test_acc = accuracy_score(test.get_label(), pred_test)\n", 1604 | "test_auc = roc_auc_score(test.get_label(), pred_test)" 1605 | ] 1606 | }, 1607 | { 1608 | "cell_type": "markdown", 1609 | "metadata": { 1610 | "id": "D90F0E908A3E4E4A8436C11BA5CA0537", 1611 | "mdEditEnable": false 1612 | }, 1613 | "source": [ 1614 | "# LightGBM模型" 1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "markdown", 1619 | "metadata": { 1620 | "id": "6ADF6DA536D94C5B99CA1A9D2C0FE965", 1621 | "mdEditEnable": false 1622 | }, 1623 | "source": [ 1624 | "## 准备训练数据" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 4, 1630 | "metadata": { 1631 | "id": "4371E4A6693B4712A3FFE7FCB3B96050", 1632 | "collapsed": false, 1633 | "scrolled": true 1634 | }, 1635 | "outputs": [ 1636 | { 1637 | "name": "stdout", 1638 | "output_type": "stream", 1639 | "text": [ 1640 | "2019-08-11 19:19:30.037143\nloaded normal feature ---> (30000000, 
31)\nloaded graph feature ---> (30000000, 5)\nloaded fuzzy feature ---> (30000000, 7)\nloaded powerful feature ---> (30000000, 2)\nloaded dssm_300 feature ---> (30000000, 1)\nloaded dssm_600 feature ---> (30000000, 1)\nloaded mvlstm feature ---> (30000000, 1)\nloaded ARC feature ---> (30000000, 1)\nloaded TextCNN feature ---> (30000000, 1)\nloaded OOF CNN feature ---> (30000000, 1)\nloaded OOF LSTM feature ---> (30000000, 1)\nloaded label with shape ---> (30000000, 1)\n----------------------end reading file--------------------------\ntotal feature shape ---> (30000000, 53)\nsplit train set and test set\ninitialize dataset\nCPU times: user 8min 10s, sys: 40.9 s, total: 8min 51s\nWall time: 9min 9s\n" 1641 | ] 1642 | } 1643 | ], 1644 | "source": [ 1645 | "%%time\r\n", 1646 | "\r\n", 1647 | "print(datetime.now())\r\n", 1648 | "# 一般特征\r\n", 1649 | "feature_normal = pd.read_csv('./handled_data/train_feature/train_feature_3kw_normal.csv.gz', header=None)\r\n", 1650 | "print('loaded normal feature --->',feature_normal.shape)\r\n", 1651 | "\r\n", 1652 | "# feature_statistics = pd.read_csv('./handled_data/train_feature/train_feature_3kw_statistics.csv.gz', header=None)\r\n", 1653 | "# print('loaded statistics feature --->',feature_statistics.shape)\r\n", 1654 | "# feature_pagerank = pd.read_csv('./handled_data/train_feature/train_feature_3kw_pagerank.csv.gz', header=None)\r\n", 1655 | "# print('loaded pagerank feature --->',feature_pagerank.shape)\r\n", 1656 | "\r\n", 1657 | "feature_graph = pd.read_csv('./handled_data/train_feature/train_feature_graph_final.csv.gz', header=None)\r\n", 1658 | "print('loaded graph feature --->',feature_graph.shape)\r\n", 1659 | "\r\n", 1660 | "feature_fuzzy = pd.read_csv('./handled_data/train_feature/train_feature_fuzzy.csv', header=None)\r\n", 1661 | "print('loaded fuzzy feature --->',feature_fuzzy.shape)\r\n", 1662 | "feature_powerful = pd.read_csv('./handled_data/train_feature/train_feature_powerful.csv', header=None)\r\n", 1663 | "print('loaded powerful feature --->',feature_powerful.shape)\r\n", 1664 | "\r\n", 1665 | "# 融合特征\r\n", 1666 | "feat_dssm_300 = pd.read_csv('./predictions/feat_dssm_300_3kw.csv.gz', header=None)\r\n", 1667 | "print('loaded dssm_300 feature --->', feat_dssm_300.shape)\r\n", 1668 | "\r\n", 1669 | "feat_dssm_600 = pd.read_csv('./predictions/feat_dssm_600_3kw.csv.gz', header=None)\r\n", 1670 | "print('loaded dssm_600 feature --->', feat_dssm_600.shape) \r\n", 1671 | "\r\n", 1672 | "feat_mvlstm = pd.read_csv('./predictions/feat_mvlstm_3kw.csv.gz', header=None)\r\n", 1673 | "print('loaded mvlstm feature --->', feat_mvlstm.shape) \r\n", 1674 | "\r\n", 1675 | "feat_arc = pd.read_csv('./predictions/feat_arc_3kw.csv.gz', header=None)\r\n", 1676 | "print('loaded ARC feature --->', feat_arc.shape) \r\n", 1677 | " \r\n", 1678 | "feat_textcnn = pd.read_csv('./predictions/feat_textcnn_3kw.csv.gz', header=None)\r\n", 1679 | "print('loaded TextCNN feature --->', feat_textcnn.shape)\r\n", 1680 | "\r\n", 1681 | "feat_oofcnn = pd.read_csv('./predictions/feat_oofcnn_3kw.csv', header=None)\r\n", 1682 | "print('loaded OOF CNN feature --->', feat_oofcnn.shape)\r\n", 1683 | "\r\n", 1684 | "feat_ooflstm = pd.read_csv('./predictions/feat_ooflstm_3kw.csv', header=None)\r\n", 1685 | "print('loaded OOF LSTM feature --->', feat_ooflstm.shape)\r\n", 1686 | "\r\n", 1687 | "label = pd.read_csv('./handled_data/label', header=None)\r\n", 1688 | "print('loaded label with shape --->',label.shape)\r\n", 1689 | "print('----------------------end reading 
file--------------------------')\r\n", 1690 | "total = pd.concat([feature_normal, \r\n", 1691 | " # feature_statistics, feature_pagerank, \r\n", 1692 | " feature_graph,\r\n", 1693 | " feature_fuzzy, feature_powerful,\r\n", 1694 | " feat_dssm_300,\r\n", 1695 | " feat_dssm_600,\r\n", 1696 | " feat_mvlstm,\r\n", 1697 | " feat_arc,\r\n", 1698 | " feat_textcnn,\r\n", 1699 | " feat_oofcnn,\r\n", 1700 | " feat_ooflstm,\r\n", 1701 | " label], axis=1)\r\n", 1702 | "total.columns = range(total.shape[1])\r\n", 1703 | "print('total feature shape --->', total.shape)\r\n", 1704 | "\r\n", 1705 | "print('split train set and test set')\r\n", 1706 | "label_idx = total.shape[1] - 1\r\n", 1707 | "\r\n", 1708 | "train, val = train_test_split(total, test_size=0.1, random_state=20, stratify=total[label_idx])\r\n", 1709 | "\r\n", 1710 | "x = train.drop(label_idx, axis=1)\r\n", 1711 | "y = train[label_idx]\r\n", 1712 | "\r\n", 1713 | "val_x = val.drop(label_idx, axis=1)\r\n", 1714 | "val_y = val[label_idx]\r\n", 1715 | "\r\n", 1716 | "print('initialize dataset')\r\n", 1717 | "lgb_train = lgb.Dataset(x, y, free_raw_data=False)\r\n", 1718 | "lgb_eval = lgb.Dataset(val_x, val_y, reference=lgb_train, free_raw_data=False)" 1719 | ] 1720 | }, 1721 | { 1722 | "cell_type": "code", 1723 | "execution_count": 9, 1724 | "metadata": { 1725 | "id": "019B00D9B9914A51A76009D3FBA3E7C5", 1726 | "collapsed": false, 1727 | "scrolled": false 1728 | }, 1729 | "outputs": [], 1730 | "source": [ 1731 | "total_x = total.drop(label_idx, axis=1)\n", 1732 | "total_y = total[label_idx]\n", 1733 | "print(total_x.shape)\n", 1734 | "print(total_x.info())\n", 1735 | "print(total_y.shape)\n", 1736 | "pritn(total_y.info())\n", 1737 | "lgb_train = lgb.Dataset(total_x, total_y, free_raw_data=False)" 1738 | ] 1739 | }, 1740 | { 1741 | "cell_type": "code", 1742 | "execution_count": 6, 1743 | "metadata": { 1744 | "id": "D79357D486D244678838A702C3148530", 1745 | "collapsed": false, 1746 | "scrolled": true 1747 | }, 1748 | "outputs": [ 1749 | { 1750 | "name": "stdout", 1751 | "output_type": "stream", 1752 | "text": [ 1753 | "(30000000, 52)\n 0 1 2 3 4 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 3.138709e-01 6.360163e-01 7.210984e-01 1.328846e+01 5.023723e-05 \nstd 1.871304e-01 1.825601e-01 1.542859e-01 4.734423e+00 7.655814e-04 \nmin 0.000000e+00 3.225806e-02 0.000000e+00 1.000000e+00 3.876599e-302 \n25% 1.904762e-01 5.000000e-01 6.363636e-01 1.000000e+01 1.425644e-12 \n50% 2.857143e-01 6.666667e-01 7.333333e-01 1.300000e+01 3.036743e-10 \n75% 4.090909e-01 7.777778e-01 8.181818e-01 1.600000e+01 6.644035e-08 \nmax 3.902353e+01 1.000000e+00 1.000000e+00 3.850000e+02 6.000000e-01 \n\n 5 6 7 8 9 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 2.809463e-01 5.420982e-01 8.260438e-01 7.305404e-01 -7.067164e-01 \nstd 1.559342e-01 2.922543e-01 1.176581e-01 1.138556e-01 4.590602e-01 \nmin 0.000000e+00 0.000000e+00 -5.142857e+00 -1.806917e-01 -5.323372e+00 \n25% 1.818182e-01 3.333333e-01 7.777778e-01 6.662080e-01 -9.964515e-01 \n50% 2.666667e-01 5.163978e-01 8.461538e-01 7.450130e-01 -6.658506e-01 \n75% 3.636364e-01 7.071068e-01 9.000000e-01 8.105919e-01 -3.794953e-01 \nmax 1.720000e+00 1.576659e+01 1.000000e+00 1.000000e+00 1.000000e+00 \n\n ... 42 43 44 45 \\\ncount ... 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean ... 3.053099e+00 8.375073e-01 3.500157e-01 4.725512e-01 \nstd ... 6.043442e-01 1.453110e-01 1.638842e-01 1.583299e-01 \nmin ... 
0.000000e+00 0.000000e+00 0.000000e+00 -4.724671e-01 \n25% ... 2.714013e+00 7.789553e-01 2.111369e-01 3.822839e-01 \n50% ... 3.127549e+00 8.863827e-01 3.517495e-01 4.979405e-01 \n75% ... 3.467873e+00 9.420421e-01 4.681469e-01 5.879889e-01 \nmax ... 5.794093e+00 1.000000e+00 1.000000e+00 8.932437e-01 \n\n 46 47 48 49 50 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 4.913790e-01 1.704771e-01 1.771518e-01 1.857886e-01 1.686962e-01 \nstd 1.603338e-01 4.920971e-02 3.686785e-02 5.073331e-02 4.917796e-02 \nmin -4.589797e-01 1.637830e-02 1.085761e-02 7.413505e-03 2.439348e-03 \n25% 3.987670e-01 1.346706e-01 1.530675e-01 1.492922e-01 1.368189e-01 \n50% 5.176989e-01 1.684497e-01 1.779618e-01 1.904999e-01 1.739967e-01 \n75% 6.093000e-01 2.041900e-01 2.028155e-01 2.268609e-01 2.051853e-01 \nmax 9.266943e-01 3.532417e-01 3.169906e-01 2.812516e-01 5.599475e-01 \n\n 51 \ncount 3.000000e+07 \nmean 1.833208e-01 \nstd 6.467187e-02 \nmin 2.731318e-14 \n25% 1.404598e-01 \n50% 1.873902e-01 \n75% 2.247325e-01 \nmax 9.985868e-01 \n\n[8 rows x 52 columns]\n(30000000,)\ncount 3.000000e+07\nmean 1.822721e-01\nstd 3.860687e-01\nmin 0.000000e+00\n25% 0.000000e+00\n50% 0.000000e+00\n75% 0.000000e+00\nmax 1.000000e+00\nName: 52, dtype: float64\n" 1754 | ] 1755 | } 1756 | ], 1757 | "source": [ 1758 | "" 1759 | ] 1760 | }, 1761 | { 1762 | "cell_type": "markdown", 1763 | "metadata": { 1764 | "id": "55C87C30E20844FA8677C3BBFD4E2D45", 1765 | "mdEditEnable": false 1766 | }, 1767 | "source": [ 1768 | "## 训练" 1769 | ] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": 10, 1774 | "metadata": { 1775 | "id": "0716C46A15684CA1A77343249DC9FAD1", 1776 | "collapsed": false, 1777 | "scrolled": false 1778 | }, 1779 | "outputs": [ 1780 | { 1781 | "name": "stdout", 1782 | "output_type": "stream", 1783 | "text": [ 1784 | "2019-08-11 19:36:41.955449\nCPU times: user 9h 24min 49s, sys: 1min 7s, total: 9h 25min 56s\nWall time: 41min 38s\n" 1785 | ] 1786 | }, 1787 | { 1788 | "data": { 1789 | "text/plain": [ 1790 | "" 1791 | ] 1792 | }, 1793 | "execution_count": 10, 1794 | "metadata": {}, 1795 | "output_type": "execute_result" 1796 | } 1797 | ], 1798 | "source": [ 1799 | "%%time\r\n", 1800 | "params = {\r\n", 1801 | " 'boosting': 'dart',\r\n", 1802 | " 'objective': 'binary',\r\n", 1803 | " 'metric': 'auc',\r\n", 1804 | " 'learning_rate': '0.5',\r\n", 1805 | " \r\n", 1806 | " 'num_leaves':59,\r\n", 1807 | " 'max_depth':7,\r\n", 1808 | " \r\n", 1809 | " 'max_bin':251,\r\n", 1810 | " 'min_data_in_leaf':19,\r\n", 1811 | " \r\n", 1812 | " 'scale_pos_weight': 3.1,\r\n", 1813 | " \r\n", 1814 | " 'lambda_l1':0,\r\n", 1815 | " 'lambda_l2':0,\r\n", 1816 | " 'min_split_gain':0,\r\n", 1817 | " \r\n", 1818 | " 'device': 'gpu',\r\n", 1819 | " 'gpu_platform_id':0,\r\n", 1820 | " 'gpu_device_id':0\r\n", 1821 | "}\r\n", 1822 | "\r\n", 1823 | "print(datetime.now())\r\n", 1824 | "gbm = lgb.train(params, lgb_train, num_boost_round=450, early_stopping_rounds=30)\r\n", 1825 | "gbm.save_model('./handled_data/final_lgb_450.model') \r\n", 1826 | "# init_model=gbm, keep_training_booster=True" 1827 | ] 1828 | }, 1829 | { 1830 | "cell_type": "markdown", 1831 | "metadata": { 1832 | "id": "4B62C670C46546168FAA5DB184641EB9", 1833 | "mdEditEnable": false 1834 | }, 1835 | "source": [ 1836 | "## GridSearch" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 14, 1842 | "metadata": { 1843 | "id": "182A5DC4B3D74AC195675B4D444AFCEA", 1844 | "collapsed": false, 1845 | "scrolled": true 1846 | }, 
1847 | "outputs": [ 1848 | { 1849 | "name": "stdout", 1850 | "output_type": "stream", 1851 | "text": [ 1852 | "Fitting 5 folds for each of 21 candidates, totalling 105 fits\n" 1853 | ] 1854 | }, 1855 | { 1856 | "name": "stderr", 1857 | "output_type": "stream", 1858 | "text": [ 1859 | "[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.\n[Parallel(n_jobs=10)]: Done 30 tasks | elapsed: 34.0min\n[Parallel(n_jobs=10)]: Done 105 out of 105 | elapsed: 120.1min finished\n" 1860 | ] 1861 | }, 1862 | { 1863 | "name": "stdout", 1864 | "output_type": "stream", 1865 | "text": [ 1866 | "{'min_data_in_leaf': 251} \n 0.7039860009576545\n" 1867 | ] 1868 | } 1869 | ], 1870 | "source": [ 1871 | "# 6 64\r\n", 1872 | "# 7 58\r\n", 1873 | "# params = {'max_depth':range(4,8), 'num_leaves':range(56, 60)}\r\n", 1874 | "# 2 20\r\n", 1875 | "# params = {'min_child_samples':range(18,23)}\r\n", 1876 | "\r\n", 1877 | "# params ={'max_bin': range(5,100,10), }\r\n", 1878 | "\r\n", 1879 | "params = {'min_data_in_leaf':range(240, 261)}\r\n", 1880 | "gbm = lgb.LGBMClassifier(boost='dart',\r\n", 1881 | " max_depth=7, \r\n", 1882 | " num_leaves=59,\r\n", 1883 | " max_bin = 95,\r\n", 1884 | " learning_rate=0.5, \r\n", 1885 | " n_estimators=100,\r\n", 1886 | " metric='auc')\r\n", 1887 | "gridSearch = GridSearchCV(estimator=gbm, param_grid=params, scoring='roc_auc', \r\n", 1888 | " cv=5, verbose=1, n_jobs=15)\r\n", 1889 | "\r\n", 1890 | "gridSearch.fit(x, y)\r\n", 1891 | "\r\n", 1892 | "print(gridSearch.best_params_ , '\\n', gridSearch.best_score_)" 1893 | ] 1894 | }, 1895 | { 1896 | "cell_type": "markdown", 1897 | "metadata": { 1898 | "id": "EDEF130C7D6E4C149C34CA5FA916CA2C", 1899 | "mdEditEnable": false 1900 | }, 1901 | "source": [ 1902 | "## 查看特征重要性" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": 11, 1908 | "metadata": { 1909 | "id": "AB78DAF736034D958BD510FBBDD13A49", 1910 | "collapsed": false, 1911 | "scrolled": false 1912 | }, 1913 | "outputs": [ 1914 | { 1915 | "name": "stdout", 1916 | "output_type": "stream", 1917 | "text": [ 1918 | "52\n52\n feature_name importance\n0 DSSM_300 1522\n1 seq1_max_degree 1460\n2 pagerank_feature2 1374\n3 fuzzy_1 1246\n4 pagerank_feature1 1182\n5 fuzzy_3 1154\n6 fuzzy_4 1119\n7 doubleSide 890\n8 sameWord 750\n9 OOFCNN 743\n10 fasttextDistance_1 611\n11 levenshteinDistance 590\n12 fuzzy_5 562\n13 TFIDFDistance_2 550\n14 specialConcurrence 519\n15 DSSM_600 512\n16 fuzzy_2 476\n17 word2vecDistance_1 448\n18 seq2_max_degree 430\n19 fasttextWeightDistance_2 425\n20 ARC 399\n21 fasttextWeightDistance_1 397\n22 singleSide 392\n23 word2vec 360\n24 ochiaiDistance 340\n25 OOFLSTM 328\n26 w2vWeightDistance_1 316\n27 fasttextKeywordDistance_1 307\n28 TextCNN 296\n29 TFIDFDistance_3 281\n30 MVLSTM 254\n31 sorensenDistance 244\n32 concurrence 229\n33 w2vWeightDistance_2 222\n34 TFIDFDistance_4 219\n35 fasttextWeightDistance_3 204\n36 w2vKeywordDistance_1 191\n37 fasttextKeywordDistance_2 185\n38 TFIDFDistance_1 173\n39 TFIDFDistance_5 173\n40 word2vecDistance_2 162\n41 w2vWeightDistance_3 149\n42 word2vecDistance_3 142\n43 fasttextKeywordDistance_3 133\n44 fasttext 120\n45 jaccardDistance 119\n46 fasttextDistance_3 112\n47 w2vKeywordDistance_2 109\n48 w2vKeywordDistance_3 102\n49 fasttextDistance_2 94\n50 seq1_max_component 85\n51 diceDistance 71\n0 concurrence 229\n1 levenshteinDistance 590\n2 sorensenDistance 244\n3 sameWord 750\n4 specialConcurrence 519\n5 diceDistance 71\n6 ochiaiDistance 340\n7 jaccardDistance 119\n8 word2vecDistance_1 
448\n9 word2vecDistance_2 162\n10 word2vecDistance_3 142\n11 fasttextDistance_1 611\n12 fasttextDistance_2 94\n13 fasttextDistance_3 112\n14 w2vWeightDistance_1 316\n15 w2vWeightDistance_2 222\n16 w2vWeightDistance_3 149\n17 w2vKeywordDistance_1 191\n18 w2vKeywordDistance_2 109\n19 w2vKeywordDistance_3 102\n20 TFIDFDistance_1 173\n21 TFIDFDistance_2 550\n22 TFIDFDistance_3 281\n23 TFIDFDistance_4 219\n24 TFIDFDistance_5 173\n25 fasttextWeightDistance_1 397\n26 fasttextWeightDistance_2 425\n27 fasttextWeightDistance_3 204\n28 fasttextKeywordDistance_1 307\n29 fasttextKeywordDistance_2 185\n30 fasttextKeywordDistance_3 133\n31 seq1_max_degree 1460\n32 seq2_max_degree 430\n33 seq1_max_component 85\n34 pagerank_feature1 1182\n35 pagerank_feature2 1374\n36 fuzzy_1 1246\n37 fuzzy_2 476\n38 fuzzy_3 1154\n39 fuzzy_4 1119\n40 fuzzy_5 562\n41 word2vec 360\n42 fasttext 120\n43 singleSide 392\n44 doubleSide 890\n45 DSSM_300 1522\n46 DSSM_600 512\n47 MVLSTM 254\n48 ARC 399\n49 TextCNN 296\n50 OOFCNN 743\n51 OOFLSTM 328\n" 1919 | ] 1920 | } 1921 | ], 1922 | "source": [ 1923 | "name = ['concurrence', 'levenshteinDistance', 'sorensenDistance', \r\n", 1924 | " 'sameWord', 'specialConcurrence',\r\n", 1925 | " 'diceDistance', 'ochiaiDistance', 'jaccardDistance', \r\n", 1926 | " 'word2vecDistance_1', 'word2vecDistance_2', 'word2vecDistance_3', \r\n", 1927 | " 'fasttextDistance_1', 'fasttextDistance_2', 'fasttextDistance_3', \r\n", 1928 | " 'w2vWeightDistance_1', 'w2vWeightDistance_2', 'w2vWeightDistance_3',\r\n", 1929 | " 'w2vKeywordDistance_1', 'w2vKeywordDistance_2', 'w2vKeywordDistance_3',\r\n", 1930 | " 'TFIDFDistance_1', 'TFIDFDistance_2', 'TFIDFDistance_3', \r\n", 1931 | " 'TFIDFDistance_4', 'TFIDFDistance_5', \r\n", 1932 | " 'fasttextWeightDistance_1', 'fasttextWeightDistance_2', 'fasttextWeightDistance_3',\r\n", 1933 | " 'fasttextKeywordDistance_1', 'fasttextKeywordDistance_2', 'fasttextKeywordDistance_3',\r\n", 1934 | " 'seq1_max_degree', 'seq2_max_degree', 'seq1_max_component',\r\n", 1935 | " 'pagerank_feature1', 'pagerank_feature2',\r\n", 1936 | " 'fuzzy_1', 'fuzzy_2', 'fuzzy_3', 'fuzzy_4', 'fuzzy_5', 'word2vec', 'fasttext',\r\n", 1937 | " 'singleSide', 'doubleSide',\r\n", 1938 | " # 'seq1_neighbor', 'seq2_neighbor', 'total_neighbor',\r\n", 1939 | " 'DSSM_300', 'DSSM_600', \r\n", 1940 | " 'MVLSTM', 'ARC','TextCNN','OOFCNN' ,'OOFLSTM' ]\r\n", 1941 | " \r\n", 1942 | "print(len(name))\r\n", 1943 | "print(gbm.feature_importance().__len__())\r\n", 1944 | "feature_importance = pd.DataFrame({'feature_name': name[:gbm.feature_importance().__len__()], \r\n", 1945 | " 'importance': gbm.feature_importance()}).sort_values(by='importance', ascending=False).reset_index(drop = True)\r\n", 1946 | "print(feature_importance)\r\n", 1947 | "for i in range(len(gbm.feature_importance())):\r\n", 1948 | " print(i, name[i], gbm.feature_importance()[i])" 1949 | ] 1950 | }, 1951 | { 1952 | "cell_type": "markdown", 1953 | "metadata": { 1954 | "id": "D54F3E9A7A8244968747F2C45CF15329", 1955 | "mdEditEnable": false 1956 | }, 1957 | "source": [ 1958 | "## 测试结果" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "code", 1963 | "execution_count": 12, 1964 | "metadata": { 1965 | "id": "70BD1ABB77634747A35495A26496A518", 1966 | "collapsed": false, 1967 | "scrolled": false 1968 | }, 1969 | "outputs": [ 1970 | { 1971 | "name": "stdout", 1972 | "output_type": "stream", 1973 | "text": [ 1974 | "2019-08-11 20:18:38.544223\n" 1975 | ] 1976 | }, 1977 | { 1978 | "ename": "NameError", 1979 | "evalue": "name 'val_x' is not defined", 1980 | 
"traceback": [ 1981 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1982 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 1983 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 1984 | "\u001b[0;31mNameError\u001b[0m: name 'val_x' is not defined" 1985 | ], 1986 | "output_type": "error" 1987 | } 1988 | ], 1989 | "source": [ 1990 | "%%time\n", 1991 | "print(datetime.now())\n", 1992 | "# gbm = lgb.Booster(model_file='./handled_data/lgb_2kw_650.model')\n", 1993 | "pred = gbm.predict(val_x, num_iteration=gbm.best_iteration)\n", 1994 | "\n", 1995 | "print('end predict')\n", 1996 | "result = []\n", 1997 | "for item in pred:\n", 1998 | " if item>=0.5:\n", 1999 | " result.append(1)\n", 2000 | " else:\n", 2001 | " result.append(0)\n", 2002 | "count = 0\n", 2003 | "pos_count = 0\n", 2004 | "print('pos_predict:',np.sum(result))\n", 2005 | "\n", 2006 | "val_y_list = val_y.tolist()\n", 2007 | "for i in range(len(pred)):\n", 2008 | " if val_y_list[i] == 1 and result[i] == val_y_list[i]:\n", 2009 | " pos_count += 1\n", 2010 | " if result[i] == val_y_list[i]:\n", 2011 | " count+=1\n", 2012 | "print('acc:',count/len(pred))\n", 2013 | "print('pos_right_count:',pos_count)\n", 2014 | "print('pos_label:',np.sum(val_y.tolist()))\n", 2015 | "print('pos_acc:',pos_count / np.sum(val_y.tolist()))" 2016 | ] 2017 | }, 2018 | { 2019 | "cell_type": "markdown", 2020 | "metadata": { 2021 | "id": "A3E2CEF3CFC641CC9E2A3CD628F7C161", 2022 | "mdEditEnable": false 2023 | }, 2024 | "source": [ 2025 | "## 提交文件" 2026 | ] 2027 | }, 2028 | { 2029 | "cell_type": "code", 2030 | "execution_count": 13, 2031 | "metadata": { 2032 | "id": "952FBCFDA61D40D6B19562CB54E2575D", 2033 | "collapsed": false, 2034 | "scrolled": false 2035 | }, 2036 | "outputs": [ 2037 | { 2038 | "ename": "LightGBMError", 2039 | "evalue": "Could not open ./handled_data/lgb_power_450.model", 2040 | "traceback": [ 2041 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 2042 | "\u001b[0;31mLightGBMError\u001b[0m Traceback (most recent call last)", 2043 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 2044 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, params, train_set, model_file, model_str, silent)\u001b[0m\n\u001b[1;32m 1662\u001b[0m \u001b[0mc_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1663\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbyref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout_num_iterations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1664\u001b[0;31m ctypes.byref(self.handle)))\n\u001b[0m\u001b[1;32m 1665\u001b[0m \u001b[0mout_num_class\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_int\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1666\u001b[0m _safe_call(_LIB.LGBM_BoosterGetNumClasses(\n", 2045 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py\u001b[0m in \u001b[0;36m_safe_call\u001b[0;34m(ret)\u001b[0m\n\u001b[1;32m 45\u001b[0m \"\"\"\n\u001b[1;32m 46\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m!=\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLightGBMError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdecode_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLGBM_GetLastError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 2046 | "\u001b[0;31mLightGBMError\u001b[0m: Could not open ./handled_data/lgb_power_450.model" 2047 | ], 2048 | "output_type": "error" 2049 | } 2050 | ], 2051 | "source": [ 2052 | "%%time\n", 2053 | "\n", 2054 | "gbm = lgb.Booster(model_file='./handled_data/final_lgb_450.model')\n", 2055 | "feature_normal_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_normal.csv', header=None)\n", 2056 | "print('loaded feature normal --->',feature_normal_test.shape)\n", 2057 | "feature_statistics_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_statistics.csv', header=None)\n", 2058 | "print('loaded feature statistics --->',feature_statistics_test.shape)\n", 2059 | "feature_pagerank_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_pagerank.csv', header=None)\n", 2060 | "print('loaded feature pagerank --->',feature_pagerank_test.shape)\n", 2061 | "feature_fuzzy_test = pd.read_csv('./handled_data/test_feature/test_feature_fuzzy.csv', header=None)\n", 2062 | "print('loaded fuzzy feature --->',feature_fuzzy_test.shape)\n", 2063 | "feature_powerful_test = pd.read_csv('./handled_data/test_feature/test_feature_powerful.csv', header=None)\n", 2064 | "print('loaded powerfule feature --->',feature_powerful_test.shape)\n", 2065 | "\n", 2066 | "# 融合特征\n", 2067 | "feat_dssm_300_test = pd.read_csv('./predictions/test_dssm_300_2kw.csv.gz', header=None)\n", 2068 | "print('loaded feature dssm_300 --->', feat_dssm_300_test.shape)\n", 2069 | "feat_dssm_600_test = pd.read_csv('./predictions/test_dssm_600_2kw.csv.gz', header=None)\n", 2070 | "print('loaded feature dssm_600 --->', feat_dssm_600_test.shape)\n", 2071 | "feat_mvlstm_test = pd.read_csv('./predictions/test_mv_2kw.csv.gz', header=None)\n", 2072 | "print('loaded feature MVLSTM --->', feat_mvlstm_test.shape)\n", 2073 | "feat_arc_test = pd.read_csv('./predictions/test_arc_2kw.csv.gz', header=None)\n", 2074 | "print('loaded feature ARC --->', feat_arc_test.shape)\n", 2075 | "feat_textcnn_test = pd.read_csv('./predictions/test_textcnn_2kw.csv.gz', header=None)\n", 2076 | "print('loaded feature TextCNN --->', feat_textcnn_test.shape)\n", 2077 | "\n", 2078 | "# feat_lstm_dssm_test = pd.read_csv('./predictions/test_lstm_dssm_2kw.csv', header=None)\n", 2079 | "# print('loaded feature LSTM-DSSM --->', feat_lstm_dssm_test.shape)\n", 2080 | "\n", 2081 | "print('------------finish reading file, concating feature------------')\n", 2082 | "test_feature = pd.concat([feature_normal_test, feature_statistics_test, feature_pagerank_test, \n", 2083 | " feature_fuzzy_test, feature_powerful_test, \n", 2084 | " feat_dssm_300_test,\n", 2085 | " feat_dssm_600_test,\n", 2086 | " feat_mvlstm_test,\n", 2087 | " feat_arc_test,\n", 2088 | " feat_textcnn_test,\n", 2089 | " # feat_lstm_dssm_test\n", 2090 | " ], axis=1)\n", 2091 | "print('concated feature, shape --->', test_feature.shape)" 2092 | ] 2093 | }, 2094 | { 2095 | "cell_type": "code", 2096 | "execution_count": 7, 2097 | "metadata": { 
2098 | "id": "C709A47F9B8F45119C4BB2DEF2F96AFA", 2099 | "collapsed": false, 2100 | "scrolled": false 2101 | }, 2102 | "outputs": [ 2103 | { 2104 | "name": "stdout", 2105 | "output_type": "stream", 2106 | "text": [ 2107 | "finish prediction, shpae ---> (20000000, 3)\n" 2108 | ] 2109 | } 2110 | ], 2111 | "source": [ 2112 | "%%time\n", 2113 | "pred_test = pd.DataFrame(gbm.predict(test_feature, num_iteration=gbm.best_iteration))\n", 2114 | "\n", 2115 | "test_data = pd.read_csv(test_data_path, header=None)\n", 2116 | "submission = pd.concat([test_data[0], test_data[2], pred_test], axis=1)\n", 2117 | "print('finish prediction, shpae --->', submission.shape)\n", 2118 | "submission.to_csv('./predictions/lgb_submission_power_450.csv', header=None, index=False)" 2119 | ] 2120 | }, 2121 | { 2122 | "cell_type": "code", 2123 | "execution_count": 10, 2124 | "metadata": { 2125 | "id": "B5882F60E2E9433285E7C3C4B65EC5D7", 2126 | "collapsed": false, 2127 | "scrolled": false 2128 | }, 2129 | "outputs": [ 2130 | { 2131 | "name": "stdout", 2132 | "output_type": "stream", 2133 | "text": [ 2134 | "Kesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 ./predictions/lgb_submission_power_450.csv (565534.77 KiB)\n> 已上传 100 %\n> 文件已上传 \n> 服务器响应: 200 提交成功,请等待评审完成\n> 提交完成\n" 2135 | ] 2136 | } 2137 | ], 2138 | "source": [ 2139 | "!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file ./predictions/lgb_submission_power_450.csv -token 8be4f72dc2395a8d" 2140 | ] 2141 | }, 2142 | { 2143 | "cell_type": "markdown", 2144 | "metadata": { 2145 | "id": "BA927732609B435D981ABD77FF0A00E3", 2146 | "mdEditEnable": false 2147 | }, 2148 | "source": [ 2149 | "## Final提交文件" 2150 | ] 2151 | }, 2152 | { 2153 | "cell_type": "code", 2154 | "execution_count": 4, 2155 | "metadata": { 2156 | "id": "166E19CE673F447A84B2C4EC610BB4B5", 2157 | "collapsed": false, 2158 | "scrolled": false 2159 | }, 2160 | "outputs": [], 2161 | "source": [ 2162 | "# !wc -l /home/kesci/work/predictions/testfin_mvlstm.csv\n", 2163 | "# !wc -l /home/kesci/work/predictions/testfin_textcnn.csv\n", 2164 | "!killall python" 2165 | ] 2166 | }, 2167 | { 2168 | "cell_type": "code", 2169 | "execution_count": 6, 2170 | "metadata": { 2171 | "id": "FE7C8E4888A14AAA81B133E6F0F80CD4", 2172 | "collapsed": false, 2173 | "scrolled": false 2174 | }, 2175 | "outputs": [ 2176 | { 2177 | "name": "stdout", 2178 | "output_type": "stream", 2179 | "text": [ 2180 | "[ 1 2 3 ... 10318080 10318081 10318082]\n[ 1 2 3 ... 
10318080 10318081 10318082]\n10318082\n10318082\n\n[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]\n[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]\n20\n20\nCPU times: user 8.47 s, sys: 2.29 s, total: 10.8 s\nWall time: 11 s\n" 2181 | ] 2182 | } 2183 | ], 2184 | "source": [ 2185 | "%%time\n", 2186 | "# final = pd.read_csv(final_data_path, header=None)\n", 2187 | "# result = pd.read_csv('./predictions/final_submission_20190811.csv', header=None)\n", 2188 | "print(final[0].unique())\n", 2189 | "print(result[0].unique())\n", 2190 | "print(final[0].unique().__len__())\n", 2191 | "print(result[0].unique().__len__())\n", 2192 | "print()\n", 2193 | "\n", 2194 | "print(final[2].unique())\n", 2195 | "print(result[1].unique())\n", 2196 | "print(final[2].unique().__len__())\n", 2197 | "print(result[1].unique().__len__())" 2198 | ] 2199 | }, 2200 | { 2201 | "cell_type": "code", 2202 | "execution_count": 3, 2203 | "metadata": { 2204 | "id": "84E58641682D4B9BB34BA28DA4F058BF", 2205 | "collapsed": false, 2206 | "scrolled": false 2207 | }, 2208 | "outputs": [ 2209 | { 2210 | "name": "stderr", 2211 | "output_type": "stream", 2212 | "text": [ 2213 | "\r 0%| | 0/10 [00:00 (100000000, 3)\nCPU times: user 2h 26min 37s, sys: 4min 11s, total: 2h 30min 49s\nWall time: 50min 28s\n" 2375 | ] 2376 | } 2377 | ], 2378 | "source": [ 2379 | "%%time\n", 2380 | "print(datetime.now())\n", 2381 | "CHUNKSIZE = 10000000\n", 2382 | "gbm = lgb.Booster(model_file='./handled_data/final_lgb_450.model')\n", 2383 | "\n", 2384 | "feature_normal_final = pd.read_csv('./handled_data/final/final_feature_normal.csv', header=None, chunksize=CHUNKSIZE)\n", 2385 | "feature_statistics_final = pd.read_csv('./handled_data/final/final_feature_statistics.csv', header=None, chunksize=CHUNKSIZE)\n", 2386 | "feature_pagerank_final = pd.read_csv('./handled_data/final/final_feature_pagerank.csv', header=None, chunksize=CHUNKSIZE)\n", 2387 | "feature_fuzzy_final = pd.read_csv('./handled_data/final/final_feature_fuzzy.csv', header=None, chunksize=CHUNKSIZE)\n", 2388 | "feature_powerful_final = pd.read_csv('./handled_data/final/final_feature_powerful.csv', header=None, chunksize=CHUNKSIZE)\n", 2389 | "\n", 2390 | "feat_dssm_300_final = pd.read_csv('./predictions/testfin_300_dssm.csv_10kw.gz', header=None, chunksize=CHUNKSIZE)\n", 2391 | "feat_dssm_600_final = pd.read_csv('./predictions/testfin_600_dssm.csv', header=None, chunksize=CHUNKSIZE)\n", 2392 | "feat_mvlstm_final = pd.read_csv('./predictions/testfin_mvlstm.csv', header=None, chunksize=CHUNKSIZE)\n", 2393 | "feat_arc_final = pd.read_csv('./predictions/testfin_arc.csv', header=None, chunksize=CHUNKSIZE)\n", 2394 | "feat_textcnn_final = pd.read_csv('./predictions/testfin_textcnn.csv', header=None, chunksize=CHUNKSIZE)\n", 2395 | "feat_oofcnn_final = pd.read_csv('./predictions/testfin_oofcnn.csv', header=None, chunksize=CHUNKSIZE)\n", 2396 | "feat_ooflstm_final = pd.read_csv('./predictions/testfin_ooflstm.csv', header=None, chunksize=CHUNKSIZE)\n", 2397 | "final_data = pd.read_csv(final_data_path, header=None, chunksize=CHUNKSIZE)\n", 2398 | "result = []\n", 2399 | "print('start iteration')\n", 2400 | "for i in tqdm(range(10)):\n", 2401 | " features = pd.concat([feature_normal_final.get_chunk(), \n", 2402 | " feature_statistics_final.get_chunk(), feature_pagerank_final.get_chunk(),\n", 2403 | " feature_fuzzy_final.get_chunk(), feature_powerful_final.get_chunk(),\n", 2404 | " feat_dssm_300_final.get_chunk(),\n", 2405 | " feat_dssm_600_final.get_chunk(),\n", 2406 | " 
feat_mvlstm_final.get_chunk(),\n", 2407 | " feat_arc_final.get_chunk(),\n", 2408 | " feat_textcnn_final.get_chunk(),\n", 2409 | " feat_oofcnn_final.get_chunk(),\n", 2410 | " feat_ooflstm_final.get_chunk()], axis=1)\n", 2411 | " \n", 2412 | " idx = final_data.get_chunk()\n", 2413 | " pred = gbm.predict(features, num_iteration=gbm.best_iteration)\n", 2414 | " result.append(pd.DataFrame({'query_id': idx[0].tolist(), 'title_id': idx[2].tolist(), 'label': pred}))\n", 2415 | " print(result[i].shape)\n", 2416 | " \n", 2417 | " del pred\n", 2418 | " del idx\n", 2419 | " del features\n", 2420 | " gc.collect()\n", 2421 | " \n", 2422 | "result = pd.concat(result)\n", 2423 | "result.columns = range(result.shape[1])\n", 2424 | "print(result.describe())\n", 2425 | "print('finish prediction, shpae --->', result.shape)\n", 2426 | "result.to_csv('./predictions/final_submission_20190811_2.csv', header=None, index=False)\n", 2427 | "\n", 2428 | "# feature_append_test = pd.read_csv('./handled_data/final/test_feature_append.csv', header=None)\n", 2429 | "# print('loaded append feature --->',feature_append_test.shape)\n", 2430 | "\n", 2431 | "# 融合特征\n", 2432 | "# with gzip.open('./predictions/testfin_300_dssm.csv_10kw.gz', 'rb') as f:\n", 2433 | "# feat_dssm_300_final = pd.read_csv(f, header=None)\n", 2434 | "# print('loaded feature dssm_300 --->', feat_dssm_300_final.shape)\n", 2435 | " \n", 2436 | "# with gzip.open('./predictions/testfin_600_dssm.csv_10kw.gz', 'rb') as f:\n", 2437 | "# feat_dssm_600_final = pd.read_csv(f, header=None)\n", 2438 | "# feat_dssm_600_final = pd.read_csv('./predictions/testfin_600_dssm_p1.csv', header=None)\n", 2439 | "# print('loaded feature dssm_600 --->', feat_dssm_600_final.shape)\n", 2440 | " \n", 2441 | "# with gzip.open('./predictions/test_mv_2kw.csv.gz', 'rb') as f:\n", 2442 | "# feat_mvlstm_test = pd.read_csv(f, header=None)\n", 2443 | "# print('loaded feature MVLSTM --->', feat_mvlstm_test.shape)\n", 2444 | " \n", 2445 | "# with gzip.open('./predictions/test_arc_2kw.csv.gz', 'rb') as f:\n", 2446 | "# feat_arc_test = pd.read_csv(f, header=None)\n", 2447 | "# print('loaded feature ARC --->', feat_arc_test.shape)\n", 2448 | " \n", 2449 | "# with gzip.open('./predictions/test_textcnn_2kw.csv.gz', 'rb') as f:\n", 2450 | "# feat_textcnn_test = pd.read_csv(f, header=None)\n", 2451 | "# print('loaded feature TextCNN --->', feat_textcnn_test.shape)\n", 2452 | "\n", 2453 | "# feat_lstm_dssm_test = pd.read_csv('./predictions/test_lstm_dssm_2kw.csv', header=None)\n", 2454 | "# print('loaded feature LSTM-DSSM --->', feat_lstm_dssm_test.shape)" 2455 | ] 2456 | }, 2457 | { 2458 | "cell_type": "code", 2459 | "execution_count": 4, 2460 | "metadata": { 2461 | "id": "D3072559E2AB4D09B7A6B1A5C5F99C5D", 2462 | "hide_input": false, 2463 | "collapsed": false, 2464 | "scrolled": false 2465 | }, 2466 | "outputs": [ 2467 | { 2468 | "name": "stdout", 2469 | "output_type": "stream", 2470 | "text": [ 2471 | "Kesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 ./predictions/final_submission_20190811_2.csv (2877771.46 KiB)\n> 已上传 100 %\n> 文件已上传 \n> 服务器响应: 200 提交成功\n> 提交完成\n" 2472 | ] 2473 | } 2474 | ], 2475 | "source": [ 2476 | "!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file ./predictions/final_submission_20190811_2.csv -token 8be4f72dc2395a8d -mode archive " 2477 | ] 2478 | }, 2479 | { 2480 | "cell_type": "markdown", 2481 | "metadata": { 2482 | "id": "F8C1BDC9AC484B3F81EA96EEF49108D9", 2483 | "mdEditEnable": false 2484 | }, 2485 | "source": [ 2486 | "# end" 2487 | ] 2488 | } 2489 | ], 2490 
| "metadata": { 2491 | "kernelspec": { 2492 | "name": "python3", 2493 | "display_name": "Python 3", 2494 | "language": "python" 2495 | }, 2496 | "language_info": { 2497 | "name": "python", 2498 | "version": "3.6.4", 2499 | "mimetype": "text/x-python", 2500 | "codemirror_mode": { 2501 | "name": "ipython", 2502 | "version": 3 2503 | }, 2504 | "pygments_lexer": "ipython3", 2505 | "nbconvert_exporter": "python", 2506 | "file_extension": ".py" 2507 | } 2508 | }, 2509 | "nbformat": 4, 2510 | "nbformat_minor": 0 2511 | } 2512 | -------------------------------------------------------------------------------- /arcii.py: -------------------------------------------------------------------------------- 1 | """An implementation of Matching Layer.""" 2 | import typing 3 | 4 | from keras import backend as K 5 | from keras.engine import Layer 6 | 7 | 8 | class MatchingLayer(Layer): 9 | """ 10 | Layer that computes a matching matrix between samples in two tensors. 11 | :param normalize: Whether to L2-normalize samples along the 12 | dot product axis before taking the dot product. 13 | If set to True, then the output of the dot product 14 | is the cosine proximity between the two samples. 15 | :param matching_type: the similarity function for matching 16 | :param kwargs: Standard layer keyword arguments. 17 | Examples: 18 | >>> import matchzoo as mz 19 | >>> layer = mz.layers.MatchingLayer(matching_type='dot', 20 | ... normalize=True) 21 | >>> num_batch, left_len, right_len, num_dim = 5, 3, 2, 10 22 | >>> layer.build([[num_batch, left_len, num_dim], 23 | ... [num_batch, right_len, num_dim]]) 24 | """ 25 | 26 | def __init__(self, normalize: bool = False, 27 | matching_type: str = 'dot', **kwargs): 28 | """:class:`MatchingLayer` constructor.""" 29 | super().__init__(**kwargs) 30 | self._normalize = normalize 31 | self._validate_matching_type(matching_type) 32 | self._matching_type = matching_type 33 | self._shape1 = None 34 | self._shape2 = None 35 | 36 | @classmethod 37 | def _validate_matching_type(cls, matching_type: str = 'dot'): 38 | valid_matching_type = ['dot', 'mul', 'plus', 'minus', 'concat'] 39 | if matching_type not in valid_matching_type: 40 | raise ValueError(f"{matching_type} is not a valid matching type, " 41 | f"{valid_matching_type} expected.") 42 | 43 | def build(self, input_shape: list): 44 | """ 45 | Build the layer. 46 | :param input_shape: the shapes of the input tensors, 47 | for MatchingLayer we need tow input tensors. 48 | """ 49 | # Used purely for shape validation. 50 | if not isinstance(input_shape, list) or len(input_shape) != 2: 51 | raise ValueError('A `MatchingLayer` layer should be called ' 52 | 'on a list of 2 inputs.') 53 | self._shape1 = input_shape[0] 54 | self._shape2 = input_shape[1] 55 | for idx in 0, 2: 56 | if self._shape1[idx] != self._shape2[idx]: 57 | raise ValueError( 58 | 'Incompatible dimensions: ' 59 | f'{self._shape1[idx]} != {self._shape2[idx]}.' 60 | f'Layer shapes: {self._shape1}, {self._shape2}.' 61 | ) 62 | 63 | def call(self, inputs: list, **kwargs) -> typing.Any: 64 | """ 65 | The computation logic of MatchingLayer. 66 | :param inputs: two input tensors. 
67 | """ 68 | x1 = inputs[0] 69 | x2 = inputs[1] 70 | if self._matching_type == 'dot': 71 | if self._normalize: 72 | x1 = K.l2_normalize(x1, axis=2) 73 | x2 = K.l2_normalize(x2, axis=2) 74 | return K.tf.expand_dims(K.tf.einsum('abd,acd->abc', x1, x2), 3) 75 | else: 76 | if self._matching_type == 'mul': 77 | def func(x, y): 78 | return x * y 79 | elif self._matching_type == 'plus': 80 | def func(x, y): 81 | return x + y 82 | elif self._matching_type == 'minus': 83 | def func(x, y): 84 | return x - y 85 | elif self._matching_type == 'concat': 86 | def func(x, y): 87 | return K.tf.concat([x, y], axis=3) 88 | else: 89 | raise ValueError(f"Invalid matching type." 90 | f"{self._matching_type} received." 91 | f"Mut be in `dot`, `mul`, `plus`, " 92 | f"`minus` and `concat`.") 93 | x1_exp = K.tf.stack([x1] * self._shape2[1], 2) 94 | x2_exp = K.tf.stack([x2] * self._shape1[1], 1) 95 | return func(x1_exp, x2_exp) 96 | 97 | def compute_output_shape(self, input_shape: list) -> tuple: 98 | """ 99 | Calculate the layer output shape. 100 | :param input_shape: the shapes of the input tensors, 101 | for MatchingLayer we need tow input tensors. 102 | """ 103 | if not isinstance(input_shape, list) or len(input_shape) != 2: 104 | raise ValueError('A `MatchingLayer` layer should be called ' 105 | 'on a list of 2 inputs.') 106 | shape1 = list(input_shape[0]) 107 | shape2 = list(input_shape[1]) 108 | if len(shape1) != 3 or len(shape2) != 3: 109 | raise ValueError('A `MatchingLayer` layer should be called ' 110 | 'on 2 inputs with 3 dimensions.') 111 | if shape1[0] != shape2[0] or shape1[2] != shape2[2]: 112 | raise ValueError('A `MatchingLayer` layer should be called ' 113 | 'on 2 inputs with same 0,2 dimensions.') 114 | 115 | if self._matching_type in ['mul', 'plus', 'minus']: 116 | return shape1[0], shape1[1], shape2[1], shape1[2] 117 | elif self._matching_type == 'dot': 118 | return shape1[0], shape1[1], shape2[1], 1 119 | elif self._matching_type == 'concat': 120 | return shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2] 121 | else: 122 | raise ValueError(f"Invalid `matching_type`." 123 | f"{self._matching_type} received." 
124 | f"Must be in `mul`, `plus`, `minus` " 125 | f"`dot` and `concat`.") 126 | 127 | def get_config(self) -> dict: 128 | """Get the config dict of MatchingLayer.""" 129 | config = { 130 | 'normalize': self._normalize, 131 | 'matching_type': self._matching_type, 132 | } 133 | base_config = super(MatchingLayer, self).get_config() 134 | return dict(list(base_config.items()) + list(config.items())) 135 | 136 | 137 | queryInput = Input(shape=(30,600)) 138 | 139 | titleInput = Input(shape=(30,600)) 140 | 141 | conv_1d_left = Conv1D(128, 3, padding = 'same')(queryInput) 142 | conv_1d_right = Conv1D(128, 3, padding = 'same')(titleInput) 143 | 144 | matching_layer = MatchingLayer(matching_type='plus') 145 | embed_cross = matching_layer([conv_1d_left, conv_1d_right]) 146 | 147 | embed_cross = Conv2D(16, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 148 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 149 | 150 | embed_cross = Conv2D(32, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 151 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 152 | 153 | embed_cross = Conv2D(64, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 154 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 155 | 156 | 157 | embed_flat = Flatten()(embed_cross) 158 | 159 | x = Dense(256, activation='relu')(embed_flat) 160 | x = Dense(64, activation='relu')(x) 161 | x = Dense(1, activation='sigmoid')(x) 162 | 163 | model_arc = Model(inputs=[queryInput,titleInput], outputs=x) 164 | model_arc.compile(loss='binary_crossentropy', 165 | optimizer='adam', 166 | metrics=[metrics.mae, metrics.binary_accuracy]) 167 | 168 | -------------------------------------------------------------------------------- /cnn-dssm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras import backend 3 | from keras.layers import Activation, Input 4 | from keras.layers.core import Dense, Lambda, Reshape 5 | from keras.layers.convolutional import Convolution1D 6 | from keras.layers.merge import concatenate, dot 7 | from keras.models import Model 8 | from keras import metrics 9 | 10 | K = 300 11 | L = 128 12 | J = 2 13 | 14 | query = Input(shape = (10, 300)) 15 | pos_doc = Input(shape = (30, 300)) 16 | neg_docs = [Input(shape = (30, 300)) for j in range(J)] 17 | 18 | # 在 DSSM 的表示层使用了类似 TextCNN 的架构 19 | 20 | query_conv1 = Convolution1D(K, 1, padding = "same", 21 | input_shape = (None, WORD_DEPTH), 22 | activation = "tanh")(query) # See equation (2). 
23 | 24 | query_conv2 = Convolution1D(K, 2, padding = "same", 25 | input_shape = (None, WORD_DEPTH), 26 | activation = "tanh")(query) 27 | 28 | query_conv3 = Convolution1D(K, 3, padding = "same", 29 | input_shape = (None, WORD_DEPTH), 30 | activation = "tanh")(query) 31 | 32 | query_conv4 = Convolution1D(K, 4, padding = "same",dilation_rate=2, 33 | input_shape = (None, WORD_DEPTH), 34 | 35 | activation = "tanh")(query) 36 | 37 | query_conv5 = Convolution1D(K, 5, padding = "same",dilation_rate=2, 38 | input_shape = (None, WORD_DEPTH), 39 | activation = "tanh")(query) 40 | 41 | query_conv6 = Convolution1D(K, 6, padding = "same", 42 | input_shape = (None, WORD_DEPTH), 43 | activation = "tanh")(query) 44 | 45 | # Next, apply max pooling to the convolved query. 46 | # This keeps the maximum value of each feature column. 47 | 48 | query_max1 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv1) 49 | 50 | query_max2 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv2) 51 | 52 | query_max3 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv3) 53 | 54 | query_max4 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv4) 55 | 56 | query_max5 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv5) 57 | 58 | query_max6 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv6) 59 | 60 | query_concat_1_2 = concatenate([query_max1,query_max2]) 61 | 62 | query_concat_3_4 = concatenate([query_max3,query_max4]) 63 | 64 | query_concat_5_6 = concatenate([query_max5,query_max6]) 65 | 66 | 67 | query_sem1 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_1_2) 68 | 69 | query_sem2 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_3_4) 70 | 71 | query_sem3 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_5_6) 72 | 73 | # This step produces a single sentence vector for the query; it is a standard dense layer. 74 | 75 | 76 | query_concat = concatenate([query_sem1,query_sem2,query_sem3]) 77 | 78 | query_sem = Dense(L, activation = "tanh", input_dim = K*3)(query_concat) 79 | 80 | doc_conv1 = Convolution1D(K, 1, padding = "same", 81 | input_shape = (None, WORD_DEPTH), 82 | activation = "tanh") 83 | 84 | doc_conv2 = Convolution1D(K, 2, padding = "same", 85 | input_shape = (None, WORD_DEPTH), 86 | activation = "tanh") 87 | 88 | doc_conv3 = Convolution1D(K, 3, padding = "same", 89 | input_shape = (None, WORD_DEPTH), 90 | activation = "tanh") 91 | 92 | doc_conv4 = Convolution1D(K, 4, padding = "same",dilation_rate=2, 93 | input_shape = (None, WORD_DEPTH), 94 | activation = "tanh") 95 | 96 | doc_conv5 = Convolution1D(K, 5, padding = "same",dilation_rate=2, 97 | input_shape = (None, WORD_DEPTH), 98 | activation = "tanh") 99 | 100 | doc_conv6 = Convolution1D(K, 6, padding = "same", 101 | input_shape = (None, WORD_DEPTH), 102 | activation = "tanh") 103 | 104 | 105 | doc_max = Lambda(lambda x: backend.max(x, axis = 1), output_shape = (K, )) 106 | 107 | doc_sem1 = Dense(L, activation = "tanh", input_dim = K*2) 108 | 109 | doc_sem2 = Dense(L, activation = "tanh", input_dim = K*2) 110 | 111 | doc_sem3 = Dense(L, activation = "tanh", input_dim = K*2) 112 | 113 | doc_sem = Dense(L, activation = "tanh", input_dim = K*3) 114 | 115 | 116 | # Positive sample 117 | pos_doc_conv1 = doc_conv1(pos_doc) 118 | pos_doc_max1 = doc_max(pos_doc_conv1) 119 | 120 | pos_doc_conv2 = doc_conv2(pos_doc) 121 | pos_doc_max2 = doc_max(pos_doc_conv2) 122 | 123 | pos_doc_conv3 = doc_conv3(pos_doc) 124 | pos_doc_max3 = doc_max(pos_doc_conv3) 125 | 126 | pos_doc_conv4 = doc_conv4(pos_doc) 127 |
pos_doc_max4 = doc_max(pos_doc_conv4) 128 | 129 | pos_doc_conv5 = doc_conv5(pos_doc) 130 | pos_doc_max5 = doc_max(pos_doc_conv5) 131 | 132 | pos_doc_conv6 = doc_conv6(pos_doc) 133 | pos_doc_max6 = doc_max(pos_doc_conv6) 134 | 135 | pos_doc_concat_1_2 = concatenate([pos_doc_max1,pos_doc_max2]) 136 | 137 | pos_doc_concat_3_4 = concatenate([pos_doc_max3,pos_doc_max4]) 138 | 139 | pos_doc_concat_5_6 = concatenate([pos_doc_max5,pos_doc_max6]) 140 | 141 | 142 | pos_doc_sem1 = doc_sem1(pos_doc_concat_1_2) 143 | 144 | pos_doc_sem2 = doc_sem2(pos_doc_concat_3_4) 145 | 146 | pos_doc_sem3 = doc_sem3(pos_doc_concat_5_6) 147 | 148 | pos_doc_concat = concatenate([pos_doc_sem1,pos_doc_sem2,pos_doc_sem3]) 149 | 150 | pos_doc_sem = doc_sem(pos_doc_concat) 151 | 152 | 153 | # Negative samples 154 | 155 | neg_doc_convs1 = [doc_conv1(neg_doc) for neg_doc in neg_docs] 156 | neg_doc_maxes1 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs1] 157 | 158 | neg_doc_convs2 = [doc_conv2(neg_doc) for neg_doc in neg_docs] 159 | neg_doc_maxes2 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs2] 160 | 161 | neg_doc_convs3 = [doc_conv3(neg_doc) for neg_doc in neg_docs] 162 | neg_doc_maxes3 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs3] 163 | 164 | neg_doc_convs4 = [doc_conv4(neg_doc) for neg_doc in neg_docs] 165 | neg_doc_maxes4 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs4] 166 | 167 | neg_doc_convs5 = [doc_conv5(neg_doc) for neg_doc in neg_docs] 168 | neg_doc_maxes5 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs5] 169 | 170 | neg_doc_convs6 = [doc_conv6(neg_doc) for neg_doc in neg_docs] 171 | neg_doc_maxes6 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs6] 172 | 173 | neg_doc_concats_1_2 = [concatenate([l1,l2]) for l1,l2 in zip(neg_doc_maxes1,neg_doc_maxes2)] 174 | 175 | neg_doc_concats_3_4 = [concatenate([l3,l4]) for l3,l4 in zip(neg_doc_maxes3,neg_doc_maxes4)] 176 | 177 | neg_doc_concats_5_6 = [concatenate([l5,l6]) for l5,l6 in zip(neg_doc_maxes5,neg_doc_maxes6)] 178 | 179 | neg_doc_sems1 = [doc_sem1(neg_doc_concat) for neg_doc_concat in neg_doc_concats_1_2] 180 | 181 | neg_doc_sems2 = [doc_sem2(neg_doc_concat) for neg_doc_concat in neg_doc_concats_3_4] 182 | 183 | neg_doc_sems3 = [doc_sem3(neg_doc_concat) for neg_doc_concat in neg_doc_concats_5_6] 184 | 185 | neg_doc_concats = [concatenate([l1,l2,l3]) for l1,l2,l3 in zip(neg_doc_sems1,neg_doc_sems2,neg_doc_sems3)] 186 | 187 | 188 | neg_doc_sems = [doc_sem(neg_doc_concat) for neg_doc_concat in neg_doc_concats] 189 | 190 | # Compute the cosine similarity R(Q, D) between the query and each title 191 | 192 | R_Q_D_p = dot([query_sem, pos_doc_sem], axes = 1, normalize = True) # See equation (4).
193 | R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes = 1, normalize = True) for neg_doc_sem in neg_doc_sems] 194 | 195 | concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns) 196 | concat_Rs = Reshape((J + 1, 1))(concat_Rs) 197 | 198 | # In this step, each R(Q, D) is multiplied by gamma. 199 | # In the paper, gamma is the smoothing factor of the softmax; 200 | # here its value is learned by a CNN, a single 1x1 convolution kernel. 201 | 202 | weight = np.array([1]).reshape(1, 1, 1) 203 | with_gamma = Convolution1D(1, 1, padding = "same", 204 | input_shape = (J + 1, 1), 205 | activation = "linear", 206 | use_bias = False, 207 | weights = [weight])(concat_Rs) 208 | with_gamma = Reshape((J + 1, ))(with_gamma) 209 | 210 | prob = Activation("softmax")(with_gamma) 211 | 212 | model = Model(inputs = [query, pos_doc] + neg_docs, outputs = prob) 213 | model.compile(optimizer = "adadelta", loss = "categorical_crossentropy", 214 | metrics=[metrics.mae, metrics.binary_accuracy]) 215 | 216 | 217 | get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p]) 218 | get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns) -------------------------------------------------------------------------------- /images/Code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/Code.png -------------------------------------------------------------------------------- /images/arcii.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/arcii.png -------------------------------------------------------------------------------- /images/cnn-dssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/cnn-dssm.png -------------------------------------------------------------------------------- /images/lstm-dssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/lstm-dssm.png -------------------------------------------------------------------------------- /images/mvlstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/mvlstm.png -------------------------------------------------------------------------------- /images/params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/params.png -------------------------------------------------------------------------------- /images/textcnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/textcnn.png -------------------------------------------------------------------------------- /lstm-dssm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.layers import Permute 3 | from keras import backend 4 | from keras.layers import * 5 | from keras.layers.core import Dense, Lambda, Reshape 6 | from keras.layers.convolutional import Convolution1D 7 | from keras.layers.merge import
concatenate, dot 8 | from keras.models import Model 9 | from keras import metrics 10 | 11 | K = 300 12 | L = 128 13 | J = 2 14 | 15 | query = Input(shape = (10, 300)) 16 | pos_doc = Input(shape = (30, 300)) 17 | neg_docs = [Input(shape = (30, 300)) for j in range(J)] 18 | 19 | 20 | query_lstm1 = Bidirectional(CuDNNLSTM(K, return_sequences=True))(query) 21 | 22 | query_lstm2 = Bidirectional(CuDNNLSTM(K, return_sequences=True))(query_lstm1) 23 | 24 | # Attention 25 | query_a1 = Permute((2, 1))(query_lstm2) 26 | 27 | query_a3 = Dense(10, activation='softmax')(query_a1) 28 | 29 | query_a_probs = Permute((2, 1))(query_a3) 30 | 31 | query_attention_out = multiply([query_lstm2, query_a_probs]) 32 | 33 | query_lstm3 = Bidirectional(CuDNNLSTM(K))(query_attention_out) 34 | 35 | query_sem = Dense(L, activation = "tanh", input_dim = K)(query_lstm3) 36 | 37 | 38 | 39 | doc_lstm1 = Bidirectional(CuDNNLSTM(K, return_sequences=True)) 40 | 41 | doc_lstm2 = Bidirectional(CuDNNLSTM(K, return_sequences=True)) 42 | 43 | doc_a1 = Permute((2, 1))  # note: doc_a1, doc_a2 and doc_a_probs are defined but never used below; Permute is applied directly instead 44 | 45 | doc_a2 = Reshape((300, 10)) 46 | 47 | doc_att_dense = Dense(30, activation='softmax') 48 | 49 | doc_a_probs = Permute((2, 1)) 50 | 51 | 52 | doc_lstm3 = Bidirectional(CuDNNLSTM( 150 )) 53 | 54 | doc_sem = Dense(L, activation = "tanh", input_dim = K) 55 | 56 | 57 | # Positive sample 58 | 59 | pos_doc_lstm1 = doc_lstm1(pos_doc) 60 | 61 | pos_doc_lstm2 = doc_lstm2(pos_doc_lstm1) 62 | 63 | pos_doc_a1 = Permute((2, 1))(pos_doc_lstm2) 64 | 65 | pos_doc_a3 = doc_att_dense(pos_doc_a1) 66 | 67 | pos_doc_probs = Permute((2, 1))(pos_doc_a3) 68 | 69 | pos_doc_att_out = multiply([pos_doc_lstm2,pos_doc_probs]) 70 | 71 | pos_doc_lstm3 = doc_lstm3(pos_doc_att_out) 72 | 73 | pos_doc_sem = doc_sem(pos_doc_lstm3) 74 | 75 | # Negative samples 76 | 77 | neg_doc_lstm1 = [doc_lstm1(neg_doc) for neg_doc in neg_docs] 78 | neg_doc_lstm2 = [doc_lstm2(neg_doc) for neg_doc in neg_doc_lstm1] 79 | 80 | neg_doc_a1 = [Permute((2, 1))(neg_doc) for neg_doc in neg_doc_lstm2] 81 | 82 | neg_doc_a3 = [doc_att_dense(neg_doc) for neg_doc in neg_doc_a1] 83 | 84 | neg_doc_probs = [Permute((2, 1))(neg_doc) for neg_doc in neg_doc_a3] 85 | 86 | neg_doc_att_out = [multiply([lstm,prb]) for lstm,prb in zip(neg_doc_lstm2,neg_doc_probs)] 87 | 88 | neg_doc_lstm3 = [doc_lstm3(neg_doc) for neg_doc in neg_doc_att_out] 89 | 90 | neg_doc_sems = [doc_sem(neg_doc_lstm_mx) for neg_doc_lstm_mx in neg_doc_lstm3] 91 | 92 | R_Q_D_p = dot([query_sem, pos_doc_sem], axes = 1, normalize = True) # See equation (4). 93 | R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes = 1, normalize = True) for neg_doc_sem in neg_doc_sems] # See equation (4).
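# Added commentary: the J+1 cosine scores above (one for the positive title and
# one for each of the J negative titles) are concatenated below, scaled by a
# learned smoothing factor gamma (a 1x1 convolution with a single weight) and
# pushed through a softmax. With categorical cross-entropy, the intended target
# is presumably a one-hot vector with the positive document at index 0, so
# training maximises P(positive title | query).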
94 | 95 | concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns) 96 | concat_Rs = Reshape((J + 1, 1))(concat_Rs) 97 | 98 | weight = np.array([1]).reshape(1, 1, 1) 99 | with_gamma = Convolution1D(1, 1, padding = "same", 100 | input_shape = (J + 1, 1), 101 | activation = "linear", 102 | use_bias = False, 103 | weights = [weight])(concat_Rs) 104 | with_gamma = Reshape((J + 1, ))(with_gamma) 105 | 106 | prob = Activation("softmax")(with_gamma) 107 | 108 | model = Model(inputs = [query, pos_doc] + neg_docs, outputs = prob) 109 | model.compile(optimizer = "adam", loss = "categorical_crossentropy", 110 | metrics=[metrics.mae, metrics.binary_accuracy]) 111 | 112 | 113 | get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p]) 114 | get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns) -------------------------------------------------------------------------------- /mvlstm.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras import metrics 3 | from keras.layers import Input, Bidirectional, CuDNNLSTM, Dot, Reshape, Lambda, Dense 4 | from keras.models import Model 5 | 6 | queryInput = Input(shape=(30,600)) 7 | 8 | titleInput = Input(shape=(30,600)) 9 | 10 | rep_query = Bidirectional(CuDNNLSTM(128,return_sequences=True))(queryInput) 11 | rep_query = Bidirectional(CuDNNLSTM(128,return_sequences=True))(rep_query) 12 | 13 | rep_doc = Bidirectional(CuDNNLSTM(128,return_sequences=True))(titleInput) 14 | rep_doc = Bidirectional(CuDNNLSTM(128,return_sequences=True))(rep_doc) 15 | 16 | # Top-k matching layer 17 | matching_matrix = Dot(axes=[2, 2], normalize=False)([rep_query, rep_doc]) 18 | matching_signals = Reshape((-1,))(matching_matrix) 19 | matching_topk = Lambda(lambda x: K.tf.nn.top_k(x, k=50, sorted=True)[0])(matching_signals) 20 | 21 | # Multilayer perceptron layer. 22 | dnn = Dense(256,activation = 'relu')(matching_topk) 23 | dnn = Dense(64,activation = 'relu')(dnn)  # second MLP layer, stacked on the first 24 | out = Dense(1,activation = 'sigmoid')(dnn) 25 | model_mvlstm = Model(inputs=[queryInput,titleInput], outputs=out) 26 | model_mvlstm.compile(loss='binary_crossentropy', 27 | optimizer='adam', 28 | metrics=[metrics.mae, metrics.binary_accuracy]) 29 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # 2019 China Collegiate Computing Contest Big Data Challenge WriteUp 2 | Competition: [(Final Round) 2019 China Collegiate Computing Contest Big Data Challenge](https://www.kesci.com/home/competition/5cc51043f71088002c5b8840/content/1) 3 | 4 | We are team **改革春风吹满地**, which finished 15th overall; below is a brief summary of our methods and models. 5 | 6 | * [2019 China Collegiate Computing Contest Big Data Challenge WriteUp](#2019-china-collegiate-computing-contest-big-data-challenge-writeup) 7 | * [Traditional Model](#traditional-model) 8 | * [Deduplicating queries and titles](#deduplicating-queries-and-titles) 9 | * [Graph features](#graph-features) 10 | * [Basic statistical features](#basic-statistical-features) 11 | * [Word vector features](#word-vector-features) 12 | * [Feature importance analysis](#feature-importance-analysis) 13 | * [LightGBM model](#lightgbm-model) 14 | * [LightGBM parameters](#lightgbm-parameters) 15 | * [Training](#training) 16 | * [Deep Models](#deep-models) 17 | * [CNN-DSSM](#cnn-dssm) 18 | * [LSTM-DSSM](#lstm-dssm) 19 | * [mvlstm](#mvlstm) 20 | * [ARC II](#arc-ii) 21 | 22 | # Traditional Model 23 | 24 | ## Deduplicating queries and titles 25 | 26 | When training word vectors and computing word-frequency statistics, the queries and titles should first be deduplicated. 27 | 28 | ## Graph features 29 | 30 | *Each query or title is a unique node in the graph, and an edge is added for every query-title sentence pair. We planned to build both an undirected graph and a weighted graph. The largest undirected graph covered about 130 million records; while building it we mapped every query and title to an Int32 ID to reduce memory consumption.* 31 | 32 | > Trick: saving and loading the graph with Pickle is the fastest option and also gives the smallest files. 33 | 34 | **Undirected graph feature - max_clique**: the size of the largest clique (the gain was not significant, so this feature was dropped). 35 | 36 | **Undirected graph feature - max_degrees**: the number of edges (degree) of each node. 37 | 38 | **Undirected graph feature - max_components**: the size of the largest connected component each node belongs to. 39 | 40 | **Undirected graph feature - pagerank**: the PageRank value of each node, computed with Google's PageRank algorithm. Although we run it on an undirected graph, PageRank rates a node's importance by iterating over its in-links, so when the algorithm runs, each undirected edge is treated as a pair of directed edges by default. 41 | 
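A minimal sketch of how these graph features (degree, PageRank, connected components, and the shortestpath feature described below) could be computed with `networkx` and `pandas`; this is added for illustration, not the original competition code, and the `query_id` / `title_id` column names are hypothetical:

```python
import networkx as nx
import pandas as pd

# Hypothetical query-title pairs, already mapped to integer IDs.
pairs = pd.DataFrame({"query_id": [0, 0, 1, 1], "title_id": [100, 101, 100, 101]})

G = nx.Graph()
G.add_edges_from(zip(pairs["query_id"], pairs["title_id"]))

degrees = dict(G.degree())                  # max_degrees: edge count per node
pagerank = nx.pagerank(G)                   # pagerank: undirected edges become bidirectional links
comp_size = {n: len(c) for c in nx.connected_components(G) for n in c}  # max_components

pairs["query_degree"] = pairs["query_id"].map(degrees)
pairs["title_pagerank"] = pairs["title_id"].map(pagerank)
pairs["query_comp_size"] = pairs["query_id"].map(comp_size)

# shortestpath: temporarily drop the direct edge so the value is not trivially 1.
G.remove_edge(0, 100)
sp = nx.shortest_path_length(G, source=0, target=100)   # 3, via 0 -> 101 -> 1 -> 100
G.add_edge(0, 100)
```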
42 | **Undirected graph feature - HITS A and H values**: due to time constraints we did not manage to train a HITS model. HITS is similar to PageRank: it distinguishes hub pages from authority pages, where an authority page is a high-quality page about a particular domain or topic, and a hub page is one that links to many high-quality authority pages. The HITS algorithm assigns every node an A (authority) value and an H (hub) value that measure its importance. 43 | 44 | **Undirected graph feature - shortestpath**: the shortest path between a query and a title. Because we add an edge for every co-occurring query-title pair, the shortest path would trivially be 1, so when computing this feature we first delete that edge, compute the shortest path, and then add the edge back. 45 | 46 | **Undirected graph feature - neighbour**: the number of neighbours. The graph is so large that computing this runs out of memory, so we dropped this feature on the full dataset. 47 | 48 | **Weighted graph features**: we planned to weight the edges by the word-vector similarity between the query and title sentences or by BM25, but ran out of time to implement it. The feature set would roughly mirror the undirected graph features and mainly act as a refinement of them. 49 | 50 | ## Basic statistical features 51 | 52 | *The semi-final environment provides a 15-core CPU; using the Pool process-pool manager from the multiprocessing library greatly speeds up feature generation.* 53 | 54 | **concurrence**: the ratio of words shared by the query and the title to the total number of words. 55 | 56 | **levenshteinDistance**: edit distance (later refined with a more detailed analysis via fuzzywuzzy). 57 | 58 | **sorensenDistance**: Sorensen distance. 59 | 60 | **sameWord**: the number of distinct words shared by the query and the title, and the product over shared words of the probability that the word appears with label 1 (requires initialising pos_prob). 61 | 62 | **distance**: Dice Distance, Ochiai Distance, Jaccard Distance. 63 | 64 | **fuzzyDistance**: a more detailed edit-distance analysis based on the fuzzywuzzy library (a fuzzy string matching tool), including Simple Ratio, Partial Ratio, Token Sort Ratio (order-insensitive) and Token Set Ratio (deduplicated subset matching). 65 | 66 | **powerful words**: for samples with label 1, the frequency of words appearing in both the query and the title (two-sided probability) and of words appearing only in the query or only in the title (one-sided probability). 67 | 68 | ## Word vector features 69 | 70 | **word2vecDistance**: Cosine Distance, Euclidean Distance and Manhattan Distance based on word2vec embeddings (fastText embeddings were added later). 71 | 72 | **w2vWeightDistance**: Cosine Distance, Euclidean Distance and Manhattan Distance based on word2vec embeddings weighted by TF-IDF (fastText embeddings were added later). 73 | 74 | **NGramDistance**: word-level n-gram distance (word granularity did not work well, so this feature was dropped). 75 | 76 | ## Feature importance analysis 77 | 78 | The figure also includes features not described above; these are the outputs of the deep text-matching models, each used as a single feature. 79 | 80 | ![](images/Code.png) 81 | 82 | ## LightGBM model 83 | 84 | ### LightGBM parameters 85 | 86 | ![](images/params.png) 87 | 88 | ### Training 89 | 90 | *We took the last 30 million rows of the one-billion-row dataset as the training set, held out 20% for validation, and sampled so as to preserve the original label ratio.* 91 | 92 | # Deep Models 93 | 94 | ## CNN-DSSM 95 | 96 | The classic CNN-DSSM uses a single convolution as its representation layer; here we use a TextCNN-like architecture instead, with six convolutional layers of different kernel_size. 97 | 98 | 99 | ![cnn-dssm](images/cnn-dssm.png) 100 | 101 | References 102 | 103 | http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf 104 | 105 | https://github.com/airalcorn2/Deep-Semantic-Similarity-Model/blob/master/deep_semantic_similarity_keras.py 106 | 107 | ## LSTM-DSSM 108 | 109 | LSTM-DSSM uses Bi-LSTM + Attention as the representation layer of DSSM. 110 | 111 | ![lstm-dssm](images/lstm-dssm.png) 112 | 113 | ## mvlstm 114 | 115 | Bi-LSTM representations are used to build an interaction (alignment) matrix that matches the relationship between the two sentences. 116 | 117 | ![mvlstm](images/mvlstm.png) 118 | 119 | Reference 120 | 121 | https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/mvlstm.py 122 | 123 | ## ARC II 124 | 125 | ![arcii](images/arcii.png) 126 | 127 | Reference 128 | 129 | https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/arcii.py 130 | 131 | ## TextCNN 132 | ![textcnn](images/textcnn.png) -------------------------------------------------------------------------------- /textcnn.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import Model 3 | from keras import metrics 4 | 5 | # query input 6 | queryInput = Input(shape=(30,600)) 7 | 8 | # title input 9 | titleInput = Input(shape=(30,600)) 10 | 11 | x = TimeDistributed(Dense(150, activation='relu'))(queryInput) 12 | xlstm = CuDNNLSTM(150, return_sequences=True)(x) 13 | xlstm1 = GlobalMaxPooling1D()(xlstm) 14 | xa = concatenate([xlstm, x]) 15 | 16 | xconv1 = Convolution1D(filters=100, 17 | kernel_size=1, 18 | padding='same', 19 | activation='relu')(xa) 20 | xconv1 = GlobalMaxPooling1D()(xconv1) 21 | 22 | xconv2 = Convolution1D(filters=100, 23 | kernel_size=2, 24 | padding='same', 25 | 
activation='relu')(xa) 26 | xconv2 = GlobalMaxPooling1D()(xconv2) 27 | 28 | xconv3 = Convolution1D(filters=100, 29 | kernel_size=3, 30 | padding='same', 31 | activation='relu')(xa) 32 | xconv3 = GlobalMaxPooling1D()(xconv3) 33 | 34 | xconv4 = Convolution1D(filters=100, 35 | kernel_size=4,dilation_rate=2, 36 | padding='same', 37 | activation='relu')(xa) 38 | xconv4 = GlobalMaxPooling1D()(xconv4) 39 | 40 | xconv5 = Convolution1D(filters=100, 41 | kernel_size=5,dilation_rate=2, 42 | padding='same', 43 | activation='relu')(xa) 44 | xconv5 = GlobalMaxPooling1D()(xconv5) 45 | 46 | xconv6 = Convolution1D(filters=100, 47 | kernel_size=6, 48 | padding='same', 49 | activation='relu')(xa) 50 | xconv6 = GlobalMaxPooling1D()(xconv6) 51 | xgru = CuDNNGRU(300, return_sequences=True)(xa) 52 | x = concatenate([xconv1,xconv2,xconv3,xconv4,xconv5,xconv6,xlstm1]) 53 | x = Dropout(0.2)(x) 54 | x = Dense(200)(x) 55 | x_out = PReLU()(x) 56 | 57 | 58 | y = TimeDistributed(Dense(150, activation='relu'))(titleInput) 59 | ylstm = CuDNNLSTM(150, return_sequences=True)(y) 60 | ylstm1 = GlobalMaxPooling1D()(ylstm) 61 | ya = concatenate([ylstm, y]) 62 | 63 | yconv1 = Convolution1D(filters=100, 64 | kernel_size=1, 65 | padding='same', 66 | activation='relu')(ya) 67 | yconv1 = GlobalMaxPooling1D()(yconv1) 68 | 69 | yconv2 = Convolution1D(filters=100, 70 | kernel_size=2, 71 | padding='same', 72 | activation='relu')(ya) 73 | yconv2 = GlobalMaxPooling1D()(yconv2) 74 | 75 | yconv3 = Convolution1D(filters=100, 76 | kernel_size=3, 77 | padding='same', 78 | activation='relu')(ya) 79 | yconv3 = GlobalMaxPooling1D()(yconv3) 80 | 81 | yconv4 = Convolution1D(filters=100, 82 | kernel_size=4,dilation_rate=2, 83 | padding='same', 84 | activation='relu')(ya) 85 | yconv4 = GlobalMaxPooling1D()(yconv4) 86 | 87 | yconv5 = Convolution1D(filters=100, 88 | kernel_size=5,dilation_rate=2, 89 | padding='same', 90 | activation='relu')(ya) 91 | yconv5 = GlobalMaxPooling1D()(yconv5) 92 | 93 | yconv6 = Convolution1D(filters=100, 94 | kernel_size=6, 95 | padding='same', 96 | activation='relu')(ya) 97 | yconv6 = GlobalMaxPooling1D()(yconv6) 98 | ygru = CuDNNGRU(300, return_sequences=True)(ya) 99 | y = concatenate([yconv1,yconv2,yconv3,yconv4,yconv5,yconv6,ylstm1]) 100 | y = Dropout(0.2)(y) 101 | y = Dense(200)(y) 102 | y_out = PReLU()(y) 103 | 104 | # interaction 105 | x1,l,lc = [x_out,xlstm,xgru] 106 | 107 | x2,r,rc = [y_out,ylstm,ygru] 108 | 109 | cross1 = Dot(axes=[2, 2], normalize=True)([l,r]) 110 | cross1 = Reshape((-1, ))(cross1) 111 | cross1 = Dropout(0.5)(cross1) 112 | cross1 = Dense(200)(cross1) 113 | cross1 = PReLU()(cross1) 114 | 115 | cross2 = Dot(axes=[2, 2], normalize=True)([lc,rc]) 116 | cross2 = Reshape((-1, ))(cross2) 117 | cross2 = Dropout(0.5)(cross2) 118 | cross2 = Dense(200)(cross2) 119 | cross2 = PReLU()(cross2) 120 | 121 | diff = subtract([x1,x2]) 122 | mul = multiply([x1,x2]) 123 | x = concatenate([x1,x2,diff,mul,cross1,cross2]) 124 | x = BatchNormalization()(x) 125 | 126 | x = Dense(500)(x) 127 | x = PReLU()(x) 128 | x = Dropout(0.2)(x) 129 | 130 | 131 | hidden1 = Dense(200)(x) 132 | hidden1 = PReLU()(hidden1) 133 | hidden1 = Dropout(0.2)(hidden1) 134 | 135 | 136 | hidden2 = Dense(50)(hidden1) 137 | hidden2 = PReLU()(hidden2) 138 | hidden2 = Dropout(0.2)(hidden2) 139 | 140 | out = Dense(1, activation='sigmoid')(hidden2) 141 | model_t2 = Model(inputs=[queryInput,titleInput], outputs=out) 142 | model_t2.compile(loss='binary_crossentropy', 143 | optimizer='adam', 144 | metrics=[metrics.mae, metrics.binary_accuracy]) 
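# ---------------------------------------------------------------------------
# Illustrative usage sketch (added commentary, not part of the original script).
# Each input sample is assumed to be a pre-built (30, 600) matrix: 30 tokens,
# each represented by a 600-dimensional embedding; the data below is a random
# placeholder only.
import numpy as np

n_samples = 256
query_mat = np.random.rand(n_samples, 30, 600).astype("float32")
title_mat = np.random.rand(n_samples, 30, 600).astype("float32")
labels = np.random.randint(0, 2, size=(n_samples, 1)).astype("float32")

model_t2.fit([query_mat, title_mat], labels,
             batch_size=64, epochs=1, validation_split=0.2)
scores = model_t2.predict([query_mat, title_mat], batch_size=64)  # relevance scores in [0, 1]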
--------------------------------------------------------------------------------