├── Feature.ipynb ├── arcii.py ├── cnn-dssm.py ├── images ├── Code.png ├── arcii.png ├── cnn-dssm.png ├── lstm-dssm.png ├── mvlstm.png ├── params.png └── textcnn.png ├── lstm-dssm.py ├── mvlstm.py ├── readme.md └── textcnn.py /Feature.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "DA3BF7A274C9420984E6233A09460048", 7 | "mdEditEnable": false 8 | }, 9 | "source": [ 10 | "# 导包" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 5, 16 | "metadata": { 17 | "id": "96EE9D8C953142D7AEE81B639EA3DF92", 18 | "collapsed": false, 19 | "scrolled": true 20 | }, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "100000000 ./predictions/final_submission_20190811.csv\n\n1,1,0.53125741515603\n1,2,0.4392272420196072\n1,3,0.5879643281299144\n1,4,0.576160912203239\n1,5,0.5052100835619331\n1,6,0.4324311875604571\n1,7,0.4066308458541816\n2,1,0.5768602948272425\n2,2,0.6554118653967363\n2,3,0.5325662217303531\n1,128 770 122 1192,1,770 36 1192 8 33 10048 122 193 469 31 37\t\n1,128 770 122 1192,2,354 770 1192 40 9315 15 3545 14 522 3159 122 1645 4626 31 37\t\n1,128 770 122 1192,3,770 69380 1845 34 644 115 10320 11843 12348\t\n1,128 770 122 1192,4,770 770 4241 2131 7036 122 1192 851 2507\t\n1,128 770 122 1192,5,770 14692 3933 3194 27 10320 128 4346 1192\t\n1,128 770 122 1192,6,770 36 39 1192 266 108 5229 192 122 26831 11\t\n1,128 770 122 1192,7,770 36 10048 122 13 449 1192 33 708 30 37\t\n2,66 64 123 848,1,3589 458 848 66 26 123 848 388 17779 29794 27 1282 2435 389\t\n2,66 64 123 848,2,3589 458 848 66 26 123 848 388 17779 328 274 82424\t\n2,66 64 123 848,3,5218 1257 31 4712 6030 27 47 267 659 1392 3589 66 64 123 27 5692 848 819\t\n\n10318082,8,0.26414565025898734\n10318082,9,0.42693966127920707\n10318082,10,0.28341074642672387\n10318082,11,0.2947948388734799\n10318082,12,0.26474696779822326\n10318082,13,0.32344879648630276\n10318082,14,0.34928164192293326\n10318082,15,0.2033738233624376\n10318082,16,0.055942568163945425\n10318082,17,0.19353918772727668\n10318082,794 39322,8,14703 82 16 21 258 13958 15 39322 27 15488 794 91154 1075 39322 148 4633\t\n10318082,794 39322,9,19352 1424 9240 203 11529 27 6526 4452 19398 4813 733\t\n10318082,794 39322,10,23860 68 9 220 235 161 16 794 39410 1151 2448\t\n10318082,794 39322,11,37314 80 180 11 794 20 915 102 1424 9560 35 9240 31420 936 36 10939 236 2947\t\n10318082,794 39322,12,9683 2165 817 3541 1424 39322 35 55495 15 69\t\n10318082,794 39322,13,49 118 1424 39322 3909 1351 27 196 151 1585 15 52\t\n10318082,794 39322,14,8633 18020 83 262 11 1383 1424 154 679 5760 27 2860 2251 281\t\n10318082,794 39322,15,8633 25659 1121 1424 9240 10773 27 1851 24 47965 23\t\n10318082,794 39322,16,8633 18866 98 3526 624 85 527 31 27 124 2150 281\t\n10318082,794 39322,17,16001 145 794 75 2597 1424 35 10960 39 38276 2030\t\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "!wc -l ./predictions/final_submission_20190811.csv\n", 32 | "print()\n", 33 | "!head ./predictions/final_submission_20190811.csv\n", 34 | "!head /home/kesci/input/bytedance/bytedance_contest.final_2.csv\n", 35 | "print()\n", 36 | "!tail ./predictions/final_submission_20190811.csv\n", 37 | "!tail /home/kesci/input/bytedance/bytedance_contest.final_2.csv" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 13, 43 | "metadata": { 44 | "id": "24CDE881EEC648168F273287B4921514", 45 | "collapsed": false, 46 | "scrolled": true 47 
| }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Collecting fuzzywuzzy\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/d8/f1/5a267addb30ab7eaa1beab2b9323073815da4551076554ecc890a3595ec9/fuzzywuzzy-0.17.0-py2.py3-none-any.whl\nInstalling collected packages: fuzzywuzzy\nSuccessfully installed fuzzywuzzy-0.17.0\nCollecting jellyfish\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/3f/80/bcacc7affb47be7279d7d35225e1a932416ed051b315a7f9df20acf04cbe/jellyfish-0.7.2.tar.gz (133kB)\n\u001b[K 100% |████████████████████████████████| 143kB 7.6MB/s eta 0:00:01\n\u001b[?25hBuilding wheels for collected packages: jellyfish\n Running setup.py bdist_wheel for jellyfish ... \u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/bc/b7/78/6736d761d7635d2af9579e040f342b5482850d856d26cbefa3\nSuccessfully built jellyfish\nInstalling collected packages: jellyfish\nSuccessfully installed jellyfish-0.7.2\nCollecting pyemd\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/c0/c5/7fea8e7a71cd026b30ed3c40e4c5ea13a173e28f8855da17e25271e8f545/pyemd-0.5.1.tar.gz (91kB)\n\u001b[K 100% |████████████████████████████████| 92kB 7.5MB/s eta 0:00:01\n\u001b[?25hRequirement already satisfied: numpy<2.0.0,>=1.9.0 in /opt/conda/lib/python3.6/site-packages (from pyemd)\nBuilding wheels for collected packages: pyemd\n Running setup.py bdist_wheel for pyemd ... \u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/94/20/c1/ccdf0e9878f5c76def850603e62a572746036829f8353804bd\nSuccessfully built pyemd\nInstalling collected packages: pyemd\nSuccessfully installed pyemd-0.5.1\nCollecting python-levenshtein\n Downloading https://pypi.tuna.tsinghua.edu.cn/packages/42/a9/d1785c85ebf9b7dfacd08938dd028209c34a0ea3b1bcdb895208bd40a67d/python-Levenshtein-0.12.0.tar.gz (48kB)\n\u001b[K 100% |████████████████████████████████| 51kB 5.9MB/s eta 0:00:01\n\u001b[?25hRequirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from python-levenshtein)\nBuilding wheels for collected packages: python-levenshtein\n Running setup.py bdist_wheel for python-levenshtein ... 
\u001b[?25ldone\n\u001b[?25h Stored in directory: /home/kesci/.cache/pip/wheels/ef/af/8f/b3250804480b8d14ca55d436129a2fb53798a0ae9287b686c0\nSuccessfully built python-levenshtein\nInstalling collected packages: python-levenshtein\nSuccessfully installed python-levenshtein-0.12.0\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "# !pip install fuzzywuzzy -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 59 | "# !pip install jellyfish -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 60 | "# !pip install pyemd -i https://pypi.tuna.tsinghua.edu.cn/simple\n", 61 | "# !pip install python-levenshtein -i https://pypi.tuna.tsinghua.edu.cn/simple" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 1, 67 | "metadata": { 68 | "cell_type": "code", 69 | "id": "A5B4D140EABC45748C3C9FDBD11352E9", 70 | "collapsed": false, 71 | "scrolled": false 72 | }, 73 | "outputs": [ 74 | { 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "CPU times: user 1.3 s, sys: 596 ms, total: 1.9 s\nWall time: 1.45 s\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%%time\r\n", 84 | "\r\n", 85 | "import os\r\n", 86 | "import sys\r\n", 87 | "import math\r\n", 88 | "import pickle\r\n", 89 | "import collections\r\n", 90 | "import gc\r\n", 91 | "import joblib\r\n", 92 | "import gzip\r\n", 93 | "import xgboost as xgb\r\n", 94 | "import seaborn as sns\r\n", 95 | "import matplotlib.pyplot as plt\r\n", 96 | "import pandas as pd\r\n", 97 | "import numpy as np\r\n", 98 | "import lightgbm as lgb\r\n", 99 | "from tqdm import tqdm\r\n", 100 | "# from fuzzywuzzy import fuzz\r\n", 101 | "from datetime import datetime\r\n", 102 | "from multiprocessing import Pool\r\n", 103 | "# from jellyfish import jaro_distance, jaro_winkler\r\n", 104 | "from scipy.spatial.distance import cosine, euclidean, cityblock\r\n", 105 | "\r\n", 106 | "import gensim\r\n", 107 | "from gensim.corpora import Dictionary\r\n", 108 | "from gensim.models import TfidfModel, FastText, KeyedVectors\r\n", 109 | "from gensim.models.word2vec import Word2Vec, PathLineSentences, LineSentence\r\n", 110 | "\r\n", 111 | "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\r\n", 112 | "from sklearn.metrics import roc_auc_score, precision_score\r\n", 113 | "from sklearn.ensemble import RandomForestClassifier\r\n", 114 | "from sklearn.externals import joblib\r\n", 115 | "\r\n", 116 | "import networkx as nx\r\n", 117 | "from networkx.readwrite.gpickle import write_gpickle, read_gpickle\r\n", 118 | "\r\n", 119 | "import warnings\r\n", 120 | "warnings.filterwarnings('ignore', category=Warning)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "CB3D344A4EC943C0A0C15B9A2A12B5D2", 127 | "mdEditEnable": false 128 | }, 129 | "source": [ 130 | "# 常量初始化" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 2, 136 | "metadata": { 137 | "id": "3F0E11F980C4407BBE75220A36839EB2", 138 | "collapsed": false, 139 | "scrolled": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\nWall time: 5.72 µs\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "%%time\r\n", 152 | "test_data_path = '/home/kesci/input/bytedance/test_final_part1.csv'\r\n", 153 | "train_data_path = './split/10kw.csv'\r\n", 154 | "final_data_path = '/home/kesci/input/bytedance/bytedance_contest.final_2.csv'" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": 
"A79292BD4C6743DB87694C0BF6317C01", 161 | "mdEditEnable": false 162 | }, 163 | "source": [ 164 | "# 数据预处理" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "id": "2AC4F400337042CD9995DFAA8F54ED87", 171 | "mdEditEnable": false 172 | }, 173 | "source": [ 174 | "## 获取query和title的set集合——train" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 16, 180 | "metadata": { 181 | "id": "0F3B4DE0DBCE4646A79321325D56F5B3", 182 | "collapsed": false 183 | }, 184 | "outputs": [ 185 | { 186 | "name": "stdout", 187 | "output_type": "stream", 188 | "text": [ 189 | "CPU times: user 0 ns, sys: 0 ns, total: 0 ns\nWall time: 10.3 µs\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "%%time\r\n", 195 | "def get_unique(filepath):\r\n", 196 | " print('-----------------reading {}-----------------'.format(filepath))\r\n", 197 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 198 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 199 | " sentences = set()\r\n", 200 | " query = set()\r\n", 201 | " title = set()\r\n", 202 | " print('------------------parse data------------------')\r\n", 203 | " i = 0\r\n", 204 | " for chunk in data:\r\n", 205 | " if i%10 == 0: print(i)\r\n", 206 | " i+=1\r\n", 207 | " query |= set(chunk[1].unique().tolist())\r\n", 208 | " title |= set(chunk[3].unique().tolist())\r\n", 209 | " sentences = query | title\r\n", 210 | " \r\n", 211 | " print('unique query size', query.__len__())\r\n", 212 | " print('unique title size', title.__len__())\r\n", 213 | " print('unique sentences size', sentences.__len__())\r\n", 214 | " # print('---------------Writing to csv file---------------')\r\n", 215 | "\r\n", 216 | " # with open(handled_path + 'sentences.csv', 'w', encoding='utf-8') as f:\r\n", 217 | " # for item in sentences:\r\n", 218 | " # f.write(item)\r\n", 219 | " # f.write('\\n')\r\n", 220 | " return query, title, sentences\r\n", 221 | "test_query, test_title, test_sentence = get_unique(test_data_path)\r\n", 222 | "train_query, train_title, train_sentence = get_unique(train_data_path)\r\n", 223 | "final_query, final_title, final_sentence = get_unique(final_data_path)\r\n", 224 | "\r\n", 225 | "sentence = test_sentence | train_sentence | final_sentence\r\n", 226 | "print(sentence.__len__())\r\n", 227 | "with open('./handled_data/all_sentences.csv', 'w', encoding='utf-8') as f:\r\n", 228 | " for item in sentence:\r\n", 229 | " f.write(item)\r\n", 230 | " f.write('\\n')" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": { 236 | "id": "F4AA3A285E81414588A75871F76BADC1", 237 | "mdEditEnable": false 238 | }, 239 | "source": [ 240 | "## wordcount" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "id": "702619C5323F48EB86F7B69F60054BB0" 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "word_counter = collections.Counter()\r\n", 252 | "\r\n", 253 | "def word_count(filepath):\r\n", 254 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 255 | " for chunk in tqdm(data):\r\n", 256 | " for item in chunk[1].unique():\r\n", 257 | " for word in item.split():\r\n", 258 | " word_counter[word] += 1\r\n", 259 | " for item in chunk[3].tolist():\r\n", 260 | " for word in item.split():\r\n", 261 | " word_counter[word] += 1\r\n", 262 | " \r\n", 263 | "def get_weight(count, eps=10000, min_count=2):\r\n", 264 | " return 0 if count < min_count else 1/(count + eps)\r\n", 265 | " \r\n", 266 | 
"word_count(test_data_path)\r\n", 267 | "word_count(train_data_path)\r\n", 268 | "word_count(final_data_path)\r\n", 269 | "\r\n", 270 | "weights = {word : get_weight(count) for word, count in word_counter.items()}" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": { 277 | "id": "816FB6F2013E4F7187CD3AC4DCECFC2A" 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "def word_shares(row):\r\n", 282 | " q1_list = str(row['question1']).lower().split()\r\n", 283 | " q1 = set(q1_list)\r\n", 284 | " q1words = q1.difference(stops)\r\n", 285 | " if len(q1words) == 0:\r\n", 286 | " return '0:0:0:0:0:0:0:0'\r\n", 287 | "\r\n", 288 | " q2_list = str(row['question2']).lower().split()\r\n", 289 | " q2 = set(q2_list)\r\n", 290 | " q2words = q2.difference(stops)\r\n", 291 | " if len(q2words) == 0:\r\n", 292 | " return '0:0:0:0:0:0:0:0'\r\n", 293 | "\r\n", 294 | " words_hamming = sum(1 for i in zip(q1_list, q2_list) if i[0]==i[1])/max(len(q1_list), len(q2_list))\r\n", 295 | "\r\n", 296 | " q1stops = q1.intersection(stops)\r\n", 297 | " q2stops = q2.intersection(stops)\r\n", 298 | "\r\n", 299 | " q1_2gram = set([i for i in zip(q1_list, q1_list[1:])])\r\n", 300 | " q2_2gram = set([i for i in zip(q2_list, q2_list[1:])])\r\n", 301 | "\r\n", 302 | " shared_2gram = q1_2gram.intersection(q2_2gram)\r\n", 303 | "\r\n", 304 | " shared_words = q1words.intersection(q2words)\r\n", 305 | " shared_weights = [weights.get(w, 0) for w in shared_words]\r\n", 306 | " q1_weights = [weights.get(w, 0) for w in q1words]\r\n", 307 | " q2_weights = [weights.get(w, 0) for w in q2words]\r\n", 308 | " total_weights = q1_weights + q1_weights\r\n", 309 | "\r\n", 310 | " R1 = np.sum(shared_weights) / np.sum(total_weights) #tfidf share\r\n", 311 | " R2 = len(shared_words) / (len(q1words) + len(q2words) - len(shared_words)) #count share\r\n", 312 | " R31 = len(q1stops) / len(q1words) #stops in q1\r\n", 313 | " R32 = len(q2stops) / len(q2words) #stops in q2\r\n", 314 | " Rcosine_denominator = (np.sqrt(np.dot(q1_weights,q1_weights))*np.sqrt(np.dot(q2_weights,q2_weights)))\r\n", 315 | " Rcosine = np.dot(shared_weights, shared_weights)/Rcosine_denominator\r\n", 316 | " if len(q1_2gram) + len(q2_2gram) == 0:\r\n", 317 | " R2gram = 0\r\n", 318 | " else:\r\n", 319 | " R2gram = len(shared_2gram) / (len(q1_2gram) + len(q2_2gram))\r\n", 320 | " return '{}:{}:{}:{}:{}:{}:{}:{}'.format(R1, R2, len(shared_words), R31, R32, R2gram, Rcosine, words_hamming)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "metadata": { 326 | "id": "58E84E29DB9A4779B04D1D0608625D1C", 327 | "mdEditEnable": false 328 | }, 329 | "source": [ 330 | "# 生成无向图" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "id": "48483DE7E283464194017A7FC7AA040D", 337 | "mdEditEnable": false 338 | }, 339 | "source": [ 340 | "## 准备数据——无向图生成" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 1, 346 | "metadata": { 347 | "id": "EFC4238523B94860B94B2C0B1E78E2A4", 348 | "collapsed": false, 349 | "scrolled": false 350 | }, 351 | "outputs": [ 352 | { 353 | "name": "stdout", 354 | "output_type": "stream", 355 | "text": [ 356 | "start reading seq2id\nend reading seq2id\nseq2id with length ---> 87907776\nCPU times: user 1min 31s, sys: 14.5 s, total: 1min 45s\nWall time: 2min 4s\n" 357 | ] 358 | } 359 | ], 360 | "source": [ 361 | "%%time\n", 362 | "print('start reading seq2id')\n", 363 | "seq2id = {}\n", 364 | "with open('./handled_data/all_sentences.csv', 'r', encoding='utf-8') as 
f:\n", 365 | " for i, sentence in enumerate(f.readlines()):\n", 366 | " seq2id[sentence.strip()] = i\n", 367 | "print('end reading seq2id')\n", 368 | "print('seq2id with length --->',len(seq2id))" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": { 374 | "id": "7377F073719A4D9E85A8E2AC1BF9A80B", 375 | "mdEditEnable": false 376 | }, 377 | "source": [ 378 | "## 文本转ID" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 26, 384 | "metadata": { 385 | "id": "CE5D7FCDC3474DD5B96D9BE64D271CD3", 386 | "collapsed": false, 387 | "scrolled": false 388 | }, 389 | "outputs": [ 390 | { 391 | "name": "stdout", 392 | "output_type": "stream", 393 | "text": [ 394 | "2019-08-04 18:25:18.316136\nstart transform data\n0\n10\n 0 1\ncount 2.000000e+07 2.000000e+07\nmean 5.122025e+07 6.027587e+07\nstd 2.944711e+07 2.793264e+07\nmin 1.060000e+02 0.000000e+00\n25% 2.575488e+07 3.862078e+07\n50% 5.130830e+07 6.404053e+07\n75% 7.676274e+07 8.443098e+07\nmax 1.019757e+08 1.019757e+08\nCPU times: user 1min 18s, sys: 1.86 s, total: 1min 20s\nWall time: 1min 28s\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "%%time\n", 400 | "print(datetime.now())\n", 401 | "def transform(filepath):\n", 402 | " data = pd.read_csv(filepath, chunksize=1000000, header=None)\n", 403 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 404 | " result = []\n", 405 | " print('start transform data')\n", 406 | " idx = 0\n", 407 | " for chunk in data:\n", 408 | " if idx%10==0: print(idx)\n", 409 | " idx += 1\n", 410 | " result.append(pd.DataFrame({0:chunk[1].apply(lambda x : seq2id[x.strip()]), \n", 411 | " 1:chunk[3].apply(lambda x : seq2id[x.strip()])}))\n", 412 | " return result\n", 413 | "# id = pd.concat(transform(test_data_path))\n", 414 | "print(id.describe())\n", 415 | "id.to_csv('./handled_data/id_test.csv', header=None, index=False)\n", 416 | "!wc -l ./handled_data/id_test.csv" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": { 422 | "id": "AA79B229CB7A4F4E9E91B07BFE51D406", 423 | "mdEditEnable": false 424 | }, 425 | "source": [ 426 | "## 生成无向图" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 3, 432 | "metadata": { 433 | "id": "5A8E5EEB63F44D558A48483A78E43597", 434 | "collapsed": false, 435 | "scrolled": true 436 | }, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "2019-08-04 20:09:48.426697\ngenerate train graph\n2019-08-04 20:10:11.280335 ---> 0\n2019-08-04 20:10:16.641768 ---> 1\n2019-08-04 20:10:21.355543 ---> 2\n2019-08-04 20:10:27.104319 ---> 3\n2019-08-04 20:10:31.788189 ---> 4\n2019-08-04 20:10:36.931844 ---> 5\n2019-08-04 20:10:40.341101 ---> 6\n2019-08-04 20:10:46.926877 ---> 7\n2019-08-04 20:10:53.250733 ---> 8\n2019-08-04 20:10:56.573122 ---> 9\n2019-08-04 20:10:59.920231 ---> 10\n2019-08-04 20:11:07.224468 ---> 11\n2019-08-04 20:11:10.635839 ---> 12\n2019-08-04 20:11:14.068377 ---> 13\n2019-08-04 20:11:24.505138 ---> 14\n2019-08-04 20:11:27.838067 ---> 15\n2019-08-04 20:11:31.123212 ---> 16\n2019-08-04 20:11:34.418728 ---> 17\n2019-08-04 20:11:44.396497 ---> 18\n2019-08-04 20:11:47.722042 ---> 19\n2019-08-04 20:11:51.068181 ---> 20\n2019-08-04 20:11:54.431964 ---> 21\n2019-08-04 20:11:57.809854 ---> 22\n2019-08-04 20:12:10.010839 ---> 23\n2019-08-04 20:12:13.399495 ---> 24\n2019-08-04 20:12:16.806213 ---> 25\n2019-08-04 20:12:20.229955 ---> 26\n2019-08-04 20:12:23.646422 ---> 27\n2019-08-04 20:12:27.061730 ---> 28\n2019-08-04 
20:12:30.500476 ---> 29\nfinish generate train graph, temporaray graph size -> 21236030 29997376\ngenerate final data graph\n[... per-chunk timestamp progress lines (chunks 0-94) elided ...]\n2019-08-04 
20:20:20.098426 ---> 95\n2019-08-04 20:20:23.469628 ---> 96\n2019-08-04 20:20:26.833314 ---> 97\n2019-08-04 20:20:30.207836 ---> 98\n2019-08-04 20:20:33.582591 ---> 99\nfinish generate final graph, temporaray graph size -> 59028158 129984026\nsave graph\nCPU times: user 13min 57s, sys: 51.8 s, total: 14min 49s\nWall time: 15min 5s\n" 443 | ] 444 | } 445 | ], 446 | "source": [ 447 | "%%time\n", 448 | "print(datetime.now())\n", 449 | "G = nx.Graph()\n", 450 | " \n", 451 | "def make_graph_parallel():\n", 452 | " CHUNK_SIZE = 1000000\n", 453 | " # test_data = pd.read_csv('./handled_data/id_test.csv', chunksize=CHUNK_SIZE, header=None)\n", 454 | " # test_data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 455 | " \n", 456 | " # print('generate test graph')\n", 457 | " # idx = 0\n", 458 | " # print(datetime.now())\n", 459 | " # for chunk in test_data:\n", 460 | " # print(datetime.now(),'--->',idx)\n", 461 | " # idx += 1 \n", 462 | " # ziped = list(zip(chunk[0], chunk[1]))\n", 463 | " # G.add_edges_from(ziped)\n", 464 | " \n", 465 | " # del chunk\n", 466 | " # del ziped\n", 467 | " # print('finish generate test graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 468 | " # del test_data\n", 469 | " # gc.collect()\n", 470 | " \n", 471 | " data = pd.read_csv('./handled_data/id_train.csv', chunksize=CHUNK_SIZE, header=None)\n", 472 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 473 | "\n", 474 | " print('generate train graph')\n", 475 | " idx = 0\n", 476 | " # for chunk in data:\n", 477 | " for i in range(100):\n", 478 | " chunk = data.get_chunk()\n", 479 | " if i < 70: continue\n", 480 | " print(datetime.now(), '--->', idx)\n", 481 | " idx += 1\n", 482 | " ziped = list(zip(chunk[0], chunk[1]))\n", 483 | " G.add_edges_from(ziped)\n", 484 | " \n", 485 | " del chunk\n", 486 | " del ziped\n", 487 | " del data\n", 488 | " gc.collect()\n", 489 | " print('finish generate train graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 490 | " \n", 491 | " data = pd.read_csv('./handled_data/id_final.csv', chunksize=CHUNK_SIZE, header=None)\n", 492 | " data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\n", 493 | " print('generate final data graph')\n", 494 | " idx = 0\n", 495 | " for chunk in data:\n", 496 | " print(datetime.now(), '--->', idx)\n", 497 | " idx += 1\n", 498 | " ziped = list(zip(chunk[0], chunk[1]))\n", 499 | " G.add_edges_from(ziped)\n", 500 | " \n", 501 | " del chunk\n", 502 | " del ziped\n", 503 | " print('finish generate final graph, temporaray graph size ->', G.number_of_nodes(), G.number_of_edges())\n", 504 | " \n", 505 | " del data\n", 506 | " gc.collect()\n", 507 | " print('save graph')\n", 508 | " write_gpickle(G, './handled_data/final_graph.pkl')\n", 509 | " \n", 510 | "make_graph_parallel()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "metadata": { 516 | "id": "8D8EA63E6CA2463C9485424F7D635E46", 517 | "mdEditEnable": false 518 | }, 519 | "source": [ 520 | "## 生成pagerank模型" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": 5, 526 | "metadata": { 527 | "id": "07291A2657994E6095D81705A142B993", 528 | "collapsed": false, 529 | "scrolled": false 530 | }, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "2019-08-04 20:26:20.708101\nend generate pagerank value\nCPU times: user 21min, sys: 51.1 s, total: 21min 52s\nWall time: 21min 51s\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 
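make_graph_parallel above streams the id files in 1M-row chunks and adds each (query_id, title_id) pair as an edge of one undirected graph. A minimal sketch of the same construction:

import pandas as pd
import networkx as nx

def build_pair_graph(paths, chunksize=1_000_000):
    """Build one undirected graph whose nodes are sentence ids and whose
    edges are the (query_id, title_id) pairs found in the id files."""
    G = nx.Graph()
    for path in paths:
        reader = pd.read_csv(path, header=None, chunksize=chunksize)
        for chunk in reader:
            G.add_edges_from(zip(chunk[0], chunk[1]))
    return G

# Hypothetical usage mirroring the cell above:
# G = build_pair_graph(['./handled_data/id_train.csv', './handled_data/id_final.csv'])
# nx.write_gpickle(G, './handled_data/final_graph.pkl')   # networkx < 3.0, as in the notebook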
541 | "%%time\r\n", 542 | "print(datetime.now())\r\n", 543 | "graph = read_gpickle('./handled_data/final_graph.pkl')\r\n", 544 | "print(datetime.now(),'---> end load graph')\r\n", 545 | "page_rank = nx.pagerank_scipy(graph)\r\n", 546 | "print('end generate pagerank value')\r\n", 547 | "pickle.dump(page_rank, open('./handled_data/pagerank_final.model', 'wb'))" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": { 553 | "id": "66D2DD90D6744BDF9ADD4D6E6588003C", 554 | "mdEditEnable": false 555 | }, 556 | "source": [ 557 | "## 生成HITS模型" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 1, 563 | "metadata": { 564 | "id": "A671B600F9154D908E108D9B14317892", 565 | "collapsed": false 566 | }, 567 | "outputs": [ 568 | { 569 | "name": "stdout", 570 | "output_type": "stream", 571 | "text": [ 572 | "2019-07-27 03:39:19.758518\nend load graph\n" 573 | ] 574 | } 575 | ], 576 | "source": [ 577 | "%%time\r\n", 578 | "print(datetime.datetime.now())\r\n", 579 | "graph = joblib.load('./models/un_direct_graph.model')\r\n", 580 | "print('end load graph')\r\n", 581 | "hits_h, hits_a = nx.hits(graph)\r\n", 582 | "print('end generate HITS value')\r\n", 583 | "joblib.dump(hits_h, './models/hub_value.model')\r\n", 584 | "joblib.dump(hits_a, './models/authority_value.model')" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "metadata": { 590 | "id": "0B5EB347F584409BB654A641F66B1B2A", 591 | "mdEditEnable": false 592 | }, 593 | "source": [ 594 | "## 数据准备——特征生成" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 14, 600 | "metadata": { 601 | "id": "D9427953B24748D5892B3709E233DC89", 602 | "collapsed": false, 603 | "scrolled": false 604 | }, 605 | "outputs": [ 606 | { 607 | "name": "stdout", 608 | "output_type": "stream", 609 | "text": [ 610 | "CPU times: user 52.5 s, sys: 12.3 s, total: 1min 4s\nWall time: 1min 5s\n" 611 | ] 612 | } 613 | ], 614 | "source": [ 615 | "%%time\r\n", 616 | "# graph = read_gpickle('./handled_data/final/final_graph.pkl')\r\n", 617 | "# print('end load graph')\r\n", 618 | "# seq2id = {}\r\n", 619 | "# with open('./handled_data/all_sentences_verseion_2.csv', 'r', encoding='utf-8') as f:\r\n", 620 | "# for i, sentence in enumerate(f.readlines()):\r\n", 621 | "# seq2id[sentence.strip()] = i\r\n", 622 | "# print('end seq2id')\r\n", 623 | "def gen_degrees():\r\n", 624 | " max_degrees = {}\r\n", 625 | " edges = graph.edges()\r\n", 626 | " for edge in edges:\r\n", 627 | " for n in edge:\r\n", 628 | " max_degrees[n] = max_degrees.get(n, 0) + 1\r\n", 629 | " return max_degrees\r\n", 630 | "\r\n", 631 | "def gen_components():\r\n", 632 | " max_components = {}\r\n", 633 | " components = nx.connected_components(graph)\r\n", 634 | " for component in components:\r\n", 635 | " for n in component:\r\n", 636 | " max_components[n] = max(max_components.get(n, 0), len(component))\r\n", 637 | " return max_components\r\n", 638 | "\r\n", 639 | "def gen_hits():\r\n", 640 | " hits_h, hits_a = nx.hits(graph, max_iter=500)\r\n", 641 | " return hits_h, hits_a\r\n", 642 | "\r\n", 643 | "# max_degrees = gen_degrees()\r\n", 644 | "# pickle.dump(max_degrees, open('./handled_data/max_degree_final', 'wb'))\r\n", 645 | "# print('end degree')\r\n", 646 | "# max_components = gen_components()\r\n", 647 | "# pickle.dump(max_components, open('./handled_data/max_components_final', 'wb'))\r\n", 648 | "# print('end components')\r\n", 649 | "# del graph\r\n", 650 | "# gc.collect()\r\n", 651 | "\r\n", 652 | "max_degrees = 
pickle.load(open('./handled_data/final/max_degree_final', 'rb'))\r\n", 653 | "max_components = pickle.load(open('./handled_data/final/max_components_final', 'rb'))\r\n", 654 | "page_rank = pickle.load(open('./handled_data/final/pagerank_final.model', 'rb'))\r\n", 655 | "# print('end pagerank')\r\n", 656 | "# hits_h, hits_a = gen_hits()\r\n", 657 | "\r\n", 658 | "# max_degree max_components\r\n", 659 | "def calculate_statistics(row):\r\n", 660 | " return [max_degrees[row[0]], max_degrees[row[1]], max_components[row[0]]] \r\n", 661 | " \r\n", 662 | "# PageRank \r\n", 663 | "def calculate_pagerank(row):\r\n", 664 | " return [page_rank[row[0]] * 1e6, page_rank[row[1]] * 1e6] \r\n", 665 | "\r\n", 666 | "# Hits\r\n", 667 | "def calculate_hits(row):\r\n", 668 | " hits_h_1 = hits_h[row[0]] * 1e6\r\n", 669 | " hits_a_1 = hits_a[row[0]] * 1e6\r\n", 670 | " hits_h_2 = hits_h[row[1]] * 1e6\r\n", 671 | " hits_a_2 = hits_a[row[1]] * 1e6\r\n", 672 | " return [hits_h_1, hits_a_1, hits_h_2, hits_a_2]\r\n", 673 | "\r\n", 674 | "# ShortestPath\r\n", 675 | "def calculate_shortestpath(row):\r\n", 676 | " graph.remove_edge(row[0], row[1])\r\n", 677 | " if nx.has_path(graph, row[0], row[1]):\r\n", 678 | " shortest_path = len(nx.shortest_path(graph, row[0], row[1]))\r\n", 679 | " else:\r\n", 680 | " shortest_path = -1\r\n", 681 | " graph.add_edge(row[0], row[1])\r\n", 682 | " return [shortest_path]\r\n", 683 | "\r\n", 684 | "# Neighbour\r\n", 685 | "def calculate_neighbour(row):\r\n", 686 | " neighbor_1 = set(graph.neighbors(row[0]))\r\n", 687 | " neighbor_2 = set(graph.neighbors(row[1]))\r\n", 688 | " return [len(neighbor_1), len(neighbor_2), len(neighbor_1 | neighbor_2)]" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": { 694 | "id": "049F004A503C405A85B47CBD208D9567", 695 | "mdEditEnable": false 696 | }, 697 | "source": [ 698 | "## 生成图特征" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 17, 704 | "metadata": { 705 | "id": "D36B363E31CD476C870BEDF3958CCD4A", 706 | "collapsed": false, 707 | "scrolled": true 708 | }, 709 | "outputs": [ 710 | { 711 | "name": "stdout", 712 | "output_type": "stream", 713 | "text": [ 714 | "start generate feature\n0\n1\n2\n3\n4\n5\n6\n7\n8\n9\n10\n11\n12\n13\n14\n15\n16\n17\n18\n19\n20\n21\n22\n23\n24\n25\n26\n27\n28\n29\n(30000000, 5)\n 0 1 2 0 1\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07\nmean 1.274293e+02 1.106532e+01 5.754290e+07 1.234310e+00 2.669189e-02\nstd 3.819820e+03 3.443744e+02 5.868692e+06 3.824809e+01 3.425827e+00\nmin 1.000000e+00 1.000000e+00 2.000000e+00 2.574202e-03 2.541272e-03\n25% 7.000000e+00 1.000000e+00 5.814144e+07 4.419803e-02 3.981151e-03\n50% 1.600000e+01 3.000000e+00 5.814144e+07 8.919203e-02 6.508193e-03\n75% 2.000000e+01 7.000000e+00 5.814144e+07 1.496774e-01 1.367823e-02\nmax 1.289980e+05 1.289980e+05 5.814144e+07 1.291623e+03 1.291623e+03\nCPU times: user 6min 29s, sys: 21.1 s, total: 6min 50s\nWall time: 7min 1s\n" 715 | ] 716 | }, 717 | { 718 | "data": { 719 | "text/plain": [ 720 | "14" 721 | ] 722 | }, 723 | "execution_count": 17, 724 | "metadata": {}, 725 | "output_type": "execute_result" 726 | } 727 | ], 728 | "source": [ 729 | "%%time\r\n", 730 | "gc.collect()\r\n", 731 | "CHUNKSIZE = 1000000\r\n", 732 | "data = pd.read_csv('./handled_data/id/id_train.csv', chunksize=CHUNKSIZE, header=None)\r\n", 733 | "data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 734 | "pool = Pool()\r\n", 735 | "res = []\r\n", 736 | "idx = 0\r\n", 737 | "for i in 
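The calculate_* helpers above turn those node statistics into per-pair features; the shortest-path one temporarily removes the direct query-title edge so the feature measures an indirect connection. A consolidated sketch (the dictionaries are the ones loaded above; the has_edge guard is an addition so the function also works for pairs that are not edges):

import networkx as nx

def pair_graph_features(G, u, v, degrees, comp_size, pagerank):
    """Per-pair features: endpoint degrees, component size, scaled PageRank,
    indirect shortest-path length, and neighbourhood sizes."""
    feats = [degrees.get(u, 0), degrees.get(v, 0), comp_size.get(u, 0),
             pagerank.get(u, 0.0) * 1e6, pagerank.get(v, 0.0) * 1e6]

    had_edge = G.has_edge(u, v)
    if had_edge:                       # drop the direct edge, measure, restore
        G.remove_edge(u, v)
    try:
        # len(shortest_path) counts nodes on the path, as in calculate_shortestpath
        sp = len(nx.shortest_path(G, u, v)) if nx.has_path(G, u, v) else -1
    finally:
        if had_edge:
            G.add_edge(u, v)
    feats.append(sp)

    n_u, n_v = set(G.neighbors(u)), set(G.neighbors(v))
    feats += [len(n_u), len(n_v), len(n_u | n_v)]
    return feats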
range(70):\r\n", 738 | " data.get_chunk()\r\n", 739 | "print('start generate feature')\r\n", 740 | "# for chunk in data:\r\n", 741 | "for i in range(30):\r\n", 742 | " chunk = data.get_chunk()\r\n", 743 | " print(idx)\r\n", 744 | " idx += 1\r\n", 745 | " df = []\r\n", 746 | " ziped = list(zip(chunk[0], chunk[1]))\r\n", 747 | " # ziped = list(zip(chunk[1].apply(lambda x:seq2id[x.strip()]).tolist(), \r\n", 748 | " # chunk[3].apply(lambda x:seq2id[x.strip()]).tolist()))\r\n", 749 | " \r\n", 750 | " df.append(pd.DataFrame(pool.map(calculate_statistics, ziped)))\r\n", 751 | " df.append(pd.DataFrame(pool.map(calculate_pagerank, ziped)))\r\n", 752 | " \r\n", 753 | " # df.append(pd.DataFrame(pool.map(calculate_hits, ziped)))\r\n", 754 | " # df.append(pd.DataFrame(pool.map(calculate_neighbour, ziped)))\r\n", 755 | " \r\n", 756 | " res.append(pd.concat(df, axis=1))\r\n", 757 | " del ziped\r\n", 758 | " del chunk\r\n", 759 | " del df\r\n", 760 | " gc.collect()\r\n", 761 | " \r\n", 762 | "graph_features = pd.concat(res)\r\n", 763 | "graph_features.to_csv('./handled_data/train_feature/train_feature_graph_final.csv', header=None, index=False)\r\n", 764 | "print(graph_features.shape)\r\n", 765 | "print(graph_features.describe())\r\n", 766 | "del graph_features\r\n", 767 | "gc.collect()" 768 | ] 769 | }, 770 | { 771 | "cell_type": "markdown", 772 | "metadata": { 773 | "id": "2BF16F8FC7694CAB9237103C777228F4", 774 | "collapsed": false, 775 | "mdEditEnable": false 776 | }, 777 | "source": [ 778 | "# 一般特征" 779 | ] 780 | }, 781 | { 782 | "cell_type": "markdown", 783 | "metadata": { 784 | "id": "779E3B5AEB4A4C78893F3321105A4A8C", 785 | "mdEditEnable": false 786 | }, 787 | "source": [ 788 | "## 准备数据" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 5, 794 | "metadata": { 795 | "id": "A34605E7A9914661966CE15A3D173E34", 796 | "collapsed": false, 797 | "scrolled": false 798 | }, 799 | "outputs": [ 800 | { 801 | "name": "stdout", 802 | "output_type": "stream", 803 | "text": [ 804 | "loaded dict\nloaded word2vec\nloaded TF-IDF\n----------------initialized models----------------\nCPU times: user 9.07 s, sys: 4.41 s, total: 13.5 s\nWall time: 27.4 s\n" 805 | ] 806 | } 807 | ], 808 | "source": [ 809 | "%%time\r\n", 810 | "dictionary = Dictionary.load('./models/word_dict.dict') # gensim Dictionary\r\n", 811 | "print('loaded dict')\r\n", 812 | "w2v_model = KeyedVectors.load('/home/kesci/work/word_vectors/w2v-300d-new.txt')\r\n", 813 | "print('loaded word2vec')\r\n", 814 | "tfidf_model = TfidfModel.load('./models/tfidf.model')\r\n", 815 | "print('loaded TF-IDF')\r\n", 816 | "fasttext_model = FastText.load('./word_vectors/fast-300d-new.txt')\r\n", 817 | "# tfidf = joblib.load(model_path + 'scikit_tfidf_model.m')\r\n", 818 | "print('----------------initialized models----------------')\r\n", 819 | "\r\n", 820 | "def init_prob():\r\n", 821 | " CHUNK_SIZE = 1000000\r\n", 822 | " reader = pd.read_csv('./split/10kw.csv',\r\n", 823 | " chunksize=CHUNK_SIZE,\r\n", 824 | " header=None,\r\n", 825 | " names=['query_id','query','query_title_id','title','label'])\r\n", 826 | " \r\n", 827 | " totalCounter = collections.Counter()\r\n", 828 | " posCounter = collections.Counter()\r\n", 829 | " \r\n", 830 | " idx = 0\r\n", 831 | " for chunk in reader:\r\n", 832 | " # if idx == 10: break\r\n", 833 | " print(idx)\r\n", 834 | " idx += 1\r\n", 835 | " query = chunk['query'].apply(lambda x : x.split()).values.tolist()\r\n", 836 | " title = chunk['title'].apply(lambda x : x.split()).values.tolist()\r\n", 837 | " label 
= chunk['label'].values.tolist()\r\n", 838 | " for i in range(CHUNK_SIZE):\r\n", 839 | " for word in query[i]:\r\n", 840 | " totalCounter[word] += 1\r\n", 841 | " if label[i] == 1:\r\n", 842 | " posCounter[word] += 1\r\n", 843 | " for word in title[i]:\r\n", 844 | " totalCounter[word] += 1\r\n", 845 | " if label[i] == 1:\r\n", 846 | " posCounter[word] += 1\r\n", 847 | " prob = {}\r\n", 848 | " for key, value in posCounter.items():\r\n", 849 | " prob[key] = value / totalCounter[key]\r\n", 850 | " return prob \r\n", 851 | "# pos_prob = init_prob() # 需要计算出这个单词出现之后label为1的概率\r\n", 852 | "# file = open(handled_path + 'pos_10kw.pkl', 'wb+')\r\n", 853 | "# pickle.dump(pos_prob, file)\r\n", 854 | "\r\n", 855 | "# file = open('./handled_data/pos_10kw.pkl', 'rb+')\r\n", 856 | "# pos_prob = pickle.load(file)\r\n", 857 | "# print('----------------initialized prob----------------')" 858 | ] 859 | }, 860 | { 861 | "cell_type": "markdown", 862 | "metadata": { 863 | "id": "37F857D3CDA0457187A536D291796771", 864 | "mdEditEnable": false 865 | }, 866 | "source": [ 867 | "## powerful word" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 3, 873 | "metadata": { 874 | "id": "9A946F0C3D1F408C851722616112EFFA", 875 | "collapsed": false, 876 | "scrolled": false 877 | }, 878 | "outputs": [ 879 | { 880 | "name": "stdout", 881 | "output_type": "stream", 882 | "text": [ 883 | "CPU times: user 2.1 s, sys: 204 ms, total: 2.3 s\nWall time: 2.3 s\n" 884 | ] 885 | } 886 | ], 887 | "source": [ 888 | "%%time\r\n", 889 | "# words_power = {}\r\n", 890 | "# def parse(row):\r\n", 891 | "# label = int(row[2])\r\n", 892 | "# q1_words = row[0]\r\n", 893 | "# q2_words = row[1]\r\n", 894 | "# q1_words = set(q1_words)\r\n", 895 | "# q2_words = set(q2_words)\r\n", 896 | "# all_words = q1_words |q2_words\r\n", 897 | "# for word in all_words:\r\n", 898 | "# if word not in words_power:\r\n", 899 | "# words_power[word] = [0. 
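init_prob above estimates, for each token, the fraction of its occurrences that come from rows labelled 1 (later multiplied together in sameWord). A compact version; note it iterates the actual chunk rows, whereas the cell's range(CHUNK_SIZE) loop would over-run if the last chunk is shorter.

import collections
import pandas as pd

def positive_word_prob(path, chunksize=1_000_000):
    """P(label == 1 | token occurs), counted over both query and title tokens."""
    total, pos = collections.Counter(), collections.Counter()
    cols = ['query_id', 'query', 'query_title_id', 'title', 'label']
    for chunk in pd.read_csv(path, header=None, names=cols, chunksize=chunksize):
        for q, t, y in zip(chunk['query'], chunk['title'], chunk['label']):
            for w in q.split() + t.split():
                total[w] += 1
                if y == 1:
                    pos[w] += 1
    return {w: pos[w] / c for w, c in total.items()}

# pos_prob = positive_word_prob('./split/10kw.csv')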
for i in range(5)]\r\n", 900 | "# words_power[word][0] += 1.\r\n", 901 | "\r\n", 902 | "# if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)):\r\n", 903 | "# words_power[word][1] += 1.\r\n", 904 | "# if 1 == label:\r\n", 905 | "# words_power[word][2] += 1.\r\n", 906 | "# if (word in q1_words) and (word in q2_words):\r\n", 907 | "# words_power[word][3] += 1.\r\n", 908 | "# if 1 == label:\r\n", 909 | "# words_power[word][4] += 1.\r\n", 910 | "# def generate_powerful_word(filepath):\r\n", 911 | "# data = pd.read_csv(filepath, chunksize=1000000, header=None)\r\n", 912 | "\r\n", 913 | "# for chunk in tqdm(data):\r\n", 914 | "# ziped = list(zip(chunk[1].apply(lambda x:x.split()).tolist(),\r\n", 915 | "# chunk[3].apply(lambda x:x.split()).tolist(),\r\n", 916 | "# chunk[4]))\r\n", 917 | "# for row in ziped:\r\n", 918 | "# parse(row)\r\n", 919 | "# generate_powerful_word('/home/kesci/input/bytedance/train_final.csv')\r\n", 920 | "# for word in tqdm(words_power):\r\n", 921 | "# if words_power[word][1] > 1e-6:\r\n", 922 | "# words_power[word][2] /= words_power[word][1]\r\n", 923 | "# words_power[word][1] /= words_power[word][0]\r\n", 924 | "# if words_power[word][3] > 1e-6:\r\n", 925 | "# words_power[word][4] /= words_power[word][3]\r\n", 926 | "# words_power[word][3] /= words_power[word][0]\r\n", 927 | "# pickle.dump(words_power, open('./handled_data/powerful_words', 'wb'))\r\n", 928 | "\r\n", 929 | "\r\n", 930 | "words_power = pickle.load(open('./handled_data/powerful_words', 'rb'))" 931 | ] 932 | }, 933 | { 934 | "cell_type": "markdown", 935 | "metadata": { 936 | "id": "7DB87679506D455689343D2DA8A6BD78", 937 | "mdEditEnable": false 938 | }, 939 | "source": [ 940 | "## 并行统计特征" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 17, 946 | "metadata": { 947 | "id": "9984EF24EE164E1E9DBAC075C899F2DF", 948 | "collapsed": false, 949 | "scrolled": false 950 | }, 951 | "outputs": [], 952 | "source": [ 953 | "# 获取 sentence 的tfidf值\r\n", 954 | "def getTFIDF(words):\r\n", 955 | " tfidf = tfidf_model[dictionary.doc2bow(words)]\r\n", 956 | " dic = {word:0 for word in words}\r\n", 957 | " sum_weight = 1e-9\r\n", 958 | " for idx, val in tfidf:\r\n", 959 | " dic[dictionary[idx]] = val\r\n", 960 | " sum_weight += val\r\n", 961 | " \r\n", 962 | " return dic, sum_weight\r\n", 963 | "\r\n", 964 | "# query与title都出现过的word的词数除以总词数\r\n", 965 | "def concurrence(data):\r\n", 966 | " query = data[0]\r\n", 967 | " title = data[1]\r\n", 968 | " query_words = {}\r\n", 969 | " title_words = {}\r\n", 970 | " for word in query:\r\n", 971 | " query_words[word] = query_words.get(word, 0) + 1\r\n", 972 | " for word in title:\r\n", 973 | " title_words[word] = title_words.get(word, 0) + 1\r\n", 974 | "\r\n", 975 | " shared_query_word = sum([query_words[w] for w in query if w in title])\r\n", 976 | " shared_titel_word = sum([title_words[w] for w in title if w in query])\r\n", 977 | " total = sum(query_words.values()) + sum(title_words.values())\r\n", 978 | "\r\n", 979 | " if 1e-6 > total:\r\n", 980 | " return [0]\r\n", 981 | " else:\r\n", 982 | " return [1.0 * (shared_titel_word + shared_query_word) / total]\r\n", 983 | "\r\n", 984 | "# 编辑距离,词粒度\r\n", 985 | "def levenshteinDistance(data):\r\n", 986 | " query = data[0]\r\n", 987 | " title = data[1]\r\n", 988 | " \r\n", 989 | " len_query = len(query) + 1\r\n", 990 | " len_title = len(title) + 1\r\n", 991 | " dp = [[0] * len_title] * len_query\r\n", 992 | "\r\n", 993 | " for i in range(1, len_query):\r\n", 994 | " 
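The levenshteinDistance helper in this cell builds its table with dp = [[0] * len_title] * len_query, which makes every row the same list object, and it never fills the first row and column, so the returned value is not a true edit distance. A corrected word-level version that keeps the 1 - distance / longer-length normalisation:

def levenshtein_similarity(query_tokens, title_tokens):
    """Word-level edit distance, returned as 1 - dist / max_len."""
    n, m = len(query_tokens), len(title_tokens)
    if max(n, m) == 0:
        return 1.0
    # Each row must be its own list; [[0] * (m + 1)] * (n + 1) would alias rows.
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        dp[i][0] = i
    for j in range(m + 1):
        dp[0][j] = j
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = 0 if query_tokens[i - 1] == title_tokens[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,         # deletion
                           dp[i][j - 1] + 1,         # insertion
                           dp[i - 1][j - 1] + cost)  # substitution
    return 1 - dp[n][m] / max(n, m)

# levenshtein_similarity("a b c".split(), "a x c".split())  # -> 1 - 1/3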
for j in range(1, len_title):\r\n", 995 | " deletion = dp[i-1][j] + 1\r\n", 996 | " insertion = dp[i][j-1] + 1\r\n", 997 | " substitution = dp[i-1][j-1]\r\n", 998 | " if query[i-1] != title[j-1]:\r\n", 999 | " substitution += 1\r\n", 1000 | " dp[i][j] = min(deletion, insertion, substitution)\r\n", 1001 | " return [1 - dp[-1][-1] / max(len_title, len_query)]\r\n", 1002 | "\r\n", 1003 | "# sorensen距离\r\n", 1004 | "def sorensenDistance(data):\r\n", 1005 | " query = data[0]\r\n", 1006 | " title = data[1]\r\n", 1007 | " words = set(query) | set(title)\r\n", 1008 | "\r\n", 1009 | " query_dict = {}\r\n", 1010 | " for word in query:\r\n", 1011 | " query_dict[word] = query_dict.get(word, 0) + 1\r\n", 1012 | "\r\n", 1013 | " title_dict = {}\r\n", 1014 | " for word in title:\r\n", 1015 | " title_dict[word] = title_dict.get(word, 0) + 1\r\n", 1016 | "\r\n", 1017 | " total = sum(query_dict.values()) + sum(title_dict.values())\r\n", 1018 | "\r\n", 1019 | " diff = 0\r\n", 1020 | " for word in words:\r\n", 1021 | " diff += abs(query_dict.get(word, 0) - title_dict.get(word, 0))\r\n", 1022 | "\r\n", 1023 | " return [diff / total]\r\n", 1024 | " \r\n", 1025 | "# query和title中的共现词 \r\n", 1026 | "# 计算某个词出现的时候label为1的概率,然后求所有词的概率乘积\r\n", 1027 | "def sameWord(data):\r\n", 1028 | " unique_words = set(data[0]) | set(data[1])\r\n", 1029 | " prob = 1\r\n", 1030 | " for word in unique_words:\r\n", 1031 | " prob *= pos_prob.get(word, 1)\r\n", 1032 | " return [len(unique_words), prob]\r\n", 1033 | "\r\n", 1034 | "# Dice Ochi\r\n", 1035 | "# 1 - 交集除以并集jaccard\r\n", 1036 | "def distance(data):\r\n", 1037 | " query = data[0]\r\n", 1038 | " title = data[1]\r\n", 1039 | " intersection = len([x for x in query if x in title])\r\n", 1040 | " l = len(query) + len(title)\r\n", 1041 | " return [2*intersection / l, intersection / np.sqrt(l),\r\n", 1042 | " 1- (intersection / (l - intersection))]\r\n", 1043 | " \r\n", 1044 | "def fuzzyDistance(data):\r\n", 1045 | " return [\r\n", 1046 | " fuzz.ratio(data[0], data[1]),\r\n", 1047 | " fuzz.partial_ratio(data[0], data[1]),\r\n", 1048 | " fuzz.token_sort_ratio(data[0], data[1]),\r\n", 1049 | " fuzz.token_set_ratio(data[0], data[1]),\r\n", 1050 | " fuzz.partial_token_sort_ratio(data[0], data[1]),\r\n", 1051 | " w2v_model.wmdistance(data[0], data[1]),\r\n", 1052 | " fasttext_model.wmdistance(data[0], data[1])\r\n", 1053 | " ]\r\n", 1054 | "\r\n", 1055 | "def powerfuleWord(data):\r\n", 1056 | " rate_single = 1.0\r\n", 1057 | " rate_double = 1.0\r\n", 1058 | " query = set(data[0])\r\n", 1059 | " title = set(data[1])\r\n", 1060 | " \r\n", 1061 | " share_words = query.intersection(title)\r\n", 1062 | " all_diff = set(query.difference(title) | title.difference(query))\r\n", 1063 | " for word in share_words:\r\n", 1064 | " if word in words_power:\r\n", 1065 | " rate_double *= (1.0 - words_power[word][4])\r\n", 1066 | " for word in all_diff:\r\n", 1067 | " if word in words_power:\r\n", 1068 | " rate_single *= (1.0 - words_power[word][2])\r\n", 1069 | " \r\n", 1070 | " return [1-rate_single, 1-rate_double]" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": { 1076 | "id": "3199EF41D500495FB28DFC5C113BDE38", 1077 | "mdEditEnable": false 1078 | }, 1079 | "source": [ 1080 | "## 并行词嵌入特征" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "code", 1085 | "execution_count": 18, 1086 | "metadata": { 1087 | "id": "E3A82A356CF04F0986A888EDE46FA47B", 1088 | "collapsed": false, 1089 | "scrolled": false 1090 | }, 1091 | "outputs": [], 1092 | "source": [ 1093 | "# 词向量距离\r\n", 1094 | 
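For reference, textbook set-based forms of the three coefficients the distance helper above approximates: the cell counts the intersection with query-side multiplicity and normalises its "Ochiai" term by sqrt(len(query) + len(title)) rather than the usual sqrt(len(query) * len(title)).

import math

def set_overlap_features(query_tokens, title_tokens):
    """Dice coefficient, Ochiai coefficient and Jaccard distance on token sets."""
    q, t = set(query_tokens), set(title_tokens)
    if not q or not t:
        return [0.0, 0.0, 1.0]
    inter = len(q & t)
    dice = 2 * inter / (len(q) + len(t))
    ochiai = inter / math.sqrt(len(q) * len(t))
    jaccard_distance = 1 - inter / len(q | t)
    return [dice, ochiai, jaccard_distance]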
"def word2vecDistance(data):\r\n", 1095 | " query = data[0]\r\n", 1096 | " title = data[1]\r\n", 1097 | " vec_query = np.array([w2v_model[x] if x in w2v_model else [1e-9]*300 for x in query]).mean(axis=0)\r\n", 1098 | " vec_title = np.array([w2v_model[x] if x in w2v_model else [1e-9]*300 for x in title]).mean(axis=0)\r\n", 1099 | "\r\n", 1100 | " cos_dis = 1 - cosine(vec_title, vec_query)\r\n", 1101 | " euclidean_dis = 1 - euclidean(vec_title, vec_query)\r\n", 1102 | " manhattan_dis = 1 - cityblock(vec_title, vec_query)\r\n", 1103 | " \r\n", 1104 | " vec_query = np.array([fasttext_model[x] if x in fasttext_model else [1e-9]*300 for x in query]).mean(axis=0)\r\n", 1105 | " vec_title = np.array([fasttext_model[x] if x in fasttext_model else [1e-9]*300 for x in title]).mean(axis=0)\r\n", 1106 | "\r\n", 1107 | " cos_dis_fasttext = 1 - cosine(vec_title, vec_query)\r\n", 1108 | " euclidean_dis_fasttext = 1 - euclidean(vec_title, vec_query)\r\n", 1109 | " manhattan_dis_fasttext = 1 - cityblock(vec_title, vec_query)\r\n", 1110 | "\r\n", 1111 | " return [cos_dis, euclidean_dis, manhattan_dis, \r\n", 1112 | " cos_dis_fasttext, euclidean_dis_fasttext, manhattan_dis_fasttext] \r\n", 1113 | "\r\n", 1114 | "# 词向量距离,考虑TFIDF距离\r\n", 1115 | "# 获取重要程度在前topN的词的向量,求距离\r\n", 1116 | "# 只考虑tfidf权重的值的距离,cos、总和、平均\r\n", 1117 | "def w2vWeightDistance(data):\r\n", 1118 | " topN = 3\r\n", 1119 | " query = data[0]\r\n", 1120 | " title = data[1]\r\n", 1121 | " query_tfidf, query_weight = getTFIDF(query)\r\n", 1122 | " title_tfidf, title_weight = getTFIDF(title)\r\n", 1123 | " \r\n", 1124 | " # word2vec距离,考虑TFIDF\r\n", 1125 | " vec_query = np.sum(np.array([w2v_model[key] * value if key in w2v_model else [1e-9]*300 for key, value in query_tfidf.items()]), axis=0) / query_weight\r\n", 1126 | " vec_title = np.sum(np.array([w2v_model[key] * value if key in w2v_model else [1e-9]*300 for key, value in title_tfidf.items()]), axis=0) / title_weight\r\n", 1127 | "\r\n", 1128 | " cos_dis = 1 - cosine(vec_query, vec_title)\r\n", 1129 | " if np.isnan(cos_dis):\r\n", 1130 | " cos_dis = 1\r\n", 1131 | " euclidean_dis = 1 - euclidean(vec_query, vec_title)\r\n", 1132 | " manhattan_dis = 1- cityblock(vec_query, vec_title)\r\n", 1133 | " \r\n", 1134 | " # fasttext距离,考虑TFIDF\r\n", 1135 | " vec_query = np.sum(np.array([fasttext_model[key] * value if key in fasttext_model else [1e-9]*300 for key, value in query_tfidf.items()]), axis=0) / query_weight\r\n", 1136 | " vec_title = np.sum(np.array([fasttext_model[key] * value if key in fasttext_model else [1e-9]*300 for key, value in title_tfidf.items()]), axis=0) / title_weight\r\n", 1137 | "\r\n", 1138 | " cos_dis_fasttext = 1 - cosine(vec_query, vec_title)\r\n", 1139 | " if np.isnan(cos_dis_fasttext):\r\n", 1140 | " cos_dis_fasttext = 1\r\n", 1141 | " euclidean_dis_fasttext = 1 - euclidean(vec_query, vec_title)\r\n", 1142 | " manhattan_dis_fasttext = 1- cityblock(vec_query, vec_title)\r\n", 1143 | " \r\n", 1144 | " # 获取重要程度在前topN的词\r\n", 1145 | " query_keywords = sorted(zip(query_tfidf.keys(), query_tfidf.values()), key=lambda x : -x[1])[0 : min(topN, len(query_tfidf))]\r\n", 1146 | " title_keywords = sorted(zip(title_tfidf.keys(), title_tfidf.values()), key=lambda x : -x[1])[0: min(topN, len(title_tfidf))]\r\n", 1147 | " \r\n", 1148 | " # 重要词的word2vec距离\r\n", 1149 | " query_vector = np.mean(np.array([w2v_model[word] if word in w2v_model else [1e-9]*300 for word, value in query_keywords]), axis=0)\r\n", 1150 | " title_vector = np.mean(np.array([w2v_model[word] if word in w2v_model else 
[1e-9]*300 for word, value in title_keywords]), axis=0)\r\n", 1151 | "\r\n", 1152 | " cos_dis_keyword = 1 - cosine(query_vector, title_vector)\r\n", 1153 | " \r\n", 1154 | " euclidean_dis_keyword = 1 - euclidean(query_vector, title_vector)\r\n", 1155 | " manhattan_dis_keyword = 1 - cityblock(query_vector, title_vector)\r\n", 1156 | " \r\n", 1157 | " # 重要词的fasttext距离\r\n", 1158 | " query_vector = np.mean(np.array([fasttext_model[word] if word in fasttext_model else [1e-9]*300 for word, value in query_keywords]), axis=0)\r\n", 1159 | " title_vector = np.mean(np.array([fasttext_model[word] if word in fasttext_model else [1e-9]*300 for word, value in title_keywords]), axis=0)\r\n", 1160 | "\r\n", 1161 | " cos_dis_keyword_fasttext = 1 - cosine(query_vector, title_vector)\r\n", 1162 | " euclidean_dis_keyword_fasttext = 1 - euclidean(query_vector, title_vector)\r\n", 1163 | " manhattan_dis_keyword_fasttext = 1 - cityblock(query_vector, title_vector)\r\n", 1164 | " \r\n", 1165 | " # 只考虑tfidf权重的值的距离,cosine、sum、avg\r\n", 1166 | " len1 = query_tfidf.__len__()\r\n", 1167 | " len2 = title_tfidf.__len__()\r\n", 1168 | " \r\n", 1169 | " query_tfidf_value = np.array(list(query_tfidf.values()) + [1e-9]*max(len2-len1, 0))\r\n", 1170 | " title_tfidf_value = np.array(list(title_tfidf.values()) + [1e-9]*max(len1-len2, 0))\r\n", 1171 | " \r\n", 1172 | " tfidf_distance = 1 - cosine(query_tfidf_value, title_tfidf_value)\r\n", 1173 | " if np.isnan(tfidf_distance):\r\n", 1174 | " tfidf_distance = 1\r\n", 1175 | " query_tfidf_sum = np.sum(query_tfidf_value)\r\n", 1176 | " title_tfidf_sum = np.sum(title_tfidf_value)\r\n", 1177 | " query_tfidf_mean = np.mean(query_tfidf_value)\r\n", 1178 | " title_tfidf_mean = np.mean(title_tfidf_value)\r\n", 1179 | "\r\n", 1180 | " return [cos_dis, euclidean_dis, manhattan_dis, \r\n", 1181 | " cos_dis_keyword, euclidean_dis_keyword, manhattan_dis_keyword,\r\n", 1182 | " tfidf_distance, query_tfidf_sum, title_tfidf_sum, \r\n", 1183 | " query_tfidf_mean, title_tfidf_mean, \r\n", 1184 | " cos_dis_fasttext, euclidean_dis_fasttext, manhattan_dis_fasttext,\r\n", 1185 | " cos_dis_keyword_fasttext, euclidean_dis_keyword_fasttext, manhattan_dis_keyword_fasttext]\r\n", 1186 | "\r\n" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": { 1192 | "id": "6A7CD35225FD4AF5AC8C590334845120", 1193 | "mdEditEnable": false 1194 | }, 1195 | "source": [ 1196 | "## 并行生成特征" 1197 | ] 1198 | }, 1199 | { 1200 | "cell_type": "code", 1201 | "execution_count": 16, 1202 | "metadata": { 1203 | "id": "D74C4E247697433E8555DFB34B7CE243", 1204 | "collapsed": false, 1205 | "scrolled": false 1206 | }, 1207 | "outputs": [ 1208 | { 1209 | "name": "stderr", 1210 | "output_type": "stream", 1211 | "text": [ 1212 | "30it [02:33, 5.08s/it]\n" 1213 | ] 1214 | }, 1215 | { 1216 | "name": "stdout", 1217 | "output_type": "stream", 1218 | "text": [ 1219 | " 0 1\ncount 3.000000e+07 3.000000e+07\nmean 6.411826e-01 6.411826e-01\nstd 7.225124e-02 7.225124e-02\nmin 0.000000e+00 0.000000e+00\n25% 5.931235e-01 5.931235e-01\n50% 6.340817e-01 6.340817e-01\n75% 6.841270e-01 6.841270e-01\nmax 1.000000e+00 1.000000e+00\n" 1220 | ] 1221 | } 1222 | ], 1223 | "source": [ 1224 | "%%time\r\n", 1225 | "pool = Pool(8)\r\n", 1226 | "CHUNK_SIZE = 1000000\r\n", 1227 | "data = pd.read_csv('./split/3kw.csv', chunksize=CHUNK_SIZE, header=None)\r\n", 1228 | "data.columns = {'query_id', 'query', 'title_id', 'title', 'label'}\r\n", 1229 | "\r\n", 1230 | "result = []\r\n", 1231 | "i = 0\r\n", 1232 | "for chunk in tqdm(data):\r\n", 1233 
| " i += 1\r\n", 1234 | " # if i>10:break\r\n", 1235 | " \r\n", 1236 | " df = []\r\n", 1237 | " # query = chunk[1].apply(lambda x : x.split()).tolist()\r\n", 1238 | " # title = chunk[3].apply(lambda x : x.split()).tolist()\r\n", 1239 | " \r\n", 1240 | " query = chunk[1].tolist()\r\n", 1241 | " title = chunk[3].tolist()\r\n", 1242 | " d = list(zip(query, title))\r\n", 1243 | " \r\n", 1244 | " # df.append(pd.DataFrame(pool.map(concurrence, d)))\r\n", 1245 | " # print(1)\r\n", 1246 | " # df.append(pd.DataFrame(pool.map(levenshteinDistance, d)))\r\n", 1247 | " # print(2)\r\n", 1248 | " # df.append(pd.DataFrame(pool.map(sorensenDistance, d)))\r\n", 1249 | " # print(3)\r\n", 1250 | " # df.append(pd.DataFrame(pool.map(sameWord, d)))\r\n", 1251 | " # print(4)\r\n", 1252 | " # df.append(pd.DataFrame(pool.map(distance, d)))\r\n", 1253 | " # print(5)\r\n", 1254 | " # df.append(pd.DataFrame(pool.map(word2vecDistance, d)))\r\n", 1255 | " # print(6)\r\n", 1256 | " # df.append(pd.DataFrame(pool.map(w2vWeightDistance, d)))\r\n", 1257 | " # print(7)\r\n", 1258 | " # df.append(pd.DataFrame(pool.map(fuzzyDistance, d)))\r\n", 1259 | " # print(8)\r\n", 1260 | " # df.append(pd.DataFrame(pool.map(powerfuleWord, d)))\r\n", 1261 | " # print(9)\r\n", 1262 | " \r\n", 1263 | " result.append(pd.concat(df, axis=1))\r\n", 1264 | "\r\n", 1265 | " del df\r\n", 1266 | " del query\r\n", 1267 | " del title\r\n", 1268 | " del d\r\n", 1269 | " del chunk\r\n", 1270 | " gc.collect()\r\n", 1271 | "\r\n", 1272 | "del data\r\n", 1273 | "result = pd.concat(result)\r\n", 1274 | "print(result.describe())\r\n", 1275 | "result.to_csv('./handled_data/train_feature/train_feature_jellyfish.csv', header=None, index=False)\r\n", 1276 | "# 释放资源\r\n", 1277 | "pool.close()\r\n", 1278 | "pool.terminate()\r\n", 1279 | "pool.join()\r\n", 1280 | "gc.collect()" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "markdown", 1285 | "metadata": { 1286 | "id": "D23CD913318A48C9A86AD2CC28FB1F08", 1287 | "mdEditEnable": false 1288 | }, 1289 | "source": [ 1290 | "# 特征相关性分析" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "markdown", 1295 | "metadata": { 1296 | "id": "55A6DF5EEEDE4EB388356B43F0B03CC2", 1297 | "mdEditEnable": false 1298 | }, 1299 | "source": [ 1300 | "## 相关系数" 1301 | ] 1302 | }, 1303 | { 1304 | "cell_type": "code", 1305 | "execution_count": 21, 1306 | "metadata": { 1307 | "id": "AC1339E852414CAC9D6C44CA2B2B9C9B", 1308 | "collapsed": false, 1309 | "scrolled": true 1310 | }, 1311 | "outputs": [ 1312 | { 1313 | "name": "stdout", 1314 | "output_type": "stream", 1315 | "text": [ 1316 | "0.9776392403538859 sorensenDistance jaccardDistance\n0.9191483643445837 diceDistance ochiaiDistance\n0.9191483643445837 ochiaiDistance diceDistance\n0.9776392403538859 jaccardDistance sorensenDistance\n0.9790698526721849 word2vecDistance_1 fasttextDistance_1\n0.9013797584783118 word2vecDistance_1 w2vWeightDistance_1\n0.9982673082219394 word2vecDistance_2 word2vecDistance_3\n0.971311990796078 word2vecDistance_2 fasttextDistance_2\n0.9694247650601884 word2vecDistance_2 fasttextDistance_3\n0.9982673082219394 word2vecDistance_3 word2vecDistance_2\n0.9696246774668986 word2vecDistance_3 fasttextDistance_2\n0.9677387984890892 word2vecDistance_3 fasttextDistance_3\n0.9790698526721849 fasttextDistance_1 word2vecDistance_1\n0.971311990796078 fasttextDistance_2 word2vecDistance_2\n0.9696246774668986 fasttextDistance_2 word2vecDistance_3\n0.9980665751664842 fasttextDistance_2 fasttextDistance_3\n0.9694247650601884 fasttextDistance_3 word2vecDistance_2\n0.9677387984890892 
fasttextDistance_3 word2vecDistance_3\n0.9980665751664842 fasttextDistance_3 fasttextDistance_2\n0.9013797584783118 w2vWeightDistance_1 word2vecDistance_1\n0.9517133422175729 w2vWeightDistance_1 fasttextWeightDistance_1\n0.9920940986091317 w2vWeightDistance_2 w2vWeightDistance_3\n0.9920940986091317 w2vWeightDistance_3 w2vWeightDistance_2\n0.9822766542630752 w2vKeywordDistance_1 fasttextKeywordDistance_1\n0.9980499767731188 w2vKeywordDistance_2 w2vKeywordDistance_3\n0.9704524215324315 w2vKeywordDistance_2 fasttextKeywordDistance_2\n0.9684116753038846 w2vKeywordDistance_2 fasttextKeywordDistance_3\n0.9980499767731188 w2vKeywordDistance_3 w2vKeywordDistance_2\n0.9685512758429773 w2vKeywordDistance_3 fasttextKeywordDistance_2\n0.9665154030013232 w2vKeywordDistance_3 fasttextKeywordDistance_3\n0.9517133422175729 fasttextWeightDistance_1 w2vWeightDistance_1\n0.9977510526794808 fasttextWeightDistance_2 fasttextWeightDistance_3\n0.9977510526794808 fasttextWeightDistance_3 fasttextWeightDistance_2\n0.9822766542630752 fasttextKeywordDistance_1 w2vKeywordDistance_1\n0.9041843257841926 fasttextKeywordDistance_1 fasttextKeywordDistance_2\n0.9023746325133278 fasttextKeywordDistance_1 fasttextKeywordDistance_3\n0.9704524215324315 fasttextKeywordDistance_2 w2vKeywordDistance_2\n0.9685512758429773 fasttextKeywordDistance_2 w2vKeywordDistance_3\n0.9041843257841926 fasttextKeywordDistance_2 fasttextKeywordDistance_1\n0.9979225827100741 fasttextKeywordDistance_2 fasttextKeywordDistance_3\n0.9684116753038846 fasttextKeywordDistance_3 w2vKeywordDistance_2\n0.9665154030013232 fasttextKeywordDistance_3 w2vKeywordDistance_3\n0.9023746325133278 fasttextKeywordDistance_3 fasttextKeywordDistance_1\n0.9979225827100741 fasttextKeywordDistance_3 fasttextKeywordDistance_2\n0.9999576749537433 pagerank_feature1 seq2_max_degree\n0.9999999866735351 pagerank_feature1 seq1_neighbor\n0.9953337614918548 pagerank_feature1 total_neighbor\n0.9670942889451213 pagerank+feature2 seq1_max_component\n0.9999995206910431 pagerank+feature2 seq2_neighbor\n0.9999576749537433 seq2_max_degree pagerank_feature1\n0.9999576834960514 seq2_max_degree seq1_neighbor\n0.995232077474844 seq2_max_degree total_neighbor\n0.9670942889451213 seq1_max_component pagerank+feature2\n0.9670951938053971 seq1_max_component seq2_neighbor\n0.9999999866735351 seq1_neighbor pagerank_feature1\n0.9999576834960514 seq1_neighbor seq2_max_degree\n0.9953338453527495 seq1_neighbor total_neighbor\n0.9999995206910431 seq2_neighbor pagerank+feature2\n0.9670951938053971 seq2_neighbor seq1_max_component\n0.9953337614918548 total_neighbor pagerank_feature1\n0.995232077474844 total_neighbor seq2_max_degree\n0.9953338453527495 total_neighbor seq1_neighbor\n" 1317 | ] 1318 | } 1319 | ], 1320 | "source": [ 1321 | "feature_name = ['concurrence', 'levenshteinDistance', 'sorensenDistance', \n", 1322 | " 'sameWord', 'specialConcurrence',\n", 1323 | " 'diceDistance', 'ochiaiDistance', 'jaccardDistance', \n", 1324 | " 'word2vecDistance_1', 'word2vecDistance_2', 'word2vecDistance_3', \n", 1325 | " 'fasttextDistance_1', 'fasttextDistance_2', 'fasttextDistance_3', \n", 1326 | " 'w2vWeightDistance_1', 'w2vWeightDistance_2', 'w2vWeightDistance_3',\n", 1327 | " 'w2vKeywordDistance_1', 'w2vKeywordDistance_2', 'w2vKeywordDistance_3',\n", 1328 | " 'TFIDFDistance_1', 'TFIDFDistance_2', 'TFIDFDistance_3', \n", 1329 | " 'TFIDFDistance_4', 'TFIDFDistance_5', \n", 1330 | " 'fasttextWeightDistance_1', 'fasttextWeightDistance_2', 'fasttextWeightDistance_3',\n", 1331 | " 'fasttextKeywordDistance_1', 
'fasttextKeywordDistance_2', 'fasttextKeywordDistance_3',\n", 1332 | " 'pagerank_feature1', 'pagerank_feature2',\n", 1333 | " 'seq1_max_degree', 'seq2_max_degree', 'seq1_max_component',\n", 1334 | " 'seq1_neighbor', 'seq2_neighbor', 'total_neighbor',\n", 1335 | " 'DSSM_Feature', 'Deep_Model1', 'ARC', 'MVLSTM']\n", 1336 | " \n", 1337 | "with open('./handled_data/correlation', 'r', encoding='utf-8') as f:\n", 1338 | " for idx, line in enumerate(f.readlines()):\n", 1339 | " tmp_data = list(map(lambda x:float(x), line.split(',')))\n", 1340 | " for item in range(len(tmp_data)):\n", 1341 | " if tmp_data[item] > 0.9 and item != idx:\n", 1342 | " print(tmp_data[item], feature_name[idx], feature_name[item])" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "markdown", 1347 | "metadata": { 1348 | "id": "08DA2FEA1043479495F8624EAC6E8336", 1349 | "mdEditEnable": false 1350 | }, 1351 | "source": [ 1352 | "## heatmap" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": 47, 1358 | "metadata": { 1359 | "id": "7C7598F567B7418785EBE3B64A30B1B2", 1360 | "collapsed": false, 1361 | "scrolled": false 1362 | }, 1363 | "outputs": [ 1364 | { 1365 | "data": { 1366 | "text/html": [ 1367 | "" 1368 | ], 1369 | "text/plain": [ 1370 | "
" 1371 | ] 1372 | }, 1373 | "metadata": { 1374 | "needs_background": "light" 1375 | }, 1376 | "output_type": "execute_result" 1377 | }, 1378 | { 1379 | "data": { 1380 | "text/plain": [ 1381 | "
" 1382 | ] 1383 | }, 1384 | "metadata": {}, 1385 | "output_type": "display_data" 1386 | }, 1387 | { 1388 | "data": { 1389 | "text/plain": [ 1390 | "
" 1391 | ] 1392 | }, 1393 | "metadata": {}, 1394 | "output_type": "display_data" 1395 | } 1396 | ], 1397 | "source": [ 1398 | "def plot_corr():\r\n", 1399 | " corr = pd.read_csv('./handled_data/correlation', header=None)\r\n", 1400 | " mask = np.zeros_like(corr, dtype=np.bool)\r\n", 1401 | " mask[np.triu_indices_from(mask)] = True\r\n", 1402 | " cmap = sns.diverging_palette(220, 10, as_cmap=True)\r\n", 1403 | " g = sns.heatmap(corr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')\r\n", 1404 | " plt.figure(dpi=500)\r\n", 1405 | " plt.show()\r\n", 1406 | " plt.savefig('./try_test')\r\n", 1407 | "plot_corr()" 1408 | ] 1409 | }, 1410 | { 1411 | "cell_type": "markdown", 1412 | "metadata": { 1413 | "id": "C4D670551EDD4DF78453E4AEF3437963", 1414 | "mdEditEnable": false 1415 | }, 1416 | "source": [ 1417 | "# RF" 1418 | ] 1419 | }, 1420 | { 1421 | "cell_type": "code", 1422 | "execution_count": 6, 1423 | "metadata": { 1424 | "id": "C143F1CF75C84E6C8CB9A410E4982CC7", 1425 | "collapsed": false, 1426 | "scrolled": false 1427 | }, 1428 | "outputs": [ 1429 | { 1430 | "data": { 1431 | "text/plain": [ 1432 | "7" 1433 | ] 1434 | }, 1435 | "execution_count": 6, 1436 | "metadata": {}, 1437 | "output_type": "execute_result" 1438 | } 1439 | ], 1440 | "source": [ 1441 | "del x_train\n", 1442 | "del y_train\n", 1443 | "gc.collect()" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": 5, 1449 | "metadata": { 1450 | "id": "CA9F515F3FFC4EA488778D87C5268F1C", 1451 | "collapsed": false, 1452 | "scrolled": false 1453 | }, 1454 | "outputs": [ 1455 | { 1456 | "name": "stdout", 1457 | "output_type": "stream", 1458 | "text": [ 1459 | "-------------------start train-------------------\n" 1460 | ] 1461 | }, 1462 | { 1463 | "name": "stderr", 1464 | "output_type": "stream", 1465 | "text": [ 1466 | "[Parallel(n_jobs=15)]: Using backend ThreadingBackend with 15 concurrent workers.\n" 1467 | ] 1468 | } 1469 | ], 1470 | "source": [ 1471 | "rf = RandomForestClassifier(n_jobs=15,\n", 1472 | " n_estimators=150,\n", 1473 | " class_weight={0:1, 1:3},\n", 1474 | " verbose=1)\n", 1475 | "train_x = np.array(x_train.values).reshape(-1, 1).astype('int')\n", 1476 | "train_y = np.array(y_train.values).reshape(-1, 1).astype('int')\n", 1477 | "\n", 1478 | "test_x = np.array(x_test.values).reshape(-1, 1).astype('int')\n", 1479 | "test_y = np.array(y_test.values).reshape(-1, 1).astype('int')\n", 1480 | "print('-------------------start train-------------------')\n", 1481 | "rf.fit(train_x, train_y)\n", 1482 | "pred_rf = rf.predict_proba(test_x)\n", 1483 | "auc_score = roc_auc_score(test_y, pred_rf)\n", 1484 | "acc_score = precision_score(test_y, pred_rf)\n", 1485 | "print('auc_score:', auc_score)\n", 1486 | "print('acc_score:', acc_score)" 1487 | ] 1488 | }, 1489 | { 1490 | "cell_type": "markdown", 1491 | "metadata": { 1492 | "id": "00668DC1B0E74253AC74E3DA5E444CC4", 1493 | "mdEditEnable": false 1494 | }, 1495 | "source": [ 1496 | "# XGBoost" 1497 | ] 1498 | }, 1499 | { 1500 | "cell_type": "code", 1501 | "execution_count": 19, 1502 | "metadata": { 1503 | "id": "FD68F203FD1A444486493FB70F4A64D3", 1504 | "collapsed": false, 1505 | "scrolled": false 1506 | }, 1507 | "outputs": [ 1508 | { 1509 | "name": "stdout", 1510 | "output_type": "stream", 1511 | "text": [ 1512 | "[]\n" 1513 | ] 1514 | } 1515 | ], 1516 | "source": [ 1517 | "print(train.get_label())" 1518 | ] 1519 | }, 1520 | { 1521 | "cell_type": "code", 1522 | "execution_count": 18, 1523 | "metadata": { 1524 | "id": "BC7D52A783334956A93B90A77C182F38", 
1525 | "collapsed": false, 1526 | "scrolled": true 1527 | }, 1528 | "outputs": [ 1529 | { 1530 | "name": "stdout", 1531 | "output_type": "stream", 1532 | "text": [ 1533 | "[14:20:06] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.\n" 1534 | ] 1535 | }, 1536 | { 1537 | "ename": "XGBoostError", 1538 | "evalue": "b'[14:20:06] /workspace/src/objective/regression_obj.cu:65: Check failed: info.labels_.Size() != 0U (0 vs. 0) label set cannot be empty\\n\\nStack trace returned 10 entries:\\n[bt] (0) /opt/conda/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7fb9897f85cd]\\n[bt] (1) /opt/conda/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7fb9897f89c8]\\n[bt] (2) /opt/conda/xgboost/libxgboost.so(xgboost::obj::RegLossObj::GetGradient(xgboost::HostDeviceVector const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector >*)+0xcd) [0x7fb9899ee28d]\\n[bt] (3) /opt/conda/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7fb98986f1e2]\\n[bt] (4) /opt/conda/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb9897f0ab5]\\n[bt] (5) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fb9e0c32ec0]\\n[bt] (6) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fb9e0c3287d]\\n[bt] (7) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fb9e0e47dee]\\n[bt] (8) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7fb9e0e48825]\\n[bt] (9) python(_PyObject_FastCallDict+0x8b) [0x55ebba14c1bb]\\n\\n'", 1539 | "traceback": [ 1540 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1541 | "\u001b[0;31mXGBoostError\u001b[0m Traceback (most recent call last)", 1542 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 1543 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks, learning_rates)\u001b[0m\n\u001b[1;32m 214\u001b[0m \u001b[0mevals\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mevals\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 215\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeval\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfeval\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 216\u001b[0;31m xgb_model=xgb_model, callbacks=callbacks)\n\u001b[0m\u001b[1;32m 217\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 218\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1544 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/training.py\u001b[0m in \u001b[0;36m_train_internal\u001b[0;34m(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;31m# Skip the first update if it is a recovery step.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mversion\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0;36m2\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 74\u001b[0;31m 
\u001b[0mbst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 75\u001b[0m \u001b[0mbst\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msave_rabit_checkpoint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 76\u001b[0m \u001b[0mversion\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1545 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36mupdate\u001b[0;34m(self, dtrain, iteration, fobj)\u001b[0m\n\u001b[1;32m 1043\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1044\u001b[0m _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, ctypes.c_int(iteration),\n\u001b[0;32m-> 1045\u001b[0;31m dtrain.handle))\n\u001b[0m\u001b[1;32m 1046\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1047\u001b[0m \u001b[0mpred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtrain\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1546 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/xgboost/core.py\u001b[0m in \u001b[0;36m_check_call\u001b[0;34m(ret)\u001b[0m\n\u001b[1;32m 163\u001b[0m \"\"\"\n\u001b[1;32m 164\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 165\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mXGBoostError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mXGBGetLastError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 166\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1547 | "\u001b[0;31mXGBoostError\u001b[0m: b'[14:20:06] /workspace/src/objective/regression_obj.cu:65: Check failed: info.labels_.Size() != 0U (0 vs. 
0) label set cannot be empty\\n\\nStack trace returned 10 entries:\\n[bt] (0) /opt/conda/xgboost/libxgboost.so(dmlc::StackTrace()+0x3d) [0x7fb9897f85cd]\\n[bt] (1) /opt/conda/xgboost/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x18) [0x7fb9897f89c8]\\n[bt] (2) /opt/conda/xgboost/libxgboost.so(xgboost::obj::RegLossObj::GetGradient(xgboost::HostDeviceVector const&, xgboost::MetaInfo const&, int, xgboost::HostDeviceVector >*)+0xcd) [0x7fb9899ee28d]\\n[bt] (3) /opt/conda/xgboost/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, xgboost::DMatrix*)+0x362) [0x7fb98986f1e2]\\n[bt] (4) /opt/conda/xgboost/libxgboost.so(XGBoosterUpdateOneIter+0x35) [0x7fb9897f0ab5]\\n[bt] (5) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call_unix64+0x4c) [0x7fb9e0c32ec0]\\n[bt] (6) /opt/conda/lib/python3.6/lib-dynload/../../libffi.so.6(ffi_call+0x22d) [0x7fb9e0c3287d]\\n[bt] (7) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(_ctypes_callproc+0x2ce) [0x7fb9e0e47dee]\\n[bt] (8) /opt/conda/lib/python3.6/lib-dynload/_ctypes.cpython-36m-x86_64-linux-gnu.so(+0x12825) [0x7fb9e0e48825]\\n[bt] (9) python(_PyObject_FastCallDict+0x8b) [0x55ebba14c1bb]\\n\\n'" 1548 | ], 1549 | "output_type": "error" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "%%time\n", 1554 | "# train, test = train_test_split(total, test_size=0.1, random_state=20, stratify=total[label_idx])\n", 1555 | "# print('end split')\n", 1556 | "# param = {'objective': 'binary:logistic'}\n", 1557 | "# train = xgb.DMatrix(train)\n", 1558 | "# test = xgb.DMatrix(test)\n", 1559 | "# print(train.num_col())\n", 1560 | "# print(train.num_row())\n", 1561 | "\n", 1562 | "bst = xgb.train(param, train, 2)\n", 1563 | "\n", 1564 | "# pred_train = bst.predict(train)\n", 1565 | "# acc_score = accuracy_score(train.get_label(), pred_train)\n", 1566 | "# print(acc_score)" 1567 | ] 1568 | }, 1569 | { 1570 | "cell_type": "code", 1571 | "execution_count": 12, 1572 | "metadata": { 1573 | "id": "77F7C13F377544128FEE0CC87859FAE9", 1574 | "collapsed": false, 1575 | "scrolled": false 1576 | }, 1577 | "outputs": [ 1578 | { 1579 | "data": { 1580 | "text/plain": [ 1581 | "7" 1582 | ] 1583 | }, 1584 | "execution_count": 12, 1585 | "metadata": {}, 1586 | "output_type": "execute_result" 1587 | } 1588 | ], 1589 | "source": [ 1590 | "del total\n", 1591 | "gc.collect()" 1592 | ] 1593 | }, 1594 | { 1595 | "cell_type": "code", 1596 | "execution_count": null, 1597 | "metadata": { 1598 | "id": "63DA97D8ADDE475EACA2EB7875FE2A26" 1599 | }, 1600 | "outputs": [], 1601 | "source": [ 1602 | "pred_test = bst.predict(test)\n", 1603 | "test_acc = accuracy_score(test.get_label(), pred_test)\n", 1604 | "test_auc = roc_auc_score(test.get_label(), pred_test)" 1605 | ] 1606 | }, 1607 | { 1608 | "cell_type": "markdown", 1609 | "metadata": { 1610 | "id": "D90F0E908A3E4E4A8436C11BA5CA0537", 1611 | "mdEditEnable": false 1612 | }, 1613 | "source": [ 1614 | "# LightGBM模型" 1615 | ] 1616 | }, 1617 | { 1618 | "cell_type": "markdown", 1619 | "metadata": { 1620 | "id": "6ADF6DA536D94C5B99CA1A9D2C0FE965", 1621 | "mdEditEnable": false 1622 | }, 1623 | "source": [ 1624 | "## 准备训练数据" 1625 | ] 1626 | }, 1627 | { 1628 | "cell_type": "code", 1629 | "execution_count": 4, 1630 | "metadata": { 1631 | "id": "4371E4A6693B4712A3FFE7FCB3B96050", 1632 | "collapsed": false, 1633 | "scrolled": true 1634 | }, 1635 | "outputs": [ 1636 | { 1637 | "name": "stdout", 1638 | "output_type": "stream", 1639 | "text": [ 1640 | "2019-08-11 19:19:30.037143\nloaded normal feature ---> (30000000, 
31)\nloaded graph feature ---> (30000000, 5)\nloaded fuzzy feature ---> (30000000, 7)\nloaded powerful feature ---> (30000000, 2)\nloaded dssm_300 feature ---> (30000000, 1)\nloaded dssm_600 feature ---> (30000000, 1)\nloaded mvlstm feature ---> (30000000, 1)\nloaded ARC feature ---> (30000000, 1)\nloaded TextCNN feature ---> (30000000, 1)\nloaded OOF CNN feature ---> (30000000, 1)\nloaded OOF LSTM feature ---> (30000000, 1)\nloaded label with shape ---> (30000000, 1)\n----------------------end reading file--------------------------\ntotal feature shape ---> (30000000, 53)\nsplit train set and test set\ninitialize dataset\nCPU times: user 8min 10s, sys: 40.9 s, total: 8min 51s\nWall time: 9min 9s\n" 1641 | ] 1642 | } 1643 | ], 1644 | "source": [ 1645 | "%%time\r\n", 1646 | "\r\n", 1647 | "print(datetime.now())\r\n", 1648 | "# 一般特征\r\n", 1649 | "feature_normal = pd.read_csv('./handled_data/train_feature/train_feature_3kw_normal.csv.gz', header=None)\r\n", 1650 | "print('loaded normal feature --->',feature_normal.shape)\r\n", 1651 | "\r\n", 1652 | "# feature_statistics = pd.read_csv('./handled_data/train_feature/train_feature_3kw_statistics.csv.gz', header=None)\r\n", 1653 | "# print('loaded statistics feature --->',feature_statistics.shape)\r\n", 1654 | "# feature_pagerank = pd.read_csv('./handled_data/train_feature/train_feature_3kw_pagerank.csv.gz', header=None)\r\n", 1655 | "# print('loaded pagerank feature --->',feature_pagerank.shape)\r\n", 1656 | "\r\n", 1657 | "feature_graph = pd.read_csv('./handled_data/train_feature/train_feature_graph_final.csv.gz', header=None)\r\n", 1658 | "print('loaded graph feature --->',feature_graph.shape)\r\n", 1659 | "\r\n", 1660 | "feature_fuzzy = pd.read_csv('./handled_data/train_feature/train_feature_fuzzy.csv', header=None)\r\n", 1661 | "print('loaded fuzzy feature --->',feature_fuzzy.shape)\r\n", 1662 | "feature_powerful = pd.read_csv('./handled_data/train_feature/train_feature_powerful.csv', header=None)\r\n", 1663 | "print('loaded powerful feature --->',feature_powerful.shape)\r\n", 1664 | "\r\n", 1665 | "# 融合特征\r\n", 1666 | "feat_dssm_300 = pd.read_csv('./predictions/feat_dssm_300_3kw.csv.gz', header=None)\r\n", 1667 | "print('loaded dssm_300 feature --->', feat_dssm_300.shape)\r\n", 1668 | "\r\n", 1669 | "feat_dssm_600 = pd.read_csv('./predictions/feat_dssm_600_3kw.csv.gz', header=None)\r\n", 1670 | "print('loaded dssm_600 feature --->', feat_dssm_600.shape) \r\n", 1671 | "\r\n", 1672 | "feat_mvlstm = pd.read_csv('./predictions/feat_mvlstm_3kw.csv.gz', header=None)\r\n", 1673 | "print('loaded mvlstm feature --->', feat_mvlstm.shape) \r\n", 1674 | "\r\n", 1675 | "feat_arc = pd.read_csv('./predictions/feat_arc_3kw.csv.gz', header=None)\r\n", 1676 | "print('loaded ARC feature --->', feat_arc.shape) \r\n", 1677 | " \r\n", 1678 | "feat_textcnn = pd.read_csv('./predictions/feat_textcnn_3kw.csv.gz', header=None)\r\n", 1679 | "print('loaded TextCNN feature --->', feat_textcnn.shape)\r\n", 1680 | "\r\n", 1681 | "feat_oofcnn = pd.read_csv('./predictions/feat_oofcnn_3kw.csv', header=None)\r\n", 1682 | "print('loaded OOF CNN feature --->', feat_oofcnn.shape)\r\n", 1683 | "\r\n", 1684 | "feat_ooflstm = pd.read_csv('./predictions/feat_ooflstm_3kw.csv', header=None)\r\n", 1685 | "print('loaded OOF LSTM feature --->', feat_ooflstm.shape)\r\n", 1686 | "\r\n", 1687 | "label = pd.read_csv('./handled_data/label', header=None)\r\n", 1688 | "print('loaded label with shape --->',label.shape)\r\n", 1689 | "print('----------------------end reading 
file--------------------------')\r\n", 1690 | "total = pd.concat([feature_normal, \r\n", 1691 | " # feature_statistics, feature_pagerank, \r\n", 1692 | " feature_graph,\r\n", 1693 | " feature_fuzzy, feature_powerful,\r\n", 1694 | " feat_dssm_300,\r\n", 1695 | " feat_dssm_600,\r\n", 1696 | " feat_mvlstm,\r\n", 1697 | " feat_arc,\r\n", 1698 | " feat_textcnn,\r\n", 1699 | " feat_oofcnn,\r\n", 1700 | " feat_ooflstm,\r\n", 1701 | " label], axis=1)\r\n", 1702 | "total.columns = range(total.shape[1])\r\n", 1703 | "print('total feature shape --->', total.shape)\r\n", 1704 | "\r\n", 1705 | "print('split train set and test set')\r\n", 1706 | "label_idx = total.shape[1] - 1\r\n", 1707 | "\r\n", 1708 | "train, val = train_test_split(total, test_size=0.1, random_state=20, stratify=total[label_idx])\r\n", 1709 | "\r\n", 1710 | "x = train.drop(label_idx, axis=1)\r\n", 1711 | "y = train[label_idx]\r\n", 1712 | "\r\n", 1713 | "val_x = val.drop(label_idx, axis=1)\r\n", 1714 | "val_y = val[label_idx]\r\n", 1715 | "\r\n", 1716 | "print('initialize dataset')\r\n", 1717 | "lgb_train = lgb.Dataset(x, y, free_raw_data=False)\r\n", 1718 | "lgb_eval = lgb.Dataset(val_x, val_y, reference=lgb_train, free_raw_data=False)" 1719 | ] 1720 | }, 1721 | { 1722 | "cell_type": "code", 1723 | "execution_count": 9, 1724 | "metadata": { 1725 | "id": "019B00D9B9914A51A76009D3FBA3E7C5", 1726 | "collapsed": false, 1727 | "scrolled": false 1728 | }, 1729 | "outputs": [], 1730 | "source": [ 1731 | "total_x = total.drop(label_idx, axis=1)\n", 1732 | "total_y = total[label_idx]\n", 1733 | "print(total_x.shape)\n", 1734 | "print(total_x.info())\n", 1735 | "print(total_y.shape)\n", 1736 | "pritn(total_y.info())\n", 1737 | "lgb_train = lgb.Dataset(total_x, total_y, free_raw_data=False)" 1738 | ] 1739 | }, 1740 | { 1741 | "cell_type": "code", 1742 | "execution_count": 6, 1743 | "metadata": { 1744 | "id": "D79357D486D244678838A702C3148530", 1745 | "collapsed": false, 1746 | "scrolled": true 1747 | }, 1748 | "outputs": [ 1749 | { 1750 | "name": "stdout", 1751 | "output_type": "stream", 1752 | "text": [ 1753 | "(30000000, 52)\n 0 1 2 3 4 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 3.138709e-01 6.360163e-01 7.210984e-01 1.328846e+01 5.023723e-05 \nstd 1.871304e-01 1.825601e-01 1.542859e-01 4.734423e+00 7.655814e-04 \nmin 0.000000e+00 3.225806e-02 0.000000e+00 1.000000e+00 3.876599e-302 \n25% 1.904762e-01 5.000000e-01 6.363636e-01 1.000000e+01 1.425644e-12 \n50% 2.857143e-01 6.666667e-01 7.333333e-01 1.300000e+01 3.036743e-10 \n75% 4.090909e-01 7.777778e-01 8.181818e-01 1.600000e+01 6.644035e-08 \nmax 3.902353e+01 1.000000e+00 1.000000e+00 3.850000e+02 6.000000e-01 \n\n 5 6 7 8 9 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 2.809463e-01 5.420982e-01 8.260438e-01 7.305404e-01 -7.067164e-01 \nstd 1.559342e-01 2.922543e-01 1.176581e-01 1.138556e-01 4.590602e-01 \nmin 0.000000e+00 0.000000e+00 -5.142857e+00 -1.806917e-01 -5.323372e+00 \n25% 1.818182e-01 3.333333e-01 7.777778e-01 6.662080e-01 -9.964515e-01 \n50% 2.666667e-01 5.163978e-01 8.461538e-01 7.450130e-01 -6.658506e-01 \n75% 3.636364e-01 7.071068e-01 9.000000e-01 8.105919e-01 -3.794953e-01 \nmax 1.720000e+00 1.576659e+01 1.000000e+00 1.000000e+00 1.000000e+00 \n\n ... 42 43 44 45 \\\ncount ... 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean ... 3.053099e+00 8.375073e-01 3.500157e-01 4.725512e-01 \nstd ... 6.043442e-01 1.453110e-01 1.638842e-01 1.583299e-01 \nmin ... 
0.000000e+00 0.000000e+00 0.000000e+00 -4.724671e-01 \n25% ... 2.714013e+00 7.789553e-01 2.111369e-01 3.822839e-01 \n50% ... 3.127549e+00 8.863827e-01 3.517495e-01 4.979405e-01 \n75% ... 3.467873e+00 9.420421e-01 4.681469e-01 5.879889e-01 \nmax ... 5.794093e+00 1.000000e+00 1.000000e+00 8.932437e-01 \n\n 46 47 48 49 50 \\\ncount 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 3.000000e+07 \nmean 4.913790e-01 1.704771e-01 1.771518e-01 1.857886e-01 1.686962e-01 \nstd 1.603338e-01 4.920971e-02 3.686785e-02 5.073331e-02 4.917796e-02 \nmin -4.589797e-01 1.637830e-02 1.085761e-02 7.413505e-03 2.439348e-03 \n25% 3.987670e-01 1.346706e-01 1.530675e-01 1.492922e-01 1.368189e-01 \n50% 5.176989e-01 1.684497e-01 1.779618e-01 1.904999e-01 1.739967e-01 \n75% 6.093000e-01 2.041900e-01 2.028155e-01 2.268609e-01 2.051853e-01 \nmax 9.266943e-01 3.532417e-01 3.169906e-01 2.812516e-01 5.599475e-01 \n\n 51 \ncount 3.000000e+07 \nmean 1.833208e-01 \nstd 6.467187e-02 \nmin 2.731318e-14 \n25% 1.404598e-01 \n50% 1.873902e-01 \n75% 2.247325e-01 \nmax 9.985868e-01 \n\n[8 rows x 52 columns]\n(30000000,)\ncount 3.000000e+07\nmean 1.822721e-01\nstd 3.860687e-01\nmin 0.000000e+00\n25% 0.000000e+00\n50% 0.000000e+00\n75% 0.000000e+00\nmax 1.000000e+00\nName: 52, dtype: float64\n" 1754 | ] 1755 | } 1756 | ], 1757 | "source": [ 1758 | "" 1759 | ] 1760 | }, 1761 | { 1762 | "cell_type": "markdown", 1763 | "metadata": { 1764 | "id": "55C87C30E20844FA8677C3BBFD4E2D45", 1765 | "mdEditEnable": false 1766 | }, 1767 | "source": [ 1768 | "## 训练" 1769 | ] 1770 | }, 1771 | { 1772 | "cell_type": "code", 1773 | "execution_count": 10, 1774 | "metadata": { 1775 | "id": "0716C46A15684CA1A77343249DC9FAD1", 1776 | "collapsed": false, 1777 | "scrolled": false 1778 | }, 1779 | "outputs": [ 1780 | { 1781 | "name": "stdout", 1782 | "output_type": "stream", 1783 | "text": [ 1784 | "2019-08-11 19:36:41.955449\nCPU times: user 9h 24min 49s, sys: 1min 7s, total: 9h 25min 56s\nWall time: 41min 38s\n" 1785 | ] 1786 | }, 1787 | { 1788 | "data": { 1789 | "text/plain": [ 1790 | "" 1791 | ] 1792 | }, 1793 | "execution_count": 10, 1794 | "metadata": {}, 1795 | "output_type": "execute_result" 1796 | } 1797 | ], 1798 | "source": [ 1799 | "%%time\r\n", 1800 | "params = {\r\n", 1801 | " 'boosting': 'dart',\r\n", 1802 | " 'objective': 'binary',\r\n", 1803 | " 'metric': 'auc',\r\n", 1804 | " 'learning_rate': '0.5',\r\n", 1805 | " \r\n", 1806 | " 'num_leaves':59,\r\n", 1807 | " 'max_depth':7,\r\n", 1808 | " \r\n", 1809 | " 'max_bin':251,\r\n", 1810 | " 'min_data_in_leaf':19,\r\n", 1811 | " \r\n", 1812 | " 'scale_pos_weight': 3.1,\r\n", 1813 | " \r\n", 1814 | " 'lambda_l1':0,\r\n", 1815 | " 'lambda_l2':0,\r\n", 1816 | " 'min_split_gain':0,\r\n", 1817 | " \r\n", 1818 | " 'device': 'gpu',\r\n", 1819 | " 'gpu_platform_id':0,\r\n", 1820 | " 'gpu_device_id':0\r\n", 1821 | "}\r\n", 1822 | "\r\n", 1823 | "print(datetime.now())\r\n", 1824 | "gbm = lgb.train(params, lgb_train, num_boost_round=450, early_stopping_rounds=30)\r\n", 1825 | "gbm.save_model('./handled_data/final_lgb_450.model') \r\n", 1826 | "# init_model=gbm, keep_training_booster=True" 1827 | ] 1828 | }, 1829 | { 1830 | "cell_type": "markdown", 1831 | "metadata": { 1832 | "id": "4B62C670C46546168FAA5DB184641EB9", 1833 | "mdEditEnable": false 1834 | }, 1835 | "source": [ 1836 | "## GridSearch" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 14, 1842 | "metadata": { 1843 | "id": "182A5DC4B3D74AC195675B4D444AFCEA", 1844 | "collapsed": false, 1845 | "scrolled": true 1846 | }, 
1847 | "outputs": [ 1848 | { 1849 | "name": "stdout", 1850 | "output_type": "stream", 1851 | "text": [ 1852 | "Fitting 5 folds for each of 21 candidates, totalling 105 fits\n" 1853 | ] 1854 | }, 1855 | { 1856 | "name": "stderr", 1857 | "output_type": "stream", 1858 | "text": [ 1859 | "[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.\n[Parallel(n_jobs=10)]: Done 30 tasks | elapsed: 34.0min\n[Parallel(n_jobs=10)]: Done 105 out of 105 | elapsed: 120.1min finished\n" 1860 | ] 1861 | }, 1862 | { 1863 | "name": "stdout", 1864 | "output_type": "stream", 1865 | "text": [ 1866 | "{'min_data_in_leaf': 251} \n 0.7039860009576545\n" 1867 | ] 1868 | } 1869 | ], 1870 | "source": [ 1871 | "# 6 64\r\n", 1872 | "# 7 58\r\n", 1873 | "# params = {'max_depth':range(4,8), 'num_leaves':range(56, 60)}\r\n", 1874 | "# 2 20\r\n", 1875 | "# params = {'min_child_samples':range(18,23)}\r\n", 1876 | "\r\n", 1877 | "# params ={'max_bin': range(5,100,10), }\r\n", 1878 | "\r\n", 1879 | "params = {'min_data_in_leaf':range(240, 261)}\r\n", 1880 | "gbm = lgb.LGBMClassifier(boost='dart',\r\n", 1881 | " max_depth=7, \r\n", 1882 | " num_leaves=59,\r\n", 1883 | " max_bin = 95,\r\n", 1884 | " learning_rate=0.5, \r\n", 1885 | " n_estimators=100,\r\n", 1886 | " metric='auc')\r\n", 1887 | "gridSearch = GridSearchCV(estimator=gbm, param_grid=params, scoring='roc_auc', \r\n", 1888 | " cv=5, verbose=1, n_jobs=15)\r\n", 1889 | "\r\n", 1890 | "gridSearch.fit(x, y)\r\n", 1891 | "\r\n", 1892 | "print(gridSearch.best_params_ , '\\n', gridSearch.best_score_)" 1893 | ] 1894 | }, 1895 | { 1896 | "cell_type": "markdown", 1897 | "metadata": { 1898 | "id": "EDEF130C7D6E4C149C34CA5FA916CA2C", 1899 | "mdEditEnable": false 1900 | }, 1901 | "source": [ 1902 | "## 查看特征重要性" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": 11, 1908 | "metadata": { 1909 | "id": "AB78DAF736034D958BD510FBBDD13A49", 1910 | "collapsed": false, 1911 | "scrolled": false 1912 | }, 1913 | "outputs": [ 1914 | { 1915 | "name": "stdout", 1916 | "output_type": "stream", 1917 | "text": [ 1918 | "52\n52\n feature_name importance\n0 DSSM_300 1522\n1 seq1_max_degree 1460\n2 pagerank_feature2 1374\n3 fuzzy_1 1246\n4 pagerank_feature1 1182\n5 fuzzy_3 1154\n6 fuzzy_4 1119\n7 doubleSide 890\n8 sameWord 750\n9 OOFCNN 743\n10 fasttextDistance_1 611\n11 levenshteinDistance 590\n12 fuzzy_5 562\n13 TFIDFDistance_2 550\n14 specialConcurrence 519\n15 DSSM_600 512\n16 fuzzy_2 476\n17 word2vecDistance_1 448\n18 seq2_max_degree 430\n19 fasttextWeightDistance_2 425\n20 ARC 399\n21 fasttextWeightDistance_1 397\n22 singleSide 392\n23 word2vec 360\n24 ochiaiDistance 340\n25 OOFLSTM 328\n26 w2vWeightDistance_1 316\n27 fasttextKeywordDistance_1 307\n28 TextCNN 296\n29 TFIDFDistance_3 281\n30 MVLSTM 254\n31 sorensenDistance 244\n32 concurrence 229\n33 w2vWeightDistance_2 222\n34 TFIDFDistance_4 219\n35 fasttextWeightDistance_3 204\n36 w2vKeywordDistance_1 191\n37 fasttextKeywordDistance_2 185\n38 TFIDFDistance_1 173\n39 TFIDFDistance_5 173\n40 word2vecDistance_2 162\n41 w2vWeightDistance_3 149\n42 word2vecDistance_3 142\n43 fasttextKeywordDistance_3 133\n44 fasttext 120\n45 jaccardDistance 119\n46 fasttextDistance_3 112\n47 w2vKeywordDistance_2 109\n48 w2vKeywordDistance_3 102\n49 fasttextDistance_2 94\n50 seq1_max_component 85\n51 diceDistance 71\n0 concurrence 229\n1 levenshteinDistance 590\n2 sorensenDistance 244\n3 sameWord 750\n4 specialConcurrence 519\n5 diceDistance 71\n6 ochiaiDistance 340\n7 jaccardDistance 119\n8 word2vecDistance_1 
448\n9 word2vecDistance_2 162\n10 word2vecDistance_3 142\n11 fasttextDistance_1 611\n12 fasttextDistance_2 94\n13 fasttextDistance_3 112\n14 w2vWeightDistance_1 316\n15 w2vWeightDistance_2 222\n16 w2vWeightDistance_3 149\n17 w2vKeywordDistance_1 191\n18 w2vKeywordDistance_2 109\n19 w2vKeywordDistance_3 102\n20 TFIDFDistance_1 173\n21 TFIDFDistance_2 550\n22 TFIDFDistance_3 281\n23 TFIDFDistance_4 219\n24 TFIDFDistance_5 173\n25 fasttextWeightDistance_1 397\n26 fasttextWeightDistance_2 425\n27 fasttextWeightDistance_3 204\n28 fasttextKeywordDistance_1 307\n29 fasttextKeywordDistance_2 185\n30 fasttextKeywordDistance_3 133\n31 seq1_max_degree 1460\n32 seq2_max_degree 430\n33 seq1_max_component 85\n34 pagerank_feature1 1182\n35 pagerank_feature2 1374\n36 fuzzy_1 1246\n37 fuzzy_2 476\n38 fuzzy_3 1154\n39 fuzzy_4 1119\n40 fuzzy_5 562\n41 word2vec 360\n42 fasttext 120\n43 singleSide 392\n44 doubleSide 890\n45 DSSM_300 1522\n46 DSSM_600 512\n47 MVLSTM 254\n48 ARC 399\n49 TextCNN 296\n50 OOFCNN 743\n51 OOFLSTM 328\n" 1919 | ] 1920 | } 1921 | ], 1922 | "source": [ 1923 | "name = ['concurrence', 'levenshteinDistance', 'sorensenDistance', \r\n", 1924 | " 'sameWord', 'specialConcurrence',\r\n", 1925 | " 'diceDistance', 'ochiaiDistance', 'jaccardDistance', \r\n", 1926 | " 'word2vecDistance_1', 'word2vecDistance_2', 'word2vecDistance_3', \r\n", 1927 | " 'fasttextDistance_1', 'fasttextDistance_2', 'fasttextDistance_3', \r\n", 1928 | " 'w2vWeightDistance_1', 'w2vWeightDistance_2', 'w2vWeightDistance_3',\r\n", 1929 | " 'w2vKeywordDistance_1', 'w2vKeywordDistance_2', 'w2vKeywordDistance_3',\r\n", 1930 | " 'TFIDFDistance_1', 'TFIDFDistance_2', 'TFIDFDistance_3', \r\n", 1931 | " 'TFIDFDistance_4', 'TFIDFDistance_5', \r\n", 1932 | " 'fasttextWeightDistance_1', 'fasttextWeightDistance_2', 'fasttextWeightDistance_3',\r\n", 1933 | " 'fasttextKeywordDistance_1', 'fasttextKeywordDistance_2', 'fasttextKeywordDistance_3',\r\n", 1934 | " 'seq1_max_degree', 'seq2_max_degree', 'seq1_max_component',\r\n", 1935 | " 'pagerank_feature1', 'pagerank_feature2',\r\n", 1936 | " 'fuzzy_1', 'fuzzy_2', 'fuzzy_3', 'fuzzy_4', 'fuzzy_5', 'word2vec', 'fasttext',\r\n", 1937 | " 'singleSide', 'doubleSide',\r\n", 1938 | " # 'seq1_neighbor', 'seq2_neighbor', 'total_neighbor',\r\n", 1939 | " 'DSSM_300', 'DSSM_600', \r\n", 1940 | " 'MVLSTM', 'ARC','TextCNN','OOFCNN' ,'OOFLSTM' ]\r\n", 1941 | " \r\n", 1942 | "print(len(name))\r\n", 1943 | "print(gbm.feature_importance().__len__())\r\n", 1944 | "feature_importance = pd.DataFrame({'feature_name': name[:gbm.feature_importance().__len__()], \r\n", 1945 | " 'importance': gbm.feature_importance()}).sort_values(by='importance', ascending=False).reset_index(drop = True)\r\n", 1946 | "print(feature_importance)\r\n", 1947 | "for i in range(len(gbm.feature_importance())):\r\n", 1948 | " print(i, name[i], gbm.feature_importance()[i])" 1949 | ] 1950 | }, 1951 | { 1952 | "cell_type": "markdown", 1953 | "metadata": { 1954 | "id": "D54F3E9A7A8244968747F2C45CF15329", 1955 | "mdEditEnable": false 1956 | }, 1957 | "source": [ 1958 | "## 测试结果" 1959 | ] 1960 | }, 1961 | { 1962 | "cell_type": "code", 1963 | "execution_count": 12, 1964 | "metadata": { 1965 | "id": "70BD1ABB77634747A35495A26496A518", 1966 | "collapsed": false, 1967 | "scrolled": false 1968 | }, 1969 | "outputs": [ 1970 | { 1971 | "name": "stdout", 1972 | "output_type": "stream", 1973 | "text": [ 1974 | "2019-08-11 20:18:38.544223\n" 1975 | ] 1976 | }, 1977 | { 1978 | "ename": "NameError", 1979 | "evalue": "name 'val_x' is not defined", 1980 | 
"traceback": [ 1981 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1982 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 1983 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 1984 | "\u001b[0;31mNameError\u001b[0m: name 'val_x' is not defined" 1985 | ], 1986 | "output_type": "error" 1987 | } 1988 | ], 1989 | "source": [ 1990 | "%%time\n", 1991 | "print(datetime.now())\n", 1992 | "# gbm = lgb.Booster(model_file='./handled_data/lgb_2kw_650.model')\n", 1993 | "pred = gbm.predict(val_x, num_iteration=gbm.best_iteration)\n", 1994 | "\n", 1995 | "print('end predict')\n", 1996 | "result = []\n", 1997 | "for item in pred:\n", 1998 | " if item>=0.5:\n", 1999 | " result.append(1)\n", 2000 | " else:\n", 2001 | " result.append(0)\n", 2002 | "count = 0\n", 2003 | "pos_count = 0\n", 2004 | "print('pos_predict:',np.sum(result))\n", 2005 | "\n", 2006 | "val_y_list = val_y.tolist()\n", 2007 | "for i in range(len(pred)):\n", 2008 | " if val_y_list[i] == 1 and result[i] == val_y_list[i]:\n", 2009 | " pos_count += 1\n", 2010 | " if result[i] == val_y_list[i]:\n", 2011 | " count+=1\n", 2012 | "print('acc:',count/len(pred))\n", 2013 | "print('pos_right_count:',pos_count)\n", 2014 | "print('pos_label:',np.sum(val_y.tolist()))\n", 2015 | "print('pos_acc:',pos_count / np.sum(val_y.tolist()))" 2016 | ] 2017 | }, 2018 | { 2019 | "cell_type": "markdown", 2020 | "metadata": { 2021 | "id": "A3E2CEF3CFC641CC9E2A3CD628F7C161", 2022 | "mdEditEnable": false 2023 | }, 2024 | "source": [ 2025 | "## 提交文件" 2026 | ] 2027 | }, 2028 | { 2029 | "cell_type": "code", 2030 | "execution_count": 13, 2031 | "metadata": { 2032 | "id": "952FBCFDA61D40D6B19562CB54E2575D", 2033 | "collapsed": false, 2034 | "scrolled": false 2035 | }, 2036 | "outputs": [ 2037 | { 2038 | "ename": "LightGBMError", 2039 | "evalue": "Could not open ./handled_data/lgb_power_450.model", 2040 | "traceback": [ 2041 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 2042 | "\u001b[0;31mLightGBMError\u001b[0m Traceback (most recent call last)", 2043 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", 2044 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, params, train_set, model_file, model_str, silent)\u001b[0m\n\u001b[1;32m 1662\u001b[0m \u001b[0mc_str\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1663\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbyref\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mout_num_iterations\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1664\u001b[0;31m ctypes.byref(self.handle)))\n\u001b[0m\u001b[1;32m 1665\u001b[0m \u001b[0mout_num_class\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mctypes\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mc_int\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1666\u001b[0m _safe_call(_LIB.LGBM_BoosterGetNumClasses(\n", 2045 | "\u001b[0;32m/opt/conda/lib/python3.6/site-packages/lightgbm/basic.py\u001b[0m in \u001b[0;36m_safe_call\u001b[0;34m(ret)\u001b[0m\n\u001b[1;32m 45\u001b[0m \"\"\"\n\u001b[1;32m 46\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mret\u001b[0m \u001b[0;34m!=\u001b[0m 
\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 47\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mLightGBMError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdecode_string\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_LIB\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mLGBM_GetLastError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 48\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 49\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 2046 | "\u001b[0;31mLightGBMError\u001b[0m: Could not open ./handled_data/lgb_power_450.model" 2047 | ], 2048 | "output_type": "error" 2049 | } 2050 | ], 2051 | "source": [ 2052 | "%%time\n", 2053 | "\n", 2054 | "gbm = lgb.Booster(model_file='./handled_data/final_lgb_450.model')\n", 2055 | "feature_normal_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_normal.csv', header=None)\n", 2056 | "print('loaded feature normal --->',feature_normal_test.shape)\n", 2057 | "feature_statistics_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_statistics.csv', header=None)\n", 2058 | "print('loaded feature statistics --->',feature_statistics_test.shape)\n", 2059 | "feature_pagerank_test = pd.read_csv('./handled_data/test_feature/test_feature_3kw_pagerank.csv', header=None)\n", 2060 | "print('loaded feature pagerank --->',feature_pagerank_test.shape)\n", 2061 | "feature_fuzzy_test = pd.read_csv('./handled_data/test_feature/test_feature_fuzzy.csv', header=None)\n", 2062 | "print('loaded fuzzy feature --->',feature_fuzzy_test.shape)\n", 2063 | "feature_powerful_test = pd.read_csv('./handled_data/test_feature/test_feature_powerful.csv', header=None)\n", 2064 | "print('loaded powerfule feature --->',feature_powerful_test.shape)\n", 2065 | "\n", 2066 | "# 融合特征\n", 2067 | "feat_dssm_300_test = pd.read_csv('./predictions/test_dssm_300_2kw.csv.gz', header=None)\n", 2068 | "print('loaded feature dssm_300 --->', feat_dssm_300_test.shape)\n", 2069 | "feat_dssm_600_test = pd.read_csv('./predictions/test_dssm_600_2kw.csv.gz', header=None)\n", 2070 | "print('loaded feature dssm_600 --->', feat_dssm_600_test.shape)\n", 2071 | "feat_mvlstm_test = pd.read_csv('./predictions/test_mv_2kw.csv.gz', header=None)\n", 2072 | "print('loaded feature MVLSTM --->', feat_mvlstm_test.shape)\n", 2073 | "feat_arc_test = pd.read_csv('./predictions/test_arc_2kw.csv.gz', header=None)\n", 2074 | "print('loaded feature ARC --->', feat_arc_test.shape)\n", 2075 | "feat_textcnn_test = pd.read_csv('./predictions/test_textcnn_2kw.csv.gz', header=None)\n", 2076 | "print('loaded feature TextCNN --->', feat_textcnn_test.shape)\n", 2077 | "\n", 2078 | "# feat_lstm_dssm_test = pd.read_csv('./predictions/test_lstm_dssm_2kw.csv', header=None)\n", 2079 | "# print('loaded feature LSTM-DSSM --->', feat_lstm_dssm_test.shape)\n", 2080 | "\n", 2081 | "print('------------finish reading file, concating feature------------')\n", 2082 | "test_feature = pd.concat([feature_normal_test, feature_statistics_test, feature_pagerank_test, \n", 2083 | " feature_fuzzy_test, feature_powerful_test, \n", 2084 | " feat_dssm_300_test,\n", 2085 | " feat_dssm_600_test,\n", 2086 | " feat_mvlstm_test,\n", 2087 | " feat_arc_test,\n", 2088 | " feat_textcnn_test,\n", 2089 | " # feat_lstm_dssm_test\n", 2090 | " ], axis=1)\n", 2091 | "print('concated feature, shape --->', test_feature.shape)" 2092 | ] 2093 | }, 2094 | { 2095 | "cell_type": "code", 2096 | "execution_count": 7, 2097 | "metadata": { 
2098 | "id": "C709A47F9B8F45119C4BB2DEF2F96AFA", 2099 | "collapsed": false, 2100 | "scrolled": false 2101 | }, 2102 | "outputs": [ 2103 | { 2104 | "name": "stdout", 2105 | "output_type": "stream", 2106 | "text": [ 2107 | "finish prediction, shpae ---> (20000000, 3)\n" 2108 | ] 2109 | } 2110 | ], 2111 | "source": [ 2112 | "%%time\n", 2113 | "pred_test = pd.DataFrame(gbm.predict(test_feature, num_iteration=gbm.best_iteration))\n", 2114 | "\n", 2115 | "test_data = pd.read_csv(test_data_path, header=None)\n", 2116 | "submission = pd.concat([test_data[0], test_data[2], pred_test], axis=1)\n", 2117 | "print('finish prediction, shpae --->', submission.shape)\n", 2118 | "submission.to_csv('./predictions/lgb_submission_power_450.csv', header=None, index=False)" 2119 | ] 2120 | }, 2121 | { 2122 | "cell_type": "code", 2123 | "execution_count": 10, 2124 | "metadata": { 2125 | "id": "B5882F60E2E9433285E7C3C4B65EC5D7", 2126 | "collapsed": false, 2127 | "scrolled": false 2128 | }, 2129 | "outputs": [ 2130 | { 2131 | "name": "stdout", 2132 | "output_type": "stream", 2133 | "text": [ 2134 | "Kesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 ./predictions/lgb_submission_power_450.csv (565534.77 KiB)\n> 已上传 100 %\n> 文件已上传 \n> 服务器响应: 200 提交成功,请等待评审完成\n> 提交完成\n" 2135 | ] 2136 | } 2137 | ], 2138 | "source": [ 2139 | "!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file ./predictions/lgb_submission_power_450.csv -token 8be4f72dc2395a8d" 2140 | ] 2141 | }, 2142 | { 2143 | "cell_type": "markdown", 2144 | "metadata": { 2145 | "id": "BA927732609B435D981ABD77FF0A00E3", 2146 | "mdEditEnable": false 2147 | }, 2148 | "source": [ 2149 | "## Final提交文件" 2150 | ] 2151 | }, 2152 | { 2153 | "cell_type": "code", 2154 | "execution_count": 4, 2155 | "metadata": { 2156 | "id": "166E19CE673F447A84B2C4EC610BB4B5", 2157 | "collapsed": false, 2158 | "scrolled": false 2159 | }, 2160 | "outputs": [], 2161 | "source": [ 2162 | "# !wc -l /home/kesci/work/predictions/testfin_mvlstm.csv\n", 2163 | "# !wc -l /home/kesci/work/predictions/testfin_textcnn.csv\n", 2164 | "!killall python" 2165 | ] 2166 | }, 2167 | { 2168 | "cell_type": "code", 2169 | "execution_count": 6, 2170 | "metadata": { 2171 | "id": "FE7C8E4888A14AAA81B133E6F0F80CD4", 2172 | "collapsed": false, 2173 | "scrolled": false 2174 | }, 2175 | "outputs": [ 2176 | { 2177 | "name": "stdout", 2178 | "output_type": "stream", 2179 | "text": [ 2180 | "[ 1 2 3 ... 10318080 10318081 10318082]\n[ 1 2 3 ... 
10318080 10318081 10318082]\n10318082\n10318082\n\n[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]\n[ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]\n20\n20\nCPU times: user 8.47 s, sys: 2.29 s, total: 10.8 s\nWall time: 11 s\n" 2181 | ] 2182 | } 2183 | ], 2184 | "source": [ 2185 | "%%time\n", 2186 | "# final = pd.read_csv(final_data_path, header=None)\n", 2187 | "# result = pd.read_csv('./predictions/final_submission_20190811.csv', header=None)\n", 2188 | "print(final[0].unique())\n", 2189 | "print(result[0].unique())\n", 2190 | "print(final[0].unique().__len__())\n", 2191 | "print(result[0].unique().__len__())\n", 2192 | "print()\n", 2193 | "\n", 2194 | "print(final[2].unique())\n", 2195 | "print(result[1].unique())\n", 2196 | "print(final[2].unique().__len__())\n", 2197 | "print(result[1].unique().__len__())" 2198 | ] 2199 | }, 2200 | { 2201 | "cell_type": "code", 2202 | "execution_count": 3, 2203 | "metadata": { 2204 | "id": "84E58641682D4B9BB34BA28DA4F058BF", 2205 | "collapsed": false, 2206 | "scrolled": false 2207 | }, 2208 | "outputs": [ 2209 | { 2210 | "name": "stderr", 2211 | "output_type": "stream", 2212 | "text": [ 2213 | "\r 0%| | 0/10 [00:00 (100000000, 3)\nCPU times: user 2h 26min 37s, sys: 4min 11s, total: 2h 30min 49s\nWall time: 50min 28s\n" 2375 | ] 2376 | } 2377 | ], 2378 | "source": [ 2379 | "%%time\n", 2380 | "print(datetime.now())\n", 2381 | "CHUNKSIZE = 10000000\n", 2382 | "gbm = lgb.Booster(model_file='./handled_data/final_lgb_450.model')\n", 2383 | "\n", 2384 | "feature_normal_final = pd.read_csv('./handled_data/final/final_feature_normal.csv', header=None, chunksize=CHUNKSIZE)\n", 2385 | "feature_statistics_final = pd.read_csv('./handled_data/final/final_feature_statistics.csv', header=None, chunksize=CHUNKSIZE)\n", 2386 | "feature_pagerank_final = pd.read_csv('./handled_data/final/final_feature_pagerank.csv', header=None, chunksize=CHUNKSIZE)\n", 2387 | "feature_fuzzy_final = pd.read_csv('./handled_data/final/final_feature_fuzzy.csv', header=None, chunksize=CHUNKSIZE)\n", 2388 | "feature_powerful_final = pd.read_csv('./handled_data/final/final_feature_powerful.csv', header=None, chunksize=CHUNKSIZE)\n", 2389 | "\n", 2390 | "feat_dssm_300_final = pd.read_csv('./predictions/testfin_300_dssm.csv_10kw.gz', header=None, chunksize=CHUNKSIZE)\n", 2391 | "feat_dssm_600_final = pd.read_csv('./predictions/testfin_600_dssm.csv', header=None, chunksize=CHUNKSIZE)\n", 2392 | "feat_mvlstm_final = pd.read_csv('./predictions/testfin_mvlstm.csv', header=None, chunksize=CHUNKSIZE)\n", 2393 | "feat_arc_final = pd.read_csv('./predictions/testfin_arc.csv', header=None, chunksize=CHUNKSIZE)\n", 2394 | "feat_textcnn_final = pd.read_csv('./predictions/testfin_textcnn.csv', header=None, chunksize=CHUNKSIZE)\n", 2395 | "feat_oofcnn_final = pd.read_csv('./predictions/testfin_oofcnn.csv', header=None, chunksize=CHUNKSIZE)\n", 2396 | "feat_ooflstm_final = pd.read_csv('./predictions/testfin_ooflstm.csv', header=None, chunksize=CHUNKSIZE)\n", 2397 | "final_data = pd.read_csv(final_data_path, header=None, chunksize=CHUNKSIZE)\n", 2398 | "result = []\n", 2399 | "print('start iteration')\n", 2400 | "for i in tqdm(range(10)):\n", 2401 | " features = pd.concat([feature_normal_final.get_chunk(), \n", 2402 | " feature_statistics_final.get_chunk(), feature_pagerank_final.get_chunk(),\n", 2403 | " feature_fuzzy_final.get_chunk(), feature_powerful_final.get_chunk(),\n", 2404 | " feat_dssm_300_final.get_chunk(),\n", 2405 | " feat_dssm_600_final.get_chunk(),\n", 2406 | " 
feat_mvlstm_final.get_chunk(),\n", 2407 | " feat_arc_final.get_chunk(),\n", 2408 | " feat_textcnn_final.get_chunk(),\n", 2409 | " feat_oofcnn_final.get_chunk(),\n", 2410 | " feat_ooflstm_final.get_chunk()], axis=1)\n", 2411 | " \n", 2412 | " idx = final_data.get_chunk()\n", 2413 | " pred = gbm.predict(features, num_iteration=gbm.best_iteration)\n", 2414 | " result.append(pd.DataFrame({'query_id': idx[0].tolist(), 'title_id': idx[2].tolist(), 'label': pred}))\n", 2415 | " print(result[i].shape)\n", 2416 | " \n", 2417 | " del pred\n", 2418 | " del idx\n", 2419 | " del features\n", 2420 | " gc.collect()\n", 2421 | " \n", 2422 | "result = pd.concat(result)\n", 2423 | "result.columns = range(result.shape[1])\n", 2424 | "print(result.describe())\n", 2425 | "print('finish prediction, shpae --->', result.shape)\n", 2426 | "result.to_csv('./predictions/final_submission_20190811_2.csv', header=None, index=False)\n", 2427 | "\n", 2428 | "# feature_append_test = pd.read_csv('./handled_data/final/test_feature_append.csv', header=None)\n", 2429 | "# print('loaded append feature --->',feature_append_test.shape)\n", 2430 | "\n", 2431 | "# 融合特征\n", 2432 | "# with gzip.open('./predictions/testfin_300_dssm.csv_10kw.gz', 'rb') as f:\n", 2433 | "# feat_dssm_300_final = pd.read_csv(f, header=None)\n", 2434 | "# print('loaded feature dssm_300 --->', feat_dssm_300_final.shape)\n", 2435 | " \n", 2436 | "# with gzip.open('./predictions/testfin_600_dssm.csv_10kw.gz', 'rb') as f:\n", 2437 | "# feat_dssm_600_final = pd.read_csv(f, header=None)\n", 2438 | "# feat_dssm_600_final = pd.read_csv('./predictions/testfin_600_dssm_p1.csv', header=None)\n", 2439 | "# print('loaded feature dssm_600 --->', feat_dssm_600_final.shape)\n", 2440 | " \n", 2441 | "# with gzip.open('./predictions/test_mv_2kw.csv.gz', 'rb') as f:\n", 2442 | "# feat_mvlstm_test = pd.read_csv(f, header=None)\n", 2443 | "# print('loaded feature MVLSTM --->', feat_mvlstm_test.shape)\n", 2444 | " \n", 2445 | "# with gzip.open('./predictions/test_arc_2kw.csv.gz', 'rb') as f:\n", 2446 | "# feat_arc_test = pd.read_csv(f, header=None)\n", 2447 | "# print('loaded feature ARC --->', feat_arc_test.shape)\n", 2448 | " \n", 2449 | "# with gzip.open('./predictions/test_textcnn_2kw.csv.gz', 'rb') as f:\n", 2450 | "# feat_textcnn_test = pd.read_csv(f, header=None)\n", 2451 | "# print('loaded feature TextCNN --->', feat_textcnn_test.shape)\n", 2452 | "\n", 2453 | "# feat_lstm_dssm_test = pd.read_csv('./predictions/test_lstm_dssm_2kw.csv', header=None)\n", 2454 | "# print('loaded feature LSTM-DSSM --->', feat_lstm_dssm_test.shape)" 2455 | ] 2456 | }, 2457 | { 2458 | "cell_type": "code", 2459 | "execution_count": 4, 2460 | "metadata": { 2461 | "id": "D3072559E2AB4D09B7A6B1A5C5F99C5D", 2462 | "hide_input": false, 2463 | "collapsed": false, 2464 | "scrolled": false 2465 | }, 2466 | "outputs": [ 2467 | { 2468 | "name": "stdout", 2469 | "output_type": "stream", 2470 | "text": [ 2471 | "Kesci Submit Tool 3.2.1\n\n> 已验证Token\n> 提交文件 ./predictions/final_submission_20190811_2.csv (2877771.46 KiB)\n> 已上传 100 %\n> 文件已上传 \n> 服务器响应: 200 提交成功\n> 提交完成\n" 2472 | ] 2473 | } 2474 | ], 2475 | "source": [ 2476 | "!https_proxy=\"http://klab-external-proxy\" ./kesci_submit -file ./predictions/final_submission_20190811_2.csv -token 8be4f72dc2395a8d -mode archive " 2477 | ] 2478 | }, 2479 | { 2480 | "cell_type": "markdown", 2481 | "metadata": { 2482 | "id": "F8C1BDC9AC484B3F81EA96EEF49108D9", 2483 | "mdEditEnable": false 2484 | }, 2485 | "source": [ 2486 | "# end" 2487 | ] 2488 | } 2489 | ], 2490 
| "metadata": { 2491 | "kernelspec": { 2492 | "name": "python3", 2493 | "display_name": "Python 3", 2494 | "language": "python" 2495 | }, 2496 | "language_info": { 2497 | "name": "python", 2498 | "version": "3.6.4", 2499 | "mimetype": "text/x-python", 2500 | "codemirror_mode": { 2501 | "name": "ipython", 2502 | "version": 3 2503 | }, 2504 | "pygments_lexer": "ipython3", 2505 | "nbconvert_exporter": "python", 2506 | "file_extension": ".py" 2507 | } 2508 | }, 2509 | "nbformat": 4, 2510 | "nbformat_minor": 0 2511 | } 2512 | -------------------------------------------------------------------------------- /arcii.py: -------------------------------------------------------------------------------- 1 | """An implementation of Matching Layer.""" 2 | import typing 3 | 4 | from keras import backend as K 5 | from keras.engine import Layer 6 | 7 | 8 | class MatchingLayer(Layer): 9 | """ 10 | Layer that computes a matching matrix between samples in two tensors. 11 | :param normalize: Whether to L2-normalize samples along the 12 | dot product axis before taking the dot product. 13 | If set to True, then the output of the dot product 14 | is the cosine proximity between the two samples. 15 | :param matching_type: the similarity function for matching 16 | :param kwargs: Standard layer keyword arguments. 17 | Examples: 18 | >>> import matchzoo as mz 19 | >>> layer = mz.layers.MatchingLayer(matching_type='dot', 20 | ... normalize=True) 21 | >>> num_batch, left_len, right_len, num_dim = 5, 3, 2, 10 22 | >>> layer.build([[num_batch, left_len, num_dim], 23 | ... [num_batch, right_len, num_dim]]) 24 | """ 25 | 26 | def __init__(self, normalize: bool = False, 27 | matching_type: str = 'dot', **kwargs): 28 | """:class:`MatchingLayer` constructor.""" 29 | super().__init__(**kwargs) 30 | self._normalize = normalize 31 | self._validate_matching_type(matching_type) 32 | self._matching_type = matching_type 33 | self._shape1 = None 34 | self._shape2 = None 35 | 36 | @classmethod 37 | def _validate_matching_type(cls, matching_type: str = 'dot'): 38 | valid_matching_type = ['dot', 'mul', 'plus', 'minus', 'concat'] 39 | if matching_type not in valid_matching_type: 40 | raise ValueError(f"{matching_type} is not a valid matching type, " 41 | f"{valid_matching_type} expected.") 42 | 43 | def build(self, input_shape: list): 44 | """ 45 | Build the layer. 46 | :param input_shape: the shapes of the input tensors, 47 | for MatchingLayer we need tow input tensors. 48 | """ 49 | # Used purely for shape validation. 50 | if not isinstance(input_shape, list) or len(input_shape) != 2: 51 | raise ValueError('A `MatchingLayer` layer should be called ' 52 | 'on a list of 2 inputs.') 53 | self._shape1 = input_shape[0] 54 | self._shape2 = input_shape[1] 55 | for idx in 0, 2: 56 | if self._shape1[idx] != self._shape2[idx]: 57 | raise ValueError( 58 | 'Incompatible dimensions: ' 59 | f'{self._shape1[idx]} != {self._shape2[idx]}.' 60 | f'Layer shapes: {self._shape1}, {self._shape2}.' 61 | ) 62 | 63 | def call(self, inputs: list, **kwargs) -> typing.Any: 64 | """ 65 | The computation logic of MatchingLayer. 66 | :param inputs: two input tensors. 
67 | """ 68 | x1 = inputs[0] 69 | x2 = inputs[1] 70 | if self._matching_type == 'dot': 71 | if self._normalize: 72 | x1 = K.l2_normalize(x1, axis=2) 73 | x2 = K.l2_normalize(x2, axis=2) 74 | return K.tf.expand_dims(K.tf.einsum('abd,acd->abc', x1, x2), 3) 75 | else: 76 | if self._matching_type == 'mul': 77 | def func(x, y): 78 | return x * y 79 | elif self._matching_type == 'plus': 80 | def func(x, y): 81 | return x + y 82 | elif self._matching_type == 'minus': 83 | def func(x, y): 84 | return x - y 85 | elif self._matching_type == 'concat': 86 | def func(x, y): 87 | return K.tf.concat([x, y], axis=3) 88 | else: 89 | raise ValueError(f"Invalid matching type." 90 | f"{self._matching_type} received." 91 | f"Mut be in `dot`, `mul`, `plus`, " 92 | f"`minus` and `concat`.") 93 | x1_exp = K.tf.stack([x1] * self._shape2[1], 2) 94 | x2_exp = K.tf.stack([x2] * self._shape1[1], 1) 95 | return func(x1_exp, x2_exp) 96 | 97 | def compute_output_shape(self, input_shape: list) -> tuple: 98 | """ 99 | Calculate the layer output shape. 100 | :param input_shape: the shapes of the input tensors, 101 | for MatchingLayer we need tow input tensors. 102 | """ 103 | if not isinstance(input_shape, list) or len(input_shape) != 2: 104 | raise ValueError('A `MatchingLayer` layer should be called ' 105 | 'on a list of 2 inputs.') 106 | shape1 = list(input_shape[0]) 107 | shape2 = list(input_shape[1]) 108 | if len(shape1) != 3 or len(shape2) != 3: 109 | raise ValueError('A `MatchingLayer` layer should be called ' 110 | 'on 2 inputs with 3 dimensions.') 111 | if shape1[0] != shape2[0] or shape1[2] != shape2[2]: 112 | raise ValueError('A `MatchingLayer` layer should be called ' 113 | 'on 2 inputs with same 0,2 dimensions.') 114 | 115 | if self._matching_type in ['mul', 'plus', 'minus']: 116 | return shape1[0], shape1[1], shape2[1], shape1[2] 117 | elif self._matching_type == 'dot': 118 | return shape1[0], shape1[1], shape2[1], 1 119 | elif self._matching_type == 'concat': 120 | return shape1[0], shape1[1], shape2[1], shape1[2] + shape2[2] 121 | else: 122 | raise ValueError(f"Invalid `matching_type`." 123 | f"{self._matching_type} received." 
124 | f"Must be in `mul`, `plus`, `minus` " 125 | f"`dot` and `concat`.") 126 | 127 | def get_config(self) -> dict: 128 | """Get the config dict of MatchingLayer.""" 129 | config = { 130 | 'normalize': self._normalize, 131 | 'matching_type': self._matching_type, 132 | } 133 | base_config = super(MatchingLayer, self).get_config() 134 | return dict(list(base_config.items()) + list(config.items())) 135 | 136 | 137 | queryInput = Input(shape=(30,600)) 138 | 139 | titleInput = Input(shape=(30,600)) 140 | 141 | conv_1d_left = Conv1D(128, 3, padding = 'same')(queryInput) 142 | conv_1d_right = Conv1D(128, 3, padding = 'same')(titleInput) 143 | 144 | matching_layer = MatchingLayer(matching_type='plus') 145 | embed_cross = matching_layer([conv_1d_left, conv_1d_right]) 146 | 147 | embed_cross = Conv2D(16, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 148 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 149 | 150 | embed_cross = Conv2D(32, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 151 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 152 | 153 | embed_cross = Conv2D(64, [5, 5], padding = 'same', activation = 'relu')(embed_cross) 154 | embed_cross = MaxPooling2D(pool_size= [2, 2])(embed_cross) 155 | 156 | 157 | embed_flat = Flatten()(embed_cross) 158 | 159 | x = Dense(256, activation='relu')(embed_flat) 160 | x = Dense(64, activation='relu')(x) 161 | x = Dense(1, activation='sigmoid')(x) 162 | 163 | model_arc = Model(inputs=[queryInput,titleInput], outputs=x) 164 | model_arc.compile(loss='binary_crossentropy', 165 | optimizer='adam', 166 | metrics=[metrics.mae, metrics.binary_accuracy]) 167 | 168 | -------------------------------------------------------------------------------- /cnn-dssm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras import backend 3 | from keras.layers import Activation, Input 4 | from keras.layers.core import Dense, Lambda, Reshape 5 | from keras.layers.convolutional import Convolution1D 6 | from keras.layers.merge import concatenate, dot 7 | from keras.models import Model 8 | from keras import metrics 9 | 10 | K = 300 11 | L = 128 12 | J = 2 13 | 14 | query = Input(shape = (10, 300)) 15 | pos_doc = Input(shape = (30, 300)) 16 | neg_docs = [Input(shape = (30, 300)) for j in range(J)] 17 | 18 | # 在 DSSM 的表示层使用了类似 TextCNN 的架构 19 | 20 | query_conv1 = Convolution1D(K, 1, padding = "same", 21 | input_shape = (None, WORD_DEPTH), 22 | activation = "tanh")(query) # See equation (2). 
23 | 24 | query_conv2 = Convolution1D(K, 2, padding = "same", 25 | input_shape = (None, WORD_DEPTH), 26 | activation = "tanh")(query) 27 | 28 | query_conv3 = Convolution1D(K, 3, padding = "same", 29 | input_shape = (None, WORD_DEPTH), 30 | activation = "tanh")(query) 31 | 32 | query_conv4 = Convolution1D(K, 4, padding = "same",dilation_rate=2, 33 | input_shape = (None, WORD_DEPTH), 34 | 35 | activation = "tanh")(query) 36 | 37 | query_conv5 = Convolution1D(K, 5, padding = "same",dilation_rate=2, 38 | input_shape = (None, WORD_DEPTH), 39 | activation = "tanh")(query) 40 | 41 | query_conv6 = Convolution1D(K, 6, padding = "same", 42 | input_shape = (None, WORD_DEPTH), 43 | activation = "tanh")(query) 44 | 45 | # Next, apply max pooling to the convolved query. 46 | # This keeps the maximum value of each feature column. 47 | 48 | query_max1 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv1) 49 | 50 | query_max2 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv2) 51 | 52 | query_max3 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv3) 53 | 54 | query_max4 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv4) 55 | 56 | query_max5 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv5) 57 | 58 | query_max6 = Lambda(lambda x: backend.max(x, axis = 1),output_shape = (K, ))(query_conv6) 59 | 60 | query_concat_1_2 = concatenate([query_max1,query_max2]) 61 | 62 | query_concat_3_4 = concatenate([query_max3,query_max4]) 63 | 64 | query_concat_5_6 = concatenate([query_max5,query_max6]) 65 | 66 | 67 | query_sem1 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_1_2) 68 | 69 | query_sem2 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_3_4) 70 | 71 | query_sem3 = Dense(L, activation = "tanh", input_dim = K*2)(query_concat_5_6) 72 | 73 | # This step produces a single sentence vector for the query; it is a standard dense layer. 74 | 75 | 76 | query_concat = concatenate([query_sem1,query_sem2,query_sem3]) 77 | 78 | query_sem = Dense(L, activation = "tanh", input_dim = K*3)(query_concat) 79 | 80 | doc_conv1 = Convolution1D(K, 1, padding = "same", 81 | input_shape = (None, WORD_DEPTH), 82 | activation = "tanh") 83 | 84 | doc_conv2 = Convolution1D(K, 2, padding = "same", 85 | input_shape = (None, WORD_DEPTH), 86 | activation = "tanh") 87 | 88 | doc_conv3 = Convolution1D(K, 3, padding = "same", 89 | input_shape = (None, WORD_DEPTH), 90 | activation = "tanh") 91 | 92 | doc_conv4 = Convolution1D(K, 4, padding = "same",dilation_rate=2, 93 | input_shape = (None, WORD_DEPTH), 94 | activation = "tanh") 95 | 96 | doc_conv5 = Convolution1D(K, 5, padding = "same",dilation_rate=2, 97 | input_shape = (None, WORD_DEPTH), 98 | activation = "tanh") 99 | 100 | doc_conv6 = Convolution1D(K, 6, padding = "same", 101 | input_shape = (None, WORD_DEPTH), 102 | activation = "tanh") 103 | 104 | 105 | doc_max = Lambda(lambda x: backend.max(x, axis = 1), output_shape = (K, )) 106 | 107 | doc_sem1 = Dense(L, activation = "tanh", input_dim = K*2) 108 | 109 | doc_sem2 = Dense(L, activation = "tanh", input_dim = K*2) 110 | 111 | doc_sem3 = Dense(L, activation = "tanh", input_dim = K*2) 112 | 113 | doc_sem = Dense(L, activation = "tanh", input_dim = K*3) 114 | 115 | 116 | # Positive sample 117 | pos_doc_conv1 = doc_conv1(pos_doc) 118 | pos_doc_max1 = doc_max(pos_doc_conv1) 119 | 120 | pos_doc_conv2 = doc_conv2(pos_doc) 121 | pos_doc_max2 = doc_max(pos_doc_conv2) 122 | 123 | pos_doc_conv3 = doc_conv3(pos_doc) 124 | pos_doc_max3 = doc_max(pos_doc_conv3) 125 | 126 | pos_doc_conv4 = doc_conv4(pos_doc) 127 |
pos_doc_max4 = doc_max(pos_doc_conv4) 128 | 129 | pos_doc_conv5 = doc_conv5(pos_doc) 130 | pos_doc_max5 = doc_max(pos_doc_conv5) 131 | 132 | pos_doc_conv6 = doc_conv6(pos_doc) 133 | pos_doc_max6 = doc_max(pos_doc_conv6) 134 | 135 | pos_doc_concat_1_2 = concatenate([pos_doc_max1,pos_doc_max2]) 136 | 137 | pos_doc_concat_3_4 = concatenate([pos_doc_max3,pos_doc_max4]) 138 | 139 | pos_doc_concat_5_6 = concatenate([pos_doc_max5,pos_doc_max6]) 140 | 141 | 142 | pos_doc_sem1 = doc_sem1(pos_doc_concat_1_2) 143 | 144 | pos_doc_sem2 = doc_sem2(pos_doc_concat_3_4) 145 | 146 | pos_doc_sem3 = doc_sem3(pos_doc_concat_5_6) 147 | 148 | pos_doc_concat = concatenate([pos_doc_sem1,pos_doc_sem2,pos_doc_sem3]) 149 | 150 | pos_doc_sem = doc_sem(pos_doc_concat) 151 | 152 | 153 | # Negative samples 154 | 155 | neg_doc_convs1 = [doc_conv1(neg_doc) for neg_doc in neg_docs] 156 | neg_doc_maxes1 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs1] 157 | 158 | neg_doc_convs2 = [doc_conv2(neg_doc) for neg_doc in neg_docs] 159 | neg_doc_maxes2 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs2] 160 | 161 | neg_doc_convs3 = [doc_conv3(neg_doc) for neg_doc in neg_docs] 162 | neg_doc_maxes3 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs3] 163 | 164 | neg_doc_convs4 = [doc_conv4(neg_doc) for neg_doc in neg_docs] 165 | neg_doc_maxes4 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs4] 166 | 167 | neg_doc_convs5 = [doc_conv5(neg_doc) for neg_doc in neg_docs] 168 | neg_doc_maxes5 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs5] 169 | 170 | neg_doc_convs6 = [doc_conv6(neg_doc) for neg_doc in neg_docs] 171 | neg_doc_maxes6 = [doc_max(neg_doc_conv) for neg_doc_conv in neg_doc_convs6] 172 | 173 | neg_doc_concats_1_2 = [concatenate([l1,l2]) for l1,l2 in zip(neg_doc_maxes1,neg_doc_maxes2)] 174 | 175 | neg_doc_concats_3_4 = [concatenate([l3,l4]) for l3,l4 in zip(neg_doc_maxes3,neg_doc_maxes4)] 176 | 177 | neg_doc_concats_5_6 = [concatenate([l5,l6]) for l5,l6 in zip(neg_doc_maxes5,neg_doc_maxes6)] 178 | 179 | neg_doc_sems1 = [doc_sem1(neg_doc_concat) for neg_doc_concat in neg_doc_concats_1_2] 180 | 181 | neg_doc_sems2 = [doc_sem2(neg_doc_concat) for neg_doc_concat in neg_doc_concats_3_4] 182 | 183 | neg_doc_sems3 = [doc_sem3(neg_doc_concat) for neg_doc_concat in neg_doc_concats_5_6] 184 | 185 | neg_doc_concats = [concatenate([l1,l2,l3]) for l1,l2,l3 in zip(neg_doc_sems1,neg_doc_sems2,neg_doc_sems3)] 186 | 187 | 188 | neg_doc_sems = [doc_sem(neg_doc_concat) for neg_doc_concat in neg_doc_concats] 189 | 190 | # Compute the cosine similarity R(Q, D) between the query and each title 191 | 192 | R_Q_D_p = dot([query_sem, pos_doc_sem], axes = 1, normalize = True) # See equation (4).
193 | R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes = 1, normalize = True) for neg_doc_sem in neg_doc_sems] 194 | 195 | concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns) 196 | concat_Rs = Reshape((J + 1, 1))(concat_Rs) 197 | 198 | # In this step, each R(Q, D) is multiplied by gamma. 199 | # In the paper, gamma is the smoothing factor of the softmax; 200 | # here its value is learned by a CNN, a single 1x1 convolution kernel. 201 | 202 | weight = np.array([1]).reshape(1, 1, 1) 203 | with_gamma = Convolution1D(1, 1, padding = "same", 204 | input_shape = (J + 1, 1), 205 | activation = "linear", 206 | use_bias = False, 207 | weights = [weight])(concat_Rs) 208 | with_gamma = Reshape((J + 1, ))(with_gamma) 209 | 210 | prob = Activation("softmax")(with_gamma) 211 | 212 | model = Model(inputs = [query, pos_doc] + neg_docs, outputs = prob) 213 | model.compile(optimizer = "adadelta", loss = "categorical_crossentropy", 214 | metrics=[metrics.mae, metrics.binary_accuracy]) 215 | 216 | 217 | get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p]) 218 | get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns) -------------------------------------------------------------------------------- /images/Code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/Code.png -------------------------------------------------------------------------------- /images/arcii.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/arcii.png -------------------------------------------------------------------------------- /images/cnn-dssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/cnn-dssm.png -------------------------------------------------------------------------------- /images/lstm-dssm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/lstm-dssm.png -------------------------------------------------------------------------------- /images/mvlstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/mvlstm.png -------------------------------------------------------------------------------- /images/params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/params.png -------------------------------------------------------------------------------- /images/textcnn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ydli-ai/MatchModels/110fe7477543da4bf666ea0151c6c8b4e94655f2/images/textcnn.png -------------------------------------------------------------------------------- /lstm-dssm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.layers import Permute 3 | from keras import backend 4 | from keras.layers import * 5 | from keras.layers.core import Dense, Lambda, Reshape 6 | from keras.layers.convolutional import Convolution1D 7 | from keras.layers.merge import
concatenate, dot 8 | from keras.models import Model 9 | from keras import metrics 10 | 11 | K = 300 12 | L = 128 13 | J = 2 14 | 15 | query = Input(shape = (10, 300)) 16 | pos_doc = Input(shape = (30, 300)) 17 | neg_docs = [Input(shape = (30, 300)) for j in range(J)] 18 | 19 | 20 | query_lstm1 = Bidirectional(CuDNNLSTM(K, return_sequences=True))(query) 21 | 22 | query_lstm2 = Bidirectional(CuDNNLSTM(K, return_sequences=True))(query_lstm1) 23 | 24 | # Attention 25 | query_a1 = Permute((2, 1))(query_lstm2) 26 | 27 | query_a3 = Dense(10, activation='softmax')(query_a1) 28 | 29 | query_a_probs = Permute((2, 1))(query_a3) 30 | 31 | query_attention_out = multiply([query_lstm2, query_a_probs]) 32 | 33 | query_lstm3 = Bidirectional(CuDNNLSTM(K))(query_attention_out) 34 | 35 | query_sem = Dense(L, activation = "tanh", input_dim = K)(query_lstm3) 36 | 37 | 38 | 39 | doc_lstm1 = Bidirectional(CuDNNLSTM(K, return_sequences=True)) 40 | 41 | doc_lstm2 = Bidirectional(CuDNNLSTM(K, return_sequences=True)) 42 | 43 | doc_a1 = Permute((2, 1))  # note: doc_a1, doc_a2 and doc_a_probs are defined but never used below; Permute is applied directly instead 44 | 45 | doc_a2 = Reshape((300, 10)) 46 | 47 | doc_att_dense = Dense(30, activation='softmax') 48 | 49 | doc_a_probs = Permute((2, 1)) 50 | 51 | 52 | doc_lstm3 = Bidirectional(CuDNNLSTM( 150 )) 53 | 54 | doc_sem = Dense(L, activation = "tanh", input_dim = K) 55 | 56 | 57 | # Positive sample 58 | 59 | pos_doc_lstm1 = doc_lstm1(pos_doc) 60 | 61 | pos_doc_lstm2 = doc_lstm2(pos_doc_lstm1) 62 | 63 | pos_doc_a1 = Permute((2, 1))(pos_doc_lstm2) 64 | 65 | pos_doc_a3 = doc_att_dense(pos_doc_a1) 66 | 67 | pos_doc_probs = Permute((2, 1))(pos_doc_a3) 68 | 69 | pos_doc_att_out = multiply([pos_doc_lstm2,pos_doc_probs]) 70 | 71 | pos_doc_lstm3 = doc_lstm3(pos_doc_att_out) 72 | 73 | pos_doc_sem = doc_sem(pos_doc_lstm3) 74 | 75 | # Negative samples 76 | 77 | neg_doc_lstm1 = [doc_lstm1(neg_doc) for neg_doc in neg_docs] 78 | neg_doc_lstm2 = [doc_lstm2(neg_doc) for neg_doc in neg_doc_lstm1] 79 | 80 | neg_doc_a1 = [Permute((2, 1))(neg_doc) for neg_doc in neg_doc_lstm2] 81 | 82 | neg_doc_a3 = [doc_att_dense(neg_doc) for neg_doc in neg_doc_a1] 83 | 84 | neg_doc_probs = [Permute((2, 1))(neg_doc) for neg_doc in neg_doc_a3] 85 | 86 | neg_doc_att_out = [multiply([lstm,prb]) for lstm,prb in zip(neg_doc_lstm2,neg_doc_probs)] 87 | 88 | neg_doc_lstm3 = [doc_lstm3(neg_doc) for neg_doc in neg_doc_att_out] 89 | 90 | neg_doc_sems = [doc_sem(neg_doc_lstm_mx) for neg_doc_lstm_mx in neg_doc_lstm3] 91 | 92 | R_Q_D_p = dot([query_sem, pos_doc_sem], axes = 1, normalize = True) # See equation (4). 93 | R_Q_D_ns = [dot([query_sem, neg_doc_sem], axes = 1, normalize = True) for neg_doc_sem in neg_doc_sems] # See equation (4).
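# Added commentary: the J+1 cosine scores above (one for the positive title and
# one for each of the J negative titles) are concatenated below, scaled by a
# learned smoothing factor gamma (a 1x1 convolution with a single weight) and
# pushed through a softmax. With categorical cross-entropy, the intended target
# is presumably a one-hot vector with the positive document at index 0, so
# training maximises P(positive title | query).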
94 | 95 | concat_Rs = concatenate([R_Q_D_p] + R_Q_D_ns) 96 | concat_Rs = Reshape((J + 1, 1))(concat_Rs) 97 | 98 | weight = np.array([1]).reshape(1, 1, 1) 99 | with_gamma = Convolution1D(1, 1, padding = "same", 100 | input_shape = (J + 1, 1), 101 | activation = "linear", 102 | use_bias = False, 103 | weights = [weight])(concat_Rs) 104 | with_gamma = Reshape((J + 1, ))(with_gamma) 105 | 106 | prob = Activation("softmax")(with_gamma) 107 | 108 | model = Model(inputs = [query, pos_doc] + neg_docs, outputs = prob) 109 | model.compile(optimizer = "adam", loss = "categorical_crossentropy", 110 | metrics=[metrics.mae, metrics.binary_accuracy]) 111 | 112 | 113 | get_R_Q_D_p = backend.function([query, pos_doc], [R_Q_D_p]) 114 | get_R_Q_D_ns = backend.function([query] + neg_docs, R_Q_D_ns) -------------------------------------------------------------------------------- /mvlstm.py: -------------------------------------------------------------------------------- 1 | from keras import backend as K 2 | from keras import metrics 3 | from keras.layers import Input, Bidirectional, CuDNNLSTM, Dot, Reshape, Lambda, Dense 4 | from keras.models import Model 5 | 6 | queryInput = Input(shape=(30,600)) 7 | 8 | titleInput = Input(shape=(30,600)) 9 | 10 | rep_query = Bidirectional(CuDNNLSTM(128,return_sequences=True))(queryInput) 11 | rep_query = Bidirectional(CuDNNLSTM(128,return_sequences=True))(rep_query) 12 | 13 | rep_doc = Bidirectional(CuDNNLSTM(128,return_sequences=True))(titleInput) 14 | rep_doc = Bidirectional(CuDNNLSTM(128,return_sequences=True))(rep_doc) 15 | 16 | # Top-k matching layer 17 | matching_matrix = Dot(axes=[2, 2], normalize=False)([rep_query, rep_doc]) 18 | matching_signals = Reshape((-1,))(matching_matrix) 19 | matching_topk = Lambda(lambda x: K.tf.nn.top_k(x, k=50, sorted=True)[0])(matching_signals) 20 | 21 | # Multilayer perceptron layer. 22 | dnn = Dense(256,activation = 'relu')(matching_topk) 23 | dnn = Dense(64,activation = 'relu')(dnn)  # second MLP layer, stacked on the first 24 | out = Dense(1,activation = 'sigmoid')(dnn) 25 | model_mvlstm = Model(inputs=[queryInput,titleInput], outputs=out) 26 | model_mvlstm.compile(loss='binary_crossentropy', 27 | optimizer='adam', 28 | metrics=[metrics.mae, metrics.binary_accuracy]) 29 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # 2019 China Collegiate Computing Contest Big Data Challenge WriteUp 2 | Competition: [(Final Round) 2019 China Collegiate Computing Contest Big Data Challenge](https://www.kesci.com/home/competition/5cc51043f71088002c5b8840/content/1) 3 | 4 | We are team **改革春风吹满地**, which finished 15th overall; below is a brief summary of our methods and models. 5 | 6 | * [2019 China Collegiate Computing Contest Big Data Challenge WriteUp](#2019-china-collegiate-computing-contest-big-data-challenge-writeup) 7 | * [Traditional Model](#traditional-model) 8 | * [Deduplicating queries and titles](#deduplicating-queries-and-titles) 9 | * [Graph features](#graph-features) 10 | * [Basic statistical features](#basic-statistical-features) 11 | * [Word vector features](#word-vector-features) 12 | * [Feature importance analysis](#feature-importance-analysis) 13 | * [LightGBM model](#lightgbm-model) 14 | * [LightGBM parameters](#lightgbm-parameters) 15 | * [Training](#training) 16 | * [Deep Models](#deep-models) 17 | * [CNN-DSSM](#cnn-dssm) 18 | * [LSTM-DSSM](#lstm-dssm) 19 | * [mvlstm](#mvlstm) 20 | * [ARC II](#arc-ii) 21 | 22 | # Traditional Model 23 | 24 | ## Deduplicating queries and titles 25 | 26 | When training word vectors and computing word-frequency statistics, the queries and titles should first be deduplicated. 27 | 28 | ## Graph features 29 | 30 | *Each query or title is a unique node in the graph, and an edge is added for every query-title sentence pair. We planned to build both an undirected graph and a weighted graph. The largest undirected graph covered about 130 million records; while building it we mapped every query and title to an Int32 ID to reduce memory consumption.* 31 | 32 | > Trick: saving and loading the graph with Pickle is the fastest option and also gives the smallest files. 33 | 34 | **Undirected graph feature - max_clique**: the size of the largest clique (the gain was not significant, so this feature was dropped). 35 | 36 | **Undirected graph feature - max_degrees**: the number of edges (degree) of each node. 37 | 38 | **Undirected graph feature - max_components**: the size of the largest connected component each node belongs to. 39 | 40 | **Undirected graph feature - pagerank**: the PageRank value of each node, computed with Google's PageRank algorithm. Although we run it on an undirected graph, PageRank rates a node's importance by iterating over its in-links, so when the algorithm runs, each undirected edge is treated as a pair of directed edges by default. 41 | 
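A minimal sketch of how these graph features (degree, PageRank, connected components, and the shortestpath feature described below) could be computed with `networkx` and `pandas`; this is added for illustration, not the original competition code, and the `query_id` / `title_id` column names are hypothetical:

```python
import networkx as nx
import pandas as pd

# Hypothetical query-title pairs, already mapped to integer IDs.
pairs = pd.DataFrame({"query_id": [0, 0, 1, 1], "title_id": [100, 101, 100, 101]})

G = nx.Graph()
G.add_edges_from(zip(pairs["query_id"], pairs["title_id"]))

degrees = dict(G.degree())                  # max_degrees: edge count per node
pagerank = nx.pagerank(G)                   # pagerank: undirected edges become bidirectional links
comp_size = {n: len(c) for c in nx.connected_components(G) for n in c}  # max_components

pairs["query_degree"] = pairs["query_id"].map(degrees)
pairs["title_pagerank"] = pairs["title_id"].map(pagerank)
pairs["query_comp_size"] = pairs["query_id"].map(comp_size)

# shortestpath: temporarily drop the direct edge so the value is not trivially 1.
G.remove_edge(0, 100)
sp = nx.shortest_path_length(G, source=0, target=100)   # 3, via 0 -> 101 -> 1 -> 100
G.add_edge(0, 100)
```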
42 | **Undirected graph feature - HITS A and H values**: due to time constraints we did not manage to train a HITS model. HITS is similar to PageRank: it distinguishes hub pages from authority pages, where an authority page is a high-quality page about a particular domain or topic, and a hub page is one that links to many high-quality authority pages. The HITS algorithm assigns every node an A (authority) value and an H (hub) value that measure its importance. 43 | 44 | **Undirected graph feature - shortestpath**: the shortest path between a query and a title. Because we add an edge for every co-occurring query-title pair, the shortest path would trivially be 1, so when computing this feature we first delete that edge, compute the shortest path, and then add the edge back. 45 | 46 | **Undirected graph feature - neighbour**: the number of neighbours. The graph is so large that computing this runs out of memory, so we dropped this feature on the full dataset. 47 | 48 | **Weighted graph features**: we planned to weight the edges by the word-vector similarity between the query and title sentences or by BM25, but ran out of time to implement it. The feature set would roughly mirror the undirected graph features and mainly act as a refinement of them. 49 | 50 | ## Basic statistical features 51 | 52 | *The semi-final environment provides a 15-core CPU; using the Pool process-pool manager from the multiprocessing library greatly speeds up feature generation.* 53 | 54 | **concurrence**: the ratio of words shared by the query and the title to the total number of words. 55 | 56 | **levenshteinDistance**: edit distance (later refined with a more detailed analysis via fuzzywuzzy). 57 | 58 | **sorensenDistance**: Sorensen distance. 59 | 60 | **sameWord**: the number of distinct words shared by the query and the title, and the product over shared words of the probability that the word appears with label 1 (requires initialising pos_prob). 61 | 62 | **distance**: Dice Distance, Ochiai Distance, Jaccard Distance. 63 | 64 | **fuzzyDistance**: a more detailed edit-distance analysis based on the fuzzywuzzy library (a fuzzy string matching tool), including Simple Ratio, Partial Ratio, Token Sort Ratio (order-insensitive) and Token Set Ratio (deduplicated subset matching). 65 | 66 | **powerful words**: for samples with label 1, the frequency of words appearing in both the query and the title (two-sided probability) and of words appearing only in the query or only in the title (one-sided probability). 67 | 68 | ## Word vector features 69 | 70 | **word2vecDistance**: Cosine Distance, Euclidean Distance and Manhattan Distance based on word2vec embeddings (fastText embeddings were added later). 71 | 72 | **w2vWeightDistance**: Cosine Distance, Euclidean Distance and Manhattan Distance based on word2vec embeddings weighted by TF-IDF (fastText embeddings were added later). 73 | 74 | **NGramDistance**: word-level n-gram distance (word granularity did not work well, so this feature was dropped). 75 | 76 | ## Feature importance analysis 77 | 78 | The figure also includes features not described above; these are the outputs of the deep text-matching models, each used as a single feature. 79 | 80 | ![](images/Code.png) 81 | 82 | ## LightGBM model 83 | 84 | ### LightGBM parameters 85 | 86 | ![](images/params.png) 87 | 88 | ### Training 89 | 90 | *We took the last 30 million rows of the one-billion-row dataset as the training set, held out 20% for validation, and sampled so as to preserve the original label ratio.* 91 | 92 | # Deep Models 93 | 94 | ## CNN-DSSM 95 | 96 | The classic CNN-DSSM uses a single convolution as its representation layer; here we use a TextCNN-like architecture instead, with six convolutional layers of different kernel_size. 97 | 98 | 99 | ![cnn-dssm](images/cnn-dssm.png) 100 | 101 | References 102 | 103 | http://research.microsoft.com/pubs/226585/cikm2014_cdssm_final.pdf 104 | 105 | https://github.com/airalcorn2/Deep-Semantic-Similarity-Model/blob/master/deep_semantic_similarity_keras.py 106 | 107 | ## LSTM-DSSM 108 | 109 | LSTM-DSSM uses Bi-LSTM + Attention as the representation layer of DSSM. 110 | 111 | ![lstm-dssm](images/lstm-dssm.png) 112 | 113 | ## mvlstm 114 | 115 | Bi-LSTM representations are used to build an interaction (alignment) matrix that matches the relationship between the two sentences. 116 | 117 | ![mvlstm](images/mvlstm.png) 118 | 119 | Reference 120 | 121 | https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/mvlstm.py 122 | 123 | ## ARC II 124 | 125 | ![arcii](images/arcii.png) 126 | 127 | Reference 128 | 129 | https://github.com/NTMC-Community/MatchZoo/blob/master/matchzoo/models/arcii.py 130 | 131 | ## TextCNN 132 | ![textcnn](images/textcnn.png) -------------------------------------------------------------------------------- /textcnn.py: -------------------------------------------------------------------------------- 1 | from keras.layers import * 2 | from keras.models import Model 3 | from keras import metrics 4 | 5 | # query input 6 | queryInput = Input(shape=(30,600)) 7 | 8 | # title input 9 | titleInput = Input(shape=(30,600)) 10 | 11 | x = TimeDistributed(Dense(150, activation='relu'))(queryInput) 12 | xlstm = CuDNNLSTM(150, return_sequences=True)(x) 13 | xlstm1 = GlobalMaxPooling1D()(xlstm) 14 | xa = concatenate([xlstm, x]) 15 | 16 | xconv1 = Convolution1D(filters=100, 17 | kernel_size=1, 18 | padding='same', 19 | activation='relu')(xa) 20 | xconv1 = GlobalMaxPooling1D()(xconv1) 21 | 22 | xconv2 = Convolution1D(filters=100, 23 | kernel_size=2, 24 | padding='same', 25 | 
activation='relu')(xa) 26 | xconv2 = GlobalMaxPooling1D()(xconv2) 27 | 28 | xconv3 = Convolution1D(filters=100, 29 | kernel_size=3, 30 | padding='same', 31 | activation='relu')(xa) 32 | xconv3 = GlobalMaxPooling1D()(xconv3) 33 | 34 | xconv4 = Convolution1D(filters=100, 35 | kernel_size=4,dilation_rate=2, 36 | padding='same', 37 | activation='relu')(xa) 38 | xconv4 = GlobalMaxPooling1D()(xconv4) 39 | 40 | xconv5 = Convolution1D(filters=100, 41 | kernel_size=5,dilation_rate=2, 42 | padding='same', 43 | activation='relu')(xa) 44 | xconv5 = GlobalMaxPooling1D()(xconv5) 45 | 46 | xconv6 = Convolution1D(filters=100, 47 | kernel_size=6, 48 | padding='same', 49 | activation='relu')(xa) 50 | xconv6 = GlobalMaxPooling1D()(xconv6) 51 | xgru = CuDNNGRU(300, return_sequences=True)(xa) 52 | x = concatenate([xconv1,xconv2,xconv3,xconv4,xconv5,xconv6,xlstm1]) 53 | x = Dropout(0.2)(x) 54 | x = Dense(200)(x) 55 | x_out = PReLU()(x) 56 | 57 | 58 | y = TimeDistributed(Dense(150, activation='relu'))(titleInput) 59 | ylstm = CuDNNLSTM(150, return_sequences=True)(y) 60 | ylstm1 = GlobalMaxPooling1D()(ylstm) 61 | ya = concatenate([ylstm, y]) 62 | 63 | yconv1 = Convolution1D(filters=100, 64 | kernel_size=1, 65 | padding='same', 66 | activation='relu')(ya) 67 | yconv1 = GlobalMaxPooling1D()(yconv1) 68 | 69 | yconv2 = Convolution1D(filters=100, 70 | kernel_size=2, 71 | padding='same', 72 | activation='relu')(ya) 73 | yconv2 = GlobalMaxPooling1D()(yconv2) 74 | 75 | yconv3 = Convolution1D(filters=100, 76 | kernel_size=3, 77 | padding='same', 78 | activation='relu')(ya) 79 | yconv3 = GlobalMaxPooling1D()(yconv3) 80 | 81 | yconv4 = Convolution1D(filters=100, 82 | kernel_size=4,dilation_rate=2, 83 | padding='same', 84 | activation='relu')(ya) 85 | yconv4 = GlobalMaxPooling1D()(yconv4) 86 | 87 | yconv5 = Convolution1D(filters=100, 88 | kernel_size=5,dilation_rate=2, 89 | padding='same', 90 | activation='relu')(ya) 91 | yconv5 = GlobalMaxPooling1D()(yconv5) 92 | 93 | yconv6 = Convolution1D(filters=100, 94 | kernel_size=6, 95 | padding='same', 96 | activation='relu')(ya) 97 | yconv6 = GlobalMaxPooling1D()(yconv6) 98 | ygru = CuDNNGRU(300, return_sequences=True)(ya) 99 | y = concatenate([yconv1,yconv2,yconv3,yconv4,yconv5,yconv6,ylstm1]) 100 | y = Dropout(0.2)(y) 101 | y = Dense(200)(y) 102 | y_out = PReLU()(y) 103 | 104 | # interaction 105 | x1,l,lc = [x_out,xlstm,xgru] 106 | 107 | x2,r,rc = [y_out,ylstm,ygru] 108 | 109 | cross1 = Dot(axes=[2, 2], normalize=True)([l,r]) 110 | cross1 = Reshape((-1, ))(cross1) 111 | cross1 = Dropout(0.5)(cross1) 112 | cross1 = Dense(200)(cross1) 113 | cross1 = PReLU()(cross1) 114 | 115 | cross2 = Dot(axes=[2, 2], normalize=True)([lc,rc]) 116 | cross2 = Reshape((-1, ))(cross2) 117 | cross2 = Dropout(0.5)(cross2) 118 | cross2 = Dense(200)(cross2) 119 | cross2 = PReLU()(cross2) 120 | 121 | diff = subtract([x1,x2]) 122 | mul = multiply([x1,x2]) 123 | x = concatenate([x1,x2,diff,mul,cross1,cross2]) 124 | x = BatchNormalization()(x) 125 | 126 | x = Dense(500)(x) 127 | x = PReLU()(x) 128 | x = Dropout(0.2)(x) 129 | 130 | 131 | hidden1 = Dense(200)(x) 132 | hidden1 = PReLU()(hidden1) 133 | hidden1 = Dropout(0.2)(hidden1) 134 | 135 | 136 | hidden2 = Dense(50)(hidden1) 137 | hidden2 = PReLU()(hidden2) 138 | hidden2 = Dropout(0.2)(hidden2) 139 | 140 | out = Dense(1, activation='sigmoid')(hidden2) 141 | model_t2 = Model(inputs=[queryInput,titleInput], outputs=out) 142 | model_t2.compile(loss='binary_crossentropy', 143 | optimizer='adam', 144 | metrics=[metrics.mae, metrics.binary_accuracy]) 
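# ---------------------------------------------------------------------------
# Illustrative usage sketch (added commentary, not part of the original script).
# Each input sample is assumed to be a pre-built (30, 600) matrix: 30 tokens,
# each represented by a 600-dimensional embedding; the data below is a random
# placeholder only.
import numpy as np

n_samples = 256
query_mat = np.random.rand(n_samples, 30, 600).astype("float32")
title_mat = np.random.rand(n_samples, 30, 600).astype("float32")
labels = np.random.randint(0, 2, size=(n_samples, 1)).astype("float32")

model_t2.fit([query_mat, title_mat], labels,
             batch_size=64, epochs=1, validation_split=0.2)
scores = model_t2.predict([query_mat, title_mat], batch_size=64)  # relevance scores in [0, 1]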
--------------------------------------------------------------------------------