├── README.md └── 互联网金融新实体发现 └── bert_ner_rank.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # competition 2 | 比赛相关的实践 3 | 4 | 1. [互联网金融新实体发现](https://www.datafountain.cn/competitions/361) 5 | 基于bert+bilstm+crf来进行实体发现。提供jupyter, 开箱即用,线上成绩0.2以上。代码持续迭代中,欢迎大家star。 6 | 部分结果截图如下: 7 | ![](https://i.imgur.com/afLZrMQ.png) 8 | 9 | 另外可以关注我的另外项目https://github.com/searchlink/Bert-Chinese-Task-Pytorch, 基于预训练数据来处理中文任务,谢谢! -------------------------------------------------------------------------------- /互联网金融新实体发现/bert_ner_rank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 基于bert的ner" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2019-09-05T03:01:07.925298Z", 16 | "start_time": "2019-09-05T03:01:05.982504Z" 17 | } 18 | }, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "Using TensorFlow backend.\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import os\n", 30 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"5\"\n", 31 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = \"3\"\n", 32 | "import codecs\n", 33 | "import re\n", 34 | "import random\n", 35 | "import string\n", 36 | "from tqdm import tqdm\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "from zhon.hanzi import punctuation\n", 40 | "from sklearn.model_selection import train_test_split\n", 41 | "from keras_bert import load_trained_model_from_checkpoint, Tokenizer\n", 42 | "from keras_contrib.layers import CRF\n", 43 | "import tensorflow as tf\n", 44 | "import keras\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", 47 | "%matplotlib inline\n", 48 | "tf.logging.set_verbosity(tf.logging.ERROR)" 49 | ] 
# ## Load data
# Data locations. The directory can be overridden with the WORD_DETECT_DATA_DIR
# environment variable; the default is the location the notebook was written for,
# so the three resulting paths are unchanged when the variable is not set.
DATA_DIR = os.environ.get("WORD_DETECT_DATA_DIR", "/home/wangwei/tf_workdir/word_detect/data")
train_path = os.path.join(DATA_DIR, "Train_Data.csv")
test_path = os.path.join(DATA_DIR, "Test_Data.csv")
submit_path = os.path.join(DATA_DIR, "Submit_Example_Data.csv")

# ### Fill / drop missing values
df_train = pd.read_csv(train_path, header=0, encoding='utf-8')
# Missing titles / bodies become empty strings so the string cleaners can run on every row.
df_train["title"] = df_train["title"].fillna('')
df_train["text"] = df_train["text"].fillna('')
# Drop rows that have no labelled new entity (original author's note: this removes
# about 300 rows, which is a sizeable share of the data).
# .copy() makes the result an independent frame: later cells assign new columns to
# df_train, which would otherwise write into a slice of the frame read above.
df_train = df_train[df_train["unknownEntities"].notnull()].copy()

df_test = pd.read_csv(test_path, header=0, encoding='utf-8')
df_test["title"] = df_test["title"].fillna('')
df_test["text"] = df_test["text"].fillna('')
"execution_count": 4, 127 | "metadata": { 128 | "ExecuteTime": { 129 | "end_time": "2019-09-05T03:01:08.306580Z", 130 | "start_time": "2019-09-05T03:01:08.299173Z" 131 | } 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "## 数据预处理\n", 136 | "pattern1 = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # 剔除链接\n", 137 | "pattern2 = re.compile(\"\\{IMG.*?\\}\") # 剔除{IMG:1}{IMG:2}等等\n", 138 | "# pattern3 = re.compile(\"(.*?\\)\") # 剔除括号等等\n", 139 | "pattern4 = re.compile(\"《.*?\\》\") # 剔除括号等等\n", 140 | "pattern5 = re.compile(\"【.*?】\") # 删除括号内容\n", 141 | "pattern6 = re.compile(\"\\?+\") # 删除多个问号\n", 142 | "pattern7 = re.compile(\"[\\w!#$%&'*+/=?^_`{|}~-]+(?:\\.[\\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\\w](?:[\\w-]*[\\w])?\\.)+[\\w](?:[\\w-]*[\\w])?\") # 邮箱\n", 143 | "pattern8 = re.compile(\"0\\d{2}-\\d{8}|0\\d{3}-\\d{7}|\\d{5}-\\d{5}\") # 剔除电话\n", 144 | "pattern9 = re.compile(\"(20\\d{2}([\\.\\-/|年月\\s]{1,3}\\d{1,2}){2}日?(\\s?\\d{2}:\\d{2}(:\\d{2})?)?)|(\\d{1,2}\\s?(分钟|小时|天)前)\") # 日期\n", 145 | "pattern10 = re.compile(\"<.*?>\")\n", 146 | "pattern = [pattern1, pattern2, pattern4, pattern5, pattern6, pattern7, pattern8, pattern9, pattern10]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### 定义各种数据处理函数" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "ExecuteTime": { 161 | "end_time": "2019-09-05T03:01:08.322783Z", 162 | "start_time": "2019-09-05T03:01:08.309257Z" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def clean_zh(text):\n", 168 | " '''清洗文本,保证语句通顺(关于小数点的问题无法处理)'''\n", 169 | " text = text.replace(\"(\", \"(\").replace(\")\", \")\")\n", 170 | " punct = string.punctuation + punctuation\n", 171 | " punct = \"\".join([c for c in punct if c not in [\".\", \"、\", \"%\", \"“\", \"”\", \"(\", \")\", \"!\", \"。\", \"?\"]])\n", 172 | " text = re.sub(r\"[%s]+\" % punct, \" \", text)\n", 173 | " # 
将引号替换\n", 174 | " text = re.sub(r\"[%s]+\" % \"“”()\", \"\", text)\n", 175 | " text = re.sub(r\"[%s]+\" % \":\", \" \", text)\n", 176 | " # 多个空格替换成一个\n", 177 | " text = re.sub(' +', ' ', text)\n", 178 | " return text\n", 179 | "\n", 180 | "def clean_data(text):\n", 181 | " \"\"\"清理各种脏数据\"\"\"\n", 182 | " for p in pattern:\n", 183 | " text = re.sub(p, \"\", text)\n", 184 | " text = clean_zh(text)\n", 185 | " return text\n", 186 | "\n", 187 | "def clean_label(label):\n", 188 | " \"\"\"对标签进行清洗\"\"\"\n", 189 | " label_list = []\n", 190 | " label = re.sub(pattern6, \"\", label)\n", 191 | " # 替换\n", 192 | " label = label.replace(\"(\", \"(\").replace(\")\", \")\")\n", 193 | " label = re.sub(r\"[%s]+\" % \"()\", \"\", label)\n", 194 | " return label\n", 195 | "\n", 196 | "def is_contain(col, words):\n", 197 | " words_list = words.split(\";\") # 多个实体\n", 198 | " length = len(words_list)\n", 199 | " flag = []\n", 200 | " for word in words_list:\n", 201 | " if word in col:\n", 202 | " flag.append(1)\n", 203 | " if len(flag) == length:\n", 204 | " return 1\n", 205 | " else:\n", 206 | " return 0\n", 207 | " \n", 208 | "def title_contain(col, words):\n", 209 | " words_list = words.split(\";\") # 多个实体\n", 210 | " flag = 0 \n", 211 | " for word in words_list:\n", 212 | " if word in col:\n", 213 | " flag = 1\n", 214 | " return flag\n", 215 | "\n", 216 | "# 由于每个句子较长,这里只去包含新词实体的句子\n", 217 | "def get_sentence(text, words):\n", 218 | " \"\"\"缩小数据规模\"\"\"\n", 219 | " new_str = []\n", 220 | " # 按照多个分隔符进行分割\n", 221 | " sentences = re.split(\"。|!|?\", text.strip())\n", 222 | " words_list = words.split(\";\") # 多个实体\n", 223 | " for sent in sentences:\n", 224 | " for word in words_list:\n", 225 | " if word in sent and sent not in new_str:\n", 226 | " new_str.append(sent)\n", 227 | " return \"。\".join(new_str)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### 开始数据处理" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | 
"execution_count": 6, 240 | "metadata": { 241 | "ExecuteTime": { 242 | "end_time": "2019-09-05T03:01:12.957065Z", 243 | "start_time": "2019-09-05T03:01:08.324560Z" 244 | } 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "df_train[\"title\"] = df_train[\"title\"].map(lambda x: clean_data(x))\n", 249 | "df_train[\"text\"] = df_train[\"text\"].map(lambda x: clean_data(x))\n", 250 | "df_train[\"unknownEntities\"] = df_train[\"unknownEntities\"].map(lambda x: clean_label(x))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 7, 256 | "metadata": { 257 | "ExecuteTime": { 258 | "end_time": "2019-09-05T03:01:13.170263Z", 259 | "start_time": "2019-09-05T03:01:12.959642Z" 260 | } 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "df_train['text_flag'] = df_train[['text','unknownEntities']].apply(lambda x: is_contain(*x), axis=1)\n", 265 | "df_train['title_flag'] = df_train[['title','unknownEntities']].apply(lambda x: title_contain(*x), axis=1)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": { 272 | "ExecuteTime": { 273 | "end_time": "2019-09-05T03:01:13.179781Z", 274 | "start_time": "2019-09-05T03:01:13.172991Z" 275 | } 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# 这里剔除样本数据\n", 280 | "df_train = df_train[(df_train['text_flag'] == 1 ) | (df_train['title_flag'] == 1)]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "### 定义词汇和标注函数" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 9, 293 | "metadata": { 294 | "ExecuteTime": { 295 | "end_time": "2019-09-05T03:01:13.188170Z", 296 | "start_time": "2019-09-05T03:01:13.181941Z" 297 | } 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# 定义tag词汇\n", 302 | "tags = [\"O\", \"B\", \"I\"]\n", 303 | "tag2idx = {tag: i+1 for i, tag in enumerate(tags)}\n", 304 | "tag2idx[\"-PAD-\"] = 0\n", 305 | "n_tags = len(tag2idx)" 306 | ] 307 | }, 308 | { 309 | 
"cell_type": "code", 310 | "execution_count": 10, 311 | "metadata": { 312 | "ExecuteTime": { 313 | "end_time": "2019-09-05T03:01:13.198758Z", 314 | "start_time": "2019-09-05T03:01:13.190064Z" 315 | } 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "# 按照句长进行实体标注\n", 320 | "def remark_tag(text, words):\n", 321 | " \"\"\"多个实体进行标注\"\"\"\n", 322 | " tag_list = [\"O\"] * len(text)\n", 323 | " words_list = words.split(\";\") # 多个实体\n", 324 | " for word in words_list:\n", 325 | " # 获取字符串在文本出现的所有下标\n", 326 | " start_index = [w.start() for w in re.finditer(word, text)]\n", 327 | " for index in start_index:\n", 328 | " tag_list[index] = \"B\"\n", 329 | " tag_list[(index + 1): (index + len(word))] = [\"I\"] * (len(word) - 1)\n", 330 | " assert len(text) == len(tag_list)\n", 331 | " return tag_list" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "## bert部分" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "### bert的存储路径" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 11, 351 | "metadata": { 352 | "ExecuteTime": { 353 | "end_time": "2019-09-05T03:01:13.208855Z", 354 | "start_time": "2019-09-05T03:01:13.200636Z" 355 | } 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "config_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/bert_config.json\"\n", 360 | "checkpoint_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/bert_model.ckpt\"\n", 361 | "dict_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/vocab.txt\"" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "### 自定义tokenizer" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 12, 374 | "metadata": { 375 | "ExecuteTime": { 376 | "end_time": "2019-09-05T03:01:13.260428Z", 377 | "start_time": "2019-09-05T03:01:13.210861Z" 378 | } 379 | }, 380 | "outputs": [], 
# Vocabulary: token -> id. Ids are handed out in file order (the current size of
# the dict is used as the next id).
token_dict = {}
with codecs.open(dict_path, "r", encoding="utf-8") as vocab_file:
    for vocab_line in vocab_file:
        token_dict[vocab_line.strip()] = len(token_dict)


class OurTokenizer(Tokenizer):
    """Tokenizer that emits exactly one token per input character."""

    def _tokenize(self, text):
        """Map each character of ``text`` to a vocabulary token.

        Characters found in the vocabulary are kept as they are, whitespace
        characters become '[unused1]' (an untrained placeholder token) and
        everything else becomes '[UNK]'.
        """
        def to_token(ch):
            if ch in self._token_dict:
                return ch
            if self._is_space(ch):
                return '[unused1]'
            return '[UNK]'

        return [to_token(ch) for ch in text]


# ### Load the BERT weights and architecture
tokenizer = OurTokenizer(token_dict)
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

bert_model.outputs
# ### Build the training samples
def get_train_data():
    """Turn ``df_train`` into a list of ``(id, sentence, label_ids)`` samples.

    For each row, the body text is used when ``text_flag`` is 1 and the title
    otherwise; it is reduced to the entity-bearing sentences with
    ``get_sentence`` and tagged per character with ``remark_tag``.

    ``label_ids`` has one single-element list per character, e.g.
    ``[[1], [2], [3]]``, which is the layout the rest of the notebook was
    written against.

    A row whose tagging raises is reported on stdout and skipped. Previously
    such a row was still appended, carrying the label list of the row
    processed before it.

    Returns:
        list of ``(id, sentence, label_ids)`` tuples.
    """
    train_data = []
    rows = zip(df_train["id"], df_train["title"], df_train["text"],
               df_train["unknownEntities"], df_train['text_flag'], df_train['title_flag'])
    for sample_id, title, content, entities, text_flag, _title_flag in rows:
        source = content if text_flag == 1 else title
        sentence = get_sentence(source, entities)
        # Tag the sentence character by character.
        try:
            tag_seq = remark_tag(sentence, entities)
            label_ids = [[tag2idx[tag]] for tag in tag_seq]
        except Exception as err:
            print(sentence)
            print(err)
            print("*"*50)
            continue
        train_data.append((sample_id, sentence, label_ids))
    return train_data


train_data = get_train_data()

len(train_data), df_train.shape
print(train_data[0])

len(train_data[0][1]),len(train_data[0][2])

# ### Encode the samples into fixed-length model inputs
id_, text_id, X1, X2, Y = [], [], [], [], []
maxlen = 512
# tokenizer.encode wraps the characters in [CLS] ... [SEP], i.e. it returns
# len(text) + 2 ids. Limiting the text to maxlen - 2 characters guarantees the
# encoded sequence fits, so pad_sequences only ever pads. Before, texts of 511+
# characters produced 513/514 ids and pad_sequences (which truncates at the
# *front* by default) dropped the first ids, while the labels were cut at the
# end - the two no longer lined up for long samples.
max_chars = maxlen - 2
for i in range(len(train_data)):
    d = train_data[i]
    text = d[1][:max_chars]
    y = d[2][:max_chars]
    x1, x2 = tokenizer.encode(first=text)
    X1.append(x1)
    X2.append(x2)
    Y.append(y)
    id_.append([i])
    text_id.append([d[0]])
# NOTE(review): the label at position j still belongs to character j, whereas
# the token at position j is character j-1 because of the leading [CLS]. This
# one-position offset is unchanged here, since decoding code elsewhere in the
# notebook may rely on it - confirm before realigning.
X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=X1, padding="post", value=0)
X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=X2, padding="post", value=0)
Y = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=Y, padding="post", value=0)
# keras.utils is the public home of to_categorical (keras.preprocessing.utils
# only resolved through an import side effect).
Y = [keras.utils.to_categorical(label_ids, num_classes=n_tags) for label_ids in Y]
len(id_), len(text_id), X1.shape, X2.shape, len(Y), Y[0].shape

# Hold out 10% of the samples for validation. The seed is fixed so that the
# split - and with it every validation metric and saved checkpoint - is the
# same on every run.
SPLIT_SEED = 42
(id_train, id_test,
 text_id_train, text_id_test,
 X1_train, X1_test,
 X2_train, X2_test,
 Y_train, Y_test) = train_test_split(id_, text_id, X1, X2, Y, test_size=0.1, random_state=SPLIT_SEED)

len(id_train), len(id_test), len(text_id_train), len(text_id_test), X1_train.shape, X1_test.shape, X2_train.shape, X2_test.shape, len(Y_train), len(Y_test)
# Spot-check one random training sample.
s_id = random.sample(range(len(id_train)), 1)[0]
s_id

id_train[s_id]

text_id_train[s_id]

# Each id_train entry is a one-element list holding the sample's position in
# train_data. Look it up instead of hard-coding the index that one particular
# run happened to draw.
sample_idx = id_train[s_id][0]
print(train_data[sample_idx])

# The sentence and its label sequence must have the same length.
len(train_data[sample_idx][1]), len(train_data[sample_idx][2])
# ## NER model
# ### Model definition: BERT -> BiLSTM -> Dropout -> Dense -> CRF
x1_in = keras.layers.Input(shape=(None,))  # first BERT input (X1 from tokenizer.encode)
x2_in = keras.layers.Input(shape=(None,))  # second BERT input (X2 from tokenizer.encode)
bert_output = bert_model([x1_in, x2_in])
lstm = keras.layers.Bidirectional(keras.layers.LSTM(units=128, return_sequences=True))(bert_output)
drop = keras.layers.Dropout(0.4)(lstm)
dense = keras.layers.TimeDistributed(keras.layers.Dense(128, activation="relu"))(drop)
crf = CRF(n_tags)
out = crf(dense)
model = keras.models.Model(inputs=[x1_in, x2_in], outputs=out)
# NOTE(review): keras_contrib reports crf.loss_function / crf.accuracy as
# deprecated. They are kept because the checkpoint file name below depends on
# the metric being called "crf_viterbi_accuracy" - confirm the metric name
# before switching to the replacements.
model.compile(loss=crf.loss_function, optimizer='adam', metrics=[crf.accuracy])
model.summary()

save_path = "/home/wangwei/tf_workdir/word_detect/model"
filepath="model_{epoch:02d}-{val_crf_viterbi_accuracy:.4f}.hdf5"
# ModelCheckpoint does not create the target directory; without this the first
# checkpoint write fails when the directory is missing.
os.makedirs(save_path, exist_ok=True)

callbacks = [
    # Stop once val_loss has not improved for 2 consecutive epochs.
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=0),
    # Write a checkpoint only when val_loss improves.
    keras.callbacks.ModelCheckpoint(os.path.join(save_path, filepath), monitor='val_loss', save_best_only=True, verbose=0),
]
"2019-09-05T03:57:21.704786Z", 920 | "start_time": "2019-09-05T03:07:05.545118Z" 921 | }, 922 | "scrolled": true 923 | }, 924 | "outputs": [ 925 | { 926 | "name": "stdout", 927 | "output_type": "stream", 928 | "text": [ 929 | "Train on 3989 samples, validate on 444 samples\n", 930 | "Epoch 1/20\n", 931 | "3989/3989 [==============================] - 238s 60ms/step - loss: 7.9539 - crf_viterbi_accuracy: 0.8932 - val_loss: 7.4705 - val_crf_viterbi_accuracy: 0.9329\n", 932 | "Epoch 2/20\n", 933 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.7739 - crf_viterbi_accuracy: 0.9318 - val_loss: 7.4365 - val_crf_viterbi_accuracy: 0.9378\n", 934 | "Epoch 3/20\n", 935 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.7371 - crf_viterbi_accuracy: 0.9429 - val_loss: 7.4174 - val_crf_viterbi_accuracy: 0.9470\n", 936 | "Epoch 4/20\n", 937 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.7145 - crf_viterbi_accuracy: 0.9530 - val_loss: 7.4022 - val_crf_viterbi_accuracy: 0.9547\n", 938 | "Epoch 5/20\n", 939 | "3989/3989 [==============================] - 232s 58ms/step - loss: 7.6976 - crf_viterbi_accuracy: 0.9627 - val_loss: 7.3880 - val_crf_viterbi_accuracy: 0.9723\n", 940 | "Epoch 6/20\n", 941 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6834 - crf_viterbi_accuracy: 0.9686 - val_loss: 7.3920 - val_crf_viterbi_accuracy: 0.9617\n", 942 | "Epoch 7/20\n", 943 | "3989/3989 [==============================] - 232s 58ms/step - loss: 7.6709 - crf_viterbi_accuracy: 0.9741 - val_loss: 7.3792 - val_crf_viterbi_accuracy: 0.9717\n", 944 | "Epoch 8/20\n", 945 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6624 - crf_viterbi_accuracy: 0.9765 - val_loss: 7.3783 - val_crf_viterbi_accuracy: 0.9772\n", 946 | "Epoch 9/20\n", 947 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6563 - crf_viterbi_accuracy: 0.9775 - val_loss: 7.3758 - 
val_crf_viterbi_accuracy: 0.9789\n", 948 | "Epoch 10/20\n", 949 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6476 - crf_viterbi_accuracy: 0.9811 - val_loss: 7.3763 - val_crf_viterbi_accuracy: 0.9784\n", 950 | "Epoch 11/20\n", 951 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6408 - crf_viterbi_accuracy: 0.9833 - val_loss: 7.3747 - val_crf_viterbi_accuracy: 0.9805\n", 952 | "Epoch 12/20\n", 953 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6378 - crf_viterbi_accuracy: 0.9836 - val_loss: 7.3881 - val_crf_viterbi_accuracy: 0.9797\n", 954 | "Epoch 13/20\n", 955 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6343 - crf_viterbi_accuracy: 0.9851 - val_loss: 7.3805 - val_crf_viterbi_accuracy: 0.9811\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "history = model.fit([X1_train, X2_train], np.array(Y_train), batch_size=64, epochs=20,\n", 961 | " validation_data=([X1_test, X2_test], np.array(Y_test)), verbose=1, callbacks=callbacks)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "### 绘制训练图形" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 33, 974 | "metadata": { 975 | "ExecuteTime": { 976 | "end_time": "2019-09-05T03:57:21.729179Z", 977 | "start_time": "2019-09-05T03:57:21.708070Z" 978 | } 979 | }, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/html": [ 984 | "
\n", 985 | "\n", 998 | "\n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | "
crf_viterbi_accuracylossval_crf_viterbi_accuracyval_loss
00.8932167.9538960.9329127.470517
10.9318157.7739330.9378187.436458
20.9428867.7370790.9470187.417439
30.9529997.7144590.9547317.402221
40.9627387.6976160.9723317.388022
\n", 1046 | "
" 1047 | ], 1048 | "text/plain": [ 1049 | " crf_viterbi_accuracy loss val_crf_viterbi_accuracy val_loss\n", 1050 | "0 0.893216 7.953896 0.932912 7.470517\n", 1051 | "1 0.931815 7.773933 0.937818 7.436458\n", 1052 | "2 0.942886 7.737079 0.947018 7.417439\n", 1053 | "3 0.952999 7.714459 0.954731 7.402221\n", 1054 | "4 0.962738 7.697616 0.972331 7.388022" 1055 | ] 1056 | }, 1057 | "execution_count": 33, 1058 | "metadata": {}, 1059 | "output_type": "execute_result" 1060 | } 1061 | ], 1062 | "source": [ 1063 | "hist = pd.DataFrame(history.history)\n", 1064 | "hist.head()" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "code", 1069 | "execution_count": 34, 1070 | "metadata": { 1071 | "ExecuteTime": { 1072 | "end_time": "2019-09-05T03:57:21.985209Z", 1073 | "start_time": "2019-09-05T03:57:21.731383Z" 1074 | } 1075 | }, 1076 | "outputs": [ 1077 | { 1078 | "data": { 1079 | "image/png": "\n", 1080 | "text/plain": [ 1081 | "
" 1082 | ] 1083 | }, 1084 | "metadata": {}, 1085 | "output_type": "display_data" 1086 | } 1087 | ], 1088 | "source": [ 1089 | "plt.style.use(\"ggplot\")\n", 1090 | "plt.figure(figsize=(8,8))\n", 1091 | "plt.plot(hist[\"crf_viterbi_accuracy\"])\n", 1092 | "plt.plot(hist[\"val_crf_viterbi_accuracy\"])\n", 1093 | "plt.show()" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": 35, 1099 | "metadata": { 1100 | "ExecuteTime": { 1101 | "end_time": "2019-09-05T03:57:21.990014Z", 1102 | "start_time": "2019-09-05T03:57:21.987299Z" 1103 | } 1104 | }, 1105 | "outputs": [], 1106 | "source": [ 1107 | "### 对验证集进行各种度量指标计算" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "code", 1112 | "execution_count": 36, 1113 | "metadata": { 1114 | "ExecuteTime": { 1115 | "end_time": "2019-09-05T03:57:22.001183Z", 1116 | "start_time": "2019-09-05T03:57:21.991770Z" 1117 | } 1118 | }, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/plain": [ 1123 | "(444, 512)" 1124 | ] 1125 | }, 1126 | "execution_count": 36, 1127 | "metadata": {}, 1128 | "output_type": "execute_result" 1129 | } 1130 | ], 1131 | "source": [ 1132 | "X1_test.shape" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": 37, 1138 | "metadata": { 1139 | "ExecuteTime": { 1140 | "end_time": "2019-09-05T03:57:43.365745Z", 1141 | "start_time": "2019-09-05T03:57:22.003343Z" 1142 | } 1143 | }, 1144 | "outputs": [ 1145 | { 1146 | "name": "stdout", 1147 | "output_type": "stream", 1148 | "text": [ 1149 | "444/444 [==============================] - 21s 48ms/step\n" 1150 | ] 1151 | } 1152 | ], 1153 | "source": [ 1154 | "test_pred = model.predict([X1_test, X2_test], verbose=1)" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": 38, 1160 | "metadata": { 1161 | "ExecuteTime": { 1162 | "end_time": "2019-09-05T03:57:43.374778Z", 1163 | "start_time": "2019-09-05T03:57:43.368600Z" 1164 | } 1165 | }, 1166 | "outputs": [ 1167 | { 1168 | "name": "stdout", 
def pred2label(pred, tag_lookup=None):
    """Convert per-token score vectors into tag strings.

    Parameters
    ----------
    pred : iterable of 2-D array-likes
        One entry per sequence; each entry holds one score / one-hot vector
        per token. The highest-scoring index of each vector is taken as the
        tag id.
    tag_lookup : dict, optional
        Mapping from tag id to tag string. Defaults to the notebook-level
        ``idx2tag`` so existing calls ``pred2label(x)`` keep working.

    Returns
    -------
    list[list[str]]
        One list of tag strings per sequence. The padding tag ``-PAD-`` is
        reported as ``O`` so that the seqeval metrics do not see it as a
        separate class.
    """
    if tag_lookup is None:
        tag_lookup = idx2tag

    out = []
    for pred_i in pred:
        # One vectorised argmax per sequence instead of one call per token.
        best_ids = np.argmax(np.asarray(pred_i), axis=-1)
        out.append([tag_lookup[int(k)].replace("-PAD-", "O") for k in best_ids])
    return out
"print(\"F1-score: {:.1%}\".format(f1_score(test_labels, pred_labels)))" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "code", 1256 | "execution_count": 42, 1257 | "metadata": { 1258 | "ExecuteTime": { 1259 | "end_time": "2019-09-05T03:57:46.851990Z", 1260 | "start_time": "2019-09-05T03:57:44.906277Z" 1261 | } 1262 | }, 1263 | "outputs": [ 1264 | { 1265 | "name": "stdout", 1266 | "output_type": "stream", 1267 | "text": [ 1268 | " precision recall f1-score support\n", 1269 | "\n", 1270 | " B 0.87 0.90 0.89 2093\n", 1271 | " I 0.77 0.82 0.79 2089\n", 1272 | "\n", 1273 | "micro avg 0.82 0.86 0.84 4182\n", 1274 | "macro avg 0.82 0.86 0.84 4182\n", 1275 | "\n" 1276 | ] 1277 | } 1278 | ], 1279 | "source": [ 1280 | "print(classification_report(test_labels, pred_labels))" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "markdown", 1285 | "metadata": {}, 1286 | "source": [ 1287 | "### 验证个例" 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": 43, 1293 | "metadata": { 1294 | "ExecuteTime": { 1295 | "end_time": "2019-09-05T03:57:46.860554Z", 1296 | "start_time": "2019-09-05T03:57:46.854047Z" 1297 | } 1298 | }, 1299 | "outputs": [ 1300 | { 1301 | "name": "stdout", 1302 | "output_type": "stream", 1303 | "text": [ 1304 | "['3c27e9bb']\n", 1305 | "('3c27e9bb', 'ICA亚投链矿机投资配股拆分理财开发找13郑286婷015微737电。ICA亚投链网上投资理财app开发、ICA亚投链虚拟币投资分红模式开发、ICA亚投链理财平台网页版定制开发、ICA亚投链区块链分红系统程序开发区块链技术的定义是什么。ICA亚投链模式制度介绍 注册免费送12万台矿机 数量有限 送完为止。ICA亚投链全球恒量发行1 5亿枚 永不增发 开盘价0 2美元 预计开盘交易黑市价不低于2美元 市场更是一币难求 采取有效的控盘机制。ICA亚投链全球恒量发行1 5亿枚 永不增发。ICA亚投链资质背书 双加密网址 独立开源代码 采用POW智能合约 POS算力 对接商城 话费 水电费充值 中石油、中石化充值 ICA亚投链推广奖励 矿工公会ICA亚投链奖励 一代 拿直推收益总产量的5%低于直推算力 收益减半二代 3%三代 1%一星会长 直推10人 团队100人 工会算力 总算力达到20GH S 送小型矿机一台n", 1306 | "[[0. 0. 1. 0.]\n", 1307 | " [0. 0. 0. 1.]\n", 1308 | " [0. 0. 0. 1.]\n", 1309 | " ...\n", 1310 | " [1. 0. 0. 0.]\n", 1311 | " [1. 0. 0. 0.]\n", 1312 | " [1. 0. 0. 
0.]]\n" 1313 | ] 1314 | } 1315 | ], 1316 | "source": [ 1317 | "# 随机抽样\n", 1318 | "sample_id = random.sample(range(len(id_test)), 1)[0]\n", 1319 | "sample_X1 = X1_test[sample_id]\n", 1320 | "sample_X2 = X2_test[sample_id]\n", 1321 | "tid = id_test[sample_id][0]\n", 1322 | "sample_text_id = text_id_test[sample_id]\n", 1323 | "print(sample_text_id)\n", 1324 | "sample_data = train_data[tid]\n", 1325 | "print(sample_data)\n", 1326 | "sample_Y = Y_test[sample_id]\n", 1327 | "print(sample_Y)" 1328 | ] 1329 | }, 1330 | { 1331 | "cell_type": "code", 1332 | "execution_count": 44, 1333 | "metadata": { 1334 | "ExecuteTime": { 1335 | "end_time": "2019-09-05T03:57:46.870274Z", 1336 | "start_time": "2019-09-05T03:57:46.862318Z" 1337 | } 1338 | }, 1339 | "outputs": [ 1340 | { 1341 | "data": { 1342 | "text/plain": [ 1343 | "(512,)" 1344 | ] 1345 | }, 1346 | "execution_count": 44, 1347 | "metadata": {}, 1348 | "output_type": "execute_result" 1349 | } 1350 | ], 1351 | "source": [ 1352 | "sample_X1.shape" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": 45, 1358 | "metadata": { 1359 | "ExecuteTime": { 1360 | "end_time": "2019-09-05T03:57:47.555336Z", 1361 | "start_time": "2019-09-05T03:57:46.871773Z" 1362 | } 1363 | }, 1364 | "outputs": [ 1365 | { 1366 | "name": "stdout", 1367 | "output_type": "stream", 1368 | "text": [ 1369 | "(1, 512, 4)\n" 1370 | ] 1371 | } 1372 | ], 1373 | "source": [ 1374 | "predict = model.predict([sample_X1.reshape([1, -1]), sample_X2.reshape([1, -1])])\n", 1375 | "print(predict.shape)" 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "code", 1380 | "execution_count": 46, 1381 | "metadata": { 1382 | "ExecuteTime": { 1383 | "end_time": "2019-09-05T03:57:47.563364Z", 1384 | "start_time": "2019-09-05T03:57:47.558356Z" 1385 | } 1386 | }, 1387 | "outputs": [], 1388 | "source": [ 1389 | "pred = np.argmax(predict, axis=-1).reshape([-1])\n", 1390 | "true = np.argmax(sample_Y, axis=-1)" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": 
"code", 1395 | "execution_count": 47, 1396 | "metadata": { 1397 | "ExecuteTime": { 1398 | "end_time": "2019-09-05T03:57:47.573980Z", 1399 | "start_time": "2019-09-05T03:57:47.565160Z" 1400 | } 1401 | }, 1402 | "outputs": [], 1403 | "source": [ 1404 | "pred_label = [idx2tag[i] for i in pred]\n", 1405 | "true_label = [idx2tag[i] for i in true]" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 48, 1411 | "metadata": { 1412 | "ExecuteTime": { 1413 | "end_time": "2019-09-05T03:57:47.624632Z", 1414 | "start_time": "2019-09-05T03:57:47.576187Z" 1415 | } 1416 | }, 1417 | "outputs": [ 1418 | { 1419 | "name": "stdout", 1420 | "output_type": "stream", 1421 | "text": [ 1422 | "I : B B\n", 1423 | "C : I I\n", 1424 | "A : I I\n", 1425 | "亚 : I I\n", 1426 | "投 : I I\n", 1427 | "链 : I I\n", 1428 | "矿 : O O\n", 1429 | "机 : O O\n", 1430 | "投 : O O\n", 1431 | "资 : O O\n", 1432 | "配 : O O\n", 1433 | "股 : O O\n", 1434 | "拆 : O O\n", 1435 | "分 : O O\n", 1436 | "理 : O O\n", 1437 | "财 : O O\n", 1438 | "开 : O O\n", 1439 | "发 : O O\n", 1440 | "找 : O O\n", 1441 | "1 : O O\n", 1442 | "3 : O O\n", 1443 | "郑 : O O\n", 1444 | "2 : O O\n", 1445 | "8 : O O\n", 1446 | "6 : O O\n", 1447 | "婷 : O O\n", 1448 | "0 : O O\n", 1449 | "1 : O O\n", 1450 | "5 : O O\n", 1451 | "微 : O O\n", 1452 | "7 : O O\n", 1453 | "3 : O O\n", 1454 | "7 : O O\n", 1455 | "电 : O O\n", 1456 | "。 : O O\n", 1457 | "I : B B\n", 1458 | "C : I I\n", 1459 | "A : I I\n", 1460 | "亚 : I I\n", 1461 | "投 : I I\n", 1462 | "链 : I I\n", 1463 | "网 : O O\n", 1464 | "上 : O O\n", 1465 | "投 : O O\n", 1466 | "资 : O O\n", 1467 | "理 : O O\n", 1468 | "财 : O O\n", 1469 | "a : O O\n", 1470 | "p : O O\n", 1471 | "p : O O\n", 1472 | "开 : O O\n", 1473 | "发 : O O\n", 1474 | "、 : O O\n", 1475 | "I : B B\n", 1476 | "C : I I\n", 1477 | "A : I I\n", 1478 | "亚 : I I\n", 1479 | "投 : I I\n", 1480 | "链 : I I\n", 1481 | "虚 : O O\n", 1482 | "拟 : O O\n", 1483 | "币 : O O\n", 1484 | "投 : O O\n", 1485 | "资 : O O\n", 1486 | "分 : O O\n", 
1487 | "红 : O O\n", 1488 | "模 : O O\n", 1489 | "式 : O O\n", 1490 | "开 : O O\n", 1491 | "发 : O O\n", 1492 | "、 : O O\n", 1493 | "I : B B\n", 1494 | "C : I I\n", 1495 | "A : I I\n", 1496 | "亚 : I I\n", 1497 | "投 : I I\n", 1498 | "链 : I I\n", 1499 | "理 : O O\n", 1500 | "财 : O O\n", 1501 | "平 : O O\n", 1502 | "台 : O O\n", 1503 | "网 : O O\n", 1504 | "页 : O O\n", 1505 | "版 : O O\n", 1506 | "定 : O O\n", 1507 | "制 : O O\n", 1508 | "开 : O O\n", 1509 | "发 : O O\n", 1510 | "、 : O O\n", 1511 | "I : B B\n", 1512 | "C : I I\n", 1513 | "A : I I\n", 1514 | "亚 : I I\n", 1515 | "投 : I I\n", 1516 | "链 : I I\n", 1517 | "区 : O O\n", 1518 | "块 : O O\n", 1519 | "链 : O O\n", 1520 | "分 : O O\n", 1521 | "红 : O O\n", 1522 | "系 : O O\n", 1523 | "统 : O O\n", 1524 | "程 : O O\n", 1525 | "序 : O O\n", 1526 | "开 : O O\n", 1527 | "发 : O O\n", 1528 | "区 : O O\n", 1529 | "块 : O O\n", 1530 | "链 : O O\n", 1531 | "技 : O O\n", 1532 | "术 : O O\n", 1533 | "的 : O O\n", 1534 | "定 : O O\n", 1535 | "义 : O O\n", 1536 | "是 : O O\n", 1537 | "什 : O O\n", 1538 | "么 : O O\n", 1539 | "。 : O O\n", 1540 | "I : B B\n", 1541 | "C : I I\n", 1542 | "A : I I\n", 1543 | "亚 : I I\n", 1544 | "投 : I I\n", 1545 | "链 : I I\n", 1546 | "模 : O O\n", 1547 | "式 : O O\n", 1548 | "制 : O O\n", 1549 | "度 : O O\n", 1550 | "介 : O O\n", 1551 | "绍 : O O\n", 1552 | " : O O\n", 1553 | "注 : O O\n", 1554 | "册 : O O\n", 1555 | "免 : O O\n", 1556 | "费 : O O\n", 1557 | "送 : O O\n", 1558 | "1 : O O\n", 1559 | "2 : O O\n", 1560 | "万 : O O\n", 1561 | "台 : O O\n", 1562 | "矿 : O O\n", 1563 | "机 : O O\n", 1564 | " : O O\n", 1565 | "数 : O O\n", 1566 | "量 : O O\n", 1567 | "有 : O O\n", 1568 | "限 : O O\n", 1569 | " : O O\n", 1570 | "送 : O O\n", 1571 | "完 : O O\n", 1572 | "为 : O O\n", 1573 | "止 : O O\n", 1574 | "。 : O O\n", 1575 | "I : B B\n", 1576 | "C : I I\n", 1577 | "A : I I\n", 1578 | "亚 : I I\n", 1579 | "投 : I I\n", 1580 | "链 : I I\n", 1581 | "全 : O O\n", 1582 | "球 : O O\n", 1583 | "恒 : O O\n", 1584 | "量 : O O\n", 1585 | "发 : O O\n", 1586 | "行 : O O\n", 
1587 | "1 : O O\n", 1588 | " : O O\n", 1589 | "5 : O O\n", 1590 | "亿 : O O\n", 1591 | "枚 : O O\n", 1592 | " : O O\n", 1593 | "永 : O O\n", 1594 | "不 : O O\n", 1595 | "增 : O O\n", 1596 | "发 : O O\n", 1597 | " : O O\n", 1598 | "开 : O O\n", 1599 | "盘 : O O\n", 1600 | "价 : O O\n", 1601 | "0 : O O\n", 1602 | " : O O\n", 1603 | "2 : O O\n", 1604 | "美 : O O\n", 1605 | "元 : O O\n", 1606 | " : O O\n", 1607 | "预 : O O\n", 1608 | "计 : O O\n", 1609 | "开 : O O\n", 1610 | "盘 : O O\n", 1611 | "交 : O O\n", 1612 | "易 : O O\n", 1613 | "黑 : O O\n", 1614 | "市 : O O\n", 1615 | "价 : O O\n", 1616 | "不 : O O\n", 1617 | "低 : O O\n", 1618 | "于 : O O\n", 1619 | "2 : O O\n", 1620 | "美 : O O\n", 1621 | "元 : O O\n", 1622 | " : O O\n", 1623 | "市 : O O\n", 1624 | "场 : O O\n", 1625 | "更 : O O\n", 1626 | "是 : O O\n", 1627 | "一 : O O\n", 1628 | "币 : O O\n", 1629 | "难 : O O\n", 1630 | "求 : O O\n", 1631 | " : O O\n", 1632 | "采 : O O\n", 1633 | "取 : O O\n", 1634 | "有 : O O\n", 1635 | "效 : O O\n", 1636 | "的 : O O\n", 1637 | "控 : O O\n", 1638 | "盘 : O O\n", 1639 | "机 : O O\n", 1640 | "制 : O O\n", 1641 | "。 : O O\n", 1642 | "I : B B\n", 1643 | "C : I I\n", 1644 | "A : I I\n", 1645 | "亚 : I I\n", 1646 | "投 : I I\n", 1647 | "链 : I I\n", 1648 | "全 : O O\n", 1649 | "球 : O O\n", 1650 | "恒 : O O\n", 1651 | "量 : O O\n", 1652 | "发 : O O\n", 1653 | "行 : O O\n", 1654 | "1 : O O\n", 1655 | " : O O\n", 1656 | "5 : O O\n", 1657 | "亿 : O O\n", 1658 | "枚 : O O\n", 1659 | " : O O\n", 1660 | "永 : O O\n", 1661 | "不 : O O\n", 1662 | "增 : O O\n", 1663 | "发 : O O\n", 1664 | "。 : O O\n", 1665 | "I : B B\n", 1666 | "C : I I\n", 1667 | "A : I I\n", 1668 | "亚 : I I\n", 1669 | "投 : I I\n", 1670 | "链 : I I\n", 1671 | "资 : O O\n", 1672 | "质 : O O\n", 1673 | "背 : O O\n", 1674 | "书 : O O\n", 1675 | " : O O\n", 1676 | "双 : O O\n", 1677 | "加 : O O\n", 1678 | "密 : O O\n", 1679 | "网 : O O\n", 1680 | "址 : O O\n", 1681 | " : O O\n", 1682 | "独 : O O\n", 1683 | "立 : O O\n", 1684 | "开 : O O\n", 1685 | "源 : O O\n", 1686 | "代 : O O\n", 1687 | "码 
: O O\n", 1688 | " : O O\n", 1689 | "采 : O O\n", 1690 | "用 : O O\n", 1691 | "P : O O\n", 1692 | "O : O O\n", 1693 | "W : O O\n", 1694 | "智 : O O\n", 1695 | "能 : O O\n", 1696 | "合 : O O\n", 1697 | "约 : O O\n", 1698 | " : O O\n", 1699 | "P : O O\n", 1700 | "O : O O\n", 1701 | "S : O O\n", 1702 | "算 : O O\n", 1703 | "力 : O O\n", 1704 | " : O O\n", 1705 | "对 : O O\n", 1706 | "接 : O O\n", 1707 | "商 : O O\n", 1708 | "城 : O O\n", 1709 | " : O O\n", 1710 | "话 : O O\n", 1711 | "费 : O O\n", 1712 | " : O O\n", 1713 | "水 : O O\n", 1714 | "电 : O O\n", 1715 | "费 : O O\n", 1716 | "充 : O O\n", 1717 | "值 : O O\n", 1718 | " : O O\n", 1719 | "中 : O O\n", 1720 | "石 : O O\n", 1721 | "油 : O O\n", 1722 | "、 : O O\n", 1723 | "中 : O O\n", 1724 | "石 : O O\n", 1725 | "化 : O O\n", 1726 | "充 : O O\n", 1727 | "值 : O O\n", 1728 | " : O O\n", 1729 | "I : B B\n", 1730 | "C : I I\n", 1731 | "A : I I\n", 1732 | "亚 : I I\n", 1733 | "投 : I I\n", 1734 | "链 : I I\n", 1735 | "推 : O O\n", 1736 | "广 : O O\n", 1737 | "奖 : O O\n", 1738 | "励 : O O\n", 1739 | " : O O\n", 1740 | "矿 : O O\n", 1741 | "工 : O O\n", 1742 | "公 : O O\n", 1743 | "会 : O O\n", 1744 | "I : B B\n", 1745 | "C : I I\n", 1746 | "A : I I\n", 1747 | "亚 : I I\n", 1748 | "投 : I I\n", 1749 | "链 : I I\n", 1750 | "奖 : O O\n", 1751 | "励 : O O\n", 1752 | " : O O\n", 1753 | "一 : O O\n", 1754 | "代 : O O\n", 1755 | " : O O\n", 1756 | "拿 : O O\n", 1757 | "直 : O O\n", 1758 | "推 : O O\n", 1759 | "收 : O O\n", 1760 | "益 : O O\n", 1761 | "总 : O O\n", 1762 | "产 : O O\n", 1763 | "量 : O O\n", 1764 | "的 : O O\n", 1765 | "5 : O O\n", 1766 | "% : O O\n", 1767 | "低 : O O\n", 1768 | "于 : O O\n", 1769 | "直 : O O\n", 1770 | "推 : O O\n", 1771 | "算 : O O\n", 1772 | "力 : O O\n", 1773 | " : O O\n", 1774 | "收 : O O\n", 1775 | "益 : O O\n", 1776 | "减 : O O\n", 1777 | "半 : O O\n", 1778 | "二 : O O\n", 1779 | "代 : O O\n", 1780 | " : O O\n", 1781 | "3 : O O\n", 1782 | "% : O O\n", 1783 | "三 : O O\n", 1784 | "代 : O O\n", 1785 | " : O O\n", 1786 | "1 : O O\n", 1787 | "% : O O\n", 
def get_entity(X_data, y_data):
    """Extract the distinct entity strings described by a B/I/O tag sequence.

    Parameters
    ----------
    X_data : sequence of str
        Characters of the text, one per position.
    y_data : sequence of str
        Tag for each position. ``"B"`` opens an entity and ``"I"`` extends the
        currently open one; any other tag (``"O"``, ``"-PAD-"``, ...) closes it.
        Only positions present in both sequences are considered.

    Returns
    -------
    str
        The distinct entities joined by a single space, in order of first
        appearance; ``""`` when no entity is found.
        NOTE(review): because the result is space-joined, an entity that
        itself contains a space cannot be told apart by callers that split
        on ``" "``.
    """
    entity_list = []
    entity_name = ''
    for c, l in zip(X_data, y_data):
        if l == "B":
            # A new entity starts: close the previous one first so that two
            # adjacent entities are not glued together.
            if entity_name:
                entity_list.append(entity_name)
            entity_name = c
        elif l == "I" and entity_name:
            entity_name += c
        else:
            # "O", padding, or an "I" with no preceding "B": close whatever
            # entity is open and ignore the current character.
            if entity_name:
                entity_list.append(entity_name)
            entity_name = ''

    # Flush an entity that runs up to the last character of the text.
    if entity_name:
        entity_list.append(entity_name)

    # dict.fromkeys de-duplicates while keeping first-appearance order.
    return " ".join(dict.fromkeys(entity_list))
{ 1871 | "cell_type": "code", 1872 | "execution_count": 50, 1873 | "metadata": { 1874 | "ExecuteTime": { 1875 | "end_time": "2019-09-05T03:57:47.645893Z", 1876 | "start_time": "2019-09-05T03:57:47.637090Z" 1877 | } 1878 | }, 1879 | "outputs": [ 1880 | { 1881 | "name": "stdout", 1882 | "output_type": "stream", 1883 | "text": [ 1884 | "['I', 'C', 'A', '亚', '投', '链', '矿', '机', '投', '资', '配', '股', '拆', '分', '理', '财', '开', '发', '找', '1', '3', '郑', '2', '8', '6', '婷', '0', '1', '5', '微', '7', '3', '7', '电', '。', 'I', 'C', 'A', '亚', '投', '链', '网', '上', '投', '资', '理', '财', 'a', 'p', 'p', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '虚', '拟', '币', '投', '资', '分', '红', '模', '式', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '理', '财', '平', '台', '网', '页', '版', '定', '制', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '区', '块', '链', '分', '红', '系', '统', '程', '序', '开', '发', '区', '块', '链', '技', '术', '的', '定', '义', '是', '什', '么', '。', 'I', 'C', 'A', '亚', '投', '链', '模', '式', '制', '度', '介', '绍', ' ', '注', '册', '免', '费', '送', '1', '2', '万', '台', '矿', '机', ' ', '数', '量', '有', '限', ' ', '送', '完', '为', '止', '。', 'I', 'C', 'A', '亚', '投', '链', '全', '球', '恒', '量', '发', '行', '1', ' ', '5', '亿', '枚', ' ', '永', '不', '增', '发', ' ', '开', '盘', '价', '0', ' ', '2', '美', '元', ' ', '预', '计', '开', '盘', '交', '易', '黑', '市', '价', '不', '低', '于', '2', '美', '元', ' ', '市', '场', '更', '是', '一', '币', '难', '求', ' ', '采', '取', '有', '效', '的', '控', '盘', '机', '制', '。', 'I', 'C', 'A', '亚', '投', '链', '全', '球', '恒', '量', '发', '行', '1', ' ', '5', '亿', '枚', ' ', '永', '不', '增', '发', '。', 'I', 'C', 'A', '亚', '投', '链', '资', '质', '背', '书', ' ', '双', '加', '密', '网', '址', ' ', '独', '立', '开', '源', '代', '码', ' ', '采', '用', 'P', 'O', 'W', '智', '能', '合', '约', ' ', 'P', 'O', 'S', '算', '力', ' ', '对', '接', '商', '城', ' ', '话', '费', ' ', '水', '电', '费', '充', '值', ' ', '中', '石', '油', '、', '中', '石', '化', '充', '值', ' ', 'I', 'C', 'A', '亚', '投', '链', '推', '广', '奖', '励', ' ', '矿', '工', '公', '会', 'I', 'C', 'A', '亚', '投', '链', '奖', '励', ' ', '一', 
def clean_zh(text, cjk_punct=None):
    """Replace unwanted punctuation with spaces and normalise whitespace.

    The characters ``. 、 % “ ” ( ) ! 。 ?`` are exempt from the first pass;
    quotes and parentheses are then deleted outright, colons become spaces,
    and runs of spaces are collapsed to one.

    The punctuation set is escaped before it is placed in the regex character
    class. Without escaping, the ``,-/`` run of ``string.punctuation`` acts as
    a range that also matches ``.``, which broke decimal numbers
    (``1.5`` -> ``1 5``) even though ``.`` is on the keep list.
    NOTE(review): if the training text was cleaned with the unescaped version
    of this function, re-clean it with this one so train and test agree.

    Parameters
    ----------
    text : str
        Raw text.
    cjk_punct : str, optional
        Extra (CJK) punctuation characters to treat as separators. Defaults
        to the notebook-level ``punctuation`` imported from ``zhon.hanzi``.

    Returns
    -------
    str
        The cleaned text (not stripped).
    """
    if cjk_punct is None:
        cjk_punct = punctuation

    # NOTE(review): as written both replacements map a character to itself;
    # presumably the notebook originally mapped full-width parentheses to
    # ASCII ones here -- confirm against the original file.
    text = text.replace("(", "(").replace(")", ")")

    keep = [".", "、", "%", "“", "”", "(", ")", "!", "。", "?"]
    punct = "".join([c for c in string.punctuation + cjk_punct if c not in keep])
    text = re.sub(r"[%s]+" % re.escape(punct), " ", text)
    # Drop quotes and parentheses entirely.
    text = re.sub(r"[%s]+" % "“”()", "", text)
    text = re.sub(r"[%s]+" % ":", " ", text)
    # Collapse runs of spaces into one.
    text = re.sub(' +', ' ', text)
    return text


def clean_data(text):
    """Remove every regex in the notebook-level ``pattern`` list, then apply ``clean_zh``.

    ``pattern`` is defined outside this part of the notebook; each element is
    passed to ``re.sub`` as the pattern and every match is deleted.
    """
    for p in pattern:
        text = re.sub(p, "", text)
    text = clean_zh(text)
    return text
\n", 2002 | "\n", 2015 | "\n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | "
idtitletextnew_text
083dcefb7时空周转公众注册 当天秒下时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到下款全过程都是在手机上完成的...时空周转公众注册 当天秒下 时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到...
11ad5be0d抢红包、做任务、缺销路、缺人脉、推广产品 来这就对了 兼职赚钱 聚集人脉本篇文章将对两种人群进行分析 一种是不做任何项目 只撸些APP拉新 做做任务赚钱的纯羊毛党 ...抢红包、做任务、缺销路、缺人脉、推广产品 来这就对了 兼职赚钱 聚集人脉 本篇文章将对两种...
26dd28e9b2019健康行业趋势 住家创业 稳赚不亏2019健康行业趋势 住家创业 稳赚不亏
3f3b61b38CCM区块链商城PPT介绍CCM区块链商介绍CCM3 0我们会开启CCM矿石的互转流通 和支持商家入驻 到时候大家有自...CCM区块链商城PPT介绍 CCM区块链商介绍CCM3 0我们会开启CCM矿石的互转流通 和...
484b12bae加密数字货币里大家都赚钱 钱是从哪里来的?我最近去分享艾尔链LAC加密数字货币时 有很多朋友都对我轻蔑一笑 或者抱着怀疑的态度 经常拿...加密数字货币里大家都赚钱 钱是从哪里来的? 我最近去分享艾尔链LAC加密数字货币时 有很多朋...
\n", 2063 | "
def split_text(seq_list, maxlen):
    """Cut long texts into consecutive chunks of at most ``maxlen`` characters.

    Parameters
    ----------
    seq_list : iterable of (id, text)
        Records to split; only the first two fields of each record are used.
    maxlen : int
        Maximum number of characters per chunk; must be positive.

    Returns
    -------
    list[tuple]
        ``(id, chunk)`` pairs in input order. A text no longer than
        ``maxlen`` (including an empty one) yields exactly one pair.
        Chunks are cut at fixed offsets, so an entity that straddles a
        chunk boundary is split across two chunks.

    Raises
    ------
    ValueError
        If ``maxlen`` is not positive.
    """
    if maxlen <= 0:
        raise ValueError("maxlen must be a positive integer, got %r" % (maxlen,))

    res = []
    for record in seq_list:
        text_id, text = record[0], record[1]
        if len(text) <= maxlen:
            res.append((text_id, text))
            continue
        # Stepping by maxlen gives the ceil(len / maxlen) chunks directly.
        for start in range(0, len(text), maxlen):
            res.append((text_id, text[start:start + maxlen]))
    return res


def get_test_data():
    """Return the test set as a list of ``(id, new_text)`` tuples.

    Reads the ``id`` and ``new_text`` columns of the notebook-level
    ``df_test`` frame.
    """
    return list(zip(df_test["id"], df_test["new_text"]))
"outputs": [ 2216 | { 2217 | "data": { 2218 | "text/plain": [ 2219 | "512" 2220 | ] 2221 | }, 2222 | "execution_count": 71, 2223 | "metadata": {}, 2224 | "output_type": "execute_result" 2225 | } 2226 | ], 2227 | "source": [ 2228 | "maxlen" 2229 | ] 2230 | }, 2231 | { 2232 | "cell_type": "code", 2233 | "execution_count": 72, 2234 | "metadata": { 2235 | "ExecuteTime": { 2236 | "end_time": "2019-09-05T04:32:13.443385Z", 2237 | "start_time": "2019-09-05T04:32:13.411916Z" 2238 | } 2239 | }, 2240 | "outputs": [], 2241 | "source": [ 2242 | "test_res = split_text(test_d, maxlen)" 2243 | ] 2244 | }, 2245 | { 2246 | "cell_type": "code", 2247 | "execution_count": 75, 2248 | "metadata": { 2249 | "ExecuteTime": { 2250 | "end_time": "2019-09-05T04:35:17.461315Z", 2251 | "start_time": "2019-09-05T04:35:17.454315Z" 2252 | } 2253 | }, 2254 | "outputs": [ 2255 | { 2256 | "data": { 2257 | "text/plain": [ 2258 | "('83dcefb7',\n", 2259 | " '时空周转公众注册 当天秒下 时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到下款全过程都是在手机上完成的。扫一扫 立即申请时空周转app功能1、极速放款 自动审核、极速放款、实时到账 2、流程简单 在线填写资料 芝麻信用授权即可贷款 3、信息安全 数据库加密技术、保护借款人隐私 4、随借随还 无论何时何地、借款轻松 还款便捷。时空周转app亮点1、闪电借款 纯线上自动化审核 快至30分钟到账 2、额度灵活 单期借款、现金分期 万元额度任你选。3、门槛超低 无门槛、无担保 有身份证即可借款。关注我们 更多口子信息问 有人在时空周转借款过吗答 时空周转正常情况下2 3小时以内 也有特殊情况。问 时空周转贷款审核需要多少时间 时空周转贷款审核时间多长答 提前还款后 也还是能继续在时空周转借款的。问 时空周转没有还清还可以申请吗?答 时空周转!不需要太多的条件 借款也很快!问 时空周转真的像广告里面说的没有信用卡也能贷款吗答 用过 不是的 时空周转最大的好处就是可以节省分期的手续费问 时空周转好用吗时空周转 使用的人多不多啊答 时空周转它的手续费是比较低的 而且还款压力也比较小的。低于银行七哩八。5万')" 2260 | ] 2261 | }, 2262 | "execution_count": 75, 2263 | "metadata": {}, 2264 | "output_type": "execute_result" 2265 | } 2266 | ], 2267 | "source": [ 2268 | "test_res[0]" 2269 | ] 2270 | }, 2271 | { 2272 | "cell_type": "code", 2273 | "execution_count": 74, 2274 | "metadata": { 2275 | "ExecuteTime": { 2276 | "end_time": "2019-09-05T04:35:09.588152Z", 2277 | "start_time": "2019-09-05T04:35:09.581498Z" 2278 | } 2279 | }, 2280 | "outputs": [ 2281 | { 2282 | "data": { 2283 | "text/plain": [ 2284 | "12235" 2285 | ] 2286 | }, 2287 | "execution_count": 
# --- Encode every test chunk for BERT --------------------------------------
sub_id = []
sub_text = []
sub_X1 = []
sub_X2 = []
for chunk_id, chunk_text in test_res:
    token_ids, segment_ids = tokenizer.encode(first=chunk_text)
    sub_id.append(chunk_id)
    sub_text.append(chunk_text)
    sub_X1.append(token_ids)
    sub_X2.append(segment_ids)
# NOTE(review): chunks are cut to `maxlen` characters *before* tokenisation.
# If the tokenizer adds special tokens, a full-length chunk encodes to more
# than `maxlen` ids, and pad_sequences then truncates from the FRONT by
# default (truncating="pre"). Confirm that predictions still line up with
# the characters for full-length chunks.
sub_X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X1, padding="post", value=0)
sub_X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X2, padding="post", value=0)

# --- Batched prediction ------------------------------------------------------
bs = 64
steps = (len(sub_id) + bs - 1) // bs  # ceil division

result_all = []
for step in tqdm(range(steps)):
    batch = slice(step * bs, (step + 1) * bs)
    batch_pred = model.predict([sub_X1[batch], sub_X2[batch]])
    batch_pred = np.argmax(batch_pred, axis=-1).tolist()
    for chunk_id, chunk_text, tag_ids in zip(sub_id[batch], sub_text[batch], batch_pred):
        chunk_tags = [idx2tag[t] for t in tag_ids]
        result_all.append((chunk_id, get_entity(list(chunk_text), chunk_tags)))

# --- Keep only entities longer than two characters ---------------------------
last_result = []
for chunk_id, entity_str in result_all:
    ner = [word for word in entity_str.split(" ") if len(word) > 2]
    last_result.append((chunk_id, " ".join(ner)))

# --- Merge the chunks of each document ---------------------------------------
res_dict = dict()
for chunk_id, entity_str in last_result:
    # "".split(" ") returns [""]; skip empty strings so a chunk without any
    # entity does not add a blank entry (and a stray ';') to the document.
    res_dict.setdefault(chunk_id, []).extend(w for w in entity_str.split(" ") if w)

# --- Write the submission file: one "id,entity1;entity2" line per document ---
last_res = []  # unused here; kept in case a later cell refers to it
SUBMIT_PATH = "/home/wangwei/tf_workdir/word_detect/submit_test.csv"
with open(SUBMIT_PATH, "w", encoding='utf-8') as f:
    for record in test_d:
        key = record[0]
        # De-duplicate while keeping first-appearance order (deterministic
        # output); a document with no chunk results gets an empty field.
        value = ";".join(dict.fromkeys(res_dict.get(key, [])))
        f.write(key + ',' + value + "\n")
| } 2513 | }, 2514 | "outputs": [], 2515 | "source": [ 2516 | "with codecs.open(\"/home/wangwei/tf_workdir/word_detect/submit_test1.csv\", \"w\", encoding=\"utf-8\") as f:\n", 2517 | " for i in test_d:\n", 2518 | " key = i[0]\n", 2519 | " value = res_dict[key]\n", 2520 | " value = \";\".join(list(set(value)))\n", 2521 | " f.write(key+','+value+\"\\n\")" 2522 | ] 2523 | }, 2524 | { 2525 | "cell_type": "markdown", 2526 | "metadata": {}, 2527 | "source": [ 2528 | "### 处理测试集2" 2529 | ] 2530 | }, 2531 | { 2532 | "cell_type": "code", 2533 | "execution_count": 93, 2534 | "metadata": { 2535 | "ExecuteTime": { 2536 | "end_time": "2019-09-05T05:07:56.925583Z", 2537 | "start_time": "2019-09-05T05:07:52.394191Z" 2538 | } 2539 | }, 2540 | "outputs": [], 2541 | "source": [ 2542 | "df_test[\"title\"] = df_test[\"title\"].map(lambda x: clean_data(x))\n", 2543 | "df_test[\"text\"] = df_test[\"text\"].map(lambda x: clean_data(x))" 2544 | ] 2545 | }, 2546 | { 2547 | "cell_type": "code", 2548 | "execution_count": 114, 2549 | "metadata": { 2550 | "ExecuteTime": { 2551 | "end_time": "2019-09-05T05:33:18.485167Z", 2552 | "start_time": "2019-09-05T05:33:18.473832Z" 2553 | } 2554 | }, 2555 | "outputs": [ 2556 | { 2557 | "data": { 2558 | "text/plain": [ 2559 | "array(['SMRT 智慧地球2010年由全美零售业联盟会、新兴市场基金会简称EMF发起研发 基金会团队分布在美国硅谷、巴黎、韩国、印度 四大研发中心号上线 智慧地球🌍首先呢?它不是一个单线平台、是基于多个模块、分不同阶段不同地区和国家进行、相互独立又相互依存、所以说呢地球🌍不是一个简单的拆分、拆分只不过是前期的一部个启动器而已、分区块连🌍社交软件APP浏览器等、商业板块 学习模块、游戏道具交易买卖板块、数字货币交易网站的打造、像火币网一样的数字货币三大交易平台打造、新加坡 迪拜、马耳它、亚洲国家一个中东国家一个欧洲国家一个、自然界 区块链数字货币的打造!作为地球🌍多国渡全生态 交易的价值媒介 实现全球点对点交易无障碍通道 不同模块在不同阶段适时推出为地球🌍每个阶段更好的发展、地球🌍平台志向高远、目前许多玩家仅仅停留在或了解到的只是地球的初级阶段的一小部分东西 地球志向高远 早已经走在了时代的前沿 表面上是一种新的数据结构与计算方式 但其核心价值在于重塑人类交易方式和共识机制 智慧地球🌍每个板块都是一环扣一环环环相扣 未来5个发展阶段和布局结构全部提前规划出行路线和技术的储备 适时推出即可 跟着路线走就行 1️⃣第一阶段SmartGlobe的研发设计 初始建设者招募与教育 也就是玩家的培育 打好根基的意思2️⃣SmartGlobe正式面世 努力打造Z频道链接Amazon 将趣味购物 利润均享的先进理念带入新兴市场区块链APP社交软件和浏览器等3️⃣阶段当建设者 消费者总量达到5万人 SmartGoble智慧地球建设初具规模 开放20万EP的商业单元 审核通过的建设者自有品牌商家便可自主开设一个面向5万人存量消费者的特色商铺。同时 
# %%
df_test[df_test["id"] == "1414e381"]["text"].values

# %%
df_test[df_test["id"] == "5a8717fa"]["text"].values

# %%
# Replace everything except ASCII letters, digits and CJK ideographs with a
# space, then collapse runs of spaces.
# The previous loop applied these substitutions to an undefined name `line`
# (NameError on a fresh kernel) and then appended the untouched `text`; the
# cleaning is now applied to `text` itself. Compiled once, outside the loop.
non_alnum_re = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")
test_data = []
for sample_id, text in zip(df_test["id"], df_test["text"]):
    text = non_alnum_re.sub(' ', text)
    text = re.sub(' +', ' ', text)
    test_data.append((sample_id, text))

# %%
len(test_data)

# %%
test_data[3]

# %%
# Encode the first `maxlen` characters of every text into BERT inputs.
sub_id = []
sub_text = []
sub_X1 = []
sub_X2 = []
for tuple_ in test_data:
    sub_id.append(tuple_[0])
    x1, x2 = tokenizer.encode(first=tuple_[1][:maxlen])
    sub_X1.append(x1)
    sub_X2.append(x2)
    sub_text.append(tuple_[1])
# NOTE(review): pad_sequences truncates from the FRONT by default
# (truncating="pre"); if encode() adds special tokens, a full-length text ends
# up longer than maxlen and loses its leading tokens. Confirm against training.
sub_X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X1, padding="post", value=0)
sub_X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X2, padding="post", value=0)

# %%
# Number of prediction batches, rounded up so the last partial batch is kept.
bs = 64
steps = len(sub_id) // bs
if len(sub_id) % bs != 0:
    steps += 1

# %%
# Predict batch by batch and turn the tag sequences back into entity strings.
result_all = []
for i in tqdm(range(steps)):
    batch_id = sub_id[i*bs:(i+1)*bs]
    batch_text = sub_text[i*bs:(i+1)*bs]
    batch_X1 = sub_X1[i*bs:(i+1)*bs]
    batch_X2 = sub_X2[i*bs:(i+1)*bs]
    batch_pred = model.predict([batch_X1, batch_X2])
    batch_pred = np.argmax(batch_pred, axis=-1).tolist()
    # `sample_id` instead of `id` so the builtin is not shadowed.
    for sample_id, text, pred in zip(batch_id, batch_text, batch_pred):
        pred_label = [idx2tag[tag_idx] for tag_idx in pred]
        x_data = [c for c in text]
        entity = get_entity(x_data, pred_label)
        result_all.append((sample_id, entity))
合晟资产'),\n", 2785 | " ('8883244b', '盾安 败。盾安 持有盾安债 踩雷盾安债 盾安债 前 泓德基金 盾安集'),\n", 2786 | " ('da72cf2c', '淘钱宝'),\n", 2787 | " ('da72cf2c', '多米 淘钱宝'),\n", 2788 | " ('ad75ffba', '爱乐在线'),\n", 2789 | " ('ad75ffba', '爱乐在线 张先生'),\n", 2790 | " ('347cae00', '商机头条'),\n", 2791 | " ('347cae00', '商机头条'),\n", 2792 | " ('437b9e96', '素店'),\n", 2793 | " ('437b9e96', '素店 靠吗素店 妮素国际'),\n", 2794 | " ('dd1f0b35', ''),\n", 2795 | " ('dd1f0b35', '华润信托'),\n", 2796 | " ('aa183ba3', '比特币 IPFS存储'),\n", 2797 | " ('aa183ba3', '比特币'),\n", 2798 | " ('33116a19', '宝网贷'),\n", 2799 | " ('33116a19', '收获宝'),\n", 2800 | " ('44165a8f', '龙腾盛世'),\n", 2801 | " ('44165a8f', '轮回 新客兑 新客 保德薪'),\n", 2802 | " ('d4a9471e', '汇新智'),\n", 2803 | " ('d4a9471e', '汇新智'),\n", 2804 | " ('a3ae7788', '莱茨狗 区块链'),\n", 2805 | " ('a3ae7788', '莱茨狗'),\n", 2806 | " ('c369fe6d', '凯富创通'),\n", 2807 | " ('c369fe6d', '凯富创通'),\n", 2808 | " ('b46ecefb', ''),\n", 2809 | " ('b46ecefb', ''),\n", 2810 | " ('2d679f41', '趣步 链信'),\n", 2811 | " ('2d679f41', 'APP 趣步 亦跑 链信'),\n", 2812 | " ('5a60afd7', 'lt'),\n", 2813 | " ('5a60afd7', '明堂金融 景山 明堂期货'),\n", 2814 | " ('c4043a74', '英皇金融国 英皇金融国际'),\n", 2815 | " ('c4043a74', '英皇金融 英皇金融国际'),\n", 2816 | " ('b3030ae2', 'HUSDToken 火币钱包 火币'),\n", 2817 | " ('b3030ae2', 'HUSDToken 火币钱包 火币'),\n", 2818 | " ('2a0a5b58', '网商万宝'),\n", 2819 | " ('2a0a5b58', '鲨鱼记账 网商万宝 网商'),\n", 2820 | " ('5d0d6bce', '华景无限 华景无限逍遥卡'),\n", 2821 | " ('5d0d6bce', '无限 华景城 华景无限逍遥卡 华景无限旅游 华景无限 华景无限逍'),\n", 2822 | " ('cdb2765f', '国金中融'),\n", 2823 | " ('cdb2765f', '臻鼎投资 国金中融 国金中融 臻鼎'),\n", 2824 | " ('bab546c9', '顺德农商 中国中投 广东顺高投 欧浦小贷'),\n", 2825 | " ('44f1e2a2', '易信easymarkets平台'),\n", 2826 | " ('33f6d234', '国美'),\n", 2827 | " ('33f6d234', '美美 国美在线 国美控股 美美理财 国美 国美在线金融 美易'),\n", 2828 | " ('aaff838e', ''),\n", 2829 | " ('aaff838e', 'P2 芒果金融'),\n", 2830 | " ('ddf8b318', '之道 人人 云联惠 云商'),\n", 2831 | " ('439c26bb', '神店'),\n", 2832 | " ('439c26bb', '神店 神店小'),\n", 2833 | " ('349b162d', '华登 beta'),\n", 2834 | " ('349b162d', '华登 beta'),\n", 2835 | " 
def remove_punctuation(line):
    """Strip every character that is not an ASCII letter or a CJK ideograph.

    All half-width and full-width punctuation, whitespace, symbols and digits
    are removed; only a-z, A-Z and characters in the range U+4E00..U+9FA5
    survive. (Digits are NOT kept: the character class does not include 0-9.)

    line: the text to clean.
    Returns the cleaned string; an empty input yields an empty string.
    """
    # Plain (unicode) literal: the Python 2 only ur"" prefix is a SyntaxError
    # on Python 3, which is what this notebook's kernel runs.
    p = re.compile(u"[^a-zA-Z\u4e00-\u9fa5]")
    # No space can survive the substitution above, so the former
    # "collapse repeated spaces" step was a no-op and has been dropped.
    return p.sub('', line)
# %% [markdown]
# ### 处理测试集3

# %%
# NOTE(review): an earlier cell already mapped both columns through clean_data;
# presumably clean_data is idempotent, otherwise this re-application changes
# the data on every run — confirm.
df_test["title"] = df_test["title"].map(lambda x: clean_data(x))
df_test["text"] = df_test["text"].map(lambda x: clean_data(x))

# %%
df_test.head()

# %%
# Replace everything except ASCII letters, digits and CJK ideographs with a
# space, then collapse runs of spaces.
# The previous loop applied these substitutions to an undefined name `line`
# (NameError on a fresh kernel) and then appended the untouched `text`; the
# cleaning is now applied to `text` itself. Compiled once, outside the loop.
non_alnum_re = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")
test_data = []
for sample_id, text in zip(df_test["id"], df_test["text"]):
    text = non_alnum_re.sub(' ', text)
    text = re.sub(' +', ' ', text)
    test_data.append((sample_id, text))

# %%
# Encode the first `maxlen` characters of every text into BERT inputs.
sub_id = []
sub_text = []
sub_X1 = []
sub_X2 = []
for tuple_ in test_data:
    sub_id.append(tuple_[0])
    x1, x2 = tokenizer.encode(first=tuple_[1][:maxlen])
    sub_X1.append(x1)
    sub_X2.append(x2)
    sub_text.append(tuple_[1])
# NOTE(review): pad_sequences truncates from the FRONT by default
# (truncating="pre"); if encode() adds special tokens, a full-length text ends
# up longer than maxlen and loses its leading tokens. Confirm against training.
sub_X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X1, padding="post", value=0)
sub_X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X2, padding="post", value=0)

# %%
# Number of prediction batches, rounded up so the last partial batch is kept.
bs = 64
steps = len(sub_id) // bs
if len(sub_id) % bs != 0:
    steps += 1

# %%
steps

# %%
# Predict batch by batch and turn the tag sequences back into entity strings.
result_all = []
for i in tqdm(range(steps)):
    batch_id = sub_id[i*bs:(i+1)*bs]
    batch_text = sub_text[i*bs:(i+1)*bs]
    batch_X1 = sub_X1[i*bs:(i+1)*bs]
    batch_X2 = sub_X2[i*bs:(i+1)*bs]
    batch_pred = model.predict([batch_X1, batch_X2])
    batch_pred = np.argmax(batch_pred, axis=-1).tolist()
    # `sample_id` instead of `id` so the builtin is not shadowed.
    for sample_id, text, pred in zip(batch_id, batch_text, batch_pred):
        pred_label = [idx2tag[tag_idx] for tag_idx in pred]
        x_data = [c for c in text]
        entity = get_entity(x_data, pred_label)
        result_all.append((sample_id, entity))

# %%
len(result_all)

# %%
result_all[:10]
# Post-processing rules for the predicted entities.
def clean_ner(text):
    """Normalise a space-separated entity string into a ';'-joined value.

    Rules, applied in order:
      1. every ASCII digit is deleted (entities must not contain digits);
      2. every zero-width space (U+200B) is deleted;
      3. the string is split on single spaces and tokens of length <= 1
         are discarded;
      4. repeated entities are dropped, keeping the first occurrence.

    text: entities separated by single spaces, e.g. "富民宝 定活宝".
    Returns the surviving entities joined with ';' ("" when none survive).
    """
    # Entities must not contain digits.
    text = re.sub("[0-9]", "", text).strip()
    # Entities must not contain the zero-width space.
    text = re.sub("\u200b", "", text).strip()
    # Drop too-short tokens and duplicates. An explicit `seen` set plus a list
    # keeps first-seen order on every Python 3 version (dict ordering is not
    # guaranteed on the 3.5 kernel this notebook uses).
    ner = []
    seen = set()
    for n in text.split(" "):
        if len(n) > 1 and n not in seen:
            seen.add(n)
            ner.append(n)
    return ";".join(ner)
3344 | " f.write(key+','+value+\"\\n\")" 3345 | ] 3346 | }, 3347 | { 3348 | "cell_type": "code", 3349 | "execution_count": null, 3350 | "metadata": {}, 3351 | "outputs": [], 3352 | "source": [] 3353 | } 3354 | ], 3355 | "metadata": { 3356 | "kernelspec": { 3357 | "display_name": "PyCharm (tfproject)", 3358 | "language": "python", 3359 | "name": "tf3" 3360 | }, 3361 | "language_info": { 3362 | "codemirror_mode": { 3363 | "name": "ipython", 3364 | "version": 3 3365 | }, 3366 | "file_extension": ".py", 3367 | "mimetype": "text/x-python", 3368 | "name": "python", 3369 | "nbconvert_exporter": "python", 3370 | "pygments_lexer": "ipython3", 3371 | "version": "3.5.6" 3372 | }, 3373 | "toc": { 3374 | "base_numbering": 1, 3375 | "nav_menu": {}, 3376 | "number_sections": true, 3377 | "sideBar": true, 3378 | "skip_h1_title": false, 3379 | "title_cell": "Table of Contents", 3380 | "title_sidebar": "Contents", 3381 | "toc_cell": false, 3382 | "toc_position": {}, 3383 | "toc_section_display": true, 3384 | "toc_window_display": true 3385 | } 3386 | }, 3387 | "nbformat": 4, 3388 | "nbformat_minor": 2 3389 | } 3390 | --------------------------------------------------------------------------------