├── README.md └── 互联网金融新实体发现 └── bert_ner_rank.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # competition 2 | 比赛相关的实践 3 | 4 | 1. [互联网金融新实体发现](https://www.datafountain.cn/competitions/361) 5 | 基于bert+bilstm+crf来进行实体发现。提供jupyter, 开箱即用,线上成绩0.2以上。代码持续迭代中,欢迎大家star。 6 | 部分结果截图如下: 7 | ![](https://i.imgur.com/afLZrMQ.png) 8 | 9 | 另外可以关注我的另外项目https://github.com/searchlink/Bert-Chinese-Task-Pytorch, 基于预训练数据来处理中文任务,谢谢! -------------------------------------------------------------------------------- /互联网金融新实体发现/bert_ner_rank.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 基于bert的ner" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "ExecuteTime": { 15 | "end_time": "2019-09-05T03:01:07.925298Z", 16 | "start_time": "2019-09-05T03:01:05.982504Z" 17 | } 18 | }, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "Using TensorFlow backend.\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import os\n", 30 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"5\"\n", 31 | "os.environ['TF_CPP_MIN_LOG_LEVEL'] = \"3\"\n", 32 | "import codecs\n", 33 | "import re\n", 34 | "import random\n", 35 | "import string\n", 36 | "from tqdm import tqdm\n", 37 | "import pandas as pd\n", 38 | "import numpy as np\n", 39 | "from zhon.hanzi import punctuation\n", 40 | "from sklearn.model_selection import train_test_split\n", 41 | "from keras_bert import load_trained_model_from_checkpoint, Tokenizer\n", 42 | "from keras_contrib.layers import CRF\n", 43 | "import tensorflow as tf\n", 44 | "import keras\n", 45 | "import matplotlib.pyplot as plt\n", 46 | "from seqeval.metrics import precision_score, recall_score, f1_score, classification_report\n", 47 | "%matplotlib inline\n", 48 | "tf.logging.set_verbosity(tf.logging.ERROR)" 49 | ] 
50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": { 54 | "ExecuteTime": { 55 | "end_time": "2019-09-05T00:34:46.972232Z", 56 | "start_time": "2019-09-05T00:34:46.967863Z" 57 | } 58 | }, 59 | "source": [ 60 | "## 读取数据" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": { 67 | "ExecuteTime": { 68 | "end_time": "2019-09-05T03:01:07.931254Z", 69 | "start_time": "2019-09-05T03:01:07.928006Z" 70 | } 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# 数据路径\n", 75 | "train_path = \"/home/wangwei/tf_workdir/word_detect/data/Train_Data.csv\"\n", 76 | "test_path = \"/home/wangwei/tf_workdir/word_detect/data/Test_Data.csv\"\n", 77 | "submit_path = \"/home/wangwei/tf_workdir/word_detect/data/Submit_Example_Data.csv\"" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### 缺失值填充和删除" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 137, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2019-09-05T05:52:09.856878Z", 93 | "start_time": "2019-09-05T05:52:09.495247Z" 94 | } 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# 读取训练集和测试集\n", 99 | "df_train = pd.read_csv(train_path, header=0, encoding='utf-8')\n", 100 | "# 对缺失值进行处理\n", 101 | "df_train[\"title\"] = df_train[\"title\"].fillna('')\n", 102 | "df_train[\"text\"] = df_train[\"text\"].fillna('')\n", 103 | "df_train = df_train[df_train[\"unknownEntities\"].notnull()] # 剔除没有新词的行(剔除了300个,数量有点大)\n", 104 | "\n", 105 | "df_test = pd.read_csv(test_path, header=0, encoding='utf-8')\n", 106 | "df_test[\"title\"] = df_test[\"title\"].fillna('')\n", 107 | "df_test[\"text\"] = df_test[\"text\"].fillna('')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## 数据预处理" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### 定义正则表达式的pattern" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | 
"execution_count": 4, 127 | "metadata": { 128 | "ExecuteTime": { 129 | "end_time": "2019-09-05T03:01:08.306580Z", 130 | "start_time": "2019-09-05T03:01:08.299173Z" 131 | } 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "## 数据预处理\n", 136 | "pattern1 = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+') # 剔除链接\n", 137 | "pattern2 = re.compile(\"\\{IMG.*?\\}\") # 剔除{IMG:1}{IMG:2}等等\n", 138 | "# pattern3 = re.compile(\"(.*?\\)\") # 剔除括号等等\n", 139 | "pattern4 = re.compile(\"《.*?\\》\") # 剔除括号等等\n", 140 | "pattern5 = re.compile(\"【.*?】\") # 删除括号内容\n", 141 | "pattern6 = re.compile(\"\\?+\") # 删除多个问号\n", 142 | "pattern7 = re.compile(\"[\\w!#$%&'*+/=?^_`{|}~-]+(?:\\.[\\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\\w](?:[\\w-]*[\\w])?\\.)+[\\w](?:[\\w-]*[\\w])?\") # 邮箱\n", 143 | "pattern8 = re.compile(\"0\\d{2}-\\d{8}|0\\d{3}-\\d{7}|\\d{5}-\\d{5}\") # 剔除电话\n", 144 | "pattern9 = re.compile(\"(20\\d{2}([\\.\\-/|年月\\s]{1,3}\\d{1,2}){2}日?(\\s?\\d{2}:\\d{2}(:\\d{2})?)?)|(\\d{1,2}\\s?(分钟|小时|天)前)\") # 日期\n", 145 | "pattern10 = re.compile(\"<.*?>\")\n", 146 | "pattern = [pattern1, pattern2, pattern4, pattern5, pattern6, pattern7, pattern8, pattern9, pattern10]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### 定义各种数据处理函数" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "ExecuteTime": { 161 | "end_time": "2019-09-05T03:01:08.322783Z", 162 | "start_time": "2019-09-05T03:01:08.309257Z" 163 | } 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "def clean_zh(text):\n", 168 | " '''清洗文本,保证语句通顺(关于小数点的问题无法处理)'''\n", 169 | " text = text.replace(\"(\", \"(\").replace(\")\", \")\")\n", 170 | " punct = string.punctuation + punctuation\n", 171 | " punct = \"\".join([c for c in punct if c not in [\".\", \"、\", \"%\", \"“\", \"”\", \"(\", \")\", \"!\", \"。\", \"?\"]])\n", 172 | " text = re.sub(r\"[%s]+\" % punct, \" \", text)\n", 173 | " # 
将引号替换\n", 174 | " text = re.sub(r\"[%s]+\" % \"“”()\", \"\", text)\n", 175 | " text = re.sub(r\"[%s]+\" % \":\", \" \", text)\n", 176 | " # 多个空格替换成一个\n", 177 | " text = re.sub(' +', ' ', text)\n", 178 | " return text\n", 179 | "\n", 180 | "def clean_data(text):\n", 181 | " \"\"\"清理各种脏数据\"\"\"\n", 182 | " for p in pattern:\n", 183 | " text = re.sub(p, \"\", text)\n", 184 | " text = clean_zh(text)\n", 185 | " return text\n", 186 | "\n", 187 | "def clean_label(label):\n", 188 | " \"\"\"对标签进行清洗\"\"\"\n", 189 | " label_list = []\n", 190 | " label = re.sub(pattern6, \"\", label)\n", 191 | " # 替换\n", 192 | " label = label.replace(\"(\", \"(\").replace(\")\", \")\")\n", 193 | " label = re.sub(r\"[%s]+\" % \"()\", \"\", label)\n", 194 | " return label\n", 195 | "\n", 196 | "def is_contain(col, words):\n", 197 | " words_list = words.split(\";\") # 多个实体\n", 198 | " length = len(words_list)\n", 199 | " flag = []\n", 200 | " for word in words_list:\n", 201 | " if word in col:\n", 202 | " flag.append(1)\n", 203 | " if len(flag) == length:\n", 204 | " return 1\n", 205 | " else:\n", 206 | " return 0\n", 207 | " \n", 208 | "def title_contain(col, words):\n", 209 | " words_list = words.split(\";\") # 多个实体\n", 210 | " flag = 0 \n", 211 | " for word in words_list:\n", 212 | " if word in col:\n", 213 | " flag = 1\n", 214 | " return flag\n", 215 | "\n", 216 | "# 由于每个句子较长,这里只取包含新词实体的句子\n", 217 | "def get_sentence(text, words):\n", 218 | " \"\"\"缩小数据规模\"\"\"\n", 219 | " new_str = []\n", 220 | " # 按照多个分隔符进行分割\n", 221 | " sentences = re.split(\"。|!|?\", text.strip())\n", 222 | " words_list = words.split(\";\") # 多个实体\n", 223 | " for sent in sentences:\n", 224 | " for word in words_list:\n", 225 | " if word in sent and sent not in new_str:\n", 226 | " new_str.append(sent)\n", 227 | " return \"。\".join(new_str)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "### 开始数据处理" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | 
"execution_count": 6, 240 | "metadata": { 241 | "ExecuteTime": { 242 | "end_time": "2019-09-05T03:01:12.957065Z", 243 | "start_time": "2019-09-05T03:01:08.324560Z" 244 | } 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "df_train[\"title\"] = df_train[\"title\"].map(lambda x: clean_data(x))\n", 249 | "df_train[\"text\"] = df_train[\"text\"].map(lambda x: clean_data(x))\n", 250 | "df_train[\"unknownEntities\"] = df_train[\"unknownEntities\"].map(lambda x: clean_label(x))" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 7, 256 | "metadata": { 257 | "ExecuteTime": { 258 | "end_time": "2019-09-05T03:01:13.170263Z", 259 | "start_time": "2019-09-05T03:01:12.959642Z" 260 | } 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "df_train['text_flag'] = df_train[['text','unknownEntities']].apply(lambda x: is_contain(*x), axis=1)\n", 265 | "df_train['title_flag'] = df_train[['title','unknownEntities']].apply(lambda x: title_contain(*x), axis=1)" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": { 272 | "ExecuteTime": { 273 | "end_time": "2019-09-05T03:01:13.179781Z", 274 | "start_time": "2019-09-05T03:01:13.172991Z" 275 | } 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# 这里剔除样本数据\n", 280 | "df_train = df_train[(df_train['text_flag'] == 1 ) | (df_train['title_flag'] == 1)]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": {}, 286 | "source": [ 287 | "### 定义词汇和标注函数" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 9, 293 | "metadata": { 294 | "ExecuteTime": { 295 | "end_time": "2019-09-05T03:01:13.188170Z", 296 | "start_time": "2019-09-05T03:01:13.181941Z" 297 | } 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# 定义tag词汇\n", 302 | "tags = [\"O\", \"B\", \"I\"]\n", 303 | "tag2idx = {tag: i+1 for i, tag in enumerate(tags)}\n", 304 | "tag2idx[\"-PAD-\"] = 0\n", 305 | "n_tags = len(tag2idx)" 306 | ] 307 | }, 308 | { 309 | 
"cell_type": "code", 310 | "execution_count": 10, 311 | "metadata": { 312 | "ExecuteTime": { 313 | "end_time": "2019-09-05T03:01:13.198758Z", 314 | "start_time": "2019-09-05T03:01:13.190064Z" 315 | } 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "# 按照句长进行实体标注\n", 320 | "def remark_tag(text, words):\n", 321 | " \"\"\"多个实体进行标注\"\"\"\n", 322 | " tag_list = [\"O\"] * len(text)\n", 323 | " words_list = words.split(\";\") # 多个实体\n", 324 | " for word in words_list:\n", 325 | " # 获取字符串在文本出现的所有下标\n", 326 | " start_index = [w.start() for w in re.finditer(word, text)]\n", 327 | " for index in start_index:\n", 328 | " tag_list[index] = \"B\"\n", 329 | " tag_list[(index + 1): (index + len(word))] = [\"I\"] * (len(word) - 1)\n", 330 | " assert len(text) == len(tag_list)\n", 331 | " return tag_list" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "## bert部分" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "### bert的存储路径" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 11, 351 | "metadata": { 352 | "ExecuteTime": { 353 | "end_time": "2019-09-05T03:01:13.208855Z", 354 | "start_time": "2019-09-05T03:01:13.200636Z" 355 | } 356 | }, 357 | "outputs": [], 358 | "source": [ 359 | "config_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/bert_config.json\"\n", 360 | "checkpoint_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/bert_model.ckpt\"\n", 361 | "dict_path = \"/home/wangwei/pretrain_data/bert/chinese_L-12_H-768_A-12/vocab.txt\"" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "### 自定义tokenizer" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 12, 374 | "metadata": { 375 | "ExecuteTime": { 376 | "end_time": "2019-09-05T03:01:13.260428Z", 377 | "start_time": "2019-09-05T03:01:13.210861Z" 378 | } 379 | }, 380 | "outputs": [], 
381 | "source": [ 382 | "# 词汇表\n", 383 | "token_dict = {}\n", 384 | "with codecs.open(dict_path, \"r\", encoding=\"utf-8\") as f:\n", 385 | " for line in f:\n", 386 | " token = line.strip()\n", 387 | " token_dict[token] = len(token_dict)\n", 388 | "\n", 389 | "class OurTokenizer(Tokenizer):\n", 390 | " def _tokenize(self, text):\n", 391 | " R = []\n", 392 | " for c in text:\n", 393 | " if c in self._token_dict:\n", 394 | " R.append(c)\n", 395 | " elif self._is_space(c):\n", 396 | " R.append('[unused1]') # space类用未经训练的[unused1]表示\n", 397 | " else:\n", 398 | " R.append('[UNK]') # 剩余的字符是[UNK]\n", 399 | " return R" 400 | ] 401 | }, 402 | { 403 | "cell_type": "markdown", 404 | "metadata": {}, 405 | "source": [ 406 | "### 加载bert权重和网络结构" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 13, 412 | "metadata": { 413 | "ExecuteTime": { 414 | "end_time": "2019-09-05T03:01:33.176152Z", 415 | "start_time": "2019-09-05T03:01:13.262378Z" 416 | } 417 | }, 418 | "outputs": [], 419 | "source": [ 420 | "tokenizer = OurTokenizer(token_dict)\n", 421 | "bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 14, 427 | "metadata": { 428 | "ExecuteTime": { 429 | "end_time": "2019-09-05T03:01:33.184574Z", 430 | "start_time": "2019-09-05T03:01:33.178373Z" 431 | } 432 | }, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "[]" 438 | ] 439 | }, 440 | "execution_count": 14, 441 | "metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "bert_model.outputs" 447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": {}, 452 | "source": [ 453 | "### 生成训练集和验证集,以及测试集" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 15, 459 | "metadata": { 460 | "ExecuteTime": { 461 | "end_time": "2019-09-05T03:01:33.199050Z", 462 | "start_time": "2019-09-05T03:01:33.187423Z" 
463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "# 获取数据\n", 468 | "def get_train_data():\n", 469 | " train_data = []\n", 470 | " n2id = []\n", 471 | " for id, t, c, e, text_flag, title_flag in zip(df_train[\"id\"], df_train[\"title\"], df_train[\"text\"], df_train[\"unknownEntities\"], df_train['text_flag'], df_train['title_flag']):\n", 472 | " if text_flag == 1:\n", 473 | " c = get_sentence(c, e)\n", 474 | " else:\n", 475 | " c = get_sentence(t, e)\n", 476 | " # 实体标注\n", 477 | " try:\n", 478 | " n = remark_tag(c, e)\n", 479 | " n2id = [list(map(lambda x: tag2idx[x], sample)) for sample in n]\n", 480 | " except Exception as e:\n", 481 | " print(c)\n", 482 | " print(e)\n", 483 | " print(\"*\"*50)\n", 484 | " train_data.append((id, c, n2id))\n", 485 | " return train_data" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 16, 491 | "metadata": { 492 | "ExecuteTime": { 493 | "end_time": "2019-09-05T03:02:37.806123Z", 494 | "start_time": "2019-09-05T03:02:35.382861Z" 495 | } 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "train_data = get_train_data()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 17, 505 | "metadata": { 506 | "ExecuteTime": { 507 | "end_time": "2019-09-05T03:02:42.246000Z", 508 | "start_time": "2019-09-05T03:02:42.233730Z" 509 | } 510 | }, 511 | "outputs": [ 512 | { 513 | "data": { 514 | "text/plain": [ 515 | "(4433, (4433, 6))" 516 | ] 517 | }, 518 | "execution_count": 17, 519 | "metadata": {}, 520 | "output_type": "execute_result" 521 | } 522 | ], 523 | "source": [ 524 | "len(train_data), df_train.shape" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 18, 530 | "metadata": { 531 | "ExecuteTime": { 532 | "end_time": "2019-09-05T03:02:57.354976Z", 533 | "start_time": "2019-09-05T03:02:57.348517Z" 534 | } 535 | }, 536 | "outputs": [ 537 | { 538 | "name": "stdout", 539 | "output_type": "stream", 540 | "text": [ 541 | "('83dcefb7', '揭秘趣步骗局 趣步是什么 
趣步是怎么赚钱的。趣步公司可靠吗。趣步合法吗。相信是众多小伙伴最关心的话题 今天小编就来给大家揭开趣步这面丑恶且神秘的面纱 让小伙伴们看清事情的真相。接下来 我用简单的文字 给大家详细剖析一下趣步公司及趣步app的逻辑到底是什么样的', [[1], [1], [2], [3], [1], [1], [1], [2], [3], [1], [1], [1], [1], [2], [3], [1], [1], [1], [1], [1], [1], [1], [2], [3], [1], [1], [1], [1], [1], [1], [2], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [1], [1], [1], [2], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])\n" 542 | ] 543 | } 544 | ], 545 | "source": [ 546 | "print(train_data[0])" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 19, 552 | "metadata": { 553 | "ExecuteTime": { 554 | "end_time": "2019-09-05T03:03:16.293368Z", 555 | "start_time": "2019-09-05T03:03:16.286409Z" 556 | } 557 | }, 558 | "outputs": [ 559 | { 560 | "data": { 561 | "text/plain": [ 562 | "(129, 129)" 563 | ] 564 | }, 565 | "execution_count": 19, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "len(train_data[0][1]),len(train_data[0][2])" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 20, 577 | "metadata": { 578 | "ExecuteTime": { 579 | "end_time": "2019-09-05T03:03:50.637720Z", 580 | "start_time": "2019-09-05T03:03:49.422549Z" 581 | } 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "id_, text_id, X1, X2, Y = [], [], [], [], []\n", 586 | "maxlen = 512\n", 587 | "for i in range(len(train_data)):\n", 588 | " d = train_data[i]\n", 589 | " text = d[1][:maxlen]\n", 590 | " y = d[2][:maxlen]\n", 591 | " x1, x2 = tokenizer.encode(first=text)\n", 592 | " X1.append(x1)\n", 593 | " X2.append(x2)\n", 594 | " Y.append(y)\n", 595 | " 
id_.append([i])\n", 596 | " text_id.append([d[0]])\n", 597 | "X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=X1, padding=\"post\", value=0)\n", 598 | "X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=X2, padding=\"post\", value=0)\n", 599 | "Y = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=Y, padding=\"post\", value=0)\n", 600 | "Y = [keras.preprocessing.utils.to_categorical(i, num_classes=n_tags) for i in Y]" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": 21, 606 | "metadata": { 607 | "ExecuteTime": { 608 | "end_time": "2019-09-05T03:04:01.341080Z", 609 | "start_time": "2019-09-05T03:04:01.332279Z" 610 | } 611 | }, 612 | "outputs": [ 613 | { 614 | "data": { 615 | "text/plain": [ 616 | "(4433, 4433, (4433, 512), (4433, 512), 4433, (512, 4))" 617 | ] 618 | }, 619 | "execution_count": 21, 620 | "metadata": {}, 621 | "output_type": "execute_result" 622 | } 623 | ], 624 | "source": [ 625 | "len(id_), len(text_id), X1.shape, X2.shape, len(Y), Y[0].shape" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 22, 631 | "metadata": { 632 | "ExecuteTime": { 633 | "end_time": "2019-09-05T03:04:37.519193Z", 634 | "start_time": "2019-09-05T03:04:37.495329Z" 635 | } 636 | }, 637 | "outputs": [], 638 | "source": [ 639 | "id_train, id_test, text_id_train, text_id_test, X1_train, X1_test, X2_train, X2_test, Y_train, Y_test = train_test_split(id_, text_id, X1, X2, Y, test_size=0.1)" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 23, 645 | "metadata": { 646 | "ExecuteTime": { 647 | "end_time": "2019-09-05T03:04:44.349231Z", 648 | "start_time": "2019-09-05T03:04:44.339623Z" 649 | } 650 | }, 651 | "outputs": [ 652 | { 653 | "data": { 654 | "text/plain": [ 655 | "(3989,\n", 656 | " 444,\n", 657 | " 3989,\n", 658 | " 444,\n", 659 | " (3989, 512),\n", 660 | " (444, 512),\n", 661 | " (3989, 512),\n", 662 | " (444, 512),\n", 663 | " 
3989,\n", 664 | " 444)" 665 | ] 666 | }, 667 | "execution_count": 23, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "len(id_train), len(id_test), len(text_id_train), len(text_id_test), X1_train.shape, X1_test.shape, X2_train.shape, X2_test.shape, len(Y_train), len(Y_test)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 24, 679 | "metadata": { 680 | "ExecuteTime": { 681 | "end_time": "2019-09-05T03:05:14.078915Z", 682 | "start_time": "2019-09-05T03:05:14.070738Z" 683 | } 684 | }, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "text/plain": [ 689 | "948" 690 | ] 691 | }, 692 | "execution_count": 24, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "s_id = random.sample(range(len(id_train)), 1)[0]\n", 699 | "s_id" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 25, 705 | "metadata": { 706 | "ExecuteTime": { 707 | "end_time": "2019-09-05T03:05:22.891169Z", 708 | "start_time": "2019-09-05T03:05:22.883995Z" 709 | } 710 | }, 711 | "outputs": [ 712 | { 713 | "data": { 714 | "text/plain": [ 715 | "[1716]" 716 | ] 717 | }, 718 | "execution_count": 25, 719 | "metadata": {}, 720 | "output_type": "execute_result" 721 | } 722 | ], 723 | "source": [ 724 | "id_train[s_id]" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 26, 730 | "metadata": { 731 | "ExecuteTime": { 732 | "end_time": "2019-09-05T03:05:30.306926Z", 733 | "start_time": "2019-09-05T03:05:30.299948Z" 734 | } 735 | }, 736 | "outputs": [ 737 | { 738 | "data": { 739 | "text/plain": [ 740 | "['7ed5aa77']" 741 | ] 742 | }, 743 | "execution_count": 26, 744 | "metadata": {}, 745 | "output_type": "execute_result" 746 | } 747 | ], 748 | "source": [ 749 | "text_id_train[s_id]" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": 27, 755 | "metadata": { 756 | "ExecuteTime": { 757 | "end_time": 
"2019-09-05T03:05:40.417808Z", 758 | "start_time": "2019-09-05T03:05:40.411409Z" 759 | } 760 | }, 761 | "outputs": [ 762 | { 763 | "name": "stdout", 764 | "output_type": "stream", 765 | "text": [ 766 | "('7ed5aa77', '悦花越有在灾害发生后第一时间向九寨沟县地震灾区捐款100万元资助灾区人民重建家园。 之后 悦花越有以最快的速度在平台发起了捐积分 援四川的大转盘抽奖献爱心活动 号召平台用户为灾区献出自己的爱心。 悦花越有平台用户响应积极 纷纷伸出援助之手 为灾区人民奉献了自己爱心。参与悦花越有本次大转盘献爱心活动的人数超过7100人 最终平台用户冯媛女士从众多用户中有幸被抽中。身为悦花越有平台的忠诚用户 她将会以悦天使的身份 代表悦花越有为灾区传递爱心。 灾难发生后 悦花越有心系灾区人民 积极发挥社会责任感与企业影响力 不仅通过捐款帮助灾区 更是以企业平台和号召力呼吁企业员工及用户为灾区人民献爱心 提供帮助。 未来 悦花越有还会持续关注九寨沟灾区的动态 悦花越有为灾区人民祈福 愿灾区人民早日度过难关 重建家园', [[2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], 
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])\n" 767 | ] 768 | } 769 | ], 770 | "source": [ 771 | "print(train_data[1716])" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 28, 777 | "metadata": { 778 | "ExecuteTime": { 779 | "end_time": "2019-09-05T03:05:57.523955Z", 780 | "start_time": "2019-09-05T03:05:57.516405Z" 781 | } 782 | }, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "(351, 351)" 788 | ] 789 | }, 790 | "execution_count": 28, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "len(train_data[1716][1]), len(train_data[1716][2])" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "## 自定义ner模型" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "### 定义模型" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 29, 816 | "metadata": { 817 | "ExecuteTime": { 818 | "end_time": "2019-09-05T03:06:33.035544Z", 819 | "start_time": "2019-09-05T03:06:28.498463Z" 820 | } 821 | }, 822 | "outputs": [ 823 | { 824 | "name": "stderr", 825 | "output_type": "stream", 826 | "text": [ 827 | "/home/wangwei/miniconda3/envs/tf3/lib/python3.5/site-packages/keras_contrib-2.0.8-py3.5.egg/keras_contrib/layers/crf.py:346: UserWarning: CRF.loss_function is deprecated and it might be removed in the future. Please use losses.crf_loss instead.\n", 828 | "/home/wangwei/miniconda3/envs/tf3/lib/python3.5/site-packages/keras_contrib-2.0.8-py3.5.egg/keras_contrib/layers/crf.py:353: UserWarning: CRF.accuracy is deprecated and it might be removed in the future. 
Please use metrics.crf_accuracy\n" 829 | ] 830 | }, 831 | { 832 | "name": "stdout", 833 | "output_type": "stream", 834 | "text": [ 835 | "__________________________________________________________________________________________________\n", 836 | "Layer (type) Output Shape Param # Connected to \n", 837 | "==================================================================================================\n", 838 | "input_1 (InputLayer) (None, None) 0 \n", 839 | "__________________________________________________________________________________________________\n", 840 | "input_2 (InputLayer) (None, None) 0 \n", 841 | "__________________________________________________________________________________________________\n", 842 | "model_2 (Model) (None, None, 768) 101677056 input_1[0][0] \n", 843 | " input_2[0][0] \n", 844 | "__________________________________________________________________________________________________\n", 845 | "bidirectional_1 (Bidirectional) (None, None, 256) 918528 model_2[1][0] \n", 846 | "__________________________________________________________________________________________________\n", 847 | "dropout_1 (Dropout) (None, None, 256) 0 bidirectional_1[0][0] \n", 848 | "__________________________________________________________________________________________________\n", 849 | "time_distributed_1 (TimeDistrib (None, None, 128) 32896 dropout_1[0][0] \n", 850 | "__________________________________________________________________________________________________\n", 851 | "crf_1 (CRF) (None, None, 4) 540 time_distributed_1[0][0] \n", 852 | "==================================================================================================\n", 853 | "Total params: 102,629,020\n", 854 | "Trainable params: 951,964\n", 855 | "Non-trainable params: 101,677,056\n", 856 | "__________________________________________________________________________________________________\n" 857 | ] 858 | } 859 | ], 860 | "source": [ 861 | "x1_in = 
keras.layers.Input(shape=(None,)) # 待识别句子输入\n", 862 | "x2_in = keras.layers.Input(shape=(None,)) # 句子的segment id输入\n", 863 | "bert_output = bert_model([x1_in, x2_in])\n", 864 | "lstm = keras.layers.Bidirectional(keras.layers.LSTM(units=128, return_sequences=True))(bert_output)\n", 865 | "drop = keras.layers.Dropout(0.4)(lstm)\n", 866 | "dense = keras.layers.TimeDistributed(keras.layers.Dense(128, activation=\"relu\"))(drop)\n", 867 | "crf = CRF(n_tags)\n", 868 | "out = crf(dense)\n", 869 | "model = keras.models.Model(inputs=[x1_in, x2_in], outputs=out)\n", 870 | "model.compile(loss=crf.loss_function, optimizer='adam', metrics=[crf.accuracy])\n", 871 | "# model.compile(loss=keras.losses.crf_loss, optimizer='adam', metrics=[keras.metrics.crf_accuracy])\n", 872 | "model.summary()" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": 30, 878 | "metadata": { 879 | "ExecuteTime": { 880 | "end_time": "2019-09-05T03:06:40.347908Z", 881 | "start_time": "2019-09-05T03:06:40.342689Z" 882 | } 883 | }, 884 | "outputs": [], 885 | "source": [ 886 | "save_path = \"/home/wangwei/tf_workdir/word_detect/model\"\n", 887 | "filepath=\"model_{epoch:02d}-{val_crf_viterbi_accuracy:.4f}.hdf5\"" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 31, 893 | "metadata": { 894 | "ExecuteTime": { 895 | "end_time": "2019-09-05T03:06:47.570866Z", 896 | "start_time": "2019-09-05T03:06:47.563941Z" 897 | } 898 | }, 899 | "outputs": [], 900 | "source": [ 901 | "callbacks = [\n", 902 | " keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, verbose=0),\n", 903 | " keras.callbacks.ModelCheckpoint(os.path.join(save_path, filepath), monitor='val_loss', save_best_only=True, verbose=0),\n", 904 | "]" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "### 训练过程" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": 32, 917 | "metadata": { 918 | "ExecuteTime": { 919 | "end_time": 
"2019-09-05T03:57:21.704786Z", 920 | "start_time": "2019-09-05T03:07:05.545118Z" 921 | }, 922 | "scrolled": true 923 | }, 924 | "outputs": [ 925 | { 926 | "name": "stdout", 927 | "output_type": "stream", 928 | "text": [ 929 | "Train on 3989 samples, validate on 444 samples\n", 930 | "Epoch 1/20\n", 931 | "3989/3989 [==============================] - 238s 60ms/step - loss: 7.9539 - crf_viterbi_accuracy: 0.8932 - val_loss: 7.4705 - val_crf_viterbi_accuracy: 0.9329\n", 932 | "Epoch 2/20\n", 933 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.7739 - crf_viterbi_accuracy: 0.9318 - val_loss: 7.4365 - val_crf_viterbi_accuracy: 0.9378\n", 934 | "Epoch 3/20\n", 935 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.7371 - crf_viterbi_accuracy: 0.9429 - val_loss: 7.4174 - val_crf_viterbi_accuracy: 0.9470\n", 936 | "Epoch 4/20\n", 937 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.7145 - crf_viterbi_accuracy: 0.9530 - val_loss: 7.4022 - val_crf_viterbi_accuracy: 0.9547\n", 938 | "Epoch 5/20\n", 939 | "3989/3989 [==============================] - 232s 58ms/step - loss: 7.6976 - crf_viterbi_accuracy: 0.9627 - val_loss: 7.3880 - val_crf_viterbi_accuracy: 0.9723\n", 940 | "Epoch 6/20\n", 941 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6834 - crf_viterbi_accuracy: 0.9686 - val_loss: 7.3920 - val_crf_viterbi_accuracy: 0.9617\n", 942 | "Epoch 7/20\n", 943 | "3989/3989 [==============================] - 232s 58ms/step - loss: 7.6709 - crf_viterbi_accuracy: 0.9741 - val_loss: 7.3792 - val_crf_viterbi_accuracy: 0.9717\n", 944 | "Epoch 8/20\n", 945 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6624 - crf_viterbi_accuracy: 0.9765 - val_loss: 7.3783 - val_crf_viterbi_accuracy: 0.9772\n", 946 | "Epoch 9/20\n", 947 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6563 - crf_viterbi_accuracy: 0.9775 - val_loss: 7.3758 - 
val_crf_viterbi_accuracy: 0.9789\n", 948 | "Epoch 10/20\n", 949 | "3989/3989 [==============================] - 231s 58ms/step - loss: 7.6476 - crf_viterbi_accuracy: 0.9811 - val_loss: 7.3763 - val_crf_viterbi_accuracy: 0.9784\n", 950 | "Epoch 11/20\n", 951 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6408 - crf_viterbi_accuracy: 0.9833 - val_loss: 7.3747 - val_crf_viterbi_accuracy: 0.9805\n", 952 | "Epoch 12/20\n", 953 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6378 - crf_viterbi_accuracy: 0.9836 - val_loss: 7.3881 - val_crf_viterbi_accuracy: 0.9797\n", 954 | "Epoch 13/20\n", 955 | "3989/3989 [==============================] - 230s 58ms/step - loss: 7.6343 - crf_viterbi_accuracy: 0.9851 - val_loss: 7.3805 - val_crf_viterbi_accuracy: 0.9811\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "history = model.fit([X1_train, X2_train], np.array(Y_train), batch_size=64, epochs=20,\n", 961 | " validation_data=([X1_test, X2_test], np.array(Y_test)), verbose=1, callbacks=callbacks)" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "metadata": {}, 967 | "source": [ 968 | "### 绘制训练图形" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": 33, 974 | "metadata": { 975 | "ExecuteTime": { 976 | "end_time": "2019-09-05T03:57:21.729179Z", 977 | "start_time": "2019-09-05T03:57:21.708070Z" 978 | } 979 | }, 980 | "outputs": [ 981 | { 982 | "data": { 983 | "text/html": [ 984 | "
\n", 985 | "\n", 998 | "\n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | "
crf_viterbi_accuracylossval_crf_viterbi_accuracyval_loss
00.8932167.9538960.9329127.470517
10.9318157.7739330.9378187.436458
20.9428867.7370790.9470187.417439
30.9529997.7144590.9547317.402221
40.9627387.6976160.9723317.388022
\n", 1046 | "
# Per-epoch metrics recorded by Keras, one row per epoch (0-based index) and
# one column per metric: loss, crf_viterbi_accuracy and their val_* twins.
hist = pd.DataFrame(data=history.history)

# Show the first five epochs as a quick sanity check.
hist.head(n=5)
bREROSrTUIfZtoGWzjbshgZviHYHapd3vfKnvqar0xuonw7cgwF7PEJCITzcG85h4TAyHevcCw4FdGrmoH3+7A8KbRER6WbcLijcjtn6CWbLx1CyF4BW8A47HwzQ8HAIi/jUz8MhMh7Cw7HCwiE84lDQHrz2U6+1Pv31sa5zDJ9A7guFtojIMGeqKzBbPsZs/Rh2bIbOdggJgbETsS6+AmvSdJJPnkZNQ0NQTdoaihTaIiLDjOnsgJ2bDwV1Vbn3G8mpWGfMxZo0DU6aghUZ3f0aKyxMgT0IKLRFRIY4YwyUFmO2fozZ+gkUbAW32zsMPWEK1rkXYk2e7n1erGAe1BTaIiJDkGltxmzbCFs/8gZ1Q533G5m5WOdegDVpOoyb6H3+LEFDoS0iMgQY2wN7C7wTyLZ+DHsLvLt+Rcdg5Z8Ck6djTZyG5UwOdKlyAhTaIiJByjTUenvRWz/BbNvgPdzCsmDUOKwvftU75D1q3LBaEjXUKbRFRIKEcbmgcNuhZ9MlRd5vxCdiTT0NJk3DmngKVmzw7/wlR6bQFhEZxExV2aE10zs3Q2eHd8ORsflYl3zL+2w6a5QmkA0TCm0RkUHGeDyYj9/HvPZ3KC70/mJKGtYZ53qHvCecjBUZFdgihwmPbShr7mJvfSfFDZ0U1XdQ1NDJT88PJTcA/wsU2iIig4Tp7MSseQPz7xehugJGZmBdejXWlBlYIzMCXd6Q19jhpqihk6L6TooaOilu6GBfQxcu2wAQYkFWfASTRkYTHR4CuAe8xj6F9oYNG3jqqaewbZv58+ezaNGiHt+vrq7m0UcfpampidjYWJYsWUJSUhIAzz77LB9//DHGGE4++WS+/e1vaxhHRORTTEsTZvU/MW+8DC1NMHo8jq9cCaecHvBtPDvcNtuq2ohogDB3O0nRoSREhhLiCN6/x10eQ2lTZ4+ALqrvoL7D031NYmQIuYmRfHFCDKMSIhiVGEFWXARhId7PnZw8gpqazgGvvdfQtm2bZcuWcccdd5CUlMTtt9/OjBkzyMrK6r7mmWeeYc6cOcydO5ctW7awfPlylixZws6dO9m5cyf33XcfAHfeeSfbtm1j0qRJ/vtEIiJBwtRUYla9hHnnde9BGyfPwHH+xTBuUsA6Nx7bsLuugw0VrWwsb2VHTTtuu+c1DgsSIkNJij7wIyoUZ3QYSVHer53RoSRFhREV5gjIZzjIGEN9h8c7pH0wnBs6KWnsxOPtPBPmsMhJCGdaRmx3OOcmRJAQOTgHonutqrCwkLS0NFJTUwGYPXs269ev7xHaJSUlXHHFFQBMmjSJe++9FwDLsujq6sLtdmOMwePxEB8f74/PISISNMz+vZh//R3z4TtgWVinzcE672KszNyA1FPe3MXGilY2lLexubKVli5vSo9OjODCCU5OSY8hOzWJ3WXV1La5qWt3U9vmprbNRWlTF5sr2mh12YfdNzrMccxQT44OJS4yBIcP/oHS6bbZ39hFUYP3mXPxgZBu6jzUe06ODmVUQgQzM2PJPRDQmSPCg2rUoNfQrqur6x7qBkhKSqKgoKDHNbm5uaxbt46FCxeybt062tvbaW5uZvz48UyaNIlrrrkGYwznn39+j7AXERkujDGwYxP2a3+HrZ9ARBTW/AuxFnwJy5kyoLU0d3rYVNnKxvI2NlS0UtniPTozOTqUWdkjmJoWw5S06B69zeTkWJIcHUe9Z4fb7g7y7lBv935d2+Zmf3kr9R1uDjwe7hbqgMTIA6HeHfChJB8IeeeB3nx4iLfXboyhps19oOfc0T3EXdbc1X3viBCLnIQITs+KZXRiJKMSvL3n2IjgX6/uk/7/5ZdfzpNPPsnq1avJz8/H6XTicDioqKigtLSUxx57DIC77rqL7du3k5+f3+P1q1atYtWqVQAsXbqU5GTf7tgTGhrq83sGK7VFT2qPntQeh/iqLYzHQ+cHb9H64rO4C3fgiE8k+hvfJer8L+MYoPXUXW6bLRVNrN/XwPp9
DeyobMEA0eEhTM+K57IZCczMSSAnIeqow/J9aY/eumQe21DX1kVNSxfVrV1Ut3RS3XLovyXNXXxS3ka7y3PYa+MiQ3FGh1PT0klL16HvZ8RHMjYlls+dFENecgxjk2PIiI/0e+85UH9Weg1tp9NJbW1t99e1tbU4nc7DrrnlllsA6OjoYO3atcTExPDGG28wbtw4IiMjAZg2bRq7du06LLQXLFjAggULur+uqak5/k90BMnJyT6/Z7BSW/Sk9uhJ7XHIibaF6TowE/z1T80Ev/z7cMa5tIeF097RBR3+aWtjDMUNnWysaGNDeStbq9ro9BgcFkxIjuJrJyczNS2acclRhB4MN08btbVtR72nr35vWEBKKKTEA/HhQDgQ2+OaNpfnQK/94FC8q/vn+UkjyE2IYHRiJDkJ4USHfab37Gmlvq71hOvsja//rGRk9G11QK+hnZeXR3l5OVVVVTidTtasWcN1113X45qDs8YdDgcvvPAC8+bNA7wf6o033sDj8WCMYdu2bSxcuPA4Po6ISHAwrc2YN/+J+c/L0Nw4YDPBa9tcbKxoY2N5KxsrWrtnQmfGhbMgL56p6TGcnBp9eMgNQtFhIUTHh5AdHxHoUgadXkM7JCSExYsXc/fdd2PbNvPmzSM7O5sVK1aQl5fHjBkz2LZtG8uXL8eyLPLz87nqqqsAmDVrFlu2bOnuhZ9yyinMmDHDv59IROQEeWxDTWsXbS4PkaGOPk2UMrVVmH+vxLz7b++uZSfPwHHexTDePzPB2102W6u8PekNFa3sb+wCID4ihKlpMUxNj2ZqWgwpMWE+f28JHMsYY3q/bGCVlZX59H4a8jtEbdGT2qOn4dgexhjKm10U1LZTUNdBYW0He+o66DywJsgCosIcRIU5iD7wIyoshKjQAz/vbCGqaAdR+3cR7ekketQYoqfNJCY9/cC1DqLDQogKcxwaij4OHttQWNfBxgMhvfPAUqzwEIuJI6OZmhbNKWkxjEqM8Mls7M8ajr83jmXQDo+LiAwVxhhq290U1HrDuaC2ncK6DloPLHEKD7EYnRjJ58YmMD49kbrGZtpcNu0um3a3TZvL+6O9y0NtfTNtrR20G4v20DzMmHGH3miLC7bsO+z9w0OsHsHfHeqhn/5HQUiPfyDUtbvZWNHKpso2WrtsLGCMM4IvneRdipWfEtU9s1qGPoW2iAxZTZ0eCmvbKajtOBDU7d3Peh0W5CZEcGbOCMYlRTHWGUlOQkR3b/hIPSlje+CTD7D/9XcoKoAR8VjzL8Sc8wU6I6Jpd30q2F02bS7Pp37+2f96v1fd6up+TVuXp3vTj08bGRPK7ANLsaamRRM3SDf+EP/T/3kRGRLaXTa767y954LaDgrrOrrXH4N3QtbUtBjGJkUyLimK0YkRRIT2rYfqnQn+H8zrLxyYCZ6O9c3vY80+FyssHIBovBOoko59q2O/jzG4bNMj4KPDHKTFhmn7ZwEU2iIShFwem731nQfC2RvSJY1dHOykpkSHMjYpivPGJjAuKZI8ZyQx4f2fNX3YTPBR43BcciVM889McMuyCA+xCA9xkBDp89vLEKDQFpFBzWMb9jd2UljX0T3MXdzQ0b0fdnxkCOOckZyVE8fYpEjGJkUe977RxtUFNVVQU0nz3h3Yr6/0zgSffCqO8y/x20xwkb5SaIvIoFLZ0sXOmgOTxGo72P2pmdzRYQ7GOiP50klOxh0Y5k6ODu1zkBrbhsZ6qK7A1FRATSVUV2JqKqGmAhrquq9tCwnBmjkH67xFWFmj/fJZRfpLoS0ig0K7y+bJjyt5vbAR6DmTe9yBHnTGiPBelzOZtlZvANdUYqorvf89GNA1VeA+9Jwby4LEJEhOxZo4DVJSvT9PTiNp4hTqulxHfyORAFBoi0jA7axp54E1ZVQ0u1iU7+ScUXE9ZnJ/mnG7oK76UA+5+kBA13gDmtbmni+IjoHkNMjMxZp6GiSnYSWnQkoaOFOwwo68+YgjLh60LlkG
GYW2iASMxzb8dWstKzbXkBQVyt0Lcpg4MgqaGmBvEXb1wR5yBaamyhvQ9bVgPnUMZGgoOEdCSirW6HHdPWWSD/SaY2KPXoBIkFFoi0hAlDd38cCaMnbWdHBOZiTfsQqI/utz2Ds2Q2d7z4vjnd5QHj/J22tOTsU6MJRNgtOve3qLDCYKbREZUMYYVu1u5A8fVuDweLix+i3OXv0v7zedKVizzoGMnEND2EkjscJ1cIQIKLRFZIAYVxdNWzbzyLZWPmAkk+t3s2Tn86Skp2At+ibW1JmQOUpLqkSOQaEtIn5jmhowmz/EbFzHJ6UtPJy3iOYwJ1d0buaiqQmEXPEgVlxioMsUCRoKbRHxGWMMlBZjNq7DbFoPe3fRaYXwbP7FvDJxBtkRNnfOySZv5ORAlyoSlBTaInJCjMsFu7YcCuraKu83csdStHAxD5iT2N9muHBCIpefktLn/b5F5HAKbRHpN9PceGDYez1s/cQ72zs8HPJPwVr4X5iTT2VlucWfN1YzIiKEn52bzrT0mECXLRL0FNoi0itjDJTtx2xah9m4DvbsBGO8y61On4M15TTIn4IVHkF1q4sH3y9nS2Ubs7Jjufa0NB0lKeIj+pMkIkdk3C7YtRWzab03qGsqvd/IycO64FLv7mI5eT1me79d1MRj6yrwGFgyK435Y+I1G1zEhxTaItLNbmrE/uBN2Lges/VjaG+D0DDIn4p13sVYU2ZiOZMPe11Ll4fH11fyVlETE5KjuHF2OukjwgPwCUSGNoW2iGDK9mE/9wTVOzeDbUNcAtapZ3rXTuefghVx9MOdt1S28eCaMmrb3Vw2JZmvTEoi5Ah7hovIiVNoiwxjxhjMm69g/vZHiIgk5pIraB83GXLHYjmOPcvb5TEs31TNC9vqSBsRxtLP5zIhOWpgChcZphTaIsOUaazH/uNvYcvHMPlUHFdeR2zeODr6cLLVvsZO7n+vjL31nXx+bDyLp6cSFaalXCL+ptAWGYbMhg+w//QwdHZgXfZdrLkL+zRhzBjDP3c18MdPqogMdfDjOZmcnj1iACoWEVBoiwwrpqMd8/wyzDuvQ84YHFffjJWe3afX1re7+b/3y/m4vJVTM2JYMiudxCj9FSIykPQnTmSYMHt3Yf/hN1BdgXX+JVgXXYYVGtan136wv5nfra2gw23z3ZmpfGFcgpZyiQSAQltkiDMeD+bVv2L+8RwkOHHcfDfWhL7t/d3usln2USX/3t1InjOCG2dnkB2vYzJFAkWhLcOWbYx3p68hzFRXYC+7H3bvwDptDtY3vocVHdun1+6saef+98qobHFxyUQnX5+SQliIetcigaTQlmHrvnfLaLcr+PFZaUMujIwxmPf/g/nL42BZWFffjOP0c/r0Wo9t+OuWWlZsqSEpKpS7F+QwKTXazxWLSF8otGVYKqrv4L19zQA8uzGEb08fGeCKfMe0NGE/+wh8tAbGT8Kx+EaspL59vpKGdu58vZhdtR3MHRXHNTNTiQkP8XPFItJXCm0ZllbuqCcixGLeuGRe3F7NyanRzMjs27DxYGa2bcB+6kFobsK6+FtY5y3CcvQtdFftbuAPH+3CYcHNZ2YwZ1Scn6sVkf5SaMuwU9fu5u2iRj4/NoFb5o9jZ2UTv32/nAcXjiIpum+zqQcb4+rC/P0ZzKqVkJaF4wd3YuXm9fn1b+5p5KEPKpieFc/3ZySTEhOc7SAy1GkLIxl2Xt1Vj8eGCyc4iQh1cMtZGXR5bO5fU47HDr6JaaakCPvumzGrVmLNXYjjjgf6Fdh76jp4ZF0Fk0dG8cCXJyuwRQYxhbYMK51um1cLGjgtK5aMOO8pVFlxEXx3ZhpbKtv465baAFfYd8a2sVetxL77ZmhqwHHdT3F843tYEX1fktXc6WHpO6WMCA/h1rMyCdVBHyKDmobHZVj5z55Gmjs9XJTv7PHr546JZ2NFKyu21DA5NZrJg3y2tKmv9T673r4Rpp6G44ofYMUl9OsetjE8sKaM
2jYX93wulwTtbiYy6KmnLcOGbQwv7ahnrDOSiSmHn0b1vZlppMWG85v3ymjscAegwr4xH63B/t/rvGuvL/8+jmt/0u/ABnhucw0flbVy9ampOp1LJEgotGXY+LC0hbLmLi7Kdx5xC86oMAe3npVBc6eH375fjj3INl4xHW3YT/0W+7GlkJyK484Hccw5/7i2E11f0sKKzbWcOyaO88f1P/BFJDAU2jJsrNxRT3J0KLNzjn4q1RhnJN+ePpKPylp5aUfdAFZ3bKZwO/bPb8C8/ybWwq/iuO3XWGmZx3Wv8uYuHni/jDGJEXxvZpr2EBcJInqIJcPC7roOtlS2ceW0lF4nWy0cn8Cmylae/qSaiSnRjA/g0LFxuzGvPI955XlwJuO49R6scROP+36dbpulb5diAbfNySQiVP9uFwkm+hMrw8LK7XVEhjr43Njeh4Ity2LJ6ekkRYdy33tltHZ5BqDCw5nKMuxf34Z5+TmsWefg+OlvTyiwjTH8bm0FxQ2d3HxmBqmx4T6sVkQGgkJbhryaNhfvFjfxubHxxPZxS87YiBBuPjOT6lYXv1tbMaAHixhjsN95HfuuG6CyDOuaH3q3Io2OOaH7vrKrnreKmrhsSjLTM4J/9zeR4UjD4zLkvbKzHgNcOCGxX687KSWKb05N4ekN1UwpbOD8cf17/fEwzU3YTz8MGz6Ak6bg+PYNWM7kE77vtqo2nvyoipmZsXxlcpIPKhWRQFBoy5DW7rJ5rbCBM7JHHNdw8JcnOtlc2cayj6o4KTmKUYmRfqjSy2z5CPuP/wetzVj/9W2sBRdhOU58MKyu3c2v3yllZGwYN8xOx6GJZyJBS8PjMqS9saeB1i77sM1U+sphWdwwO52YMAf3vltGh9v2cYVgujqx//I49m//F2JG4Pjxb3B8/ss+CWy3bbj3nVLaXDa3z8nq8+MBERmcFNoyZHlswz921DMhOeqENg9JiAzlxjMzKG3q4vH1lT6sEEzpPu++4f95GWv+hTh+8hus7NE+u/9TH1exrbqdH8xKJzeh79ubisjgpOFxGbLWlbRQ0eLiW9NSTvheU9Ni+MqkJP66tZYpadHMHR1/QvczxmDeW4X5y+8hIgrH9T/Dmjz9hOv8tLf2NvLyznouPClRx2yKDBEKbRmyVu6oIzU2jNOzjr6ZSn98fUoyW6vaeHRdJeOToroPHOkv09GGefZRzNq3vJPNrroJK+H4hu+Ppqi+g4fXVjBpZBRXThvp03uLSOBoeFyGpJ017WyvbufCCYmE+OjkqhCHxU1nZhDmgHvfLcXl6f/zbbNvD/Yvbsasewfrostw3Pi/Pg/sli4Pv3y7lFid3CUy5Ci0ZUhaub2OmDAH8/NObBj7s1JiwlhyRjp76jv54yfVfX6dMQZ79T+xf3krdLbjuPkXOC74GpbDtxPDbGN4cE0ZNW0ufnh2Bok6uUtkSNGfaBlyKlu6eH9/Mxed5CQ6zPezpU/PGsGFExL5x856pqRGc3r2sYffTVsr9tMPwUdrYPJ070YpI3z7j4mDnt9Sy/rSVq6ZkUp+yuA+XlRE+k+hLUPOyzvrsYAv9nMzlf741rQUtlW38X8flPOgM5KUmLAjXmf2FmA//muoq8a65FtYPlrKdSQflbbw3KYa5o6OY+F4ndwlMhRpeFyGlNYuD/8ubOTM3LijBqkvhIU4uPWsTDw2/Oa9Mjx2z21OjTHYq1Zi/+pHYNs4bv0ljvMv8VtgVzR38Zs1ZYxKjOD7p+nkLpGhSqEtQ8qq3Y20u22+dJL/txxNHxHO909PY3t1O8s31XT/umltxv7d3ZgVy7zD4T99EGtsvt/q6HTbLH3nwMldZ+vkLpGhTMPjMmR4N1OpY9LIKMYlDcxxmnNGxbGxopX/t7WWk1Ojmdq6D/uJe6GxAevSq7HmX+jXXq8xhkfWVVBU38mdc7NIG6GTu0SGMv2TXIaMNfuaqW5zH/eWpcfrmhmpZMaF88Cbe6h78G4ICcVx269w
LPiS34ep/7mrgdV7m/jalGROzdTJXSJDnUJbhgRjDCt31JExIoyZAxxe4W1N3Fz0Im1u+O3Ma+An92ONGuf3991e3cayjyqZmRnDV3Vyl8iwoNCWIWF7dTsFtR186STngJ5iZXZuwf75DeRue4+rkhrZGDqSF/Z2+v1969vd/PqdMlJiwrhhdoZO7hIZJvRMW4aElTvqGBHuYN4Y/6x//ixjezCv/BXzj+dgZDqO6/+H87JGsendMv68qZpJI6PIH+mfddJu23Dvu6W0dHm497xcndwlMoyopy1Br7y5i7X7WzhvXCKRAzBz2jTUYT/wP5iXlmOdPgfHHfdjZY/GsiyuPT2NkTFh3PdeGc2dHr+8/58+qWJrVTs/OD3Nr+d7i8jgo9CWoPePHXWEOPy7mcpBZtsn2D+/HvbswLryOqzFN2JFHpqpHhMewi1nZdDQ4eahD8oxxhzjbv33dlETL+2o54IJiZxzgieNiUjwUWhLUGvp9LBqdyNzRsXh9OM+28bjwX7hGewHfwYj4nH85H4cZy444uzwcUlRXHHKSNaWtPDKrnqf1VDc0MnDH5STn6KTu0SGKz3TlqD2WmEDnR7Dl07y3zIvU1eD/cR9ULgN66zPYX3tGqyIiGO+5ksnJbKpopWnPq4mPyWaPOeJDWO3dnlY+nYJ0WEOfnh2JmEhmngmMhyppy1By+UxvLyznqlp0Yz207Nds2k99l3Xw/69WFfdhONbS3oNbADLsrj+jHTiI0K4791S2lzH/3zbNoYH3y+nssXFD8/O9OuIgogMbgptCVrv7Wuirt3NRX7oZRu3C/uvT2I/dBckJOO4434cs+b26x5xkaHcfGYGFS0ufr+u8rifb/9tay3rSlr49vSRTPTTjHQRCQ76J7sEJWMMK7fXkRUXzrSMGN/eu6YS+/F7Ye8urLkLsb66GCvs+LYHnZQazaUnJ/OXTTVMSYtmfl7/Tt/6uKyF5RtrmDMqjgsGYKKdiAxuCm0JSpsr29hT38m1p6f5dGMR8/Ea7D89BMbg+N6PsE4984Tv+V+Tkthc2cbv11cyPjmK7Pjeh9fBey74/e+VkZMQwbWn6+QuEdHwuASpl3bUER8Rwjmj4nxyP+NyYS//PfajS2FkBo47H/RJYAOEOCxump1ORKiDe98to9Nt9/qaTrfN0rdLsQ3cPidzQNafi8jgp78JJOiUNHWyvrSVL4xP8MkxlKaqDHvpDzFvvoK14CIcP1qKlZLmg0oPSYoO44Yz0ilu6OTJj6uOXY8xPLa+kj31ndw4O4N0ndwlIgdoeFyCzkvb6wlzWHxh/Ik/4+1459/YjywFRwiOa3+CdcrpPqjwyE7NjGVRvpMXt9cxJTWaM3OPPErwr4IG/rOnkUtPTmJmlk7uEpFDFNoSVJo63Ly5t5G5o+NIiDyx3772356i8bUXIO8kHN+5FSspxUdVHt03p6awtaqNh9dWMDYpktTYnr3onTXt/OGjSk7NiOFrJyf7vR4RCS4aHpeg8q+CBro8hi+d4JnZ9tuvYV57gajzvozjlnsGJLABwkIsbj0rAwu4990yXJ5Dy8Aa2t386u1SkqLDuFEnd4nIESi0JWh0eWxe2VXPqRkx5PRxBvaRmMLtmOW/h4nTGPGdm7BCB3bAKTU2nGtnpVFQ28GzG6sB8NiGe98ro7nLw21nZzIiQid3icjhNDwuQePtoiYaOjwntGWpqa/FfmwpOJNxXHMrVkhgwvHMnDjOH9fW/Xx7U2UbWyrbuOGMdMac4JanIjJ0KbQlKBhjeGl7PaMSIpiadny7ghlXF/Yj90BHB44b78KKCewkr8XTR7Kjup1fv1tGh9tm4fiEATsPXESCk4bHJShsqGijuLGTi/Kdx7XJiDEG88wjUFSAY/ENWJk5fqiyfyJCHdx6VgbGGCYkR7F4emqgSxKRQU49bQkKK7fXkRgZwtm5I47r9eY/L2Pe/w/WBV/Dmn6Gj6s7flnxETzypTHERYTo5C4R6ZV6
2jLoFTd08knZ85r6AAAgAElEQVR5K1+ckEhYSP9/y5odmzDPL4NTTse68Gt+qPDEJEeHEX4cn0tEhh/9TSGD3ks76ggPsThvXP83UzE1ldi//xWkZuJYfCOWQ7/lRSR46W8wGdQa2t2s3tvE/DHxxPVzGZTp7MD+3T3gsXF8/8dYUTrWUkSCW5+eaW/YsIGnnnoK27aZP38+ixYt6vH96upqHn30UZqamoiNjWXJkiUkJSUBUFNTw2OPPUZtbS0At99+OyNHjvTxx5Ch6p8F9Xhsw4X9XOZljMH86SEoLcJx3U+x0jL9VKGIyMDpNbRt22bZsmXccccdJCUlcfvttzNjxgyysrK6r3nmmWeYM2cOc+fOZcuWLSxfvpwlS5YA8PDDD3PxxRczZcoUOjo6dLyg9Fmn2+bVXQ3MzIolM65/h2aYf/0ds/4drIu/hTX5VD9VKCIysHodHi8sLCQtLY3U1FRCQ0OZPXs269ev73FNSUkJkydPBmDSpEl8+OGH3b/u8XiYMmUKAJGRkUREHP9OVjK8rN7bRFOnh4v628ve/BHmhaexZp6Ndf7FfqpORGTg9drTrqur6x7qBkhKSqKgoKDHNbm5uaxbt46FCxeybt062tvbaW5upqysjJiYGO677z6qqqo4+eST+cY3voHjM5OBVq1axapVqwBYunQpycm+PSghNDTU5/cMVsHSFrYxvPLPYiaMjOWcidl9HqFxl+2n7g+/ITR3LM6bfoYVGXXM64OlPQaK2uMQtUVPao+eAtUePlmnffnll/Pkk0+yevVq8vPzcTqdOBwObNtm+/bt/PrXvyY5OZkHHniA1atXc+655/Z4/YIFC1iwYEH31zU1Nb4oq1tycrLP7xmsgqUtPixtobi+nZtmp3fPh+iNaW/D/uWt4LCwv/tDaltaoaX1mK8JlvYYKGqPQ9QWPak9evJ1e2RkZPTpul5D2+l09vhLs7a2FqfTedg1t9xyCwAdHR2sXbuWmJgYnE4no0aNIjXVu9PTaaedxq5duw4LbZHPWrm9jqTo0KOeOf1Zxraxn3wAKktx3PhzrGTtLiYiQ0+vz7Tz8vIoLy+nqqoKt9vNmjVrmDFjRo9rmpqasG0bgBdeeIF58+YBMHbsWNra2mhqagJgy5YtPSawiRzJnroONlW2ccGEREIdfRsWNy8/BxvWYn31KqyTpvi5QhGRwOi1px0SEsLixYu5++67sW2befPmkZ2dzYoVK8jLy2PGjBls27aN5cuXY1kW+fn5XHXVVQA4HA4uv/xyfv7zn2OMYcyYMT2GwUWOZOWOOiJDHXx+bEKfrjcfv4/5x3NYZ5yLde4Ffq5ORCRwLGOMCXQRn1VWVubT++lZzCGDvS1q21x858XdLByfyNUzeh/iNqX7vM+xM7Jx3HoPVlj/loYN9vYYaGqPQ9QWPak9egrUM23tiCaDyj93NWCACyb0vmWpaW3BfuRuiIjA8d+39zuwRUSCjUJbBo0Ot82/Cuo5PWsEaSOOHcDG9mA/fi/UVnsDOzHpmNeLiAwFCm0ZNN7Y3UhLl82i/N43UzF/fwa2fYJ12XexxuYPQHUiIoGn0JZBwWMb/rGzjgnJkZyUcuwNUey1b2Fe+zvW3C/gmHPeAFUoIhJ4Cm0ZFNaXtlDe7Op1y1Kzbzfm6Ydg7ESsS68eoOpERAYHhbYMCiu31zEyJoxZ2SOOeo1pbvQetRkTh+O/f4QVGjaAFYqIBJ5CWwKuoLadbdXtXHhSIiFH2UzFuN3Yj/0KmhtxXPtjrLjeZ5eLiAw1Cm0JuJXb64gOc7AgL/6o15i/Pgm7tmBdcS1W7tgBrE5EZPBQaEtAVbe6eG9fM58fm0B0WMgRr7HfW4X5z8tYn7sIx6x5A1yhiMjgodCWgHp5Zz1w9M1UzJ6dmGcfgfypWJdcOYCViYgMPgptCZg2l4fXCxs4KyeOlJjDJ5WZhjrsR38JCUk4rrkVK+TIPXERkeFCoS0Bs2p3I20umy/lH97L
Ni6XN7DbWnFc+xOs2L4d0SkiMpQptCUgPLbhHzvqmJgSxbiknpupGGMwyx+DPTtxLL4BK2tUYIoUERlkFNoSEB/sb6aq1X3ELUvN6lcx7/4ba+FXsU49MwDViYgMTgptCYiVO+pIHxHGjMzYHr9udm3BrHgCpszEuuiyAFUnIjI4KbRlwG2vbmNnTQcXTnD22EzF1FZ7N1BJScNx1U1YDv32FBH5NP2tKAOqqsXFg2vKiYsIYf6nNlMxnZ3Yj9wDbpd34ll0TACrFBEZnBTaMmAqW7r4yapimrs83Dk3i8hQ728/YwzmmYdh/x4cV9+MlZYV4EpFRAan0EAXIMNDeXMXd6zaR4fb5q75OeQ5I7u/Z/79ImbtW1iLvok1ZWYAqxQRGdwU2uJ3pU1d3LlqH1224RcLchid+KnA3voJ5m9/glNnYy38rwBWKSIy+Cm0xa9KGju544392LbhF/OzGfXpwK4qx378XsjIxnHl9VjWkU/4EhERL4W2+M2+hk7ueGMfFvCLz+WQEx/R/T3T0e6deGZZ3olnkVFHv5GIiACaiCZ+UlTfwU9W7cNhWdy94DOBbdvYTz0IZfu9e4qnpAWwUhGR4KHQFp/bU9fBHav2ERZicc+CHLI+FdgA5tW/wcfvY33lSqyJpwSoShGR4KPQFp8qqG3njjf2ERnq4J4FOWTEhff4vtn6CWbln7FOOwfrcxcFqEoRkeCkZ9riMztr2vnf/+wnNiKEu+Znkxr7mcCurcL+w32QkYN1xbWaeCYi0k8KbfGJ7VVt/O+bJcRHhvCLBTmHnY9tXC7vFqUeD47/vh0rIvIodxIRkaPR8LicsK2Vbfzszf0kRoVyz+cOD2wA89wTUFSA49s3YKVmBKBKEZHgp562nJBNFa38YnUJKTFh3LUgB2fU4b+l7DVvYN7+F9b5l2BNmxWAKkVEhgaFthy3DeWt3P1WCemx4fx8fjYJRwhss28P5tlHYcLJWIu+GYAqRUSGDoW2HJePSlv45dulZMZ5Azs+8giB3dqC/dhSiBnhXY8dEhKASkVEhg4905Z+W1/Swj1vl5IdH85dC3KOHNi2jf3kA1BXg+N7P8KKSwhApSIiQ4tCW/rlg/3NLH2nhFEJEdw1P4e4iCP3ns2rf4NN67G+uhgr76QBrlJEZGjS8Lj02Xv7mvjNu2XkOSP5n3OziQ0/SmBv+9QGKvO+OMBViogMXQpt6ZN3ipq4f00ZE5Kj+Om8LKLDjhLYtdXYT9wH6dnaQEVExMcU2tKr1Xsb+e375eSnRHHn3Gyiwo78VMW7gcpScLu1gYqIiB/ombYc0xu7G3hwTTmTR0bz03lHD2wAs+LgBirXY6VlDmCVIiLDg3raclSvFzbwyNoKpqZF8+NzsogIPXpg22v+g3nrX1jnfRlr+uwBrFJEZPhQaMsRvbqrnsfWV3JqRgy3zckkPOQYPez9ezHPPuLdQOXLVwxglSIiw4tCWw7z8s46nviwipmZsfzo7AzCjhXYbS3Yj/4SYmJxXHOLNlAREfEjPdOWHlZu9wb2rOxYfnR25rED27axn3wQ6qpxfPdHWHGJA1ipiMjwo562dPt/W2t5ekM1Z+aM4KYzMwh1HHu5lnn1b7BxHdbXvoM1Nn+AqhQRGb4U2gLA85tr+POmGubkxnHD7HRCegvsbRswK5djnTYH69wLBqhKEZHhTaE9zBljeG5zDc9trmXu6Dium9WHwK47sIFKWibW5dpARURkoCi0hzFjDH/eWMNft9Yyf0w8156e1ntgu1zYj/0K3C4c378dKzJqgKoVERFNRBumjDE8vaGav26t5fNj4/nBrN4DG8A8/wfYuwvHlddjpWUNQKUiInKQetrDkDGGJz+u4qUd9XxhXALXzEzF0Ychbvv9NzGrX8X6/JexTtUGKiIiA02hPcwYY3jioype2VnPBRMSufrUkX16Jm1K9mKe/R2Mn4x1sTZQEREJBIX2MGIbw31v7uaVnfUsyndy5bSUvgV2
Wwv2I7+EqFgc19yqDVRERAJEz7SHkT99Us2Lmyu4ZGI/AvvTG6h874dY8dpARUQkUNTTHiZeK2jgxe11XDwlncsnx/V5mZb51//zbqBy6dVYYyf6uUoRETkW9bSHgQ3lrTy2voJTM2K4/pwxfQ/s7RsxL/4Za+bZWPMv9HOVIiLSG4X2ELevoZNfvVNKdnwEt5zV+9akB5m6auzH7/VuoHLFD7SBiojIIKDQHsIa2t3ctXo/ESEWd87NIjqsbxPIujdQcblw/Lc2UBERGSwU2kNUp9vm7rdKaOjw8JO5WaTEhPX5teb5Zd4NVL59HVa6NlARERksFNpDkG0Mv32/nILaDm4+M4NxSX3vKdsfvIlZ/U+szy/COvVMP1YpIiL9pdAegv68sYb39jXzrWkpzMoe0efXmZK9mGd+B+MnYV38LT9WKCIix0OhPcSs2t3A37bWct7YBBblO/v8OtPWgv3o0gMbqPxQG6iIiAxCCu0hZFNFK4+sreCUtGiumZna96Vdto391G+htgrHd7WBiojIYKXQHiJKGjtZ+k4pGXHh/PDszD4v7QIwr/0dNqzF+sqVWOO0gYqIyGCl0B4CGjvc3LW6hFCHd2lXTHjfh7bN9o2YF549sIHKl/xYpYiInCiFdpDr8tjc81Ypde1ufnJOFqmx4X1+ramrwX7iPm2gIiISJBTaQcwYw0PvV7Cjpp0bZqczIbnvS7uM24X9+19BV5c2UBERCRIK7SD2l801vF3cxOWnpHBmTly/XmueXwZ7dmoDFRGRIKLQDlJv7mlkxeZaFuTFc8nEvi/tArA/WI15859Yn7tIG6iIiAQRhXYQ2lrZxsNryzk5NZrvzUzr17NoV/FuzDMPw7iJ2kBFRCTIKLSDTFlTF798u4TU2HBuOzuTsJB+LO1qa6XxV7dDVIx3A5VQHacuIhJMFNpBpKnTw12r92NZ3qVdsRH9WNplDObZR/BUlnsDO6F/Q+oiIhJ4Cu0g4fLYLH27hKpWNz+ek0n6iL4v7QIwa1dj1r9DzNcWY42f5KcqRUTEnxTaQcAYw8NrK9ha1c71Z6STPzK6f6+vrsAs/z2MnUjMxVf4qUoREfE3hXYQeH5LLav3NnHZlGTmjOrn0i6PB/vJBwBwXHWjDgIREQliCu1B7u2iJpZvqmHu6Di+Ojmp3683r/4VCrdjXfY9rORUP1QoIiIDRaE9iG2vbuP/3i9n0sgofnB6/5Z2AZjdOzD/eA7rtHNwzJrrnyJFRGTAKLQHqfLmLu55q5SUmFBum5NFWEj//leZjjbsZfdDYjLWN77rpypFRGQgKbQHoZZOD3etLsEYw51zs4nrx9Kug8xzT0BNFY7FN2JFx/qhShERGWgK7UHG5TEsfaeUypYubp+TRUZc/5Z2AZgP38W89wbWF76i5V0iIkOIQnsQMcbw2PoKNle2ce3p6UxK7d/SLjhw3OYzj8CocVgXfs0PVYqISKD0aR/LDRs28NRTT2HbNvPnz2fRokU9vl9dXc2jjz5KU1MTsbGxLFmyhKSkQzOd29rauOmmm5g5cyZXXXWVbz/BEPL/ttWxancjX52cxLlj4vv9emPb3uVdHjeOq2/WNqUiIkNMrz1t27ZZtmwZP/7xj3nggQd47733KCkp6XHNM888w5w5c7jvvvv4yle+wvLly3t8f8WKFeTn5/u28iHmveImntlQzZzcOC6bknxc9zD/fhF2bsb62newUjN8XKGIiARar6FdWFhIWloaqamphIaGMnv2bNavX9/jmpKSEiZPngzApEmT+PDDD7u/t2fPHhobG5k6daqPSx86dta08+D75ZyUHMWSM/q/tAvAFO/GvPAsTD8D68wFfqhSREQCrdfx07q6uh5D3UlJSRQUFPS4Jjc3l3Xr1rFw4ULWrVtHe3s7zc3NxMTE8PTTT7NkyRI2b9581PdYtWoVq1atAmDp0qUkJx9fT/NoQkNDfX5PXylv6uCXb+8mOTaC+748hcTosH7fw3R2
UPvUAzjiE0i6/qc44o4+tD6Y2yIQ1B49qT0OUVv0pPboKVDt4ZOHnpdffjlPPvkkq1evJj8/H6fTicPh4PXXX2fatGk9Qv9IFixYwIIFh3qHNTU1viirW3Jyss/v6QutXR5+9HoxXR4Pvzg7C09bIzVt/b+P/edHMaX7cNx0F3VdLjjGZx2sbREoao+e1B6HqC16Unv05Ov2yMjo2yPNXkPb6XRSW1vb/XVtbS1Op/Owa2655RYAOjo6WLt2LTExMezatYvt27fz+uuv09HRgdvtJjIykm984xv9+SxDkts2/PqdUsqauvjZudlkxUcc133MxnWY1a9iff7LWPl6BCEiMpT1Gtp5eXmUl5dTVVWF0+lkzZo1XHfddT2uOThr3OFw8MILLzBv3jyAHtetXr2a3bt3K7DxLu16fH0lGyraWDIrjSlpMcd3n8Z67D/+H2SPxlr0TR9XKSIig02voR0SEsLixYu5++67sW2befPmkZ2dzYoVK8jLy2PGjBls27aN5cuXY1kW+fn5WtbVi5U76nitsIFLJjpZkJdwXPcwxmD/8bfQ2YHjO7dghfX/WbiIiAQXyxhjAl3EZ5WVlfn0foPpWcwH+5tZ+nYpZ+SM4NazMnAcx0xxAPuNlzHPPY512fdwzFvY59cNprYYDNQePak9DlFb9KT26ClQz7S1I9oAKqht5zfvlTE2KZIbzkg/7sA2pcWYvz0FJ8/AmvsFH1cpIiKDlUJ7gFS3urh7dQkJkSHccU4WEaHH1/TG1YX9xH0QFY3jyuuOa023iIgEJ4X2AKhudfGL1SV0eryndiVEHf9KO/P3p6G0GMe3b8CKO77n4SIiEpy0ObUfeWzDyzvrWb6pGtvAT87JIifh+JZ2AZitn2BWvYQ174tYJ5/qw0pFRCQYKLT9pKC2nd+trWBvfSenZsTw3ZmppMb2/5jNg0xzE/ZTD0J6NtZXrvRdoSIiEjQU2j7W5vLw7MYa/rmznoSoUH54dgazs0ec0LNnYwz20w9BazOO63+GFX78vXUREQleCm0fMcbwwf4WHv+wkvp2NwvHJ/CNqSnEhIec+L3feQ02rMX66lVY2aN9UK2IiAQjhbYPVLW4ePzDCtaXtjI6MYLb52QyPjnKJ/c2FSWYFX+A/KlY8y/0yT1FRCQ4KbRPgMc2/GNnHcs3ehfYL54+kgsmJBLi8M0yLON2Yf/hfgiLwLH4BiyHJvuLiAxnCu3jtLOmnUfXeSeazcyM5ZoZqYyM9e1Woual5VBciOO/b8dKOPZJaSIiMvQptPuptcvDsxureXVXA86oUG6bk8msrFifb3Jidm7G/OvvWGd/Hmv6GT69t4iIBCeFdh8ZY1izr5knPqqiod3NFyck8o2pyUSHnfhEs8Peq7UFe9kDMDID69KrfX5/EREJTgrtPqhs6eL36yv5qKyVMYkR/OScTMYl+Wai2WcZYzDPPgJN9Thu+zVWRKRf3kdERIKPQvsY3LbhpR11/GVTDQ7L9xPNjsS8/ybmw3exvnw51qhxfnsfEREJPgrto9hR3c4j6yoobujk9KxYvjMjlZQY/55ZbarKMct/D+MnYZ1/sV/fS0REgo9C+zNaujw8u6GafxU04IwO5fY5mczKHuH39zUeD/aTD4DDgWPxTVgO3z8rFxGR4KbQPsAYw7vFzSz7qJLGTg8XnJTIZVP8M9HsiO//yvOwewfWNbdiJaUMyHuKiEhwUWjjnWj22LpKPi5vJc8ZyR1zsxmbNHATwEzhdszLK7BmzcMx8+wBe18REQkuwzq03bbhxe11rNhcg8OyuPrUkSwc79+JZp9l2tuwl90PSSlYl313wN5XRESCz7AN7e3VbTy6tpLixk5mZXsnmiVH+3ei2ZGYv/weaqtx/PCXWFHRA/7+IiISPIZdaLd0enh6QzWvFTaQHB3Kj8/J5PQs/080OxJ7/TuY99/EuvBrWGPzA1KDiIgEj2ET2sYY3jkw0ayp08NFJyXy9SkpRIUF5hAO
U1vt3URlzASsL14akBpERCS4DIvQLmlo55dvlrChvJVxSZH8z7xsxjgDt9OYsQ8s7/LYOK66CStEy7tERKR3Qzq0XR7Di9treX7LLkIsuGZGKuePSxjQiWZHYl57AXZtwfr29Vgj0wNai4iIBI8hHdrtLg8rd9Qze3QiV5ycQFIAJpp9likuxKz8M9aMs7DOODfQ5YiISBAZ0qEdFxnKbxeOYkJOOjU1NYEuB9PZgf3EbyAuEeub3/f5cZ4iIjK0BWYW1gAaDL3rg8zzy6CqDMdVN2LFxAa6HBERCTJDPrQHC/PJB5i3X8M672KsCScHuhwREQlCCu0BYFpbsJ9+CHLysC66LNDliIhIkFJoD4RdW6ClGcelV2GFDp7hehERCS4K7QFgigohJARGjQt0KSIiEsQU2gPAFBdARg5WeESgSxERkSCm0PYzYwwUFWKply0iIidIoe1vNZXQ2gy5YwNdiYiIBDmFtr8VFwJgjVJoi4jIiVFo+5kpKoTQUMjIDXQpIiIS5BTafmaKCyFzFFaYlnqJiMiJUWj7kbFtKN6NNVqT0ERE5MQptP2pugLaWzUJTUREfEKh7UemqADQJDQREfENhbY/FRVCWDik5wS6EhERGQIU2n5kigsgZwxWSEigSxERkSFAoe0nxvbAvj1Yep4tIiI+otD2l4pS6OzQJDQREfEZhbafmKIDO6FpuZeIiPiIQttfigshIgpSMwJdiYiIDBEKbT8xRQWQOwbLoUloIiLiGwptPzBuN+zfq0loIiLiUwptfyjfD64uTUITERGfUmj7waGd0DQJTUREfEeh7Q/FhRAVAylpga5ERESGEIW2H5iiQhg1Fsuh5hUREd9RqviYcbmgpEiT0ERExOcU2r5WWgQeNzrZS0REfE2h7WMHd0LTzHEREfE1hbavFRdC7AhIGhnoSkREZIhRaPuYKSqE3LFYlhXoUkREZIhRaPuQ6eqEsmKsXK3PFhER31No+9L+vWDbaBKaiIj4g0Lbh0zxgUlo2glNRET8QKHtS0UFEJ8ICc5AVyIiIkOQQtuHNAlNRET8SaHtI6ajHSpKtBOaiIj4jULbV/btAWPQJDQREfEXhbaPdE9CU09bRET8RKHtK0WFkJiMFZ8Y6EpERGSIUmj7iCn2HscpIiLiLwptHzBtLVBZqkloIiLiVwptXyjeDYClTVVERMSPFNo+cGgSWl5gCxERkSFNoe0LRYWQnIoVGxfoSkREZAhTaPuAKS7U82wREfE7hfYJMi1NUFOpmeMiIuJ3Cu0TpUloIiIyQBTaJ8gUFXh/kqNJaCIi4l8K7RNkigohNRMrOibQpYiIyBCn0D5RmoQmIiIDJLQvF23YsIGnnnoK27aZP38+ixYt6vH96upqHn30UZqamoiNjWXJkiUkJSVRVFTEE088QXt7Ow6Hg4svvpjZs2f75YMEgmmsh/oaTUITEZEB0Wto27bNsmXLuOOOO0hKSuL2229nxowZZGVldV/zzDPPMGfOHObOncuWLVtYvnw5S5YsITw8nB/84Aekp6dTV1fHbbfdxtSpU4mJGSJDyQc2VVFPW0REBkKvw+OFhYWkpaWRmppKaGgos2fPZv369T2uKSkpYfLkyQBMmjSJDz/8EICMjAzS09MBcDqdxMfH09TU5OvPEDCmqBAsB+SMCXQpIiIyDPTa066rqyMpKan766SkJAoKCnpck5uby7p161i4cCHr1q2jvb2d5uZmRowY0X1NYWEhbreb1NTUw95j1apVrFq1CoClS5eSnJx83B/oSEJDQ31+T4D68n14snJJzsr2+b39xV9tEazUHj2pPQ5RW/Sk9ugpUO3Rp2favbn88st58sknWb16Nfn5+TidThyOQ534+vp6HnroIa699toev37QggULWLBgQffXNTU1viirW3Jyss/vaYzBLtiGNXGaz+/tT/5oi2Cm9uhJ7XGI2qIntUdPvm6PjIyMPl3Xa2g7nU5qa2u7v66trcXpdB52zS23
3AJAR0cHa9eu7X5u3dbWxtKlS/n617/O+PHj+/wBBr36Wmis1yQ0EREZML0+087Ly6O8vJyqqircbjdr1qxhxowZPa5pamrCtm0AXnjhBebNmweA2+3mvvvuY86cOcyaNcsP5QeQJqGJiMgA67WnHRISwuLFi7n77ruxbZt58+aRnZ3NihUryMvLY8aMGWzbto3ly5djWRb5+flcddVVAKxZs4bt27fT3NzM6tWrAbj22msZNWqUPz/TgDBFheBwQPboQJciIiLDRJ+eaU+fPp3p06f3+LVLL720++ezZs06Yk96zpw5zJkz5wRLHJxMcQFk5GKFRwS6FBERGSa0I9pxMMZ4d0LT82wRERlACu3jUVsFLc2gk71ERGQAKbSPx8FJaOppi4jIAFJoHweztwBCQyEjN9CliIjIMKLQPg6muBAyR2GFhQW6FBERGUYU2v1kbBuKd6OhcRERGWgK7f6qroD2VtCmKiIiMsAU2v1kiryHpViaOS4iIgNMod1fxYUQFg4ZOYGuREREhhmFdj+Z4kLIGYMVEhLoUkREZJhRaPeDsT3eSWh6ni0iIgGg0O6PilLo7NAkNBERCQiFdj+YIu2EJiIigaPQ7o/iQoiIhLTMQFciIiLDkEK7H7onoTk0CU1ERAaeQruPjMcD+/Zg5Wp9toiIBIZCu6/K94GrC/Q8W0REAkSh3Udmr3ZCExGRwFJo91VxIUTFQEpaoCsREZFhSqHdR6aoEHLzsBxqMhERCQwlUB8YlwtKirQTmoiIBJRCuy/KisHjRpuqiIhIICm0++DgTmjavlRERAJJod0XxYUQOwKSUwNdiYiIDGMK7T4wewsgdyyWZQW6FBERGcYU2r0wXZ1QVqyd0EREJOAU2r3ZvxdsG01CExGRQFNo98IUaxKaiIgMDgrt3hQVQlwCJCYFuhIRERnmFNq9MMWFmoQmIio39acAAA0BSURBVCKDgkL7GExHO5SXoENCRP5/e/cbU2X9/3H8dXmO4J/zBTmHAEXEJN1XU2D8cDo2U9B1w9XmzNVqubnaWtJytfVbxo3WHZ0rGc5Noxtmzc2t5tRWq7U5h22yJcJwFlmc4hwj+YVw8M9R0A7n+t0QyaMo/uVzrnM9H7c4eMl5Xe8bvLiu87muC0AyoLTv5PQfks0iNABAcqC074BFaACAZEJp30koKGVly8rMMp0EAABK+06uL0IDACAZUNq3YV++JP39l/g8GwCQLCjt2zn9uyTxDG0AQNKgtG9jeBEaR9oAgCRBad9OR7uUnSvLl2E6CQAAkijt27LDQU6NAwCSCqU9Ajt6Qer5m1PjAICkQmmPJMwiNABA8qG0R2CH2q99UVhkNggAADegtEdgh4NSzjRZk3ymowAAMIzSHkk4KJ7sBQBINpT2TewLfVKkh0VoAICkQ2nfLHTtpiosQgMAJBtK+yZ2KChZljRjlukoAAAkoLRvYoeDUt50WRMmmo4CAEACSvsGtm0PLULj1DgAIPlQ2jc6F5HO90mFrBwHACQfSvtG4Ws3VeFIGwCQjCjtG9ihoDRunFTwuOkoAADcgtK+gR1ql6YVykpLNx0FAIBbUNpDWIQGAEh2lPZ1vd1S9KLETVUAAEmK0r4uPHQnNI60AQBJitIeYoeCkscr5c80HQUAgBFR2kPscFCaPlPW+PGmowAAMCJKW0OL0EIsQgMAJDdKW5K6u6T+SyxCAwAkNUpbQ9dnS7JmcvtSAEDyorSlayvHx6dJUwtMJwEA4LYobQ0tQit4XJbXazoKAAC35frStuODUvgPWXyeDQBIcq4vbf19RrrSL7FyHACQ5Fxf2nbo+p3QWIQGAEhuri9thdql9AlSXr7pJAAA3JHrS9sOB6UZs2SN85iOAgDAHbm6tO3BQen0H7IKOTUOAEh+ri5tdZ2W/rnKIjQAgCO4urSHF6FxuRcAwAFcXdoKB6WJk6ScqaaTAAAwKleXth0KSoVPyBrn6jEAABzCtW1lx/6ROjs4NQ4A
cAzXlrb+CkuxmHiGNgDAKVxb2tcXofEMbQCAU9zVY61aW1u1e/duxeNxLV++XKtWrUr497Nnz+rjjz/WhQsX5PP59OabbyoQCEiSGhoatH//fknS6tWrtWzZsoe7B/crHJQm/0fKzjWdBACAuzLqkXY8HteuXbtUU1Ojuro6HT16VJ2dnQnb7NmzR0899ZS2bt2qNWvWaO/evZKkaDSqffv2afPmzdq8ebP27dunaDT6aPbkHtmh9muL0CzLdBQAAO7KqKUdDAaVl5en3Nxceb1eVVRUqKmpKWGbzs5OzZ8/X5L05JNP6vjx45KuHaEXFxfL5/PJ5/OpuLhYra2tj2A37o199Yp05rT4PBsA4CSjnh6PRCLDp7olKRAIqL29PWGbwsJCHTt2TCtXrtSxY8fU39+vixcv3vJ//X6/IpHILe9x6NAhHTp0SJK0ZcsWZWdn3/cOjcTr9Sb8zH9++1mRwUFlFJdpwkN+r2R38yzcjnkkYh7/YhaJmEciU/O4q8+0R7N27Vp9+umnamho0Ny5c+X3+zXuHq59XrFihVasWDH8uqen52HEGpadnZ3wM+Ot184UXMzKVfQhv1eyu3kWbsc8EjGPfzGLRMwj0cOex7Rp0+5qu1FL2+/3q7e3d/h1b2+v/H7/Ldu88847kqSBgQH9+OOPmjx5svx+v9ra2oa3i0Qimjdv3l0Fe6RCQSljipQVGH1bAACSxKiHw0VFRerq6lJ3d7disZgaGxtVXl6esM2FCxcUj8clSQcOHFBlZaUkqbS0VCdOnFA0GlU0GtWJEydUWlr6CHbj3tjhIIvQAACOM+qRtsfj0SuvvKJNmzYpHo+rsrJSBQUF+uKLL1RUVKTy8nK1tbVp7969sixLc+fO1auvvipJ8vl8eu655/Tee+9JktasWSOfz/do92gU9kC/1NUp638qjOYAAOBe3dVn2mVlZSorK0v43gsvvDD89eLFi7V48eIR/29VVZWqqqoeIOJD9meHZMd5hjYAwHFcd0c0Ozy08p3LvQAADuO60lYoKGVly8rMMp0EAIB74rrSvv44TgAAnMZVpW1fviT9/Ze4ExoAwIlcVdo6/bsk8QxtAIAjuaq07TCP4wQAOJerSluhoBTIkfWfDNNJAAC4Z64qbTsc5FIvAIBjuaa07UsXpbP/J2smN1UBADiTa0pbQ59nswgNAOBUriltu2PoTmiFRWaDAABwn9xT2uGglDNN1iSzDywBAOB+uaa0FQ6Km6oAAJzMFaU9eC4iRXq4PhsA4GiuKO3Y76ckSRxpAwCczBWl/U/wlGRZ0gwWoQEAnMsdpf37KSlvuqwJE01HAQDgvqV8adu2rVjwF3FqHADgdClf2joXUbyvVyrkTmgAAGdL/dIOX7upCkfaAACnS/nStkNBaZxHmv646SgAADyQ1C/tcFDeGY/LSk83HQUAgAeS0qVt27YUCspb9F/TUQAAeGApXdoa6JemzVDavBLTSQAAeGBe0wEeJWviJHn+d7MmZmfrUk+P6TgAADyQ1D7SBgAghVDaAAA4BKUNAIBDUNoAADgEpQ0AgENQ2gAAOASlDQCAQ1DaAAA4BKUNAIBDUNoAADgEpQ0AgENQ2gAAOASlDQCAQ1DaAAA4BKUNAIBDUNoAADgEpQ0AgENQ2gAAOIRl27ZtOgQAABidK460N27caDpC0mAWiZhHIubxL2aRiHkkMjUPV5Q2AACpgNIGAMAhPB988MEHpkOMhVmzZpmOkDSYRSLmkYh5/ItZJGIeiUzMg4VoAAA4BKfHAQBwCEobAACH8JoO8Ci1trZq9+7disfjWr58uVatWmU6kjE9PT3asWOHzp07J8uytGLFCq1cudJ0LKPi8bg2btwov9/v+stZLl26pPr6ev3555+yLEvr16/XnDlzTMcy5ptvvtHhw4dlWZYKCgpUXV2ttLQ007HGzM6dO9XS0qLMzEzV1tZKkqLRqOrq6nT27Fk99thjevvtt+Xz+QwnffRGmsWe
PXvU3Nwsr9er3NxcVVdXa/LkyWOSJ2WPtOPxuHbt2qWamhrV1dXp6NGj6uzsNB3LGI/Ho7Vr16qurk6bNm3S999/7+p5SNK3336r/Px80zGSwu7du1VaWqpt27bpo48+cvVcIpGIvvvuO23ZskW1tbWKx+NqbGw0HWtMLVu2TDU1NQnfO3jwoBYsWKDt27drwYIFOnjwoKF0Y2ukWRQXF6u2tlZbt27V1KlTdeDAgTHLk7KlHQwGlZeXp9zcXHm9XlVUVKipqcl0LGOysrKGVzpOnDhR+fn5ikQihlOZ09vbq5aWFi1fvtx0FOMuX76sX375RVVVVZIkr9c7ZkcNySoej+vq1asaHBzU1atXlZWVZTrSmJo3b94tR9FNTU1aunSpJGnp0qWu+X060ixKSkrk8XgkSXPmzBnT36Upe3o8EokoEAgMvw4EAmpvbzeYKHl0d3ero6NDTzzxhOkoxnz22Wd6+eWX1d/fbzqKcd3d3crIyNDOnTsVDoc1a9YsrVu3ThMmTDAdzQi/369nn31W69evV1pamkpKSlRSUmI6lnHnz58f/uNlypQpOn/+vOFEyeHw4cOqqKgYs/dL2SNtjGxgYEC1tbVat26dJk2aZDqOEc3NzcrMzOSa0yGDg4Pq6OjQ008/rQ8//FDp6emuOfU5kmg0qqamJu3YsUOffPKJBgYG9MMPP5iOlVQsy5JlWaZjGLd//355PB4tWbJkzN4zZUvb7/ert7d3+HVvb6/8fr/BRObFYjHV1tZqyZIlWrRokek4xvz66686fvy43njjDW3btk0//fSTtm/fbjqWMYFAQIFAQLNnz5YkLV68WB0dHYZTmXPy5Enl5OQoIyNDXq9XixYt0m+//WY6lnGZmZnq6+uTJPX19SkjI8NwIrMaGhrU3NysDRs2jOkfMClb2kVFRerq6lJ3d7disZgaGxtVXl5uOpYxtm2rvr5e+fn5euaZZ0zHMeqll15SfX29duzYobfeekvz58/Xhg0bTMcyZsqUKQoEAjpz5oyka6U1ffp0w6nMyc7OVnt7u65cuSLbtnXy5ElXL8y7rry8XEeOHJEkHTlyRAsXLjScyJzW1lZ99dVXevfdd5Wenj6m753Sd0RraWnR559/rng8rsrKSq1evdp0JGNOnTql999/XzNmzBj+q/DFF19UWVmZ4WRm/fzzz/r6669df8lXKBRSfX29YrGYcnJyVF1d7YrLeW7nyy+/VGNjozwej2bOnKnXX39d48ePNx1rzGzbtk1tbW26ePGiMjMz9fzzz2vhwoWqq6tTT0+Pqy75GmkWBw4cUCwWG97/2bNn67XXXhuTPCld2gAApJKUPT0OAECqobQBAHAIShsAAIegtAEAcAhKGwAAh6C0AQBwCEobAACH+H/VLD2UvubkeAAAAABJRU5ErkJggg==\n", 1080 | "text/plain": [ 1081 | "
# --- Training curves ---------------------------------------------------------
# Train vs. validation CRF Viterbi accuracy per epoch. Each curve is labelled
# and the axes are titled so the figure can be read without the code cell.
plt.style.use("ggplot")
fig, ax = plt.subplots(figsize=(8, 8))
ax.plot(hist["crf_viterbi_accuracy"], label="train")
ax.plot(hist["val_crf_viterbi_accuracy"], label="validation")
ax.set_title("CRF Viterbi accuracy per epoch")
ax.set_xlabel("epoch (0-based)")
ax.set_ylabel("accuracy")
ax.legend()
plt.show()

# --- Evaluation metrics on the validation set --------------------------------
# (This heading lived in a code cell that contained only a comment; it reads
# better as a markdown cell.)

# Shape of the first model input for the validation split.
X1_test.shape

# Per-token class scores for the validation split; decoded to tag strings by
# pred2label in a later cell.
test_pred = model.predict([X1_test, X2_test], verbose=1)
def pred2label(pred, index_to_tag=None):
    """Decode per-token class scores into tag strings.

    Parameters
    ----------
    pred : sequence of array-likes
        One entry per sentence; each entry holds one score vector per token
        (model output or one-hot targets). A single 3-D array works as well
        as a list of 2-D arrays.
    index_to_tag : mapping, optional
        Maps a class index to its tag string. When omitted, the notebook-level
        ``idx2tag`` dictionary is used, which keeps existing calls unchanged.

    Returns
    -------
    list of list of str
        For every sentence, the tag with the highest score at each token.
        The padding tag "-PAD-" is reported as "O" so that padded positions
        count as "outside" during evaluation.
    """
    mapping = idx2tag if index_to_tag is None else index_to_tag

    decoded = []
    for sentence_scores in pred:
        scores = np.asarray(sentence_scores)
        if scores.size == 0:
            # np.argmax raises on empty input; an empty sentence has no tags.
            decoded.append([])
            continue
        # One vectorized argmax per sentence instead of one call per token.
        tag_ids = np.argmax(scores, axis=-1)
        decoded.append(
            [mapping[int(tag_id)].replace("-PAD-", "O") for tag_id in tag_ids]
        )
    return decoded
"print(\"F1-score: {:.1%}\".format(f1_score(test_labels, pred_labels)))" 1252 | ] 1253 | }, 1254 | { 1255 | "cell_type": "code", 1256 | "execution_count": 42, 1257 | "metadata": { 1258 | "ExecuteTime": { 1259 | "end_time": "2019-09-05T03:57:46.851990Z", 1260 | "start_time": "2019-09-05T03:57:44.906277Z" 1261 | } 1262 | }, 1263 | "outputs": [ 1264 | { 1265 | "name": "stdout", 1266 | "output_type": "stream", 1267 | "text": [ 1268 | " precision recall f1-score support\n", 1269 | "\n", 1270 | " B 0.87 0.90 0.89 2093\n", 1271 | " I 0.77 0.82 0.79 2089\n", 1272 | "\n", 1273 | "micro avg 0.82 0.86 0.84 4182\n", 1274 | "macro avg 0.82 0.86 0.84 4182\n", 1275 | "\n" 1276 | ] 1277 | } 1278 | ], 1279 | "source": [ 1280 | "print(classification_report(test_labels, pred_labels))" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "markdown", 1285 | "metadata": {}, 1286 | "source": [ 1287 | "### 验证个例" 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "code", 1292 | "execution_count": 43, 1293 | "metadata": { 1294 | "ExecuteTime": { 1295 | "end_time": "2019-09-05T03:57:46.860554Z", 1296 | "start_time": "2019-09-05T03:57:46.854047Z" 1297 | } 1298 | }, 1299 | "outputs": [ 1300 | { 1301 | "name": "stdout", 1302 | "output_type": "stream", 1303 | "text": [ 1304 | "['3c27e9bb']\n", 1305 | "('3c27e9bb', 'ICA亚投链矿机投资配股拆分理财开发找13郑286婷015微737电。ICA亚投链网上投资理财app开发、ICA亚投链虚拟币投资分红模式开发、ICA亚投链理财平台网页版定制开发、ICA亚投链区块链分红系统程序开发区块链技术的定义是什么。ICA亚投链模式制度介绍 注册免费送12万台矿机 数量有限 送完为止。ICA亚投链全球恒量发行1 5亿枚 永不增发 开盘价0 2美元 预计开盘交易黑市价不低于2美元 市场更是一币难求 采取有效的控盘机制。ICA亚投链全球恒量发行1 5亿枚 永不增发。ICA亚投链资质背书 双加密网址 独立开源代码 采用POW智能合约 POS算力 对接商城 话费 水电费充值 中石油、中石化充值 ICA亚投链推广奖励 矿工公会ICA亚投链奖励 一代 拿直推收益总产量的5%低于直推算力 收益减半二代 3%三代 1%一星会长 直推10人 团队100人 工会算力 总算力达到20GH S 送小型矿机一台', [[2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], 
[3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [2], [3], [3], [3], [3], [3], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])\n", 1306 | "[[0. 0. 1. 0.]\n", 1307 | " [0. 0. 0. 1.]\n", 1308 | " [0. 0. 0. 1.]\n", 1309 | " ...\n", 1310 | " [1. 0. 0. 0.]\n", 1311 | " [1. 0. 0. 0.]\n", 1312 | " [1. 0. 0. 
0.]]\n" 1313 | ] 1314 | } 1315 | ], 1316 | "source": [ 1317 | "# 随机抽样\n", 1318 | "sample_id = random.sample(range(len(id_test)), 1)[0]\n", 1319 | "sample_X1 = X1_test[sample_id]\n", 1320 | "sample_X2 = X2_test[sample_id]\n", 1321 | "tid = id_test[sample_id][0]\n", 1322 | "sample_text_id = text_id_test[sample_id]\n", 1323 | "print(sample_text_id)\n", 1324 | "sample_data = train_data[tid]\n", 1325 | "print(sample_data)\n", 1326 | "sample_Y = Y_test[sample_id]\n", 1327 | "print(sample_Y)" 1328 | ] 1329 | }, 1330 | { 1331 | "cell_type": "code", 1332 | "execution_count": 44, 1333 | "metadata": { 1334 | "ExecuteTime": { 1335 | "end_time": "2019-09-05T03:57:46.870274Z", 1336 | "start_time": "2019-09-05T03:57:46.862318Z" 1337 | } 1338 | }, 1339 | "outputs": [ 1340 | { 1341 | "data": { 1342 | "text/plain": [ 1343 | "(512,)" 1344 | ] 1345 | }, 1346 | "execution_count": 44, 1347 | "metadata": {}, 1348 | "output_type": "execute_result" 1349 | } 1350 | ], 1351 | "source": [ 1352 | "sample_X1.shape" 1353 | ] 1354 | }, 1355 | { 1356 | "cell_type": "code", 1357 | "execution_count": 45, 1358 | "metadata": { 1359 | "ExecuteTime": { 1360 | "end_time": "2019-09-05T03:57:47.555336Z", 1361 | "start_time": "2019-09-05T03:57:46.871773Z" 1362 | } 1363 | }, 1364 | "outputs": [ 1365 | { 1366 | "name": "stdout", 1367 | "output_type": "stream", 1368 | "text": [ 1369 | "(1, 512, 4)\n" 1370 | ] 1371 | } 1372 | ], 1373 | "source": [ 1374 | "predict = model.predict([sample_X1.reshape([1, -1]), sample_X2.reshape([1, -1])])\n", 1375 | "print(predict.shape)" 1376 | ] 1377 | }, 1378 | { 1379 | "cell_type": "code", 1380 | "execution_count": 46, 1381 | "metadata": { 1382 | "ExecuteTime": { 1383 | "end_time": "2019-09-05T03:57:47.563364Z", 1384 | "start_time": "2019-09-05T03:57:47.558356Z" 1385 | } 1386 | }, 1387 | "outputs": [], 1388 | "source": [ 1389 | "pred = np.argmax(predict, axis=-1).reshape([-1])\n", 1390 | "true = np.argmax(sample_Y, axis=-1)" 1391 | ] 1392 | }, 1393 | { 1394 | "cell_type": 
"code", 1395 | "execution_count": 47, 1396 | "metadata": { 1397 | "ExecuteTime": { 1398 | "end_time": "2019-09-05T03:57:47.573980Z", 1399 | "start_time": "2019-09-05T03:57:47.565160Z" 1400 | } 1401 | }, 1402 | "outputs": [], 1403 | "source": [ 1404 | "pred_label = [idx2tag[i] for i in pred]\n", 1405 | "true_label = [idx2tag[i] for i in true]" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 48, 1411 | "metadata": { 1412 | "ExecuteTime": { 1413 | "end_time": "2019-09-05T03:57:47.624632Z", 1414 | "start_time": "2019-09-05T03:57:47.576187Z" 1415 | } 1416 | }, 1417 | "outputs": [ 1418 | { 1419 | "name": "stdout", 1420 | "output_type": "stream", 1421 | "text": [ 1422 | "I : B B\n", 1423 | "C : I I\n", 1424 | "A : I I\n", 1425 | "亚 : I I\n", 1426 | "投 : I I\n", 1427 | "链 : I I\n", 1428 | "矿 : O O\n", 1429 | "机 : O O\n", 1430 | "投 : O O\n", 1431 | "资 : O O\n", 1432 | "配 : O O\n", 1433 | "股 : O O\n", 1434 | "拆 : O O\n", 1435 | "分 : O O\n", 1436 | "理 : O O\n", 1437 | "财 : O O\n", 1438 | "开 : O O\n", 1439 | "发 : O O\n", 1440 | "找 : O O\n", 1441 | "1 : O O\n", 1442 | "3 : O O\n", 1443 | "郑 : O O\n", 1444 | "2 : O O\n", 1445 | "8 : O O\n", 1446 | "6 : O O\n", 1447 | "婷 : O O\n", 1448 | "0 : O O\n", 1449 | "1 : O O\n", 1450 | "5 : O O\n", 1451 | "微 : O O\n", 1452 | "7 : O O\n", 1453 | "3 : O O\n", 1454 | "7 : O O\n", 1455 | "电 : O O\n", 1456 | "。 : O O\n", 1457 | "I : B B\n", 1458 | "C : I I\n", 1459 | "A : I I\n", 1460 | "亚 : I I\n", 1461 | "投 : I I\n", 1462 | "链 : I I\n", 1463 | "网 : O O\n", 1464 | "上 : O O\n", 1465 | "投 : O O\n", 1466 | "资 : O O\n", 1467 | "理 : O O\n", 1468 | "财 : O O\n", 1469 | "a : O O\n", 1470 | "p : O O\n", 1471 | "p : O O\n", 1472 | "开 : O O\n", 1473 | "发 : O O\n", 1474 | "、 : O O\n", 1475 | "I : B B\n", 1476 | "C : I I\n", 1477 | "A : I I\n", 1478 | "亚 : I I\n", 1479 | "投 : I I\n", 1480 | "链 : I I\n", 1481 | "虚 : O O\n", 1482 | "拟 : O O\n", 1483 | "币 : O O\n", 1484 | "投 : O O\n", 1485 | "资 : O O\n", 1486 | "分 : O O\n", 
1487 | "红 : O O\n", 1488 | "模 : O O\n", 1489 | "式 : O O\n", 1490 | "开 : O O\n", 1491 | "发 : O O\n", 1492 | "、 : O O\n", 1493 | "I : B B\n", 1494 | "C : I I\n", 1495 | "A : I I\n", 1496 | "亚 : I I\n", 1497 | "投 : I I\n", 1498 | "链 : I I\n", 1499 | "理 : O O\n", 1500 | "财 : O O\n", 1501 | "平 : O O\n", 1502 | "台 : O O\n", 1503 | "网 : O O\n", 1504 | "页 : O O\n", 1505 | "版 : O O\n", 1506 | "定 : O O\n", 1507 | "制 : O O\n", 1508 | "开 : O O\n", 1509 | "发 : O O\n", 1510 | "、 : O O\n", 1511 | "I : B B\n", 1512 | "C : I I\n", 1513 | "A : I I\n", 1514 | "亚 : I I\n", 1515 | "投 : I I\n", 1516 | "链 : I I\n", 1517 | "区 : O O\n", 1518 | "块 : O O\n", 1519 | "链 : O O\n", 1520 | "分 : O O\n", 1521 | "红 : O O\n", 1522 | "系 : O O\n", 1523 | "统 : O O\n", 1524 | "程 : O O\n", 1525 | "序 : O O\n", 1526 | "开 : O O\n", 1527 | "发 : O O\n", 1528 | "区 : O O\n", 1529 | "块 : O O\n", 1530 | "链 : O O\n", 1531 | "技 : O O\n", 1532 | "术 : O O\n", 1533 | "的 : O O\n", 1534 | "定 : O O\n", 1535 | "义 : O O\n", 1536 | "是 : O O\n", 1537 | "什 : O O\n", 1538 | "么 : O O\n", 1539 | "。 : O O\n", 1540 | "I : B B\n", 1541 | "C : I I\n", 1542 | "A : I I\n", 1543 | "亚 : I I\n", 1544 | "投 : I I\n", 1545 | "链 : I I\n", 1546 | "模 : O O\n", 1547 | "式 : O O\n", 1548 | "制 : O O\n", 1549 | "度 : O O\n", 1550 | "介 : O O\n", 1551 | "绍 : O O\n", 1552 | " : O O\n", 1553 | "注 : O O\n", 1554 | "册 : O O\n", 1555 | "免 : O O\n", 1556 | "费 : O O\n", 1557 | "送 : O O\n", 1558 | "1 : O O\n", 1559 | "2 : O O\n", 1560 | "万 : O O\n", 1561 | "台 : O O\n", 1562 | "矿 : O O\n", 1563 | "机 : O O\n", 1564 | " : O O\n", 1565 | "数 : O O\n", 1566 | "量 : O O\n", 1567 | "有 : O O\n", 1568 | "限 : O O\n", 1569 | " : O O\n", 1570 | "送 : O O\n", 1571 | "完 : O O\n", 1572 | "为 : O O\n", 1573 | "止 : O O\n", 1574 | "。 : O O\n", 1575 | "I : B B\n", 1576 | "C : I I\n", 1577 | "A : I I\n", 1578 | "亚 : I I\n", 1579 | "投 : I I\n", 1580 | "链 : I I\n", 1581 | "全 : O O\n", 1582 | "球 : O O\n", 1583 | "恒 : O O\n", 1584 | "量 : O O\n", 1585 | "发 : O O\n", 1586 | "行 : O O\n", 
1587 | "1 : O O\n", 1588 | " : O O\n", 1589 | "5 : O O\n", 1590 | "亿 : O O\n", 1591 | "枚 : O O\n", 1592 | " : O O\n", 1593 | "永 : O O\n", 1594 | "不 : O O\n", 1595 | "增 : O O\n", 1596 | "发 : O O\n", 1597 | " : O O\n", 1598 | "开 : O O\n", 1599 | "盘 : O O\n", 1600 | "价 : O O\n", 1601 | "0 : O O\n", 1602 | " : O O\n", 1603 | "2 : O O\n", 1604 | "美 : O O\n", 1605 | "元 : O O\n", 1606 | " : O O\n", 1607 | "预 : O O\n", 1608 | "计 : O O\n", 1609 | "开 : O O\n", 1610 | "盘 : O O\n", 1611 | "交 : O O\n", 1612 | "易 : O O\n", 1613 | "黑 : O O\n", 1614 | "市 : O O\n", 1615 | "价 : O O\n", 1616 | "不 : O O\n", 1617 | "低 : O O\n", 1618 | "于 : O O\n", 1619 | "2 : O O\n", 1620 | "美 : O O\n", 1621 | "元 : O O\n", 1622 | " : O O\n", 1623 | "市 : O O\n", 1624 | "场 : O O\n", 1625 | "更 : O O\n", 1626 | "是 : O O\n", 1627 | "一 : O O\n", 1628 | "币 : O O\n", 1629 | "难 : O O\n", 1630 | "求 : O O\n", 1631 | " : O O\n", 1632 | "采 : O O\n", 1633 | "取 : O O\n", 1634 | "有 : O O\n", 1635 | "效 : O O\n", 1636 | "的 : O O\n", 1637 | "控 : O O\n", 1638 | "盘 : O O\n", 1639 | "机 : O O\n", 1640 | "制 : O O\n", 1641 | "。 : O O\n", 1642 | "I : B B\n", 1643 | "C : I I\n", 1644 | "A : I I\n", 1645 | "亚 : I I\n", 1646 | "投 : I I\n", 1647 | "链 : I I\n", 1648 | "全 : O O\n", 1649 | "球 : O O\n", 1650 | "恒 : O O\n", 1651 | "量 : O O\n", 1652 | "发 : O O\n", 1653 | "行 : O O\n", 1654 | "1 : O O\n", 1655 | " : O O\n", 1656 | "5 : O O\n", 1657 | "亿 : O O\n", 1658 | "枚 : O O\n", 1659 | " : O O\n", 1660 | "永 : O O\n", 1661 | "不 : O O\n", 1662 | "增 : O O\n", 1663 | "发 : O O\n", 1664 | "。 : O O\n", 1665 | "I : B B\n", 1666 | "C : I I\n", 1667 | "A : I I\n", 1668 | "亚 : I I\n", 1669 | "投 : I I\n", 1670 | "链 : I I\n", 1671 | "资 : O O\n", 1672 | "质 : O O\n", 1673 | "背 : O O\n", 1674 | "书 : O O\n", 1675 | " : O O\n", 1676 | "双 : O O\n", 1677 | "加 : O O\n", 1678 | "密 : O O\n", 1679 | "网 : O O\n", 1680 | "址 : O O\n", 1681 | " : O O\n", 1682 | "独 : O O\n", 1683 | "立 : O O\n", 1684 | "开 : O O\n", 1685 | "源 : O O\n", 1686 | "代 : O O\n", 1687 | "码 
: O O\n", 1688 | " : O O\n", 1689 | "采 : O O\n", 1690 | "用 : O O\n", 1691 | "P : O O\n", 1692 | "O : O O\n", 1693 | "W : O O\n", 1694 | "智 : O O\n", 1695 | "能 : O O\n", 1696 | "合 : O O\n", 1697 | "约 : O O\n", 1698 | " : O O\n", 1699 | "P : O O\n", 1700 | "O : O O\n", 1701 | "S : O O\n", 1702 | "算 : O O\n", 1703 | "力 : O O\n", 1704 | " : O O\n", 1705 | "对 : O O\n", 1706 | "接 : O O\n", 1707 | "商 : O O\n", 1708 | "城 : O O\n", 1709 | " : O O\n", 1710 | "话 : O O\n", 1711 | "费 : O O\n", 1712 | " : O O\n", 1713 | "水 : O O\n", 1714 | "电 : O O\n", 1715 | "费 : O O\n", 1716 | "充 : O O\n", 1717 | "值 : O O\n", 1718 | " : O O\n", 1719 | "中 : O O\n", 1720 | "石 : O O\n", 1721 | "油 : O O\n", 1722 | "、 : O O\n", 1723 | "中 : O O\n", 1724 | "石 : O O\n", 1725 | "化 : O O\n", 1726 | "充 : O O\n", 1727 | "值 : O O\n", 1728 | " : O O\n", 1729 | "I : B B\n", 1730 | "C : I I\n", 1731 | "A : I I\n", 1732 | "亚 : I I\n", 1733 | "投 : I I\n", 1734 | "链 : I I\n", 1735 | "推 : O O\n", 1736 | "广 : O O\n", 1737 | "奖 : O O\n", 1738 | "励 : O O\n", 1739 | " : O O\n", 1740 | "矿 : O O\n", 1741 | "工 : O O\n", 1742 | "公 : O O\n", 1743 | "会 : O O\n", 1744 | "I : B B\n", 1745 | "C : I I\n", 1746 | "A : I I\n", 1747 | "亚 : I I\n", 1748 | "投 : I I\n", 1749 | "链 : I I\n", 1750 | "奖 : O O\n", 1751 | "励 : O O\n", 1752 | " : O O\n", 1753 | "一 : O O\n", 1754 | "代 : O O\n", 1755 | " : O O\n", 1756 | "拿 : O O\n", 1757 | "直 : O O\n", 1758 | "推 : O O\n", 1759 | "收 : O O\n", 1760 | "益 : O O\n", 1761 | "总 : O O\n", 1762 | "产 : O O\n", 1763 | "量 : O O\n", 1764 | "的 : O O\n", 1765 | "5 : O O\n", 1766 | "% : O O\n", 1767 | "低 : O O\n", 1768 | "于 : O O\n", 1769 | "直 : O O\n", 1770 | "推 : O O\n", 1771 | "算 : O O\n", 1772 | "力 : O O\n", 1773 | " : O O\n", 1774 | "收 : O O\n", 1775 | "益 : O O\n", 1776 | "减 : O O\n", 1777 | "半 : O O\n", 1778 | "二 : O O\n", 1779 | "代 : O O\n", 1780 | " : O O\n", 1781 | "3 : O O\n", 1782 | "% : O O\n", 1783 | "三 : O O\n", 1784 | "代 : O O\n", 1785 | " : O O\n", 1786 | "1 : O O\n", 1787 | "% : O O\n", 
1788 | "一 : O O\n", 1789 | "星 : O O\n", 1790 | "会 : O O\n", 1791 | "长 : O O\n", 1792 | " : O O\n", 1793 | "直 : O O\n", 1794 | "推 : O O\n", 1795 | "1 : O O\n", 1796 | "0 : O O\n", 1797 | "人 : O O\n", 1798 | " : O O\n", 1799 | "团 : O O\n", 1800 | "队 : O O\n", 1801 | "1 : O O\n", 1802 | "0 : O O\n", 1803 | "0 : O O\n", 1804 | "人 : O O\n", 1805 | " : O O\n", 1806 | "工 : O O\n", 1807 | "会 : O O\n", 1808 | "算 : O O\n", 1809 | "力 : O O\n", 1810 | " : O O\n", 1811 | "总 : O O\n", 1812 | "算 : O O\n", 1813 | "力 : O O\n", 1814 | "达 : O O\n", 1815 | "到 : O O\n", 1816 | "2 : O O\n", 1817 | "0 : O O\n", 1818 | "G : O O\n", 1819 | "H : O O\n", 1820 | " : O O\n", 1821 | "S : O O\n", 1822 | " : O O\n", 1823 | "送 : O O\n", 1824 | "小 : O O\n", 1825 | "型 : O O\n", 1826 | "矿 : O O\n", 1827 | "机 : O O\n", 1828 | "一 : O O\n", 1829 | "台 : O O\n" 1830 | ] 1831 | } 1832 | ], 1833 | "source": [ 1834 | "for c, t, p in zip(sample_data[1], pred_label, true_label):\n", 1835 | " if t != \"-PAD-\":\n", 1836 | " print(\"{:15}: {:5} {}\".format(c, t, p))" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 49, 1842 | "metadata": { 1843 | "ExecuteTime": { 1844 | "end_time": "2019-09-05T03:57:47.635452Z", 1845 | "start_time": "2019-09-05T03:57:47.626818Z" 1846 | } 1847 | }, 1848 | "outputs": [], 1849 | "source": [ 1850 | "def get_entity(X_data, y_data):\n", 1851 | " \"\"\"\n", 1852 | " \"\"\"\n", 1853 | " entity_list = []\n", 1854 | " entity_name = ''\n", 1855 | " for i, (c, l) in enumerate(zip(X_data, y_data)):\n", 1856 | " if l == \"B\":\n", 1857 | " entity_name += c\n", 1858 | " elif (l == \"I\") and (len(entity_name)) > 0:\n", 1859 | " entity_name += c\n", 1860 | " if i == len(y_data) - 1:\n", 1861 | " entity_list.append(entity_name)\n", 1862 | " elif l == \"O\":\n", 1863 | " if len(entity_name) > 0:\n", 1864 | " entity_list.append(entity_name)\n", 1865 | " entity_name = ''\n", 1866 | " \n", 1867 | " return \" \".join(list(set(entity_list)))" 1868 | ] 1869 | }, 1870 | 
{ 1871 | "cell_type": "code", 1872 | "execution_count": 50, 1873 | "metadata": { 1874 | "ExecuteTime": { 1875 | "end_time": "2019-09-05T03:57:47.645893Z", 1876 | "start_time": "2019-09-05T03:57:47.637090Z" 1877 | } 1878 | }, 1879 | "outputs": [ 1880 | { 1881 | "name": "stdout", 1882 | "output_type": "stream", 1883 | "text": [ 1884 | "['I', 'C', 'A', '亚', '投', '链', '矿', '机', '投', '资', '配', '股', '拆', '分', '理', '财', '开', '发', '找', '1', '3', '郑', '2', '8', '6', '婷', '0', '1', '5', '微', '7', '3', '7', '电', '。', 'I', 'C', 'A', '亚', '投', '链', '网', '上', '投', '资', '理', '财', 'a', 'p', 'p', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '虚', '拟', '币', '投', '资', '分', '红', '模', '式', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '理', '财', '平', '台', '网', '页', '版', '定', '制', '开', '发', '、', 'I', 'C', 'A', '亚', '投', '链', '区', '块', '链', '分', '红', '系', '统', '程', '序', '开', '发', '区', '块', '链', '技', '术', '的', '定', '义', '是', '什', '么', '。', 'I', 'C', 'A', '亚', '投', '链', '模', '式', '制', '度', '介', '绍', ' ', '注', '册', '免', '费', '送', '1', '2', '万', '台', '矿', '机', ' ', '数', '量', '有', '限', ' ', '送', '完', '为', '止', '。', 'I', 'C', 'A', '亚', '投', '链', '全', '球', '恒', '量', '发', '行', '1', ' ', '5', '亿', '枚', ' ', '永', '不', '增', '发', ' ', '开', '盘', '价', '0', ' ', '2', '美', '元', ' ', '预', '计', '开', '盘', '交', '易', '黑', '市', '价', '不', '低', '于', '2', '美', '元', ' ', '市', '场', '更', '是', '一', '币', '难', '求', ' ', '采', '取', '有', '效', '的', '控', '盘', '机', '制', '。', 'I', 'C', 'A', '亚', '投', '链', '全', '球', '恒', '量', '发', '行', '1', ' ', '5', '亿', '枚', ' ', '永', '不', '增', '发', '。', 'I', 'C', 'A', '亚', '投', '链', '资', '质', '背', '书', ' ', '双', '加', '密', '网', '址', ' ', '独', '立', '开', '源', '代', '码', ' ', '采', '用', 'P', 'O', 'W', '智', '能', '合', '约', ' ', 'P', 'O', 'S', '算', '力', ' ', '对', '接', '商', '城', ' ', '话', '费', ' ', '水', '电', '费', '充', '值', ' ', '中', '石', '油', '、', '中', '石', '化', '充', '值', ' ', 'I', 'C', 'A', '亚', '投', '链', '推', '广', '奖', '励', ' ', '矿', '工', '公', '会', 'I', 'C', 'A', '亚', '投', '链', '奖', '励', ' ', '一', 
def clean_zh(text):
    '''Clean the text so that sentences stay readable (the decimal-point problem cannot be handled).

    Replaces most ASCII and Chinese punctuation with a space, removes
    quotation marks and parentheses, and collapses runs of spaces.

    :param text: the string to clean
    :return: the cleaned string
    '''
    # NOTE(review): as written, each replace() swaps a parenthesis for itself;
    # presumably one side was a full-width parenthesis in the original
    # notebook -- confirm against the source file.
    text = text.replace("(", "(").replace(")", ")")
    # All punctuation that is a candidate for removal: string.punctuation plus
    # the `punctuation` name imported from zhon.hanzi.
    punct = string.punctuation + punctuation
    # Drop the characters listed here from the removal set.
    punct = "".join([c for c in punct if c not in [".", "、", "%", "“", "”", "(", ")", "!", "。", "?"]])
    # NOTE(review): `punct` is interpolated into a character class without
    # re.escape(). string.punctuation contains ",-./" in that order, so once
    # "." is filtered out the class contains ",-/", which the regex engine
    # reads as a range that still includes "." -- this looks like the cause
    # of the decimal-point problem mentioned in the docstring. Do not change
    # it here alone: any other copy of this function used to prepare the
    # training data must change in the same way.
    text = re.sub(r"[%s]+" % punct, " ", text)
    # Remove quotation marks and parentheses entirely.
    text = re.sub(r"[%s]+" % "“”()", "", text)
    # Replace colons with a space.
    text = re.sub(r"[%s]+" % ":", " ", text)
    # Collapse runs of spaces into a single space.
    text = re.sub(' +', ' ', text)
    return text

def clean_data(text):
    """Clean up the various kinds of dirty data.

    Deletes every match of each regex in `pattern`, then applies clean_zh().
    `pattern` is not defined in this cell; it must already exist in the
    notebook namespace.
    """
    for p in pattern:
        text = re.sub(p, "", text)
    text = clean_zh(text)
    return text
\n", 2002 | "\n", 2015 | "\n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | " \n", 2043 | " \n", 2044 | " \n", 2045 | " \n", 2046 | " \n", 2047 | " \n", 2048 | " \n", 2049 | " \n", 2050 | " \n", 2051 | " \n", 2052 | " \n", 2053 | " \n", 2054 | " \n", 2055 | " \n", 2056 | " \n", 2057 | " \n", 2058 | " \n", 2059 | " \n", 2060 | " \n", 2061 | " \n", 2062 | "
idtitletextnew_text
083dcefb7时空周转公众注册 当天秒下时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到下款全过程都是在手机上完成的...时空周转公众注册 当天秒下 时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到...
11ad5be0d抢红包、做任务、缺销路、缺人脉、推广产品 来这就对了 兼职赚钱 聚集人脉本篇文章将对两种人群进行分析 一种是不做任何项目 只撸些APP拉新 做做任务赚钱的纯羊毛党 ...抢红包、做任务、缺销路、缺人脉、推广产品 来这就对了 兼职赚钱 聚集人脉 本篇文章将对两种...
26dd28e9b2019健康行业趋势 住家创业 稳赚不亏2019健康行业趋势 住家创业 稳赚不亏
3f3b61b38CCM区块链商城PPT介绍CCM区块链商介绍CCM3 0我们会开启CCM矿石的互转流通 和支持商家入驻 到时候大家有自...CCM区块链商城PPT介绍 CCM区块链商介绍CCM3 0我们会开启CCM矿石的互转流通 和...
484b12bae加密数字货币里大家都赚钱 钱是从哪里来的?我最近去分享艾尔链LAC加密数字货币时 有很多朋友都对我轻蔑一笑 或者抱着怀疑的态度 经常拿...加密数字货币里大家都赚钱 钱是从哪里来的? 我最近去分享艾尔链LAC加密数字货币时 有很多朋...
\n", 2063 | "
def split_text(seq_list, maxlen):
    """Cut long test texts into consecutive pieces of at most ``maxlen`` characters.

    Parameters
    ----------
    seq_list : iterable of (id, text)
        Records whose second item is the text to split.
    maxlen : int
        Maximum number of characters per piece.

    Returns
    -------
    list of (id, text)
        Texts no longer than ``maxlen`` are passed through as a single
        record; longer ones yield one record per piece, each carrying the
        id of the text it came from, in original order.
    """
    pieces = []
    for record in seq_list:
        doc_id, body = record[0], record[1]
        if len(body) <= maxlen:
            pieces.append((doc_id, body))
            continue
        # Ceiling division: number of pieces needed to cover the whole text.
        n_pieces = -(-len(body) // maxlen)
        for k in range(n_pieces):
            start = k * maxlen
            pieces.append((doc_id, body[start:start + maxlen]))
    return pieces


def get_test_data():
    """Return the test set as a list of ``(id, new_text)`` tuples read from ``df_test``."""
    return list(zip(df_test["id"], df_test["new_text"]))
"outputs": [ 2216 | { 2217 | "data": { 2218 | "text/plain": [ 2219 | "512" 2220 | ] 2221 | }, 2222 | "execution_count": 71, 2223 | "metadata": {}, 2224 | "output_type": "execute_result" 2225 | } 2226 | ], 2227 | "source": [ 2228 | "maxlen" 2229 | ] 2230 | }, 2231 | { 2232 | "cell_type": "code", 2233 | "execution_count": 72, 2234 | "metadata": { 2235 | "ExecuteTime": { 2236 | "end_time": "2019-09-05T04:32:13.443385Z", 2237 | "start_time": "2019-09-05T04:32:13.411916Z" 2238 | } 2239 | }, 2240 | "outputs": [], 2241 | "source": [ 2242 | "test_res = split_text(test_d, maxlen)" 2243 | ] 2244 | }, 2245 | { 2246 | "cell_type": "code", 2247 | "execution_count": 75, 2248 | "metadata": { 2249 | "ExecuteTime": { 2250 | "end_time": "2019-09-05T04:35:17.461315Z", 2251 | "start_time": "2019-09-05T04:35:17.454315Z" 2252 | } 2253 | }, 2254 | "outputs": [ 2255 | { 2256 | "data": { 2257 | "text/plain": [ 2258 | "('83dcefb7',\n", 2259 | " '时空周转公众注册 当天秒下 时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到下款全过程都是在手机上完成的。扫一扫 立即申请时空周转app功能1、极速放款 自动审核、极速放款、实时到账 2、流程简单 在线填写资料 芝麻信用授权即可贷款 3、信息安全 数据库加密技术、保护借款人隐私 4、随借随还 无论何时何地、借款轻松 还款便捷。时空周转app亮点1、闪电借款 纯线上自动化审核 快至30分钟到账 2、额度灵活 单期借款、现金分期 万元额度任你选。3、门槛超低 无门槛、无担保 有身份证即可借款。关注我们 更多口子信息问 有人在时空周转借款过吗答 时空周转正常情况下2 3小时以内 也有特殊情况。问 时空周转贷款审核需要多少时间 时空周转贷款审核时间多长答 提前还款后 也还是能继续在时空周转借款的。问 时空周转没有还清还可以申请吗?答 时空周转!不需要太多的条件 借款也很快!问 时空周转真的像广告里面说的没有信用卡也能贷款吗答 用过 不是的 时空周转最大的好处就是可以节省分期的手续费问 时空周转好用吗时空周转 使用的人多不多啊答 时空周转它的手续费是比较低的 而且还款压力也比较小的。低于银行七哩八。5万')" 2260 | ] 2261 | }, 2262 | "execution_count": 75, 2263 | "metadata": {}, 2264 | "output_type": "execute_result" 2265 | } 2266 | ], 2267 | "source": [ 2268 | "test_res[0]" 2269 | ] 2270 | }, 2271 | { 2272 | "cell_type": "code", 2273 | "execution_count": 74, 2274 | "metadata": { 2275 | "ExecuteTime": { 2276 | "end_time": "2019-09-05T04:35:09.588152Z", 2277 | "start_time": "2019-09-05T04:35:09.581498Z" 2278 | } 2279 | }, 2280 | "outputs": [ 2281 | { 2282 | "data": { 2283 | "text/plain": [ 2284 | "12235" 2285 | ] 2286 | }, 2287 | "execution_count": 
74, 2288 | "metadata": {}, 2289 | "output_type": "execute_result" 2290 | } 2291 | ], 2292 | "source": [ 2293 | "len(test_res)" 2294 | ] 2295 | }, 2296 | { 2297 | "cell_type": "code", 2298 | "execution_count": 73, 2299 | "metadata": { 2300 | "ExecuteTime": { 2301 | "end_time": "2019-09-05T04:35:06.986729Z", 2302 | "start_time": "2019-09-05T04:35:04.013365Z" 2303 | } 2304 | }, 2305 | "outputs": [], 2306 | "source": [ 2307 | "sub_id = []\n", 2308 | "sub_text = []\n", 2309 | "sub_X1 = []\n", 2310 | "sub_X2 = []\n", 2311 | "for tuple_ in test_res:\n", 2312 | " sub_id.append(tuple_[0])\n", 2313 | " x1, x2 = tokenizer.encode(first=tuple_[1])\n", 2314 | " sub_X1.append(x1)\n", 2315 | " sub_X2.append(x2)\n", 2316 | " sub_text.append(tuple_[1])\n", 2317 | "sub_X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X1, padding=\"post\", value=0)\n", 2318 | "sub_X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X2, padding=\"post\", value=0)" 2319 | ] 2320 | }, 2321 | { 2322 | "cell_type": "code", 2323 | "execution_count": 76, 2324 | "metadata": { 2325 | "ExecuteTime": { 2326 | "end_time": "2019-09-05T04:35:30.308948Z", 2327 | "start_time": "2019-09-05T04:35:30.302899Z" 2328 | } 2329 | }, 2330 | "outputs": [], 2331 | "source": [ 2332 | "bs = 64\n", 2333 | "steps = len(sub_id) // bs\n", 2334 | "if len(sub_id) % bs != 0:\n", 2335 | " steps += 1" 2336 | ] 2337 | }, 2338 | { 2339 | "cell_type": "code", 2340 | "execution_count": 77, 2341 | "metadata": { 2342 | "ExecuteTime": { 2343 | "end_time": "2019-09-05T04:44:27.480092Z", 2344 | "start_time": "2019-09-05T04:35:48.519124Z" 2345 | } 2346 | }, 2347 | "outputs": [ 2348 | { 2349 | "name": "stderr", 2350 | "output_type": "stream", 2351 | "text": [ 2352 | "100%|██████████| 192/192 [08:38<00:00, 2.15s/it]\n" 2353 | ] 2354 | } 2355 | ], 2356 | "source": [ 2357 | "result_all = []\n", 2358 | "for i in tqdm(range(steps)):\n", 2359 | " batch_id = sub_id[i*bs:(i+1)*bs]\n", 2360 | " batch_text 
# Merge the per-chunk predictions in last_result into one entity list per
# sample id (the same id can occur several times in last_result).
res_dict = dict()
for row in last_result:
    sample_id, joined_entities = row[0], row[1]
    # "".split(" ") returns [""]; drop such blanks so an id without any
    # prediction contributes an empty list instead of an empty entity that
    # would later show up as a stray ';' in the submission file.
    entities = [word for word in joined_entities.split(" ") if word]
    # setdefault keeps ids with no entity in the dict, so res_dict[key]
    # lookups in the writer cells still succeed.
    res_dict.setdefault(sample_id, []).extend(entities)
# Write the submission file: one "<id>,<entity;entity;...>" line per test
# sample, in the order of test_d.
with codecs.open("/home/wangwei/tf_workdir/word_detect/submit_test1.csv", "w", encoding="utf-8") as f:
    for i in test_d:
        key = i[0]
        # Deduplicate, drop blank entries (they come from ids whose chunk
        # had no prediction) and sort: iterating a bare set of strings gives
        # a different order from one interpreter run to the next, which made
        # the file non-reproducible.
        entities = sorted(word for word in set(res_dict[key]) if word)
        f.write(key + ',' + ";".join(entities) + "\n")
# Build (id, text) pairs for inference, replacing every character that is not
# an ASCII letter, a digit or a Chinese character with a single space.
#
# The previous version ran both substitutions on `line`, a name this cell
# never assigns (NameError on a fresh kernel, stale data otherwise), and then
# appended the untouched `text`, so the cleaning never reached the model.
# NOTE(review): the recorded outputs below were produced WITHOUT this
# cleaning; re-run inference after applying the fix.
non_word_pattern = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")  # compiled once, not per row
test_data = []
for sample_id, text in zip(df_test["id"], df_test["text"]):
    text = non_word_pattern.sub(' ', text)
    # Collapse the runs of spaces left behind by consecutive symbols.
    text = re.sub(' +', ' ', text)
    test_data.append((sample_id, text))
# Encode the first `maxlen` characters of every test document into BERT
# inputs, keeping ids and full texts in parallel lists.
sub_id = []
sub_text = []
sub_X1 = []
sub_X2 = []
for tuple_ in test_data:
    sub_id.append(tuple_[0])
    x1, x2 = tokenizer.encode(first=tuple_[1][:maxlen])
    sub_X1.append(x1)
    sub_X2.append(x2)
    sub_text.append(tuple_[1])

# A maxlen-character slice can still encode to more than maxlen ids once the
# tokenizer adds its special tokens. pad_sequences truncates from the FRONT by
# default, which would drop the leading tokens of exactly those long samples
# and shift every predicted label relative to sub_text. Truncate at the end
# instead so position 0 means the same thing for short and long samples.
# NOTE(review): assumes training did not rely on front truncation — confirm
# against the training cells.
sub_X1 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X1, padding="post", truncating="post", value=0)
sub_X2 = keras.preprocessing.sequence.pad_sequences(maxlen=maxlen, sequences=sub_X2, padding="post", truncating="post", value=0)
# Run the model over the padded inputs in batches of `bs` and collect one
# (id, entity-string) pair per sample.
result_all = []
for step in tqdm(range(steps)):
    window = slice(step * bs, (step + 1) * bs)
    ids_in_batch = sub_id[window]
    texts_in_batch = sub_text[window]
    scores = model.predict([sub_X1[window], sub_X2[window]])
    # Highest-scoring tag index for every position of every sample.
    tag_indices = np.argmax(scores, axis=-1).tolist()
    for sample_id, sample_text, indices in zip(ids_in_batch, texts_in_batch, tag_indices):
        tags = [idx2tag[index] for index in indices]
        characters = list(sample_text)
        result_all.append((sample_id, get_entity(characters, tags)))
合晟资产'),\n", 2785 | " ('8883244b', '盾安 败。盾安 持有盾安债 踩雷盾安债 盾安债 前 泓德基金 盾安集'),\n", 2786 | " ('da72cf2c', '淘钱宝'),\n", 2787 | " ('da72cf2c', '多米 淘钱宝'),\n", 2788 | " ('ad75ffba', '爱乐在线'),\n", 2789 | " ('ad75ffba', '爱乐在线 张先生'),\n", 2790 | " ('347cae00', '商机头条'),\n", 2791 | " ('347cae00', '商机头条'),\n", 2792 | " ('437b9e96', '素店'),\n", 2793 | " ('437b9e96', '素店 靠吗素店 妮素国际'),\n", 2794 | " ('dd1f0b35', ''),\n", 2795 | " ('dd1f0b35', '华润信托'),\n", 2796 | " ('aa183ba3', '比特币 IPFS存储'),\n", 2797 | " ('aa183ba3', '比特币'),\n", 2798 | " ('33116a19', '宝网贷'),\n", 2799 | " ('33116a19', '收获宝'),\n", 2800 | " ('44165a8f', '龙腾盛世'),\n", 2801 | " ('44165a8f', '轮回 新客兑 新客 保德薪'),\n", 2802 | " ('d4a9471e', '汇新智'),\n", 2803 | " ('d4a9471e', '汇新智'),\n", 2804 | " ('a3ae7788', '莱茨狗 区块链'),\n", 2805 | " ('a3ae7788', '莱茨狗'),\n", 2806 | " ('c369fe6d', '凯富创通'),\n", 2807 | " ('c369fe6d', '凯富创通'),\n", 2808 | " ('b46ecefb', ''),\n", 2809 | " ('b46ecefb', ''),\n", 2810 | " ('2d679f41', '趣步 链信'),\n", 2811 | " ('2d679f41', 'APP 趣步 亦跑 链信'),\n", 2812 | " ('5a60afd7', 'lt'),\n", 2813 | " ('5a60afd7', '明堂金融 景山 明堂期货'),\n", 2814 | " ('c4043a74', '英皇金融国 英皇金融国际'),\n", 2815 | " ('c4043a74', '英皇金融 英皇金融国际'),\n", 2816 | " ('b3030ae2', 'HUSDToken 火币钱包 火币'),\n", 2817 | " ('b3030ae2', 'HUSDToken 火币钱包 火币'),\n", 2818 | " ('2a0a5b58', '网商万宝'),\n", 2819 | " ('2a0a5b58', '鲨鱼记账 网商万宝 网商'),\n", 2820 | " ('5d0d6bce', '华景无限 华景无限逍遥卡'),\n", 2821 | " ('5d0d6bce', '无限 华景城 华景无限逍遥卡 华景无限旅游 华景无限 华景无限逍'),\n", 2822 | " ('cdb2765f', '国金中融'),\n", 2823 | " ('cdb2765f', '臻鼎投资 国金中融 国金中融 臻鼎'),\n", 2824 | " ('bab546c9', '顺德农商 中国中投 广东顺高投 欧浦小贷'),\n", 2825 | " ('44f1e2a2', '易信easymarkets平台'),\n", 2826 | " ('33f6d234', '国美'),\n", 2827 | " ('33f6d234', '美美 国美在线 国美控股 美美理财 国美 国美在线金融 美易'),\n", 2828 | " ('aaff838e', ''),\n", 2829 | " ('aaff838e', 'P2 芒果金融'),\n", 2830 | " ('ddf8b318', '之道 人人 云联惠 云商'),\n", 2831 | " ('439c26bb', '神店'),\n", 2832 | " ('439c26bb', '神店 神店小'),\n", 2833 | " ('349b162d', '华登 beta'),\n", 2834 | " ('349b162d', '华登 beta'),\n", 2835 | " 
# Matches every character that is NOT an ASCII letter, an ASCII digit or a
# Chinese character in the U+4E00..U+9FA5 range.
_NON_WORD_CHARS = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")


def remove_punctuation(line):
    """Remove all half-width and full-width symbols from ``line``.

    Only ASCII letters, digits and Chinese characters are kept; every other
    character (punctuation, emoji, whitespace, ...) is deleted.

    Args:
        line: the text to clean.

    Returns:
        The cleaned string; an empty string stays empty.
    """
    # Fixes over the previous version:
    #  * the ``ur"..."`` literal prefix is a SyntaxError on Python 3;
    #  * digits were missing from the keep-list although they are meant to
    #    survive (and the inline copies of this pattern do keep them);
    #  * the follow-up collapse of repeated spaces was dead code, because
    #    spaces are already removed by the pattern above.
    return _NON_WORD_CHARS.sub('', line)
# Run the notebook's clean_data helper over both free-text columns,
# title first and then text.
for column_name in ("title", "text"):
    df_test[column_name] = df_test[column_name].map(clean_data)
\n", 2936 | "\n", 2949 | "\n", 2950 | " \n", 2951 | " \n", 2952 | " \n", 2953 | " \n", 2954 | " \n", 2955 | " \n", 2956 | " \n", 2957 | " \n", 2958 | " \n", 2959 | " \n", 2960 | " \n", 2961 | " \n", 2962 | " \n", 2963 | " \n", 2964 | " \n", 2965 | " \n", 2966 | " \n", 2967 | " \n", 2968 | " \n", 2969 | " \n", 2970 | " \n", 2971 | " \n", 2972 | " \n", 2973 | " \n", 2974 | " \n", 2975 | " \n", 2976 | " \n", 2977 | " \n", 2978 | " \n", 2979 | " \n", 2980 | " \n", 2981 | " \n", 2982 | " \n", 2983 | " \n", 2984 | " \n", 2985 | " \n", 2986 | " \n", 2987 | " \n", 2988 | " \n", 2989 | " \n", 2990 | "
idtitletext
083dcefb7时空周转公众注册 当天秒下时空周转是一款非常靠谱的小额现金快捷贷款平台。时空周转贷款申请到下款全过程都是在手机上完成的...
11ad5be0d抢红包、做任务、缺销路、缺人脉、推广产品 来这就对了 兼职赚钱 聚集人脉本篇文章将对两种人群进行分析 一种是不做任何项目 只撸些APP拉新 做做任务赚钱的纯羊毛党 ...
26dd28e9b2019健康行业趋势 住家创业 稳赚不亏
3f3b61b38CCM区块链商城PPT介绍CCM区块链商介绍CCM3 0我们会开启CCM矿石的互转流通 和支持商家入驻 到时候大家有自...
484b12bae加密数字货币里大家都赚钱 钱是从哪里来的?我最近去分享艾尔链LAC加密数字货币时 有很多朋友都对我轻蔑一笑 或者抱着怀疑的态度 经常拿...
\n", 2991 | "
# Build (id, text) pairs for inference, replacing every character that is not
# an ASCII letter, a digit or a Chinese character with a single space.
#
# The previous version ran both substitutions on `line`, a name this cell
# never assigns (NameError on a fresh kernel, stale data otherwise), and then
# appended the untouched `text`, so the cleaning never reached the model.
# NOTE(review): the recorded outputs below were produced WITHOUT this
# cleaning; re-run inference after applying the fix.
non_word_pattern = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")  # compiled once, not per row
test_data = []
for sample_id, text in zip(df_test["id"], df_test["text"]):
    text = non_word_pattern.sub(' ', text)
    # Collapse the runs of spaces left behind by consecutive symbols.
    text = re.sub(' +', ' ', text)
    test_data.append((sample_id, text))
# Batched inference: predict tags for `bs` samples at a time and turn them
# into one (id, entity-string) pair per sample.
result_all = []
for batch_no in tqdm(range(steps)):
    start, stop = batch_no * bs, (batch_no + 1) * bs
    batch_ids = sub_id[start:stop]
    batch_texts = sub_text[start:stop]
    batch_scores = model.predict([sub_X1[start:stop], sub_X2[start:stop]])
    # argmax over the last axis selects the predicted tag index per position.
    batch_tag_ids = np.argmax(batch_scores, axis=-1).tolist()
    for sample_id, sample_text, tag_ids in zip(batch_ids, batch_texts, batch_tag_ids):
        predicted_tags = [idx2tag[tag_id] for tag_id in tag_ids]
        found = get_entity(list(sample_text), predicted_tags)
        result_all.append((sample_id, found))
# Rule-based post-processing of the predicted entities.
def clean_ner(text):
    """Clean a space-separated entity string and return it ';'-joined.

    Rules, applied in order:
      1. every ASCII digit is removed;
      2. zero-width spaces (U+200B, which show up in the raw predictions)
         are removed;
      3. candidates shorter than two characters are discarded;
      4. repeated candidates are kept once, at their first position.

    Args:
        text: entity candidates separated by single spaces.

    Returns:
        The surviving entities joined by ';', or '' if none survive.
    """
    text = re.sub("[0-9]", "", text)
    text = text.replace("\u200b", "").strip()

    ner = []
    seen = set()
    for n in text.split(" "):
        # Previously duplicates were written as-is (e.g. "A;B;A"); the earlier
        # submission cells deduplicate with set(), so do the same here while
        # preserving first-seen order.
        if len(n) > 1 and n not in seen:
            seen.add(n)
            ner.append(n)
    return ";".join(ner)
3344 | " f.write(key+','+value+\"\\n\")" 3345 | ] 3346 | }, 3347 | { 3348 | "cell_type": "code", 3349 | "execution_count": null, 3350 | "metadata": {}, 3351 | "outputs": [], 3352 | "source": [] 3353 | } 3354 | ], 3355 | "metadata": { 3356 | "kernelspec": { 3357 | "display_name": "PyCharm (tfproject)", 3358 | "language": "python", 3359 | "name": "tf3" 3360 | }, 3361 | "language_info": { 3362 | "codemirror_mode": { 3363 | "name": "ipython", 3364 | "version": 3 3365 | }, 3366 | "file_extension": ".py", 3367 | "mimetype": "text/x-python", 3368 | "name": "python", 3369 | "nbconvert_exporter": "python", 3370 | "pygments_lexer": "ipython3", 3371 | "version": "3.5.6" 3372 | }, 3373 | "toc": { 3374 | "base_numbering": 1, 3375 | "nav_menu": {}, 3376 | "number_sections": true, 3377 | "sideBar": true, 3378 | "skip_h1_title": false, 3379 | "title_cell": "Table of Contents", 3380 | "title_sidebar": "Contents", 3381 | "toc_cell": false, 3382 | "toc_position": {}, 3383 | "toc_section_display": true, 3384 | "toc_window_display": true 3385 | } 3386 | }, 3387 | "nbformat": 4, 3388 | "nbformat_minor": 2 3389 | } 3390 | --------------------------------------------------------------------------------