├── .gitignore
├── README.md
├── SECURITY.md
├── bert_nlp
    ├── Crf on the top of Bilstm.pptx
    ├── Io
    │   └── data_loader.py
    ├── config
    │   └── args.py
    ├── data
    │   ├── cws_data
    │   │   ├── cws.dev.json
    │   │   └── cws.train.json
    │   ├── ner_data
    │   │   ├── ner.dev.json
    │   │   └── ner.train.json
    │   └── postag_data
    │   │   ├── postag.dev.json
    │   │   └── postag.train.json
    ├── main
    │   └── main.py
    ├── model
    │   ├── pytorch_pretrained_model
    │   │   └── bert_config.json
    │   └── vocab.txt
    ├── net
    │   ├── crf.py
    │   └── model_net.py
    ├── output
    │   ├── checkpoint
    │   │   └── vocab.txt
    │   ├── images
    │   │   └── loss_acc.png
    │   └── logs
    │   │   └── all.log.2020-05-18
    ├── preprocessing
    │   └── data_processor.py
    ├── pytorch_pretrained_bert
    │   ├── file_utils.py
    │   ├── modeling.py
    │   ├── optimization.py
    │   └── tokenization.py
    ├── run.py
    ├── train
    │   └── train.py
    └── util
    │   ├── Logginger.py
    │   ├── model_util.py
    │   ├── plot_util.py
    │   └── porgress_util.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .DS_Store
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | .static_storage/
 58 | .media/
 59 | local_settings.py
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # Environments
 87 | .env
 88 | .venv
 89 | env/
 90 | venv/
 91 | ENV/
 92 | env.bak/
 93 | venv.bak/
 94 | 
 95 | # Spyder project settings
 96 | .spyderproject
 97 | .spyproject
 98 | 
 99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | #pycharm
109 | .idea/
110 | 
111 | #data
112 | #/data
113 | .vscode/*
114 | 
115 | # XD
116 | models/
117 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 项目说明
 2 | 本项目是基于 BERT+CRF 实现的 NLP 序列标注模型（目前支持分词、词性标注和实体识别训练，后续会加入其他任务）。项目基于 <https://github.com/circlePi/Bert_Chinese_Ner_pytorch>
 3 | （在此感谢原作者的贡献）修改而来。在原项目的基础上，加入了多卡训练（同时支持单卡和多卡模式），去掉了一些不必要的代码（比如 output_mask 的形式），此外还加入了分词等 NLP 任务。
 4 | 
 5 | 
 6 | # 配置文件
 7 | 
 8 |       bert_nlp/config/args.py
 9 | 
10 | # 训练
11 | 
12 | ### 分词训练
13 | 
14 |       修改task_name为cws，以及一些其他超参数
15 | 
16 |       CUDA_VISIBLE_DEVICES=0,1 python run.py   
17 | 
18 | ### 实体识别训练
19 | 
20 |       修改task_name为ner，以及一些其他超参数
21 | 
22 |       CUDA_VISIBLE_DEVICES=0,1 python run.py    
23 | 
24 | ### 词性训练
25 | 
26 |       修改task_name为postag，以及一些其他超参数
27 | 
28 |       CUDA_VISIBLE_DEVICES=0,1 python run.py   
29 | 
30 | ### 实体识别训练数据样例
31 | 
32 |       data/ner_data/
33 | 
34 | ### 分词训练数据样例
35 | 
36 |       data/cws_data/
37 | 
38 | ### 词性训练数据样例
39 | 
40 |       data/postag_data/    
41 | 
42 | 
43 | ### 词性标注符号：
44 | 
45 |       n   普通名词
46 |       nt  时间名词
47 |       nd  方位名词
48 |       nl  处所名词
49 |       nh  人名
50 |       nhf 姓
51 |       nhs 名
52 |       ns  地名
53 |       nn  族名
54 |       ni  机构名
55 |       nz  其他专名
56 |       v   动词
57 |       vd  趋向动词
58 |       vl  联系动词
59 |       vu  能愿动词
60 |       a   形容词
61 |       f   区别词
62 |       m   数词　　
63 |       q   量词
64 |       d   副词
65 |       r   代词
66 |       p   介词
67 |       c   连词
68 |       u   助词
69 |       e   叹词
70 |       o   拟声词
71 |       i   习用语
72 |       j   缩略语
73 |       h   前接成分
74 |       k   后接成分
75 |       g   语素字
76 |       x   非语素字
77 |       w   标点符号
78 |       ws  非汉字字符串
79 |       wu  其他未知的符号
80 | 
81 | 但因为中文的词性都是以词为单位的，因此，我们将每个词都进行了拆分，比如中国/n，拆分后就是：中/B-n，国/I-n；天安门/n，拆分为天/B-n，安/I-n，门/I-n。 
82 | 因此，所有的label都进行了拓展，其中UNK表示可能存在不在列表中的词性。如下所示：
83 | 
84 |       B-n I-n B-nt I-nt B-nd I-nd B-nl I-nl B-nh I-nh B-nhf I-nhf B-nhs I-nhs B-ns I-ns B-nn I-nn B-ni I-ni B-nz I-nz B-v I-v B-vd I-vd B-vl I-vl B-vu I-vu B-a I-a B-f I-f B-m I-m B-q I-q B-d I-d B-r I-r B-p I-p B-c I-c B-u I-u B-e I-e B-o I-o B-i I-i B-j I-j B-h I-h B-k I-k B-g I-g B-x I-x B-w I-w B-ws I-ws B-wu I-wu UNK
85 |      
86 | 
87 |       


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | Use this section to tell people about which versions of your project are
 6 | currently being supported with security updates.
 7 | 
 8 | | Version | Supported          |
 9 | | ------- | ------------------ |
10 | | 5.1.x   | :white_check_mark: |
11 | | 5.0.x   | :x:                |
12 | | 4.0.x   | :white_check_mark: |
13 | | < 4.0   | :x:                |
14 | 
15 | ## Reporting a Vulnerability
16 | 
17 | Use this section to tell people how to report a vulnerability.
18 | 
19 | Tell them where to go, how often they can expect to get an update on a
20 | reported vulnerability, what to expect if the vulnerability is accepted or
21 | declined, etc.
22 | 


--------------------------------------------------------------------------------
/bert_nlp/Crf on the top of Bilstm.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lisennlp/bert_crf_sequence_annotation/93732575dedd77122d61ff9b63e8f40d7e8d84f4/bert_nlp/Crf on the top of Bilstm.pptx


--------------------------------------------------------------------------------
/bert_nlp/Io/data_loader.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 4 | 
 5 | from pytorch_pretrained_bert.tokenization import BertTokenizer
 6 | from preprocessing.data_processor import MyPro, convert_examples_to_features
 7 | import config.args as args
 8 | from util.Logginger import init_logger
 9 | 
# Module-level logger: named after the configured task and given args.log_path as its logging path.
logger = init_logger(f"{args.task_name}", logging_path=args.log_path)
11 | 
12 | 
def init_parameters():
    """Create the BERT tokenizer and the task data processor.

    The tokenizer is built from ``args.VOCAB_FILE`` and its vocabulary is
    written to ``args.output_dir`` so the vocabulary file is kept next to
    the other outputs.

    Returns:
        tuple: ``(tokenizer, processor)`` where ``tokenizer`` is a
        ``BertTokenizer`` and ``processor`` is a ``MyPro`` instance.
    """
    # args.output_dir is configured as a directory; make sure it exists
    # before asking the tokenizer to write the vocabulary into it.
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer(vocab_file=args.VOCAB_FILE)
    tokenizer.save_vocabulary(args.output_dir)    # save the vocabulary file
    processor = MyPro()
    return tokenizer, processor
18 | 
19 | 
def create_batch_iter(mode, path):
    """Build a batch iterator (``DataLoader``) for the data at ``path``.

    Args:
        mode: ``"train"`` or ``"dev"``. Selects which processor loader is
            used, the batch size (``args.train_batch_size`` vs
            ``args.eval_batch_size``) and the sampler (random for training,
            sequential for evaluation).
        path: Location passed to the processor's example loader.

    Returns:
        For ``mode == "train"``: a tuple ``(iterator, num_train_steps)``.
        For ``mode == "dev"``: the iterator alone.

    Raises:
        ValueError: If ``mode`` is neither ``"train"`` nor ``"dev"``.
    """
    # Validate once, up front, so an invalid mode fails before the tokenizer
    # is built and the vocabulary is written by init_parameters().
    if mode not in ("train", "dev"):
        raise ValueError("Invalid mode %s" % mode)
    is_train = mode == "train"

    logger.info(f'{mode} path is {path}')
    tokenizer, processor = init_parameters()

    if is_train:
        examples = processor.get_train_examples(path)
        batch_size = args.train_batch_size
        # Total optimisation steps over all epochs, accounting for gradient
        # accumulation.
        num_train_steps = int(
            len(examples) / args.train_batch_size / args.gradient_accumulation_steps *
            args.num_train_epochs)
        logger.info("  Num train steps = %d", num_train_steps)
    else:
        examples = processor.get_dev_examples(path)
        batch_size = args.eval_batch_size

    label_list = processor.get_labels()

    # Features
    features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer)
    logger.info(f"  Num {mode} features = %d", len(features))
    logger.info(f" {mode} Batch size = %d", batch_size)

    # One long tensor per feature attribute; the order here is the order of
    # the tensors inside every batch yielded by the iterator.
    feature_fields = ("input_ids", "input_mask", "segment_ids", "label_id", "output_mask")
    tensors = [
        torch.tensor([getattr(f, field) for f in features], dtype=torch.long)
        for field in feature_fields
    ]

    # Dataset
    data = TensorDataset(*tensors)

    # Shuffle for training, keep the original order for evaluation.
    sampler = RandomSampler(data) if is_train else SequentialSampler(data)

    # Iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)
    if is_train:
        return iterator, num_train_steps
    return iterator
69 | 


--------------------------------------------------------------------------------
/bert_nlp/config/args.py:
--------------------------------------------------------------------------------
 1 | 
 2 | VOCAB_FILE = "/nas/pretrain-bert/pretrain-pytorch/chinese_wwm_ext_pytorch/vocab.txt"
 3 | 
 4 | log_path = "output/logs"
 5 | plot_path = "output/images/loss_acc.png"
 6 | data_dir = "data/"
 7 | cache_dir = "model/"
 8 | output_dir = "output/checkpoint"    # checkpoint和预测输出文件夹
 9 | 
10 | bert_model = "/nas/pretrain-bert/pretrain-pytorch/chinese_wwm_ext_pytorch"    # BERT 预训练模型种类 bert-base-chinese
11 | 
12 | TASK_NAMES = ['cws', 'ner', 'postag']
13 | task_name = "postag"    # 训练任务名称, 从TASK_NAMES选取一个
14 | 
15 | flag_words = ["[PAD]", "[CLP]", "[SEP]", "[UNK]"]
16 | max_seq_length = 200
17 | do_lower_case = True
18 | train_batch_size = 32
19 | eval_batch_size = 32
20 | learning_rate = 2e-5
21 | num_train_epochs = 6
22 | warmup_proportion = 0.1
23 | no_cuda = False
24 | seed = 2018
25 | gradient_accumulation_steps = 1
26 | fp16 = False
27 | loss_scale = 0.
28 | 
29 | if task_name == 'ner':
30 |     labels = ["B_PER", "I_PER", "B_T", "I_T", "B_ORG", "I_ORG", "B_LOC", "I_LOC", "O"]
31 | elif task_name == 'cws':
32 |     labels = ["B", "M", "E", "S"]
33 | elif task_name == 'postag':
34 |     labels = [
35 |         'B-n', 'I-n', 'B-nt', 'I-nt', 'B-nd', 'I-nd', 'B-nl', 'I-nl', 'B-nh', 'I-nh', 'B-nhf',
36 |         'I-nhf', 'B-nhs', 'I-nhs', 'B-ns', 'I-ns', 'B-nn', 'I-nn', 'B-ni', 'I-ni', 'B-nz', 'I-nz',
37 |         'B-v', 'I-v', 'B-vd', 'I-vd', 'B-vl', 'I-vl', 'B-vu', 'I-vu', 'B-a', 'I-a', 'B-f', 'I-f',
38 |         'B-m', 'I-m', 'B-q', 'I-q', 'B-d', 'I-d', 'B-r', 'I-r', 'B-p', 'I-p', 'B-c', 'I-c', 'B-u',
39 |         'I-u', 'B-e', 'I-e', 'B-o', 'I-o', 'B-i', 'I-i', 'B-j', 'I-j', 'B-h', 'I-h', 'B-k', 'I-k',
40 |         'B-g', 'I-g', 'B-x', 'I-x', 'B-w', 'I-w', 'B-ws', 'I-ws', 'B-wu', 'I-wu', 'UNK'
41 |     ]
42 | 
43 | device = "cuda"
44 | 
45 | TRAIN_PATH = f"data/{task_name}_data/{task_name}.train.json"
46 | VALID_PATH = f"data/{task_name}_data/{task_name}.dev.json"
47 | 


--------------------------------------------------------------------------------
/bert_nlp/data/cws_data/cws.dev.json:
--------------------------------------------------------------------------------
  1 | {"source": "也 有 同 学 认 为 ， 食 堂 阿 姨 的 态 度 也 要 改 进 。", "target": "S S B E B E S B E B E S B E S S B E S"}
  2 | {"source": "二 战 后 按 有 关 法 律 钓 鱼 岛 回 归 中 国 。", "target": "B E S S B E B E B M E B E B E S"}
  3 | {"source": "于 是 ， 从 饰 品 、 鞋 子 、 蛋 糕 、 内 裤 …", "target": "B E S S B E S B E S B E S B E S"}
  4 | {"source": "体 检 主 要 项 目 指 标 解 读 ↓ ↓ 关 乎 健 康 ， 转 给 身 边 人 ！", "target": "B E B E B E B E B E S S B E B E S B E B M E S"}
  5 | {"source": "你 想 要 什 么 主 题 的 婚 礼 ？", "target": "S S S B E B E S B E S"}
  6 | {"source": "其 他 获 奖 作 品 还 有 冰 山 、 沉 船 以 及 棕 熊 捕 鱼 等 场 景 。", "target": "B E B E B E S S B E S B E B E B E B E S B E S"}
  7 | {"source": "出 生 于 日 本 北 海 道 札 幌 市 ， 英 年 早 逝 的 日 本 动 画 导 演 ， 漫 画 家 今 敏 ， 常 被 称 作 连 上 帝 都 嫉 妒 的 天 才 ， 执 导 的 多 部 动 画 作 品 在 国 际 间 获 奖 无 数 ， 作 品 多 着 重 于 个 性 鲜 明 贴 近 生 活 的 人 物 、 角 色 精 神 层 面 、 梦 境 与 现 实 之 间 的 暧 昧 关 系 等 等 。", "target": "B M E B E B M E B M E S B E B E S B E B E B E S B M E B E S S S B E S B E S B E S B E S B E S S S B E B E S B E S B E B E S B E S B M E B E B E B E B E S B E S B E B E B E S B E S B E B E S B E B E B E S"}
  8 | {"source": "分 享 给 大 家 一 个 做 e x c e l 表 格 时 都 会 用 到 的 表 头 小 技 巧 ， 非 常 实 用 ， 转 给 需 要 的 小 伙 伴 们 。", "target": "B E S B E S S S B M M M E B E S S S B E S B E S B E S B E B E S B E B E S B M M E S"}
  9 | {"source": "当 地 时 间 2 0 1 6 年 4 月 9 日 ， 俄 罗 斯 索 契 ， 参 与 者 穿 比 基 尼 在 R o z a K h u t o r 高 山 度 假 胜 地 滑 雪 ， 欲 破 最 多 人 穿 比 基 尼 滑 雪 世 界 纪 录 。", "target": "B E B E B M M M E B E B E S B M E B E S B M E S B M E S B M M M M M M M M E B E B E B E B E S S S S S S S B M E B E B E B E S"}
 10 | {"source": "推 上 一 哥 们 去 南 非 旅 游 ， 表 示 以 前 听 说 过 约 翰 内 斯 堡 治 安 不 好 ， 但 没 想 到 达 到 了 这 种 程 度 . . . |", "target": "S S S B E S B E B E S B E B E B E S B M M M E B E S S S S S B E B E S S S B E B M E S"}
 11 | {"source": "“ 和 你 一 同 笑 过 的 人 ， 你 可 能 把 他 忘 掉 ；", "target": "S S S B E S S S S S S B E S S B E S"}
 12 | {"source": "他 一 生 颠 沛 流 离 ， 以 写 文 卖 画 为 生 ， 贫 病 交 迫 ， 终 身 未 婚 。", "target": "S B E B M M E S S B E B E B E S B E B E S B E B E S"}
 13 | {"source": "独 特 的 折 纸 造 型 令 人 爱 不 释 手 ~ ~ |", "target": "B E S B E B E S S B M M E S S S"}
 14 | {"source": "升 高 或 降 低 又 提 示 着 怎 样 的 身 体 信 号 ？", "target": "B E S B E S B E S B E S B E B E S"}
 15 | {"source": "华 为 在 英 国 伦 敦 发 布 了 重 磅 新 机 P 9 ， 硬 朗 的 外 观 、 顶 级 的 硬 件 配 置 、 双 1 2 0 0 万 像 素 主 摄 像 头 、 徕 卡 光 学 系 统 均 让 人 大 跌 眼 镜 ， 一 些 人 甚 至 表 示 已 经 超 过 了 i P h o n e ， 那 么 ， 小 伙 伴 们 会 弃 i P h o n e 买 华 为 吗 ？", "target": "B E S B E B E B E S B E B E B E S B E S B E S B E S B E B E S S B M M M E B E S B M E S B E B E B E S S S B M M E S B E S B E B E B E B E S B M M M M E S B E S B M M E S S B M M M M E S B E S S"}
 16 | {"source": "又 到 体 检 高 峰 ， 好 不 容 易 完 成 了 人 山 人 海 的 体 检 ， 拿 着 体 检 报 告 的 你 是 否 一 头 雾 水 ？", "target": "S S B E B E S B M M E B E S B M M E S B E S S S B E B E S S B E B M M E S"}
 17 | {"source": "自 联 合 国 成 立 7 0 年 来 ， 共 有 8 任 秘 书 长 且 均 为 男 性 ， 一 些 联 合 国 会 员 国 呼 吁 应 任 命 一 位 女 性 秘 书 长 。", "target": "S B M E B E B M E S S S S S S B M E S S S B E S B E B M E B M E B E S B E S S B E B M E S"}
 18 | {"source": "你 最 喜 欢 哪 种 飞 机 餐 呢 ？", "target": "S S B E S S B M E S S"}
 19 | {"source": "6 7 年 来 ， 中 国 海 军 正 在 成 长 为 一 支 捍 卫 国 家 主 权 的 海 上 钢 铁 长 城 。", "target": "B E S S S B E B E S S B E S S S B E B E B E S B E B E B E S"}
 20 | {"source": "据 朝 日 新 闻 ， 2 0 1 6 年 日 本 男 女 生 校 服 价 格 分 别 上 调 5 % - 1 0 % 。", "target": "S B E B E S B M M M E B E B M E B E B E B E B E B M M M M E S"}
 21 | {"source": "共 消 费 了 2 1 0 0 多 元 ， 结 果 二 人 的 现 金 1 0 0 0 多 元 全 部 被 强 行 掏 空 。 ”", "target": "S B E S B M M M E S S B E S S S B E B M M M E S B E S B E B E S S"}
 22 | {"source": "别 怕 ， 4 0 个 免 费 学 术 资 源 数 据 库 送 给 你 ↓ ↓ 中 文 、 外 文 数 据 库 ， 期 刊 、 电 子 书 、 学 位 论 文 、 会 议 资 料 都 囊 括 ！", "target": "B E S B E S B E B E B E B M E B E S S S B E S B E B M E S B E S B M E S B E B E S B E B E S B E S"}
 23 | {"source": "前 些 年 ， 杭 州 圈 湖 收 钱 的 做 法 废 了 ；", "target": "B M E S B E B E B E S B E B E S"}
 24 | {"source": "加 拿 大 人 猎 杀 海 豹 残 忍 画 面 】", "target": "B M E S B E B E B E B E S"}
 25 | {"source": "努 力 的 意 义 ， 不 在 于 一 定 会 让 你 取 得 多 大 的 成 就 ， 只 是 让 你 在 平 凡 的 日 子 里 ， 活 得 比 原 来 的 那 个 自 己 更 好 一 点 新 的 一 天 ， 早 安", "target": "B E S B E S S B E B E S S S B E S S S B E S B E S S S B E S B E S S S S S B E S S S B E S S B E S S S S S B E"}
 26 | {"source": "北 欧 航 空 公 司 在 成 立 7 0 周 年 之 际 ， 发 布 了 一 系 列 拍 摄 于 2 0 世 纪 5 0 年 代 至 8 0 年 代 的 机 舱 照 片 ， 这 些 古 色 古 香 的 照 片 向 人 们 展 示 了 那 个 年 代 乘 客 们 享 用 的 飞 机 餐 。", "target": "B E B E B E S B E B E B E B E S B E S B M E B E S B M M E B M M E S B M M E S B E B E S B E B M M E S B E S B E B E S S S B E B M E B E S B M E S"}
 27 | {"source": "半 睡 半 醒 的 M e l a n i e 起 床 后 看 到 了 4 名 蒙 面 人 开 着 铲 车 将 银 行 提 款 机 放 入 货 车 中 的 画 面 …", "target": "B M M E S B M M M M M E B E S B E S S S B M E S S B E S B E B M E B E B E S S B E S"}
 28 | {"source": "厨 师 “ 拉 尔 夫 ” 被 送 到 陌 生 的 地 区 ， 身 无 分 文 、 语 言 不 通 ， 只 能 通 过 “ 美 食 ” 和 当 地 人 沟 通 ， 并 获 取 回 家 的 路 费 。", "target": "B E S B M E S S B E B E S B E S B M M E S B E S S S B E B E S B E S S B M E B E S S B E B E S B E S"}
 29 | {"source": "只 有 在 我 们 不 需 要 外 来 的 赞 许 时 ， 才 会 变 得 自 由 。", "target": "S S S B E S B E B E S B E S S S S S S B E S"}
 30 | {"source": "【 联 合 国 首 度 公 开 选 秘 书 长 8 名 候 选 人 将 接 受 面 试 】", "target": "S B M E B E B E S B M E S S B M E S B E B E S"}
 31 | {"source": "吓 死 宝 宝 了 ， 这 是 真 的 吗 看 完 之 后 整 个 人 都 精 神 了 ！", "target": "B E B E S S S S S S S B E B E B E S S B E S S"}
 32 | {"source": "此 举 为 了 训 练 学 生 的 核 心 肌 群 与 背 部 肌 肉 ， 收 到 学 生 和 家 长 们 对 此 热 烈 欢 迎 。", "target": "S S S S B E B E S B E B E S B E B E S B E B E S B M E S S B E B E S"}
 33 | {"source": "私 观 点 ： 工 人 日 报 ： 职 业 女 性 如 何 体 面 生 二 孩 ？", "target": "B M E S B E B E S B E B E B E B E S B E S"}
 34 | {"source": "O 网 页 链 接 一 名 大 一 男 生 现 场 哇 哇 直 叫 ， 称 “ 如 绞 肉 机 把 全 身 撕 裂 ” ， 中 途 无 法 忍 受 放 弃 ， 随 即 哭 着 给 妈 打 电 话 ， 感 叹 当 妈 不 易 ， 要 好 好 孝 敬 妈 妈 。", "target": "S B E B E S S B E B E B E B E B E S S S S B M E S B E B E S S B E B E B E B E S B E S S S S S B E S B E S S S S S S B E B E B E S"}
 35 | {"source": "至 此 ， 梁 启 超 九 个 子 女 都 已 辞 世 。", "target": "B E S B M E S S B E S S B E S"}
 36 | {"source": "中 国 4 月 外 汇 储 备 3 2 1 9 0 亿 美 元 ， 高 于 预 期 ， 较 前 值 3 2 1 2 6 亿 美 元 小 幅 上 升 ， 为 连 续 第 二 个 月 回 升 ， 也 是 2 0 1 4 年 6 月 以 来 首 次 出 现 连 续 两 个 月 环 比 上 升 。", "target": "B E B E B E B E B M M M M E B E S B E B E S S B E B M M M M E B E B E B E S S B E B E S S B E S S S B M M M E B E B E B E B E B E S S S B E B E S"}
 37 | {"source": "【 最 具 幸 福 感 职 业 T O P 1 0 出 炉 你 的 职 业 上 榜 了 吗 ？ 】", "target": "S S S B M E B E B M M M E B E S S B E B E S S S S"}
 38 | {"source": "全 国 图 书 交 易 博 览 会 自 1 9 8 0 年 开 始 举 办 ， 是 我 国 文 化 领 域 举 办 最 早 的 展 会 之 一 。", "target": "B E B E B E B M E S B M M M E B E B E S S B E B E B E B E S S S B E B E S"}
 39 | {"source": "【 纽 约 市 民 也 为 “ 学 区 房 ” 操 碎 心 美 教 授 称 “ 学 区 ” 是 影 响 美 房 地 产 的 重 要 因 素 之 一 】", "target": "S B E B E S S S B M E S B M E S B E S S B E S S B E S B M E S B E B E B E S"}
 40 | {"source": "今 日 嘉 宾 ： 中 国 现 代 国 际 关 系 研 究 院 研 究 员 钱 立 伟 , 中 国 国 际 广 播 电 台 资 深 评 论 员 洪 琳 ， 今 晚 1 9 : 3 4 兵 团 卫 视 ， 敬 请 期 待 ！", "target": "B E B E S B E B E B E B E B M E B M E B M E S B E B E B E B E B E B M E B E S B E B M M M E B E B E S B E B E S"}
 41 | {"source": "巴 西 是 否 正 在 上 演 一 场 现 实 版 逼 宫 ？", "target": "B E B E B E B E S S B M E B E S"}
 42 | {"source": "从 张 北 县 政 府 网 的 信 息 来 看 ， 收 费 经 过 听 证 会 论 证 通 过 ， 由 县 政 府 批 准 。", "target": "S B M E B M E S B E S S S B E B E B M E B E B E S S B M E B E S"}
 43 | {"source": "张 家 口 境 内 被 誉 为 “ 中 国 6 6 号 公 路 ” 的 “ 草 原 天 路 ” 进 入 收 费 时 代 ， 门 票 5 0 元 / 人 次 。", "target": "B M E B E S B E S B E B M E B E S S S B E B E S B E B E B E S B E B E S S B E S"}
 44 | {"source": "生 活 中 ， 这 些 至 关 重 要 的 证 件 一 旦 丢 失 ， 免 不 了 一 阵 手 忙 脚 乱 。", "target": "B E S S B E B M M E S B E B E B E S B M E S S B M M E S"}
 45 | {"source": "邮 政 速 递 工 作 人 员 ： “ 保 价 1 5 0 0 元 ， 如 果 物 品 丢 失 ， 只 能 赔 1 5 0 0 。 ”", "target": "B E B E B E B E S S B E B M M E S S B E B E B E S B E S B M M E S S"}
 46 | {"source": "作 为 第 六 届 北 京 国 际 电 影 节 的 重 要 活 动 之 一 ， 中 外 电 影 合 作 论 坛 1 7 日 下 午 在 北 京 举 行 。", "target": "B E B E S B E B E B M E S B E B E B E S B E B E B E B E B M E B E S B E B E S"}
 47 | {"source": "做 了 2 5 年 民 营 医 疗 的 莆 田 人 陈 永 利 并 没 有 医 师 执 照 ， 但 除 了 皮 肤 病 ， 他 自 称 肝 病 、 性 病 等 都 能 治 。", "target": "S S B E S B E B E S B M E B M E S B E B E B E S S B E B M E S S B E B E S B E S S S S S"}
 48 | {"source": "中 国 迈 开 星 际 旅 行 第 一 步 ！ 】", "target": "B E B E B E B E B E S S S"}
 49 | {"source": "在 欢 乐 中 学 习 英 语 ， 没 看 过 的 一 部 接 着 一 部 来 ！", "target": "S B E S B E B E S S S S S S S B E S S S S"}
 50 | {"source": "【 资 本 炒 作 网 红 是 因 为 逐 利 ， 而 鼓 励 创 业 者 做 网 红 则 是 一 种 误 导 】", "target": "S B E B E B E S B E B E S S B E B M E S B E S S S S B E S"}
 51 | {"source": "刘 某 向 工 商 部 门 提 供 了 找 人 刷 单 的 打 款 记 录 ， 印 证 了 他 的 说 法 。", "target": "B E S B E B E B E S S S B E S B E B E S B E S S S B E S"}
 52 | {"source": "他 们 先 是 通 过 改 装 P O S 机 、 发 钓 鱼 链 接 及 黑 客 W I F I 盗 取 银 行 卡 信 息 ， 再 将 信 息 批 量 卖 出 。", "target": "B E S S B E B E B M M E S S B E B E S B E B M M E B E B M E B E S S S B E B E B E S"}
 53 | {"source": "未 来 商 圈 一 定 是 虚 拟 和 物 理 中 间 的 一 个 结 合 点 ， 我 们 要 聘 请 迪 斯 尼 编 剧 和 中 美 韩 一 流 团 队 做 3 D 动 画 片 ， 强 化 品 牌 I P 。", "target": "B E B E B E S B E S B E B E S S S B M E S B E S B E B M E B E S S S S B E B E S B E B M E S B E B E B E S"}
 54 | {"source": "【 赶 紧 自 查 ： 这 样 的 快 递 包 装 可 能 有 毒 又 有 害 】", "target": "S B E B E S B E S B E B E B E B E S B E S"}
 55 | {"source": "农 村 消 费 习 惯 还 是 “ 信 任 经 济 ” 。", "target": "B E B E B E B E S B E B E S S"}
 56 | {"source": "一 些 被 注 销 驾 驶 证 的 “ 毒 驾 ” 人 员 ， 使 用 已 注 销 的 驾 驶 证 注 册 成 为 网 约 车 驾 驶 员 。", "target": "B E S B E B M E S S B E S B E S B E S B E S B M E B E B E B M E B M E S"}
 57 | {"source": "可 见 最 近 流 行 的 不 再 是 情 怀 和 特 效 ， 而 是 性 冷 淡 。", "target": "B E B E B E S B E S B E S B E S S S B M E S"}
 58 | {"source": "让 我 回 到 天 真 无 邪 的 童 年 去 吧 ～", "target": "S S B E B M M E S B E S S S"}
 59 | {"source": "阿 甘 正 传 廊 桥 遗 梦 ， 哪 一 部 能 打 动 你 ？", "target": "B E B E B M M E S S S S S B E S S"}
 60 | {"source": "这 也 是 沈 腾 首 次 大 胆 跨 界 ， 尝 试 说 相 声 ， 岳 云 鹏 小 岳 岳 抓 住 时 机 ， 欺 负 新 搭 档 ~", "target": "S S S B E B E B E B E S B E S B E S B M E B M E B E B E S B E S B E S"}
 61 | {"source": "天 线 宝 宝 长 大 后 ， 哎 呀 妈 呀 ， 亮 瞎 我 的 双 眼 了 ！", "target": "B E B E B E S S B E B E S B E S S B E S S"}
 62 | {"source": "创 造 者 T h e M a k e r ( 很 感 人 的 动 画 短 片 ) 2 . 5 2 ″ 开 始 的 小 提 琴 部 分 更 是 好 燃", "target": "B M E B M M M M M M E S S B E S B E B E S B M M M E B E S B M E B E S S B E"}
 63 | {"source": "【 整 理 出 近 年 来 香 港 最 好 看 的 9 部 恐 怖 电 影 】", "target": "S B E S B M E B E S B E S S S B E B E S"}
 64 | {"source": "这 些 超 级 疯 狂 叫 醒 人 的 方 法 , 真 的 会 死 人 的 ~ ~ ~", "target": "B E B E B E B E S S B E S B E S B E S B M E"}
 65 | {"source": "无 限 挑 战 中 一 场 经 典 的 内 裤 大 秀 ， 哈 哈 哈 G D 真 的 毫 无 偶 像 包 袱 ， 太 妖 娆 了 ！", "target": "B E B E S S S B E S B E B E S B M E B E B E B E B E B E S S B E S S"}
 66 | {"source": "【 推 荐 9 部 你 值 得 至 少 看 三 遍 的 热 门 电 影 】", "target": "S B E S S S B E B E S S S S B E B E S"}
 67 | {"source": "【 推 荐 一 些 受 欢 迎 的 电 影 】", "target": "S B E B E S B E S B E S"}
 68 | {"source": "先 M a r k 再 尽 情 欢 笑 吧 ！", "target": "S B M M E S B E B E S S"}
 69 | {"source": "各 类 电 影 大 总 结 ， 让 你 3 年 都 看 不 完 ~", "target": "S S B E S B E S S S S S S S S S S"}
 70 | {"source": "奥 斯 卡 最 佳 动 画 短 片 《 神 奇 飞 书 》 ， 想 象 力 丰 富 ， 画 面 配 乐 都 很 棒 ， 我 们 的 生 命 因 为 阅 读 而 变 得 充 满 色 彩 ， 很 温 馨 。", "target": "B M E S S B E B E S B E B E S S B M E B E S B E B E S S S S B E S B E B E B E S S S B E B E S S B E S"}
 71 | {"source": "“ 有 时 像 患 了 忧 郁 症 一 样 ， 会 突 然 心 情 不 好 。", "target": "S B E S S S B M E B E S S B E B E S S S"}
 72 | {"source": "《 愤 怒 的 小 鸟 》 大 电 影 首 曝 片 段 ！", "target": "S B E S B E S S B E B E B E S"}
 73 | {"source": "男 生 由 帅 变 丑 是 一 种 怎 样 的 体 验 ？", "target": "B E S S S S S S S B E S B E S"}
 74 | {"source": "可 以 留 着 无 聊 的 时 候 找 来 看 ~", "target": "B E S S B E S B E B E S S"}
 75 | {"source": "日 剧 《 女 王 的 教 室 》 中 天 海 佑 希 的 一 段 霸 气 训 话 ： 如 果 有 活 到 老 学 到 老 的 想 法 ， 那 就 有 无 限 的 可 能 性 。", "target": "B E S B E S B E S S B M M E S S S B E B E S B E S S S S S S S S B E S S S S B E S B M E S"}
 76 | {"source": "合 集 包 括 《 海 街 日 记 》 《 在 世 界 的 中 心 呼 唤 爱 》 《 深 呼 吸 的 必 要 》 《 泪 光 闪 闪 》 《 暗 堡 里 的 三 恶 人 》 《 群 青 》 《 纯 洁 脆 弱 的 心 》 等 等 ， 一 共 1 1 部 。", "target": "B E B E S B E B E S S S B E S B E B E S S S B M E S B E S S B E B E S S B E S S S B E S S B E S S B E B E S S S B E S B E B E S S"}
 77 | {"source": "哈 莉 · 奎 茵 在 《 X 特 遣 队 》 （ 又 名 《 自 杀 小 队 》 ） 第 三 支 正 式 预 告 片 里 的 惊 艳 表 现 ！", "target": "B M M M E S S B M M E S S S S S B E B E S S B E S B E B M E S S B E B E S"}
 78 | {"source": "近 日 ， 日 本 东 宝 公 司 发 布 《 哥 斯 拉 ： 复 活 》 首 款 正 式 预 告 ： ， 开 头 就 怪 兽 造 型 就 全 屏 放 出 ！ ！", "target": "B E S B E B E B E B E S B M E S B E S S S B E B E S S B E S B E B E S B E B E S S"}
 79 | {"source": "凭 借 自 己 的 能 力 去 完 成 。", "target": "B E B E S B E S B E S"}
 80 | {"source": "小 时 候 的 经 典 ， 百 看 不 厌 ， 再 看 一 次 还 是 觉 得 看 不 够 ~", "target": "B M E S B E S B M M E S S S S S B E B E S S S S"}
 81 | {"source": "如 果 《 美 国 队 长 3 》 对 战 双 方 都 穿 上 队 服 再 决 斗 …", "target": "B E S B E B E S S B E B E S B E B E S B E S"}
 82 | {"source": "如 果 在 评 论 里 写 一 句 话 ， 你 喜 欢 的 人 能 看 到 ， 你 想 对 t a 说 什 么 _ _ _ _ _ _ _ _ _ _ _ _ _ _", "target": "B E S B E S S S S S S S B E S S S B E S S S S B E S B E B M M M M M M M M M M M M E"}
 83 | {"source": "三 大 运 营 商 2 0 1 5 年 合 计 净 利 润 达 1 3 9 1 . 1 4 亿 元 ， 平 均 日 赚 3 . 8 1 亿 元 。", "target": "S S B M E B M M M E B E B M E S B M M M M M M E S S B E B E B M M M E S S"}
 84 | {"source": "【 意 外 ： 现 在 的 W i n 1 0 其 实 是 W i n 9 ！ 】", "target": "S B E S B E S B M M M E B E S B M M E S S"}
 85 | {"source": "年 轻 一 代 着 迷 于 星 座 的 人 不 在 少 数 ， 搜 狗 特 别 针 对 这 些 星 座 拥 趸 推 出 了 星 座 专 版 浏 览 器 ， 首 批 上 线 的 白 羊 座 、 双 鱼 座 、 处 女 座 和 天 蝎 座 都 拥 有 独 立 的 界 面 ， 而 且 还 暗 藏 各 色 星 座 彩 蛋 ， 让 很 多 用 户 打 呼 贴 心 有 巧 思 ！", "target": "B E B E B E S B E S S S S B E S B E B E B E B E B E B E B E S B E B E B M E S S S B E S B M E S B M E S B M E S B M E S B E B E S B E S B E S B E B E B E B E S S B E B E B E B E S B E S"}
 86 | {"source": "除 S p a c e X ， 这 位 I T 奇 才 还 是 P a y P a l 、 T e s l a 和 S o l a r c i t y 的 创 始 人 。", "target": "S B M M M M E S S S B E B E B E B M M M M E S B M M M E S B M M M M M M M E S B M E S"}
 87 | {"source": "京 东 3 C 敢 玩 、 会 玩 ， 这 到 底 是 商 业 广 告 还 是 公 益 广 告 啊 ？", "target": "B E B E S S S S S S S B E S B E B E S S B E B E S S"}
 88 | {"source": "亚 马 逊 本 周 将 发 新 K i n d l e ， 而 其 官 方 天 猫 旗 舰 店 抢 先 泄 露 了 新 品 K i n d l e o a s i s 的 资 料 （ 目 前 已 下 架 ） 。", "target": "B M E B E S S S B M M M M E S S S B E B E B M E B E B E S B E B M M M M M M M M M E S B E S B E S B E S S"}
 89 | {"source": "开 车 要 牢 记 的 1 0 0 条 常 识", "target": "B E S B E S B M E S B E"}
 90 | {"source": "【 超 实 用 的 旅 游 达 人 必 备 技 能 ！ 】", "target": "S S B E S B E B E B E B E S S"}
 91 | {"source": "鲜 虾 杂 蔬 燕 麦 蛋 羹 : 浓 浓 虾 味 夹 着 蛋 羹 ， 吃 上 一 口 r e a l 满 足 ！", "target": "B M E B E B M E S B E B E S S B E S S S S S B M M E B E S"}
 92 | {"source": "3 0 个 中 国 人 应 该 了 解 的 神 话 典 故", "target": "B E S B E S B E B E S B E B E"}
 93 | {"source": "双 臂 伸 直 ， 手 掌 朝 下 ， 静 止 保 持 1 0 分 钟 左 右 ， 便 会 感 到 困 倦 。", "target": "B E B E S B E S S S B E B E B E B E B E S S S B E B E S"}
 94 | {"source": "看 完 笑 死 我 了 ， 这 是 考 科 目 二 的 节 奏 啊 ！", "target": "B E B E S S S S S S B M E S B E S S"}
 95 | {"source": "【 老 外 发 短 信 时 常 用 的 英 文 缩 写 】", "target": "S B E S B E S B E S B E B E S"}
 96 | {"source": "如 果 你 还 不 会 用 键 盘 快 捷 键 ， 只 能 说 你 o u t 了 ！", "target": "B E S S S S S B E B M E S B E S S B M E S S"}
 97 | {"source": "如 果 外 星 人 尝 试 用 一 种 加 密 的 二 进 制 方 法 联 系 我 们 ， 人 类 能 够 理 解 它 们 发 出 的 信 息 吗 ？", "target": "B E B M E B E S S S B E S B M E B E B E B E S B E B E B E B E B E S B E S S"}
 98 | {"source": "该 项 目 尝 试 利 用 3 D 打 印 技 术 去 实 现 模 块 化 的 人 体 ， 这 一 概 念 的 提 出 者 称 ： “ 如 果 我 们 可 以 打 印 器 官 和 人 体 的 某 些 部 分 ， 那 么 为 何 不 彻 底 重 定 义 、 重 新 设 计 人 体 ？ ”", "target": "S B E B E B E B E B E B E S B E B M E S B E S S S B E S B M E S S S B E B E B E B E B E S B E S B E B E S B E B E S B E B M E S B E B E B E S S"}
 99 | {"source": "据 和 讯 引 述 《 金 融 时 报 》 网 络 版 报 道 ， F a c e b o o k 在 中 国 一 起 商 标 诉 讼 中 获 胜 ， 这 被 认 为 是 双 方 关 系 进 一 步 朝 正 面 发 展 的 积 极 信 号 。", "target": "S B E B E S B E B E S B M E B E S B M M M M M M E S B E S S B E B E S B E S S S B E S B E B E B M E S B E B E S B E B E S"}
100 | {"source": "三 星 今 年 将 要 发 布 的 G a l a x y N o t e 6 将 支 持 I P 6 8 级 防 水 ， 手 机 浸 泡 在 1 . 5 米 深 的 水 中 3 0 分 钟 ， 仍 然 可 以 正 常 工 作 。", "target": "B E B E S S B E S B M M M M E B M M M E S B E B M M E S B E S B E B E S B M E S S S B E B E B E S B E B E B E B E S"}
101 | {"source": "经 过 几 个 月 的 测 试 ， 锤 子 S m a r t i s a n O S v 2 . 6 . 0 终 于 正 式 发 布 了 ， 本 次 更 新 新 增 功 能 5 0 余 项 ， 优 化 细 节 6 0 余 项 ， 修 复 b u g 若 干 ， 我 的 天 啦 ！ ！", "target": "B E S S S S B E S B E B M M M M M M M M M M M M M M M E B E B E B E S S S S B E B E B E B M E S S B E B E B M E S S B E B M E B E S S S S S S S"}
102 | {"source": "除 了 手 机 之 外 ， 黑 莓 也 宣 布 对 这 款 黑 莓 机 的 周 边 配 件 降 价 三 成 。", "target": "B E B E B E S B E S B E S S S B M E S B E B E B E B E S"}
103 | {"source": "尽 管 这 一 消 息 还 没 有 得 到 确 认 ， 但 是 雷 蛇 C E O 陈 明 亮 在 T w i t t e r 发 布 的 一 则 招 聘 消 息 透 露 了 进 军 手 机 市 场 的 想 法 。", "target": "B E S S B E S B E B E B E S B E B E B M E B M E S B M M M M M E B E S S S B E B E B E S B E B E B E S B E S"}
104 | {"source": "据 说 是 明 天 要 发 布 的 @ 魅 族 科 技 魅 族 P r o 6 长 这 样 ， 大 家 觉 得 如 何 ？", "target": "B E S B E S B E S S B E B E B E B M M E S B E S B E B E B E S"}
105 | {"source": "码 农 们 ， 还 记 得 大 明 湖 畔 的 W i n d o w s 9 5 吗 ？", "target": "B M E S S B E B M M E S B M M M M M M M E S S"}
106 | {"source": "程 序 员 面 试 的 标 准 答 案 并 不 标 准", "target": "B M E B E S B E B E B E B E"}
107 | {"source": "【 H T M L 5 C a n v a s 图 表 应 用 R G r a p h 图 表 功 能 非 常 强 大 】", "target": "S B M M M M M M M M M E B E B E B M M M M E B E B E B E B E S"}
108 | {"source": "谷 歌 和 百 度 怎 样 成 为 善 恶 的 两 极", "target": "B E S B E B E B E B E S B E"}
109 | {"source": "呵 呵 ， 这 一 年 多 ， 好 像 就 没 有 不 \" 双 子 \" 一 下 的 事 情 了 ， 剧 情 不 \" 反 转 \" 好 像 倒 不 正 常 了", "target": "B E S S S S S S B E S B E S S B E S B E S B E S S B E S S B E S B E S S B E S"}
110 | {"source": "下 午 专 心 写 段 子 ， 今 天 交 易 已 经 结 束 了 。", "target": "B E B E S B E S B E B E B E B E S S"}
111 | {"source": "前 面 那 篇 文 章 的 观 点 是 一 个 汇 总 ， 对 多 数 人 都 应 该 是 有 帮 助 的 。", "target": "B E S S B E S B E S S S B E S S B E S S B E S S B E S S"}
112 | {"source": "嘿 嘿 ， 有 没 有 发 现 一 个 问 题 ， 今 天 是 网 金 在 救 场 ， 不 是 郭 嘉 队", "target": "B E S S S S B E S S B E S B E S B E S B E S S S B E S"}
113 | {"source": "我 相 信 我 这 套 方 法 和 9 9 % 的 市 场 参 与 者 都 不 一 样 ， 最 核 心 就 是 他 们 在 选 自 己 喜 欢 的 ， 比 如 夹 头 。", "target": "S B E S S S B E S B M E S B E B M E S S B E S S B E S S B E S S B E B E S S B E B E S"}
114 | {"source": "你 们 在 里 面 考 试 ， 我 在 外 面 监 考 听 说 发 此 图 会 提 升 逼 格", "target": "B E S B E B E S S S B E B E B E S B E S B E B E"}
115 | {"source": "处 女 座 处 女 座 处 女 座 处 女 座 处 女 座 处 女 座 处 女 座 处 女 座 …", "target": "B M E B M E B M E B M E B M E B M E B M E B M E S"}
116 | {"source": "大 大 讲 话 都 不 好 使 ， 这 届 股 民 不 行", "target": "B E B E S S B E S S S B E B E"}
117 | {"source": "大 家 要 习 惯 郭 嘉 队 老 司 机 来 调 控 节 奏 ， 不 喜 欢 、 玩 不 过 就 可 以 选 择 G U N （ 注 意 ， 不 是 英 语 ） ， 适 者 生 存 ， 韭 菜 们 靠 押 单 边 就 赚 钱 的 好 日 子 肯 定 还 早 ， 这 是 毋 庸 置 疑 的 ， 最 简 单 的 判 断 ， 神 马 时 候 沪 市 交 易 量 稳 定 在 3 K 亿 之 上 你 再 试 图 押 单 边 。", "target": "B E S B E B E S B M E S B E B E S S B E S B M E S B E B E B M E S B E S S S B E S S B E B E S B M E S S B E S S S S S B E B E S S S S S B M M E S S S B E S B E S B E B E B E B M E B E S B M E B E S S B E S B E S"}
118 | {"source": "正 面 ： 他 的 时 间 不 值 钱 了 ， 公 司 强 大 了 ， 团 队 成 熟 了 ， 不 需 要 “ 自 己 ” 了 。", "target": "B E S S S B E S B E S S B E B E S S B E B E S S S B E S B E S S S"}
119 | {"source": "事 实 只 有 一 个 ， 如 果 在 你 的 推 演 之 外 ， 这 就 是 超 预 期 也 是 “ 看 不 见 ” 导 致 的 风 险 。", "target": "B E S S S S S B E S S S B E B E S S S S S B E S S S S S S S B E S B E S"}
120 | {"source": "好 吧 ， 这 已 经 超 出 了 得 瑟 的 范 畴 了 。", "target": "B E S S B E B E S B E S B E S S"}
121 | {"source": "如 果 再 有 板 ， 我 就 只 能 照 规 矩 打 了", "target": "B E S B E S S S B E S B E S S"}
122 | {"source": "玩 互 联 网 也 逃 不 过 这 个 ， 装 看 不 见 和 狗 屁 不 懂 是 两 个 层 次 屌 丝 创 业 是 真 不 懂 ， 骗 子 搞 事 是 装 看 不 见", "target": "S B M E S S S S S S S S S S S S B E S S S S S B E B E B E S S S S S B E B E S S S S S"}
123 | {"source": "如 果 盘 面 还 这 样 ， 明 天 就 把 票 全 改 成 有 \" 金 \" 的 顺 天 而 行", "target": "B E B E S B E S B E S S S S B E S S S S S B M M E"}
124 | {"source": "如 果 这 种 标 准 话 术 都 能 上 当 ， 我 就 只 当 \" 洗 粉 \" 了 …", "target": "B E S S B E B E S S B E S S S S S S B E S S S"}
125 | {"source": "存 量 打 架 ， 就 是 狗 咬 狗 ， 能 够 确 定 的 是 ： 没 有 钱 没 有 钱 没 有 钱 …", "target": "B E B E S S S S S S S B E B E S S S B E S B E S B E S S"}
126 | {"source": "已 经 5 月 份 了 ， 还 咣 咣 咣 往 外 扔 政 策 ， 这 不 合 节 气 啊 ！", "target": "B E B M E S S S B M E B E S B E S S S S B E S S"}
127 | {"source": "要 不 是 指 数 跌 得 像 坨 屎 ， 按 照 我 的 牌 路 现 在 该 上 板 车 了 太 尼 玛 搞 了 ， 今 天 是 什 么 情 况 ？", "target": "B M E B E B E S S S S B E S S B E B E S S B E S S B E S S S B E S B E B E S"}
128 | {"source": "市 场 成 本 在 缓 慢 抬 升 ， 偏 保 守 的 资 金 也 开 始 下 注 ， 问 题 是 各 玩 各 的 ， 没 有 一 致 性 ， 不 洗 牌 成 不 了 大 事 。", "target": "B E B E S B E B E S S B E S B E S B E B E S B E S S S S S S B E B M E S S B E S S S B E S"}
129 | {"source": "市 场 状 况 好 的 情 况 下 ， 我 经 常 做 法 是 同 时 开 好 几 局 ， 打 的 很 \" 松 \" ， 追 求 弹 性 大 的 票 ， 目 的 其 实 就 是 把 赔 率 变 高 （ 板 车 ） ；", "target": "B E B E S S B E S S S B E B E S B E S B E S S S S S S S S S B E B E S S S S B E B E S S S B E B E S B E S S"}
130 | {"source": "带 儿 子 放 风 筝 ， 一 溜 小 跑 ， 上 天 ， 满 屏 点 赞 ， 仰 慕 。", "target": "S B E S B E S S S B E S B E S B E B E S B E S"}
131 | {"source": "一 条 关 于 “ 土 地 使 用 权 年 限 到 期 或 即 将 到 期 ， 需 花 费 房 价 三 成 费 用 ‘ 买 地 ’ 才 能 重 新 办 理 土 地 证 ” 的 消 息 引 人 关 注 ， 很 多 购 房 者 都 开 始 检 查 自 己 买 的 房 子 土 地 使 用 权 年 限 。", "target": "S S B E S B E B M E B E B E S B E B E S S B E B E B E B E S B E S S S B E B E B M E S S B E S S B E S B E B M E S B E B E B E S S B E B E B M E B E S"}
132 | {"source": "商 务 咨 询 / 自 由 撰 稿 人 / 编 剧 / 活 动 策 划 / 猎 头 / 翻 译 / 记 者 / 自 媒 体 ， 这 是 8 0 后 女 孩 姚 夭 同 时 拥 有 的 8 种 身 份 ， 像 姚 夭 这 样 拥 有 多 重 职 业 的 人 被 称 为 “ 斜 杠 青 年 ” 。", "target": "B E B E S B E B M E S B E S B E B E S B E S B E S B E S B M E S S S B E S B E B E B E B E S S S B E S S B E B E B E B E B E S S S B E S B E B E S S"}
133 | {"source": "工 作 地 在 上 海 ， 同 学 们 快 加 入 我 们 吧 ～ 简 历 请 投 ： w i r e l e s s @ y i c a i . c o m", "target": "B M E S B E S B M E S B E B E S S B E S S S B M M M M M M M M M M M M M M M M E"}
134 | {"source": "业 内 人 士 称 ， 这 两 年 有 很 多 人 跟 风 进 入 P 2 P 行 业 ， 实 则 根 本 就 不 知 道 这 个 行 业 是 怎 么 回 事 ， 只 能 艰 难 维 持 。", "target": "B E B E S S S S S S B E S B E B E B M E B E S B E B E S S B E S S B E S B E S S S B E B E B E S"}
135 | {"source": "中 纪 委 ： 交 纳 党 费 是 党 员 最 基 本 的 义 务 ， 当 年 革 命 先 烈 ， 即 使 在 敌 人 的 监 狱 里 ， 临 刑 前 也 要 给 组 织 交 最 后 一 次 党 费 。", "target": "B M E S B E B E S B E S B E S B E S B E B E B E S B E S B E S B E S S B E S S S S B E S B E S S B E S"}
136 | {"source": "发 改 委 消 息 ， 根 据 《 石 油 价 格 管 理 办 法 》 第 六 条 规 定 ， 本 次 汽 、 柴 油 价 格 暂 不 作 调 整 。", "target": "B M E B E S B E S B E B E B E B E S B E S B E S S S S S B E B E S S S B E S"}
137 | {"source": "【 怒 江 水 电 项 目 搁 浅 1 0 年 后 复 活 已 暗 中 开 发 1 3 年 】", "target": "S B E B E B E B E B E S S B E S B E B E B E S S"}
138 | {"source": "保 持 适 度 流 动 性 ， 实 现 货 币 信 贷 和 社 会 融 资 规 模 合 理 增 长 。", "target": "B E B E B M E S B E B E B E S B E B E B E B E B E S"}
139 | {"source": "公 告 中 还 提 到 ， 除 地 铁 集 团 外 ， 公 司 于 2 0 1 5 年 1 2 月 2 5 日 就 拟 议 交 易 与 另 一 名 潜 在 交 易 对 手 方 签 署 了 一 份 不 具 有 法 律 约 束 力 的 合 作 意 向 书 。", "target": "B E S S B E S S B E B E S S B E S B M M M E B M E B M E S S S B E S S S S B E B E B M E B E S S S S B E B E B M E S B E B M E S"}
140 | {"source": "对 于 已 供 应 、 未 开 发 的 房 地 产 开 发 用 地 ， 可 申 请 转 换 用 途 或 调 整 商 住 比 例 。", "target": "B E S B E S S B E S B M E B E B E S S B E B E B E S B E B E B E S"}
141 | {"source": "北 京 一 位 公 募 投 资 总 监 认 为 ， 这 种 情 况 可 能 是 一 些 游 资 希 望 做 短 差 吸 引 其 他 游 资 或 散 户 跟 进 ， 从 而 拉 高 出 货 ， 在 短 期 赚 钱 。", "target": "B E S S B E B E B E B E S S S B E B E S B E B E B E S B E B E B E B E S B E B E S B E B E B E S S B E B E S"}
142 | {"source": "尤 其 是 ， 一 些 年 代 比 较 久 的 二 手 房 是 否 会 受 到 影 响 呢 ?", "target": "B M E S B E B E B E S S B M E B E S B E B E S S"}
143 | {"source": "【 市 场 准 入 负 面 清 单 包 括 禁 止 部 分 新 建 煤 炭 、 钢 铁 项 目 】", "target": "S B E B E B E B E B E B E B E B E B E S B E B E S"}
144 | {"source": "【 香 港 推 出 系 列 举 措 重 塑 旅 游 形 象 】", "target": "S B E B E B E B E B E B E B E S"}
145 | {"source": "《 规 划 》 明 确 将 京 津 冀 地 区 划 定 为 减 量 优 化 区 、 存 量 挖 潜 区 、 增 量 控 制 区 和 适 度 发 展 区 四 个 区 位 ， 明 确 各 区 土 地 利 用 原 则 和 利 用 导 向 根 据 这 四 个 区 位 来 供 应 土 地 。", "target": "S B E S B E S B M E B E B E S B E B M E S B E B M E S B E B M E S B E B M E S S B E S B E S S B E B E B E S B E B E B E S S S B E S B E B E S"}
146 | {"source": "一 种 经 不 起 试 错 、 纠 偏 、 大 折 腾 的 金 融 监 管 体 系 可 以 怎 么 改 革 ？", "target": "S S S S S B E S B E S S B E S B E B E B E B E B E B E S"}
147 | {"source": "不 少 蔬 菜 价 格 动 辄 翻 倍 、 涨 幅 惊 人 。", "target": "B E B E B E B E B E S B E B E S"}
148 | {"source": "中 国 指 数 研 究 院 公 布 的 最 新 监 测 数 据 显 示 ， 上 周 （ 4 月 3 日 - 4 月 9 日 ） 受 监 测 的 重 点 城 市 中 近 九 成 城 市 楼 市 成 交 量 下 滑 。", "target": "B E B E B M E B E S S S B E B E B E S B E S B E B E S B E B E S S B E S B E B E S S B E B E B E B M E B E S"}
149 | {"source": "中 国 证 券 结 算 机 构 的 数 据 显 示 ， 3 月 份 中 国 资 产 超 过 1 0 0 0 万 元 人 民 币 的 股 票 账 户 数 量 增 加 了 2 0 % ， 而 资 产 少 于 1 0 万 元 的 账 户 数 量 减 少 了 3 . 9 % 。", "target": "B E B E B E B E S B E B E S B M E B E B E B E B M M M E S B M E S B E B E B E B E S B M E S S B E B E B M E S S B E B E B E S B M M E S"}
150 | {"source": "【 板 子 不 能 打 在 民 营 医 疗 身 上 】", "target": "S B E S S B E B E B E B E S"}
151 | {"source": "中 西 部 承 接 产 业 转 移 鼓 励 政 策 将 兼 顾 产 业 转 出 地 和 转 入 地 两 方 ， 从 土 地 、 财 税 、 保 险 、 用 工 等 多 方 面 集 合 发 力 ， 引 导 加 工 贸 易 产 业 从 东 部 沿 海 地 区 转 向 中 西 部 地 区 。", "target": "B M E B E B E B E B E B E S B E B E B M E S B M E B E S S B E S B E S B E S B E S S B E B E B E S B E B E B E B E S B E B E B E B E B M E B E S"}
152 | {"source": "今 年 4 月 ， 高 盛 调 低 了 对 在 港 上 市 的 S O H O 中 国 的 评 级 ， 对 其 评 级 由 “ 中 性 ” 降 至 “ 沽 售 ” 。", "target": "B E B E S B E B E S S B E B E S B M M E B E S B E S S S B E S S B E S B E S B E S S"}
153 | {"source": "他 认 为 ， 小 股 票 、 次 新 股 、 垃 圾 股 、 题 材 股 和 伪 成 长 股 等 黑 五 类 依 然 有 风 险 ， 新 兴 行 业 虽 有 很 大 的 前 景 ， 但 要 等 待 回 调 的 机 会 。", "target": "S B E S S B E S B M E S B M E S B M E S B M M E S B M E B E S B E S B E B E S S S S S B E S S S B E B E S B E S"}
154 | {"source": "以 往 政 府 方 面 出 台 有 关 楼 市 政 策 ， 前 期 均 会 和 开 发 商 进 行 广 泛 沟 通 。", "target": "B E B E B E B E B E B E B E S B E S S S B M E B E B E B E S"}
155 | {"source": "优 衣 库 的 母 公 司 — — 迅 销 有 限 公 司 近 日 发 布 了 悲 催 的 半 年 业 绩 数 据 ， 母 公 司 股 东 应 占 利 润 同 比 下 跌 5 5 . 1 % ， 创 下 近 年 来 的 最 大 跌 幅 。", "target": "B M E S B M E B E B E B E B E B E B E S B E S B E B E B E S B M E B E B M M E B E B E B M M M E S B E B M E S S S B E S"}
156 | {"source": "李 克 强 在 部 分 省 （ 市 ） 政 府 主 要 负 责 人 经 济 形 势 座 谈 会 上 表 示 ， 目 前 虽 然 一 些 行 业 市 场 需 求 有 所 好 转 ， 但 调 结 构 的 劲 不 能 松 ， 必 须 加 快 改 造 提 升 传 统 动 能 ， 淘 汰 落 后 产 能 ， 消 化 钢 铁 、 煤 炭 等 过 剩 产 能 ， 安 排 好 在 这 个 过 程 中 一 些 职 工 转 岗 和 保 障 基 本 生 活 工 作 。", "target": "B M E S B E S S S S B E B E B M E B E B E B M E S B E S B E B E B E B E B E B E B E B E S S S B E S S S S S S B E B E B E B E B E B E S B E B E B E S B E B E S B E S B E B E S B E S S S S B E S B E B E B E S B E B E B E B E S"}
157 | {"source": "富 豪 们 抓 住 了 春 季 行 情 ， 依 然 对 中 国 A 股 很 乐 观 。", "target": "B M E B E S B E B E S B E S B E B E S B E S"}
158 | {"source": "【 超 六 成 医 生 觉 得 累 患 者 满 意 度 不 低 】", "target": "S S B E B E B E S B E B M E S S S"}
159 | {"source": "【 身 长 3 6 . 1 c m 全 长 6 2 . 4 c m 成 都 这 只 竹 节 虫 成 】", "target": "S B E B M M E B E B E B M M E B E B E S S B M E S S"}
160 | {"source": "手 表 各 部 分 可 分 开 ， 从 表 盘 到 表 带 ， 每 部 分 都 可 持 续 更 新 和 升 级 ， D I Y 出 最 新 潮 最 称 心 的 智 能 表 。", "target": "B E S B E S B E S S B E S B E S S B E S S B E B E S B E S B M E S S B E S B E S B M E S"}
161 | {"source": "【 青 岛 ： 3 斤 螃 蟹 绑 7 两 皮 筋 商 户 被 罚 1 5 0 0 元 】", "target": "S B E S S S B E S S S B E B E S S B M M E S S"}
162 | {"source": "【 鹿 晗 @ M 鹿 M 昨 晚 发 微 博 晒 合 照 公 开 “ 新 对 象 ” 】", "target": "S B E S B M E B E S B E S B E B E S S B E S S"}
163 | {"source": "4 月 2 2 日 ， 湖 南 湘 乡 警 方 历 时 9 个 多 月 ， 成 功 捣 毁 了 一 个 涉 及 全 国 的 生 产 销 售 假 药 、 有 毒 有 害 食 品 团 伙 。", "target": "B E B M E S B E B E B E B E S S S S S B E B E S S S B E B E S B E B E B E S B E B E B E B E S"}
164 | {"source": "1 9 岁 被 查 出 结 肠 癌 后 ， 3 次 治 疗 ， 一 再 与 死 神 擦 肩 而 过 。", "target": "B E S S B E B M E S S S S B E S B E S B E B M M E S"}
165 | {"source": "一 直 手 抖 ， 还 说 这 一 块 （ 脖 子 ） 疼 得 很 ， 过 一 会 口 水 就 流 下 来 了 ， 没 说 话 在 那 趴 着 。 ”", "target": "B E B E S S S S S S S B E S S S S S S B E B E S S B E S S S B E S S B E S S"}
166 | {"source": "业 主 称 大 家 买 菜 方 便 多 了 。", "target": "B E S B E B E B E S S S"}
167 | {"source": "东 莞 理 工 学 院 城 市 学 院 一 学 生 爆 料 ： 该 校 一 女 生 上 课 看 了 下 手 机 ， 姜 老 师 让 她 当 着 两 个 班 8 0 多 人 的 面 把 手 机 砸 碎 ！", "target": "B E B E B E B E B E S B E B E S B E S B E B E S S S B E S S B E S S S S S S S B M E S S S S B E B E S"}
168 | {"source": "在 医 生 宣 布 死 亡 后 ， 小 勇 冲 向 爸 爸 ： “ 你 把 我 的 妈 妈 打 死 了 ， 我 要 杀 了 你 。 ”", "target": "S B E B E B E S S B E B E B E S S S S S S B E B E S S S S S S S S S"}
169 | {"source": "母 亲 表 示 ， 他 们 这 个 不 是 乱 伦 ， 而 是 “ G e n e t i c S e x u a l A t t r a c t i o n ， 遗 传 性 性 吸 引 ” （ 谁 知 道 她 这 是 去 哪 学 来 的 专 有 名 词 …", "target": "B E B E S B E S S S S B E S B E S B M M M M M M M M M M M M M M M M M M M M M E S B M E B M E S S S B E S S S S S B E S B E B E S"}
170 | {"source": "习 近 平 在 网 络 安 全 和 信 息 化 工 作 座 谈 会 上 重 要 讲 话 全 文 发 表 后 ， 关 注 互 联 网 发 展 的 热 潮 再 度 掀 起 。", "target": "B M E S B E B E S B M E B E B M E S B E B E B E B E S S B E B M E B E S B E B E B E S"}
171 | {"source": "天 气 渐 暖 ， 动 物 园 里 渐 渐 热 闹 起 来 ， 但 随 之 而 来 的 乱 丢 垃 圾 、 乱 投 喂 等 问 题 却 给 这 充 满 欢 笑 的 生 态 乐 园 增 添 了 许 多 烦 恼 。", "target": "B E S S S B M E S B E B E B E S S B M M E S S S B E S S B E S B E S S S B E B E S B E B E B E S B E B E S"}
172 | {"source": "这 款 由 M a i k L i p p e 设 计 的 超 智 能 手 表 意 识 到 此 问 题 ， 提 供 模 块 化 解 决 方 案 。", "target": "S S S B M M M M M M M E B E S S B E B E B E S S B E S B E B M E B E B E S"}
173 | {"source": "买 房 时 ， 该 项 目 还 没 建 。", "target": "B E S S S B E S S S S"}
174 | {"source": "如 今 ， 这 一 难 题 或 迎 来 破 解 机 会 ： 如 果 公 司 拥 有 “ 范 冰 冰 ” 、 “ 冯 小 刚 ” 等 I P 明 星 ， 就 可 以 纳 入 无 形 资 产 评 估 ， 向 金 融 机 构 作 为 抵 押 品 贷 款 了 。", "target": "B E S S S B E S B E B E B E S B E B E B E S B M E S S S B M E S S B E B E S S B E B E B E B E B E S S B E B E B E B M E B E S S"}
175 | {"source": "事 发 后 ， 伤 者 家 属 希 望 尽 快 处 理 恶 犬 ， 狗 主 人 放 言 ： “ 只 要 不 伤 害 我 的 狗 ， 花 多 少 钱 都 愿 意 。 ”", "target": "B E S S B E B E B E B E B E B E S B M E B E S S B E S B E S S S S S B E S S B E S S"}
176 | {"source": "于 是 ， 很 多 妹 子 来 到 外 滩 ， 大 家 排 队 依 次 合 影 小 鲜 肉 有 了 “ 新 欢 ” 不 知 你 还 记 得 大 明 湖 畔 的 红 邮 筒 吗 v i a 新 闻 晨 报", "target": "B E S B E B E B E B E S B E B E B E B E B M E S S S B E S S S S S B E B M M E S S B E S B M E B E B E"}
177 | {"source": "一 位 妈 妈 根 据 崔 医 生 的 书 整 理 出 来 的 ， 关 于 宝 宝 辅 食 添 加 的 一 些 T I P S ， 现 在 宝 宝 1 岁 3 个 月 ， 各 种 发 育 指 标 良 好 。", "target": "S S B E B E B M E S S B E B E S S B E B E B E B E S B E B M M E S B E B E S S S S S S B E B E B E B E S"}
178 | {"source": "【 将 被 子 踩 地 上 被 打 后 ， 孩 子 一 句 话 让 妈 妈 愧 疚 流 泪 ！ 】", "target": "S S B E S B E S S S S B E S S S S B E B E B E S S"}
179 | {"source": "你 知 道 水 果 蒸 着 吃 营 养 翻 倍 吗 ？", "target": "S B E B E S S S B E B E S S"}
180 | {"source": "苹 果 发 布 了 一 段 全 新 的 广 告 “ 可 再 生 能 源 ” ， 这 段 广 告 中 只 有 M e s s a g e s 信 息 应 用 。", "target": "B E B E S S S B E S B E S B M E B E S S S S B E S S S B M M M M M M E B E B E S"}
181 | {"source": "i O S 9 . 3 . 2 从 4 月 6 日 开 始 测 试 。", "target": "B M M M M M M E S B E B E B E B E S"}
182 | {"source": "我 们 可 以 从 图 片 上 看 到 ， 这 台 i P a d P r o 后 壳 换 成 了 亮 黄 色 ， 这 也 使 设 备 非 常 独 特 。", "target": "B E B E S B E S B E S B E B M M M M M E B E B E S B M E S S S S B E B E B E S"}
183 | {"source": "对 于 想 要 购 买 全 新 外 观 A p p l e W a t c h 的 用 户 来 说 ， 则 需 要 等 到 2 0 1 7 年 。", "target": "B E B E B E B E B E B M M M E B M M M E S B E B E S S B E S S B M M M E S"}
184 | {"source": "被 测 试 的 1 0 款 保 护 壳 分 别 是 ： O t t e r b o x D e f e n d e r 、 T r i a n i u m S l i m C a s e 、 S u p e r c a s e U n i c o r n B a t t l e 、 T e c h 2 1 I m p a c t o l o g y 、 S p e c k M i g h t y s h e l l 、 U A G 、 L i f e p r o o f N U U D 、 X - D o r i a S c e n e 、 G r i f f i n S u r v i v o r 和 苹 果 官 方 保 护 壳 。", "target": "S B E S B E S B M E B E S S B M M M M M M M M M M M M M M E S B M M M M M M M M M M M M M M E S B M M M M M M M M M M M M M M M M M M M M E S B M M M M M M M M M M M M M M M E S B M M M M M M M M M M M M M M E S B M E S B M M M M M M M M M M M E S B M M M M M M M M M M E S B M M M M M M M M M M M M M E S B E B E B M E S"}
185 | {"source": "这 意 味 着 高 通 将 损 失 重 要 顾 客 。", "target": "S B E S B E S B E B E B E S"}
186 | {"source": "为 了 不 让 孩 子 哭 闹 ， 一 些 人 甚 至 给 婴 儿 注 射 镇 静 剂 。", "target": "S S S S B E B E S B E S B E S B E B E B M E S"}
187 | {"source": "爱 你 的 人 ， 生 怕 给 你 的 不 够 ；", "target": "S S S S S B E S S S S S S"}
188 | {"source": "为 治 理 拥 堵 问 题 ， 当 地 政 府 规 定 ， 早 晚 高 峰 时 段 在 市 区 主 要 路 段 ， 私 家 车 必 须 有 至 少 三 名 乘 客 才 能 上 路 。", "target": "S B E B E B E S B E B E B E S B E B E B E S B E B E B E S B M E B E S B E S S B E S S B E S"}
189 | {"source": "习 近 平 代 表 中 国 共 产 党 中 央 委 员 会 并 以 他 个 人 名 义 对 金 正 恩 表 示 热 烈 祝 贺 。", "target": "B M E B E B E B M E B E B M E S S S B E B E S B M E B E B E B E S"}
190 | {"source": "【 检 察 日 报 评 长 春 1 2 0 被 骚 扰 万 余 次 ： 执 法 再 难 也 不 能 放 弃 】", "target": "S B E B E S B E B M E S B E B E S S B E S S S S S B E S"}
191 | {"source": "事 发 前 护 士 护 工 数 次 巡 视 病 房 均 未 发 现 端 倪 ， 遭 到 家 属 质 疑 。", "target": "B E S B E B E B E B E B E S S B E B E S B E B E B E S"}
192 | {"source": "劳 尔 · 卡 斯 特 罗 现 年 8 4 岁 ， 曾 跟 随 兄 长 菲 德 尔 · 卡 斯 特 罗 投 入 反 独 裁 政 府 的 斗 争 和 武 装 斗 争 。", "target": "B M M M M M E B E B E S S S B E B E B M M M M M M E B E S B E B E S B E S B E B E S"}
193 | {"source": "亚 投 行 行 长 金 立 群 日 前 指 出 ， 台 湾 若 想 加 入 由 大 陆 牵 头 创 立 的 亚 投 行 ， 需 透 过 大 陆 财 政 部 申 请 。", "target": "B M E B E B M E B E B E S B E S S B E S B E B E B E S B M E S S B E B E B M E B E S"}
194 | {"source": "不 料 却 在 日 前 的 直 升 机 空 难 中 ， 双 双 身 亡 ， 幸 福 时 间 仅 维 持 8 个 月 。", "target": "B E S S B E S B M E B E S S B E B E S B E B E S B E S S S S"}
195 | {"source": "昨 日 大 涨 后 ， 今 日 两 市 明 显 回 调 。", "target": "B E B E S S B E B E B E B E S"}
196 | {"source": "仅 2 0 1 5 年 一 年 ， 长 春 市 急 救 中 心 就 接 听 恶 意 骚 扰 电 话 1 . 5 万 次 。", "target": "S B M M M E S S S B M E B E B E S B E B E B E B E B M M E S S"}
197 | {"source": "今 天 （ 4 月 1 2 日 ） 2 4 时 ， 国 内 成 品 油 调 价 窗 口 将 再 次 开 启 。", "target": "B E S B E B M E S B M E S B E B M E B E B E S B E B E S"}
198 | {"source": "今 天 ， 小 编 教 大 家 一 些 实 用 的 小 技 巧 ~ 这 些 简 单 的 工 具 还 能 这 么 玩 ！ ！", "target": "B E S B E S B E B E B E S S B E S B E B E S B E S S B E S S S"}
199 | {"source": "今 年 1 月 3 日 , 湖 南 邵 阳 市 脑 科 医 院 精 神 分 裂 症 患 者 刘 强 在 住 院 期 间 , 被 同 病 房 另 一 患 者 用 双 手 挖 去 双 眼 。", "target": "B E B E B E S B E B M E B E B E B E B M E B E B E S B E B E S S S B E S S B E S B E B E B E S"}
200 | {"source": "今 年 两 会 期 间 ， 云 南 省 委 书 记 李 纪 恒 表 示 ： 怒 江 小 水 电 全 部 叫 停 ， 不 再 开 发 。", "target": "B E B E B E S B E B E B E B M E B E S B E B M E B E B E S B E B E S"}
201 | {"source": "但 四 年 之 后 看 到 第 一 届 的 学 生 顺 利 毕 业 ， 我 们 跟 他 们 聊 过 ， 从 谈 吐 上 ， 从 整 个 的 学 习 状 况 上 来 看 ， 我 觉 得 差 距 不 大 ， 跟 普 通 的 清 华 学 生 没 有 太 大 的 区 别 。", "target": "S S S B E B E B E S S B E B E B E S B E S B E S S S S B E S S S B E S B E B E S S S S S B E B E S S S S B E S B E B E B E S S S B E S"}
202 | {"source": "但 在 税 率 设 置 上 暂 给 予 一 定 优 惠 ， 对 具 体 商 品 而 言 则 是 “ 有 升 有 降 ” 。", "target": "S S B E B E S S B E B E B E S S B E B E B E S S S S S S S S S"}
203 | {"source": "截 止 到 4 月 2 0 日 ， 这 3 0 座 病 险 水 库 ， 绝 大 多 数 才 进 行 了 除 险 加 固 处 理 。", "target": "B M E B E B M E S S B E S B E B E S B M M E S B E S B E B E B E S"}
204 | {"source": "但 是 ， 就 在 今 年 汛 前 检 查 中 ， 清 远 市 查 出 有 3 0 座 水 库 存 在 不 同 程 度 的 安 全 隐 患 。", "target": "B E S S S B E B E B E S S B M E B E S B E S B E B E B E B E S B E B E S"}
205 | {"source": "住 宅 规 划 每 套 占 地 8 0 至 1 1 0 多 平 方 米 不 等 ， 建 筑 面 积 均 在 4 0 0 平 方 米 以 上 。", "target": "B E B E B E B E B E S B M M E B M E B E S B E B E S S B M E B M E B E S"}
206 | {"source": "全 国 生 猪 出 栏 均 价 从 上 周 的 每 公 斤 1 9 . 6 9 元 涨 至 本 周 的 1 9 . 8 6 元 ， 较 去 年 同 期 上 涨 了 5 5 % ， 猪 价 “ 无 惧 ” 储 备 肉 投 放 再 创 新 高 ， 有 可 能 突 破 2 0 元 / 公 斤 大 关 。", "target": "B E B E B E B E S B E S S B E B M M M E S S S B E S B M M M E S S S B E B E B E S B M E S B E S B E S B M E B E S S B E S S B E B E B E S S B E B E S"}
207 | {"source": "全 额 奖 学 金 加 各 种 补 贴 ；", "target": "B E B M E S B E B E S"}
208 | {"source": "二 人 发 问 为 什 么 跟 着 她 们 ， 竟 遭 到 男 子 持 续 数 分 钟 的 拳 打 脚 踢 。", "target": "S S B E B M E S S B E S S B E B E B E S B E S B M M E S"}
209 | {"source": "【 广 东 清 远 3 0 座 水 库 存 隐 患 今 年 遇 特 大 洪 水 怎 么 办 ？ 】", "target": "S B E B E B E S B E S B E B E S B E B E B M E S S"}
210 | {"source": "医 院 称 按 国 家 标 准 最 高 赔 6 0 万 元 ， 家 属 不 同 意 。", "target": "B E S S B E B E B E S B M E S S B E S B E S"}
211 | {"source": "即 使 没 有 人 为 你 鼓 掌 ， 也 要 优 雅 的 谢 幕 ， 感 谢 自 己 认 真 的 付 出 。", "target": "B E B E S S S B E S S S B E S B E S B E B E B E S B E S"}
212 | {"source": "受 此 示 范 效 应 带 动 ， 近 来 一 些 券 商 研 究 员 纷 纷 加 入 视 频 直 播 行 列 ， 包 括 方 正 证 券 、 海 通 证 券 等 知 名 分 析 师 。", "target": "S S B E B E B E S B E B E B E B M E B E B E B E B E B E S B E B E B E S B E B E S B E B M E S"}
213 | {"source": "受 伤 者 不 是 学 生 而 是 该 校 安 保 人 员 。", "target": "B M E S S B E S S B E B E B E S"}
214 | {"source": "只 有 强 化 政 府 的 服 务 意 识 ， 跟 上 改 革 节 奏 ， 才 能 杜 绝 “ 卡 纸 ” 事 故 ， 让 人 民 真 正 受 益 。", "target": "B E B E B E S B E B E S B E B E B E S B E B E S B E S B E S S B E B E B E S"}
215 | {"source": "和 专 家 老 师 一 起 实 地 考 察 访 问 …", "target": "S B E B E B E B E B E B E S"}
216 | {"source": "唱 《 社 会 主 义 好 》 ， 是 因 为 她 们 觉 得 这 首 歌 最 能 表 达 他 们 第 一 次 去 美 国 ， 站 在 联 合 国 总 部 前 的 心 声 。", "target": "S S B M M E S S S S B E B E B E S S S S S B E B E B E S S B E S B E B M E B E S S B E S"}
217 | {"source": "听 完 这 首 催 眠 曲 ， 我 感 觉 我 再 也 不 会 失 眠 了 . . .", "target": "B E S S B M E S S B E S S S S S B E S S S S"}
218 | {"source": "却 不 知 道 ， 她 在 对 待 别 人 的 时 候 ， 为 什 么 不 是 这 样 的 态 度 ？", "target": "S S B E S S S B E B E S B E S B M E S S B E S B E S"}
219 | {"source": "吹 啊 吹 我 的 骄 傲 放 纵 . . . 哈 哈 哈 哈 哈 大 写 的 懵 逼 ！ ！ ！", "target": "S S S S S B E B E B M E B M M M E B E S B E S S S"}
220 | {"source": "女 生 无 理 取 闹 的 时 候 ， 男 生 总 说 女 生 没 事 找 事 ， 说 女 生 不 讲 道 理 ， 说 女 生 不 可 理 喻 。", "target": "B E B M M E S B E S B E S S B E B M M E S S B E S S B E S S B E B M M E S"}
221 | {"source": "听 完 后 整 个 人 都 被 治 愈 了 …", "target": "B E S B E S S S B E S S"}
222 | {"source": "小 朋 友 ， 你 这 智 商 还 是 不 要 出 来 撩 妹 了 …", "target": "B M E S S S B E B E S S B E B E S S"}
223 | {"source": "谁 做 的 ， 你 粗 来 ， 简 直 毫 无 违 和 感 笑 一 笑 睡 觉 吧", "target": "S S S S S B E S B E B E B M E S S S B E S"}
224 | {"source": "这 脑 洞 太 大 我 已 无 力 承 受 。", "target": "S B E S S S S B E B E S"}
225 | {"source": "绿 箭 侠 “ 放 弃 弓 箭 变 身 面 具 义 警 ， Y o u h a v e f a i l e d t h i s c i t y", "target": "B M E S B E B E B E B E B E S B M E B M M E B M M M M E B M M E B M M E"}
226 | {"source": "续 集 由 杰 西 · 艾 森 伯 格 、 伍 迪 · 哈 里 森 、 戴 夫 · 弗 兰 科 、 丽 兹 · 卡 潘 、 “ 绿 巨 人 ” 马 克 · 鲁 法 洛 、 迈 克 尔 · 凯 恩 、 摩 根 · 弗 里 曼 联 合 周 杰 伦 、 “ 哈 利 · 波 特 ” 丹 尼 尔 · 雷 德 克 里 夫 主 演 ！", "target": "B E S B M M M M M E S B M M M M E S B M M M M E S B M M M E S S B M E S B M M M M E S B M M M M E S B M M M M E B E B M E S S B M M M E S B M M M M M M M E B E S"}
227 | {"source": "》 前 两 天 给 同 事 做 c o d e r e v i e w ， 感 觉 自 己 对 J a v a 的 G e n e r i c s 掌 握 得 不 够 好 ， 便 拿 出 《 E f f e c t i v e J a v a 》 1 这 本 书 再 看 看 相 关 的 章 节 。", "target": "S S S S S B E S B M M M M M M M M E S B E B E S B M M E S B M M M M M M E B E S S S S S S B E S B M M M M M M M M M M M E S S S S S S B E B E S B E S"}
228 | {"source": "《 工 具 推 荐 ： 三 款 自 动 化 代 码 审 计 工 具 》 工 欲 善 其 事 ， 必 先 利 其 器 。", "target": "S B E B E S S S B M E B E B E B E S B M M M E S B M M M E S"}
229 | {"source": "《 诊 断 J a v a 代 码 中 常 见 的 数 据 库 性 能 热 点 问 题 》 当 我 在 帮 助 一 些 开 发 者 或 架 构 师 分 析 及 优 化 J a v a 应 用 程 序 的 性 能 时 ， 关 键 往 往 不 在 于 对 个 别 方 法 进 行 微 调 ， 以 节 省 一 或 两 微 秒 的 执 行 时 间 。", "target": "S B E B M M E B E S B E S B M E B E B E B E S S S S B E B E B M E S B M E B E S B E B M M E B E B E S B E S S B E B E S B E S B E B E B E B E S S B E S S S B E S B E B E S"}
230 | {"source": "（ 来 自 ： L i f e D e s i g n W e b ）", "target": "S B E S B M M M M M M M M M M M E S"}
231 | {"source": "由 于 面 向 受 众 为 架 构 师 ， 因 此 不 会 涉 及 到 很 多 技 术 细 节 。", "target": "B E B E B E S B M E S B E S S B E S B E B E B E S"}
232 | {"source": "《 大 数 据 如 何 改 善 我 们 周 围 的 生 存 环 境 》 在 正 式 讲 环 境 大 数 据 之 前 ， 我 们 来 讲 一 个 和 身 边 有 关 的 案 例 。", "target": "S B M E B E B E B E B E S B E B E S S B E S B E B M E B E S B E S S S S S B E B E S B E S"}
233 | {"source": "有 一 种 人 ， 当 他 做 错 了 事 ， 他 不 会 承 认 自 己 错 ， 只 会 说 自 己 做 得 很 辛 苦 ， 这 种 人 可 能 会 一 辈 子 辛 苦 下 去 。", "target": "S S S S S S S B E S S S S S S B E B E S S S S S B E S S S B E S S S S B E S B M E B E B E S"}
234 | {"source": "“ 你 们 女 生 不 都 喜 欢 色 色 的 男 人 吗 ？ ”", "target": "S B E B E S S B E B E S B E S S S"}
235 | {"source": "你 我 之 间 本 无 缘 分 ， 全 靠 我 有 钱 买 好 吃 的 让 你 吃 撑 。", "target": "S S B E S S B E S B E S B E S B E S S S B E S"}
236 | {"source": "刘 全 有 只 要 一 见 到 我 ， 就 把 我 手 机 拿 去 看 ， 一 边 看 一 边 陶 醉 在 傻 笑 中 ， 因 为 放 眼 全 宇 宙 ， 只 有 我 的 评 论 里 才 会 有 人 夸 刘 全 有 。", "target": "B M E B E S B E S S S S S B E B E S S B E S B E B E S B E S S B E B E B M E S B E S S B E S S S S S S B M E S"}
237 | {"source": "数 据 只 能 给 出 印 象 ， 究 竟 发 生 了 什 么 需 要 进 一 步 思 考 。", "target": "B E B E B E B E S B E B E S B E B E B M E B E S"}
238 | {"source": "5 月 第 一 批 新 股 ， 哪 只 颜 值 最 高 ？", "target": "B E B E S B E S S S B E S S S"}
239 | {"source": "参 与 企 业 ： 南 方 基 因 研 究 所 ， 北 方 基 因 研 究 所 ， 华 （ 分 享 自 @ 雪 球 ）", "target": "B E B E S B E B E B M E S B E B E B M E S S S B E S S B E S"}
240 | {"source": "， 而 侯 安 扬 周 五 发 文 称 “ 上 涨 1 0 0 0 点 的 难 度 很 大 ， 搞 不 好 下 跌 1 0 0 0 点 的 概 率 更 高 。 ”", "target": "S S B M E B E B E S S B E B M M E S S B E S S S S S S B E B M M E S S B E S S S S"}
241 | {"source": "持 有 封 基 : 二 八 轮 动 升 级 版 — — 持 有 封 基 说 股 市 之 二 十 九 网 上 有 个 非 常 简 单 而 又 非 常 有 效 的 二 八 轮 动 策 略 （ ） ， 非 常 简 单 ， 就 是 三 个 指 数 ： 3 0 0 、 5 0 0 、 国 债 指 数 轮 动 ， 3 0 0 和 5 0 0 的 2 0 天 涨 幅 哪 个 大 持 有 哪 个 ， 两 个 都 为 负 数 持 有 国 债 ， 我 回 测 （ 分 享 自 @ 雪 球 ）", "target": "B E B E S B E B E B E S B E B E B E S B E S B M E B E S S B E B E S S B E B E S B E B E B E S S S B E B E S S S S S B E S B M E S B M E S B E B E B E S B M E S B M E S B E S B E S S S B E S S S S S S S B E B E B E S S B E S B E S S B E S"}
242 | {"source": "经 验 看 创 业 板 一 季 度 占 全 年 业 绩 比 为 1 5 % ， 推 算 出 1 6 年 全 年 业 绩 增 速 为 5 4 % 。", "target": "B E S B M E B M E S B E B M E S B M E S B E S B M E B E B E B E S B M E S"}
243 | 


--------------------------------------------------------------------------------
/bert_nlp/main/main.py:
--------------------------------------------------------------------------------
 1 | from net.model_net import Bert_CRF
 2 | from Io.data_loader import create_batch_iter
 3 | from train.train import fit
 4 | import config.args as args
 5 | from util.porgress_util import ProgressBar
 6 | 
 7 | 
 8 | def start():
 9 | 
10 |     train_iter, num_train_steps = create_batch_iter("train", args.TRAIN_PATH)
11 |     eval_iter = create_batch_iter("dev", args.VALID_PATH)
12 | 
13 |     epoch_size = num_train_steps * args.train_batch_size * args.gradient_accumulation_steps / args.num_train_epochs
14 |     print(f'epoch_size = {epoch_size}')
15 |     pbar = ProgressBar(epoch_size=epoch_size, batch_size=args.train_batch_size)
16 |     model = Bert_CRF.from_pretrained(args.bert_model, num_tag=len(args.labels))
17 |     for name, param in model.named_parameters():
18 |         if param.requires_grad:
19 |             print(name)
20 | 
21 |     fit(model=model,
22 |         training_iter=train_iter,
23 |         eval_iter=eval_iter,
24 |         num_epoch=args.num_train_epochs,
25 |         pbar=pbar,
26 |         num_train_steps=num_train_steps,
27 |         verbose=1)
28 | 


--------------------------------------------------------------------------------
/bert_nlp/model/pytorch_pretrained_model/bert_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1, 
 3 |   "directionality": "bidi", 
 4 |   "hidden_act": "gelu", 
 5 |   "hidden_dropout_prob": 0.1, 
 6 |   "hidden_size": 768, 
 7 |   "initializer_range": 0.02, 
 8 |   "intermediate_size": 3072, 
 9 |   "max_position_embeddings": 512, 
10 |   "num_attention_heads": 12, 
11 |   "num_hidden_layers": 12, 
12 |   "pooler_fc_size": 768, 
13 |   "pooler_num_attention_heads": 12, 
14 |   "pooler_num_fc_layers": 3, 
15 |   "pooler_size_per_head": 128, 
16 |   "pooler_type": "first_token_transform", 
17 |   "type_vocab_size": 2, 
18 |   "vocab_size": 21128
19 | }
20 | 


--------------------------------------------------------------------------------
/bert_nlp/net/crf.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | import torch.nn as nn
  3 | from torch.autograd import Variable
  4 | 
  5 | 
  6 | class CRF(nn.Module):
  7 |     """线性条件随机场"""
  8 | 
  9 |     def __init__(self, num_tag, use_cuda=False):
 10 |         if num_tag <= 0:
 11 |             raise ValueError("Invalid value of num_tag: %d" % num_tag)
 12 |         super(CRF, self).__init__()
 13 |         self.num_tag = num_tag
 14 |         self.start_tag = num_tag
 15 |         self.end_tag = num_tag + 1
 16 |         self.use_cuda = use_cuda
 17 |         # 转移矩阵transitions：P_jk 表示从tag_j到tag_k的概率
 18 |         # P_j* 表示所有从tag_j出发的边
 19 |         # P_*k 表示所有到tag_k的边
 20 |         self.transitions = nn.Parameter(torch.Tensor(num_tag + 2, num_tag + 2))
 21 |         nn.init.uniform_(self.transitions, -0.1, 0.1)
 22 |         self.transitions.data[self.end_tag, :] = -10000    # 表示从EOS->其他标签为不可能事件, 如果发生，则产生一个极大的损失
 23 |         self.transitions.data[:, self.start_tag] = -10000    # 表示从其他标签->SOS为不可能事件, 同上
 24 | 
 25 |     def real_path_score(self, features, tags):
 26 |         """
 27 |         features: (time_steps, num_tag)
 28 |         real_path_score表示真实路径分数
 29 |         它由Emission score和Transition score两部分相加组成
 30 |         Emission score由LSTM输出结合真实的tag决定，表示我们希望由输出得到真实的标签
 31 |         Transition score则是crf层需要进行训练的参数，它是随机初始化的，表示标签序列前后间的约束关系（转移概率）
 32 |         Transition矩阵存储的是标签序列相互间的约束关系
 33 |         在训练的过程中，希望real_path_score最高，因为这是所有路径中最可能的路径
 34 |         """
 35 |         r = torch.LongTensor(range(features.size(0)))
 36 |         if self.use_cuda:
 37 |             pad_start_tags = torch.cat([torch.cuda.LongTensor([self.start_tag]), tags])
 38 |             pad_stop_tags = torch.cat([tags, torch.cuda.LongTensor([self.end_tag])])
 39 |             r = r.cuda()
 40 |         else:
 41 |             pad_start_tags = torch.cat([torch.LongTensor([self.start_tag]), tags])
 42 |             pad_stop_tags = torch.cat([tags, torch.LongTensor([self.end_tag])])
 43 |         # Transition score + Emission score
 44 |         score = torch.sum(self.transitions[pad_start_tags, pad_stop_tags]).cpu() + torch.sum(
 45 |             features[r, tags])
 46 |         return score
 47 | 
 48 |     def all_possible_path_score(self, features):
 49 |         """
 50 |         计算所有可能的路径分数的log和：前向算法
 51 |         step1: 将forward列expand成3*3
 52 |         step2: 将下个单词的emission行expand成3*3
 53 |         step3: 将1和2和对应位置的转移矩阵相加
 54 |         step4: 更新forward，合并行
 55 |         step5: 取forward指数的对数计算total
 56 |         """
 57 |         time_steps = features.size(0)
 58 |         # 初始化
 59 |         forward = Variable(torch.zeros(self.num_tag))    # 初始化START_TAG的发射分数为0
 60 |         if self.use_cuda:
 61 |             forward = forward.cuda()
 62 |         for i in range(0, time_steps):    # START_TAG -> 1st word -> 2nd word ->...->END_TAG
 63 |             emission_start = forward.expand(self.num_tag, self.num_tag).t()
 64 |             emission_end = features[i, :].expand(self.num_tag, self.num_tag)
 65 |             if i == 0:
 66 |                 trans_score = self.transitions[self.start_tag, :self.start_tag].cpu()
 67 |             else:
 68 |                 trans_score = self.transitions[:self.start_tag, :self.start_tag].cpu()
 69 |             sum = emission_start + emission_end + trans_score
 70 |             forward = log_sum(sum, dim=0)
 71 |         forward = forward + self.transitions[:self.start_tag, self.end_tag].cpu()    # END_TAG
 72 |         total_score = log_sum(forward, dim=0)
 73 |         return total_score
 74 | 
 75 |     def negative_log_loss(self, inputs, output_mask, tags):
 76 |         """
 77 |         inputs:(batch_size, time_step, num_tag)
 78 |         target_function = P_real_path_score/P_all_possible_path_score
 79 |                         = exp(S_real_path_score)/ sum(exp(certain_path_score))
 80 |         我们希望P_real_path_score的概率越高越好，即target_function的值越大越好
 81 |         因此，loss_function取其相反数，越小越好
 82 |         loss_function = -log(target_function)
 83 |                       = -S_real_path_score + log(exp(S_1 + exp(S_2) + exp(S_3) + ...))
 84 |                       = -S_real_path_score + log(all_possible_path_score)
 85 |         """
 86 |         if not self.use_cuda:
 87 |             inputs = inputs.cpu()
 88 |             output_mask = output_mask.cpu()
 89 |             tags = tags.cpu()
 90 | 
 91 |         loss = Variable(torch.tensor(0.), requires_grad=True)
 92 |         num_chars = torch.sum(output_mask.detach()).float()
 93 |         # lsp:每一行算loss
 94 |         for ix, (features, tag) in enumerate(zip(inputs, tags)):
 95 |             # print(f'features shape = {features.shape}')  # 200 x 9
 96 |             # print(f'tag shape = {tag.shape}')    # 200
 97 |             # 过滤[CLS] [SEP] sub_word
 98 |             # features (time_steps, num_tag)
 99 |             # output_mask (batch_size, time_step)
100 |             num_valid = torch.sum(output_mask[ix].detach())
101 |             features = features[output_mask[ix] == 1]    # 将cls 和 sep过滤掉
102 |             tag = tag[:num_valid]
103 |             real_score = self.real_path_score(features, tag)
104 |             total_score = self.all_possible_path_score(features)
105 |             cost = total_score - real_score
106 |             loss = loss + cost
107 |         return loss / num_chars    # lsp: 每个字的loss，即每个label的平均loss
108 | 
109 |     def viterbi(self, features):
110 |         time_steps = features.size(0)
111 |         forward = Variable(torch.zeros(self.num_tag))    # START_TAG
112 |         if self.use_cuda:
113 |             forward = forward.cuda()
114 |         # back_points 到该点的最大分数  last_points 前一个点的索引
115 |         back_points, index_points = [self.transitions[self.start_tag, :self.start_tag].cpu()
116 |                                     ], [torch.LongTensor([-1]).expand_as(forward)]
117 |         for i in range(1, time_steps):    # START_TAG -> 1st word -> 2nd word ->...->END_TAG
118 |             emission_start = forward.expand(self.num_tag, self.num_tag).t()
119 |             emission_end = features[i, :].expand(self.num_tag, self.num_tag)
120 |             trans_score = self.transitions[:self.start_tag, :self.start_tag].cpu()
121 |             sum = emission_start + emission_end + trans_score
122 |             forward, index = torch.max(sum.detach(), dim=0)
123 |             back_points.append(forward)
124 |             index_points.append(index)
125 |         back_points.append(forward +
126 |                            self.transitions[:self.start_tag, self.end_tag].cpu())    # END_TAG
127 |         return back_points, index_points
128 | 
129 |     def get_best_path(self, features):
130 |         back_points, index_points = self.viterbi(features)
131 |         # 找到线头
132 |         best_last_point = argmax(back_points[-1])
133 |         index_points = torch.stack(index_points)    # 堆成矩阵
134 |         m = index_points.size(0)
135 |         # 初始化矩阵
136 |         best_path = [best_last_point]
137 |         # 循着线头找到其对应的最佳路径
138 |         for i in range(m - 1, 0, -1):
139 |             best_index_point = index_points[i][best_last_point]
140 |             best_path.append(best_index_point)
141 |             best_last_point = best_index_point
142 |         best_path.reverse()
143 |         return best_path
144 | 
145 |     def get_batch_best_path(self, inputs, output_mask):
146 |         if not self.use_cuda:
147 |             inputs = inputs.cpu()
148 |             output_mask = output_mask.cpu()
149 |         batch_best_path = []
150 |         max_len = inputs.size(1)
151 |         for ix, features in enumerate(inputs):
152 |             features = features[output_mask[ix] == 1]
153 |             best_path = self.get_best_path(features)
154 |             best_path = torch.Tensor(best_path).long()
155 |             best_path = padding(best_path, max_len)
156 |             batch_best_path.append(best_path)
157 |         batch_best_path = torch.stack(batch_best_path, dim=0)
158 |         return batch_best_path
159 | 
160 | 
def log_sum(matrix, dim):
    """Return ``log(sum(exp(matrix)))`` along ``dim`` in a numerically stable way.

    Summing exponentials directly overflows to ``inf`` once the scores grow,
    so the maximum is factored out first:

        log(exp(s1) + ... + exp(sn))
            = m + log(exp(s1 - m) + ... + exp(sn - m)),   m = max(s)

    The shift is taken per slice along ``dim`` (this is what
    ``torch.logsumexp`` does). Shifting every slice by one global maximum
    would let slices far below that maximum underflow to ``log(0) = -inf``.

    Args:
        matrix: Tensor of log-space scores.
        dim: Dimension to reduce.

    Returns:
        Tensor with ``dim`` reduced.
    """
    return torch.logsumexp(matrix, dim=dim)
176 | 
177 | 
def argmax(matrix, dim=0):
    """Return the indices of the maximum along ``dim``.

    Example: for ``(0.5, 0.4, 0.3)`` and ``dim=0`` the result is index 0.
    """
    # torch.max with a dim returns (values, indices); only indices are needed.
    return torch.max(matrix, dim=dim)[1]
182 | 
183 | 
def padding(vec, max_len, pad_token=-1):
    """Copy ``vec`` into a new long tensor of length ``max_len``.

    Positions after the end of ``vec`` are filled with ``pad_token``.
    """
    used = vec.size(0)
    padded = torch.zeros(max_len, dtype=torch.long)
    padded[used:] = pad_token
    padded[:used] = vec
    return padded
189 | 


--------------------------------------------------------------------------------
/bert_nlp/net/model_net.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from net.crf import CRF
 3 | import numpy as np
 4 | from sklearn.metrics import f1_score, classification_report
 5 | from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
 6 | 
 7 | 
 8 | class Bert_CRF(BertPreTrainedModel):
 9 | 
10 |     def __init__(self, config, num_tag):
11 |         super(Bert_CRF, self).__init__(config)
12 |         self.bert = BertModel(config)
13 |         # for p in self.bert.parameters():
14 |         #     p.requires_grad = False
15 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
16 |         self.classifier = nn.Linear(config.hidden_size, num_tag)
17 |         self.apply(self.init_bert_weights)
18 | 
19 |         self.crf = CRF(num_tag)
20 | 
21 |     def forward(self,
22 |                 input_ids,
23 |                 token_type_ids,
24 |                 attention_mask,
25 |                 label_id=None,
26 |                 output_all_encoded_layers=False):
27 |         bert_encode, _ = self.bert(input_ids,
28 |                                    token_type_ids,
29 |                                    attention_mask,
30 |                                    output_all_encoded_layers=output_all_encoded_layers)
31 |         output = self.classifier(bert_encode)
32 |         return output
33 | 
34 |     def loss_fn(self, bert_encode, output_mask, tags):
35 |         loss = self.crf.negative_log_loss(bert_encode, output_mask, tags)
36 |         return loss
37 | 
38 |     def predict(self, bert_encode, output_mask):
39 |         predicts = self.crf.get_batch_best_path(bert_encode, output_mask)
40 |         predicts = predicts.view(1, -1).squeeze()
41 |         predicts = predicts[predicts != -1]
42 |         return predicts
43 | 
44 |     def acc_f1(self, y_pred, y_true):
45 |         y_pred = y_pred.numpy()
46 |         y_true = y_true.numpy()
47 |         f1 = f1_score(y_true, y_pred, average="macro")
48 |         correct = np.sum((y_true == y_pred).astype(int))
49 |         acc = correct / y_pred.shape[0]
50 |         return acc, f1
51 | 
52 |     def class_report(self, y_pred, y_true):
53 |         y_true = y_true.numpy()
54 |         y_pred = y_pred.numpy()
55 |         classify_report = classification_report(y_true, y_pred)
56 |         print('\n\nclassify_report:\n', classify_report)
57 | 


--------------------------------------------------------------------------------
/bert_nlp/output/images/loss_acc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lisennlp/bert_crf_sequence_annotation/93732575dedd77122d61ff9b63e8f40d7e8d84f4/bert_nlp/output/images/loss_acc.png


--------------------------------------------------------------------------------
/bert_nlp/preprocessing/data_processor.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from util.Logginger import init_logger
  3 | import config.args as args
  4 | 
# Module-level logger for this file. It is created under the name "model_net"
# and is given args.log_path as its logging path.
logger = init_logger("model_net", logging_path=args.log_path)
  6 | 
  7 | 
  8 | class InputExample(object):
  9 | 
 10 |     def __init__(self, guid, text_a, text_b=None, label=None):
 11 |         """创建一个输入实例
 12 |         Args:
 13 |             guid: 每个example拥有唯一的id
 14 |             text_a: 第一个句子的原始文本，一般对于文本分类来说，只需要text_a
 15 |             text_b: 第二个句子的原始文本，在句子对的任务中才有，分类问题中为None
 16 |             label: example对应的标签，对于训练集和验证集应非None，测试集为None
 17 |         """
 18 |         self.guid = guid
 19 |         self.text_a = text_a
 20 |         self.text_b = text_b
 21 |         self.label = label
 22 | 
 23 | 
 24 | class InputFeature(object):
 25 | 
 26 |     def __init__(self, input_ids, input_mask, segment_ids, label_id, output_mask):
 27 |         self.input_ids = input_ids
 28 |         self.input_mask = input_mask
 29 |         self.segment_ids = segment_ids
 30 |         self.label_id = label_id
 31 |         self.output_mask = output_mask
 32 | 
 33 | 
 34 | class DataProcessor(object):
 35 |     """数据预处理的基类，自定义的MyPro继承该类"""
 36 | 
 37 |     def get_train_examples(self, data_dir):
 38 |         """读取训练集 Gets a collection of `InputExample`s for the train set."""
 39 |         raise NotImplementedError()
 40 | 
 41 |     def get_dev_examples(self, data_dir):
 42 |         """读取验证集 Gets a collection of `InputExample`s for the dev set."""
 43 |         raise NotImplementedError()
 44 | 
 45 |     def get_labels(self):
 46 |         """读取标签 Gets the list of labels for this data set."""
 47 |         raise NotImplementedError()
 48 | 
 49 |     @classmethod
 50 |     def _read_json(cls, input_file):
 51 |         with open(input_file, "r", encoding='utf-8') as fr:
 52 |             lines = []
 53 |             for line in fr:
 54 |                 _line = line.strip('\n')
 55 |                 lines.append(_line)
 56 |             return lines
 57 | 
 58 | 
 59 | class MyPro(DataProcessor):
 60 |     """将数据构造成example格式"""
 61 | 
 62 |     def _create_example(self, lines, set_type):
 63 |         examples = []
 64 |         for i, line in enumerate(lines):
 65 |             guid = "%s-%d" % (set_type, i)
 66 |             line = json.loads(line)
 67 |             text_a = line["source"]
 68 |             label = line["target"]
 69 |             try:
 70 |                 assert len(label.split()) == len(text_a.split())
 71 |             except:
 72 |                 logger.info(f'  Error data  \n')
 73 |                 print(f'{label.split()}, {len(label.split())}')
 74 |                 print(f'{text_a.split()}, {len(text_a.split())}')
 75 |                 continue
 76 |             example = InputExample(guid=guid, text_a=text_a, label=label)
 77 |             examples.append(example)
 78 |         return examples
 79 | 
 80 |     def get_train_examples(self, path):
 81 |         lines = self._read_json(path)
 82 |         examples = self._create_example(lines, "train")
 83 |         return examples
 84 | 
 85 |     def get_dev_examples(self, path):
 86 |         lines = self._read_json(path)
 87 |         examples = self._create_example(lines, "dev")
 88 |         return examples
 89 | 
 90 |     def get_labels(self):
 91 |         return args.labels
 92 | 
 93 | 
 94 | def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
 95 |     # 标签转换为数字
 96 |     label_map = {label: i for i, label in enumerate(label_list)}
 97 |     features = []
 98 |     for ex_index, example in enumerate(examples):
 99 |         tokens_a = tokenizer.tokenize(example.text_a)
100 |         labels = example.label.split()
101 | 
102 |         if len(tokens_a) == 0 or len(labels) == 0:
103 |             continue
104 | 
105 |         if len(tokens_a) > max_seq_length - 2:
106 |             tokens_a = tokens_a[:(max_seq_length - 2)]
107 |             labels = labels[:(max_seq_length - 2)]
108 |         # ----------------处理source--------------
109 |         ## 句子首尾加入标示符
110 |         tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
111 |         segment_ids = [0] * len(tokens)
112 |         ## 词转换成数字
113 |         input_ids = tokenizer.convert_tokens_to_ids(tokens)
114 | 
115 |         input_mask = [1] * len(input_ids)
116 | 
117 |         padding = [0] * (max_seq_length - len(input_ids))
118 | 
119 |         input_ids += padding
120 |         input_mask += padding
121 |         segment_ids += padding
122 | 
123 |         assert len(input_ids) == max_seq_length
124 |         assert len(input_mask) == max_seq_length
125 |         assert len(segment_ids) == max_seq_length
126 | 
127 |         # ---------------处理target----------------
128 |         ## Notes: label_id中不包括[CLS]和[SEP]
129 |         label_id = [label_map.get(l, len(label_map) - 1) for l in labels]
130 |         label_padding = [-1] * (max_seq_length - len(label_id))
131 |         label_id += label_padding
132 | 
133 |         # 不考虑cls和sep
134 |         output_mask = [0] + len(tokens_a) * [1] + [0]
135 |         output_mask += padding
136 | 
137 |         if ex_index < 1:
138 |             logger.info("-----------------Example-----------------")
139 |             logger.info("guid: %s" % (example.guid))
140 |             logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
141 |             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
142 |             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
143 |             logger.info("label: %s " % " ".join([str(x) for x in label_id]))
144 |             logger.info("output_mask: %s " % " ".join([str(x) for x in output_mask]))
145 |         # ----------------------------------------------------
146 | 
147 |         feature = InputFeature(input_ids=input_ids,
148 |                                input_mask=input_mask,
149 |                                segment_ids=segment_ids,
150 |                                label_id=label_id,
151 |                                output_mask=output_mask)
152 |         features.append(feature)
153 | 
154 |     return features
155 | 


--------------------------------------------------------------------------------
/bert_nlp/pytorch_pretrained_bert/file_utils.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Utilities for working with the local dataset cache.
  3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
  4 | Copyright by the AllenNLP authors.
  5 | """
  6 | from __future__ import (absolute_import, division, print_function, unicode_literals)
  7 | 
  8 | import sys
  9 | import json
 10 | import logging
 11 | import os
 12 | import shutil
 13 | import tempfile
 14 | import fnmatch
 15 | from functools import wraps
 16 | from hashlib import sha256
 17 | import sys
 18 | from io import open
 19 | 
 20 | import boto3
 21 | import requests
 22 | from botocore.exceptions import ClientError
 23 | from tqdm import tqdm
 24 | 
 25 | try:
 26 |     from urllib.parse import urlparse
 27 | except ImportError:
 28 |     from urlparse import urlparse
 29 | 
# Default cache directory: the PYTORCH_PRETRAINED_BERT_CACHE environment
# variable if set, otherwise ".pytorch_pretrained_bert" in the user's home.
# It is a pathlib.Path when pathlib is usable and a plain string otherwise.
try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                                   Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                              os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

# File names exported for use by other modules of the package.
CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 42 | 
 43 | 
 44 | def url_to_filename(url, etag=None):
 45 |     """
 46 |     Convert `url` into a hashed filename in a repeatable way.
 47 |     If `etag` is specified, append its hash to the url's, delimited
 48 |     by a period.
 49 |     """
 50 |     url_bytes = url.encode('utf-8')
 51 |     url_hash = sha256(url_bytes)
 52 |     filename = url_hash.hexdigest()
 53 | 
 54 |     if etag:
 55 |         etag_bytes = etag.encode('utf-8')
 56 |         etag_hash = sha256(etag_bytes)
 57 |         filename += '.' + etag_hash.hexdigest()
 58 | 
 59 |     return filename
 60 | 
 61 | 
 62 | def filename_to_url(filename, cache_dir=None):
 63 |     """
 64 |     Return the url and etag (which may be ``None``) stored for `filename`.
 65 |     Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
 66 |     """
 67 |     if cache_dir is None:
 68 |         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
 69 |     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
 70 |         cache_dir = str(cache_dir)
 71 | 
 72 |     cache_path = os.path.join(cache_dir, filename)
 73 |     if not os.path.exists(cache_path):
 74 |         raise EnvironmentError("file {} not found".format(cache_path))
 75 | 
 76 |     meta_path = cache_path + '.json'
 77 |     if not os.path.exists(meta_path):
 78 |         raise EnvironmentError("file {} not found".format(meta_path))
 79 | 
 80 |     with open(meta_path, encoding="utf-8") as meta_file:
 81 |         metadata = json.load(meta_file)
 82 |     url = metadata['url']
 83 |     etag = metadata['etag']
 84 | 
 85 |     return url, etag
 86 | 
 87 | 
 88 | def cached_path(url_or_filename, cache_dir=None):
 89 |     """
 90 |     Given something that might be a URL (or might be a local path),
 91 |     determine which. If it's a URL, download the file and cache it, and
 92 |     return the path to the cached file. If it's already a local path,
 93 |     make sure the file exists and then return the path.
 94 |     """
 95 |     if cache_dir is None:
 96 |         cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
 97 |     if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
 98 |         url_or_filename = str(url_or_filename)
 99 |     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
100 |         cache_dir = str(cache_dir)
101 | 
102 |     parsed = urlparse(url_or_filename)
103 | 
104 |     if parsed.scheme in ('http', 'https', 's3'):
105 |         # URL, so get it from the cache (downloading if necessary)
106 |         return get_from_cache(url_or_filename, cache_dir)
107 |     elif os.path.exists(url_or_filename):
108 |         # File, and it exists.
109 |         return url_or_filename
110 |     elif parsed.scheme == '':
111 |         # File, but it doesn't exist.
112 |         raise EnvironmentError("file {} not found".format(url_or_filename))
113 |     else:
114 |         # Something unknown
115 |         raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
116 | 
117 | 
def split_s3_path(url):
    """Split a full s3 url into ``(bucket name, object path)``.

    Raise ``ValueError`` when the url has no bucket or no path part.
    """
    pieces = urlparse(url)
    bucket_name, s3_path = pieces.netloc, pieces.path
    if not (bucket_name and s3_path):
        raise ValueError("bad s3 path {}".format(url))
    # Drop a single leading '/' from the path.
    return bucket_name, (s3_path[1:] if s3_path.startswith("/") else s3_path)
129 | 
130 | 
def s3_request(func):
    """
    Decorator for s3 request functions that produces more helpful error
    messages.

    A ``ClientError`` whose error code says the object does not exist is
    re-raised as ``EnvironmentError("file <url> not found")``; every other
    ``ClientError`` propagates unchanged.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            # Error codes are strings and are often symbolic ("NoSuchKey",
            # "AccessDenied"), so compare as text; converting them with int()
            # would raise ValueError and hide the real error.
            code = exc.response.get("Error", {}).get("Code")
            if str(code) in ("404", "NoSuchKey"):
                raise EnvironmentError("file {} not found".format(url))
            raise

    return wrapper
148 | 
149 | 
@s3_request
def s3_etag(url):
    """Return the ETag of the S3 object that `url` points to."""
    resource = boto3.resource("s3")
    bucket, key = split_s3_path(url)
    return resource.Object(bucket, key).e_tag
157 | 
158 | 
@s3_request
def s3_get(url, temp_file):
    """Download the S3 object at `url` into the open file object `temp_file`."""
    resource = boto3.resource("s3")
    bucket, key = split_s3_path(url)
    target_bucket = resource.Bucket(bucket)
    target_bucket.download_fileobj(key, temp_file)
165 | 
166 | 
def http_get(url, temp_file):
    """Stream the content at `url` into the open file object `temp_file`.

    A tqdm progress bar is shown; its total comes from the Content-Length
    header when the server sends one.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
            Without this check the body of an error page would be written to
            `temp_file` as if it were the requested file.
    """
    req = requests.get(url, stream=True)
    try:
        req.raise_for_status()
        content_length = req.headers.get('Content-Length')
        total = int(content_length) if content_length is not None else None
        progress = tqdm(unit="B", total=total)
        try:
            for chunk in req.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    progress.update(len(chunk))
                    temp_file.write(chunk)
        finally:
            # Close the bar even when the download is interrupted.
            progress.close()
    finally:
        # Release the streamed connection in every case.
        req.close()
177 | 
178 | 
def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.

    The cache file name is derived from the url and, when available, the
    ETag of the remote object (see `url_to_filename`). Next to every cached
    file a "<name>.json" file records the url and etag.

    Args:
        url: An "s3://" url or a url that `requests` can fetch.
        cache_dir: Cache directory; defaults to PYTORCH_PRETRAINED_BERT_CACHE.
            It is created if it does not exist.

    Returns:
        The path of the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        try:
            response = requests.head(url, allow_redirects=True)
            # Any status other than 200 is treated as "no etag available".
            if response.status_code != 200:
                etag = None
            else:
                etag = response.headers.get("ETag")
        except EnvironmentError:
            etag = None

    if sys.version_info[0] == 2 and etag is not None:
        etag = etag.decode('utf-8')
    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # If we don't have a connection (etag is None) and can't identify the file
    # try to get the last downloaded one
    if not os.path.exists(cache_path) and etag is None:
        # Candidates are "<url hash>.<anything>" entries, minus the metadata files.
        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*')
        matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files))
        if matching_files:
            # Takes the last entry in os.listdir order; the list is not sorted here.
            cache_path = os.path.join(cache_dir, matching_files[-1])

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            # Record where the cached file came from.
            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w') as meta_file:
                output_string = json.dumps(meta)
                if sys.version_info[0] == 2 and isinstance(output_string, str):
                    output_string = unicode(output_string, 'utf-8')  # The beauty of python 2
                meta_file.write(output_string)

            # The temporary file is deleted when the `with` block exits.
            logger.info("removing temp file %s", temp_file.name)

    return cache_path
253 | 
254 | 
def read_set_from_file(filename):
    '''
    Read a UTF-8 text file with one item per line and return the items as a set.
    Trailing whitespace is removed from every line; duplicates collapse.
    '''
    with open(filename, 'r', encoding='utf-8') as file_:
        return {entry.rstrip() for entry in file_}
265 | 
266 | 
def get_file_extension(path, dot=True, lower=True):
    """Return the extension of `path`.

    Args:
        path: File path or name.
        dot: Keep the leading "." when True, drop it when False.
        lower: Lower-case the extension when True.
    """
    _, ext = os.path.splitext(path)
    if not dot:
        ext = ext[1:]
    if lower:
        ext = ext.lower()
    return ext
271 | 


--------------------------------------------------------------------------------
/bert_nlp/pytorch_pretrained_bert/modeling.py:
--------------------------------------------------------------------------------
   1 | # coding=utf-8
   2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
   3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
   4 | #
   5 | # Licensed under the Apache License, Version 2.0 (the "License");
   6 | # you may not use this file except in compliance with the License.
   7 | # You may obtain a copy of the License at
   8 | #
   9 | #     http://www.apache.org/licenses/LICENSE-2.0
  10 | #
  11 | # Unless required by applicable law or agreed to in writing, software
  12 | # distributed under the License is distributed on an "AS IS" BASIS,
  13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14 | # See the License for the specific language governing permissions and
  15 | # limitations under the License.
  16 | """PyTorch BERT model."""
  17 | 
  18 | from __future__ import absolute_import, division, print_function, unicode_literals
  19 | 
  20 | import copy
  21 | import json
  22 | import logging
  23 | import math
  24 | import os
  25 | import shutil
  26 | import tarfile
  27 | import tempfile
  28 | import sys
  29 | from io import open
  30 | 
  31 | import torch
  32 | from torch import nn
  33 | from torch.nn import CrossEntropyLoss
  34 | 
  35 | from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
  36 | 
  37 | logger = logging.getLogger(__name__)
  38 | 
# Maps the short name of each published pretrained model to the url of its
# archive.
PRETRAINED_MODEL_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",
}
# File-name constants; presumably the names looked up inside a model
# directory/archive -- their use is outside this section.
BERT_CONFIG_NAME = 'bert_config.json'
TF_WEIGHTS_NAME = 'model.ckpt'
  50 | 
def load_tf_weights_in_bert(model, tf_checkpoint_path):
    """ Load tf checkpoints in a pytorch model

    Every variable of the TensorFlow checkpoint is read, its "/"-separated
    name is walked attribute by attribute starting from `model`, and the
    array is copied into the tensor that the walk ends on.

    Args:
        model: PyTorch module whose attributes mirror the checkpoint's
            variable scopes.
        tf_checkpoint_path: Path of the TensorFlow checkpoint.

    Returns:
        `model`, with the matched weights replaced.

    Raises:
        ImportError: if `re`, `numpy` or `tensorflow` cannot be imported.
        AssertionError: if a checkpoint array and its target tensor differ in
            shape (both shapes are appended to the error's args).
    """
    try:
        import re
        import numpy as np
        import tensorflow as tf
    except ImportError:
        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
            "https://www.tensorflow.org/install/ for installation instructions.")
        raise
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    init_vars = tf.train.list_variables(tf_path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading TF weight {} with shape {}".format(name, shape))
        array = tf.train.load_variable(tf_path, name)
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name.split('/')
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        # Walk from the model root down to the parameter named by this variable.
        pointer = model
        for m_name in name:
            # Components such as "layer_3" are split into a name and an index.
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            # Translate TensorFlow variable names to the PyTorch attribute names.
            if l[0] == 'kernel' or l[0] == 'gamma':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'output_bias' or l[0] == 'beta':
                pointer = getattr(pointer, 'bias')
            elif l[0] == 'output_weights':
                pointer = getattr(pointer, 'weight')
            elif l[0] == 'squad':
                pointer = getattr(pointer, 'classifier')
            else:
                try:
                    pointer = getattr(pointer, l[0])
                except AttributeError:
                    # NOTE: this `continue` belongs to the inner loop. Only this
                    # name component is skipped (`pointer` stays where it was);
                    # the remaining components are still resolved.
                    print("Skipping {}".format("/".join(name)))
                    continue
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        # `m_name` is the last component of the variable name here.
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            # Kernels are transposed before being copied into the `weight` tensor.
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        print("Initialize PyTorch weight {}".format(name))
        pointer.data = torch.from_numpy(array)
    return model
 116 | 
 117 | 
 118 | def gelu(x):
 119 |     """Implementation of the gelu activation function.
 120 |         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
 121 |         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 122 |         Also see https://arxiv.org/abs/1606.08415
 123 |     """
 124 |     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 125 | 
 126 | 
 127 | def swish(x):
 128 |     return x * torch.sigmoid(x)
 129 | 
 130 | 
# Lookup table from an activation name to the function that implements it.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 132 | 
 133 | 
 134 | class BertConfig(object):
 135 |     """Configuration class to store the configuration of a `BertModel`.
 136 |     """
 137 |     def __init__(self,
 138 |                  vocab_size_or_config_json_file,
 139 |                  hidden_size=768,
 140 |                  num_hidden_layers=12,
 141 |                  num_attention_heads=12,
 142 |                  intermediate_size=3072,
 143 |                  hidden_act="gelu",
 144 |                  hidden_dropout_prob=0.1,
 145 |                  attention_probs_dropout_prob=0.1,
 146 |                  max_position_embeddings=512,
 147 |                  type_vocab_size=2,
 148 |                  initializer_range=0.02):
 149 |         """Constructs BertConfig.
 150 | 
 151 |         Args:
 152 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
 153 |             hidden_size: Size of the encoder layers and the pooler layer.
 154 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 155 |             num_attention_heads: Number of attention heads for each attention layer in
 156 |                 the Transformer encoder.
 157 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 158 |                 layer in the Transformer encoder.
 159 |             hidden_act: The non-linear activation function (function or string) in the
 160 |                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 161 |             hidden_dropout_prob: The dropout probabilitiy for all fully connected
 162 |                 layers in the embeddings, encoder, and pooler.
 163 |             attention_probs_dropout_prob: The dropout ratio for the attention
 164 |                 probabilities.
 165 |             max_position_embeddings: The maximum sequence length that this model might
 166 |                 ever be used with. Typically set this to something large just in case
 167 |                 (e.g., 512 or 1024 or 2048).
 168 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 169 |                 `BertModel`.
 170 |             initializer_range: The sttdev of the truncated_normal_initializer for
 171 |                 initializing all weight matrices.
 172 |         """
 173 |         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
 174 |                         and isinstance(vocab_size_or_config_json_file, unicode)):
 175 |             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
 176 |                 json_config = json.loads(reader.read())
 177 |             for key, value in json_config.items():
 178 |                 self.__dict__[key] = value
 179 |         elif isinstance(vocab_size_or_config_json_file, int):
 180 |             self.vocab_size = vocab_size_or_config_json_file
 181 |             self.hidden_size = hidden_size
 182 |             self.num_hidden_layers = num_hidden_layers
 183 |             self.num_attention_heads = num_attention_heads
 184 |             self.hidden_act = hidden_act
 185 |             self.intermediate_size = intermediate_size
 186 |             self.hidden_dropout_prob = hidden_dropout_prob
 187 |             self.attention_probs_dropout_prob = attention_probs_dropout_prob
 188 |             self.max_position_embeddings = max_position_embeddings
 189 |             self.type_vocab_size = type_vocab_size
 190 |             self.initializer_range = initializer_range
 191 |         else:
 192 |             raise ValueError("First argument must be either a vocabulary size (int)"
 193 |                              "or the path to a pretrained model config file (str)")
 194 | 
 195 |     @classmethod
 196 |     def from_dict(cls, json_object):
 197 |         """Constructs a `BertConfig` from a Python dictionary of parameters."""
 198 |         config = BertConfig(vocab_size_or_config_json_file=-1)
 199 |         for key, value in json_object.items():
 200 |             config.__dict__[key] = value
 201 |         return config
 202 | 
 203 |     @classmethod
 204 |     def from_json_file(cls, json_file):
 205 |         """Constructs a `BertConfig` from a json file of parameters."""
 206 |         with open(json_file, "r", encoding='utf-8') as reader:
 207 |             text = reader.read()
 208 |         return cls.from_dict(json.loads(text))
 209 | 
 210 |     def __repr__(self):
 211 |         return str(self.to_json_string())
 212 | 
 213 |     def to_dict(self):
 214 |         """Serializes this instance to a Python dictionary."""
 215 |         output = copy.deepcopy(self.__dict__)
 216 |         return output
 217 | 
 218 |     def to_json_string(self):
 219 |         """Serializes this instance to a JSON string."""
 220 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
 221 | 
 222 |     def to_json_file(self, json_file_path):
 223 |         """ Save this instance to a json file."""
 224 |         with open(json_file_path, "w", encoding='utf-8') as writer:
 225 |             writer.write(self.to_json_string())
 226 | 
 227 | try:
 228 |     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
 229 | except ImportError:
 230 |     logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
 231 |     class BertLayerNorm(nn.Module):
 232 |         def __init__(self, hidden_size, eps=1e-12):
 233 |             """Construct a layernorm module in the TF style (epsilon inside the square root).
 234 |             """
 235 |             super(BertLayerNorm, self).__init__()
 236 |             self.weight = nn.Parameter(torch.ones(hidden_size))
 237 |             self.bias = nn.Parameter(torch.zeros(hidden_size))
 238 |             self.variance_epsilon = eps
 239 | 
 240 |         def forward(self, x):
 241 |             u = x.mean(-1, keepdim=True)
 242 |             s = (x - u).pow(2).mean(-1, keepdim=True)
 243 |             x = (x - u) / torch.sqrt(s + self.variance_epsilon)
 244 |             return self.weight * x + self.bias
 245 | 
 246 | class BertEmbeddings(nn.Module):
 247 |     """Construct the embeddings from word, position and token_type embeddings.
 248 |     """
 249 |     def __init__(self, config):
 250 |         super(BertEmbeddings, self).__init__()
 251 |         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
 252 |         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
 253 |         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 254 | 
 255 |         # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
 256 |         # any TensorFlow checkpoint file
 257 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 258 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 259 | 
 260 |     def forward(self, input_ids, token_type_ids=None):
 261 |         seq_length = input_ids.size(1)
 262 |         position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
 263 |         position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
 264 |         if token_type_ids is None:
 265 |             token_type_ids = torch.zeros_like(input_ids)
 266 | 
 267 |         words_embeddings = self.word_embeddings(input_ids)
 268 |         position_embeddings = self.position_embeddings(position_ids)
 269 |         token_type_embeddings = self.token_type_embeddings(token_type_ids)
 270 | 
 271 |         embeddings = words_embeddings + position_embeddings + token_type_embeddings
 272 |         embeddings = self.LayerNorm(embeddings)
 273 |         embeddings = self.dropout(embeddings)
 274 |         return embeddings
 275 | 
 276 | 
 277 | class BertSelfAttention(nn.Module):
 278 |     def __init__(self, config):
 279 |         super(BertSelfAttention, self).__init__()
 280 |         if config.hidden_size % config.num_attention_heads != 0:
 281 |             raise ValueError(
 282 |                 "The hidden size (%d) is not a multiple of the number of attention "
 283 |                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
 284 |         self.num_attention_heads = config.num_attention_heads
 285 |         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
 286 |         self.all_head_size = self.num_attention_heads * self.attention_head_size
 287 | 
 288 |         self.query = nn.Linear(config.hidden_size, self.all_head_size)
 289 |         self.key = nn.Linear(config.hidden_size, self.all_head_size)
 290 |         self.value = nn.Linear(config.hidden_size, self.all_head_size)
 291 | 
 292 |         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 293 | 
 294 |     def transpose_for_scores(self, x):
 295 |         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
 296 |         x = x.view(*new_x_shape)
 297 |         return x.permute(0, 2, 1, 3)
 298 | 
 299 |     def forward(self, hidden_states, attention_mask):
 300 |         mixed_query_layer = self.query(hidden_states)
 301 |         mixed_key_layer = self.key(hidden_states)
 302 |         mixed_value_layer = self.value(hidden_states)
 303 | 
 304 |         query_layer = self.transpose_for_scores(mixed_query_layer)
 305 |         key_layer = self.transpose_for_scores(mixed_key_layer)
 306 |         value_layer = self.transpose_for_scores(mixed_value_layer)
 307 | 
 308 |         # Take the dot product between "query" and "key" to get the raw attention scores.
 309 |         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
 310 |         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
 311 |         # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
 312 |         attention_scores = attention_scores + attention_mask
 313 | 
 314 |         # Normalize the attention scores to probabilities.
 315 |         attention_probs = nn.Softmax(dim=-1)(attention_scores)
 316 | 
 317 |         # This is actually dropping out entire tokens to attend to, which might
 318 |         # seem a bit unusual, but is taken from the original Transformer paper.
 319 |         attention_probs = self.dropout(attention_probs)
 320 | 
 321 |         context_layer = torch.matmul(attention_probs, value_layer)
 322 |         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
 323 |         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
 324 |         context_layer = context_layer.view(*new_context_layer_shape)
 325 |         return context_layer
 326 | 
 327 | 
 328 | class BertSelfOutput(nn.Module):
 329 |     def __init__(self, config):
 330 |         super(BertSelfOutput, self).__init__()
 331 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 332 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 333 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 334 | 
 335 |     def forward(self, hidden_states, input_tensor):
 336 |         hidden_states = self.dense(hidden_states)
 337 |         hidden_states = self.dropout(hidden_states)
 338 |         hidden_states = self.LayerNorm(hidden_states + input_tensor)
 339 |         return hidden_states
 340 | 
 341 | 
 342 | class BertAttention(nn.Module):
 343 |     def __init__(self, config):
 344 |         super(BertAttention, self).__init__()
 345 |         self.self = BertSelfAttention(config)
 346 |         self.output = BertSelfOutput(config)
 347 | 
 348 |     def forward(self, input_tensor, attention_mask):
 349 |         self_output = self.self(input_tensor, attention_mask)
 350 |         attention_output = self.output(self_output, input_tensor)
 351 |         return attention_output
 352 | 
 353 | 
 354 | class BertIntermediate(nn.Module):
 355 |     def __init__(self, config):
 356 |         super(BertIntermediate, self).__init__()
 357 |         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
 358 |         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
 359 |             self.intermediate_act_fn = ACT2FN[config.hidden_act]
 360 |         else:
 361 |             self.intermediate_act_fn = config.hidden_act
 362 | 
 363 |     def forward(self, hidden_states):
 364 |         hidden_states = self.dense(hidden_states)
 365 |         hidden_states = self.intermediate_act_fn(hidden_states)
 366 |         return hidden_states
 367 | 
 368 | 
 369 | class BertOutput(nn.Module):
 370 |     def __init__(self, config):
 371 |         super(BertOutput, self).__init__()
 372 |         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
 373 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 374 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 375 | 
 376 |     def forward(self, hidden_states, input_tensor):
 377 |         hidden_states = self.dense(hidden_states)
 378 |         hidden_states = self.dropout(hidden_states)
 379 |         hidden_states = self.LayerNorm(hidden_states + input_tensor)
 380 |         return hidden_states
 381 | 
 382 | 
 383 | class BertLayer(nn.Module):
 384 |     def __init__(self, config):
 385 |         super(BertLayer, self).__init__()
 386 |         self.attention = BertAttention(config)
 387 |         self.intermediate = BertIntermediate(config)
 388 |         self.output = BertOutput(config)
 389 | 
 390 |     def forward(self, hidden_states, attention_mask):
 391 |         attention_output = self.attention(hidden_states, attention_mask)
 392 |         intermediate_output = self.intermediate(attention_output)
 393 |         layer_output = self.output(intermediate_output, attention_output)
 394 |         return layer_output
 395 | 
 396 | 
 397 | class BertEncoder(nn.Module):
 398 |     def __init__(self, config):
 399 |         super(BertEncoder, self).__init__()
 400 |         layer = BertLayer(config)
 401 |         self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 402 | 
 403 |     def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
 404 |         all_encoder_layers = []
 405 |         for layer_module in self.layer:
 406 |             hidden_states = layer_module(hidden_states, attention_mask)
 407 |             if output_all_encoded_layers:
 408 |                 all_encoder_layers.append(hidden_states)
 409 |         if not output_all_encoded_layers:
 410 |             all_encoder_layers.append(hidden_states)
 411 |         return all_encoder_layers
 412 | 
 413 | 
 414 | class BertPooler(nn.Module):
 415 |     def __init__(self, config):
 416 |         super(BertPooler, self).__init__()
 417 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 418 |         self.activation = nn.Tanh()
 419 | 
 420 |     def forward(self, hidden_states):
 421 |         # We "pool" the model by simply taking the hidden state corresponding
 422 |         # to the first token.
 423 |         first_token_tensor = hidden_states[:, 0]
 424 |         pooled_output = self.dense(first_token_tensor)
 425 |         pooled_output = self.activation(pooled_output)
 426 |         return pooled_output
 427 | 
 428 | 
 429 | class BertPredictionHeadTransform(nn.Module):
 430 |     def __init__(self, config):
 431 |         super(BertPredictionHeadTransform, self).__init__()
 432 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 433 |         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
 434 |             self.transform_act_fn = ACT2FN[config.hidden_act]
 435 |         else:
 436 |             self.transform_act_fn = config.hidden_act
 437 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 438 | 
 439 |     def forward(self, hidden_states):
 440 |         hidden_states = self.dense(hidden_states)
 441 |         hidden_states = self.transform_act_fn(hidden_states)
 442 |         hidden_states = self.LayerNorm(hidden_states)
 443 |         return hidden_states
 444 | 
 445 | 
 446 | class BertLMPredictionHead(nn.Module):
 447 |     def __init__(self, config, bert_model_embedding_weights):
 448 |         super(BertLMPredictionHead, self).__init__()
 449 |         self.transform = BertPredictionHeadTransform(config)
 450 | 
 451 |         # The output weights are the same as the input embeddings, but there is
 452 |         # an output-only bias for each token.
 453 |         self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
 454 |                                  bert_model_embedding_weights.size(0),
 455 |                                  bias=False)
 456 |         self.decoder.weight = bert_model_embedding_weights
 457 |         self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
 458 | 
 459 |     def forward(self, hidden_states):
 460 |         hidden_states = self.transform(hidden_states)
 461 |         hidden_states = self.decoder(hidden_states) + self.bias
 462 |         return hidden_states
 463 | 
 464 | 
 465 | class BertOnlyMLMHead(nn.Module):
 466 |     def __init__(self, config, bert_model_embedding_weights):
 467 |         super(BertOnlyMLMHead, self).__init__()
 468 |         self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
 469 | 
 470 |     def forward(self, sequence_output):
 471 |         prediction_scores = self.predictions(sequence_output)
 472 |         return prediction_scores
 473 | 
 474 | 
 475 | class BertOnlyNSPHead(nn.Module):
 476 |     def __init__(self, config):
 477 |         super(BertOnlyNSPHead, self).__init__()
 478 |         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 479 | 
 480 |     def forward(self, pooled_output):
 481 |         seq_relationship_score = self.seq_relationship(pooled_output)
 482 |         return seq_relationship_score
 483 | 
 484 | 
 485 | class BertPreTrainingHeads(nn.Module):
 486 |     def __init__(self, config, bert_model_embedding_weights):
 487 |         super(BertPreTrainingHeads, self).__init__()
 488 |         self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
 489 |         self.seq_relationship = nn.Linear(config.hidden_size, 2)
 490 | 
 491 |     def forward(self, sequence_output, pooled_output):
 492 |         prediction_scores = self.predictions(sequence_output)
 493 |         seq_relationship_score = self.seq_relationship(pooled_output)
 494 |         return prediction_scores, seq_relationship_score
 495 | 
 496 | 
 497 | class BertPreTrainedModel(nn.Module):
 498 |     """ An abstract class to handle weights initialization and
 499 |         a simple interface for dowloading and loading pretrained models.
 500 |     """
 501 |     def __init__(self, config, *inputs, **kwargs):
 502 |         super(BertPreTrainedModel, self).__init__()
 503 |         if not isinstance(config, BertConfig):
 504 |             raise ValueError(
 505 |                 "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
 506 |                 "To create a model from a Google pretrained model use "
 507 |                 "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
 508 |                     self.__class__.__name__, self.__class__.__name__
 509 |                 ))
 510 |         self.config = config
 511 | 
 512 |     def init_bert_weights(self, module):
 513 |         """ Initialize the weights.
 514 |         """
 515 |         if isinstance(module, (nn.Linear, nn.Embedding)):
 516 |             # Slightly different from the TF version which uses truncated_normal for initialization
 517 |             # cf https://github.com/pytorch/pytorch/pull/5617
 518 |             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
 519 |         elif isinstance(module, BertLayerNorm):
 520 |             module.bias.data.zero_()
 521 |             module.weight.data.fill_(1.0)
 522 |         if isinstance(module, nn.Linear) and module.bias is not None:
 523 |             module.bias.data.zero_()
 524 | 
 525 |     @classmethod
 526 |     def from_pretrained(cls, pretrained_model_name_or_path, state_dict=None, cache_dir=None,
 527 |                         from_tf=False, *inputs, **kwargs):
 528 |         """
 529 |         Instantiate a BertPreTrainedModel from a pre-trained model file or a pytorch state dict.
 530 |         Download and cache the pre-trained model file if needed.
 531 | 
 532 |         Params:
 533 |             pretrained_model_name_or_path: either:
 534 |                 - a str with the name of a pre-trained model to load selected in the list of:
 535 |                     . `bert-base-uncased`
 536 |                     . `bert-large-uncased`
 537 |                     . `bert-base-cased`
 538 |                     . `bert-large-cased`
 539 |                     . `bert-base-multilingual-uncased`
 540 |                     . `bert-base-multilingual-cased`
 541 |                     . `bert-base-chinese`
 542 |                 - a path or url to a pretrained model archive containing:
 543 |                     . `bert_config.json` a configuration file for the model
 544 |                     . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance
 545 |                 - a path or url to a pretrained model archive containing:
 546 |                     . `bert_config.json` a configuration file for the model
 547 |                     . `model.chkpt` a TensorFlow checkpoint
 548 |             from_tf: should we load the weights from a locally saved TensorFlow checkpoint
 549 |             cache_dir: an optional path to a folder in which the pre-trained models will be cached.
 550 |             state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
 551 |             *inputs, **kwargs: additional input for the specific Bert class
 552 |                 (ex: num_labels for BertForSequenceClassification)
 553 |         """
 554 |         if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
 555 |             archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
 556 |         else:
 557 |             archive_file = pretrained_model_name_or_path
 558 |         # redirect to the cache, if necessary
 559 |         try:
 560 |             resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
 561 |         except EnvironmentError:
 562 |             logger.error(
 563 |                 "Model name '{}' was not found in model name list ({}). "
 564 |                 "We assumed '{}' was a path or url but couldn't find any file "
 565 |                 "associated to this path or url.".format(
 566 |                     pretrained_model_name_or_path,
 567 |                     ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
 568 |                     archive_file))
 569 |             return None
 570 |         if resolved_archive_file == archive_file:
 571 |             logger.info("loading archive file {}".format(archive_file))
 572 |         else:
 573 |             logger.info("loading archive file {} from cache at {}".format(
 574 |                 archive_file, resolved_archive_file))
 575 |         tempdir = None
 576 |         if os.path.isdir(resolved_archive_file) or from_tf:
 577 |             serialization_dir = resolved_archive_file
 578 |         else:
 579 |             # Extract archive to temp dir
 580 |             tempdir = tempfile.mkdtemp()
 581 |             logger.info("extracting archive file {} to temp dir {}".format(
 582 |                 resolved_archive_file, tempdir))
 583 |             with tarfile.open(resolved_archive_file, 'r:gz') as archive:
 584 |                 archive.extractall(tempdir)
 585 |             serialization_dir = tempdir
 586 |         # Load config
 587 |         config_file = os.path.join(serialization_dir, CONFIG_NAME)
 588 |         if not os.path.exists(config_file):
 589 |             # Backward compatibility with old naming format
 590 |             config_file = os.path.join(serialization_dir, BERT_CONFIG_NAME)
 591 |         config = BertConfig.from_json_file(config_file)
 592 |         logger.info("Model config {}".format(config))
 593 |         # Instantiate model.
 594 |         model = cls(config, *inputs, **kwargs)
 595 |         if state_dict is None and not from_tf:
 596 |             weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
 597 |             state_dict = torch.load(weights_path, map_location='cpu')
 598 |         if tempdir:
 599 |             # Clean up temp dir
 600 |             shutil.rmtree(tempdir)
 601 |         if from_tf:
 602 |             # Directly load from a TensorFlow checkpoint
 603 |             weights_path = os.path.join(serialization_dir, TF_WEIGHTS_NAME)
 604 |             return load_tf_weights_in_bert(model, weights_path)
 605 |         # Load from a PyTorch state_dict
 606 |         old_keys = []
 607 |         new_keys = []
 608 |         for key in state_dict.keys():
 609 |             new_key = None
 610 |             if 'gamma' in key:
 611 |                 new_key = key.replace('gamma', 'weight')
 612 |             if 'beta' in key:
 613 |                 new_key = key.replace('beta', 'bias')
 614 |             if new_key:
 615 |                 old_keys.append(key)
 616 |                 new_keys.append(new_key)
 617 |         for old_key, new_key in zip(old_keys, new_keys):
 618 |             state_dict[new_key] = state_dict.pop(old_key)
 619 | 
 620 |         missing_keys = []
 621 |         unexpected_keys = []
 622 |         error_msgs = []
 623 |         # copy state_dict so _load_from_state_dict can modify it
 624 |         metadata = getattr(state_dict, '_metadata', None)
 625 |         state_dict = state_dict.copy()
 626 |         if metadata is not None:
 627 |             state_dict._metadata = metadata
 628 | 
 629 |         def load(module, prefix=''):
 630 |             local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
 631 |             module._load_from_state_dict(
 632 |                 state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
 633 |             for name, child in module._modules.items():
 634 |                 if child is not None:
 635 |                     load(child, prefix + name + '.')
 636 |         start_prefix = ''
 637 |         if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()):
 638 |             start_prefix = 'bert.'
 639 |         load(model, prefix=start_prefix)
 640 |         if len(missing_keys) > 0:
 641 |             logger.info("Weights of {} not initialized from pretrained model: {}".format(
 642 |                 model.__class__.__name__, missing_keys))
 643 |         if len(unexpected_keys) > 0:
 644 |             logger.info("Weights from pretrained model not used in {}: {}".format(
 645 |                 model.__class__.__name__, unexpected_keys))
 646 |         if len(error_msgs) > 0:
 647 |             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
 648 |                                model.__class__.__name__, "\n\t".join(error_msgs)))
 649 |         return model
 650 | 
 651 | 
 652 | class BertModel(BertPreTrainedModel):
 653 |     """BERT model ("Bidirectional Embedding Representations from a Transformer").
 654 | 
 655 |     Params:
 656 |         config: a BertConfig class instance with the configuration to build a new model
 657 | 
 658 |     Inputs:
 659 |         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 660 |             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 661 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 662 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 663 |             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 664 |             a `sentence B` token (see BERT paper for more details).
 665 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 666 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 667 |             input sequence length in the current batch. It's the mask that we typically use for attention when
 668 |             a batch has varying length sentences.
 669 |         `output_all_encoded_layers`: boolean which controls the content of the `encoded_layers` output as described below. Default: `True`.
 670 | 
 671 |     Outputs: Tuple of (encoded_layers, pooled_output)
 672 |         `encoded_layers`: controled by `output_all_encoded_layers` argument:
 673 |             - `output_all_encoded_layers=True`: outputs a list of the full sequences of encoded-hidden-states at the end
 674 |                 of each attention block (i.e. 12 full sequences for BERT-base, 24 for BERT-large), each
 675 |                 encoded-hidden-state is a torch.FloatTensor of size [batch_size, sequence_length, hidden_size],
 676 |             - `output_all_encoded_layers=False`: outputs only the full sequence of hidden-states corresponding
 677 |                 to the last attention block of shape [batch_size, sequence_length, hidden_size],
 678 |         `pooled_output`: a torch.FloatTensor of size [batch_size, hidden_size] which is the output of a
 679 |             classifier pretrained on top of the hidden state associated to the first character of the
 680 |             input (`CLS`) to train on the Next-Sentence task (see BERT's paper).
 681 | 
 682 |     Example usage:
 683 |     ```python
 684 |     # Already been converted into WordPiece token ids
 685 |     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
 686 |     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
 687 |     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 688 | 
 689 |     config = modeling.BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
 690 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 691 | 
 692 |     model = modeling.BertModel(config=config)
 693 |     all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
 694 |     ```
 695 |     """
 696 |     def __init__(self, config):
 697 |         super(BertModel, self).__init__(config)
 698 |         self.embeddings = BertEmbeddings(config)
 699 |         self.encoder = BertEncoder(config)
 700 |         self.pooler = BertPooler(config)
 701 |         self.apply(self.init_bert_weights)
 702 | 
 703 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
 704 |         if attention_mask is None:
 705 |             attention_mask = torch.ones_like(input_ids)
 706 |         if token_type_ids is None:
 707 |             token_type_ids = torch.zeros_like(input_ids)
 708 | 
 709 |         # We create a 3D attention mask from a 2D tensor mask.
 710 |         # Sizes are [batch_size, 1, 1, to_seq_length]
 711 |         # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
 712 |         # this attention mask is more simple than the triangular masking of causal attention
 713 |         # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
 714 |         extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
 715 | 
 716 |         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
 717 |         # masked positions, this operation will create a tensor which is 0.0 for
 718 |         # positions we want to attend and -10000.0 for masked positions.
 719 |         # Since we are adding it to the raw scores before the softmax, this is
 720 |         # effectively the same as removing these entirely.
 721 |         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
 722 |         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 723 | 
 724 |         embedding_output = self.embeddings(input_ids, token_type_ids)
 725 |         encoded_layers = self.encoder(embedding_output,
 726 |                                       extended_attention_mask,
 727 |                                       output_all_encoded_layers=output_all_encoded_layers)
 728 |         sequence_output = encoded_layers[-1]
 729 |         pooled_output = self.pooler(sequence_output)
 730 |         if not output_all_encoded_layers:
 731 |             encoded_layers = encoded_layers[-1]
 732 |         return encoded_layers, pooled_output
 733 | 
 734 | 
 735 | class BertForPreTraining(BertPreTrainedModel):
 736 |     """BERT model with pre-training heads.
 737 |     This module comprises the BERT model followed by the two pre-training heads:
 738 |         - the masked language modeling head, and
 739 |         - the next sentence classification head.
 740 | 
 741 |     Params:
 742 |         config: a BertConfig class instance with the configuration to build a new model.
 743 | 
 744 |     Inputs:
 745 |         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 746 |             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 747 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 748 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 749 |             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 750 |             a `sentence B` token (see BERT paper for more details).
 751 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 752 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 753 |             input sequence length in the current batch. It's the mask that we typically use for attention when
 754 |             a batch has varying length sentences.
 755 |         `masked_lm_labels`: optional masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
 756 |             with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
 757 |             is only computed for the labels set in [0, ..., vocab_size]
 758 |         `next_sentence_label`: optional next sentence classification loss: torch.LongTensor of shape [batch_size]
 759 |             with indices selected in [0, 1].
 760 |             0 => next sentence is the continuation, 1 => next sentence is a random sentence.
 761 | 
 762 |     Outputs:
 763 |         if `masked_lm_labels` and `next_sentence_label` are not `None`:
 764 |             Outputs the total_loss which is the sum of the masked language modeling loss and the next
 765 |             sentence classification loss.
 766 |         if `masked_lm_labels` or `next_sentence_label` is `None`:
 767 |             Outputs a tuple comprising
 768 |             - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
 769 |             - the next sentence classification logits of shape [batch_size, 2].
 770 | 
 771 |     Example usage:
 772 |     ```python
 773 |     # Already been converted into WordPiece token ids
 774 |     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
 775 |     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
 776 |     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 777 | 
 778 |     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
 779 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 780 | 
 781 |     model = BertForPreTraining(config)
 782 |     masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
 783 |     ```
 784 |     """
 785 |     def __init__(self, config):
 786 |         super(BertForPreTraining, self).__init__(config)
 787 |         self.bert = BertModel(config)
 788 |         self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
 789 |         self.apply(self.init_bert_weights)
 790 | 
 791 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, next_sentence_label=None):
 792 |         sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
 793 |                                                    output_all_encoded_layers=False)
 794 |         prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
 795 | 
 796 |         if masked_lm_labels is not None and next_sentence_label is not None:
 797 |             loss_fct = CrossEntropyLoss(ignore_index=-1)
 798 |             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
 799 |             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
 800 |             total_loss = masked_lm_loss + next_sentence_loss
 801 |             return total_loss
 802 |         else:
 803 |             return prediction_scores, seq_relationship_score
 804 | 
 805 | 
 806 | class BertForMaskedLM(BertPreTrainedModel):
 807 |     """BERT model with the masked language modeling head.
 808 |     This module comprises the BERT model followed by the masked language modeling head.
 809 | 
 810 |     Params:
 811 |         config: a BertConfig class instance with the configuration to build a new model.
 812 | 
 813 |     Inputs:
 814 |         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 815 |             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 816 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 817 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 818 |             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 819 |             a `sentence B` token (see BERT paper for more details).
 820 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 821 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 822 |             input sequence length in the current batch. It's the mask that we typically use for attention when
 823 |             a batch has varying length sentences.
 824 |         `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
 825 |             with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
 826 |             is only computed for the labels set in [0, ..., vocab_size]
 827 | 
 828 |     Outputs:
 829 |         if `masked_lm_labels` is  not `None`:
 830 |             Outputs the masked language modeling loss.
 831 |         if `masked_lm_labels` is `None`:
 832 |             Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
 833 | 
 834 |     Example usage:
 835 |     ```python
 836 |     # Already been converted into WordPiece token ids
 837 |     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
 838 |     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
 839 |     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 840 | 
 841 |     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
 842 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 843 | 
 844 |     model = BertForMaskedLM(config)
 845 |     masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
 846 |     ```
 847 |     """
 848 |     def __init__(self, config):
 849 |         super(BertForMaskedLM, self).__init__(config)
 850 |         self.bert = BertModel(config)
 851 |         self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
 852 |         self.apply(self.init_bert_weights)
 853 | 
 854 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None):
 855 |         sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask,
 856 |                                        output_all_encoded_layers=False)
 857 |         prediction_scores = self.cls(sequence_output)
 858 | 
 859 |         if masked_lm_labels is not None:
 860 |             loss_fct = CrossEntropyLoss(ignore_index=-1)
 861 |             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
 862 |             return masked_lm_loss
 863 |         else:
 864 |             return prediction_scores
 865 | 
 866 | 
 867 | class BertForNextSentencePrediction(BertPreTrainedModel):
 868 |     """BERT model with next sentence prediction head.
 869 |     This module comprises the BERT model followed by the next sentence classification head.
 870 | 
 871 |     Params:
 872 |         config: a BertConfig class instance with the configuration to build a new model.
 873 | 
 874 |     Inputs:
 875 |         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 876 |             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
 877 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 878 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 879 |             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 880 |             a `sentence B` token (see BERT paper for more details).
 881 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 882 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 883 |             input sequence length in the current batch. It's the mask that we typically use for attention when
 884 |             a batch has varying length sentences.
 885 |         `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
 886 |             with indices selected in [0, 1].
 887 |             0 => next sentence is the continuation, 1 => next sentence is a random sentence.
 888 | 
 889 |     Outputs:
 890 |         if `next_sentence_label` is not `None`:
 891 |             Outputs the total_loss which is the sum of the masked language modeling loss and the next
 892 |             sentence classification loss.
 893 |         if `next_sentence_label` is `None`:
 894 |             Outputs the next sentence classification logits of shape [batch_size, 2].
 895 | 
 896 |     Example usage:
 897 |     ```python
 898 |     # Already been converted into WordPiece token ids
 899 |     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
 900 |     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
 901 |     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 902 | 
 903 |     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
 904 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 905 | 
 906 |     model = BertForNextSentencePrediction(config)
 907 |     seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
 908 |     ```
 909 |     """
 910 |     def __init__(self, config):
 911 |         super(BertForNextSentencePrediction, self).__init__(config)
 912 |         self.bert = BertModel(config)
 913 |         self.cls = BertOnlyNSPHead(config)
 914 |         self.apply(self.init_bert_weights)
 915 | 
 916 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None):
 917 |         _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
 918 |                                      output_all_encoded_layers=False)
 919 |         seq_relationship_score = self.cls( pooled_output)
 920 | 
 921 |         if next_sentence_label is not None:
 922 |             loss_fct = CrossEntropyLoss(ignore_index=-1)
 923 |             next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
 924 |             return next_sentence_loss
 925 |         else:
 926 |             return seq_relationship_score
 927 | 
 928 | 
 929 | class BertForSequenceClassification(BertPreTrainedModel):
 930 |     """BERT model for classification.
 931 |     This module is composed of the BERT model with a linear layer on top of
 932 |     the pooled output.
 933 | 
 934 |     Params:
 935 |         `config`: a BertConfig class instance with the configuration to build a new model.
 936 |         `num_labels`: the number of classes for the classifier. Default = 2.
 937 | 
 938 |     Inputs:
 939 |         `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
 940 |             with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
 941 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
 942 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
 943 |             types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
 944 |             a `sentence B` token (see BERT paper for more details).
 945 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
 946 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
 947 |             input sequence length in the current batch. It's the mask that we typically use for attention when
 948 |             a batch has varying length sentences.
 949 |         `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
 950 |             with indices selected in [0, ..., num_labels].
 951 | 
 952 |     Outputs:
 953 |         if `labels` is not `None`:
 954 |             Outputs the CrossEntropy classification loss of the output with the labels.
 955 |         if `labels` is `None`:
 956 |             Outputs the classification logits of shape [batch_size, num_labels].
 957 | 
 958 |     Example usage:
 959 |     ```python
 960 |     # Already been converted into WordPiece token ids
 961 |     input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
 962 |     input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
 963 |     token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
 964 | 
 965 |     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
 966 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
 967 | 
 968 |     num_labels = 2
 969 | 
 970 |     model = BertForSequenceClassification(config, num_labels)
 971 |     logits = model(input_ids, token_type_ids, input_mask)
 972 |     ```
 973 |     """
 974 |     def __init__(self, config, num_labels):
 975 |         super(BertForSequenceClassification, self).__init__(config)
 976 |         self.num_labels = num_labels
 977 |         self.bert = BertModel(config)
 978 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 979 |         self.classifier = nn.Linear(config.hidden_size, num_labels)
 980 |         self.apply(self.init_bert_weights)
 981 | 
 982 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
 983 |         _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
 984 |         pooled_output = self.dropout(pooled_output)
 985 |         logits = self.classifier(pooled_output)
 986 | 
 987 |         if labels is not None:
 988 |             loss_fct = CrossEntropyLoss()
 989 |             loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
 990 |             return loss
 991 |         else:
 992 |             return logits
 993 | 
 994 | 
 995 | class BertForMultipleChoice(BertPreTrainedModel):
 996 |     """BERT model for multiple choice tasks.
 997 |     This module is composed of the BERT model with a linear layer on top of
 998 |     the pooled output.
 999 | 
1000 |     Params:
1001 |         `config`: a BertConfig class instance with the configuration to build a new model.
1002 |         `num_choices`: the number of classes for the classifier. Default = 2.
1003 | 
1004 |     Inputs:
1005 |         `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
1006 |             with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
1007 |             `extract_features.py`, `run_classifier.py` and `run_squad.py`)
1008 |         `token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
1009 |             with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
1010 |             and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
1011 |         `attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
1012 |             selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
1013 |             input sequence length in the current batch. It's the mask that we typically use for attention when
1014 |             a batch has varying length sentences.
1015 |         `labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
1016 |             with indices selected in [0, ..., num_choices].
1017 | 
1018 |     Outputs:
1019 |         if `labels` is not `None`:
1020 |             Outputs the CrossEntropy classification loss of the output with the labels.
1021 |         if `labels` is `None`:
1022 |             Outputs the classification logits of shape [batch_size, num_labels].
1023 | 
1024 |     Example usage:
1025 |     ```python
1026 |     # Already been converted into WordPiece token ids
1027 |     input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
1028 |     input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
1029 |     token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
1030 |     config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
1031 |         num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
1032 | 
1033 |     num_choices = 2
1034 | 
1035 |     model = BertForMultipleChoice(config, num_choices)
1036 |     logits = model(input_ids, token_type_ids, input_mask)
1037 |     ```
1038 |     """
1039 |     def __init__(self, config, num_choices):
1040 |         super(BertForMultipleChoice, self).__init__(config)
1041 |         self.num_choices = num_choices
1042 |         self.bert = BertModel(config)
1043 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
1044 |         self.classifier = nn.Linear(config.hidden_size, 1)
1045 |         self.apply(self.init_bert_weights)
1046 | 
1047 |     def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
1048 |         flat_input_ids = input_ids.view(-1, input_ids.size(-1))
1049 |         flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
1050 |         flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
1051 |         _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
1052 |         pooled_output = self.dropout(pooled_output)
1053 |         logits = self.classifier(pooled_output)
1054 |         reshaped_logits = logits.view(-1, self.num_choices)
1055 | 
1056 |         if labels is not None:
1057 |             loss_fct = CrossEntropyLoss()
1058 |             loss = loss_fct(reshaped_logits, labels)
1059 |             return loss
1060 |         else:
1061 |             return reshaped_logits
1062 | 
1063 | 
class BertForTokenClassification(BertPreTrainedModel):
    """BERT encoder with a per-token classifier on top.

    The last-layer hidden state of every token goes through dropout and then
    a single linear layer that produces `num_labels` scores per token.

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.
        `num_labels`: the number of classes for the classifier (required argument).

    Inputs:
        `input_ids`: torch.LongTensor of shape [batch_size, sequence_length] holding the
            vocabulary indices of the word tokens (token preprocessing is shown in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`).
        `token_type_ids`: optional torch.LongTensor of shape [batch_size, sequence_length] with
            values in [0, 1]: 0 marks a `sentence A` token and 1 a `sentence B` token
            (see the BERT paper for details).
        `attention_mask`: optional torch.LongTensor of shape [batch_size, sequence_length] with
            values in [0, 1]. It is the usual padding mask for batches whose sentences are
            shorter than the longest sequence in the batch. When given together with
            `labels`, only positions whose mask value equals 1 contribute to the loss.
        `labels`: optional torch.LongTensor of shape [batch_size, sequence_length] with the
            class index of each token.

    Outputs:
        if `labels` is given:
            the CrossEntropy classification loss of the logits against the labels.
        otherwise:
            the classification logits of shape [batch_size, sequence_length, num_labels].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    num_labels = 2

    model = BertForTokenClassification(config, num_labels)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config, num_labels):
        super(BertForTokenClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # Only the per-token output is used; the pooled output is discarded.
        hidden_states, _ = self.bert(
            input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits = self.classifier(self.dropout(hidden_states))

        if labels is None:
            return logits

        criterion = CrossEntropyLoss()
        if attention_mask is None:
            # No mask: every position takes part in the loss.
            return criterion(logits.view(-1, self.num_labels), labels.view(-1))

        # Restrict the loss to the positions the mask marks with 1.
        keep = attention_mask.view(-1) == 1
        kept_logits = logits.view(-1, self.num_labels)[keep]
        kept_labels = labels.view(-1)[keep]
        return criterion(kept_logits, kept_labels)
1135 | 
1136 | 
class BertForQuestionAnswering(BertPreTrainedModel):
    """BERT model for Question Answering (span extraction).
    This module is composed of the BERT model with a linear layer on top of
    the sequence output that computes start_logits and end_logits

    Params:
        `config`: a BertConfig class instance with the configuration to build a new model.

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
            `extract_features.py`, `run_classifier.py` and `run_squad.py`)
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `start_positions`: position of the first token for the labeled span: torch.LongTensor of shape [batch_size].
            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
            into account for computing the loss. The tensor passed in is not modified.
        `end_positions`: position of the last token for the labeled span: torch.LongTensor of shape [batch_size].
            Positions are clamped to the length of the sequence and position outside of the sequence are not taken
            into account for computing the loss. The tensor passed in is not modified.

    Outputs:
        if `start_positions` and `end_positions` are not `None`:
            Outputs the total_loss which is the average of the CrossEntropy losses for the start and end
            token positions.
        if `start_positions` or `end_positions` is `None`:
            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
            position tokens of shape [batch_size, sequence_length].

    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

    model = BertForQuestionAnswering(config)
    start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, config):
        super(BertForQuestionAnswering, self).__init__(config)
        self.bert = BertModel(config)
        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Two outputs per token: one start score and one end score.
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None):
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        logits = self.qa_outputs(sequence_output)
        # Separate the last dimension into the start and the end scores.
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms:
            # out-of-range positions are clamped to `sequence_length`, which is then
            # used as the loss' ignore_index.
            ignored_index = start_logits.size(1)
            # Out-of-place clamp: the in-place variant would silently overwrite the
            # label tensors owned by the caller (squeeze() above returns a view, so
            # it offers no protection).
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        else:
            return start_logits, end_logits
1217 | 


--------------------------------------------------------------------------------
/bert_nlp/pytorch_pretrained_bert/optimization.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """PyTorch optimization for BERT model."""
 16 | 
 17 | import math
 18 | import torch
 19 | from torch.optim import Optimizer
 20 | from torch.optim.optimizer import required
 21 | from torch.nn.utils import clip_grad_norm_
 22 | import logging
 23 | import abc
 24 | import sys
 25 | 
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | 
 29 | if sys.version_info >= (3, 4):
 30 |     ABC = abc.ABC
 31 | else:
 32 |     ABC = abc.ABCMeta('ABC', (), {})
 33 | 
 34 | 
 35 | class _LRSchedule(ABC):
 36 |     """ Parent of all LRSchedules here. """
 37 |     warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense
 38 |     def __init__(self, warmup=0.002, t_total=-1, **kw):
 39 |         """
 40 |         :param warmup:  what fraction of t_total steps will be used for linear warmup
 41 |         :param t_total: how many training steps (updates) are planned
 42 |         :param kw:
 43 |         """
 44 |         super(_LRSchedule, self).__init__(**kw)
 45 |         if t_total < 0:
 46 |             logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
 47 |         if not 0.0 <= warmup < 1.0 and not warmup == -1:
 48 |             raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
 49 |         warmup = max(warmup, 0.)
 50 |         self.warmup, self.t_total = float(warmup), float(t_total)
 51 |         self.warned_for_t_total_at_progress = -1
 52 | 
 53 |     def get_lr(self, step, nowarn=False):
 54 |         """
 55 |         :param step:    which of t_total steps we're on
 56 |         :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps
 57 |         :return:        learning rate multiplier for current update
 58 |         """
 59 |         if self.t_total < 0:
 60 |             return 1.
 61 |         progress = float(step) / self.t_total
 62 |         ret = self.get_lr_(progress)
 63 |         # warning for exceeding t_total (only active with warmup_linear
 64 |         if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
 65 |             logger.warning(
 66 |                 "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
 67 |                     .format(ret, self.__class__.__name__))
 68 |             self.warned_for_t_total_at_progress = progress
 69 |         # end warning
 70 |         return ret
 71 | 
 72 |     @abc.abstractmethod
 73 |     def get_lr_(self, progress):
 74 |         """
 75 |         :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress
 76 |         :return:            learning rate multiplier for current update
 77 |         """
 78 |         return 1.
 79 | 
 80 | 
 81 | class ConstantLR(_LRSchedule):
 82 |     def get_lr_(self, progress):
 83 |         return 1.
 84 | 
 85 | 
 86 | class WarmupCosineSchedule(_LRSchedule):
 87 |     """
 88 |     Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts.
 89 |     """
 90 |     warn_t_total = True
 91 |     def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
 92 |         """
 93 |         :param warmup:      see LRSchedule
 94 |         :param t_total:     see LRSchedule
 95 |         :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
 96 |         :param kw:
 97 |         """
 98 |         super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
 99 |         self.cycles = cycles
100 | 
101 |     def get_lr_(self, progress):
102 |         if progress < self.warmup:
103 |             return progress / self.warmup
104 |         else:
105 |             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
106 |             return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
107 | 
108 | 
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
    """
    Linear warmup followed by cosine decay that jumps back to the full rate
    at the start of every cycle (hard restarts, visible when cycles > 1).
    """

    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
        """
        :param warmup:      forwarded to WarmupCosineSchedule
        :param t_total:     forwarded to WarmupCosineSchedule
        :param cycles:      number of cosine cycles after warmup; asserted to be >= 1
        :param kw:          extra keyword arguments forwarded to the parent schedule
        """
        super(WarmupCosineWithHardRestartsSchedule, self).__init__(
            warmup=warmup, t_total=t_total, cycles=cycles, **kw)
        assert cycles >= 1.

    def get_lr_(self, progress):
        """
        :param progress:    training progress (normally between 0 and 1)
        :return:            ``progress / warmup`` during warmup, afterwards the per-cycle cosine multiplier
        """
        if progress < self.warmup:
            return progress / self.warmup
        # Rescale so that 0 marks the end of warmup and 1 the end of training.
        after_warmup = (progress - self.warmup) / (1 - self.warmup)
        # The fractional part is the position inside the current cycle.
        cycle_position = (self.cycles * after_warmup) % 1
        return 0.5 * (1. + math.cos(math.pi * cycle_position))
124 | 
125 | 
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
    """
    Cosine schedule in which every cycle begins with its own linear warmup.

    Each restart warms up at the same rate as the initial warmup, so the
    fraction of all steps spent warming up is warmup * cycles.
    """

    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
        """
        :param warmup:      warmup fraction of the whole run; stored as a fraction of one cycle (warmup * cycles)
        :param t_total:     forwarded to the parent schedule
        :param cycles:      number of warmup+cosine cycles
        :param kw:          extra keyword arguments forwarded to the parent schedule
        """
        assert warmup * cycles < 1.
        # Negative warmup values are passed through untouched.
        if warmup >= 0:
            warmup = warmup * cycles
        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(
            warmup=warmup, t_total=t_total, cycles=cycles, **kw)

    def get_lr_(self, progress):
        """
        :param progress:    training progress (normally between 0 and 1)
        :return:            multiplier for the current position inside the current cycle
        """
        # Position inside the current cycle, in [0, 1).
        within_cycle = (progress * self.cycles) % 1.
        if within_cycle < self.warmup:
            return within_cycle / self.warmup
        after_warmup = (within_cycle - self.warmup) / (1 - self.warmup)
        return 0.5 * (1. + math.cos(math.pi * after_warmup))
145 | 
146 | 
class WarmupConstantSchedule(_LRSchedule):
    """
    Linear warmup, then a constant multiplier of 1.
    """

    def get_lr_(self, progress):
        """
        :param progress:    training progress (normally between 0 and 1)
        :return:            ``progress / warmup`` while warming up, otherwise 1.
        """
        return progress / self.warmup if progress < self.warmup else 1.
155 | 
156 | 
class WarmupLinearSchedule(_LRSchedule):
    """
    Linear warmup followed by a linear decay to zero.
    """
    # Opt in to the "training beyond t_total" warning emitted by get_lr.
    warn_t_total = True

    def get_lr_(self, progress):
        """
        :param progress:    training progress (normally between 0 and 1)
        :return:            ``progress / warmup`` while warming up, afterwards a linear ramp
                            from 1 (at progress==warmup) to 0 (at progress==1), never negative
        """
        if progress < self.warmup:
            return progress / self.warmup
        decayed = (progress - 1.) / (self.warmup - 1.)
        # Clamp so that training past t_total does not yield a negative rate.
        return max(decayed, 0.)
166 | 
167 | 
# Registry mapping the `schedule` argument accepted by BertAdam to the
# schedule class that implements it. Both None and "none" select the
# constant (no-op) schedule.
SCHEDULES = {
    None:       ConstantLR,
    "none":     ConstantLR,
    "warmup_cosine": WarmupCosineSchedule,
    "warmup_constant": WarmupConstantSchedule,
    "warmup_linear": WarmupLinearSchedule
}
175 | 
176 | 
class BertAdam(Optimizer):
    """Implements BERT version of Adam algorithm with weight decay fix.

    Unlike standard Adam, no bias correction is applied to the moment
    estimates, and weight decay is added to the update directly instead of
    being folded into the gradient.

    Params:
        lr: learning rate
        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
        t_total: total number of training steps for the learning
            rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
        schedule: schedule to use for the warmup (see above).
            Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', 'none', None, or a LRSchedule object.
            Default: 'warmup_linear'
        b1: Adams b1. Default: 0.9
        b2: Adams b2. Default: 0.999
        e: Adams epsilon. Default: 1e-6
        weight_decay: Weight decay. Default: 0.01
        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
        kwargs: accepted for call-site compatibility and ignored.
    """
    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
        """Validate the hyper-parameters and build the schedule object.

        Raises:
            ValueError: if lr is negative, schedule is neither a known name nor
                a schedule object, b1/b2 are outside [0, 1), or e is negative.
        """
        if lr is not required and lr < 0.0:
            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
            raise ValueError("Invalid schedule parameter: {}".format(schedule))
        if not 0.0 <= b1 < 1.0:
            raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
        if not 0.0 <= b2 < 1.0:
            raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
        if not e >= 0.0:
            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
        # initialize schedule object
        if not isinstance(schedule, _LRSchedule):
            schedule_type = SCHEDULES[schedule]
            schedule = schedule_type(warmup=warmup, t_total=t_total)
        else:
            # A ready-made schedule object carries its own warmup/t_total.
            if warmup != -1 or t_total != -1:
                logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. "
                               "Please specify custom warmup and t_total in LRSchedule object.")
        defaults = dict(lr=lr, schedule=schedule,
                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                        max_grad_norm=max_grad_norm)
        super(BertAdam, self).__init__(params, defaults)

    def get_lr(self):
        """Return the scheduled learning rate of every parameter.

        Returns:
            A list with one entry per parameter (group lr times the schedule
            multiplier at that parameter's step), or ``[0]`` as soon as a
            parameter without optimizer state is encountered (i.e. before it
            has been updated by ``step``).
        """
        lr = []
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                if len(state) == 0:
                    return [0]
                lr_scheduled = group['lr']
                lr_scheduled *= group['schedule'].get_lr(state['step'])
                lr.append(lr_scheduled)
        return lr

    def step(self, closure=None):
        """Performs a single optimization step.

        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.is_sparse:
                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['next_m'] = torch.zeros_like(p.data)
                    # Exponential moving average of squared gradient values
                    state['next_v'] = torch.zeros_like(p.data)

                next_m, next_v = state['next_m'], state['next_v']
                beta1, beta2 = group['b1'], group['b2']

                # Add grad clipping (applied to each parameter individually)
                if group['max_grad_norm'] > 0:
                    clip_grad_norm_(p, group['max_grad_norm'])

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time.
                # The scalar is passed by keyword: the positional-scalar
                # overloads add_(Number, Tensor) / addcmul_(Number, Tensor, Tensor)
                # are deprecated in PyTorch.
                next_m.mul_(beta1).add_(grad, alpha=1 - beta1)
                next_v.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                update = next_m / (next_v.sqrt() + group['e'])

                # Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                if group['weight_decay'] > 0.0:
                    update += group['weight_decay'] * p.data

                # The schedule multiplier is evaluated at the step count *before*
                # this update is counted.
                lr_scheduled = group['lr']
                lr_scheduled *= group['schedule'].get_lr(state['step'])

                update_with_lr = lr_scheduled * update
                p.data.add_(-update_with_lr)

                state['step'] += 1

                # NOTE: intentionally no Adam bias correction here.

        return loss
296 | 


--------------------------------------------------------------------------------
/bert_nlp/pytorch_pretrained_bert/tokenization.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """Tokenization classes."""
 16 | 
 17 | from __future__ import absolute_import, division, print_function, unicode_literals
 18 | 
 19 | import collections
 20 | import logging
 21 | import os
 22 | import unicodedata
 23 | from io import open
 24 | 
 25 | from .file_utils import cached_path
 26 | 
 27 | logger = logging.getLogger(__name__)
 28 | 
# Shortcut model name -> URL of the matching vocabulary file.
# BertTokenizer.from_pretrained looks names up here before treating its
# argument as a path or URL.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
}
# Shortcut model name -> number of positional embeddings; used by
# BertTokenizer.from_pretrained as an upper bound for `max_len`.
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'bert-base-uncased': 512,
    'bert-large-uncased': 512,
    'bert-base-cased': 512,
    'bert-large-cased': 512,
    'bert-base-multilingual-uncased': 512,
    'bert-base-multilingual-cased': 512,
    'bert-base-chinese': 512,
}
# File name used when a vocabulary is read from or written to a directory.
VOCAB_NAME = 'vocab.txt'
 48 | 
 49 | 
 50 | def load_vocab(vocab_file):
 51 |     """Loads a vocabulary file into a dictionary."""
 52 |     vocab = collections.OrderedDict()
 53 |     index = 0
 54 |     with open(vocab_file, "r", encoding="utf-8") as reader:
 55 |         while True:
 56 |             token = reader.readline()
 57 |             if not token:
 58 |                 break
 59 |             token = token.strip()
 60 |             vocab[token] = index
 61 |             index += 1
 62 |     return vocab
 63 | 
 64 | 
 65 | def whitespace_tokenize(text):
 66 |     """Runs basic whitespace cleaning and splitting on a piece of text."""
 67 |     text = text.strip()
 68 |     if not text:
 69 |         return []
 70 |     tokens = text.split()
 71 |     return tokens
 72 | 
 73 | 
 74 | class BertTokenizer(object):
 75 |     """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
 76 | 
 77 |     def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
 78 |                  never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
 79 |         """Constructs a BertTokenizer.
 80 | 
 81 |         Args:
 82 |           vocab_file: Path to a one-wordpiece-per-line vocabulary file
 83 |           do_lower_case: Whether to lower case the input
 84 |                          Only has an effect when do_wordpiece_only=False
 85 |           do_basic_tokenize: Whether to do basic tokenization before wordpiece.
 86 |           max_len: An artificial maximum length to truncate tokenized sequences to;
 87 |                          Effective maximum length is always the minimum of this
 88 |                          value (if specified) and the underlying BERT model's
 89 |                          sequence length.
 90 |           never_split: List of tokens which will never be split during tokenization.
 91 |                          Only has an effect when do_wordpiece_only=False
 92 |         """
 93 |         if not os.path.isfile(vocab_file):
 94 |             raise ValueError(
 95 |                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
 96 |                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
 97 |         self.vocab = load_vocab(vocab_file)
 98 |         self.ids_to_tokens = collections.OrderedDict(
 99 |             [(ids, tok) for tok, ids in self.vocab.items()])
100 |         self.do_basic_tokenize = do_basic_tokenize
101 |         if do_basic_tokenize:
102 |           self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
103 |                                                 never_split=never_split)
104 |         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
105 |         self.max_len = max_len if max_len is not None else int(1e12)
106 | 
107 |     def tokenize(self, text):
108 |         split_tokens = []
109 |         if self.do_basic_tokenize:
110 |             for token in self.basic_tokenizer.tokenize(text):
111 |                 for sub_token in self.wordpiece_tokenizer.tokenize(token):
112 |                     split_tokens.append(sub_token)
113 |         else:
114 |             split_tokens = self.wordpiece_tokenizer.tokenize(text)
115 |         return split_tokens
116 | 
117 |     def convert_tokens_to_ids(self, tokens):
118 |         """Converts a sequence of tokens into ids using the vocab."""
119 |         ids = []
120 |         for token in tokens:
121 |             ids.append(self.vocab[token])
122 |         if len(ids) > self.max_len:
123 |             logger.warning(
124 |                 "Token indices sequence length is longer than the specified maximum "
125 |                 " sequence length for this BERT model ({} > {}). Running this"
126 |                 " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
127 |             )
128 |         return ids
129 | 
130 |     def convert_ids_to_tokens(self, ids):
131 |         """Converts a sequence of ids in wordpiece tokens using the vocab."""
132 |         tokens = []
133 |         for i in ids:
134 |             tokens.append(self.ids_to_tokens[i])
135 |         return tokens
136 | 
137 |     def save_vocabulary(self, vocab_path):
138 |         """Save the tokenizer vocabulary to a directory or file."""
139 |         index = 0
140 |         if os.path.isdir(vocab_path):
141 |             vocab_file = os.path.join(vocab_path, VOCAB_NAME)
142 |         with open(vocab_file, "w", encoding="utf-8") as writer:
143 |             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
144 |                 if index != token_index:
145 |                     logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
146 |                                    " Please check that the vocabulary is not corrupted!".format(vocab_file))
147 |                     index = token_index
148 |                 writer.write(token + u'\n')
149 |                 index += 1
150 |         return vocab_file
151 | 
152 |     @classmethod
153 |     def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
154 |         """
155 |         Instantiate a PreTrainedBertModel from a pre-trained model file.
156 |         Download and cache the pre-trained model file if needed.
157 |         """
158 |         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
159 |             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
160 |             if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
161 |                 logger.warning("The pre-trained model you are loading is a cased model but you have not set "
162 |                                "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
163 |                                "you may want to check this behavior.")
164 |                 kwargs['do_lower_case'] = False
165 |             elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
166 |                 logger.warning("The pre-trained model you are loading is an uncased model but you have set "
167 |                                "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
168 |                                "but you may want to check this behavior.")
169 |                 kwargs['do_lower_case'] = True
170 |         else:
171 |             vocab_file = pretrained_model_name_or_path
172 |         if os.path.isdir(vocab_file):
173 |             vocab_file = os.path.join(vocab_file, VOCAB_NAME)
174 |         # redirect to the cache, if necessary
175 |         try:
176 |             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
177 |         except EnvironmentError:
178 |             logger.error(
179 |                 "Model name '{}' was not found in model name list ({}). "
180 |                 "We assumed '{}' was a path or url but couldn't find any file "
181 |                 "associated to this path or url.".format(
182 |                     pretrained_model_name_or_path,
183 |                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
184 |                     vocab_file))
185 |             return None
186 |         if resolved_vocab_file == vocab_file:
187 |             logger.info("loading vocabulary file {}".format(vocab_file))
188 |         else:
189 |             logger.info("loading vocabulary file {} from cache at {}".format(
190 |                 vocab_file, resolved_vocab_file))
191 |         if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
192 |             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
193 |             # than the number of positional embeddings
194 |             max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
195 |             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
196 |         # Instantiate tokenizer.
197 |         tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
198 |         return tokenizer
199 | 
200 | 
class BasicTokenizer(object):
    """Basic tokenizer: text cleanup, CJK isolation, lower casing and punctuation splitting."""

    # Inclusive code point ranges treated as CJK ideographs.
    _CJK_RANGES = (
        (0x4E00, 0x9FFF),
        (0x3400, 0x4DBF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0xF900, 0xFAFF),
        (0x2F800, 0x2FA1F),
    )

    def __init__(self,
                 do_lower_case=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Constructs a BasicTokenizer.

        Args:
          do_lower_case: Whether to lower case the input (and strip accents).
          never_split: Tokens that are neither lower cased nor split on punctuation.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Clean `text` and split it into basic tokens.

        Returns:
          A list of tokens with punctuation characters and CJK ideographs
          separated into tokens of their own.
        """
        # CJK isolation is applied to every model (multilingual, Chinese and
        # English alike): each ideograph is surrounded by spaces so that it
        # becomes a token of its own.
        spaced = self._tokenize_chinese_chars(self._clean_text(text))
        pieces = []
        for word in whitespace_tokenize(spaced):
            if self.do_lower_case and word not in self.never_split:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))
        # Re-split in case any of the steps above produced embedded whitespace.
        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Remove combining marks (Unicode category Mn) after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character is its own piece."""
        if text in self.never_split:
            return [text]
        groups = []
        need_new_group = True
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                need_new_group = True
                continue
            if need_new_group:
                groups.append([])
                need_new_group = False
            groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Surround every CJK ideograph in `text` with single spaces."""
        return "".join(
            (" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
            for ch in text)

    def _is_chinese_char(self, cp):
        """Return True when code point `cp` lies in one of the CJK ideograph ranges.

        Only the CJK Unified Ideographs blocks (and their extensions and
        compatibility blocks) are covered. Hangul, Hiragana and Katakana are
        separate blocks used for space-separated words, so they are handled
        like any other script.
        """
        return any(low <= cp <= high for low, high in self._CJK_RANGES)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; map all whitespace to a plain space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
316 | 
317 | 
class WordpieceTokenizer(object):
    """Greedy longest-match-first WordPiece tokenizer."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        """
        Args:
          vocab: Container of known wordpieces (membership is tested with `in`).
          unk_token: Token emitted for a word that cannot be tokenized.
          max_input_chars_per_word: Words longer than this become `unk_token`.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        Every whitespace-separated word is matched greedily against the
        vocabulary, always taking the longest prefix that is a known piece;
        pieces that do not start a word carry a "##" prefix.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens; a word that is too long or cannot be
          fully covered by vocabulary pieces contributes a single `unk_token`.
        """
        result = []
        for word in whitespace_tokenize(text):
            if len(word) > self.max_input_chars_per_word:
                result.append(self.unk_token)
                continue
            pieces = self._split_word(word)
            if pieces is None:
                result.append(self.unk_token)
            else:
                result.extend(pieces)
        return result

    def _split_word(self, word):
        """Return the wordpieces of `word`, or None when some part has no match."""
        pieces = []
        word_length = len(word)
        start = 0
        while start < word_length:
            matched_piece = None
            matched_end = start
            # Try the longest candidate first and shrink from the right.
            for end in range(word_length, start, -1):
                candidate = word[start:end]
                if start > 0:
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    matched_piece = candidate
                    matched_end = end
                    break
            if matched_piece is None:
                return None
            pieces.append(matched_piece)
            start = matched_end
        return pieces
376 | 
377 | 
378 | def _is_whitespace(char):
379 |     """Checks whether `chars` is a whitespace character."""
380 |     # \t, \n, and \r are technically contorl characters but we treat them
381 |     # as whitespace since they are generally considered as such.
382 |     if char == " " or char == "\t" or char == "\n" or char == "\r":
383 |         return True
384 |     cat = unicodedata.category(char)
385 |     if cat == "Zs":
386 |         return True
387 |     return False
388 | 
389 | 
390 | def _is_control(char):
391 |     """Checks whether `chars` is a control character."""
392 |     # These are technically control characters but we count them as whitespace
393 |     # characters.
394 |     if char == "\t" or char == "\n" or char == "\r":
395 |         return False
396 |     cat = unicodedata.category(char)
397 |     if cat.startswith("C"):
398 |         return True
399 |     return False
400 | 
401 | 
402 | def _is_punctuation(char):
403 |     """Checks whether `chars` is a punctuation character."""
404 |     cp = ord(char)
405 |     # We treat all non-letter/number ASCII as punctuation.
406 |     # Characters such as "^", "$", and "`" are not in the Unicode
407 |     # Punctuation class but we treat them as punctuation anyways, for
408 |     # consistency.
409 |     if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
410 |             (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
411 |         return True
412 |     cat = unicodedata.category(char)
413 |     if cat.startswith("P"):
414 |         return True
415 |     return False
416 | 


--------------------------------------------------------------------------------
/bert_nlp/run.py:
--------------------------------------------------------------------------------
1 | from main.main import start
2 | 
3 | 
# Script entry point: run `start` (imported from main.main) only when this
# file is executed directly, not when it is imported.
if __name__ == '__main__':
    start()


--------------------------------------------------------------------------------
/bert_nlp/train/train.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import warnings
  3 | import os
  4 | 
  5 | import torch
  6 | 
  7 | from pytorch_pretrained_bert.optimization import BertAdam
  8 | import config.args as args
  9 | from util.plot_util import loss_acc_plot
 10 | from util.Logginger import init_logger
 11 | from util.model_util import save_model
 12 | 
# Module-level setup, executed once at import time.
# Logger named "torch", configured by the project's init_logger helper with
# the log path from the config module.
logger = init_logger("torch", logging_path=args.log_path)
# Seed the torch RNG (CPU, current CUDA device, and all CUDA devices) with
# the configured seed.
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
# Suppress every Python warning process-wide.
warnings.filterwarnings('ignore')

# File names for model weights and model configuration.
# NOTE(review): presumably used when saving/loading checkpoints — confirm against callers.
WEIGHTS_NAME = 'pytorch_model.bin'
CONFIG_NAME = 'bert_config.json'
 21 | 
 22 | 
 23 | def warmup_linear(x, warmup=0.002):
 24 |     if x < warmup:
 25 |         return x / warmup
 26 |     return 1.0 - x
 27 | 
 28 | 
 29 | def fit(model, training_iter, eval_iter, num_epoch, pbar, num_train_steps, verbose=1):
 30 |     # ------------------判断CUDA模式----------------------
 31 |     device = torch.device(args.device if torch.cuda.is_available() and not args.no_cuda else "cpu")
 32 | 
 33 |     # ---------------------优化器-------------------------
 34 |     param_optimizer = list(model.named_parameters())
 35 |     no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
 36 | 
 37 |     optimizer_grouped_parameters = [{
 38 |         'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
 39 |         'weight_decay': 0.01
 40 |     }, {
 41 |         'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
 42 |         'weight_decay': 0.0
 43 |     }]
 44 | 
 45 |     t_total = num_train_steps
 46 | 
 47 |     ## ---------------------GPU半精度fp16-----------------------------
 48 |     if args.fp16:
 49 |         try:
 50 |             from apex.optimizers import FP16_Optimizer
 51 |             from apex.optimizers import FusedAdam
 52 |         except ImportError:
 53 |             raise ImportError(
 54 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
 55 |             )
 56 | 
 57 |         optimizer = FusedAdam(optimizer_grouped_parameters,
 58 |                               lr=args.learning_rate,
 59 |                               bias_correction=False,
 60 |                               max_grad_norm=1.0)
 61 |         if args.loss_scale == 0:
 62 |             optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
 63 |         else:
 64 |             optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
 65 |     ## ------------------------GPU单精度fp32---------------------------
 66 |     else:
 67 |         optimizer = BertAdam(optimizer_grouped_parameters,
 68 |                              lr=args.learning_rate,
 69 |                              warmup=args.warmup_proportion,
 70 |                              t_total=t_total)
 71 |     # ---------------------模型初始化----------------------
 72 |     if args.fp16:
 73 |         model.half()
 74 | 
 75 |     device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
 76 |     n_gpu = torch.cuda.device_count()
 77 | 
 78 |     logger.info(f"Device {device} n_gpu {n_gpu} distributed training")
 79 | 
 80 |     model.to(device)
 81 | 
 82 |     if n_gpu > 1:
 83 |         model = torch.nn.DataParallel(model)
 84 |         model_module = model.module
 85 |     else:
 86 |         model_module = model
 87 | 
 88 |     train_losses = []
 89 |     eval_losses = []
 90 |     train_accuracy = []
 91 |     eval_accuracy = []
 92 | 
 93 |     history = {
 94 |         "train_loss": train_losses,
 95 |         "train_acc": train_accuracy,
 96 |         "eval_loss": eval_losses,
 97 |         "eval_acc": eval_accuracy
 98 |     }
 99 | 
100 |     # ------------------------训练------------------------------
101 |     start = time.time()
102 |     global_step = 0
103 | 
104 |     for e in range(num_epoch):
105 |         model.train()
106 |         for step, batch in enumerate(training_iter):
107 |             batch = tuple(t.to(device) for t in batch)
108 |             # lsp: output_mask是为了过滤掉带有‘##’的token，即subword，这些词不计算loss，同时也是为了保证label_ids和token的长度一致
109 |             input_ids, input_mask, segment_ids, label_ids, output_mask = batch
110 |             bert_encode = model(input_ids, segment_ids, input_mask).cpu()
111 | 
112 |             train_loss = model_module.loss_fn(bert_encode=bert_encode,
113 |                                               tags=label_ids,
114 |                                               output_mask=output_mask)
115 |             if args.gradient_accumulation_steps > 1:
116 |                 train_loss = train_loss / args.gradient_accumulation_steps
117 |             if args.fp16:
118 |                 optimizer.backward(train_loss)
119 |             else:
120 |                 train_loss.backward()
121 |             if (step + 1) % args.gradient_accumulation_steps == 0:
122 |                 # modify learning rate with special warm up BERT uses
123 |                 lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
124 |                                                                   args.warmup_proportion)
125 |                 for param_group in optimizer.param_groups:
126 |                     param_group['lr'] = lr_this_step
127 |                 optimizer.step()
128 |                 optimizer.zero_grad()
129 |                 global_step += 1
130 |             predicts = model_module.predict(bert_encode, output_mask)
131 |             label_ids = label_ids.view(1, -1)
132 |             label_ids = label_ids[label_ids != -1]
133 |             label_ids = label_ids.cpu()
134 |             if len(predicts) != len(label_ids):
135 |                 continue
136 |             train_acc, f1 = model_module.acc_f1(predicts, label_ids)    # lsp: 每一步的准确率情况
137 |             pbar.show_process(train_acc, train_loss.item(), f1, time.time() - start, step)
138 | 
139 | # -----------------------验证----------------------------
140 |         model.eval()
141 |         count = 0
142 |         y_predicts, y_labels = [], []
143 |         eval_loss, eval_acc, eval_f1 = 0, 0, 0
144 |         with torch.no_grad():
145 |             for step, batch in enumerate(eval_iter):
146 |                 batch = tuple(t.to(device) for t in batch)
147 |                 input_ids, input_mask, segment_ids, label_ids, output_mask = batch
148 |                 bert_encode = model(input_ids, segment_ids, input_mask).cpu()
149 |                 eval_los = model_module.loss_fn(bert_encode=bert_encode,
150 |                                                 tags=label_ids,
151 |                                                 output_mask=output_mask)
152 |                 eval_loss = eval_los + eval_loss
153 |                 count += 1
154 |                 predicts = model_module.predict(bert_encode, output_mask)
155 |                 label_ids = label_ids.view(1, -1)
156 |                 label_ids = label_ids[label_ids != -1]
157 |                 if len(predicts) != len(label_ids):
158 |                     continue
159 |                 y_predicts.append(predicts)
160 |                 y_labels.append(label_ids)
161 |                 
162 | 
163 |             eval_predicted = torch.cat(y_predicts, dim=0).cpu()
164 |             eval_labeled = torch.cat(y_labels, dim=0).cpu()
165 | 
166 |             eval_acc, eval_f1 = model_module.acc_f1(eval_predicted, eval_labeled)
167 |             model_module.class_report(eval_predicted, eval_labeled)
168 | 
169 |             logger.info(
170 |                 '\n\nEpoch %d - train_loss: %4f - eval_loss: %4f - train_acc:%4f - eval_acc:%4f - eval_f1:%4f\n'
171 |                 %
172 |                 (e + 1, train_loss.item(), eval_loss.item() / count, train_acc, eval_acc, eval_f1))
173 | 
174 |             save_model(model, args.output_dir, step=e, f_1=eval_f1)  # 所有模型都保存
175 | 
176 |             if e == 0:
177 |                 output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
178 |                 model_module.config.to_json_file(output_config_file)
179 | 
180 |             if e % verbose == 0:
181 |                 train_losses.append(train_loss.item())
182 |                 train_accuracy.append(train_acc)
183 |                 eval_losses.append(eval_loss.item() / count)
184 |                 eval_accuracy.append(eval_acc)
185 | 
186 |     loss_acc_plot(history)
187 | 


--------------------------------------------------------------------------------
/bert_nlp/util/Logginger.py:
--------------------------------------------------------------------------------
 1 | #encoding:utf-8
 2 | import logging
 3 | from logging import Logger
 4 | from logging.handlers import TimedRotatingFileHandler
 5 | 
 6 | '''
 7 | 使用方式
 8 | from you_logging_filename.py import init_logger
 9 | logger = init_logger("dataset",logging_path='')
10 | def you_function():
11 | 	logger.info()
12 | 	logger.error()
13 | 
14 | '''
15 | 
16 | 
17 | '''
18 | 日志模块
19 | 1. 同时将日志打印到屏幕跟文件中
20 | 2. 默认值保留近7天日志文件
21 | '''
def init_logger(logger_name, logging_path):
    """Return the logger called ``logger_name``, configuring it on first use.

    On first use three handlers are attached:

    * ``<logging_path>/all.log``  -- INFO and above, rotated daily, 7 backups kept;
    * the console                 -- INFO and above;
    * ``<logging_path>/error.log`` -- ERROR and above, rotated daily, 7 backups kept.

    Later calls with the same name return the already configured logger
    without adding duplicate handlers.

    Args:
        logger_name: name passed to ``logging.getLogger``.
        logging_path: directory for the log files; it is created when missing.
            An empty string places the files in the current working directory.

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: this module does not import os at file level

    logger = logging.getLogger(logger_name)
    # Configure only when no handler is attached yet. Membership in
    # Logger.manager.loggerDict is not a reliable signal: the name may already be
    # registered (by another library, or as a placeholder for a child logger)
    # without any handler, which would leave this logger silent.
    if logger.handlers:
        return logger

    if logging_path:
        os.makedirs(logging_path, exist_ok=True)

    logger.setLevel(logging.DEBUG)

    datefmt = '%Y-%m-%d %H:%M:%S'
    format_str = '[%(asctime)s]: %(name)s %(filename)s[line:%(lineno)s] %(levelname)s  %(message)s'
    formatter = logging.Formatter(format_str, datefmt)

    # Everything from INFO upwards goes to all.log ...
    all_handler = TimedRotatingFileHandler(filename=os.path.join(logging_path, "all.log"),
                                           when='D',
                                           backupCount=7)
    all_handler.setFormatter(formatter)
    all_handler.setLevel(logging.INFO)
    logger.addHandler(all_handler)

    # ... and to the console.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    console.setFormatter(formatter)
    logger.addHandler(console)

    # Errors are additionally collected in their own file.
    error_handler = TimedRotatingFileHandler(filename=os.path.join(logging_path, "error.log"),
                                             when='D',
                                             backupCount=7)
    error_handler.setFormatter(formatter)
    error_handler.setLevel(logging.ERROR)
    logger.addHandler(error_handler)

    return logger
47 | 
48 | #if __name__ == "__main__":
49 | #     logger = init_logger("datatest",logging_path="E:/neo4j-community-3.4.1")
50 | #     logger.error('test_error')
51 | #     logger.info("test-info")
52 | #     logger.warn("test-warn")
53 | 	 
54 | 	 
55 | 	 


--------------------------------------------------------------------------------
/bert_nlp/util/model_util.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import torch
 3 | from net.model_net import Bert_CRF
 4 | import config.args as args
 5 | 
 6 | 
 7 | def save_model(model, output_dir, step, f_1):
 8 |     f_1 = round(f_1, 3)
 9 |     model_to_save = model.module if hasattr(model,
10 |                                             'module') else model    # Only save the model it-self
11 |     output_model_file = os.path.join(output_dir, f"pytorch_model_{f_1}_{step}.bin")
12 |     torch.save(model_to_save.state_dict(), output_model_file)
13 | 
14 | 
def load_model(output_dir, weights_name="pytorch_model.bin"):
    """Load a fine-tuned ``Bert_CRF`` model from ``output_dir``.

    Args:
        output_dir: directory containing the saved weights.
        weights_name: file name of the state dict inside ``output_dir``. The
            default keeps the historical name; pass the name of a checkpoint
            written by ``save_model`` (``pytorch_model_<f1>_<step>.bin``) to
            load that one.

    Returns:
        A ``Bert_CRF`` built from ``args.bert_model`` with the saved weights.
    """
    output_model_file = os.path.join(output_dir, weights_name)
    # Map tensors to the CPU so that a checkpoint written on a GPU machine can
    # also be loaded on a CPU-only host; the caller moves the model afterwards.
    model_state_dict = torch.load(output_model_file, map_location="cpu")
    model = Bert_CRF.from_pretrained(args.bert_model, state_dict=model_state_dict)
    return model
21 | 


--------------------------------------------------------------------------------
/bert_nlp/util/plot_util.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import config.args as config
 3 | 
# Needed on machines without a graphical display; otherwise pyplot raises an error
plt.switch_backend('agg')
 6 | 
 7 | 
 8 | def loss_acc_plot(history):
 9 |     train_loss = history['train_loss']
10 |     eval_loss = history['eval_loss']
11 |     train_accuracy = history['train_acc']
12 |     eval_accuracy = history['eval_acc']
13 | 
14 |     fig = plt.figure(figsize=(12, 8))
15 |     fig.add_subplot(2, 1, 1)
16 |     plt.title('loss during train')
17 |     plt.xlabel('epochs')
18 |     plt.ylabel('loss')
19 |     epochs = range(1, len(train_loss)+1)
20 |     plt.plot(epochs, train_loss)
21 |     plt.plot(epochs, eval_loss)
22 |     plt.legend(['train_loss', 'eval_loss'])
23 | 
24 |     fig.add_subplot(2, 1, 2)
25 |     plt.title('accuracy during train')
26 |     plt.xlabel('epochs')
27 |     plt.ylabel('accuracy')
28 |     epochs = range(1, len(train_loss) + 1)
29 |     plt.plot(epochs, train_accuracy)
30 |     plt.plot(epochs, eval_accuracy)
31 |     plt.legend(['train_acc', 'eval_acc'])
32 | 
33 |     plt.savefig(config.plot_path)
34 | 
35 | 
if __name__ == '__main__':
    # Manual smoke test: draw straight lines for every metric. The keys must
    # match the ones loss_acc_plot reads ('train_acc' / 'eval_acc').
    history = {
        'train_loss': range(100),
        'eval_loss': range(100),
        'train_acc': range(100),
        'eval_acc': range(100)
    }
    loss_acc_plot(history)
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/bert_nlp/util/porgress_util.py:
--------------------------------------------------------------------------------
 1 | """进度条"""
 2 | 
 3 | import sys
 4 | 
 5 | 
 6 | class ProgressBar(object):
 7 |     """
 8 |     显示处理进度的类
 9 |     调用该类相关函数即可实现处理进度的显示
10 |     """
11 | 
12 |     # 初始化函数，需要知道总共的处理次数
13 |     def __init__(self, epoch_size, batch_size, max_arrow=80):
14 |         self.epoch_size = epoch_size
15 |         self.batch_size = batch_size
16 |         self.max_steps = round(epoch_size / batch_size)    # 总共处理次数 = round(epoch/batch_size)
17 |         self.max_arrow = max_arrow    # 进度条的长度
18 | 
19 |     # 显示函数，根据当前的处理进度i显示进度
20 |     # 效果为[>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>]100.00%
21 |     def show_process(self, train_acc, train_loss, f1, used_time, i):
22 |         num_arrow = int(i * self.max_arrow / self.max_steps)    # 计算显示多少个'>'
23 |         num_line = self.max_arrow - num_arrow    # 计算显示多少个'-'
24 |         percent = i * 100.0 / self.max_steps    # 计算完成进度，格式为xx.xx%
25 |         num_steps = self.batch_size * i    # 当前处理数据条数
26 |         process_bar =  '%d'%num_steps + '/' + '%d'%self.epoch_size + '[' + '>' * num_arrow + '-' * num_line + ']'\
27 |                       + '%.2f' % percent + '%' + ' - train_acc ' + '%.4f'%train_acc + ' - train_loss '+ \
28 |                        '%.4f' %train_loss + ' - f1 ' + '%.4f'% f1 + ' - time '+ '%.1fs'%used_time + '\r'
29 |         sys.stdout.write(process_bar)    # 这两句打印字符到终端
30 |         sys.stdout.flush()
31 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | atomicwrites==1.3.0
 2 | attrs==19.3.0
 3 | boto3==1.12.47
 4 | botocore==1.15.47
 5 | certifi==2019.9.11
 6 | chardet==3.0.4
 7 | docutils==0.15.2
 8 | idna==2.8
 9 | importlib-metadata==0.23
10 | jmespath==0.9.5
11 | joblib==0.14.1
12 | more-itertools==7.2.0
13 | numpy==1.17.4
14 | packaging==19.2
15 | pluggy==0.13.0
16 | py==1.8.0
17 | pyparsing==2.4.5
18 | pytest==5.2.4
19 | python-dateutil==2.8.1
20 | pytz==2020.1
21 | requests==2.22.0
22 | s3transfer==0.3.3
23 | scipy==1.4.1
24 | six==1.13.0
25 | torch==1.0.1.post2
26 | tqdm==4.43.0
27 | urllib3==1.25.7
28 | wcwidth==0.1.7
29 | zipp==0.6.0
30 | matplotlib==3.1.1
31 | scikit-learn==0.21.3
32 | 


--------------------------------------------------------------------------------