├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_EN.md
├── config_classification.yaml
├── config_sequence_labeling.yaml
├── examples
├── chunking.ipynb
└── sentiment.ipynb
├── images
├── entity_visualization_sample.jpg
└── framework.jpg
├── nlp_toolkit
├── __init__.py
├── bin
│ ├── run_classifier
│ └── run_seq_tagger
├── callbacks.py
├── chunk_segmentor
│ ├── README.md
│ ├── __init__.py
│ ├── segment.py
│ ├── tagger.py
│ ├── tests
│ │ ├── data.sh
│ │ ├── test_functions.py
│ │ └── test_speed.py
│ ├── trie.py
│ └── utils.py
├── classifier.py
├── config.py
├── data.py
├── data
│ └── radical.txt
├── labeler.py
├── models
│ ├── __init__.py
│ ├── base_model.py
│ ├── bi_lstm_att.py
│ ├── char_rnn.py
│ ├── dpcnn.py
│ ├── han.py
│ ├── idcnn.py
│ ├── text_cnn.py
│ ├── transformer.py
│ └── word_rnn.py
├── modules
│ ├── __init__.py
│ ├── attentions
│ │ ├── __init__.py
│ │ ├── attention.py
│ │ ├── multi_dim_attention.py
│ │ └── self_attention.py
│ ├── custom_loss.py
│ ├── logits.py
│ └── token_embedders
│ │ ├── __init__.py
│ │ ├── embedding.py
│ │ └── position_embedding.py
├── sequence.py
├── trainer.py
├── utilities.py
└── visualization.py
├── reproduction
├── company_pro_con_classify.py
└── noun_phrases_detect.py
├── requirements-gpu.txt
├── requirements.txt
├── sample_data
├── company_pro_con.txt
├── cv_word_basic.txt
└── cv_word_conll.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 stevewyl
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include requirements.txt
3 | include requirements-gpu.txt
4 | include nlp_toolkit/data/*
5 | include nlp_toolkit/modules/*
6 | include nlp_toolkit/models/*
7 | include nlp_toolkit/chunk_segmentor/*
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # nlp_toolkit
2 |
3 | 中文NLP基础工具箱,包括以下任务:例如文本分类、序列标注等。
4 |
5 | 本仓库复现了一些近几年比较火的nlp论文。所有的代码是基于keras开发的。
6 |
7 | 不到10行代码,你就可以快速训练一个文本分类模型(暂时不支持多标签任务)或序列标注模型,或者可以体验基于名词短语切分的分词器
8 |
9 | ## 直接安装
10 |
11 | ```bash
12 | pip install nlp_toolkit
13 |
14 | # 使用GPU
15 | pip install tensorflow-gpu GPUtil
16 | ```
17 |
18 | ## 手动安装
19 |
20 | ```bash
21 | git clone https://github.com/stevewyl/nlp_toolkit
22 | cd nlp_toolkit
23 |
24 | # 只使用CPU
25 | pip install -r requirements.txt
26 |
27 | # 使用GPU
28 | pip install -r requirements-gpu.txt
29 |
30 | # 如果keras_contrib安装失败
31 | pip install git+https://www.github.com/keras-team/keras-contrib.git
32 | ```
33 |
34 | ### 安装错误
35 |
36 | 1. ImportError: cannot import name 'normalize_data_format'
37 |
38 | ```bash
39 | pip install -U keras
40 | ```
41 |
42 | ## 使用方法
43 |
44 | 本仓库的框架图:
45 |
46 | 
47 |
48 | 主要由以下几大模块组成:
49 |
50 | 1. Dataset:处理文本和标签数据为适合模型输入的格式,主要进行的处理操作有清理、分词、index化
51 |
52 | 2. Model Zoo & Layer:近几年在该任务中常用的模型汇总及一些Keras的自定义层
53 |
54 | 目前支持的自定义层有如下:
55 |
56 | * 1D注意力层 🆗
57 | * 2D注意力层 🆗
58 | * 多头注意力层 🆗
59 | * 位置嵌入层 🆗
60 | * K-max池化层
61 |
62 | 3. Trainer:定义模型的训练流程,支持bucket序列、自定义callbacks和N折交叉验证
63 |
64 |     * bucket序列:通过将相似长度的文本放入同一batch,减少padding带来的多余计算,从而加速模型训练;在文本分类任务中,能够对RNN网络提速2倍以上(**暂时不支持含有Flatten层的网络**),下方列表结束后附有一个简单的分桶示意
65 |
66 | * callbacks:通过自定义回调器来控制训练流程,目前预设的回调器有提前终止训练,学习率自动变化,更丰富的评估函数等
67 |
68 | * N折交叉验证:支持交叉验证来考验模型的真实能力
69 |
70 | 4. Classifier & Sequence Labeler:封装类,支持不同的训练任务
71 |
72 | 5. Application:目前工具箱内封装了基于jieba的名词短语分词器 Chunk_Segmentor (如需模型文件,可以邮件联系我)
73 |
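下面给出一个极简的分桶(bucket)思路示意,仅用于说明“按长度分组以减少padding”的想法,并非本仓库所用 BucketedSequence 的真实实现(实际代码来自 https://github.com/tbennun/keras-bucketed-sequence):

```python
# 极简示意:把长度相近的文本排在一起再切成 batch,
# 这样每个 batch 内 padding 到的长度比较接近,浪费的计算更少
texts = [
    '福利 不错',
    '公司 目前 地理 位置 不 太 理想 , 离 城市 中心 较 远点 。',
    '颇 具 发展 潜力',
    '加班 较 多',
]
batch_size = 2
sorted_texts = sorted(texts, key=lambda t: len(t.split()))
batches = [sorted_texts[i:i + batch_size]
           for i in range(0, len(sorted_texts), batch_size)]
print(batches)
```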
74 | 简单的用法如下:
75 |
76 | ```python
77 | from nlp_toolkit import Dataset, Classifier, Labeler
78 | import yaml
79 |
80 | config = yaml.load(open('your_config.yaml'))
81 |
82 | # 分类任务
83 | dataset = Dataset(fname='your_data.txt', task_type='classification', mode='train', config=config)
84 | text_classifier = Classifier('multi_head_self_att', dataset)
85 | trained_model = text_classifier.train()
86 |
87 | # 序列标注任务
88 | dataset = Dataset(fname='your_data.txt', task_type='sequence_labeling', mode='train', config=config)
89 | seq_labeler = Labeler('word_rnn', dataset)
90 | trained_model = seq_labeler.train()
91 |
92 | # 预测(以文本分类为例)
93 | dataset = Dataset(fname='your_data.txt', task_type='classification', mode='predict', tran_fname='your_transformer.h5')
94 | text_classifier = Classifier('bi_lstm_att', dataset)
95 | text_classifier.load(weight_fname='your_model_weights.h5', para_fname='your_model_parameters.json')
96 | y_pred = text_classifier.predict(dataset.texts)
97 |
98 | # chunk分词
99 | # 第一次import的时候,会自动下载模型和字典数据
100 | # 支持单句和多句文本的输入格式,建议以列表的形式传入分词器
101 | # 源代码中已略去相关数据的下载路径,有需要的请邮件联系
102 | from nlp_toolkit.chunk_segmentor import Chunk_Segmentor
103 | cutter = Chunk_Segmentor()
104 | s = '这是一个能够输出名词短语的分词器,欢迎试用!'
105 | res = [item for item in cutter.cut([s] * 10000)] # 1080ti上耗时8s
106 | # 提供两个版本,accurate为精确版,fast为快速版但召回会降低一些,默认精确版
107 | cutter = Chunk_Segmentor(mode='accurate')
108 | cutter = Chunk_Segmentor(mode='fast')
109 | # 是否输出词性, 默认开启
110 | cutter.cut(s, pos=False)
111 | # 是否将可切分的名词短语切分,默认关闭
112 | cutter.cut(s, cut_all=True)
113 | # 输出格式(词列表,词性列表,名词短语集合)
114 | [
115 | (
116 | ['这', '是', '一个', '能够', '输出', '名词_短语', '的', '分词器', ',', '欢迎', '试用', '!'],
117 | ['r', 'v', 'mq', 'v', 'vn', 'np', 'ude1', 'np', 'w', 'v', 'v', 'w'],
118 | ['分词器', '名词_短语']
119 | )
120 | ...
121 | ]
122 | ```
123 |
124 | 更多使用细节,请阅读[**examples**](https://github.com/stevewyl/nlp_toolkit/tree/master/examples)文件夹中的Jupyter Notebook和chunk_segmentor页面的[**README**](https://github.com/stevewyl/nlp_toolkit/tree/master/nlp_toolkit/chunk_segmentor)
125 |
126 | ### 数据格式
127 |
128 | 1. 文本分类:每一行预先分好词的文件,每一行的格式如下:
129 |
130 | __label__标签1 __label__标签2 ... 词 词 ... 词\n
131 |
132 | 例如 “__label__neg 公司 目前 地理 位置 不 太 理想 , 离 城市 中心 较 远点 。”
133 |
134 | 2. 序列标注:每一行预先分好词的文件,支持两种数据格式,每一行的格式如下:
135 |
136 | 词###标签 [TAB] 词###标签 [TAB] ... \n
137 |
138 | 例如 “目前###O\t公司###O\t地理###B-Chunk\t位置###E-Chunk\t不###O\t太###O\t理想###O\n”
139 |
140 | 或者 CONLL的标准格式
141 |
142 | 词 [TAB] 标签
143 |
144 | 词 [TAB] 标签
145 |
146 | ...
147 |
148 | 词 [TAB] 标签
149 |
150 | 词 [TAB] 标签
151 |
152 | ...
153 |
154 | 例如:
155 |
156 | 目前\tO
157 |
158 | 公司\tO
159 |
160 | ...
161 |
162 | 地理\tB-Chunk
163 |
164 | 位置\tE-Chunk
165 |
166 | 不\tO
167 |
168 | 太\tO
169 |
170 | 理想\tO
171 |
172 | 标签含义(这里以chunk为例):
173 |
174 | * O:普通词
175 | * B-Chunk:表示chunk词的开始
176 | * I-Chunk:表示chunk词的中间
177 | * E-Chunk:表示chunk词的结束
178 |
179 | 建议:文本序列以短句为主,针对标注实体的任务,最好保证每行数据中有实体词(即非全O的序列)
180 |
181 | 你可以通过以下方式互相转换两种数据格式:
182 | ```python
183 | from nlp_toolkit.utilities import convert_seq_format
184 | # here we convert dataset from conll format to basic format
185 | convert_seq_format(input_file, output_file, 'basic')
186 | ```
187 |
188 | ps: 具体可查看data文件夹中对应的[**示例数据**](https://github.com/stevewyl/nlp_toolkit/tree/master/sample_data)
189 |
190 | 3. 预测:不同任务每一行均为预先分好词的文本序列
191 |
192 | 4. 支持简单的自己添加数据的方法
193 |
194 | ```python
195 | dataset = Dataset(task_type='classification', mode='train', config=config)
196 | # classification
197 | dataset.add({'text': '我 爱 机器 学习', 'label': 'pos'})
198 | # sequence labeling
199 | dataset.add({'text': '我 爱 机器 学习', 'label': 'O O B-Chunk E-Chunk'})
200 | # after you add all your data
201 | dataset.fit()
202 | ```
203 |
204 | ### 配置文件
205 |
206 | nlp_toolkit通过配置文件来初始化训练任务
207 |
208 | train: 表示训练过程中的参数,包括batch大小,epoch数量,训练模式等
209 |
210 | data: 表示数据预处理的参数,包括最大词数和字符数,是否使用词内部字符序列等
211 |
212 | embed: 词向量,pre表示是否使用预训练词向量
213 |
214 | 剩下的模块对应不同的模型的超参数
215 |
216 | 具体细节可查看仓库根目录下的两个**配置文件**注释
217 |
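下面是一个读取配置文件的极简示意(键名取自仓库根目录的 config_classification.yaml,仅作说明,训练时请以配置文件注释为准):

```python
import yaml

config = yaml.load(open('config_classification.yaml'))
print(config['train']['batch_size'])               # batch大小,示例值为 64
print(config['data']['max_words'])                 # 最大词数,示例值为 100
print(config['embed']['pre'])                      # 是否使用预训练词向量
print(config['model']['bi_lstm_att']['rnn_size'])  # 对应模型的超参数
```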
218 | ### 可视化
219 |
220 | 1. attention权重可视化
221 |
222 | ```python
223 | # only support model bi_lstm_att currently
224 | # first you need to get attention_weights from model predictions
225 | # you can find the actual usage in examples/sentiment.ipynb
226 | texts = '有 能力 的 人 就 有 很多 机会'
227 | from nlp_toolkit import visualization as vs
228 | vs.mk_html(texts, attention_weights)
229 | ```
230 |
231 | 有 能力 的 人 就 有 很多 机会
232 |
233 | 2. 实体预测结果可视化
234 |
235 | ```python
236 | from nlp_toolkit import visualization as vs
237 | vs.entity_visualization(dataset.texts, y_pred, output_fname='result.html')
238 | ```
239 |
240 | 3. acc/loss 曲线可视化
241 |
242 | ```python
243 | # after your have trained one model, you will also get a history object, which contains some loss and metrics info
244 | from nlp_toolkit import visualization as vs
245 | vs.plot_loss_acc(history, task='sequence_labeling')
246 | ```
247 |
248 | ### 其他
249 |
250 | 1. 生成词向量小文件
251 |
252 | ```python
253 | from nlp_toolkit.utilities import gen_small_embedding
254 | gen_small_embedding(vocab_file, embed_file, output_file)
255 | ```
256 |
257 | ## 模型
258 |
259 | ### 文本分类
260 |
261 | 1. 双层双向LSTM + Attention 🆗
262 |
263 | [DeepMoji](https://arxiv.org/abs/1708.00524)一文中所采用的的模型框架,本仓库中对attention层作了扩展
264 |
265 | 对应配置文件中的名称:bi_lstm_att
266 |
267 | 2. [Transformer](http://papers.nips.cc/paper/7181-attention-is-all-you-need) 🆗
268 |
269 | 采用Transformer中的多头自注意力层来表征文本信息,详细的细节可阅读此[文章](https://kexue.fm/archives/4765)
270 |
271 | 对应配置文件中的名称:multi_head_self_att
272 |
273 | 3. [TextCNN](https://arxiv.org/abs/1408.5882) 🆗
274 |
275 | CNN网络之于文本分类任务的开山之作,在过去几年中经常被用作baseline,详细的细节可阅读此[文章](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)
276 |
277 | 对应配置文件中的名称:text_cnn
278 |
279 | 4. [DPCNN](http://www.aclweb.org/anthology/P17-1052) 🆗
280 |
281 | 在textCNN的基础上,DPCNN使用残差连接、固定feature map数量和1/2池化层等技巧来实现更丰富的文本表示,详细的细节可阅读此[文章](https://zhuanlan.zhihu.com/p/35457093)
282 |
283 | 对应配置文件中的名称:dpcnn
284 | 暂时不支持bucket序列化的数据
285 |
286 | 5. [HAN](https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf)
287 |
288 | 使用attention机制的文档分类模型
289 |
290 | ### 序列标注
291 |
292 | 1. [WordRNN](https://arxiv.org/abs/1707.06799) 🆗
293 |
294 | Baseline模型,文本序列经过双向LSTM后,由CRF层编码作为输出
295 |
296 | 对应配置文件中的名称:word_rnn
297 |
298 | 2. [CharRNN](https://pdfs.semanticscholar.org/b944/5206f592423f0b2faf05f99de124ccc6aaa8.pdf) 🆗
299 |
300 | 基于汉语的特点,在字符级别的LSTM信息外,加入偏旁部首,分词,Ngram信息
301 |
302 | 3. [InnerChar](https://arxiv.org/abs/1611.04361) 🆗
303 |
304 | 基于另外一篇[论文](https://arxiv.org/abs/1511.08308),扩展了本文的模型,使用bi-lstm或CNN在词内部的char级别进行信息的抽取,然后与原来的词向量进行concat或attention计算
305 |
306 | 对应配置文件中的名称:word_rnn,并设置配置文件data模块中的inner_char为True
307 |
308 | 4. [IDCNN](https://arxiv.org/abs/1702.02098) 🆗
309 |
310 | 膨胀卷积网络,在保持参数量不变的情况下,增大了卷积核的感受野,详细的细节可阅读此[文章](http://www.crownpku.com//2017/08/26/%E7%94%A8IDCNN%E5%92%8CCRF%E5%81%9A%E7%AB%AF%E5%88%B0%E7%AB%AF%E7%9A%84%E4%B8%AD%E6%96%87%E5%AE%9E%E4%BD%93%E8%AF%86%E5%88%AB.html)
311 |
312 | 对应配置文件中的名称:idcnn
313 |
314 | ## 性能
315 |
316 | 后续加入对中文NLP的标准数据集的测试
317 |
318 | ### 文本分类
319 |
320 | 测试数据集:
321 |
322 | 1. 公司优缺点评价,二分类,数据规模:95K
323 |
324 | Model | 10-fold_f1 | Model Size | Time per epoch
325 | ----------------------- | :------: | :----------: | :-------------:
326 | Bi-LSTM Attention | | |
327 | Transformer | | 7M | 12s
328 | TextCNN | 96.57 | 10M | 19s
329 | DPCNN | 93.35 | 9M | 28s
330 | HAN | | |
331 |
332 | ### 序列标注
333 |
334 | 测试数据集:
335 |
336 | 1. 简历工作经历,chunk,数据规模:58K
337 |
338 | Model | 10-fold_f1 | Model Size | Time per epoch
339 | ----------------------- | :------: | :----------: | :-------------:
340 | Baseline(WordRNN) | | |
341 | WordRNN + InnerChar | | 3M | 165s
342 | CharRNN(seg+radical) | | |
343 | IDCNN | | 2.7M | 43s
344 |
345 | ps: 模型大小表示为模型的参数量,其中K表示千,M表示百万;测试设备为1080ti+i7-6800K
346 |
347 | ## To-Do列表
348 |
349 | 1. 加入更多SOTA的模型和自定义层
350 |
351 | 2. 下一版本规划:增加抽象类Sentence
352 |
353 | 3. V2.0规划:切换为tf.estimator和tf.keras的API
354 |
355 | ## 感谢
356 |
357 | * 数据流模块部分代码借鉴于此: https://github.com/Hironsan/anago/
358 |
359 | * 序列标注任务的评估函数来源于此: https://github.com/chakki-works/seqeval
360 |
361 | * bucket序列化代码来自:https://github.com/tbennun/keras-bucketed-sequence
362 |
363 | * 多头注意力层和位置嵌入层代码来自:https://github.com/bojone/attention
364 |
365 | ## 联系方式
366 |
367 | 联系人:王奕磊
368 |
369 | 📧 邮箱:stevewyl@163.com
370 |
371 | 微信:Steve_1125
372 |
--------------------------------------------------------------------------------
/README_EN.md:
--------------------------------------------------------------------------------
1 | # nlp_toolkit
2 |
3 | A basic Chinese NLP toolkit covering tasks such as text classification and sequence labeling.
4 |
5 | This repo reproduces several popular NLP papers from recent years. All the code is based on Keras.
6 |
7 | With fewer than 10 lines of code, you can quickly train a text classification model or a sequence labeling model.
8 |
9 | ## Install
10 |
11 | ```bash
12 | git clone https://github.com/stevewyl/nlp_toolkit
13 | cd nlp_toolkit
14 |
15 | # Use cpu-only
16 | pip install -r requirements.txt
17 |
18 | # Use GPU
19 | pip install -r requirements-gpu.txt
20 |
21 | # if keras_contrib install fail
22 | pip install git+https://www.github.com/keras-team/keras-contrib.git
23 | ```
24 |
25 | ## Usage
26 |
27 | The framework of this repository:
28 |
29 | 
30 |
31 | The toolkit consists of the following modules:
32 |
33 | 1. Dataset: processes text and label data into a format suitable for model input. The main operations are cleaning, word segmentation and indexing.
34 |
35 | 2. Model Zoo & Layer: a collection of models commonly used for these tasks in recent years, plus some custom Keras layers.
36 |
37 |     The custom layers currently available are:
38 |
39 |     * Attention
40 |
41 |     * Multi-Head Attention
42 |
43 |     * Position Embedding
44 |
45 | 3. Trainer: defines the training process for different models, with support for bucket sequences, custom callbacks and N-fold cross-validation.
46 |
47 |     * Bucket Iterator: accelerates model training by putting texts of similar length into the same batch, which reduces the extra computation spent on padding. In text classification it can speed up RNN models by more than 2x. (Currently not supported for networks with a Flatten layer.)
48 |
49 |     * callbacks: the training process is controlled by custom callbacks. The preset callbacks currently include early stopping, automatic learning rate decay and richer evaluation metrics (a callback sketch is given after the quick-start example below).
50 |
51 |     * N-fold cross validation: supports cross-validation to assess the true capability of the model.
52 |
53 | 4. Classifier & Sequence Labeler: wrapper classes that support the different training tasks.
54 |
55 | Quick start:
56 |
57 | ```python
58 | from nlp_toolkit import Dataset, Classifier, Labeler
59 | import yaml
60 |
61 | config = yaml.load(open('your_config.yaml'))
62 |
63 | # text classification task
64 | dataset = Dataset(fname='your_data.txt', task_type='classification', mode='train', config=config)
65 | x, y, config = dataset.transform()
66 | text_classifier = Classifier(config=config, model_name='multi_head_self_att', seq_type='bucket', transformer=dataset.transformer)
67 | trained_model = text_classifier.train(x, y)
68 |
69 | # sequence labeling task
70 | dataset = Dataset(fname='your_data.txt', task_type='sequence_labeling', mode='train', config=config)
71 | x, y, config = dataset.transform()
72 | seq_labeler = Labeler(config=config, model_name='word_rnn', seq_type='bucket', transformer=dataset.transformer)
73 | trained_model = seq_labeler.train(x, y)
74 |
75 | # predict (for text classification task)
76 | dataset = Dataset('your_data.txt', task_type='classification', mode='predict', tran_fname='your_transformer.h5', segment=False)
77 | x_seq = dataset.transform()
78 | text_classifier = Classifier('bi_lstm_att', dataset.transformer)
79 | text_classifier.load(weight_fname='your_model_weights.h5', para_fname='your_model_parameters.json')
80 | y_pred = text_classifier.predict(x_seq['word'])
81 | ```
82 |
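If you want to customize the training callbacks mentioned above, the helpers in `nlp_toolkit/callbacks.py` can be used directly. A minimal sketch (here `valid_seq` is a placeholder for your validation batch sequence; `dataset` is the object created in the quick start):

```python
from nlp_toolkit.callbacks import History, get_callbacks

# preset callbacks: F1 monitoring on valid_seq, checkpointing to log_dir,
# early stopping and learning-rate reduction on plateau
history = History(metric=['f1'])
callbacks = get_callbacks(history=history, log_dir='logs', valid=valid_seq,
                          metric='f1', transformer=dataset.transformer, patiences=3)
```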
83 | For more details, please read the jupyter notebooks in **examples** folder
84 |
85 | ### Data Format
86 |
87 | 1. Text Classification: a pretokenised file where each line is in the following format (multi-label tasks are temporarily not supported):
88 |
89 | WORD [SPACE] WORD [SPACE] ... [TAB] LABEL \n
90 |
91 | such as "公司 目前 地理 位置 不 太 理想 , 离 城市 中心 较 远点 。\tneg\n"
92 |
93 | 2. Sequence Labeling: A pretokenised file where each line is in the following format:
94 |
95 | WORD###TAG [TAB] WORD###TAG [TAB] ..... \n
96 |
97 | such as "目前###O\t公司###O\t地理###B-Chunk\t位置###E-Chunk\t不###O\t太###O\t理想###O\n"
98 |
99 | label format (chunking as an example):
100 |
101 | * O:common words
102 | * B-Chunk:indicates the beginning of the chunk word
103 | * I-Chunk:indicates the middle of the chunk word
104 | * E-Chunk:indicates the end of the chunk word
105 |
106 | Suggestion: keep text sequences mostly to short sentences. For entity labeling tasks, it is best to ensure that every line contains at least one entity word (i.e. no all-O sequences).
107 |
108 | 3. Prediction: for all tasks, each line is a pretokenised text sequence.
109 |
110 |
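The two sequence-labeling formats can be converted into each other, and data can also be added to a Dataset programmatically; both helpers below are taken from the Chinese README (`input_file` / `output_file` are placeholders for your own paths):

```python
from nlp_toolkit.utilities import convert_seq_format
# here we convert a dataset from conll format to the basic (word###tag) format
convert_seq_format(input_file, output_file, 'basic')
```

```python
dataset = Dataset(task_type='classification', mode='train', config=config)
# classification
dataset.add({'text': '我 爱 机器 学习', 'label': 'pos'})
# sequence labeling
dataset.add({'text': '我 爱 机器 学习', 'label': 'O O B-Chunk E-Chunk'})
# after you have added all your data
dataset.fit()
```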
111 | ### Configuration file
112 |
113 | train: parameters of the training process, including batch size, number of epochs, training mode, etc.
114 |
115 | data: parameters of data preprocessing, including the maximum number of words and characters, whether to use word-internal character sequences, and whether to use word segmentation
116 |
117 | embed: word embedding settings; pre indicates whether to use pre-trained word vectors
118 |
119 | The remaining blocks correspond to the hyperparameters of the different models
120 |
121 | See the configuration file comments for details.
122 |
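A minimal sketch of how the configuration is consumed (key names taken from `config_classification.yaml` in the repository root):

```python
import yaml

config = yaml.load(open('config_classification.yaml'))
print(config['train']['batch_size'])   # e.g. 64
print(config['data']['max_words'])     # e.g. 100
print(config['embed']['pre'])          # whether to use pre-trained word vectors

# values can be overridden in code before building the Dataset,
# e.g. switch to N-fold cross-validation
config['train']['train_mode'] = 'fold'
```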
123 | ## Models
124 |
125 | 1. Double Bi-LSTM + Attention 🆗
126 |
127 | The model framework used in paper [DeepMoji](https://arxiv.org/abs/1708.00524). The attention layer has been extended in nlp_toolkit.
128 |
129 | Corresponding to the name in the configuration file: bi_lstm_att
130 |
131 | 2. [Transformer](http://papers.nips.cc/paper/7181-attention-is-all-you-need) 🆗
132 |
133 | Use the multi-head-self-attention layer in Transformer to characterize text information. Read the [article](https://kexue.fm/archives/4765) for details.
134 |
135 | Corresponding to the name in the configuration file: multi_head_self_att
136 |
137 | 3. [TextCNN](https://arxiv.org/abs/1408.5882) 🆗
138 |
139 | The pioneering work on applying CNNs to text classification; it has often been used as a baseline over the past few years. For details, read this [article](http://www.wildml.com/2015/12/implementing-a-cnn-for-text-classification-in-tensorflow/)
140 |
141 | Corresponding to the name in the configuration file: text_cnn
142 |
143 | 4. [DPCNN](http://www.aclweb.org/anthology/P17-1052)
144 |
145 | Obtains richer text representations by progressively deepening the CNN network.
146 |
147 | 5. [HAN](https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf)
148 |
149 | Document classification model using the attention mechanism
150 |
151 | ### Sequence Labeling
152 |
153 | 1. [WordRNN](https://arxiv.org/abs/1707.06799) 🆗
154 |
155 | Baseline model: the text sequence is passed through a bidirectional LSTM and then a CRF layer produces the output tags
156 |
157 | Corresponding to the name in the configuration file: word_rnn
158 |
159 | 2. [CharRNN](https://pdfs.semanticscholar.org/b944/5206f592423f0b2faf05f99de124ccc6aaa8.pdf)
160 |
161 | Designed around the characteristics of Chinese: in addition to character-level LSTM features, radical, word segmentation and N-gram information are added.
162 |
163 | 3. [InnerChar](https://arxiv.org/abs/1611.04361) 🆗
164 |
165 | Based on another [paper](https://arxiv.org/abs/1511.08308), the WordRNN model is extended by using a bi-LSTM or CNN to extract character-level information inside each word, which is then concatenated with the original word vectors or combined with them via attention.
166 |
167 | Corresponding to the name in the configuration file: word_rnn, and set the inner_char in the data module in the configuration file to True.
168 |
169 | 4. [IDCNN](https://arxiv.org/abs/1702.02098) 🆗
170 |
171 | The iterated dilated CNN increases the receptive field of the convolution kernels while keeping the number of parameters constant. For details, read this [article](http://www.crownpku.com//2017/08/26/%E7%94%A8IDCNN%E5%92%8CCRF%E5%81%9A%E7%AB%AF%E5%88%B0%E7%AB%AF%E7%9A%84%E4%B8%AD%E6%96%87%E5%AE%9E%E4%BD%93%E8%AF%86%E5%88%AB.html)
172 |
173 | Corresponding to the name in the configuration file: idcnn
174 |
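A rough sketch of the iterated-dilated idea (an illustration only, not the exact implementation in `nlp_toolkit/models/idcnn.py`; the filter count, kernel size and dilation rates below are the defaults from `config_sequence_labeling.yaml`):

```python
from keras.layers import Conv1D

def dilated_block(x, nb_filters=64, conv_kernel_size=3, dilation_rate=(1, 1, 2)):
    # stack convolutions with growing dilation so the receptive field widens
    # while the number of parameters per layer stays the same
    for rate in dilation_rate:
        x = Conv1D(nb_filters, conv_kernel_size, padding='same',
                   dilation_rate=rate, activation='relu')(x)
    return x
```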
175 |
180 | ## Performance
181 |
182 | Performance is reported on the following two datasets:
183 |
184 | 1. Company Pros and Cons: Crawled from Kanzhun.com and Dajie.com, it contains 95K reviews on the pros and cons of different companies.
185 | 2. Resume Work Experience: 58K work-experience sentences from resumes, used for the chunking task.
186 |
187 | ### Text Classification
188 |
189 | Model | 10-fold_f1 | Model Size | Time per epoch
190 | ----------------------- | :------: | :----------: | :-------------:
191 | Bi-LSTM Attention | | |
192 | Transformer | | |
193 | TextCNN | | |
194 | DPCNN | | |
195 | HAN | | |
196 |
197 | ### Sequence Labeling
198 |
199 | Model | 10-fold_f1 | Model Size | Time per epoch
200 | ----------------------- | :------: | :----------: | :-------------:
201 | Baseline(WordRNN) | | |
202 | WordRNN + InnerChar | | |
203 | CharRNN | | |
204 | IDCNN | | |
205 |
206 | ## To-Do List
207 |
208 | 1. Sentence split module
209 |
210 | 2. Add more SOTA models (such as BERT)
211 |
212 | 3. Support for training language models
213 |
214 | 4. Support for customized modules
215 |
216 | 5. Generate a unique configuration file for each model
217 |
218 | ## Acknowledgments
219 |
220 | * The preprocessor part is derived from https://github.com/Hironsan/anago/
221 | * The evaluations for sequence labeling are based on a modified version of https://github.com/chakki-works/seqeval
222 | * Bucket sequences are based on https://github.com/tbennun/keras-bucketed-sequence
223 | * Multi-head attention and position embedding are from: https://github.com/bojone/attention
224 |
225 | ## Contact
226 | Contact: Yilei Wang
227 |
228 | 📧 E-mail: stevewyl@163.com
229 |
230 | WeChat: Steve_1125
--------------------------------------------------------------------------------
/config_classification.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | bi_lstm_att:
3 | # rnn隐层大小
4 | rnn_size: 512
5 | # attention层隐层大小
6 | attention_dim: 128
7 | # 向量层丢弃率
8 | embed_drop_rate: 0.15
9 | # 输出层前一层丢弃率
10 | final_drop_rate: 0.5
11 | # 是否返回attention权重
12 | return_att: True
13 |
14 | transformer:
15 | # head个数
16 | nb_head: 8
17 | # head大小
18 | head_size: 16
19 | # attention层个数
20 | nb_transformer: 2
21 | # 是否使用位置嵌入向量
22 | pos_embed: True
23 | # 词向量层丢弃率
24 | embed_drop_rate: 0.15
25 | # 输出层前一层丢弃率
26 | final_drop_rate: 0.5
27 |
28 | text_cnn:
29 | # 卷积核大小
30 | conv_kernel_size: [3, 4, 5]
31 | # 池化层核大小
32 | pool_size: [2, 2, 2]
33 | # 滤波器个数
34 | nb_filters: 128
35 | # 全连接层隐层大小
36 | fc_size: 128
37 | # 词向量层丢弃率
38 | embed_drop_rate: 0.15
39 |
40 | dpcnn:
41 | # text_cnn特征
42 | region_kernel_size: [3, 4, 5]
43 | # 卷积核大小
44 | conv_kernel_size: 3
45 | # 池化层核大小
46 | pool_size: 3
47 | # cnn层个数
48 | repeat_time: 2
49 | # 词向量层丢弃率
50 | embed_drop_rate: 0.15
51 | # 输出层前一层丢弃率
52 | final_drop_rate: 0.5
53 | # 滤波器个数
54 | nb_filters: 250
55 |
56 | train:
57 | # bucket个数
58 | nb_bucket: 100
59 | # batch大小
60 | batch_size: 64
61 |   # 最大迭代次数
62 | epochs: 25
63 | # 评估指标
64 | metric: f1
65 | # 交叉验证的次数
66 | nb_fold: 10
67 | # 训练模式,有single和fold两种
68 | train_mode: single
69 | # 测试集比例
70 | test_size: 0.2
71 | # early_stopping的终止条件
72 | patiences: 3
73 |
74 | data:
75 | # 最小的token粒度,有word和char两种
76 | basic_token: word
77 | # 最大词数
78 | max_words: 100
79 | # 最大字符数
80 | max_chars: 150
81 | # 最大词内部字符数
82 | max_inner_chars: 8
83 | # 是否开启词内部序列
84 | inner_char: False
85 |
86 | embed:
87 | # 是否使用预训练词向量
88 | pre: True
89 | # 词向量
90 | word:
91 | path: ../data/embeddings/fasttext_cv_all_300d.txt
92 | dim: 256
93 | # 字向量
94 | char:
95 | path: null
96 | dim: 128
97 |
--------------------------------------------------------------------------------
/config_sequence_labeling.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | word_rnn:
3 | # 词级别rnn隐层大小
4 | word_rnn_size: 128
5 | # 字符级别rnn隐层大小
6 | char_rnn_size: 32
7 | # 是否使用CRF
8 | use_crf: True
9 | # 词内部字符信息表征方式,有cnn和rnn两种
10 | char_feature_method: cnn
11 | # 词和词内部字符信息的连接方式,有concat和attention两种
12 | integration_method: attention
13 | # rnn层的类别,有lstm和gru两种
14 | rnn_type: lstm
15 | # rnn层的个数
16 | nb_rnn_layers: 2
17 | # 滤波器个数
18 | nb_filters: 64
19 | # 卷积核大小
20 | conv_kernel_size: 2
21 | # 丢弃率
22 | drop_rate: 0.5
23 | # 词向量层丢弃率
24 | embed_drop_rate: 0.15
25 | # rnn层的内部丢弃率
26 | re_drop_rate: 0.15
27 |
28 | char_rnn:
29 | # 是否使用偏旁部首
30 | use_radical: False
31 | # 是否使用分词信息
32 | use_seg: False
33 | # 是否使用CRF
34 | use_crf: True
35 | # 字符级别rnn隐层大小
36 | char_rnn_size: 64
37 | # rnn层的类别,有lstm和gru两种
38 | rnn_type: lstm
39 | # rnn层的个数
40 | nb_rnn_layers: 2
41 | # 词向量层丢弃率
42 | embed_drop_rate: 0.15
43 | # 丢弃率
44 | drop_rate: 0.5
45 | # rnn层的内部丢弃率
46 | re_drop_rate: 0.15
47 |
48 | idcnn:
49 | # 词向量层丢弃率
50 | embed_drop_rate: 0.15
51 | # 丢弃率
52 | drop_rate: 0.5
53 | # 滤波器个数
54 | nb_filters: 64
55 | # 卷积核大小
56 | conv_kernel_size: 3
57 | # 膨胀率
58 | dilation_rate: [1, 1, 2]
59 | # 膨胀卷积层重复次数
60 | repeat_times: 4
61 | # 是否使用CRF
62 | use_crf: True
63 |
64 | train:
65 | # bucket个数
66 | nb_bucket: 100
67 | # batch大小
68 | batch_size: 64
69 |   # 最大迭代次数
70 | epochs: 25
71 | # 评估指标
72 | metric: f1_seq
73 | # 交叉验证的次数
74 | nb_fold: 10
75 | # 训练模式,有single和fold两种
76 | train_mode: single
77 | # 测试集比例
78 | test_size: 0.2
79 | # early_stopping的终止条件
80 | patiences: 3
81 |
82 | data:
83 | # 最小的token粒度,有word和char两种
84 | basic_token: word
85 | # 最大词数
86 | max_words: 80
87 | # 最大字符数
88 | max_chars: 120
89 | # 最大词内部字符数
90 | max_inner_chars: 8
91 | # 是否开启词内部序列
92 | inner_char: True
93 | # 数据格式,有basic和conll两种
94 | format: basic
95 | # 是否使用偏旁部首
96 | use_radical: False
97 | # 是否使用分词信息
98 | use_seg: False
99 |
100 | embed:
101 | # 是否使用预训练词向量
102 | pre: False
103 | # 词向量
104 | word:
105 | path: null
106 | dim: 64
107 | # 字向量
108 | char:
109 | path: null
110 | dim: 32
--------------------------------------------------------------------------------
/images/entity_visualization_sample.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevewyl/nlp_toolkit/257dabd300b29957a0be38e7a8049a54f2095ccc/images/entity_visualization_sample.jpg
--------------------------------------------------------------------------------
/images/framework.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevewyl/nlp_toolkit/257dabd300b29957a0be38e7a8049a54f2095ccc/images/framework.jpg
--------------------------------------------------------------------------------
/nlp_toolkit/__init__.py:
--------------------------------------------------------------------------------
1 | import gc
2 | import os
3 | import logging
4 | import numpy as np
5 | import tensorflow as tf
6 | from nlp_toolkit.classifier import Classifier
7 | from nlp_toolkit.labeler import Labeler
8 | from nlp_toolkit.data import Dataset
9 | from nlp_toolkit.config import YParams
10 |
11 | logging.basicConfig(level=logging.INFO)
12 |
13 | try:
14 | import GPUtil
15 | from keras.backend.tensorflow_backend import set_session
16 |
17 | num_all_gpu = len(GPUtil.getGPUs())
18 | avail_gpu = GPUtil.getAvailable(order='memory')
19 | num_avail_gpu = len(avail_gpu)
20 |
21 | gpu_no = str(avail_gpu[0])
22 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
23 | os.environ['CUDA_VISIBLE_DEVICES'] = gpu_no
24 | logging.info('Choose the most free GPU: %s, currently not support multi-gpus' % gpu_no)
25 |
26 | tf_config = tf.ConfigProto()
27 | tf_config.gpu_options.allow_growth = True
28 | set_session(tf.Session(config=tf_config))
29 |
30 | except (ImportError, FileNotFoundError, IndexError):
31 |     logging.info('GPUtil is not installed, nvidia-smi is missing or no GPU is available. '
32 |                  'Falling back to CPU!')
33 |
34 | gc.disable()
35 |
--------------------------------------------------------------------------------
/nlp_toolkit/bin/run_classifier:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | from nlp_toolkit.data import Dataset
5 | from nlp_toolkit.classifier import Classifier
6 |
7 | def get_args():
8 | parser = argparse.ArgumentParser()
9 | parser.add_argument('-model_dir', type=str, required=True,
10 |                         help='directory of a pretrained BERT model')
11 |     return parser.parse_args()
12 |
13 |
14 | def main(args):
15 |     pass
16 |
17 |
18 | if __name__ == '__main__':
19 |     args = get_args()
20 |     main(args)
--------------------------------------------------------------------------------
/nlp_toolkit/bin/run_seq_tagger:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevewyl/nlp_toolkit/257dabd300b29957a0be38e7a8049a54f2095ccc/nlp_toolkit/bin/run_seq_tagger
--------------------------------------------------------------------------------
/nlp_toolkit/callbacks.py:
--------------------------------------------------------------------------------
1 | """
2 | Different kinds of callbacks during model training
3 | """
4 |
5 | import numpy as np
6 | from collections import defaultdict
7 | from typing import List
8 | from pathlib import Path
9 | from seqeval.metrics import accuracy_score
10 | from seqeval.metrics import f1_score as f1_seq_score
11 | from seqeval.metrics import classification_report as sequence_report
12 | from sklearn.metrics import confusion_matrix, f1_score, classification_report
13 | from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
14 |
15 |
16 | class Top_N_Acc(Callback):
17 | """
18 | Evaluate model with top n label acc at each epoch
19 | """
20 |
21 | def __init__(self, seq, top_n=5, attention=False, transformer=None):
22 | super(Top_N_Acc, self).__init__()
23 | self.seq = seq
24 | self.top_n = top_n
25 | self.t = transformer
26 | self.attention = attention
27 |
28 | def on_epoch_end(self, epoch, logs={}):
29 | label_true, label_pred = [], []
30 | for i in range(len(self.seq)):
31 | x_true, y_true = self.seq[i]
32 | y_pred = self.model.predict_on_batch(x_true)
33 | if self.attention:
34 | y_pred = y_pred[:, :self.t.label_size]
35 | y_true = self.t.inverse_transform(y_true)
36 | y_pred = self.t.inverse_transform(y_pred, top_k=self.top_n)
37 | label_true.extend(y_true)
38 | label_pred.extend(y_pred)
39 | assert len(label_pred) == len(label_true)
40 | correct = 0
41 | for i in range(len(label_pred)):
42 | if label_true[i] in label_pred[i]:
43 | correct += 1
44 | top_n_acc = correct / len(label_pred)
45 | print(' - top_{}_acc: {:04.2f}'.format(self.top_n, top_n_acc * 100))
46 | logs['acc_%d' % self.top_n] = np.float64(top_n_acc)
47 |
48 |
49 | class F1score(Callback):
50 | """
51 | Evaluate classification model with f1 score at each epoch
52 | """
53 |
54 | def __init__(self, seq, attention=False, transformer=None):
55 | super(F1score, self).__init__()
56 | self.seq = seq
57 | self.t = transformer
58 | self.attention = attention
59 |
60 | def on_epoch_end(self, epoch, logs={}):
61 | label_true, label_pred = [], []
62 | for i in range(len(self.seq)):
63 | x_true, y_true = self.seq[i]
64 | y_true = np.argmax(y_true, -1)
65 | y_pred = self.model.predict_on_batch(x_true)
66 | if self.attention:
67 | y_pred = y_pred[:, :self.t.label_size]
68 | y_pred = np.argmax(y_pred, -1)
69 | label_true.extend(y_true)
70 | label_pred.extend(y_pred)
71 |
72 | assert len(label_pred) == len(label_true)
73 | f1 = self._calc_f1(label_true, label_pred)
74 | assert f1.shape[0] == self.t.label_size
75 | for i in range(f1.shape[0]):
76 | label = self.t._label_vocab._id2token[i]
77 | print(label, '- f1: {:04.2f}'.format(f1[i] * 100))
78 | # print(classification_report(label_true, label_pred))
79 | logs['f1'] = f1_score(label_true, label_pred, average='weighted')
80 |
81 | def _calc_f1(self, y_true, y_pred):
82 | cm = confusion_matrix(y_true, y_pred)
83 | correct_preds = np.diagonal(cm)
84 | r = correct_preds / np.sum(cm, axis=1)
85 | p = correct_preds / np.sum(cm, axis=0)
86 | f1 = 2 * p * r / (p + r)
87 | return f1
88 |
89 |
90 | class F1score_seq(Callback):
91 | """
92 | Evaluate sequence labeling model with f1 score at each epoch
93 | """
94 |
95 | def __init__(self, seq, transformer=None):
96 | super(F1score_seq, self).__init__()
97 | self.seq = seq
98 | self.t = transformer
99 |
100 | def get_lengths(self, y_true):
101 | lengths = []
102 | for y in np.argmax(y_true, -1):
103 | try:
104 | i = list(y).index(0)
105 | except ValueError:
106 | i = len(y)
107 | lengths.append(i)
108 | return lengths
109 |
110 | def on_epoch_end(self, epoch, logs={}):
111 | label_true, label_pred = [], []
112 | for i in range(len(self.seq)):
113 | x_true, y_true = self.seq[i]
114 | lengths = self.get_lengths(y_true)
115 | y_pred = self.model.predict_on_batch(x_true)
116 | y_true = self.t.inverse_transform(y_true, lengths)
117 | y_pred = self.t.inverse_transform(y_pred, lengths)
118 | label_true.extend(y_true)
119 | label_pred.extend(y_pred)
120 | acc = accuracy_score(label_true, label_pred)
121 | f1 = f1_seq_score(label_true, label_pred)
122 | print(' - acc: {:04.2f}'.format(acc * 100))
123 | print(' - f1: {:04.2f}'.format(f1 * 100))
124 | print(sequence_report(label_true, label_pred))
125 | logs['f1_seq'] = np.float64(f1)
126 | logs['seq_acc'] = np.float64(acc)
127 |
128 |
129 | class History(Callback):
130 | def __init__(self, metric: List[str]):
131 | self.metric = metric
132 |
133 | def on_train_begin(self, logs={}):
134 | self.loss = []
135 | self.acc = []
136 | self.val_loss = []
137 | self.val_acc = []
138 | self.metrics = defaultdict(list)
139 |
140 | def on_batch_end(self, batch, logs={}):
141 | self.loss.append(logs.get('loss'))
142 | self.acc.append(logs.get('acc'))
143 |
144 | def on_epoch_end(self, epoch, logs={}):
145 | for m in self.metric:
146 | self.metrics[m].append(logs.get(m))
147 | self.val_loss.append(logs.get('val_loss'))
148 | self.val_acc.append(logs.get('val_acc'))
149 |
150 |
151 | def get_callbacks(history=None, log_dir=None, valid=None, metric='f1',
152 | transformer=None, early_stopping=True, patiences=3,
153 | LRPlateau=True, top_n=5, attention=False):
154 | """
155 | Define list of callbacks for Keras model
156 | """
157 | callbacks = []
158 | if valid is not None:
159 | if metric == 'top_n_acc':
160 |             print('monitor training process using top_%d_acc score' % top_n)
161 | callbacks.append(Top_N_Acc(valid, top_n, attention, transformer))
162 | elif metric == 'f1':
163 |             print('monitor training process using f1 score')
164 | callbacks.append(F1score(valid, attention, transformer))
165 | elif metric == 'f1_seq':
166 |             print('monitor training process using f1 score and label acc')
167 | callbacks.append(F1score_seq(valid, transformer))
168 |
169 | if log_dir:
170 | path = Path(log_dir)
171 | if not path.exists():
172 | print('Successfully made a directory: {}'.format(log_dir))
173 | path.mkdir()
174 |
175 | file_name = '_'.join(
176 | ['model_weights', '{epoch:02d}', '{val_acc:2.4f}', '{%s:2.4f}' % metric]) + '.h5'
177 | weight_file = path / file_name
178 | save_model = ModelCheckpoint(str(weight_file),
179 | monitor=metric,
180 | verbose=1,
181 | save_best_only=True,
182 | save_weights_only=True,
183 | mode='max')
184 | callbacks.append(save_model)
185 |
186 | if early_stopping:
187 | print('using Early Stopping')
188 | callbacks.append(EarlyStopping(
189 | monitor=metric, patience=patiences, mode='max'))
190 |
191 | if LRPlateau:
192 | print('using Reduce LR On Plateau')
193 | callbacks.append(ReduceLROnPlateau(
194 | monitor=metric, factor=0.2, patience=patiences-2, min_lr=0.00001))
195 |
196 | if history:
197 | print('tracking loss history and metrics')
198 | callbacks.append(history)
199 |
200 | return callbacks
201 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/README.md:
--------------------------------------------------------------------------------
1 | # Chunk分词器使用指南
2 |
3 | 环境依赖:python 3.6.5 (暂时只支持python3)
4 |
5 | **不再维护更新**
6 | **源代码中已略去相关数据的下载路径,有需要的请邮件联系**
7 |
8 | ## 安装
9 |
10 | ```bash
11 | pip install nlp_toolkit
12 |
13 | # 如果keras_contrib安装失败
14 | pip install git+https://www.github.com/keras-team/keras-contrib.git
15 | ```
16 |
17 | ## 主要功能
18 |
19 | 1. 能够输出名词短语
20 | 2. 支持词性输出,名词短语词性为np
21 | 3. 支持名词短语以限定词+中心词的形式输出(以“_”分隔)
22 |
23 | >不可分割的名词短语是不存在限定词+中心词的形式的,如“机器学习”,而“经典机器学习算法”可拆解为“经典_机器学习_算法”
24 |
25 | ## 如何使用
26 |
27 | * 第一次import的时候,会自动下载模型和字典数据
28 | * 支持单句和多句文本的输入格式,建议以列表的形式传入分词器
29 |
30 | ```python
31 | from nlp_toolkit.chunk_segmentor import Chunk_Segmentor
32 | cutter = Chunk_Segmentor()
33 | s = '这是一个能够输出名词短语的分词器,欢迎试用!'
34 | res = [item for item in cutter.cut([s] * 10000)] # 1080ti上耗时8s
35 |
36 | # 提供两个版本,accurate为精确版,fast为快速版但召回会降低一些,默认精确版
37 | cutter = Chunk_Segmentor(mode='accurate')
38 | cutter = Chunk_Segmentor(mode='fast')
39 | # 支持用户自定义字典
40 | # 格式为每行 “词 词性”,必须为utf8编码,词性可省略
41 | cutter = Chunk_Segmentor(user_dict='your_dict.txt')
42 | # 是否输出词性, 默认开启
43 | cutter.cut(s, pos=False)
44 | # 是否需要更细粒度的切分结果, 默认关闭
45 | # 开启后会将部分名词短语以限定词+中心词的形式切开,词性均为np
46 | cutter.cut(s, cut_all=True)
47 |
48 | # 输出格式(词列表,词性列表,名词短语集合)
49 | [
50 | (
51 | ['这', '是', '一个', '能够', '输出', '名词_短语', '的', '分词器', ',', '欢迎', '试用', '!'],
52 | ['r', 'v', 'mq', 'v', 'vn', 'np', 'ude1', 'np', 'w', 'v', 'v', 'w'],
53 | ['分词器', '名词_短语']
54 | )
55 | ...
56 | ]
57 | ```
58 |
59 | ## Step 3 后续更新
60 |
61 | 若存在新的模型和字典数据,会提示你是否需要更新
62 |
63 | ## To-Do Lists
64 |
65 | 1. 提升限定词和名词短语的准确性 ---> 新的模型
66 | 2. char模型存在GPU调用内存溢出的问题 ---> 使用cnn提取Nchar信息来代替embedding的方式,缩小模型规模
67 | 3. 自定义字典,支持不同粒度的切分
68 | 4. 多进程模型加载和预测
69 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import pickle
5 | import socket
6 | from pathlib import Path
7 | from datetime import datetime
8 |
9 | INIT_PATH = os.path.realpath(__file__)
10 | STATIC_ROOT = os.path.dirname(INIT_PATH)
11 | DATA_PATH = Path(STATIC_ROOT) / 'data'
12 | MD5_FILE_PATH = DATA_PATH / 'model_data.md5'
13 | UPDATE_TAG_PATH = DATA_PATH / 'last_update.pkl'
14 | UPDATE_INIT_PATH = DATA_PATH / 'init_update.txt'
15 | MD5_HDFS_PATH = '/user/xxxx/chunk_segmentor/model_data.md5'
16 | MODEL_HDFS_PATH = '/user/xxxx/chunk_segmentor/model_data.zip'
17 | USER_NAME = 'xxxx'
18 | PASSWORD = 'xxxxx'
19 | FTP_PATH_1 = 'ftp://xxx.xxx.xx.xx:xx/chunk_segmentor'
20 | FTP_PATH_2 = 'ftp://xxx.xxx.xx.xx:xx/chunk_segmentor'
21 | IP = socket.gethostbyname(socket.gethostname())
22 |
23 |
24 | def check_version():
25 | if MD5_FILE_PATH.exists():
26 | src = get_data_md5()
27 | if src:
28 | flag = update(src)
29 | if not flag:
30 | print('模型和数据更新失败!')
31 | else:
32 | for fname in glob.glob('model_data.md5*'):
33 | os.remove(fname)
34 | else:
35 | print('拉取md5文件失败!')
36 | else:
37 | print("这是第一次启动Chunk分词器。 请耐心等待片刻至数据和模型下载完成。")
38 | flag = download()
39 | if flag:
40 | current_time = datetime.now()
41 | init_update_time = str(os.path.getctime(INIT_PATH))
42 | pickle.dump(current_time, open(UPDATE_TAG_PATH, 'wb'))
43 | with open(UPDATE_INIT_PATH, 'w') as fout:
44 | fout.write(init_update_time)
45 | else:
46 | print('请寻找一台有hadoop或者能访问ftp://xxx.xxx.xx.xx:xx或者ftp://xxx.xxx.xx.xx:xx的机器')
47 |
48 |
49 | def write_config(config_path, new_root_path):
50 | content = []
51 | with open(config_path, encoding='utf8') as f:
52 | for line in f:
53 | if line.startswith('root'):
54 | line = 'root={}{}'.format(new_root_path, os.linesep)
55 | content.append(line)
56 | with open(config_path, 'w', encoding='utf8') as f:
57 | f.writelines(content)
58 |
59 |
60 | def download():
61 | # 下载数据文件
62 | ret1 = -1
63 | ret2 = -1
64 | for fname in glob.glob('model_data.md5*'):
65 | os.remove(fname)
66 | for fname in glob.glob('model_data.zip*'):
67 | os.remove(fname)
68 |
69 | if not IP.startswith('127'):
70 | print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
71 | ret2 = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
72 | (USER_NAME, PASSWORD, FTP_PATH_1))
73 | if ret2 == 0:
74 | ret1 = os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' %
75 | (USER_NAME, PASSWORD, FTP_PATH_1))
76 | if ret1 != 0:
77 | print('尝试从hdfs上拉取数据,大约20-30s')
78 | ret1 = os.system('hadoop fs -get %s' % MODEL_HDFS_PATH)
79 | ret2 = os.system('hadoop fs -get %s' % MD5_HDFS_PATH)
80 | else:
81 | print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
82 | ret2 = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
83 | (USER_NAME, PASSWORD, FTP_PATH_2))
84 | if ret2 == 0:
85 | ret1 = os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' %
86 | (USER_NAME, PASSWORD, FTP_PATH_2))
87 | if ret1 != 0 or ret2 != 0:
88 | return False
89 | if ret1 == 0 and ret2 == 0:
90 | os.system('unzip -q model_data.zip')
91 | os.system('cp -r model_data/data %s' % STATIC_ROOT)
92 | os.system('cp -f model_data/best_model.txt %s' % DATA_PATH)
93 | os.system('cp -f model_data.md5 %s' % DATA_PATH)
94 | os.system('rm -r model_data')
95 | os.system('rm model_data.md5*')
96 | os.system('rm model_data.zip*')
97 | print('数据和模型下载成功')
98 | return True
99 |
100 |
101 | def get_data_md5():
102 | for fname in glob.glob('model_data.md5*'):
103 | os.remove(fname)
104 | ret = -1
105 |
106 | if not IP.startswith('127'):
107 | ret = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
108 | (USER_NAME, PASSWORD, FTP_PATH_1))
109 | if ret == 0:
110 | src = 'ftp1'
111 | else:
112 |             ret = os.system('hadoop fs -get %s' % MD5_HDFS_PATH)
113 | if ret == 0:
114 | src = 'hdfs'
115 | else:
116 | ret = os.system('wget -q --timeout=2 --tries=1 --ftp-user=%s --ftp-password=%s %s/model_data.md5' %
117 | (USER_NAME, PASSWORD, FTP_PATH_2))
118 | if ret == 0:
119 | src = 'ftp2'
120 | if ret != 0:
121 | print('请寻找一台有hadoop或者能访问ftp://xxx.xxx.xx.xx:xx或者ftp://xxx.xxx.xx.xx:xx的机器')
122 | return None
123 | else:
124 | return src
125 |
126 |
127 | def update(src):
128 | with open(MD5_FILE_PATH, 'rb') as f:
129 | current_data_md5 = f.readlines()[0].strip()
130 | with open('model_data.md5', 'rb') as f:
131 | latest_data_md5 = f.readlines()[0].strip()
132 | try:
133 | if current_data_md5 != latest_data_md5:
134 | x = input('发现新的数据和模型?是否决定下载更新? Yes/No?')
135 | if x in ['Yes', 'Y', 'y', 'YES', '1', 1, 'yes'] or x == '':
136 | flag = update_data(src)
137 | if flag:
138 | print('模型和字典数据已更新到最新版本')
139 | return True
140 | else:
141 | return False
142 | else:
143 | print('希望您下次来更新数据!')
144 | return True
145 | else:
146 | return True
147 | except:
148 | return False
149 |
150 |
151 | def update_data(src):
152 | try:
153 | for fname in glob.glob('model_data.zip*'):
154 | os.remove(fname)
155 | if src == 'hdfs':
156 | print('尝试从hdfs上拉取数据,大约20-30s')
157 |             os.system('hadoop fs -get %s' % MODEL_HDFS_PATH)
158 | elif src == 'ftp1':
159 | print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
160 | os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' % (USER_NAME, PASSWORD, FTP_PATH_1))
161 | elif src == 'ftp2':
162 | print('尝试从ftp://xxx.xxx.xx.xx:xx获取数据')
163 | os.system('wget --ftp-user=%s --ftp-password=%s %s/model_data.zip' % (USER_NAME, PASSWORD, FTP_PATH_2))
164 |
165 | os.system('unzip -q model_data.zip')
166 | os.system('rm -r %s' % DATA_PATH)
167 | os.system('cp -r model_data/data %s' % STATIC_ROOT)
168 | os.system('cp -f model_data/best_model.txt %s' % DATA_PATH)
169 | os.system('cp -f model_data.md5 %s' % DATA_PATH)
170 | os.system('rm -r model_data')
171 | os.system('rm model_data.md5*')
172 | os.system('rm model_data.zip*')
173 | return True
174 | except:
175 | return False
176 |
177 |
178 | check_version()
179 | from .segment import Chunk_Segmentor
180 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/segment.py:
--------------------------------------------------------------------------------
1 | # ======主程序========
2 | import sys
3 | import os
4 | import pickle
5 | import time
6 | import logging
7 | from pathlib import Path
8 | from collections import Counter
9 |
10 | import jieba
11 | import jieba.posseg as pseg
12 | from nlp_toolkit.chunk_segmentor.trie import Trie
13 | from nlp_toolkit.chunk_segmentor.utils import read_line, flatten_gen, sent_split, preprocess, jieba_cut
14 | from nlp_toolkit.sequence import IndexTransformer
15 | from nlp_toolkit.models import Word_RNN, Char_RNN
16 | from nlp_toolkit.chunk_segmentor.tagger import Tagger
17 |
18 | global model_loaded
19 | global last_model_name
20 | global Tree
21 | global Labeler
22 | global load_dict
23 | global load_qualifier
24 | global qualifier_dict
25 | last_model_name = ''
26 | tree_loaded = False
27 | Labeler = None
28 | Tree = None
29 | load_dict = False
30 | load_qualifier = False
31 | qualifier_dict = None
32 |
33 | # 关闭jieba的日志输出
34 | jieba.setLogLevel(logging.INFO)
35 |
36 |
37 | class Chunk_Labeler(object):
38 | def __init__(self, model_name='word-rnn', tagger=None):
39 | self.model_name = model_name
40 | if self.model_name != 'word-rnn':
41 | print('char-rnn model will update soon!')
42 | sys.exit()
43 | self.tagger = tagger
44 |
45 | def analyze(self, text, has_seq=True, char_input=False,
46 | mode='batch', batch_size=256, radical_file=''):
47 | if mode == 'single':
48 | batch_size = 1
49 | if not self.tagger:
50 | if self.model_name in ['char-rnn', 'idcnn']:
51 | char_input = True
52 | self.tagger = Tagger(self.model, self.p, char_input,
53 | mode, batch_size, radical_file)
54 | return self.tagger.analyze(text)
55 |
56 | @classmethod
57 | def load(cls, model_name, weight_file, params_file, preprocessor_file):
58 | self = cls(model_name=model_name)
59 | self.p = IndexTransformer.load(preprocessor_file)
60 | if model_name == 'word-rnn':
61 | self.model = Word_RNN.load(weight_file, params_file)
62 | elif model_name == 'char-rnn':
63 | self.model = Char_RNN.load(weight_file, params_file)
64 | else:
65 | print('No other available models for chunking')
66 | print('Please use word-rnn or char-rnn')
67 | return self
68 |
69 |
70 | class Chunk_Segmentor(object):
71 | def __init__(self, user_dict='', model_name='word-rnn', mode='accurate', verbose=0):
72 | try:
73 | assert mode in ['accurate', 'fast']
74 | except:
75 |             print('Only support the following two modes: accurate, fast')
76 | sys.exit()
77 | self.pos = True
78 | self.mode = mode
79 | self.verbose = verbose
80 | self.path = os.path.abspath(os.path.dirname(__file__))
81 | if model_name != '':
82 | self.model_name = model_name
83 | else:
84 | try:
85 | self.model_name = read_line(Path(self.path) / 'data' / 'best_model.txt')[0]
86 | except Exception:
87 | self.model_name = model_name
88 |
89 | # jieba初始化
90 | base_dict = Path(self.path) / 'data' / 'dict' / 'jieba_base_supplyment.txt'
91 | jieba.load_userdict(str(base_dict))
92 | if mode == 'fast':
93 | global load_dict
94 | if not load_dict:
95 | if self.verbose:
96 | print('loading np dict to jieba cache')
97 | dict_path = Path(self.path) / 'data' / 'dict' / 'chunk_pos.txt'
98 | jieba.load_userdict(str(dict_path))
99 | load_dict = True
100 | if user_dict:
101 | jieba.load_userdict(user_dict)
102 | self.seg = pseg
103 |
104 | # model变量
105 | self.weight_file = os.path.join(self.path, 'data/model/%s_weights.h5' % self.model_name)
106 | self.param_file = os.path.join(self.path, 'data/model/%s_parameters.json' % self.model_name)
107 | self.preprocess_file = os.path.join(self.path, 'data/model/%s_transformer.h5' % self.model_name)
108 | self.define_tagger()
109 |
110 | def define_tagger(self):
111 | global load_qualifier
112 | global qualifier_dict
113 | if not load_qualifier:
114 | qualifier_word_path = os.path.join(self.path, 'data/dict/chunk_qualifier.dict')
115 | self.qualifier_word = pickle.load(open(qualifier_word_path, 'rb'))
116 | load_qualifier = True
117 | qualifier_dict = self.qualifier_word
118 | else:
119 | self.qualifier_word = qualifier_dict
120 |
121 | self.basic_token = 'char' if self.model_name[:4] == 'char' else 'word'
122 |
123 | # acc模式变量
124 | if self.mode == 'accurate':
125 | global tree_loaded
126 | global last_model_name
127 | global Labeler
128 | global Tree
129 | if self.verbose:
130 | if not load_dict:
131 | print('Model and Trie Tree are loading. It will cost 10-20s.')
132 | if self.model_name != last_model_name:
133 | self.labeler = Chunk_Labeler.load(
134 | self.model_name, self.weight_file, self.param_file, self.preprocess_file)
135 | if self.verbose:
136 | print('load model succeed')
137 | last_model_name = self.model_name
138 | Labeler = self.labeler
139 | else:
140 | self.labeler = Labeler
141 | if not tree_loaded:
142 | chunk_dict = read_line(os.path.join(self.path, 'data/dict/chunk.txt'))
143 | self.tree = Trie()
144 | for chunk in chunk_dict:
145 | self.tree.insert(chunk)
146 | if self.verbose:
147 | print('trie tree succeed')
148 | tree_loaded = True
149 | Tree = self.tree
150 | else:
151 | self.tree = Tree
152 | radical_file = os.path.join(self.path, 'data/dict/radical.txt')
153 | self.tagger = Tagger(self.labeler.model, self.labeler.p,
154 | basic_token=self.basic_token, radical_file=radical_file,
155 | tree=self.tree, qualifier_dict=self.qualifier_word,
156 | verbose=self.verbose)
157 |
158 | @property
159 | def get_segmentor_info(self):
160 | params = {'model_name': self.model_name,
161 | 'mode': self.mode,
162 | 'pos': self.pos}
163 | return params
164 |
165 | def extract_item(self, item):
166 | C_CUT_WORD, C_CUT_POS, C_CUT_CHUNK = 0, 1, 2
167 | complete_words = [sub[C_CUT_WORD] for sub in item]
168 | complete_poss = [sub[C_CUT_POS] for sub in item]
169 | if load_dict:
170 | all_chunks = [x for sub in item for x, y in zip(
171 | sub[C_CUT_WORD], sub[C_CUT_POS]) if y == 'np']
172 | else:
173 | all_chunks = list(flatten_gen([sub[C_CUT_CHUNK] for sub in item]))
174 | words = list(flatten_gen(complete_words))
175 | poss = list(flatten_gen(complete_poss))
176 | if self.cut_all:
177 | words, poss = zip(*[(x1, y1) for x, y in zip(words, poss) for x1, y1 in self.cut_qualifier(x, y)])
178 | words = [' ' if word == 's_' else word for word in words]
179 | if self.pos:
180 | d = (words, # C_CUT_WORD
181 | poss, # C_CUT_POS
182 | list(dict.fromkeys(all_chunks))) # C_CUT_CHUNK
183 | else:
184 | d = (words, list(dict.fromkeys(all_chunks)))
185 | if self.verbose:
186 | print(d)
187 | return d
188 |
189 | def cut_qualifier(self, x, y):
190 | if y == 'np' and '_' in x and x not in ['s_', 'ss_', 'lan_']:
191 | for sub_word in x.split('_'):
192 | yield sub_word, y
193 | else:
194 | yield x, y
195 |
196 | def output(self, data):
197 | idx_list, strings = zip(
198 | *[[idx, sub] for idx, item in enumerate(data) for sub in sent_split(preprocess(item))])
199 | cc = list(Counter(idx_list).values())
200 | end_idx = [sum(cc[:i]) for i in range(len(cc)+1)]
201 | seg_res = jieba_cut(strings, self.seg,
202 | self.qualifier_word, mode=self.mode,
203 | dict_loaded=load_dict)
204 | if self.verbose:
205 | print(seg_res)
206 | if self.mode == 'accurate':
207 | outputs, _ = self.tagger.analyze(seg_res)
208 | else:
209 | outputs = [list(zip(*item)) for item in seg_res]
210 | if self.verbose:
211 | print(outputs)
212 | new_res = (outputs[end_idx[i]: end_idx[i+1]]
213 | for i in range(len(end_idx)-1))
214 | for item in new_res:
215 | yield self.extract_item(item)
216 |
217 | def cut(self, data, batch_size=512, pos=True, cut_all=False):
218 | if isinstance(data, str):
219 | data = [data]
220 | if not pos:
221 | self.pos = False
222 | else:
223 | self.pos = True
224 | if not cut_all:
225 | self.cut_all = False
226 | else:
227 | self.cut_all = True
228 | self.define_tagger()
229 | assert isinstance(data, list)
230 | data_cnt = len(data)
231 |         num_batches = (data_cnt + batch_size - 1) // batch_size  # ceil division, avoids a trailing empty batch
232 | if self.verbose:
233 | print('total_batch_num: ', num_batches)
234 | for batch_num in range(num_batches):
235 | start_index = batch_num * batch_size
236 | end_index = min((batch_num + 1) * batch_size, data_cnt)
237 | batch_input = data[start_index:end_index]
238 | for res in self.output(batch_input):
239 | yield res
240 |
241 |
242 | if __name__ == "__main__":
243 | cutter = Chunk_Segmentor(verbose=1)
244 | cutter.cut('这是一个能够输出名词短语的分词器,欢迎试用!')
245 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/tagger.py:
--------------------------------------------------------------------------------
1 | """预测类"""
2 |
3 | import re
4 | import numpy as np
5 | import tensorflow as tf
6 | from pathlib import Path
7 | from collections import Counter
8 | from seqeval.metrics.sequence_labeling import get_entities
9 | from nlp_toolkit.chunk_segmentor.utils import flatten_gen, tag_by_dict, read_line, compare_idx
10 |
11 | global special_tokens
12 | global graph
13 | special_tokens = set(['s_', 'lan_', 'ss_'])
14 | graph = tf.get_default_graph()
15 |
16 |
17 | def check_in(check_list, filter_list):
18 | combine = set(check_list) & filter_list
19 | if len(combine) > 0:
20 | return True
21 | else:
22 | return False
23 |
24 |
25 | # judge char type ['cn', 'en', 'num', 'other']
26 | def char_type(word):
27 | for char in word:
28 | unicode_char = ord(char)
29 | if unicode_char >= 19968 and unicode_char <= 40869:
30 | yield (char, 'cn')
31 | elif unicode_char >= 65 and unicode_char <= 122:
32 | yield (char, 'en')
33 | elif unicode_char >= 48 and unicode_char <= 57:
34 | yield (char, 'num')
35 | else:
36 | yield (char, 'other')
37 |
38 |
39 | # split word into chars
40 | def split_cn_en(word):
41 | new_word = [c for c in char_type(word)]
42 | new_word_len = len(new_word)
43 | tmp = ''
44 | for ix, item in enumerate(new_word):
45 | if item[1] in {'en', 'num'}:
46 | if ix < new_word_len - 1:
47 | if new_word[ix+1][1] == item[1]:
48 | tmp += item[0]
49 | else:
50 | tmp += item[0]
51 | yield tmp
52 | tmp = ''
53 | else:
54 | tmp += item[0]
55 | yield tmp
56 | else:
57 | yield item[0]
58 |
59 |
60 | def split_word(word):
61 | word, pos = word.rsplit('-', 1)
62 | if len(word) == 1 or word in special_tokens or not re.search(r'[^a-z0-9]+', word):
63 | yield [word, word, pos, 'S']
64 | else:
65 | char_list = list(split_cn_en(word))
66 | l_c = len(char_list)
67 | word_list = [word] * l_c
68 | pos_list = [pos] * l_c
69 | seg_list = ['M'] * l_c
70 | seg_list[0] = 'B'
71 | seg_list[-1] = 'E'
72 | for i in range(l_c):
73 | yield [char_list[i], word_list[i], pos_list[i], seg_list[i]]
74 |
75 |
76 | def word2char(word_list):
77 | return list(flatten_gen([list(split_word(word)) for word in word_list]))
78 |
79 |
80 | def chunk_list(word_list, max_length):
81 | l_w = len(word_list)
82 | if l_w > max_length:
83 | for i in range(0, len(word_list), max_length):
84 | yield word_list[0+i: max_length+i]
85 | else:
86 | yield word_list
87 |
88 |
89 | def split_sent(possible_idx, num_split, max_length, word_list):
90 | start = 0
91 | end = max_length
92 | if len(possible_idx) > 0:
93 | for _ in range(num_split):
94 | sub_possible_idx = [
95 | idx for idx in possible_idx if idx > start and idx <= end]
96 | if sub_possible_idx != []:
97 | end = max(sub_possible_idx, key=lambda x: x - end)
98 | yield word_list[start:end+1]
99 | start = end + 1
100 | end += max_length
101 | yield word_list[start:]
102 | else:
103 | yield word_list
104 |
105 |
106 | def split_long_sent(word_list, max_length):
107 | if len(word_list) <= max_length:
108 | return [word_list]
109 | num_split = int(len(word_list) / max_length)
110 | possible_split = [',', '.', 's_', '、', '/']
111 | possible_idx = [idx for idx, item in enumerate(word_list) if item[0] in possible_split]
112 | split_text = split_sent(possible_idx, num_split, max_length, word_list)
113 | new_list = [sub_item for item in split_text for sub_item in chunk_list(item, max_length)]
114 | return new_list
115 |
116 |
117 | def get_radical(d, char_list):
118 | return [d[char] if char in d else '' for char in char_list]
119 |
120 |
121 | class Tagger(object):
122 | def __init__(self, model, preprocessor, basic_token='word', pos=True,
123 | batch_size=512, radical_file='', tree=None,
124 | qualifier_dict=None, verbose=0):
125 | self.wrong = []
126 | self.model = model
127 | self.p = preprocessor
128 | self.basic_token = basic_token
129 | self.pos = pos
130 | self.tree = tree
131 | self.qualifier_dict = qualifier_dict
132 | self.verbose = verbose
133 | if self.basic_token == 'char':
134 | if self.p.radical_vocab_size > 2:
135 | self.use_radical = True
136 | self.radical_dict = {item.split('\t')[0]: item.split(
137 | '\t')[1] for item in read_line(radical_file)}
138 | else:
139 | self.use_radical = False
140 | if self.p.seg_vocab_size > 2:
141 | self.use_seg = True
142 | else:
143 | self.use_seg = False
144 | elif self.basic_token == 'word':
145 | self.use_radical = False
146 | self.use_seg = False
147 | if self.p.char_vocab_size > 2:
148 | self.use_inner_char = True
149 | else:
150 | self.use_inner_char = False
151 |
152 | self.char_tokenizer = word2char
153 | self.word_tokenizer = str.split
154 | self.batch_size = batch_size
155 |
156 | dict_path = Path(__file__).parent / 'data' / 'dict'
157 | self.stopwords = set(read_line(dict_path / 'stopwords.txt'))
158 | self.stopwords_first = set(read_line(dict_path / 'stopwords_first_word.txt'))
159 | self.stopwords_last = set(read_line(dict_path / 'stopwords_last_word.txt'))
160 | self.pos_filter = set(read_line(dict_path / 'pos_filter_jieba.txt'))
161 | self.pos_filter_first = set(read_line(dict_path / 'pos_filter_first_jieba.txt'))
162 | self.MAIN_INPUT_IDX = 0
163 | self.POS_IDX = 1
164 | self.SEG_IDX = 2
165 | self.RADICAL_IDX = 3
166 | self.WORD_IDX = 4
167 |
168 | @property
169 | def get_tagger_info(self):
170 | params = {'basic_token': self.basic_token,
171 | 'pos': self.pos,
172 | 'batch_size': self.batch_size,
173 | 'use_seg': self.use_seg,
174 | 'use_radical': self.use_radical,
175 | 'use_inner_char': self.use_inner_char}
176 | return params
177 |
178 | def data_generator(self, batch_input):
179 | input_data = {}
180 | batch_input = [self.preprocess_data(item) for item in batch_input]
181 | text_pos_idx = [(idx, each, item['pos'][i]) for idx, item in enumerate(batch_input) for i, each in enumerate(item['token'])]
182 | sent_idx, sub_text, sub_pos = zip(*text_pos_idx)
183 |
184 | try:
185 | input_data['token'] = sub_text
186 | input_data['pos'] = sub_pos
187 | if self.basic_token == 'char':
188 | input_data['word'] = [each for item in batch_input for each in item['word']]
189 | if self.use_seg:
190 | input_data['seg'] = [each for item in batch_input for each in item['seg']]
191 | if self.use_radical:
192 | input_data['radical'] = [each for item in batch_input for each in item['radical']]
193 | else:
194 | if self.use_inner_char:
195 | pass
196 | cc = list(Counter(sent_idx).values())
197 | end_idx = [sum(cc[:i]) for i in range(len(cc)+1)]
198 | return end_idx, input_data
199 | except Exception as e:
200 | print(e)
201 | length = [len(each) for idx, item in enumerate(batch_input) for each in item['token']]
202 | print(len(batch_input), length, sub_text)
203 | self.wrong.append((len(batch_input), length, sub_text))
204 |
205 | def preprocess_data(self, seg_res):
206 | assert isinstance(seg_res, list)
207 | assert len(seg_res) > 0
208 | input_data = {}
209 | if self.basic_token == 'char':
210 | string_c = self.char_tokenizer(seg_res)
211 | string_c = list(flatten_gen([sub_item for item in string_c for sub_item in split_long_sent(item, self.p.max_tokens)]))
212 | try:
213 | input_data['token'] = [item[0] for item in string_c]
214 | input_data['word'] = [item[1] for item in string_c]
215 | input_data['pos'] = [item[2] for item in string_c]
216 | if self.use_seg:
217 | input_data['seg'] = [item[3] for item in string_c]
218 | except Exception as e:
219 | print('char tokenizer error: ', e)
220 | print(string_c)
221 | if self.use_radical:
222 | input_data['radical'] = [get_radical(self.radical_dict, item) for item in input_data['token']]
223 | else:
224 | string_w = split_long_sent([item.split('-')
225 | for item in seg_res], self.p.max_tokens)
226 | input_data['token'] = [[each[0] for each in item] for item in string_w]
227 | input_data['pos'] = [[each[1] for each in item] for item in string_w]
228 | return input_data
229 |
230 | def predict_proba_batch(self, batch_data):
231 | split_text = batch_data['token']
232 | pos = batch_data['pos']
233 | if self.basic_token == 'char':
234 | segs = batch_data['seg']
235 | words = batch_data['word']
236 | else:
237 | segs = []
238 | words = []
239 | X = self.p.transform(batch_data)
240 | with graph.as_default():
241 | Y = self.model.model.predict_on_batch(X)
242 | return split_text, pos, Y, segs, words
243 |
244 | def _get_prob(self, pred):
245 | prob = np.max(pred, -1)
246 | return prob
247 |
248 | def _get_tags(self, pred):
249 | tags = self.p.inverse_transform([pred])
250 | tags = tags[0]
251 | return tags
252 |
253 | def _build_response(self, split_text, tags, poss, segs=[], words=[]):
254 | if self.basic_token == 'char':
255 | res = {
256 | 'words': split_text,
257 | 'pos': poss,
258 | 'char_pos': poss,
259 | 'char_word': words,
260 | 'seg': segs,
261 | 'entities': []
262 | }
263 | else:
264 | res = {
265 | 'words': split_text,
266 | 'pos': poss,
267 | 'entities': []
268 | }
269 | chunks = get_entities(tags)
270 | for chunk_type, chunk_start, chunk_end in chunks:
271 | chunk = self.post_process_chunk(chunk_type, chunk_start, chunk_end, split_text, poss)
272 | if chunk is not None:
273 | entity = {
274 | 'text': chunk,
275 | 'type': chunk_type,
276 | 'beginOffset': chunk_start,
277 | 'endOffset': chunk_end
278 | }
279 | res['entities'].append(entity)
280 | return res
281 |
282 | def post_process_chunk(self, chunk_type, chunk_start, chunk_end, split_text, pos):
283 | if chunk_type == 'Chunk':
284 | chunk_inner_words = split_text[chunk_start: chunk_end+1]
285 | chunk = ''.join(chunk_inner_words)
286 | check_char = not re.search(r'[^a-zA-Z0-9\u4e00-\u9fa5\.\+#]+', chunk)
287 | if len(chunk) < 15 and len(chunk) > 2 and check_char and len(chunk_inner_words) > 1:
288 | chunk_inner_poss = pos[chunk_start: chunk_end+1]
289 | filter_flag = any([check_in(chunk_inner_words, self.stopwords),
290 | check_in([chunk_inner_words[0]], self.stopwords_first),
291 | check_in([chunk_inner_words[-1]], self.stopwords_last),
292 | check_in(chunk_inner_poss, self.pos_filter),
293 | check_in([chunk_inner_poss[-1]], self.pos_filter_first)])
294 | if not filter_flag:
295 | return chunk
296 | else:
297 | return None
298 | else:
299 | return None
300 |
301 | def output(self, res):
302 | if self.verbose:
303 | print(res)
304 | words = res['words']
305 | poss = res['pos']
306 | dict_idx = tag_by_dict(words, self.tree)
307 | model_idx = [[item['beginOffset'], item['endOffset']] for item in res['entities']]
308 | new_idx = sorted(list(compare_idx(dict_idx, model_idx)), key=lambda x: x[1])
309 | new_idx = [item for item in new_idx if item[0] != item[1]]
310 | new_word = []
311 | new_pos = []
312 | new_chunk = []
313 | if self.basic_token == 'char':
314 | seg = res['seg']
315 | tag = ['O'] * len(seg)
316 | char_pos = res['char_pos']
317 | char_word = res['char_word']
318 | assert len(char_pos) == len(seg) == len(char_word)
319 | for s, e in new_idx:
320 | tag[s:e] = ['B-Chunk'] + ['I-Chunk'] * (e-s-1) + ['E-Chunk']
321 | chunks = {e: ''.join(words[s:e+1]) for s, e in new_idx}
322 | start = 0
323 | mid = 0
324 | for j, item_BEMS in enumerate(seg):
325 | if tag[j] == 'O':
326 | if item_BEMS == 'S':
327 | new_word.append(char_word[j])
328 | new_pos.append(char_pos[j])
329 | elif item_BEMS == 'E':
330 | if not tag[j-1].endswith('Chunk'):
331 | if not tag[start].endswith('Chunk'):
332 | new_word.append(char_word[j])
333 | else:
334 | new_word.append(''.join(words[mid:j]))
335 | else:
336 | new_word.append(words[j])
337 | new_pos.append(char_pos[j])
338 | else:
339 | if item_BEMS == 'B':
340 | start = j
341 | if tag[j+1].endswith('Chunk'):
342 | new_word.append(''.join(words[start:j]))
343 | new_pos.append(char_pos[j])
344 | if tag[j-1].endswith('Chunk') and item_BEMS == 'M':
345 | mid = j
346 | elif tag[j] == 'E-Chunk':
347 | try:
348 | chunk = chunks[j]
349 | if chunk in self.qualifier_dict:
350 | qualifier_word = self.qualifier_dict[chunk]
351 | new_word.append(qualifier_word)
352 | new_chunk.append(qualifier_word)
353 | else:
354 | new_word.append(chunk)
355 | new_chunk.append(chunk)
356 | except Exception as e:
357 | print(e)
358 | new_pos.append('np')
359 | else:
360 | chunks = {item[1]: ''.join(words[item[0]: item[1]+1]) for item in new_idx}
361 | if self.verbose:
362 | print(chunks)
363 | chunk_idx = [i for item in new_idx for i in range(item[0], item[1] + 1)]
364 | for i, item in enumerate(words):
365 | if i not in chunk_idx:
366 | new_word.append(item)
367 | new_pos.append(poss[i])
368 | else:
369 | if i in chunks.keys():
370 | chunk = chunks[i]
371 | if chunk in self.qualifier_dict:
372 | qualifier_word = self.qualifier_dict[chunk]
373 | new_word.append(qualifier_word)
374 | new_chunk.append(qualifier_word)
375 | else:
376 | new_word.append(chunk)
377 | new_chunk.append(chunk)
378 | new_pos.append('np')
379 | try:
380 | assert len(new_word) == len(new_pos)
381 | except Exception as e:
382 | print('new word list length does not equal new pos list length')
383 | print(new_word, len(new_word))
384 | print(new_pos, len(new_pos))
385 | print(chunks)
386 | print(dict_idx, model_idx, new_idx)
387 | return (new_word, new_pos, new_chunk) # C_WORD=0 C_POS=1 C_CHUNK=2
388 |
389 | def analyze(self, text):
390 | assert isinstance(text, list) or isinstance(text, tuple)
391 | final_res = []
392 | sent_idx, batch_data = self.data_generator(text)
393 | split_text, split_pos, pred, segs, word = self.predict_proba_batch(batch_data)
394 | split_text = [split_text[sent_idx[i]:sent_idx[i+1]] for i in range(len(sent_idx)-1)]
395 | split_pos = [split_pos[sent_idx[i]:sent_idx[i+1]] for i in range(len(sent_idx)-1)]
396 | pred = [np.array(pred[sent_idx[i]:sent_idx[i+1]]) for i in range(len(sent_idx)-1)]
397 | if self.verbose:
398 | print(pred)
399 | if self.basic_token == 'char':
400 | segs = [segs[sent_idx[i]:sent_idx[i+1]] for i in range(len(sent_idx)-1)]
401 | word = [word[sent_idx[i]:sent_idx[i+1]] for i in range(len(sent_idx)-1)]
402 | assert len(segs) == len(split_text) == len(pred)
403 | for k, item in enumerate(pred):
404 | tmp_y = [y[:len(x)] for x, y in zip(split_text[k], item)]
405 | Y = np.concatenate(tmp_y)
406 | words = list(flatten_gen(split_text[k]))
407 | poss = list(flatten_gen(split_pos[k]))
408 | # assert len(words) == len(poss)
409 | if self.basic_token == 'char':
410 | split_segs = list(flatten_gen(segs[k]))
411 | split_words = list(flatten_gen(word[k]))
412 | else:
413 | split_segs = []
414 | split_words = []
415 | tags = self._get_tags(Y)
416 | if self.verbose:
417 | print(tags)
418 | # prob = self._get_prob(Y)
419 | res = self._build_response(words, tags, poss, split_segs, split_words)
420 | final_res.append(self.output(res))
421 | return final_res, self.wrong
422 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/tests/data.sh:
--------------------------------------------------------------------------------
1 | rm model_data.zip
2 | zip -qr model_data.zip model_data
3 | echo "zip data folder successfully"
4 | md5sum model_data.zip > model_data.md5
5 | echo "calculate md5 successfully"
6 | hadoop fs -rm chunk_segmentor/model_data.md5
7 | hadoop fs -rm chunk_segmentor/model_data.zip
8 | hadoop fs -put model_data.zip chunk_segmentor
9 | hadoop fs -put model_data.md5 chunk_segmentor
10 | echo "commit new data file to hdfs successfully"
11 | PUTFILE_1=model_data.md5
12 | PUTFILE_2=model_data.zip
13 | ftp -v -n 192.168.8.23 << EOF
14 | user yilei.wang ifchange0829FWGR
15 | delete chunk_segmentor/model_data.md5
16 | delete chunk_segmentor/model_data.zip
17 | put model_data.md5 chunk_segmentor/model_data.md5
18 | put model_data.zip chunk_segmentor/model_data.zip
19 | bye
20 | EOF
21 | echo "commit new data file to ftp successfully"
22 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/tests/test_functions.py:
--------------------------------------------------------------------------------
1 | import sys
2 | # sys.path.append('../../..')
3 | from nlp_toolkit.chunk_segmentor import Chunk_Segmentor
4 | import time
5 | import os
6 |
7 | VERBOSE = 1
8 | text = '主要配合UI设计师100%还原设计图,使用前端技术解决各大浏览器兼容问题,使用HTML5+css3完成页面优化及提高用户体验,www.s.com使用bootstrap、jQuery完成界面效果展示,使用JavaScript完成页面功能展示,并且,在规定的时间内提前完成任务大大提高了工作的效率'
9 |
10 | print('test model loading')
11 | cutter = Chunk_Segmentor(verbose=VERBOSE)
12 |
13 | print('test Chunk_Segmentor object reload')
14 | start = time.time()
15 | cutter = Chunk_Segmentor(verbose=VERBOSE)
16 | if time.time() - start < 1:
17 | pass
18 | else:
19 | print('not pass reload model. Quit!')
20 | sys.exit()
21 |
22 | '''
23 | print('test switch model')
24 | cutter = Chunk_Segmentor(model_name='char-rnn', verbose=VERBOSE)
25 | print(list(cutter.cut(text)))
26 | '''
27 |
28 | print('test cutting performance')
29 | cutter = Chunk_Segmentor(verbose=VERBOSE)
30 | start = time.time()
31 | print(list(cutter.cut(text, pos=False)))
32 | print('cut single sentence used {:04.2f}s'.format(time.time() - start))
33 | print('test pos')
34 | print(list(cutter.cut(text, pos=True)))
35 | print('test cut_all')
36 | print(list(cutter.cut(text, cut_all=True)))
37 |
38 | print('test user dict')
39 | fin = open('user_dict.txt', 'w', encoding='utf8')
40 | fin.write('用户体验 np\n')
41 | fin.close()
42 | cutter = Chunk_Segmentor(verbose=VERBOSE, user_dict='user_dict.txt')
43 | print(list(cutter.cut(text)))
44 | os.system('rm user_dict.txt')
45 |
46 | text_list = [text] * 10000
47 | start = time.time()
48 | result = list(cutter.cut(text_list, pos=False))
49 | print('cut 10000 sentences no pos used {:04.2f}s'.format(time.time() - start))
50 | start = time.time()
51 | result = list(cutter.cut(text_list))
52 | print('cut 10000 sentences used {:04.2f}s'.format(time.time() - start))
53 |
54 | print('test fast mode')
55 | cutter = Chunk_Segmentor(mode='fast', verbose=VERBOSE)
56 | print(list(cutter.cut(text)))
57 | print('test cut_all')
58 | print(list(cutter.cut(text, cut_all=True)))
59 | start = time.time()
60 | result = list(cutter.cut(text_list))
61 | print('cut 10000 sentences in fast mode used {:04.2f}s'.format(time.time() - start))
62 |
63 | print('test all pass')
64 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/tests/test_speed.py:
--------------------------------------------------------------------------------
1 | import sys
2 | # sys.path.append('../../..')
3 | from nlp_toolkit.chunk_segmentor import Chunk_Segmentor
4 |
5 | mode = sys.argv[1]
6 |
7 | if mode == 'short':
8 | text = '这是一个能够输出名词短语的分词器,欢迎试用!'
9 | elif mode == 'long':
10 | text = '主要配合UI设计师100%还原设计图,使用前端技术解决各大浏览器兼容问题,使用HTML5+css3完成页面优化及提高用户体验,www.s.com使用bootstrap、jQuery完成界面效果展示,使用JavaScript完成页面功能展示,并且,在规定的时间内提前完成任务大大提高了工作的效率'
11 |
12 |
13 | def load_fast():
14 | return Chunk_Segmentor(mode='fast')
15 |
16 |
17 | def test_fast():
18 | return list(CUTTER.cut([text] * 10000))
19 |
20 |
21 | def load_accurate():
22 | return Chunk_Segmentor(mode='accurate')
23 |
24 |
25 | def test_accurate():
26 | return list(CUTTER.cut([text] * 10000))
27 |
28 |
29 | if __name__ == "__main__":
30 | import cProfile
31 | global CUTTER
32 | CUTTER = load_accurate()
33 | cProfile.run("test_accurate()", filename='chunk_speed_accurate_%s.out' % mode)
34 | CUTTER = load_fast()
35 | cProfile.run("test_fast()", filename='chunk_speed_fast_%s.out' % mode)
36 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/trie.py:
--------------------------------------------------------------------------------
1 | """Trie树结构"""
2 |
3 |
4 | class TrieNode(object):
5 | def __init__(self):
6 | """
7 | Initialize your data structure here.
8 | """
9 | self.data = {}
10 | self.is_word = False
11 |
12 |
13 | class Trie(object):
14 | def __init__(self):
15 | self.root = TrieNode()
16 |
17 | def insert(self, word):
18 | """
19 | Inserts a word into the trie.
20 | :type word: str
21 | :rtype: void
22 | """
23 | node = self.root
24 | for letter in word:
25 | child = node.data.get(letter)
26 | if not child:
27 | node.data[letter] = TrieNode()
28 | node = node.data[letter]
29 | node.is_word = True
30 |
31 | def search(self, word):
32 | """
33 | Returns whether the word is in the trie.
34 | :type word: str
35 | :rtype: bool
36 | """
37 | node = self.root
38 | for letter in word:
39 | node = node.data.get(letter)
40 | if not node:
41 | return False
42 | return node.is_word  # whether the word is stored in the trie as a complete word
43 |
44 | def starts_with(self, prefix):
45 | """
46 | Returns whether there is any word in the trie
47 | that starts with the given prefix.
48 | :type prefix: str
49 | :rtype: bool
50 | """
51 | node = self.root
52 | for letter in prefix:
53 | node = node.data.get(letter)
54 | if not node:
55 | return False
56 | return True
57 |
58 | def get_start(self, prefix):
59 | """
60 | Returns the words in the trie that start with the given prefix
61 | :param prefix:
62 | :return: words (list)
63 | """
64 | def _get_key(pre, pre_node):
65 | words_list = []
66 | if pre_node.is_word:
67 | words_list.append(pre)
68 | for x in pre_node.data.keys():
69 | words_list.extend(_get_key(pre + str(x), pre_node.data.get(x)))
70 | return words_list
71 |
72 | words = []
73 | if not self.starts_with(prefix):
74 | return words
75 | if self.search(prefix):
76 | words.append(prefix)
77 | return words
78 | node = self.root
79 | for letter in prefix:
80 | node = node.data.get(letter)
81 | return _get_key(prefix, node)
82 |
83 |
84 | if __name__ == '__main__':
85 | tree = Trie()
86 | tree.insert('深度学习')
87 | tree.insert('深度神经网络')
88 | tree.insert('深度网络')
89 | tree.insert('机器学习')
90 | tree.insert('机器学习模型')
91 | print(tree.search('深度学习'))
92 | print(tree.search('机器学习模型'))
93 | print(tree.get_start('深度'))
94 | print(tree.get_start('深度网'))
95 |
--------------------------------------------------------------------------------
/nlp_toolkit/chunk_segmentor/utils.py:
--------------------------------------------------------------------------------
1 | """一些nlp的常用函数"""
2 |
3 | import re
4 | import itertools
5 | import collections
6 | from hanziconv import HanziConv
7 |
8 |
9 | # flatten a nested list
10 | # ['1', '12', ['abc', 'df'], ['a']] ---> ['1','12','abc','df','a']
11 | def flatten(x):
12 | tmp = [([i] if isinstance(i, str) else i) for i in x]
13 | return list(itertools.chain(*tmp))
14 |
15 |
16 | def flatten_gen(x):
17 | for i in x:
18 | if isinstance(i, list) or isinstance(i, tuple):
19 | for inner_i in i:
20 | yield inner_i
21 | else:
22 | yield i
23 |
24 |
25 | def n_grams(a, n):
26 | z = (itertools.islice(a, i, None) for i in range(n))
27 | return zip(*z)
28 |
29 |
30 | def tag_by_dict(word_list, tree):
31 | idx = []
32 | length = len(word_list)
33 | start_idx = 0
34 | end_idx = 0
35 | while start_idx < length - 1:
36 | tmp_end_idx = 0
37 | tmp_chunk = ''.join(word_list[start_idx: end_idx+1])
38 | while tree.starts_with(tmp_chunk) and end_idx < length:
39 | tmp_end_idx = end_idx
40 | end_idx += 1
41 | tmp_chunk = ''.join(word_list[start_idx: end_idx+1])
42 | if tmp_end_idx != 0 and tree.search(''.join(word_list[start_idx: end_idx])):
43 | idx.append([start_idx, tmp_end_idx])
44 | start_idx += 1
45 | end_idx = start_idx
46 | if idx != []:
47 | idx = list(combine_idx(idx))
48 | return idx
49 |
50 |
51 | # merge overlapping chunk index spans
52 | def combine_idx(idx_list):
53 | l_idx = len(idx_list)
54 | if l_idx > 1:
55 | idx = 0
56 | used = []
57 | last_idx = l_idx - 1
58 | while idx <= l_idx - 2:
59 | if idx_list[idx+1][0] > idx_list[idx][1]:
60 | if idx not in used:
61 | yield idx_list[idx]
62 | if idx + 1 == last_idx:
63 | yield idx_list[idx+1]
64 | idx += 1
65 | else:
66 | start = idx_list[idx][0]
67 | while idx_list[idx+1][0] <= idx_list[idx][1]:
68 | end = idx_list[idx+1][1]
69 | used.append(idx)
70 | idx += 1
71 | if idx > l_idx - 2:
72 | break
73 | used.append(idx)
74 | yield [start, end]
75 | else:
76 | yield idx_list[0]
77 |
78 |
79 | def combine_two_idx(x, y):
80 | if x[0] >= y[0] and x[1] <= y[1]:
81 | return y
82 | elif x[0] < y[0] and x[1] > y[1]:
83 | return x
84 | else:
85 | all_idx = set(x + y)
86 | return [min(all_idx), max(all_idx)]
87 |
88 |
89 | def compare_idx(dict_idx, model_idx):
90 | if dict_idx == model_idx or dict_idx == []:
91 | for idx in model_idx:
92 | yield idx
93 | elif model_idx == []:
94 | for idx in dict_idx:
95 | yield idx
96 | else:
97 | union_idx = dict_idx + model_idx
98 | uniq_idx = [list(x) for x in set([tuple(x) for x in union_idx])]
99 | sort_idx = sorted(uniq_idx, key=lambda x: (x[0], x[1]))
100 | for idx in list(combine_idx(sort_idx)):
101 | yield idx
102 |
103 |
104 | def word_length(segs):
105 | cnt = []
106 | i = 0
107 | for item in segs:
108 | if item == 'E':
109 | i += 1
110 | cnt.append(i)
111 | i = 0
112 | elif item == 'S':
113 | cnt.append(1)
114 | else:
115 | i += 1
116 | return cnt
117 |
118 |
119 | # split list2 into sub-lists according to the item lengths of list1
120 | def split_sublist(list1, list2):
121 | if len(list1) == 1:
122 | return [list2]
123 | else:
124 | list1_len = [len(item) for item in list1]
125 | new_list = []
126 | for i in range(len(list1)):
127 | if i == 0:
128 | start = 0
129 | end = sum(list1_len[:i+1])
130 | new_list.append(list2[start: end])
131 | start = end
132 | return new_list
133 |
134 |
135 | def output_reform(a, b, mode, dict_loaded=False):
136 | if mode == 'accurate':
137 | if dict_loaded:
138 | a = a.replace('_', '')
139 | return a + '-' + b
140 | else:
141 | return (a, b)
142 |
143 |
144 | def reshape_term(term, qualifier_word=None, mode='accurate', dict_loaded=False):
145 | # pos = str(term.nature)
146 | # word = term.word
147 | term = str(term).split('/')
148 | pos = term[1]
149 | word = term[0]
150 | if pos == 'np':
151 | if word in qualifier_word:
152 | return output_reform(qualifier_word[word], pos, mode, dict_loaded)
153 | else:
154 | return output_reform(word, pos, mode, dict_loaded)
155 | else:
156 | return output_reform(word, pos, mode, dict_loaded)
157 |
158 |
159 | def hanlp_cut(sent_list, segmentor, qualifier_word=None, mode='accurate'):
160 | if qualifier_word is None:
161 | if mode == 'accurate':
162 | res = [[term.word + '-' + str(term.nature) for term in segmentor.segment(sub)] for sub in sent_list]
163 | else:
164 | res = [[(term.word, str(term.nature)) for term in segmentor.segment(sub)] for sub in sent_list]
165 | else:
166 | res = [[reshape_term(term, qualifier_word, mode) for term in segmentor.segment(sub)] for sub in sent_list]
167 | return res
168 |
169 |
170 | def jieba_cut(sent_list, segmentor, qualifier_word=None, mode='accurate', dict_loaded=False):
171 | if qualifier_word is None:
172 | if mode == 'accurate':
173 | res = [[word + '-' + flag for word, flag in segmentor.cut(sub)] for sub in sent_list]
174 | else:
175 | res = [[(word, flag) for word, flag in segmentor.cut(sub)] for sub in sent_list]
176 | else:
177 | res = [[reshape_term(term, qualifier_word, mode, dict_loaded) for term in segmentor.cut(sub)] for sub in sent_list]
178 | return res
179 |
180 |
181 | EMOJI_UNICODE = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\u2600-\u26FF\u2700-\u27BF]'
182 | REGEX_STR = [
183 | r'转发微博|欢迎转发|^回复|…{2,}|图片评论', # Weibo-specific stopwords
184 | r'<[^>]+>', # HTML tags
185 | r'/{0,2}@\w+-?\w*[::]?', # @-mentions
186 | # r'#.+#', # hash-tags
187 | # URLs
188 | # r'(?:https?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
189 | # r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-] +\.[a-zA-Z0-9-.] +\b' # E-MAIL
190 | r'[\s\w\d]+;'
191 | ]
192 | START_PATTERN = [
193 | r'\* ',
194 | r'\d{1,2}\.\d{1,2}\.\d{1,2}', # 1.2.1
195 | r'\d+\t',
196 | r'([1-9][0-9]){1,2}[。;::,,、\.\t/]{1}\s?(?![年月日\d+])',
197 | r'([1-9][0-9]){1,2}[))]{1}、?',
198 | r' \| ',
199 | r'\n[1-9][0-9]',
200 | r'\n{2,}',
201 | r'(?|&\w+;?|br\s*|li>', '', string)
219 | string = re.sub(invalid_char, '', string)
220 | string = re.sub(r'|||', '', string)
221 | string = re.sub(r'[ \u3000]+', 's_', string)
222 | string = re.sub(invalid_unicode, 'ss_', string)
223 | string = re.sub(lang_char, 'lan_', string)
224 | # string = re.sub(r'(工作描述|工作职责|岗位职责|任职要求)(:|:)', '', string)
225 | string = HanziConv.toSimplified(strQ2B(string))
226 | string = re.sub(
227 | r'[^\u4e00-\u9fa5\u0020-\u007f,。!?;、():\n\u2029\u2028a-zA-Z0-9]+', '', string)
228 | return string
229 |
230 |
231 | # sentence-splitting strategy (about 3x slower than splitting directly)
232 | def sent_split(string):
233 | string = re.sub(END_PATTERN, '\\1', re.sub(
234 | START_PATTERN, '\\1', string))
235 | return [item for item in re.split(r'\n|\u2029|\u2028|', string) if item != '']
236 |
237 |
238 | def strQ2B(ustring):
239 | """全角转半角"""
240 | rstring = ""
241 | for uchar in ustring:
242 | inside_code = ord(uchar)
243 | if inside_code == 12288:
244 | inside_code = 32
245 | elif (inside_code >= 65281 and inside_code <= 65374):
246 | inside_code -= 65248
247 | rstring += chr(inside_code)
248 | return rstring.lower().strip()
249 |
250 |
251 | '''
252 | File operations
253 | '''
254 |
255 |
256 | # read a text file line by line
257 | def read_line(fname):
258 | return open(fname, encoding='utf8').read().split('\n')
259 |
260 |
261 | # save an object to a text file
262 | def save_line(obj, fname='result.txt'):
263 | with open(fname, 'w', encoding='utf8') as f:
264 | if isinstance(obj, list):
265 | for k, v in enumerate(obj):
266 | v = str(v)
267 | if v != '\n' and k != len(obj) - 1:
268 | f.write(v + '\n')
269 | else:
270 | f.write(v)
271 | if isinstance(obj, collections.Counter) or isinstance(obj, dict):
272 | row = 0
273 | for k, v in sorted(obj.items(), key=lambda x: x[1], reverse=True):
274 | v = str(v)
275 | if str(v) != '\n' and k != len(obj) - 1:
276 | f.write(k + '\t' + str(v) + '\n')
277 | row += 1
278 | else:
279 | f.write(k + '\t' + str(v))
280 |
--------------------------------------------------------------------------------
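A small sketch of how the dictionary-matching helpers above work together with the Trie from trie.py; the phrases and the segmented sentence are illustrative:

# dictionary-based chunk lookup with Trie + tag_by_dict
from nlp_toolkit.chunk_segmentor.trie import Trie
from nlp_toolkit.chunk_segmentor.utils import tag_by_dict

tree = Trie()
for phrase in ['深度学习', '深度神经网络', '机器学习模型']:
    tree.insert(phrase)

# tag_by_dict scans a word-segmented sentence and returns inclusive [start, end]
# word-index spans whose concatenation is a phrase stored in the trie;
# overlapping spans are merged by combine_idx
words = ['我', '喜欢', '深度', '学习', '和', '机器', '学习', '模型']
print(tag_by_dict(words, tree))  # -> [[2, 3], [5, 7]]
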
/nlp_toolkit/classifier.py:
--------------------------------------------------------------------------------
1 | """
2 | Classifier Wrapper
3 | """
4 |
5 | import sys
6 | import time
7 | from nlp_toolkit.models import bi_lstm_attention
8 | from nlp_toolkit.models import Transformer
9 | from nlp_toolkit.models import textCNN, DPCNN
10 | from nlp_toolkit.trainer import Trainer
11 | from nlp_toolkit.utilities import logger
12 | from nlp_toolkit.sequence import BasicIterator
13 | from nlp_toolkit.data import Dataset
14 | from typing import List, Dict
15 | from copy import deepcopy
16 | from sklearn.metrics import classification_report
17 |
18 | # TODO
19 | # 1. evaluate func
20 | class Classifier(object):
21 | """
22 | Classifier Model Zoos. Include following models:
23 |
24 | 1. TextCNN
25 | 2. DPCNN (Deep Pyramid CNN)
26 | 3. Bi-LSTM-Attention
27 | 4. Multi-Head-Self-Attention (Transformer)
28 | 5. HAN (Hierarchical Attention Network)
29 | """
30 |
31 | def __init__(self, model_name, dataset: Dataset, seq_type='bucket'):
32 | self.model_name = model_name
33 | self.dataset = dataset
34 | self.transformer = dataset.transformer
35 | if dataset.mode == 'train':
36 | self.config = self.dataset.config
37 | self.m_cfg = self.config['model'][self.model_name]
38 | self.seq_type = seq_type
39 | if seq_type == 'bucket':
40 | self.config['maxlen'] = None
41 | self.model = self.get_model()
42 | self.model_trainer = self.get_trainer()
43 | elif dataset.mode == 'predict' or dataset.mode == 'eval':
44 | pass
45 | else:
46 | logger.warning('invalid mode name. Currently only supports "train", "eval" and "predict"')
47 |
48 | def get_model(self):
49 | if self.model_name == 'bi_lstm_att':
50 | model = bi_lstm_attention(
51 | nb_classes=self.config['nb_classes'],
52 | nb_tokens=self.config['nb_tokens'],
53 | maxlen=self.config['maxlen'],
54 | embedding_dim=self.config['embedding_dim'],
55 | embeddings=self.config['token_embeddings'],
56 | rnn_size=self.m_cfg['rnn_size'],
57 | attention_dim=self.m_cfg['attention_dim'],
58 | final_dropout_rate=self.m_cfg['final_drop_rate'],
59 | embed_dropout_rate=self.m_cfg['embed_drop_rate'],
60 | return_attention=self.m_cfg['return_att']
61 | )
62 | elif self.model_name == 'transformer':
63 | model = Transformer(
64 | nb_classes=self.config['nb_classes'],
65 | nb_tokens=self.config['nb_tokens'],
66 | maxlen=self.config['maxlen'],
67 | embedding_dim=self.config['embedding_dim'],
68 | embeddings=self.config['token_embeddings'],
69 | pos_embed=self.m_cfg['pos_embed'],
70 | nb_transformer=self.m_cfg['nb_transformer'],
71 | final_dropout_rate=self.m_cfg['final_drop_rate'],
72 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
73 | )
74 | elif self.model_name == 'text_cnn':
75 | model = textCNN(
76 | nb_classes=self.config['nb_classes'],
77 | nb_tokens=self.config['nb_tokens'],
78 | maxlen=self.config['maxlen'],
79 | embedding_dim=self.config['embedding_dim'],
80 | embeddings=self.config['token_embeddings'],
81 | conv_kernel_size=self.m_cfg['conv_kernel_size'],
82 | pool_size=self.m_cfg['pool_size'],
83 | nb_filters=self.m_cfg['nb_filters'],
84 | fc_size=self.m_cfg['fc_size'],
85 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
86 | )
87 | elif self.model_name == 'dpcnn':
88 | model = DPCNN(
89 | nb_classes=self.config['nb_classes'],
90 | nb_tokens=self.config['nb_tokens'],
91 | maxlen=self.config['maxlen'],
92 | embedding_dim=self.config['embedding_dim'],
93 | embeddings=self.config['token_embeddings'],
94 | region_kernel_size=self.m_cfg['region_kernel_size'],
95 | conv_kernel_size=self.m_cfg['conv_kernel_size'],
96 | pool_size=self.m_cfg['pool_size'],
97 | nb_filters=self.m_cfg['nb_filters'],
98 | repeat_time=self.m_cfg['repeat_time'],
99 | final_dropout_rate=self.m_cfg['final_drop_rate'],
100 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
101 | )
102 | else:
103 | logger.warning('The model name ' + self.model_name + ' is unknown')
104 | model = None
105 | return model
106 |
107 | def get_trainer(self):
108 | t_cfg = self.config['train']
109 | model_trainer = Trainer(
110 | self.model,
111 | model_name=self.model_name,
112 | task_type=self.config['task_type'],
113 | batch_size=t_cfg['batch_size'],
114 | max_epoch=t_cfg['epochs'],
115 | train_mode=t_cfg['train_mode'],
116 | fold_cnt=t_cfg['nb_fold'],
117 | test_size=t_cfg['test_size'],
118 | metric=['f1'],
119 | nb_bucket=t_cfg['nb_bucket'],
120 | patiences=t_cfg['patiences']
121 | )
122 | return model_trainer
123 |
124 | def train(self):
125 | if self.model_name == 'bi_lstm_att':
126 | return_att = self.m_cfg['return_att']
127 | else:
128 | return_att = False
129 | return self.model_trainer.train(
130 | self.dataset.texts, self.dataset.labels,
131 | self.transformer, self.seq_type, return_att)
132 |
133 | def predict(self, x: Dict[str, List[List[str]]], batch_size=64,
134 | return_attention=False, return_prob=False):
135 | n_labels = len(self.transformer._label_vocab._id2token)
136 | x_c = deepcopy(x)
137 | start = time.time()
138 | x_len = [item[-1] for item in x_c['token']]
139 | x_c['token'] = [item[:-1] for item in x_c['token']]
140 | x_seq = BasicIterator('classification', self.transformer,
141 | x_c, batch_size=batch_size)
142 | result = self.model.model.predict_generator(x_seq)
143 | if return_prob:
144 | y_pred = result[:, :n_labels]
145 | else:
146 | y_pred = self.transformer.inverse_transform(result[:, :n_labels])
147 | used_time = time.time() - start
148 | logger.info('predict {} samples used {:4.1f}s'.format(
149 | len(x['token']), used_time))
150 | if result.shape[1] > n_labels and self.model_name == 'bi_lstm_att':
151 | attention = result[:, n_labels:]
152 | attention = [attention[idx][:l] for idx, l in enumerate(x_len)]
153 | return y_pred, attention
154 | else:
155 | return y_pred
156 |
157 | def evaluate(self, x: Dict[str, List[List[str]]], y: List[str],
158 | batch_size=64):
159 | n_labels = len(self.transformer._label_vocab._id2token)
160 | y = [item[0] for item in y]
161 | x_c = deepcopy(x)
162 | x_len = [item[-1] for item in x_c['token']]
163 | x_c['token'] = [item[:-1] for item in x_c['token']]
164 | x_seq = BasicIterator('classification', self.transformer,
165 | x_c, batch_size=batch_size)
166 | result = self.model.model.predict_generator(x_seq)
167 | result = result[:, :n_labels]
168 | y_pred = self.transformer.inverse_transform(result, lengths=x_len)
169 | print(classification_report(y, y_pred))
170 |
171 | def load(self, weight_fname, para_fname):
172 | if self.model_name == 'bi_lstm_att':
173 | self.model = bi_lstm_attention.load(weight_fname, para_fname)
174 | elif self.model_name == 'transformer':
175 | self.model = Transformer.load(weight_fname, para_fname)
176 | elif self.model_name == 'text_cnn':
177 | self.model = textCNN.load(weight_fname, para_fname)
178 | elif self.model_name == 'dpcnn':
179 | self.model = DPCNN.load(weight_fname, para_fname)
180 | else:
181 | logger.warning('invalid model name')
182 | sys.exit()
183 |
--------------------------------------------------------------------------------
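A sketch of the intended training flow, assuming config_classification.yaml parses into a dict carrying the data / embed / train / model sections that Dataset and Classifier read above; the training file name is illustrative:

from ruamel.yaml import YAML
from nlp_toolkit.data import Dataset
from nlp_toolkit.classifier import Classifier

with open('config_classification.yaml', encoding='utf8') as fp:
    config = YAML().load(fp)

# 'train.txt' is an illustrative fastText-style file: __label__<class> <text> per line
dataset = Dataset(mode='train', fname='train.txt',
                  config=config, task_type='classification')
clf = Classifier('bi_lstm_att', dataset, seq_type='bucket')
clf.train()
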
/nlp_toolkit/config.py:
--------------------------------------------------------------------------------
1 | from ruamel.yaml import YAML
2 | from tensorflow.contrib.training import HParams
3 |
4 |
5 | class YParams(HParams):
6 | def __init__(self, yaml_fn, config_name):
7 | super().__init__()
8 | with open(yaml_fn, encoding='utf8') as fp:
9 | for k, v in YAML().load(fp)[config_name].items():
10 | self.add_hparam(k, v)
11 |
12 |
13 | if __name__ == "__main__":
14 | hparams = YParams('./config_classification.yaml', 'data')
15 | print(hparams.basic_token)
16 |
--------------------------------------------------------------------------------
/nlp_toolkit/data.py:
--------------------------------------------------------------------------------
1 | """
2 | Text preprocessing utilities
3 | """
4 |
5 | import re
6 | import os
7 | import sys
8 | from pathlib import Path
9 | from hanziconv import HanziConv
10 | from typing import Dict
11 | from nlp_toolkit.sequence import IndexTransformer
12 | from nlp_toolkit.utilities import load_vectors, logger, word2char
13 |
14 | EMOJI_UNICODE = r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\u2600-\u26FF\u2700-\u27BF]'
15 | REGEX_STR = [
16 | r'转发微博|欢迎转发|^回复|…{2,}|图片评论', # Weibo-specific stopwords
17 | r'<[^>]+>', # HTML tags
18 | r'/{0,2}@\w+-?\w*[::]?', # @-mentions
19 | r'#.+#', # hash-tags
20 | # URLs
21 | r'(?:https{0,1}?://|www\.)(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
22 | r'\b[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-] +\.[a-zA-Z0-9-.] +\b' # E-MAIL
23 | ]
24 | NEGATIVES = ['不', '没有', '无', '莫', '非', '没']
25 | ADVERSATIVES = ['但是', '但', '然而']
26 | SENT_SEP_LIST = r'[。!!?\?]+'
27 |
28 |
29 | # TODO
30 | class Dataset(object):
31 | """
32 | Clean text before further processing. Contains the following steps:
33 | 1. remove specific tokens (e.g. weibo emoticons, emojis, html tags etc.)
34 | 2. must contain Chinese character
35 | 3. simplify Chinese character
36 | 4. segment words supported by pyhanlp (removed)
37 | Then transforms texts and labels into index sequences according to the given task
38 |
39 | Data format by line:
40 | classification: __label__<class_1> __label__<class_2> ... <text>
41 | sequence labeling: token_1###label_1\ttoken_2###label_2\t... \ttoken_n###label_n
42 | language model: token_1 token_2 ... token_n
43 | """
44 |
45 | def __init__(self, mode, fname='', tran_fname='',
46 | config=None, task_type=None, data_format=''):
47 | self.mode = mode
48 | self.fname = fname
49 | self.inner_char = False
50 | self.use_seg = False
51 | self.use_radical = False
52 | self.radical_dict = None
53 |
54 | if data_format != '':
55 | self.data_format = data_format
56 |
57 | if config:
58 | self.basic_token = config['data']['basic_token']
59 | self.html_texts = re.compile(r'('+'|'.join(REGEX_STR)+')', re.UNICODE)
60 |
61 | if task_type:
62 | if mode == 'train' and config is None:
63 | logger.error('please specify the config file path')
64 | sys.exit()
65 | self.task_type = task_type
66 | else:
67 | try:
68 | self.task_type = re.findall(r'config_(\w+)\.yaml', config)[0]
69 | except:
70 | logger.error('please check your config filename')
71 | sys.exit()
72 |
73 | if mode == 'train':
74 | if 'data' in config:
75 | self.config = config
76 | self.data_config = config['data']
77 | self.embed_config = config['embed']
78 | if self.task_type == 'sequence':
79 | self.data_format = self.data_config['format']
80 | if self.basic_token == 'word':
81 | self.max_tokens = self.data_config['max_words']
82 | self.inner_char = self.data_config['inner_char']
83 | elif self.basic_token == 'char':
84 | self.max_tokens = self.data_config['max_chars']
85 | if self.task_type == 'sequence_labeling':
86 | self.use_seg = self.data_config['use_seg']
87 | self.use_radical = self.data_config['use_radical']
88 | if self.config['train']['metric'] not in ['f1_seq']:
89 | self.config['train']['metric'] = 'f1_seq'
90 | logger.warning('sequence labeling task currently only supports the f1_seq callback')
91 | elif self.task_type == 'classification':
92 | if self.config['train']['metric'] in ['f1_seq']:
93 | self.config['train']['metric'] = 'f1'
94 | logger.warning('text classification task does not support the f1_seq callback, changed to f1')
95 | else:
96 | logger.error('invalid token type, only support word and char')
97 | sys.exit()
98 | else:
99 | logger.error("please pass in the correct config dict")
100 | sys.exit()
101 |
102 | if self.basic_token == 'char':
103 | self.use_seg = config['data']['use_seg']
104 | self.use_radical = config['data']['use_radical']
105 |
106 | if self.use_radical:
107 | radical_file = Path(os.path.dirname(
108 | os.path.realpath(__file__))) / 'data' / 'radical.txt'
109 | self.radical_dict = {line.split()[0]: line.split()[1].strip()
110 | for line in open(radical_file, encoding='utf8')}
111 |
112 | self.transformer = IndexTransformer(
113 | task_type=self.task_type,
114 | max_tokens=self.max_tokens,
115 | max_inner_chars=self.data_config['max_inner_chars'],
116 | use_inner_char=self.inner_char,
117 | use_seg=self.use_seg,
118 | use_radical=self.use_radical,
119 | radical_dict=self.radical_dict,
120 | basic_token=self.basic_token)
121 |
122 | elif mode != 'train':
123 | if len(tran_fname) > 0:
124 | logger.info('transformer loaded')
125 | self.transformer = IndexTransformer.load(tran_fname)
126 | self.basic_token = self.transformer.basic_token
127 | self.use_seg = self.transformer.use_seg
128 | self.use_radical = self.transformer.use_radical
129 | self.inner_char = self.transformer.use_inner_char
130 | self.max_tokens = self.transformer.max_tokens
131 | else:
132 | logger.error("please pass in the transformer's filepath")
133 | sys.exit()
134 |
135 | if fname:
136 | self.load_data()
137 | self.fit()
138 | else:
139 | self.texts = []
140 | self.labels = []
141 |
142 | def clean(self, line):
143 | line = re.sub(r'\[[\u4e00-\u9fa5a-z]{1,4}\]|\[aloha\]', '', line)
144 | line = re.sub(EMOJI_UNICODE, '', line)
145 | line = re.sub(self.html_texts, '', line)
146 | if re.search(r'[\u4e00-\u9fa5]+', line):
147 | line = HanziConv.toSimplified(line)
148 | return re.sub(' {2,}|\t', ' ', line).lower()
149 | else:
150 | return None
151 |
152 | def load_data(self):
153 | if self.task_type == 'classification':
154 | self.load_tc_data()
155 | elif self.task_type == 'sequence_labeling':
156 | if self.mode != 'predict':
157 | self.load_sl_data()
158 | else:
159 | self.texts = [line.strip().split() for line in open(self.fname, 'r', encoding='utf8')]
160 | logger.info('data loaded')
161 |
162 | def load_tc_data(self, max_tokens_per_doc=256):
163 | """
164 | Reads a data file for text classification. The file should contain one document/text per line.
165 | The line should have the following format:
166 | __label__<class_name> <text>
167 | If you have a multi label task, you can have as many labels as you want at the beginning of the line, e.g.,
168 | __label__<class_name_1> __label__<class_name_2> <text>
169 | """
170 | label_prefix = '__label__'
171 | self.texts = []
172 | self.labels = []
173 |
174 | with open(self.fname, 'r', encoding='utf8') as fin:
175 | for line in fin:
176 | words = self.clean(line.strip()).split()
177 | if self.mode != 'predict':
178 | if words:
179 | nb_labels = 0
180 | label_line = []
181 | for word in words:
182 | if word.startswith(label_prefix):
183 | nb_labels += 1
184 | label = word.replace(label_prefix, "")
185 | label_line.append(label)
186 | else:
187 | break
188 | text = words[nb_labels:]
189 | if len(text) > max_tokens_per_doc:
190 | text = text[:max_tokens_per_doc]
191 | self.texts.append(text)
192 | self.labels.append(label_line)
193 | else:
194 | self.texts.append(words)
195 |
196 | def load_sl_data(self):
197 | """
198 | Reads a data file for sequence labeling. The file should contain one document/text per line.
199 | The line should have the following formats:
200 | 1. conll:
201 | word\ttag
202 | ...
203 | word\ttag
204 |
205 | word\ttag
206 | ...
207 | 2. basic:
208 | word###tag\tword###tag\t...word###tag
209 | """
210 | data = (line.strip() for line in open(self.fname, 'r', encoding='utf8'))
211 | if self.data_format == 'basic':
212 | self.texts, self.labels = zip(
213 | *[zip(*[item.rsplit('###', 1) for item in line.split('\t')]) for line in data])
214 | self.texts = list(map(list, self.texts))
215 | self.labels = list(map(list, self.labels))
216 | elif self.data_format == 'conll':
217 | self.texts, self.labels = self.process_conll(data)
218 | else:
219 | logger.warning('invalid data format for sequence labeling task')
220 | sys.exit()
221 |
222 | def process_conll(self, data):
223 | sents, labels = [], []
224 | tokens, tags = [], []
225 | for line in data:
226 | if line:
227 | token, tag = line.split('\t')
228 | tokens.append(token)
229 | tags.append(tag)
230 | else:
231 | sents.append(tokens)
232 | labels.append(tags)
233 | tokens, tags = [], []
234 | return sents, labels
235 |
236 | def add(self, line: Dict[str, str]):
237 | t = line['text'].strip().split()
238 | if self.mode == 'train':
239 | l = line['label'].strip().split()
240 | if self.task_type == 'sequence_labeling':
241 | assert len(t) == len(l)
242 | self.texts.append(t)
243 | self.labels.append(l)
244 | elif self.mode == 'predict':
245 | self.texts.append(t)
246 |
247 | # simple split on adversative conjunctions
248 | def adv_split(self, line):
249 | return re.sub('(' + '|'.join(ADVERSATIVES) + ')', r'', line)
250 |
251 | def fit(self):
252 | if self.mode != 'predict':
253 | if self.basic_token == 'char':
254 | if self.task_type == 'sequence_labeling':
255 | self.texts = [
256 | word2char(x, y, self.task_type, self.use_seg, self.radical_dict)
257 | for x, y in zip(self.texts, self.labels)]
258 | self.texts = {k: [dic[k] for dic in self.texts] for k in self.texts[0]}
259 | self.labels = self.texts['label']
260 | del self.texts['label']
261 | else:
262 | self.texts = {'token': [word2char(x, task_type=self.task_type) for x in self.texts]}
263 | else:
264 | self.texts = {'token': self.texts}
265 | if self.mode == 'train':
266 | self.config['mode'] = self.mode
267 | self.transformer.fit(self.texts['token'], self.labels)
268 | logger.info('transformer fitting complete')
269 | embed = {}
270 | if self.embed_config['pre']:
271 | token_embed, dim = load_vectors(
272 | self.embed_config[self.basic_token]['path'], self.transformer._token_vocab)
273 | embed[self.basic_token] = token_embed
274 | logger.info('Loaded Pre_trained Embeddings')
275 | else:
276 | logger.info('Use embeddings trained from scratch')
277 | dim = self.embed_config[self.basic_token]['dim']
278 | embed[self.basic_token] = None
279 | # update config
280 | self.config['nb_classes'] = self.transformer.label_size
281 | self.config['nb_tokens'] = self.transformer.token_vocab_size
282 | self.config['extra_features'] = []
283 | if self.inner_char:
284 | self.config['nb_char_tokens'] = self.transformer.char_vocab_size
285 | else:
286 | self.config['nb_char_tokens'] = 0
287 | self.config['use_inner_char'] = False
288 | if self.use_seg:
289 | self.config['nb_seg_tokens'] = self.transformer.seg_vocab_size
290 | self.config['extra_features'].append('seg')
291 | self.config['use_seg'] = self.use_seg
292 | else:
293 | self.config['nb_seg_tokens'] = 0
294 | self.config['use_seg'] = False
295 | if self.use_radical:
296 | self.config['nb_radical_tokens'] = self.transformer.radical_vocab_size
297 | self.config['extra_features'].append('radical')
298 | self.config['use_radical'] = self.use_radical
299 | else:
300 | self.config['nb_radical_tokens'] = 0
301 | self.config['use_radical'] = False
302 | self.config['embedding_dim'] = dim
303 | self.config['token_embeddings'] = embed[self.basic_token]
304 | self.config['maxlen'] = self.max_tokens
305 | self.config['task_type'] = self.task_type
306 | else:
307 | if self.basic_token == 'char':
308 | self.texts = [
309 | word2char(x, None, self.task_type, self.use_seg, self.radical_dict)
310 | for x in self.texts]
311 | self.texts = {k: [dic[k] for dic in self.texts]
312 | for k in self.texts[0]}
313 | else:
314 | self.texts = {'token': self.texts}
315 |
316 | lengths = [len(item) if len(item) <= self.max_tokens else self.max_tokens
317 | for item in self.texts['token']]
318 | self.texts['token'] = list(map(list, self.texts['token']))
319 | self.texts['token'] = [item + [lengths[idx]] for idx, item in enumerate(self.texts['token'])]
320 |
321 |
322 | # TODO
323 | class Sentence(object):
324 | """
325 | """
326 |
327 | def __init__(self, transformer):
328 | pass
329 |
--------------------------------------------------------------------------------
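The 'basic' sequence-labeling format described in load_sl_data can be illustrated in a couple of lines; the tokens and tag names below are made up for the example:

# how a 'basic'-format line is parsed (same rsplit('###', 1) logic as load_sl_data)
line = 'javascript###B-Chunk\t开发###E-Chunk\t经验###O'
tokens, tags = zip(*[item.rsplit('###', 1) for item in line.split('\t')])
print(list(tokens))  # ['javascript', '开发', '经验']
print(list(tags))    # ['B-Chunk', 'E-Chunk', 'O']
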
/nlp_toolkit/labeler.py:
--------------------------------------------------------------------------------
1 | """
2 | Sequence Labeler Wrapper
3 | """
4 |
5 | import sys
6 | import time
7 | import numpy as np
8 | from copy import deepcopy
9 | from nlp_toolkit.models import Word_RNN, IDCNN, Char_RNN
10 | from nlp_toolkit.trainer import Trainer
11 | from nlp_toolkit.utilities import logger
12 | from nlp_toolkit.sequence import BasicIterator
13 | from nlp_toolkit.data import Dataset
14 | from typing import List, Dict
15 | from seqeval.metrics import classification_report as sequence_report
16 |
17 |
18 | class Labeler(object):
19 | """
20 | Sequence Labeling Model Zoos. Include following models:
21 |
22 | 1. WordRNN + Inner_Char
23 | 2. CharRNN + Extra Embeddings (segment, radical, nchar)
24 | 3. IDCNN
25 | """
26 |
27 | def __init__(self, model_name, dataset: Dataset, seq_type='bucket'):
28 | self.model_name = model_name
29 | self.dataset = dataset
30 | self.transformer = dataset.transformer
31 | if dataset.mode == 'train':
32 | self.config = self.dataset.config
33 | self.m_cfg = self.config['model'][self.model_name]
34 | self.seq_type = seq_type
35 | if seq_type == 'bucket':
36 | self.config['maxlen'] = None
37 | self.model = self.get_model()
38 | self.model_trainer = self.get_trainer()
39 | elif dataset.mode == 'predict' or dataset.mode == 'eval':
40 | pass
41 | else:
42 | logger.warning('invalid mode name. Currently only supports "train", "eval" and "predict"')
43 |
44 | def get_model(self):
45 | if self.model_name == 'word_rnn':
46 | model = Word_RNN(
47 | nb_classes=self.config['nb_classes'],
48 | nb_tokens=self.config['nb_tokens'],
49 | nb_char_tokens=self.config['nb_char_tokens'],
50 | maxlen=self.config['maxlen'],
51 | embedding_dim=self.config['embedding_dim'],
52 | embeddings=self.config['token_embeddings'],
53 | inner_char=self.config['data']['inner_char'],
54 | use_crf=self.m_cfg['use_crf'],
55 | char_feature_method=self.m_cfg['char_feature_method'],
56 | integration_method=self.m_cfg['integration_method'],
57 | rnn_type=self.m_cfg['rnn_type'],
58 | nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
59 | nb_filters=self.m_cfg['nb_filters'],
60 | conv_kernel_size=self.m_cfg['conv_kernel_size'],
61 | drop_rate=self.m_cfg['drop_rate'],
62 | re_drop_rate=self.m_cfg['re_drop_rate'],
63 | word_rnn_size=self.m_cfg['word_rnn_size'],
64 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
65 | )
66 | elif self.model_name == 'char_rnn':
67 | model = Char_RNN(
68 | nb_classes=self.config['nb_classes'],
69 | nb_tokens=self.config['nb_tokens'],
70 | nb_seg_tokens=self.config['nb_seg_tokens'],
71 | nb_radical_tokens=self.config['nb_radical_tokens'],
72 | maxlen=self.config['maxlen'],
73 | embedding_dim=self.config['embedding_dim'],
74 | use_seg=self.config['use_seg'],
75 | use_radical=self.config['use_radical'],
76 | use_crf=self.m_cfg['use_crf'],
77 | rnn_type=self.m_cfg['rnn_type'],
78 | nb_rnn_layers=self.m_cfg['nb_rnn_layers'],
79 | drop_rate=self.m_cfg['drop_rate'],
80 | re_drop_rate=self.m_cfg['re_drop_rate'],
81 | char_rnn_size=self.m_cfg['char_rnn_size'],
82 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
83 | )
84 | elif self.model_name == 'idcnn':
85 | model = IDCNN(
86 | nb_classes=self.config['nb_classes'],
87 | nb_tokens=self.config['nb_tokens'],
88 | maxlen=self.config['maxlen'],
89 | embedding_dim=self.config['embedding_dim'],
90 | embeddings=self.config['token_embeddings'],
91 | use_crf=self.m_cfg['use_crf'],
92 | nb_filters=self.m_cfg['nb_filters'],
93 | conv_kernel_size=self.m_cfg['conv_kernel_size'],
94 | drop_rate=self.m_cfg['drop_rate'],
95 | repeat_times=self.m_cfg['repeat_times'],
96 | dilation_rate=self.m_cfg['dilation_rate'],
97 | embed_dropout_rate=self.m_cfg['embed_drop_rate']
98 | )
99 | else:
100 | logger.warning('The model name ' + self.model_name + ' is unknown')
101 | model = None
102 | return model
103 |
104 | def get_trainer(self):
105 | t_cfg = self.config['train']
106 | model_trainer = Trainer(
107 | self.model,
108 | model_name=self.model_name,
109 | task_type=self.config['task_type'],
110 | batch_size=t_cfg['batch_size'],
111 | max_epoch=t_cfg['epochs'],
112 | train_mode=t_cfg['train_mode'],
113 | fold_cnt=t_cfg['nb_fold'],
114 | test_size=t_cfg['test_size'],
115 | metric=['f1_seq', 'seq_acc'],
116 | nb_bucket=t_cfg['nb_bucket'],
117 | patiences=t_cfg['patiences']
118 | )
119 | return model_trainer
120 |
121 | def train(self):
122 | return self.model_trainer.train(
123 | self.dataset.texts, self.dataset.labels,
124 | self.transformer, self.seq_type)
125 |
126 | def predict(self, x: Dict[str, List[List[str]]], batch_size=64,
127 | return_prob=False):
128 | start = time.time()
129 | x_c = deepcopy(x)
130 | x_len = [item[-1] for item in x_c['token']]
131 | x_c['token'] = [item[:-1] for item in x_c['token']]
132 | x_seq = BasicIterator('sequence_labeling', self.transformer,
133 | x_c, batch_size=batch_size)
134 | result = self.model.model.predict_generator(x_seq)
135 | if return_prob:
136 | y_pred = [result[idx][:l] for idx, l in enumerate(x_len)]
137 | else:
138 | y_pred = self.transformer.inverse_transform(result, lengths=x_len)
139 | used_time = time.time() - start
140 | logger.info('predict {} samples used {:4.1f}s'.format(
141 | len(x['token']), used_time))
142 | return y_pred
143 |
144 | def show_results(self, x, y_pred):
145 | return [[(xi, yi) for xi, yi in zip(xs, ys)] for xs, ys in zip(x, y_pred)]
146 |
147 | def evaluate(self, x: Dict[str, List[List[str]]], y: List[List[str]],
148 | batch_size=64):
149 | x_c = deepcopy(x)
150 | x_len = [item[-1] for item in x_c['token']]
151 | x_c['token'] = [item[:-1] for item in x_c['token']]
152 | x_seq = BasicIterator('sequence_labeling', self.transformer,
153 | x_c, batch_size=batch_size)
154 | result = self.model.model.predict_generator(x_seq)
155 | y_pred = self.transformer.inverse_transform(result, lengths=x_len)
156 | print(sequence_report(y, y_pred))
157 |
158 | def load(self, weight_fname, para_fname):
159 | if self.model_name == 'word_rnn':
160 | self.model = Word_RNN.load(weight_fname, para_fname)
161 | elif self.model_name == 'char_rnn':
162 | self.model = Char_RNN.load(weight_fname, para_fname)
163 | elif self.model_name == 'idcnn':
164 | self.model = IDCNN.load(weight_fname, para_fname)
165 | else:
166 | logger.warning('invalid model name')
167 | sys.exit()
168 |
--------------------------------------------------------------------------------
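A sketch of inference with a trained sequence labeler; every file name below is illustrative and stands for the artifacts produced during training (the fitted transformer, model weights and model params):

from nlp_toolkit.data import Dataset
from nlp_toolkit.labeler import Labeler

# in predict mode the input file holds one whitespace-tokenized sentence per line,
# and the transformer fitted at training time is loaded via tran_fname
dataset = Dataset(mode='predict', fname='to_predict.txt',
                  tran_fname='word_rnn_transformer.pkl',
                  task_type='sequence_labeling')
labeler = Labeler('word_rnn', dataset)
labeler.load('word_rnn_weights.h5', 'word_rnn_params.json')
y_pred = labeler.predict(dataset.texts, batch_size=64)
print(y_pred[0])
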
/nlp_toolkit/models/__init__.py:
--------------------------------------------------------------------------------
1 | # text classification models
2 | from nlp_toolkit.models.base_model import Base_Model
3 | from nlp_toolkit.models.bi_lstm_att import bi_lstm_attention
4 | from nlp_toolkit.models.text_cnn import textCNN
5 | from nlp_toolkit.models.transformer import Transformer
6 | from nlp_toolkit.models.dpcnn import DPCNN
7 | # sequence labeling models
8 | from nlp_toolkit.models.word_rnn import Word_RNN
9 | from nlp_toolkit.models.char_rnn import Char_RNN
10 | from nlp_toolkit.models.idcnn import IDCNN
11 |
--------------------------------------------------------------------------------
/nlp_toolkit/models/base_model.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | class Base_Model(object):
5 | """
6 | Base Keras model for all SOTA models
7 | """
8 | def __init__(self):
9 | self.model = None
10 |
11 | def save(self, weights_file, params_file):
12 | self.save_weights(weights_file)
13 | self.save_params(params_file)
14 |
15 | def save_params(self, file_path, invalid_params={}):
16 | with open(file_path, 'w') as f:
17 | invalid_params = {'_loss', '_acc', 'model', 'invalid_params', 'token_embeddings'}.union(invalid_params)
18 | params = {name.lstrip('_'): val for name, val in vars(self).items()
19 | if name not in invalid_params}
20 | print('model hyperparameters:\n', params)
21 | json.dump(params, f, sort_keys=True, indent=4)
22 |
23 | def save_weights(self, filepath):
24 | self.model.save_weights(filepath)
25 |
26 | @classmethod
27 | def load(cls, weights_file, params_file):
28 | params = cls.load_params(params_file)
29 | self = cls(**params)
30 | self.forward()
31 | self.model.load_weights(weights_file)
32 | print('model loaded')
33 | return self
34 |
35 | @classmethod
36 | def load_params(cls, file_path):
37 | with open(file_path) as f:
38 | params = json.load(f)
39 | return params
40 |
--------------------------------------------------------------------------------
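The save/load contract in short: save_params() dumps the constructor arguments (minus the excluded attribute names) to JSON, and load() re-instantiates the class from that JSON, rebuilds the graph with forward(), then restores the weights. A sketch with illustrative file names; invalid_params is passed explicitly so non-serializable attributes such as the attention layer are skipped:

from nlp_toolkit.models import bi_lstm_attention

model = bi_lstm_attention(nb_classes=2, nb_tokens=10000, maxlen=100)
model.forward()                                              # build the Keras graph
model.save_weights('clf_weights.h5')
model.save_params('clf_params.json', model.invalid_params)   # skip non-JSON attributes
restored = bi_lstm_attention.load('clf_weights.h5', 'clf_params.json')
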
/nlp_toolkit/models/bi_lstm_att.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.attentions import Attention
3 | from nlp_toolkit.modules.token_embedders import Token_Embedding
4 | from nlp_toolkit.modules.logits import tc_output_logits
5 | from nlp_toolkit.modules.custom_loss import custom_binary_crossentropy, custom_categorical_crossentropy
6 | from keras.layers import Input, Activation
7 | from keras.layers import LSTM, Bidirectional
8 | from keras.layers.merge import concatenate
9 | from keras.models import Model
10 |
11 |
12 | class bi_lstm_attention(Base_Model):
13 | """
14 | Model is modified from DeepMoji.
15 |
16 | Model structure: double bi-lstm followed by attention with some dropout techniques
17 |
18 | # Arguments:
19 |         nb_classes: number of classes in the dataset.
20 |         nb_tokens: number of tokens in the dataset (i.e. vocabulary size).
21 |         maxlen: Maximum length of an input sequence (in tokens).
22 | embedding_dim: Embedding layer output dim.
23 | embeddings: Embedding weights. Default word embeddings.
24 | feature_output: If True the model returns the penultimate
25 | feature vector rather than Softmax probabilities
26 | (defaults to False).
27 | embed_dropout_rate: Dropout rate for the embedding layer.
28 | final_dropout_rate: Dropout rate for the final Softmax layer.
29 |         embed_l2: L2 regularization for the embedding layer.
30 |
31 | # Returns:
32 | Model with the given parameters.
33 | """
34 |
35 | def __init__(self, nb_classes, nb_tokens, maxlen,
36 | embedding_dim=256, embeddings=None,
37 | rnn_size=512, attention_dim=None,
38 | embed_dropout_rate=0,
39 | final_dropout_rate=0, embed_l2=1E-6,
40 | return_attention=False):
41 |         super(bi_lstm_attention, self).__init__()
42 | self.nb_classes = nb_classes
43 | self.nb_tokens = nb_tokens
44 | self.maxlen = maxlen
45 | self.embedding_dim = embedding_dim
46 | self.rnn_size = rnn_size
47 | self.attention_dim = attention_dim
48 | if embeddings is not None:
49 | self.token_embeddings = [embeddings]
50 | else:
51 | self.token_embeddings = None
52 | self.embed_dropout_rate = embed_dropout_rate
53 | self.final_dropout_rate = final_dropout_rate
54 | self.return_attention = return_attention
55 | self.attention_layer = Attention(
56 | attention_dim=attention_dim,
57 | return_attention=return_attention, name='attlayer')
58 |
59 | self.invalid_params = {'attention_layer'}
60 |
61 | def forward(self):
62 | model_input = Input(shape=(self.maxlen,), dtype='int32', name='token')
63 | x = Token_Embedding(model_input, self.nb_tokens, self.embedding_dim,
64 | self.token_embeddings, True, self.maxlen,
65 | self.embed_dropout_rate, name='token_embeddings')
66 | x = Activation('tanh')(x)
67 |
68 | # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
69 | # ordering of the way the merge is done is important for consistency with the pretrained model
70 | lstm_0_output = Bidirectional(
71 | LSTM(self.rnn_size, return_sequences=True), name="bi_lstm_0")(x)
72 | lstm_1_output = Bidirectional(
73 | LSTM(self.rnn_size, return_sequences=True), name="bi_lstm_1")(lstm_0_output)
74 | x = concatenate([lstm_1_output, lstm_0_output, x], name='concatenate')
75 |
76 | x = self.attention_layer(x)
77 | if self.return_attention:
78 | x, weights = x
79 | outputs = tc_output_logits(x, self.nb_classes, self.final_dropout_rate)
80 | if self.return_attention:
81 | outputs.append(weights)
82 | outputs = concatenate(outputs, axis=-1, name='outputs')
83 |
84 | self.model = Model(inputs=model_input,
85 | outputs=outputs, name="Bi_LSTM_Attention")
86 |
87 | def get_loss(self):
88 | if self.nb_classes == 2:
89 | if self.return_attention:
90 | return custom_binary_crossentropy
91 | else:
92 | return 'binary_crossentropy'
93 | elif self.nb_classes > 2:
94 | if self.return_attention:
95 | return custom_categorical_crossentropy
96 | else:
97 | return 'categorical_crossentropy'
98 |
99 | def get_metrics(self):
100 | return ['acc']
101 |
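A hedged usage sketch (sizes hypothetical): return_attention appends the attention weights to the output tensor, so get_loss() switches to the custom losses that score only the class columns of y_pred:

    from nlp_toolkit.models import bi_lstm_attention

    model = bi_lstm_attention(nb_classes=2, nb_tokens=20000, maxlen=100,
                              return_attention=True)
    model.forward()
    model.model.compile(loss=model.get_loss(),        # custom_binary_crossentropy here
                        optimizer='adam', metrics=model.get_metrics())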
--------------------------------------------------------------------------------
/nlp_toolkit/models/char_rnn.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.token_embedders import Token_Embedding
3 | from nlp_toolkit.modules.logits import sl_output_logits
4 | from keras.layers import Input, BatchNormalization
5 | from keras.layers import LSTM, GRU, Bidirectional
6 | from keras.layers.merge import concatenate
7 | from keras.models import Model
8 | import sys
9 |
10 |
11 | class Char_RNN(Base_Model):
12 | """
13 |     Similar model structure to Word_RNN, but uses the char as the basic token.
14 |     Some useful char-level features can be included: 1. radicals 2. segmentation tags 3. nchar
15 | """
16 |
17 | def __init__(self, nb_classes, nb_tokens, maxlen,
18 | embedding_dim=64, use_crf=True,
19 | use_seg=False, use_radical=False,
20 | use_nchar=False,
21 | nb_seg_tokens=None, nb_radical_tokens=None,
22 | rnn_type='lstm', nb_rnn_layers=2,
23 | char_rnn_size=128, drop_rate=0.5,
24 | re_drop_rate=0.15, embed_dropout_rate=0.15):
25 | self.nb_classes = nb_classes
26 | self.nb_tokens = nb_tokens
27 | self.maxlen = maxlen
28 | self.embedding_dim = embedding_dim
29 | self.use_crf = use_crf
30 | self.use_seg = use_seg
31 | self.use_radical = use_radical
32 |         self.use_nchar = use_nchar
33 | self.rnn_type = rnn_type
34 | self.nb_rnn_layers = nb_rnn_layers
35 | self.drop_rate = drop_rate
36 | self.re_drop_rate = re_drop_rate
37 | self.char_rnn_size = char_rnn_size
38 | self.embed_dropout_rate = embed_dropout_rate
39 | if use_seg:
40 | self.nb_seg_tokens = nb_seg_tokens
41 | if use_radical:
42 | self.nb_radical_tokens = nb_radical_tokens
43 |
44 | self.invalid_params = {}
45 |         super(Char_RNN, self).__init__()
46 |
47 | def forward(self):
48 | char_ids = Input(shape=(self.maxlen,), dtype='int32', name='token')
49 | input_data = [char_ids]
50 | char_embed = Token_Embedding(
51 | char_ids, self.nb_tokens,
52 | self.embedding_dim, None, True,
53 | self.maxlen, self.embed_dropout_rate, name='char_embeddings')
54 | embed_features = [char_embed]
55 | if self.use_seg:
56 | seg_ids = Input(shape=(self.maxlen,), dtype='int32', name='seg')
57 | input_data.append(seg_ids)
58 |             seg_embed = Token_Embedding(
59 | seg_ids, self.nb_seg_tokens, 8, None, True,
60 | self.maxlen, name='seg_embeddings')
61 |             embed_features.append(seg_embed)
62 | if self.use_radical:
63 | radical_ids = Input(shape=(self.maxlen,), dtype='int32', name='radical')
64 | input_data.append(radical_ids)
65 | radical_embed = Token_Embedding(
66 | radical_ids, self.nb_radical_tokens, 32,
67 | None, True, self.maxlen, name='radical_embeddings')
68 | embed_features.append(radical_embed)
69 | if self.use_nchar:
70 | pass
71 | if self.use_seg or self.use_radical:
72 | x = concatenate(embed_features, axis=-1, name='embed')
73 | else:
74 | x = char_embed
75 | x = BatchNormalization()(x)
76 |
77 | for i in range(self.nb_rnn_layers):
78 | if self.rnn_type == 'lstm':
79 | x = Bidirectional(
80 | LSTM(self.char_rnn_size, dropout=self.drop_rate,
81 | recurrent_dropout=self.re_drop_rate,
82 | return_sequences=True), name='char_lstm_%d' % (i+1))(x)
83 | elif self.rnn_type == 'gru':
84 | x = Bidirectional(
85 | GRU(self.char_rnn_size, dropout=self.drop_rate,
86 | recurrent_dropout=self.re_drop_rate,
87 | return_sequences=True), name='char_gru_%d' % (i+1))(x)
88 | else:
89 | print('invalid rnn type, only support lstm and gru')
90 | sys.exit()
91 |
92 | outputs, self._loss, self._acc = sl_output_logits(
93 | x, self.nb_classes, self.use_crf)
94 | self.model = Model(inputs=input_data, outputs=outputs)
95 |
96 | def get_loss(self):
97 | return self._loss
98 |
99 | def get_metrics(self):
100 | return self._acc
101 |
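A hedged instantiation sketch (all sizes hypothetical): enabling use_seg/use_radical requires passing the matching vocabulary sizes and adds extra named inputs next to 'token':

    from nlp_toolkit.models import Char_RNN

    tagger = Char_RNN(nb_classes=5, nb_tokens=6000, maxlen=100,
                      use_seg=True, nb_seg_tokens=6,
                      use_radical=True, nb_radical_tokens=300)
    tagger.forward()   # builds a model with 'token', 'seg' and 'radical' inputs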
--------------------------------------------------------------------------------
/nlp_toolkit/models/dpcnn.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.logits import tc_output_logits
3 | from nlp_toolkit.modules.token_embedders import Token_Embedding
4 | from keras.layers import Input, Dense, add, Activation
5 | from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
6 | from keras.layers.merge import concatenate
7 | from keras.models import Model
8 |
9 |
10 | class DPCNN(Base_Model):
11 | """
12 | Deep Pyramid CNN
13 |     Three key points of DPCNN:
14 | 1. region embeddings
15 | 2. fixed feature maps
16 | 3. residual connection
17 | """
18 |
19 | def __init__(self, nb_classes, nb_tokens, maxlen,
20 | embedding_dim=256, embeddings=None,
21 | region_kernel_size=[3, 4, 5],
22 | conv_kernel_size=3, nb_filters=250, pool_size=3,
23 | repeat_time=2,
24 | embed_dropout_rate=0.15, final_dropout_rate=0.25):
25 |         super(DPCNN, self).__init__()
26 | self.nb_classes = nb_classes
27 | self.nb_tokens = nb_tokens
28 | self.maxlen = maxlen
29 | self.embedding_dim = embedding_dim
30 | if embeddings is not None:
31 | self.token_embeddings = [embeddings]
32 | else:
33 | self.token_embeddings = None
34 | self.region_kernel_size = region_kernel_size
35 | self.conv_kernel_size = conv_kernel_size
36 | self.nb_filters = nb_filters
37 | self.pool_size = pool_size
38 | self.repeat_time = repeat_time
39 | self.embed_dropout_rate = embed_dropout_rate
40 | self.final_dropout_rate = final_dropout_rate
41 | self.invalid_params = {}
42 |
43 | def forward(self):
44 | model_input = Input(shape=(self.maxlen,), dtype='int32', name='token')
45 | # region embedding
46 | x = Token_Embedding(model_input, self.nb_tokens, self.embedding_dim,
47 | self.token_embeddings, False, self.maxlen,
48 | self.embed_dropout_rate, name='token_embeddings')
49 | if isinstance(self.region_kernel_size, list):
50 | region = [Conv1D(self.nb_filters, f, padding='same')(x)
51 | for f in self.region_kernel_size]
52 | region_embedding = add(region, name='region_embeddings')
53 | else:
54 | region_embedding = Conv1D(
55 | self.nb_filters, self.region_kernel_size, padding='same', name='region_embeddings')(x)
56 | # same padding convolution
57 | x = Activation('relu')(region_embedding)
58 | x = Conv1D(self.nb_filters, self.conv_kernel_size,
59 | padding='same', name='conv_1')(x)
60 | x = Activation('relu')(x)
61 | x = Conv1D(self.nb_filters, self.conv_kernel_size,
62 | padding='same', name='conv_2')(x)
63 | # residual connection
64 | x = add([x, region_embedding], name='pre_block_hidden')
65 |
66 | for k in range(self.repeat_time):
67 | x = self._block(x, k)
68 | x = GlobalMaxPooling1D()(x)
69 | outputs = tc_output_logits(x, self.nb_classes, self.final_dropout_rate)
70 |
71 | self.model = Model(inputs=model_input,
72 | outputs=outputs, name="Deep Pyramid CNN")
73 |
74 | def _block(self, x, k):
75 | x = MaxPooling1D(self.pool_size, strides=2)(x)
76 | last_x = x
77 | x = Activation('relu')(x)
78 | x = Conv1D(self.nb_filters, self.conv_kernel_size,
79 | padding='same', name='block_%d_conv_1' % k)(x)
80 | x = Activation('relu')(x)
81 | x = Conv1D(self.nb_filters, self.conv_kernel_size,
82 | padding='same', name='block_%d_conv_2' % k)(x)
83 | # residual connection
84 | x = add([x, last_x])
85 | return x
86 |
87 | def get_loss(self):
88 | if self.nb_classes == 2:
89 | return 'binary_crossentropy'
90 | elif self.nb_classes > 2:
91 | return 'categorical_crossentropy'
92 |
93 | def get_metrics(self):
94 | return ['acc']
95 |
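A hedged sketch (sizes hypothetical): a list-valued region_kernel_size sums several same-padding convolutions into the region embedding, and each repeated block roughly halves the time dimension (MaxPooling1D with stride 2), so maxlen should remain positive after repeat_time poolings:

    from nlp_toolkit.models import DPCNN

    model = DPCNN(nb_classes=4, nb_tokens=30000, maxlen=200,
                  region_kernel_size=[3, 4, 5], repeat_time=3)
    model.forward()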
--------------------------------------------------------------------------------
/nlp_toolkit/models/han.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevewyl/nlp_toolkit/257dabd300b29957a0be38e7a8049a54f2095ccc/nlp_toolkit/models/han.py
--------------------------------------------------------------------------------
/nlp_toolkit/models/idcnn.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.token_embedders import Token_Embedding
3 | from nlp_toolkit.modules.logits import sl_output_logits
4 | from keras.layers import Input, Dropout, Activation
5 | from keras.layers import Conv1D, MaxPooling1D
6 | from keras.layers.merge import concatenate
7 | from keras.models import Model
8 |
9 |
10 | class IDCNN(Base_Model):
11 | """
12 |     Iterated Dilated Convolutional Neural Networks (ID-CNN) with CRF
13 | """
14 |
15 | def __init__(self, nb_classes,
16 | nb_tokens,
17 | maxlen,
18 | embeddings=None,
19 | embedding_dim=64,
20 | embed_dropout_rate=0.25,
21 | drop_rate=0.5,
22 | nb_filters=64,
23 | conv_kernel_size=3,
24 | dilation_rate=[1, 1, 2],
25 | repeat_times=4,
26 | use_crf=True,
27 | ):
28 |         super(IDCNN, self).__init__()
29 | self.nb_classes = nb_classes
30 | self.nb_tokens = nb_tokens
31 | self.maxlen = maxlen
32 | self.embedding_dim = embedding_dim
33 | self.embed_dropout_rate = embed_dropout_rate
34 | self.drop_rate = drop_rate
35 | self.nb_filters = nb_filters
36 | self.conv_kernel_size = conv_kernel_size
37 | self.dilation_rate = dilation_rate
38 | self.repeat_times = repeat_times
39 | self.use_crf = use_crf
40 | if embeddings is not None:
41 | self.token_embeddings = [embeddings]
42 | else:
43 | self.token_embeddings = None
44 | self.invalid_params = {}
45 |
46 | def forward(self):
47 | word_ids = Input(shape=(self.maxlen,), dtype='int32', name='token')
48 | input_data = [word_ids]
49 | embed = Token_Embedding(word_ids, self.nb_tokens, self.embedding_dim,
50 | self.token_embeddings, False, self.maxlen,
51 | self.embed_dropout_rate, name='token_embeddings')
52 | layerInput = Conv1D(
53 | self.nb_filters, self.conv_kernel_size, padding='same', name='conv_first')(embed)
54 | dilation_layers = []
55 | totalWidthForLastDim = 0
56 | for j in range(self.repeat_times):
57 | for i in range(len(self.dilation_rate)):
58 | islast = True if i == len(self.dilation_rate) - 1 else False
59 | conv = Conv1D(self.nb_filters, self.conv_kernel_size, use_bias=True,
60 | padding='same', dilation_rate=self.dilation_rate[i],
61 | name='atrous_conv_%d_%d' % (j, i))(layerInput)
62 | conv = Activation('relu')(conv)
63 | if islast:
64 | dilation_layers.append(conv)
65 | totalWidthForLastDim += self.nb_filters
66 | layerInput = conv
67 | dilation_conv = concatenate(
68 | dilation_layers, axis=-1, name='dilated_conv')
69 |         # keep a defined tensor even when dropout is disabled
70 |         enc = Dropout(self.drop_rate)(dilation_conv) if self.drop_rate > 0 else dilation_conv
71 |
72 | outputs, self._loss, self._acc = sl_output_logits(
73 | enc, self.nb_classes, self.use_crf)
74 | self.model = Model(inputs=input_data, outputs=outputs)
75 |
76 | def get_loss(self):
77 | return self._loss
78 |
79 | def get_metrics(self):
80 | return self._acc
81 |
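A hedged sketch (sizes hypothetical): the dilated stack is applied repeat_times times and only the last dilation of each pass is kept before the CRF/softmax head; with use_crf=True the CRF loss and accuracy returned by get_loss()/get_metrics() must be passed to compile:

    from nlp_toolkit.models import IDCNN

    tagger = IDCNN(nb_classes=7, nb_tokens=20000, maxlen=150,
                   dilation_rate=[1, 1, 2], repeat_times=4, use_crf=True)
    tagger.forward()
    tagger.model.compile(loss=tagger.get_loss(), optimizer='adam',
                         metrics=tagger.get_metrics())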
--------------------------------------------------------------------------------
/nlp_toolkit/models/text_cnn.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.logits import tc_output_logits
3 | from nlp_toolkit.modules.token_embedders import Token_Embedding
4 | from keras.layers import Input, Dense, Flatten, Dropout
5 | from keras.layers import Conv1D, MaxPooling1D
6 | from keras.layers.merge import concatenate
7 | from keras.models import Model
8 |
9 |
10 | class textCNN(Base_Model):
11 | """
12 |     The well-known Kim CNN model for text classification.
13 |     It uses multi-channel CNNs to encode texts.
14 | """
15 |
16 | def __init__(self, nb_classes, nb_tokens, maxlen,
17 | embedding_dim=256, embeddings=None, embed_l2=1E-6,
18 | conv_kernel_size=[3, 4, 5], pool_size=[2, 2, 2],
19 | nb_filters=128, fc_size=128,
20 | embed_dropout_rate=0.25, final_dropout_rate=0.5):
21 |         super(textCNN, self).__init__()
22 | self.nb_classes = nb_classes
23 | self.nb_tokens = nb_tokens
24 | self.maxlen = maxlen
25 | self.embedding_dim = embedding_dim
26 | self.nb_filters = nb_filters
27 | self.pool_size = pool_size
28 | self.conv_kernel_size = conv_kernel_size
29 | self.fc_size = fc_size
30 | self.final_dropout_rate = final_dropout_rate
31 | self.embed_dropout_rate = embed_dropout_rate
32 |
33 | # core layer: multi-channel cnn-pool layers
34 | self.cnn_list = [Conv1D(
35 | nb_filters, f, padding='same', name='conv_%d' % k) for k, f in enumerate(conv_kernel_size)]
36 | self.pool_list = [MaxPooling1D(p, name='pool_%d' % k)
37 | for k, p in enumerate(pool_size)]
38 | self.fc = Dense(fc_size, activation='relu',
39 | kernel_initializer='he_normal')
40 | if embeddings is not None:
41 | self.token_embeddings = [embeddings]
42 | else:
43 | self.token_embeddings = None
44 | self.invalid_params = {'cnn_list', 'pool_list', 'fc'}
45 |
46 | def forward(self):
47 | model_input = Input(shape=(self.maxlen,), dtype='int32', name='token')
48 | x = Token_Embedding(model_input, self.nb_tokens, self.embedding_dim,
49 | self.token_embeddings, False, self.maxlen,
50 | self.embed_dropout_rate, name='token_embeddings')
51 | cnn_combine = []
52 | for i in range(len(self.conv_kernel_size)):
53 | cnn = self.cnn_list[i](x)
54 | pool = self.pool_list[i](cnn)
55 | cnn_combine.append(pool)
56 | x = concatenate(cnn_combine, axis=-1)
57 |
58 | x = Flatten()(x)
59 | x = Dropout(self.final_dropout_rate)(x)
60 | x = self.fc(x)
61 |
62 | outputs = tc_output_logits(x, self.nb_classes, self.final_dropout_rate)
63 |
64 | self.model = Model(inputs=model_input,
65 | outputs=outputs, name="TextCNN")
66 |
67 | def get_loss(self):
68 | if self.nb_classes == 2:
69 | return 'binary_crossentropy'
70 | elif self.nb_classes > 2:
71 | return 'categorical_crossentropy'
72 |
73 | def get_metrics(self):
74 | return ['acc']
75 |
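A brief sketch (sizes hypothetical): conv_kernel_size and pool_size are parallel lists, one conv/pool branch per entry, so they should have the same length:

    from nlp_toolkit.models import textCNN

    model = textCNN(nb_classes=2, nb_tokens=20000, maxlen=100,
                    conv_kernel_size=[2, 3, 4], pool_size=[2, 2, 2])
    model.forward()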
--------------------------------------------------------------------------------
/nlp_toolkit/models/transformer.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.attentions import Self_Attention
3 | from nlp_toolkit.modules.token_embedders import Position_Embedding
4 | from nlp_toolkit.modules.token_embedders import Token_Embedding
5 | from nlp_toolkit.modules.logits import tc_output_logits
6 | from keras.layers import Input, GlobalAveragePooling1D
7 | from keras.models import Model
8 |
9 |
10 | class Transformer(Base_Model):
11 | """
12 |     Multi-Head Self-Attention Model.
13 |     Uses the Transformer encoder architecture to encode texts.
14 |
15 |     # Arguments:
16 |         1. nb_transformer: the number of self-attention layers.
17 |         2. nb_head: the number of attention heads in one layer
18 |         3. head_size: the hidden size of each attention head
19 |         4. pos_embed: whether to use position embeddings
20 | """
21 |
22 | def __init__(self, nb_classes, nb_tokens, maxlen,
23 | nb_head=8, head_size=16, nb_transformer=2,
24 | embedding_dim=256, embeddings=None, embed_l2=1E-6,
25 | pos_embed=False, final_dropout_rate=0.15,
26 | embed_dropout_rate=0.15):
27 | self.nb_classes = nb_classes
28 | self.nb_tokens = nb_tokens
29 | self.maxlen = maxlen
30 | self.nb_head = nb_head
31 | self.head_size = head_size
32 | self.embedding_dim = embedding_dim
33 | self.nb_transformer = nb_transformer
34 | if embeddings is not None:
35 | self.token_embeddings = [embeddings]
36 | else:
37 | self.token_embeddings = None
38 | self.pos_embed = pos_embed
39 | self.final_dropout_rate = final_dropout_rate
40 | self.embed_dropout_rate = embed_dropout_rate
41 | self.pos_embed_layer = Position_Embedding(name='position_embedding')
42 | self.transformers = [Self_Attention(
43 | nb_head, head_size, name='self_attention_%d' % i) for i in range(nb_transformer)]
44 | self.pool = GlobalAveragePooling1D()
45 | self.invalid_params = {'pos_embed_layer', 'transformers', 'pool'}
46 |
47 | def forward(self):
48 | model_input = Input(shape=(self.maxlen,), dtype='int32', name='token')
49 | x = Token_Embedding(model_input, self.nb_tokens, self.embedding_dim,
50 | self.token_embeddings, False, self.maxlen,
51 | self.embed_dropout_rate, name='token_embeddings')
52 | if self.pos_embed:
53 | x = self.pos_embed_layer(x)
54 | for i in range(self.nb_transformer):
55 | x = self.transformers[i]([x, x, x])
56 | x = self.pool(x)
57 | outputs = tc_output_logits(x, self.nb_classes, self.final_dropout_rate)
58 | self.model = Model(inputs=model_input,
59 | outputs=outputs, name="Self_Multi_Head_Attention")
60 |
61 | def get_loss(self):
62 | if self.nb_classes == 2:
63 | return 'binary_crossentropy'
64 | elif self.nb_classes > 2:
65 | return 'categorical_crossentropy'
66 |
67 | def get_metrics(self):
68 | return ['acc']
69 |
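A hedged sketch (sizes hypothetical): each self-attention layer outputs nb_head * head_size channels, and pos_embed=True sums sinusoidal position information into the token embeddings before the first layer:

    from nlp_toolkit.models import Transformer

    model = Transformer(nb_classes=3, nb_tokens=20000, maxlen=120,
                        nb_head=8, head_size=16, nb_transformer=2, pos_embed=True)
    model.forward()   # embeddings -> position embedding -> 2 self-attention layers -> pooling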
--------------------------------------------------------------------------------
/nlp_toolkit/models/word_rnn.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.models import Base_Model
2 | from nlp_toolkit.modules.token_embedders import Token_Embedding
3 | from nlp_toolkit.modules.logits import sl_output_logits
4 | from keras.layers import Input, Activation, TimeDistributed, Dense
5 | from keras.layers import LSTM, GRU, Bidirectional
6 | from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
7 | from keras.layers import subtract, multiply, add, Lambda
8 | from keras.layers.merge import concatenate
9 | from keras.models import Model
10 | import keras.backend as K
11 | import sys
12 |
13 |
14 | class Word_RNN(Base_Model):
15 | """
16 |     Baseline sequence labeling model. The basic token is the word.
17 |     Supports the following extensions:
18 |     1. Extract inner-char features with an LSTM or CNN
19 |     2. Combine word and char features via concatenation or attention
20 | """
21 |
22 | def __init__(self, nb_classes, nb_tokens, maxlen,
23 | nb_char_tokens=None, max_charlen=10,
24 | embedding_dim=128, char_embedding_dim=32,
25 | word_rnn_size=128, char_rnn_size=32,
26 | embeddings=None, char_embeddings=None,
27 | inner_char=False, use_crf=True,
28 | char_feature_method='rnn',
29 | integration_method='concat',
30 | rnn_type='lstm',
31 | nb_rnn_layers=1,
32 | nb_filters=32,
33 | conv_kernel_size=2,
34 | drop_rate=0.5,
35 | re_drop_rate=0.15,
36 | embed_l2=1E-6,
37 | embed_dropout_rate=0.15):
38 |         super(Word_RNN, self).__init__()
39 | self.nb_classes = nb_classes
40 | self.nb_tokens = nb_tokens
41 | self.maxlen = maxlen
42 | self.embedding_dim = embedding_dim
43 | self.rnn_type = rnn_type
44 | self.nb_rnn_layers = nb_rnn_layers
45 | self.drop_rate = drop_rate
46 | self.re_drop_rate = re_drop_rate
47 | self.use_crf = use_crf
48 | self.inner_char = inner_char
49 | self.word_rnn_size = word_rnn_size
50 | self.embed_dropout_rate = embed_dropout_rate
51 |
52 | if self.inner_char:
53 | self.integration_method = integration_method
54 | self.char_feature_method = char_feature_method
55 | self.max_charlen = max_charlen
56 | self.nb_char_tokens = nb_char_tokens
57 | self.char_embedding_dim = char_embedding_dim
58 | if char_feature_method == 'rnn':
59 | if self.integration_method == 'attention':
60 | self.char_rnn_size = int(self.embedding_dim / 2)
61 | else:
62 | self.char_rnn_size = char_rnn_size
63 | elif char_feature_method == 'cnn':
64 | self.nb_filters = nb_filters
65 | self.conv_kernel_size = conv_kernel_size
66 | if self.integration_method == 'attention':
67 | self.nb_filters = self.embedding_dim
68 | if embeddings is not None:
69 | self.token_embeddings = [embeddings]
70 | else:
71 | self.token_embeddings = None
72 | if char_feature_method == 'rnn':
73 | self.mask_zero = True
74 | else:
75 | self.mask_zero = False
76 | self.char_lstm = LSTM(char_rnn_size, return_sequences=False)
77 | self.char_gru = GRU(char_rnn_size, return_sequences=False)
78 | self.conv = Conv1D(
79 | kernel_size=conv_kernel_size, filters=self.nb_filters, padding='same')
80 | self.fc_tanh = Dense(
81 | embedding_dim, kernel_initializer="glorot_uniform", activation='tanh')
82 | self.fc_sigmoid = Dense(embedding_dim, activation='sigmoid')
83 |
84 | self.invalid_params = {'char_lstm', 'char_gru', 'mask_zero',
85 | 'conv', 'fc_tanh', 'fc_sigmoid'}
86 |
87 | def forward(self):
88 | word_ids = Input(shape=(self.maxlen,), dtype='int32', name='token')
89 | input_data = [word_ids]
90 | x = Token_Embedding(word_ids, self.nb_tokens, self.embedding_dim,
91 | self.token_embeddings, True, self.maxlen,
92 | self.embed_dropout_rate)
93 |
94 | # char features
95 | if self.inner_char:
96 | char_ids = Input(batch_shape=(None, None, None),
97 | dtype='int32', name='char')
98 | input_data.append(char_ids)
99 | x_c = Token_Embedding(
100 | char_ids, input_dim=self.nb_char_tokens,
101 | output_dim=self.char_embedding_dim,
102 | mask_zero=self.mask_zero, name='char_embeddings',
103 | time_distributed=True)
104 | if self.char_feature_method == 'rnn':
105 | if self.rnn_type == 'lstm':
106 | char_feature = TimeDistributed(
107 | Bidirectional(self.char_lstm), name="char_lstm")(x_c)
108 | elif self.rnn_type == 'gru':
109 | char_feature = TimeDistributed(
110 | Bidirectional(self.char_gru), name="char_gru")(x_c)
111 | else:
112 | print('invalid rnn type, only support lstm and gru')
113 | sys.exit()
114 | elif self.char_feature_method == 'cnn':
115 | conv1d_out = TimeDistributed(self.conv, name='char_cnn')(x_c)
116 | char_feature = TimeDistributed(
117 | GlobalMaxPooling1D(), name='char_pooling')(conv1d_out)
118 | if self.integration_method == 'concat':
119 | concat_tensor = concatenate([x, char_feature], axis=-1, name='concat_feature')
120 | elif self.integration_method == 'attention':
121 | word_embed_dense = self.fc_tanh(x)
122 | char_embed_dense = self.fc_tanh(char_feature)
123 | attention_evidence_tensor = add(
124 | [word_embed_dense, char_embed_dense])
125 | attention_output = self.fc_sigmoid(attention_evidence_tensor)
126 | part1 = multiply([attention_output, x])
127 | tmp = subtract([Lambda(lambda x: K.ones_like(x))(
128 | attention_output), attention_output])
129 | part2 = multiply([tmp, char_feature])
130 | concat_tensor = add([part1, part2], name='attention_feature')
131 |
132 | # rnn encoder
133 | if self.inner_char:
134 | enc = concat_tensor
135 | else:
136 | enc = x
137 | for i in range(self.nb_rnn_layers):
138 | if self.rnn_type == 'lstm':
139 | enc = Bidirectional(
140 | LSTM(self.word_rnn_size, dropout=self.drop_rate,
141 | recurrent_dropout=self.re_drop_rate,
142 | return_sequences=True), name='word_lstm_%d' % (i+1))(enc)
143 | elif self.rnn_type == 'gru':
144 | enc = Bidirectional(
145 | GRU(self.word_rnn_size, dropout=self.drop_rate,
146 | recurrent_dropout=self.re_drop_rate,
147 | return_sequences=True), name='word_gru_%d' % (i+1))(enc)
148 | else:
149 | print('invalid rnn type, only support lstm and gru')
150 | sys.exit()
151 |
152 | # output logits
153 | outputs, self._loss, self._acc = sl_output_logits(
154 | enc, self.nb_classes, self.use_crf)
155 | self.model = Model(inputs=input_data, outputs=outputs)
156 |
157 | def get_loss(self):
158 | return self._loss
159 |
160 | def get_metrics(self):
161 | return self._acc
162 |
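A hedged sketch (sizes hypothetical): with inner_char=True the model expects a second 'char' input holding per-word char ids, as produced by IndexTransformer with use_inner_char=True:

    from nlp_toolkit.models import Word_RNN

    tagger = Word_RNN(nb_classes=10, nb_tokens=30000, maxlen=80,
                      inner_char=True, nb_char_tokens=4000,
                      char_feature_method='cnn', integration_method='concat')
    tagger.forward()   # inputs: 'token' (batch, maxlen) and 'char' (batch, None, None)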
--------------------------------------------------------------------------------
/nlp_toolkit/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/stevewyl/nlp_toolkit/257dabd300b29957a0be38e7a8049a54f2095ccc/nlp_toolkit/modules/__init__.py
--------------------------------------------------------------------------------
/nlp_toolkit/modules/attentions/__init__.py:
--------------------------------------------------------------------------------
1 | from .attention import Attention
2 | from .self_attention import Self_Attention
3 | from .multi_dim_attention import Multi_Dim_Attention
4 |
--------------------------------------------------------------------------------
/nlp_toolkit/modules/attentions/attention.py:
--------------------------------------------------------------------------------
1 | from keras.engine import Layer
2 | from keras import backend as K
3 |
4 |
5 | class Attention(Layer):
6 | """
7 | Basic attention layer.
8 |     Attention layers are normally used to find the tokens that matter most for a given label.
9 |     Uses the 'max trick' (subtracting the max score before exp) for numerical stability.
10 | # Arguments:
11 | 1. use_bias: whether to use bias
12 | 2. use_context: whether to use context vector
13 | 3. return_attention: whether to return attention weights as part of output
14 | 4. attention_dim: dimensionality of the inner attention
15 | 5. activation: whether to use activation func in first MLP
16 | # Inputs:
17 | Tensor with shape (batch_size, time_steps, hidden_size)
18 | # Returns:
19 | Tensor with shape (batch_size, hidden_size)
20 | If return attention weight,
21 | an additional tensor with shape (batch_size, time_steps) will be returned.
22 | """
23 |
24 | def __init__(self,
25 | use_bias=True,
26 | use_context=True,
27 | return_attention=False,
28 | attention_dim=None,
29 | activation=True,
30 | **kwargs):
31 | self.use_bias = use_bias
32 | self.use_context = use_context
33 | self.return_attention = return_attention
34 | self.attention_dim = attention_dim
35 | self.activation = activation
36 | super(Attention, self).__init__(**kwargs)
37 |
38 | def build(self, input_shape):
39 | if len(input_shape) < 3:
40 | raise ValueError(
41 | "Expected input shape of `(batch_size, time_steps, features)`, found `{}`".format(input_shape))
42 | if self.attention_dim is None:
43 | attention_dim = input_shape[-1]
44 | else:
45 | attention_dim = self.attention_dim
46 |
47 | self.kernel = self.add_weight(name='kernel',
48 | shape=(input_shape[-1], attention_dim),
49 | initializer="glorot_normal",
50 | trainable=True)
51 | if self.use_bias:
52 | self.bias = self.add_weight(name='bias',
53 | shape=(attention_dim,),
54 | initializer="zeros",
55 | trainable=True)
56 | else:
57 | self.bias = None
58 | if self.use_context:
59 | self.context_kernel = self.add_weight(name='context_kernel',
60 | shape=(attention_dim, 1),
61 | initializer="glorot_normal",
62 | trainable=True)
63 | else:
64 | self.context_kernel = None
65 |
66 | super(Attention, self).build(input_shape)
67 |
68 | def call(self, x, mask=None):
69 | # MLP
70 | ut = K.dot(x, self.kernel)
71 | if self.use_bias:
72 | ut = K.bias_add(ut, self.bias)
73 | if self.activation:
74 | ut = K.tanh(ut)
75 |         if self.use_context:
76 | ut = K.dot(ut, self.context_kernel)
77 | ut = K.squeeze(ut, axis=-1)
78 | # softmax
79 | at = K.exp(ut - K.max(ut, axis=-1, keepdims=True))
80 | if mask is not None:
81 | at *= K.cast(mask, K.floatx())
82 | att_weights = at / (K.sum(at, axis=1, keepdims=True) + K.epsilon())
83 | # output
84 | atx = x * K.expand_dims(att_weights, axis=-1)
85 | output = K.sum(atx, axis=1)
86 | if self.return_attention:
87 | return [output, att_weights]
88 | return output
89 |
90 | def compute_mask(self, input, input_mask=None):
91 | if isinstance(input_mask, list):
92 | return [None] * len(input_mask)
93 | else:
94 | return None
95 |
96 | def compute_output_shape(self, input_shape):
97 | output_len = input_shape[2]
98 | if self.return_attention:
99 | return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
100 | return (input_shape[0], output_len)
101 |
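In the notation of call() above (mine, not from the source), with context vector $u_c$, optional mask $m_t$ and timestep inputs $x_t$, the layer computes

    u_t = \tanh(W x_t + b), \qquad s_t = u_t^{\top} u_c, \qquad
    a_t = \frac{m_t \exp\!\big(s_t - \max_{t'} s_{t'}\big)}{\sum_{t'} m_{t'} \exp\!\big(s_{t'} - \max_{t''} s_{t''}\big) + \epsilon}, \qquad
    \mathrm{output} = \sum_t a_t\, x_t

where subtracting the maximum score before the exponential is the 'max trick' mentioned in the docstring.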
--------------------------------------------------------------------------------
/nlp_toolkit/modules/attentions/multi_dim_attention.py:
--------------------------------------------------------------------------------
1 | from keras.engine import Layer
2 | from keras import backend as K
3 | from keras import initializers
4 |
5 |
6 | class Multi_Dim_Attention(Layer):
7 | """
8 | 2D attention from "A Structured Self-Attentive Sentence Embedding" (2017)
9 | """
10 |
11 | def __init__(self, ws1, ws2, punish, init='glorot_normal', **kwargs):
12 | self.kernel_initializer = initializers.get(init)
13 | self.weight_ws1 = ws1
14 | self.weight_ws2 = ws2
15 | self.punish = punish
16 | super(Multi_Dim_Attention, self).__init__(** kwargs)
17 |
18 | def build(self, input_shape):
19 | self.Ws1 = self.add_weight(shape=(input_shape[-1], self.weight_ws1),
20 | initializer=self.kernel_initializer,
21 | trainable=True,
22 | name='{}_Ws1'.format(self.name))
23 | self.Ws2 = self.add_weight(shape=(self.weight_ws1, self.weight_ws2),
24 | initializer=self.kernel_initializer,
25 | trainable=True,
26 | name='{}_Ws2'.format(self.name))
27 | self.batch_size = input_shape[0]
28 | super(Multi_Dim_Attention, self).build(input_shape)
29 |
30 | def compute_mask(self, input, input_mask=None):
31 | return None
32 |
33 | def call(self, x, mask=None):
34 | uit = K.tanh(K.dot(x, self.Ws1))
35 | ait = K.dot(uit, self.Ws2)
36 | ait = K.permute_dimensions(ait, (0, 2, 1))
37 | A = K.softmax(ait, axis=1)
38 | M = K.batch_dot(A, x)
39 | if self.punish:
40 | A_T = K.permute_dimensions(A, (0, 2, 1))
41 | tile_eye = K.tile(K.eye(self.weight_ws2), [self.batch_size, 1])
42 | tile_eye = K.reshape(
43 | tile_eye, shape=[-1, self.weight_ws2, self.weight_ws2])
44 | AA_T = K.batch_dot(A, A_T) - tile_eye
45 | P = K.l2_normalize(AA_T, axis=(1, 2))
46 | return M, P
47 | else:
48 | return M
49 |
50 | def compute_output_shape(self, input_shape):
51 | if self.punish:
52 | out1 = (input_shape[0], self.weight_ws2, input_shape[-1])
53 | out2 = (input_shape[0], self.weight_ws2, self.weight_ws2)
54 | return [out1, out2]
55 | else:
56 | return (input_shape[0], self.weight_ws2, input_shape[-1])
57 |
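For reference (paper notation, not extracted from this file), the 2D attention of the cited paper, "A Structured Self-Attentive Sentence Embedding" (Lin et al., 2017), which the Ws1/Ws2 weights and the punish option correspond to, is

    A = \mathrm{softmax}\!\big(W_{s2}\,\tanh(W_{s1} H^{\top})\big), \qquad M = A H, \qquad
    P = \big\lVert A A^{\top} - I \big\rVert_F^2

where $H$ is the (time x hidden) input, $A$ the (ws2 x time) attention matrix, $M$ the sentence embedding and $P$ the redundancy penalty added when punish is enabled.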
--------------------------------------------------------------------------------
/nlp_toolkit/modules/attentions/self_attention.py:
--------------------------------------------------------------------------------
1 | from keras.engine import Layer
2 | from keras import backend as K
3 |
4 |
5 | class Self_Attention(Layer):
6 | """
7 |     Multi-head attention layer defined in "Attention Is All You Need".
8 |     To use it as self-attention, pass in the same tensor three times.
9 | https://github.com/bojone/attention/blob/master/attention_keras.py
10 | """
11 |
12 | def __init__(self, nb_head, size_per_head, **kwargs):
13 | self.nb_head = nb_head
14 | self.size_per_head = size_per_head
15 | self.output_dim = nb_head*size_per_head
16 | super(Self_Attention, self).__init__(**kwargs)
17 |
18 | def build(self, input_shape):
19 | self.WQ = self.add_weight(name='WQ',
20 | shape=(input_shape[0][-1], self.output_dim),
21 | initializer='glorot_uniform',
22 | trainable=True)
23 | self.WK = self.add_weight(name='WK',
24 | shape=(input_shape[1][-1], self.output_dim),
25 | initializer='glorot_uniform',
26 | trainable=True)
27 | self.WV = self.add_weight(name='WV',
28 | shape=(input_shape[2][-1], self.output_dim),
29 | initializer='glorot_uniform',
30 | trainable=True)
31 | super(Self_Attention, self).build(input_shape)
32 |
33 | def Mask(self, inputs, seq_len, mode='mul'):
34 | """
35 | # Arguments:
36 | inputs: input tensor with shape (batch_size, seq_len, input_size)
37 | seq_len: Each sequence's actual length with shape (batch_size,)
38 | mode:
39 | mul: mask the rest dim with zero, used before fully-connected layer
40 | add: subtract a big constant from the rest, used before softmax layer
41 |         # Returns:
42 | Masked tensors with the same shape of input tensor
43 | """
44 | if seq_len is None:
45 | return inputs
46 | else:
47 | mask = K.one_hot(seq_len[:, 0], K.shape(inputs)[1])
48 | mask = 1 - K.cumsum(mask, 1)
49 | for _ in range(len(inputs.shape) - 2):
50 | mask = K.expand_dims(mask, 2)
51 | if mode == 'mul':
52 | return inputs * mask
53 | if mode == 'add':
54 | return inputs - (1 - mask) * 1e12
55 |
56 | def call(self, x):
57 |         # if only [Q_seq, K_seq, V_seq] are passed in, no Mask operation is applied
58 |         # if [Q_len, V_len] are also passed in, Mask is applied to the padded positions
59 | if len(x) == 3:
60 | Q_seq, K_seq, V_seq = x
61 | Q_len, V_len = None, None
62 | elif len(x) == 5:
63 | Q_seq, K_seq, V_seq, Q_len, V_len = x
64 | # linear transformation of Q, K, V
65 | Q_seq = K.dot(Q_seq, self.WQ)
66 | Q_seq = K.reshape(
67 | Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
68 | Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
69 | K_seq = K.dot(K_seq, self.WK)
70 | K_seq = K.reshape(
71 | K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
72 | K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
73 | V_seq = K.dot(V_seq, self.WV)
74 | V_seq = K.reshape(
75 | V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
76 | V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
77 | # compute inner product, then mask, then softmax
78 | A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
79 | A = K.permute_dimensions(A, (0, 3, 2, 1))
80 | A = self.Mask(A, V_len, 'add')
81 | A = K.permute_dimensions(A, (0, 3, 2, 1))
82 | A = K.softmax(A)
83 | # output and mask
84 | O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
85 | O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
86 | O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
87 | O_seq = self.Mask(O_seq, Q_len, 'mul')
88 | return O_seq
89 |
90 | def compute_output_shape(self, input_shape):
91 | return (input_shape[0][0], input_shape[0][1], self.output_dim)
92 |
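For reference (notation mine), each head computes scaled dot-product attention as in "Attention Is All You Need", and the heads are concatenated back to nb_head * size_per_head channels:

    \mathrm{head}_i = \mathrm{softmax}\!\left(\frac{(Q W_i^{Q})(K W_i^{K})^{\top}}{\sqrt{d_{\mathrm{head}}}}\right)(V W_i^{V}), \qquad
    \mathrm{output} = \mathrm{concat}(\mathrm{head}_1, \dots, \mathrm{head}_h)

with $d_{\mathrm{head}}$ = size_per_head, matching the division by size_per_head ** 0.5 in call().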
--------------------------------------------------------------------------------
/nlp_toolkit/modules/custom_loss.py:
--------------------------------------------------------------------------------
1 | '''
2 | custom loss functions for model outputs that concatenate extra tensors (e.g. attention weights): only the leading class columns of y_pred are scored
3 | '''
4 |
5 | from keras import backend as K
6 |
7 |
8 | def custom_binary_crossentropy(y_true, y_pred):
9 | return K.mean(K.binary_crossentropy(y_true, y_pred[:, :2]), axis=-1)
10 |
11 |
12 | def custom_categorical_crossentropy(y_true, y_pred, n):
13 | return K.categorical_crossentropy(y_true, y_pred[:, :n])
14 |
--------------------------------------------------------------------------------
/nlp_toolkit/modules/logits.py:
--------------------------------------------------------------------------------
1 | """
2 | common output layers for different tasks
3 | """
4 |
5 | from keras_contrib.layers import CRF
6 | from keras.layers import Dense, Dropout
7 | from keras.regularizers import l2
8 |
9 |
10 | def tc_output_logits(x, nb_classes, final_dropout_rate=0):
11 | if final_dropout_rate != 0:
12 | x = Dropout(final_dropout_rate)(x)
13 | if nb_classes > 2:
14 | activation_func = 'softmax'
15 | else:
16 | activation_func = 'sigmoid'
17 | logits = Dense(nb_classes, kernel_regularizer=l2(0.01),
18 | activation=activation_func, name='softmax')(x)
19 | outputs = [logits]
20 | return outputs
21 |
22 |
23 | def sl_output_logits(x, nb_classes, use_crf=True):
24 | if use_crf:
25 | crf = CRF(nb_classes, sparse_target=False)
26 | loss = crf.loss_function
27 | acc = [crf.accuracy]
28 | outputs = crf(x)
29 | else:
30 | loss = 'categorical_crossentropy'
31 | acc = ['acc']
32 | outputs = Dense(nb_classes, activation='softmax')(x)
33 | return outputs, loss, acc
34 |
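A hedged sketch (shapes hypothetical) of how the sequence-labeling head is wired into a model; with use_crf=True the returned loss and metrics are the ones that must be passed to compile:

    from keras.layers import Input
    from keras.models import Model
    from nlp_toolkit.modules.logits import sl_output_logits

    x_in = Input(shape=(50, 128))                 # (time_steps, features)
    outputs, loss, acc = sl_output_logits(x_in, nb_classes=7, use_crf=True)
    m = Model(x_in, outputs)
    m.compile(optimizer='adam', loss=loss, metrics=acc)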
--------------------------------------------------------------------------------
/nlp_toolkit/modules/token_embedders/__init__.py:
--------------------------------------------------------------------------------
1 | from .embedding import Token_Embedding
2 | from .position_embedding import Position_Embedding
3 |
--------------------------------------------------------------------------------
/nlp_toolkit/modules/token_embedders/embedding.py:
--------------------------------------------------------------------------------
1 | from keras.engine import Layer
2 | from keras import backend as K
3 | from keras.layers import Embedding, Dropout, SpatialDropout1D, TimeDistributed
4 | from keras.regularizers import L1L2
5 |
6 |
7 | def Token_Embedding(x, input_dim, output_dim, embed_weights=None,
8 | mask_zero=False, input_length=None, dropout_rate=0,
9 | embed_l2=1E-6, name='', time_distributed=False, **kwargs):
10 | """
11 | Basic token embedding layer, also included some dropout layer.
12 | """
13 | embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None
14 | embed_layer = Embedding(input_dim=input_dim,
15 | output_dim=output_dim,
16 | weights=embed_weights,
17 | mask_zero=mask_zero,
18 | input_length=input_length,
19 | embeddings_regularizer=embed_reg,
20 | name=name)
21 | if time_distributed:
22 | embed = TimeDistributed(embed_layer)(x)
23 | else:
24 | embed = embed_layer(x)
25 | # entire embedding channels are dropped out instead of the
26 | # normal Keras embedding dropout, which drops all channels for entire words
27 | # many of the datasets contain so few words that losing one or more words can alter the emotions completely
28 | if dropout_rate != 0:
29 | embed = SpatialDropout1D(dropout_rate)(embed)
30 | return embed
31 |
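A hedged sketch (sizes hypothetical) of the helper above; it returns the embedded tensor, optionally wrapped in TimeDistributed and followed by SpatialDropout1D:

    from keras.layers import Input
    from nlp_toolkit.modules.token_embedders import Token_Embedding

    tokens = Input(shape=(100,), dtype='int32')
    embedded = Token_Embedding(tokens, input_dim=20000, output_dim=256,
                               mask_zero=True, input_length=100,
                               dropout_rate=0.15, name='token_embeddings')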
--------------------------------------------------------------------------------
/nlp_toolkit/modules/token_embedders/position_embedding.py:
--------------------------------------------------------------------------------
1 | from keras.engine import Layer
2 | from keras import backend as K
3 |
4 |
5 | class Position_Embedding(Layer):
6 | """
7 | Computes sequence position information for Attention based models
8 | https://github.com/bojone/attention/blob/master/attention_keras.py
9 |
10 | # Arguments:
11 | A tensor with shape (batch_size, seq_len, word_size)
12 | # Returns:
13 | A position tensor with shape (batch_size, seq_len, position_size)
14 | """
15 |
16 | def __init__(self, size=None, mode='sum', **kwargs):
17 |         self.size = size  # must be an even number
18 | self.mode = mode
19 | super(Position_Embedding, self).__init__(**kwargs)
20 |
21 | def call(self, x):
22 | if (self.size is None) or (self.mode == 'sum'):
23 | self.size = int(x.shape[-1])
24 | batch_size, seq_len = K.shape(x)[0], K.shape(x)[1]
25 | position_j = 1. / K.pow(10000.,
26 | 2 * K.arange(self.size / 2, dtype='float32'
27 | ) / self.size)
28 | position_j = K.expand_dims(position_j, 0)
29 |         # K.arange does not support variable-length sequences, so positions are generated this way instead
30 | position_i = K.cumsum(K.ones_like(x[:, :, 0]), 1) - 1
31 | position_i = K.expand_dims(position_i, 2)
32 | position_ij = K.dot(position_i, position_j)
33 | position_ij = K.concatenate(
34 | [K.cos(position_ij), K.sin(position_ij)], 2)
35 | if self.mode == 'sum':
36 | return position_ij + x
37 | elif self.mode == 'concat':
38 | return K.concatenate([position_ij, x], 2)
39 |
40 | def compute_output_shape(self, input_shape):
41 | if self.mode == 'sum':
42 | return input_shape
43 | elif self.mode == 'concat':
44 | return (input_shape[0], input_shape[1], input_shape[2] + self.size)
45 |
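In the notation of call() (mine, not from the file): with $d$ = size, position $p$ and frequency index $j < d/2$, the layer builds

    \omega_j = 10000^{-2j/d}, \qquad
    \mathrm{PE}_p = \big[\cos(p\,\omega_0), \dots, \cos(p\,\omega_{d/2-1}),\ \sin(p\,\omega_0), \dots, \sin(p\,\omega_{d/2-1})\big]

(cosines first, then sines, concatenated rather than interleaved), which is summed with or concatenated to the input depending on mode.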
--------------------------------------------------------------------------------
/nlp_toolkit/sequence.py:
--------------------------------------------------------------------------------
1 | """
2 | Text Sequence Utilties
3 | """
4 |
5 | import math
6 | import random
7 | import numpy as np
8 | from collections import Counter
9 | from keras.utils import Sequence
10 | from keras.utils.np_utils import to_categorical
11 | from keras.preprocessing.sequence import pad_sequences
12 | from sklearn.externals import joblib
13 | from sklearn.base import BaseEstimator, TransformerMixin
14 | from nlp_toolkit.utilities import logger, word2char
15 | from typing import Dict, List
16 | from collections import defaultdict
17 |
18 |
19 | def top_elements(array, k):
20 | ind = np.argpartition(array, -k)[-k:]
21 | return ind[np.argsort(array[ind])][::-1]
22 |
23 |
24 | class Vocabulary(object):
25 | """
26 | Vocab Class for any NLP Tasks
27 | """
28 |
29 |     def __init__(self, max_size=None, lower=True, unk_token=True, specials=('<pad>',)):
30 | self._max_size = max_size
31 | self._lower = lower
32 | self._unk = unk_token
33 | if specials:
34 | self._token2id = {token: i for i, token in enumerate(specials)}
35 | self._id2token = list(specials)
36 | else:
37 | self._token2id = {}
38 | self._id2token = []
39 | self._token_count = Counter()
40 |
41 | def __len__(self):
42 | return len(self._token2id)
43 |
44 | def add_token(self, token):
45 | token = self.process_token(token)
46 | self._token_count.update([token])
47 |
48 | def add_documents(self, docs):
49 | for sent in docs:
50 | sent = map(self.process_token, sent)
51 | self._token_count.update(sent)
52 |
53 | def doc2id(self, doc):
54 | # doc = map(self.process_token, doc)
55 | return [self.token_to_id(token) for token in doc]
56 |
57 | def id2doc(self, ids):
58 | return [self.id_to_token(idx) for idx in ids]
59 |
60 | def build(self):
61 | token_freq = self._token_count.most_common(self._max_size)
62 | idx = len(self.vocab)
63 | for token, _ in token_freq:
64 | self._token2id[token] = idx
65 | self._id2token.append(token)
66 | idx += 1
67 | if self._unk:
68 |             unk = '<unk>'
69 | self._token2id[unk] = idx
70 | self._id2token.append(unk)
71 |
72 | def process_token(self, token):
73 | if self._lower:
74 | token = token.lower()
75 |
76 | return token
77 |
78 | def token_to_id(self, token):
79 | # token = self.process_token(token)
80 | return self._token2id.get(token, len(self._token2id) - 1)
81 |
82 | def id_to_token(self, idx):
83 | return self._id2token[idx]
84 |
85 | def extend_vocab(self, new_vocab, max_tokens=10000):
86 | assert isinstance(new_vocab, list)
87 | if max_tokens < 0:
88 | max_tokens = 10000
89 | base_index = self.__len__()
90 | added = 0
91 | for word in new_vocab:
92 | if added >= max_tokens:
93 | break
94 | if word not in self._token2id:
95 | self._token2id[word] = base_index + added
96 | self._id2token.append(word)
97 | added += 1
98 | logger.info('%d new words have been added to vocab' % added)
99 | return added
100 |
101 | @property
102 | def vocab(self):
103 | return self._token2id
104 |
105 | @property
106 | def reverse_vocab(self):
107 | return self._id2token
108 |
109 |
110 | class IndexTransformer(BaseEstimator, TransformerMixin):
111 | """
112 |     Sklearn-style transformer for converting text tokens to indices
113 | Basic tokens are usually words.
114 |
115 | # Arguments:
116 | 1. max_tokens: maximum number of basic tokens in one sentence
117 | 2. max_inner_chars: maximum number of char tokens in one word
118 |         3. lower: whether to lowercase tokens
119 |         4. use_inner_char: whether to use inner-char tokens (depends on your model)
120 |         5. initial_vocab: additional basic tokens which are not in the corpus
121 |
122 | # Usage:
123 |         p = IndexTransformer(task_type='classification')
124 | new_data = p.fit_transform(data)
125 | # save
126 | p.save(file_name)
127 | # load
128 | p = IndexTransformer.load(file_name)
129 | # inverse transform y label
130 |         y_true_label = p.inverse_transform(y_pred)
131 | """
132 |
133 | def __init__(self, task_type, max_tokens=80, max_inner_chars=8, lower=True,
134 | use_inner_char=False, initial_vocab=None,
135 | use_seg=False, use_radical=False, radical_dict=None, basic_token='word'):
136 | self.basic_token = basic_token
137 | self.task_type = task_type
138 | self.max_tokens = max_tokens
139 | self.max_inner_chars = max_inner_chars
140 | self.use_inner_char = use_inner_char
141 | self.use_seg = use_seg
142 | self.use_radical = use_radical
143 | self._token_vocab = Vocabulary(lower=lower)
144 | self._label_vocab = Vocabulary(
145 | lower=False, unk_token=False, specials=None)
146 | if use_inner_char:
147 | self._inner_char_vocab = Vocabulary(lower=lower)
148 | if initial_vocab:
149 | self._token_vocab.add_documents([initial_vocab])
150 | if use_seg:
151 | self._seg_vocab = Vocabulary(lower=False)
152 | if use_radical:
153 | self._radical_vocab = Vocabulary(lower=False)
154 | self.radical_dict = radical_dict
155 |
156 | def fit(self, X, y=None):
157 | # assert isinstance(X, dict)
158 | self._token_vocab.add_documents(X)
159 | self._token_vocab.build()
160 | if y is not None:
161 | self._label_vocab.add_documents(y)
162 | self._label_vocab.build()
163 | if self.use_inner_char:
164 | for doc in X:
165 | self._inner_char_vocab.add_documents(doc)
166 | self._inner_char_vocab.build()
167 | if self.use_seg:
168 | self._seg_vocab.add_documents([['B'], ['E'], ['M'], ['S']])
169 | self._seg_vocab.build()
170 | if self.use_radical:
171 | self._radical_vocab.add_documents([[w] for w in self.radical_dict])
172 | self._radical_vocab.build()
173 |
174 | return self
175 |
176 | def transform(self, X, y=None, max_len=None):
177 | if max_len is not None:
178 | max_tokens = max_len
179 | else:
180 | max_tokens = self.max_tokens
181 | tokens = X['token']
182 | token_ids = [self._token_vocab.doc2id(doc) for doc in tokens]
183 | token_ids = pad_sequences(
184 | token_ids, maxlen=max_tokens, padding='post')
185 |
186 | features = {'token': token_ids}
187 |
188 | if self.use_inner_char:
189 | char_ids = [[self._inner_char_vocab.doc2id(w) for w in doc] for doc in tokens]
190 | char_ids = pad_nested_sequences(
191 | char_ids, max_tokens, self.max_inner_chars)
192 | features['char'] = char_ids
193 |
194 | if self.use_seg:
195 | seg_ids = [self._seg_vocab.doc2id(doc) for doc in X['seg']]
196 | seg_ids = pad_sequences(
197 | seg_ids, maxlen=max_tokens, padding='post')
198 | features['seg'] = seg_ids
199 |
200 | if self.use_radical:
201 | radical_ids = [self._radical_vocab.doc2id(doc) for doc in X['radical']]
202 | radical_ids = pad_sequences(
203 | radical_ids, maxlen=max_tokens, padding='post')
204 | features['radical'] = radical_ids
205 |
206 | if y is not None:
207 | y = [self._label_vocab.doc2id(doc) for doc in y]
208 | if self.task_type == 'sequence_labeling':
209 | y = pad_sequences(y, maxlen=max_tokens, padding='post')
210 | y = to_categorical(y, self.label_size).astype(float)
211 |
212 | return features, y
213 | else:
214 | return features
215 |
216 | def fit_transform(self, X, y=None, **params):
217 | return self.fit(X, y).transform(X, y)
218 |
219 | def inverse_transform(self, y, lengths=None, top_k=1, return_percentage=False):
220 | if self.task_type == 'classification':
221 | if top_k == 1:
222 | ind_top = np.argmax(y, -1)
223 | inverse_y = [self._label_vocab.id2doc([idx])[0] for idx in ind_top]
224 | return inverse_y
225 | elif top_k > 1:
226 | ind_top = [top_elements(prob, top_k) for prob in y]
227 | inverse_y = [self._label_vocab.id2doc(id_list) for id_list in ind_top]
228 | if not return_percentage:
229 | return inverse_y
230 | else:
231 | pct_top = [[prob[ind] for ind in ind_top[idx]] for idx, prob in enumerate(y)]
232 | return inverse_y, pct_top
233 | elif self.task_type == 'sequence_labeling':
234 | ind_top = np.argmax(y, -1)
235 | inverse_y = [self._label_vocab.id2doc(idx) for idx in ind_top]
236 | if lengths is not None:
237 | inverse_y = [iy[:l] for iy, l in zip(inverse_y, lengths)]
238 | return inverse_y
239 |
240 | @property
241 | def token_vocab_size(self):
242 | return len(self._token_vocab)
243 |
244 | @property
245 | def char_vocab_size(self):
246 | return len(self._inner_char_vocab)
247 |
248 | @property
249 | def seg_vocab_size(self):
250 | return len(self._seg_vocab)
251 |
252 | @property
253 | def radical_vocab_size(self):
254 | return len(self._radical_vocab)
255 |
256 | @property
257 | def label_size(self):
258 | return len(self._label_vocab)
259 |
260 | def save(self, file_path):
261 | joblib.dump(self, file_path)
262 |
263 | @classmethod
264 | def load(cls, file_path):
265 | p = joblib.load(file_path)
266 | # print('data transformer loaded')
267 | return p
268 |
269 |
270 | def pad_nested_sequences(sequences, max_sent_len, max_word_len, dtype='int32'):
271 | """
272 | Pad char sequences of one single word
273 | """
274 | x = np.zeros((len(sequences), max_sent_len, max_word_len)).astype(dtype)
275 | for i, sent in enumerate(sequences):
276 | if len(sent) > max_sent_len:
277 | sent = sent[:max_sent_len]
278 | for j, word in enumerate(sent):
279 | if len(word) < max_word_len:
280 | x[i, j, :len(word)] = word
281 | else:
282 | x[i, j, :] = word[:max_word_len]
283 | return x
284 |
285 |
286 | class BasicIterator(Sequence):
287 | """
288 | Wrapper for Keras Sequence Class
289 | """
290 |
291 | def __init__(self, task_type: str, transformer: IndexTransformer,
292 | x: Dict[str, List[List[str]]], y: List[List[str]] = None, batch_size=1):
293 | self.task_type = task_type
294 | self.t = transformer
295 | self.x = x
296 | self.y = y
297 | self.batch_size = batch_size
298 | if self.t.use_radical:
299 | self.radical_dict = self.t.radical_dict
300 | else:
301 | self.radical_dict = None
302 |
303 | def __getitem__(self, idx):
304 | idx_begin = self.batch_size * idx
305 | idx_end = self.batch_size * (idx + 1)
306 | x_batch = {k: v[idx_begin: idx_end] for k, v in self.x.items()}
307 |
308 | if self.y is not None:
309 | y_batch = self.y[idx_begin: idx_end]
310 | features, labels = self.t.transform(X=x_batch, y=y_batch)
311 | return features, labels
312 | else:
313 | features = self.t.transform(X=x_batch)
314 | return features
315 |
316 | def __len__(self):
317 | return math.ceil(len(self.x['token']) / self.batch_size)
318 |
319 |
320 | def _roundto(val, batch_size):
321 | return int(math.ceil(val / batch_size)) * batch_size
322 |
323 |
324 | # TODO
325 | # cluster by length: long texts get a smaller batch_size, short texts a larger one
326 | class BucketIterator(Sequence):
327 | """
328 |     A Keras Sequence (dataset reader) that serves input sequences from bucketed bins.
329 |     Each batch is padded with 'pad_sequences' (post padding)
330 |     up to the maximum sequence length of its bucket.
331 | """
332 |
333 | def __init__(self, task_type: str, transformer: IndexTransformer,
334 | seq_lengths: List[int],
335 | x: Dict[str, List[List[str]]], y: List[List[str]],
336 | num_buckets: int = 8, batch_size=1):
337 | self.task_type = task_type
338 | self.t = transformer
339 | self.batch_size = batch_size
340 | self.task_type = task_type
341 | self.x = x
342 | self.y = y
343 | if self.t.use_radical:
344 | self.radical_dict = self.t.radical_dict
345 | else:
346 | self.radical_dict = None
347 |
348 | # Count bucket sizes
349 | bucket_sizes, bucket_ranges = np.histogram(
350 | seq_lengths, bins=num_buckets)
351 | # Looking for non-empty buckets
352 | actual_buckets = [bucket_ranges[i+1]
353 | for i, bs in enumerate(bucket_sizes) if bs > 0]
354 | actual_bucket_sizes = [bs for bs in bucket_sizes if bs > 0]
355 | self.bucket_seqlen = [int(math.ceil(bs)) for bs in actual_buckets]
356 | num_actual = len(actual_buckets)
357 | logger.info('Training with %d non-empty buckets' % num_actual)
358 |
359 | self.bins = [(defaultdict(list), []) for bs in actual_bucket_sizes]
360 | assert len(self.bins) == num_actual
361 |
362 | # Insert the sequences into the bins
363 | self.feature_keys = list(self.x.keys())
364 | for i, sl in enumerate(seq_lengths):
365 | for j in range(num_actual):
366 | bsl = self.bucket_seqlen[j]
367 | if sl < bsl or j == num_actual - 1:
368 | for k in self.feature_keys:
369 | self.bins[j][0][k].append(x[k][i])
370 | self.bins[j][1].append(y[i])
371 | break
372 |
373 | self.num_samples = len(self.x['token'])
374 | self.dataset_len = int(sum([math.ceil(bs / self.batch_size)
375 | for bs in actual_bucket_sizes]))
376 | self._permute()
377 |
378 | def _permute(self):
379 | # Shuffle bins
380 | random.shuffle(self.bins)
381 |
382 | # Shuffle bin contents
383 | for i, (xbin, ybin) in enumerate(self.bins):
384 | index_array = np.random.permutation(len(ybin))
385 | self.bins[i] = ({k: [xbin[k][i] for i in index_array] for k in self.feature_keys}, [ybin[i] for i in index_array])
386 |
387 | def on_epoch_end(self):
388 | self._permute()
389 |
390 | def __len__(self):
391 | return self.dataset_len
392 |
393 | def __getitem__(self, idx):
394 | idx_begin = self.batch_size * idx
395 | idx_end = self.batch_size * (idx + 1)
396 |
397 | # Obtain bin index
398 | for idx, (xbin, ybin) in enumerate(self.bins):
399 | rounded_bin = _roundto(len(ybin), self.batch_size)
400 | if idx_begin >= rounded_bin:
401 | idx_begin -= rounded_bin
402 | idx_end -= rounded_bin
403 | continue
404 |
405 | # Found bin
406 | idx_end = min(len(ybin), idx_end) # Clamp to end of bin
407 | x_batch = {k: v[idx_begin: idx_end] for k, v in xbin.items()}
408 | y_batch = ybin[idx_begin: idx_end]
409 |
410 | max_len_i = self.bucket_seqlen[idx]
411 | features, labels = self.t.transform(x_batch, y_batch, max_len_i)
412 |
413 | return features, labels
414 | raise ValueError('out of bounds')
415 |
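A hedged end-to-end sketch (toy data, hypothetical labels) tying IndexTransformer to BasicIterator for a classification task:

    from nlp_toolkit.sequence import IndexTransformer, BasicIterator

    x = {'token': [['i', 'love', 'it'], ['too', 'slow']]}
    y = [['pos'], ['neg']]

    p = IndexTransformer(task_type='classification', max_tokens=10)
    p.fit(x['token'], y)
    batches = BasicIterator('classification', p, x, y, batch_size=2)
    features, labels = batches[0]   # padded token ids and categorical labels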
--------------------------------------------------------------------------------
/nlp_toolkit/trainer.py:
--------------------------------------------------------------------------------
1 | """
2 | Trainer Class: define the training process
3 | """
4 |
5 | import os
6 | import time
7 | import numpy as np
8 | from pathlib import Path
9 | from keras.optimizers import Adam, Nadam
10 | from sklearn.model_selection import train_test_split
11 | from sklearn.metrics import precision_recall_fscore_support
12 | from nlp_toolkit.callbacks import get_callbacks, History
13 | from nlp_toolkit.utilities import logger
14 | from nlp_toolkit.sequence import BasicIterator, BucketIterator
15 | from nlp_toolkit.modules.custom_loss import custom_binary_crossentropy, custom_categorical_crossentropy
16 | from typing import Dict
17 | from copy import deepcopy
18 |
19 | np.random.seed(1050)
20 |
21 |
22 | # TODO 自适应的学习率
23 | # 1. 基于valid数据的自适应学习率下降
24 | # 2. 三角学习率
25 | class Trainer(object):
26 | """
27 | Trainer class for all model training
28 | support single training and n-fold training
29 |
30 | # Arguments:
31 | 1. model: Keras Model object
32 | 2. model_name
33 | 3. task_type: text classification or sequence labeling
34 | 4. metric: the main metric used to track model performance on epoch end
35 |         5. extra_features: extra features (besides tokens) that will be included as model inputs
36 | 6. batch_size: minimum batch size
37 | 7. max_epoch: maximum epoch numbers
38 | 8. optimizer: default is Adam
39 |         9. checkpoint_path: the folder path for saving models
40 |         10. early_stopping: whether to use the early stopping strategy
41 |         11. lrplateau: whether to reduce the learning rate when the metric plateaus
42 |         12. tensorboard: whether to open tensorboard to log the training process
43 |         13. nb_bucket: the bucket size
44 |         14. train_mode: single-turn training or n-fold training
45 |         15. fold_cnt: the number of folds
46 |         16. test_size: default is 0.2
47 |         17. shuffle: whether to shuffle data between epochs, default is true
48 |         18. patiences: the number of epochs with no metric improvement after which training stops
49 |
50 | # Returns:
51 | The trained model or average performance of the model
52 | """
53 |
54 | def __init__(self, model,
55 | model_name,
56 | task_type,
57 | metric,
58 | batch_size=64,
59 | max_epoch=25,
60 | optimizer=Adam(),
61 | checkpoint_path='./models/',
62 | early_stopping=True,
63 | lrplateau=True,
64 | tensorboard=False,
65 | nb_bucket=100,
66 | train_mode='single',
67 | fold_cnt=10,
68 | test_size=0.2,
69 | shuffle=True,
70 | patiences=3):
71 | self.single_model = deepcopy(model)
72 | self.fold_model = deepcopy(model)
73 | self.model_name = model_name
74 | self.task_type = task_type
75 | self.metric = metric
76 | self.batch_size = batch_size
77 | self.max_epoch = max_epoch
78 | self.optimizer = optimizer
79 | self.test_size = test_size
80 | self.train_mode = train_mode
81 | self.fold_cnt = fold_cnt
82 | self.shuffle = shuffle
83 | self.nb_bucket = nb_bucket
84 | self.patiences = patiences
85 | base_dir = Path(checkpoint_path)
86 | if not base_dir.exists():
87 | base_dir.mkdir()
88 | current_time = time.strftime(
89 | '%Y%m%d%H%M', time.localtime(time.time()))
90 | save_dir = self.model_name + '_' + current_time
91 | self.checkpoint_path = Path(checkpoint_path) / save_dir
92 |
93 | def data_generator(self, seq_type, x_train, x_valid, y_train, y_valid,
94 | x_len_train=None, x_len_valid=None,):
95 | if seq_type == 'bucket':
96 | logger.info('use bucket sequence to speed up model training')
97 | train_batches = BucketIterator(
98 | self.task_type, self.transformer, x_len_train,
99 | x_train, y_train, self.nb_bucket, self.batch_size)
100 | valid_batches = BucketIterator(
101 | self.task_type, self.transformer, x_len_valid,
102 | x_valid, y_valid, self.nb_bucket, self.batch_size)
103 | elif seq_type == 'basic':
104 | train_batches = BasicIterator(
105 | self.task_type, self.transformer,
106 | x_train, y_train, self.batch_size)
107 | valid_batches = BasicIterator(
108 | self.task_type, self.transformer,
109 | x_valid, y_valid, self.batch_size)
110 | else:
111 |             raise ValueError('invalid data iterator type, only supports "basic" or "bucket"')
112 | return train_batches, valid_batches
113 |
114 | def train(self, x_ori, y, transformer,
115 | seq_type='bucket',
116 | return_attention=False):
117 | self.transformer = transformer
118 | self.feature_keys = list(x_ori.keys())
119 |
120 | if self.train_mode == 'single':
121 | x = deepcopy(x_ori)
122 | x_len = [item[-1] for item in x['token']]
123 | x['token'] = [item[:-1] for item in x['token']]
124 |
125 | # model initialization
126 | self.single_model.forward()
127 | logger.info('%s model structure...' % self.model_name)
128 | self.single_model.model.summary()
129 |
130 | # split dataset
131 | indices = np.random.permutation(len(x['token']))
132 | cut_point = int(len(x['token']) * (1 - self.test_size))
133 | train_idx, valid_idx = indices[:cut_point], indices[cut_point:]
134 | x_train = {k: [x[k][i] for i in train_idx] for k in self.feature_keys}
135 | x_valid = {k: [x[k][i] for i in valid_idx] for k in self.feature_keys}
136 | y_train, y_valid = [y[i] for i in train_idx], [y[i] for i in valid_idx]
137 | x_len_train, x_len_valid = [x_len[i] for i in train_idx], [x_len[i] for i in valid_idx]
138 | logger.info(
139 | 'train/valid set: {}/{}'.format(train_idx.shape[0], valid_idx.shape[0]))
140 |
141 | # transform data to sequence data streamer
142 | train_batches, valid_batches = self.data_generator(
143 | seq_type,
144 | x_train, x_valid, y_train, y_valid,
145 | x_len_train, x_len_valid)
146 |
147 | # define callbacks
148 | history = History(self.metric)
149 | self.callbacks = get_callbacks(
150 | history=history,
151 | metric=self.metric[0],
152 | log_dir=self.checkpoint_path,
153 | valid=valid_batches,
154 | transformer=transformer,
155 | attention=return_attention)
156 |
157 | # model compile
158 | self.single_model.model.compile(
159 | loss=self.single_model.get_loss(),
160 | optimizer=self.optimizer,
161 | metrics=self.single_model.get_metrics())
162 |
163 | # save transformer and model parameters
164 | if not self.checkpoint_path.exists():
165 | self.checkpoint_path.mkdir()
166 | transformer.save(self.checkpoint_path / 'transformer.h5')
167 | invalid_params = self.single_model.invalid_params
168 | param_file = self.checkpoint_path / 'model_parameters.json'
169 | self.single_model.save_params(param_file, invalid_params)
170 | logger.info('saving model parameters and transformer to {}'.format(
171 | self.checkpoint_path))
172 |
173 | # actual training start
174 | self.single_model.model.fit_generator(
175 | generator=train_batches,
176 | epochs=self.max_epoch,
177 | callbacks=self.callbacks,
178 | shuffle=self.shuffle,
179 | validation_data=valid_batches)
180 | print('best {}: {:04.2f}'.format(self.metric[0],
181 | max(history.metrics[self.metric[0]]) * 100))
182 | return self.single_model.model, history
183 |
184 | elif self.train_mode == 'fold':
185 | x = deepcopy(x_ori)
186 | x_len = [item[-1] for item in x['token']]
187 | x['token'] = [item[:-1] for item in x['token']]
188 | x_token_first = x['token'][0]
189 |
190 | fold_size = len(x['token']) // self.fold_cnt
191 | scores = []
192 | logger.info('%d-fold starts!' % self.fold_cnt)
193 |
194 | for fold_id in range(self.fold_cnt):
195 | print('\n------------------------ fold ' + str(fold_id) + '------------------------')
196 |
197 | assert x_token_first == x['token'][0]
198 | model_init = self.fold_model
199 | model_init.forward()
200 |
201 | fold_start = fold_size * fold_id
202 | fold_end = fold_start + fold_size
203 |                 if fold_id == self.fold_cnt - 1:
204 |                     fold_end = len(x['token'])
205 | if fold_id == 0:
206 | logger.info('%s model structure...' % self.model_name)
207 | model_init.model.summary()
208 |
209 | x_train = {k: x[k][:fold_start] + x[k][fold_end:] for k in self.feature_keys}
210 | x_len_train = x_len[:fold_start] + x_len[fold_end:]
211 | y_train = y[:fold_start] + y[fold_end:]
212 | x_valid = {k: x[k][fold_start:fold_end] for k in self.feature_keys}
213 | x_len_valid = x_len[fold_start:fold_end]
214 | y_valid = y[fold_start:fold_end]
215 |
216 | train_batches, valid_batches = self.data_generator(
217 | seq_type,
218 | x_train, x_valid, y_train, y_valid,
219 | x_len_train, x_len_valid)
220 |
221 | history = History(self.metric)
222 | self.callbacks = get_callbacks(
223 | history=history, metric=self.metric[0],
224 | valid=valid_batches, transformer=transformer,
225 | attention=return_attention)
226 |
227 | model_init.model.compile(
228 | loss=model_init.get_loss(),
229 | optimizer=self.optimizer,
230 | metrics=model_init.get_metrics())
231 |
232 | model_init.model.fit_generator(
233 | generator=train_batches,
234 | epochs=self.max_epoch,
235 | callbacks=self.callbacks,
236 | shuffle=self.shuffle,
237 | validation_data=valid_batches)
238 | scores.append(max(history.metrics[self.metric[0]]))
239 |
240 | logger.info('training finished! The mean {} scores: {:4.2f}(±{:4.2f})'.format(
241 | self.metric[0], np.mean(scores) * 100, np.std(scores) * 100))
242 |
--------------------------------------------------------------------------------
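The bookkeeping in the 'fold' branch above is easiest to see on plain lists. The sketch below reproduces the same slicing with toy data (list indices instead of the feature dict that Trainer uses), including the convention that the last fold absorbs any remainder.

# toy illustration of the n-fold split used in Trainer.train (train_mode='fold')
data = list(range(11))
fold_cnt = 5
fold_size = len(data) // fold_cnt  # 2

for fold_id in range(fold_cnt):
    fold_start = fold_size * fold_id
    # the last fold takes everything that is left over
    fold_end = len(data) if fold_id == fold_cnt - 1 else fold_start + fold_size
    train = data[:fold_start] + data[fold_end:]
    valid = data[fold_start:fold_end]
    print(fold_id, train, valid)
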
/nlp_toolkit/utilities.py:
--------------------------------------------------------------------------------
1 | """
2 | some NLP processing utility functions
3 | """
4 |
5 | import io
6 | import re
7 | import sys
8 | import time
9 | import logging
10 | import numpy as np
11 | from itertools import groupby
12 |
13 | logging.basicConfig(level=logging.INFO,
14 | format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
15 | logger = logging.getLogger('nlp_toolkit')
16 |
17 | # placeholder tokens that are always kept as single units (never split into chars)
18 | special_tokens = set(['s_', 'lan_', 'ss_'])
19 |
20 |
21 | # [1, ['a', 'b'], [True, False]] ---> [1, 'a', 'b', True, False]
22 | def flatten_gen(x):
23 | for i in x:
24 | if isinstance(i, list) or isinstance(i, tuple):
25 | for inner_i in i:
26 | yield inner_i
27 | else:
28 | yield i
29 |
30 |
31 | # classify each char as one of ['cn', 'en', 'num', 'other']
32 | def char_type(word):
33 | for char in word:
34 | unicode_char = ord(char)
35 | if unicode_char >= 19968 and unicode_char <= 40869:
36 | yield (char, 'cn')
37 | elif unicode_char >= 65 and unicode_char <= 122:
38 | yield (char, 'en')
39 | elif unicode_char >= 48 and unicode_char <= 57:
40 | yield (char, 'num')
41 | else:
42 | yield (char, 'other')
43 |
44 |
45 | # split a word into chars, keeping latin/digit runs intact
46 | def split_cn_en(word):
47 | new_word = [c for c in char_type(word)]
48 | new_word_len = len(new_word)
49 | tmp = ''
50 | for ix, item in enumerate(new_word):
51 | if item[1] in {'en', 'num'}:
52 | if ix < new_word_len - 1:
53 | if new_word[ix+1][1] == item[1]:
54 | tmp += item[0]
55 | else:
56 | tmp += item[0]
57 | yield tmp
58 | tmp = ''
59 | else:
60 | tmp += item[0]
61 | yield tmp
62 | else:
63 | yield item[0]
64 |
65 |
66 | # reassign token labels according to the new char-level tokens
67 | def extract_char(word_list, label_list=None, use_seg=False):
68 | if label_list:
69 | for word, label in zip(word_list, label_list):
70 | # label = label.strip('#')
71 | single_check = word in special_tokens or not re.search(r'[^a-z0-9]+', word)
72 | if len(word) == 1 or single_check:
73 | if use_seg:
74 | yield (word, label, 'S')
75 | else:
76 | yield (word, label)
77 | else:
78 | try:
79 | new_word = list(split_cn_en(word))
80 | word_len = len(new_word)
81 | if label == 'O':
82 | new_label = ['O'] * word_len
83 | elif label.startswith('I'):
84 | new_label = [label] * word_len
85 | else:
86 | label_i = 'I' + label[1:]
87 | if label.startswith('B'):
88 | new_label = [label] + [label_i] * (word_len - 1)
89 | elif label.startswith('E'):
90 | new_label = [label_i] * (word_len - 1) + [label]
91 | if use_seg:
92 | seg_tag = ['M'] * word_len
93 | seg_tag[0] = 'B'
94 | seg_tag[-1] = 'E'
95 | for x, y, z in zip(new_word, new_label, seg_tag):
96 | yield (x, y, z)
97 | else:
98 | for x, y in zip(new_word, new_label):
99 | yield (x, y)
100 | except Exception as e:
101 | print(e)
102 | print(list(zip(word_list, label_list)))
103 | sys.exit()
104 | else:
105 | for word in word_list:
106 | single_check = word in special_tokens or not re.search(r'[^a-z0-9]+', word)
107 | if len(word) == 1 or single_check:
108 | if use_seg:
109 | yield (word, 'S')
110 | else:
111 | yield (word)
112 | else:
113 | new_word = list(split_cn_en(word))
114 | if use_seg:
115 | seg_tag = ['M'] * len(new_word)
116 | seg_tag[0] = 'B'
117 | seg_tag[-1] = 'E'
118 | for x, y in zip(new_word, seg_tag):
119 | yield (x, y)
120 | else:
121 | for x in new_word:
122 | yield x
123 |
124 |
125 | # look up the radical of each char (empty string if unknown)
126 | def get_radical(d, char_list):
127 | return [d[char] if char in d else '' for char in char_list]
128 |
129 |
130 | def word2char(word_list, label_list=None, task_type='',
131 | use_seg=False, radical_dict=None):
132 | """
133 |     convert the basic token unit from word to char
134 |     non-Chinese words are not naively split into char sequences,
135 |     e.g. "machine02" will be split into "machine" and "02"
136 | """
137 |
138 | if task_type == 'classification':
139 | assert label_list is None
140 | assert radical_dict is None
141 | assert use_seg is False
142 | return [char for word in word_list for char in list(split_cn_en(word))]
143 | elif task_type == 'sequence_labeling':
144 | results = list(
145 | zip(*[item for item in extract_char(word_list, label_list, use_seg)]))
146 | if label_list:
147 | if use_seg:
148 | chars, new_labels, seg_tags = results
149 | assert len(chars) == len(new_labels) == len(seg_tags)
150 | else:
151 | chars, new_labels = results
152 | assert len(chars) == len(new_labels)
153 | new_result = {'token': chars, 'label': new_labels}
154 | else:
155 | if use_seg:
156 | chars, seg_tags = results
157 | assert len(chars) == len(seg_tags)
158 | else:
159 | chars = results
160 | new_result = {'token': chars}
161 | if use_seg:
162 | new_result['seg'] = seg_tags
163 | if radical_dict:
164 | new_result['radical'] = get_radical(radical_dict, chars)
165 | return new_result
166 | else:
167 | logger.error('invalid task type')
168 | sys.exit()
169 |
170 |
171 | def shorten_word(word):
172 | """
173 | Shorten groupings of 3+ identical consecutive chars to 2, e.g. '!!!!' --> '!!'
174 | """
175 |
176 |     # must have at least 3 chars to be shortened
177 | if len(word) < 3:
178 | return word
179 | # find groups of 3+ consecutive letters
180 | letter_groups = [list(g) for k, g in groupby(word)]
181 | triple_or_more = [''.join(g) for g in letter_groups if len(g) >= 3]
182 | if len(triple_or_more) == 0:
183 | return word
184 | # replace letters to find the short word
185 | short_word = word
186 | for trip in triple_or_more:
187 | short_word = short_word.replace(trip, trip[0] * 2)
188 |
189 | return short_word
190 |
191 |
192 | # Command line arguments are cast to bool type
193 | def boolean_string(s):
194 | if s not in {'False', 'True'}:
195 | raise ValueError('Not a valid boolean string')
196 | return s == 'True'
197 |
198 |
199 | # decorator to time a function
200 | def timer(function):
201 |     def log_time(*args, **kwargs):
202 |         start_time = time.time()
203 |         result = function(*args, **kwargs)
204 |         logger.info('Function "{name}" finished in {time:.2f} s'.format(name=function.__name__, time=time.time() - start_time))
205 |         return result
206 |     return log_time
207 |
208 |
209 | # generate a smaller embedding file restricted to the given vocab
210 | def gen_small_embedding(vocab_file, embed_file, output_file):
211 | vocab = set([word.strip() for word in open(vocab_file, encoding='utf8')])
212 | print('total vocab: ', len(vocab))
213 | fin = io.open(embed_file, 'r', encoding='utf-8', newline='\n', errors='ignore')
214 | try:
215 | n, d = map(int, fin.readline().split())
216 | except Exception:
217 |         sys.exit('please make sure the embed file is gensim-formatted')
218 |
219 | def gen():
220 | for line in fin:
221 | token = line.rstrip().split(' ', 1)[0]
222 | if token in vocab:
223 | yield line
224 |
225 | result = [line for line in gen()]
226 | rate = 1 - len(result) / len(vocab)
227 | print('oov rate: {:4.2f}%'.format(rate * 100))
228 |
229 | with open(output_file, 'w', encoding='utf8') as fout:
230 | fout.write(str(len(result)) + ' ' + str(d) + '\n')
231 | for line in result:
232 | fout.write(line)
233 |
234 |
235 | # load embeddings from text file
236 | def load_vectors(fname, vocab):
237 | fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
238 | _, d = map(int, fin.readline().split())
239 | data = {}
240 | for line in fin:
241 | tokens = line.rstrip().split(' ')
242 | data[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
243 |
244 | scale = 0.25
245 | # scale = np.sqrt(3.0 / n_dim)
246 | embedding_matrix = np.random.uniform(-scale, scale, [len(vocab), d])
247 | embedding_matrix[0] = np.zeros(d)
248 | cnt = 0
249 | for word, i in vocab._token2id.items():
250 | embedding_vector = data.get(word)
251 | if embedding_vector is not None:
252 | cnt += 1
253 | embedding_matrix[i] = embedding_vector
254 |     logger.info('OOV rate: {:04.2f} %'.format((1 - cnt / len(vocab._token2id)) * 100))
255 | return embedding_matrix, d
256 |
257 |
258 | def load_tc_data(fname, label_prefix='__label__', max_tokens_per_doc=256):
259 |
260 | def gen():
261 | with open(fname, 'r', encoding='utf8') as fin:
262 | for line in fin:
263 | words = line.strip().split()
264 | if words:
265 | nb_labels = 0
266 | label_line = []
267 | for word in words:
268 | if word.startswith(label_prefix):
269 | nb_labels += 1
270 | label = word.replace(label_prefix, "")
271 | label_line.append(label)
272 | else:
273 | break
274 | text = words[nb_labels:]
275 | if len(text) > max_tokens_per_doc:
276 | text = text[:max_tokens_per_doc]
277 | yield (text, label_line)
278 |
279 | texts, labels = zip(*[item for item in gen()])
280 | return texts, labels
281 |
282 |
283 | def load_sl_data(fname, data_format='basic'):
284 |
285 | def process_conll(data):
286 | sents, labels = [], []
287 | tokens, tags = [], []
288 | for line in data:
289 | if line:
290 | token, tag = line.split('\t')
291 | tokens.append(token)
292 | tags.append(tag)
293 | else:
294 | sents.append(tokens)
295 | labels.append(tags)
296 | tokens, tags = [], []
297 | return sents, labels
298 |
299 | data = (line.strip() for line in open(fname, 'r', encoding='utf8'))
300 | if data_format:
301 | if data_format == 'basic':
302 | texts, labels = zip(
303 | *[zip(*[item.rsplit('###', 1) for item in line.split('\t')]) for line in data])
304 | elif data_format == 'conll':
305 | texts, labels = process_conll(data)
306 | return texts, labels
307 | else:
308 | print('invalid data format for sequence labeling task')
309 |
310 |
311 | def convert_seq_format(fin_name, fout_name, dest_format='conll'):
312 | if dest_format == 'conll':
313 | basic2conll(fin_name, fout_name)
314 | elif dest_format == 'basic':
315 | conll2basic(fin_name, fout_name)
316 | else:
317 | logger.warning('invalid data format')
318 |
319 |
320 | def basic2conll(fin_name, fout_name):
321 | data = [line.strip() for line in open(fin_name, 'r', encoding='utf8')]
322 | with open(fout_name, 'w', encoding='utf8') as fout:
323 | for line in data:
324 | for item in line.split('\t'):
325 |                 token, label = item.rsplit('###', 1)
326 | label = label.strip('#')
327 | fout.write(token + '\t' + label + '\n')
328 | fout.write('\n')
329 |
330 |
331 | def conll2basic(fin_name, fout_name):
332 | data = [line.strip() for line in open(fin_name, 'r', encoding='utf8')]
333 | with open(fout_name, 'w', encoding='utf8') as fout:
334 | tmp = []
335 | for line in data:
336 | if line:
337 | token, label = line.split('\t')
338 | label = label.strip('\t')
339 | item = token + '###' + label
340 | tmp.append(item)
341 | else:
342 | new_line = '\t'.join(tmp) + '\n'
343 | fout.write(new_line)
344 | tmp = []
345 |
--------------------------------------------------------------------------------
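A minimal sketch of two helpers above that are easy to try in isolation; the commented outputs follow from the logic shown and assume the package is importable.

from nlp_toolkit.utilities import shorten_word, word2char

# runs of 3+ identical chars are collapsed to 2
print(shorten_word('cooool!!!!'))
# -> 'cool!!'

# for classification, words are split into chars while latin/digit runs stay whole
print(word2char(['机器', '学习', 'machine02'], task_type='classification'))
# -> ['机', '器', '学', '习', 'machine', '02']
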
/nlp_toolkit/visualization.py:
--------------------------------------------------------------------------------
1 | """
2 | some Visualization Functions
3 | """
4 | import random
5 | from seqeval.metrics.sequence_labeling import get_entities
6 | from typing import List
7 | from copy import deepcopy
8 |
9 | ENTITY_COLOR = ['#ff9900', '#00ccff', '#66ff99', '#ff3300', '#9933ff', '#669999']
10 |
11 |
12 | def highlight_by_weight(word, att_weight):
13 | html_color = '#%02X%02X%02X' % (255, int(255 * (1 - att_weight)), int(255 * (1 - att_weight)))
14 |     return '<span style="background-color: {}">{}</span>'.format(html_color, word)
15 |
16 |
17 | def att2html(words, att_weights):
18 | html = ""
19 | for word, att_weight in zip(words, att_weights):
20 | html += ' ' + highlight_by_weight(word, att_weight)
21 |     return html + "<br>\n"
22 |
23 |
24 | def attention_visualization(texts: List[List[str]], attention_weights,
25 | output_fname='attention_texts.html'):
26 | with open(output_fname, 'w') as fout:
27 | for x, y in zip(texts, attention_weights):
28 | fout.write(att2html(x, y))
29 |
30 |
31 | def highlight_entity(words: List[str], entity_type, entity_color):
32 | if entity_type:
33 | html_color = entity_color[entity_type]
34 | words = ' '.join(words) + ' [%s]' % entity_type
35 |         return '<span style="background-color: {}">{}</span>'.format(html_color, words)
36 | else:
37 | return ' '.join(words)
38 |
39 |
40 | def entity2html(words, labels, entity_colors):
41 | html = ""
42 | entity_dict = {item[1]: [item[0], item[-1]] for item in labels}
43 | start, end = 0, 0
44 | while end < len(words):
45 | if end not in entity_dict:
46 | end += 1
47 | if end == len(words):
48 |                 html += ' '.join(words[start:])
49 | else:
50 | if end > start:
51 | html += highlight_entity(words[start: end], None, entity_colors) + ' '
52 | entity_info = entity_dict[end]
53 | entity_start = end
54 | entity_end = entity_info[-1] + 1
55 | html += highlight_entity(words[entity_start: entity_end], entity_info[0], entity_colors) + ' '
56 | start = entity_end
57 | end = start
58 |     return html + "<br>\n"
59 |
60 |
61 | def entity_visualization(texts: List[List[str]], labels: List[List[str]],
62 | output_fname='entity_texts.html'):
63 | texts_c = deepcopy(texts)
64 | texts_c = [item[:-1] for item in texts_c]
65 | entities = [get_entities(item) for item in labels]
66 | all_entities = list(set([sub_item[0] for item in entities for sub_item in item]))
67 | all_entities = [item for item in all_entities if item != 'O']
68 | nb_entities = len(all_entities)
69 | if nb_entities > len(ENTITY_COLOR):
70 | rest_nb_colors = nb_entities - len(ENTITY_COLOR)
71 | colors = ENTITY_COLOR + ['#' + ''.join([random.choice('0123456789ABCDEF') for j in range(6)])
72 | for i in range(rest_nb_colors)]
73 | else:
74 | colors = ENTITY_COLOR[:nb_entities]
75 | assert len(colors) == nb_entities
76 | entity_colors = {all_entities[i]: colors[i] for i in range(nb_entities)}
77 |
78 | with open(output_fname, 'w') as fout:
79 | for x, y in zip(texts_c, entities):
80 | fout.write(entity2html(x, y, entity_colors))
81 |
82 |
83 | def plot_loss_acc(history, task):
84 | import matplotlib.pyplot as plt
85 |
86 | nb_epochs = len(history.val_acc)
87 | epoch_size_nearly = len(history.acc) // nb_epochs
88 | val_x = [i for i in range(len(history.acc)) if i %
89 | epoch_size_nearly == 0][1:] + [len(history.acc)-1]
90 |
91 | f = plt.figure(figsize=(15, 45))
92 | ax1 = f.add_subplot(311)
93 | ax2 = f.add_subplot(312)
94 | ax3 = f.add_subplot(313)
95 |
96 | ax1.set_title("Train & Dev Acc")
97 | ax1.plot(history.acc, color="g", label="Train")
98 | ax1.plot(val_x, history.val_acc, color="b", label="Dev")
99 | ax1.legend(loc="best")
100 |
101 | ax2.set_title("Train & Dev Loss")
102 | ax2.plot(history.loss, color="g", label="Train")
103 | ax2.plot(val_x, history.val_loss, color="b", label="Dev")
104 | ax2.legend(loc="best")
105 |
106 | if task == 'classification':
107 | ax3.set_title("F1 per epoch")
108 | ax3.plot(history.metrics['f1'], color="g", label="F1")
109 | elif task == 'sequence_labeling':
110 | ax3.set_title("F1 and acc per epoch")
111 | ax3.plot(history.metrics['f1_seq'], color="g", label="F1")
112 | ax3.plot(history.metrics['seq_acc'], color="b", label="Acc")
113 | ax3.legend(loc="best")
114 |
115 | plt.tight_layout()
116 | plt.show()
117 |
--------------------------------------------------------------------------------
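A minimal sketch of attention_visualization with hand-made inputs. In practice the texts and weights come from a classifier trained with return_attention=True (see reproduction/company_pro_con_classify.py below); here the per-token weights are simply assumed to lie in [0, 1].

from nlp_toolkit.visualization import attention_visualization

texts = [['service', 'was', 'great'], ['food', 'was', 'cold']]
weights = [[0.1, 0.2, 0.9], [0.3, 0.1, 0.8]]  # toy per-token attention scores
attention_visualization(texts, weights, output_fname='attention_demo.html')
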
/reproduction/company_pro_con_classify.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.data import Dataset
2 | from nlp_toolkit.classifier import Classifier
3 | import yaml
4 |
5 | data_path = '../sample_data/company_pro_con.txt'
6 | config_path = '../config_classification.yaml'
7 |
8 | # yaml.safe_load() is recommended
9 | config = yaml.safe_load(open(config_path, encoding='utf8'))
10 | config['model']['bi_lstm_att']['return_attention'] = True
11 |
12 | # load the data and initialize parameters
13 | dataset = Dataset(fname=data_path, task_type='classification',
14 | mode='train', config=config)
15 |
16 | # define the classifier
17 | classifier = Classifier(model_name='bi_lstm_att', dataset=dataset,
18 | seq_type='bucket')
19 |
20 | # model training
21 | # a "models" directory will be created in the current directory to store the training results
22 | trained_model = classifier.train()
23 |
--------------------------------------------------------------------------------
/reproduction/noun_phrases_detect.py:
--------------------------------------------------------------------------------
1 | from nlp_toolkit.data import Dataset
2 | from nlp_toolkit.labeler import Labeler
3 | import yaml
4 |
5 | data_path = '../sample_data/cv_word_basic.txt'
6 | config_path = '../config_sequence_labeling.yaml'
7 |
8 | # yaml.safe_load() is recommended
9 | config = yaml.safe_load(open(config_path, encoding='utf8'))
10 | config['data']['basic_token'] = 'char'
11 | config['data']['use_seg'] = True
12 | config['data']['use_radical'] = True
13 |
14 | # load the data and initialize parameters
15 | dataset = Dataset(fname=data_path, task_type='sequence_labeling',
16 | mode='train', config=config)
17 |
18 | # define the sequence labeler
19 | seq_labeler = Labeler(model_name='char_rnn', dataset=dataset,
20 | seq_type='bucket')
21 |
22 | # model training
23 | # a "models" directory will be created in the current directory to store the training results
24 | trained_model = seq_labeler.train()
25 |
--------------------------------------------------------------------------------
/requirements-gpu.txt:
--------------------------------------------------------------------------------
1 | tensorflow-gpu>=1.9.0
2 | Keras==2.2.4
3 | numpy>=1.14.3
4 | scikit-learn>=0.19.1
5 | seqeval>=0.0.5
6 | hanziconv>=0.3.2
7 | jieba>=0.39
8 | GPUtil>=1.3.0
9 | ruamel.yaml>=0.15.81
10 | -e git+https://www.github.com/keras-team/keras-contrib.git#egg=keras-contrib
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorflow>=1.9.0
2 | Keras==2.2.4
3 | numpy>=1.14.3
4 | scikit-learn>=0.19.1
5 | seqeval>=0.0.5
6 | hanziconv>=0.3.2
7 | jieba>=0.39
8 | ruamel.yaml>=0.15.81
9 | -e git+https://www.github.com/keras-team/keras-contrib.git#egg=keras-contrib
--------------------------------------------------------------------------------
/sample_data/company_pro_con.txt:
--------------------------------------------------------------------------------
1 | __label__pos 进去 前 许诺 的 工资 给 的 高
2 | __label__pos 校园 环境 优美 , 美女 很多 , 适合 居住 , 食堂 饭菜 便宜 , 操场 好 , 可以 天天 运动
3 | __label__pos 老板 人 很好 老 员工 会 各种 教 你 东西 , 而且 不会 有所 保留 薪水 在 大连 还 算 可以
4 | __label__neg 人员 比较 多 , 复杂 办公室 容易 形成 拉帮结派 不利于 企业 发展
5 | __label__neg 出差 太多 了 。 在 现场 开发 很苦 逼 。
6 | __label__neg 公司 目前 地理位置 不 太 理想 , 离 城市 中心 较 远点 。
7 | __label__pos 公司 的 技术 水平 国内 顶尖 , 十几 年 的 资历 , 制作 的 作品 几乎 都 是 精品 , 参与 过 很多 知名 项目 。
8 | __label__neg 工作 流程 复杂 个人 上升 空间 有限 新产品 的 创新 能力 有限 组织 架构 稍 显 臃肿
9 | __label__neg 无偿 加班 , 加班 多 , 没 加班费 , 压力 很大
10 | __label__pos 环境 比较 轻松 , 跟 项目 走 , 能 学 不少 专业 知识 , 经验 很 重要
11 | __label__neg 有 命 挣钱 没命 花 , 不 适合 发展
12 | __label__neg 制度 管理 不是 很 完善 。
13 | __label__pos 人文 氛围 厚重 、 和谐 , 涉及 多 个 领域 有 发展 前景 。
14 | __label__neg 实习 的 时候 看 , 比较 死板 整个 工作 的 活力 不大 , 规矩 特别 多 , 会 很多 ,
15 | __label__pos 有 能力 的 人 就 有 很多 机会
16 | __label__neg 加班 很多 加班 费用 很少 领导 不一定 都 晓得 下面 人员 流动 大 。
17 | __label__neg 环境 小 , 人员 少且 流动性 很大 ; 工作 平台 局限性 较大 , 难以 得到 较好 的 锻炼 与 发展 。
18 | __label__pos 一 年 14 薪 , 每年 都加 工资 , 不 随便 裁员
19 | __label__pos 差劲 的 公司 , 刚开始 用 中 韩 合资 的 名称 唬人 , 其实 就是 私人 土 老板 家族 企业
20 | __label__pos 地理位置 较好 , 位于 浙江省 境内 , 天时地利 人 和 , 可以 向 外国 出口 产品
21 | __label__neg 薪水 不 高
22 | __label__pos 随意 年轻时 有 一定 成长 空间
23 | __label__neg 小气 , 抠门 , 没有 什么 发展前途 。
24 | __label__neg 有时 压力 偏 大 , 当 老师 估计 都 不 轻松
25 | __label__pos 感觉 公司 还是 挺 正规 !
--------------------------------------------------------------------------------
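The file above uses the fastText-style layout parsed by load_tc_data in utilities.py: each line starts with one or more __label__ tags followed by space-separated tokens. A minimal sketch, assuming the repository root as the working directory:

from nlp_toolkit.utilities import load_tc_data

texts, labels = load_tc_data('sample_data/company_pro_con.txt')
print(labels[0])     # ['pos']
print(texts[0][:4])  # ['进去', '前', '许诺', '的']
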
/sample_data/cv_word_basic.txt:
--------------------------------------------------------------------------------
1 | 主要###O 帮助###O 工地###B-Chunk 师傅###E-Chunk 一起###O 超平###O ,###O 防线###O 工作###O
2 | 协助###O 线###O 上###O 、###O 线###B-Chunk 下###I-Chunk 活动###E-Chunk 的###O 执行###O
3 | 执行###O 各项###O 培训###O 相关###O 的###O 各项###O 工作###B-Chunk 流程###E-Chunk
4 | 云南###O :###O 曲靖###O 、###O 昭通###O 下属###O 的###O 5###O 个###O 县级###O 供电###B-Chunk 公司###E-Chunk 10###O 个###O 供电所###O
5 | 担任###O 培训###O 学校###B-Chunk 英语###I-Chunk 讲师###E-Chunk 一###O 职###O 和###O 学生###B-Chunk 管理###E-Chunk
6 | 搜寻###O 招标###B-Chunk 公告###E-Chunk ,###O 告知###O 领导###O 及###O 业务###B-Chunk 人员###E-Chunk ,###O 确认###O 是否###O 报名###O
7 | 2001###O /###O 10###O --###O 2002###O /###O 04###O :###O 上海###O 润###O 宝###O 工贸###B-Chunk 公司###E-Chunk ###O 所属###O 行业###O :###O ###O 环保###O ###O 销售部###O ###O 销售###B-Chunk 代表###E-Chunk ###O 负责###O 江浙###O 一带###O 工业###O 圆###O 区###O 的###O 空气过滤器###O 的###O 销售###O 和###O 维护###O ,###O 期间###O 昆山###O 翊###O 腾###O 电子###O 是###O 长期###O 的###O 客户###O
8 | ###O 仓库###B-Chunk 管理###E-Chunk :###O 对###O 仓库###O 进行###O 合理###O 布局###O ,###O 为###O 方便###O 员工###B-Chunk 操作###E-Chunk 和###O 减少###O 失误###O ,###O 能够###O 独立###O 编排###O 库###O 位###O 图###O 和###O 货位###O 表###O
9 | ###O 档案###B-Chunk 管理###E-Chunk :###O 能###O 独立###O 制定###O 仓库###B-Chunk 管理###E-Chunk 文档###O ,###O 专人###O 负责###O 仓库###O 资料###O 的###O 更新###O 归档###O 并###O 定期###O 检查###O
10 | 电子###B-Chunk 技术###E-Chunk /###O 半导体###O /###O 集成电路###O
11 | 手机###B-Chunk 射频###I-Chunk 信号###I-Chunk 测试###E-Chunk
12 | 在###O 金源###B-Chunk 集团###E-Chunk 的###O 世纪城###O 三期###O 担任###O 置业###B-Chunk 顾问###E-Chunk ,###O 负责###O 房地产###B-Chunk 销售###E-Chunk
13 | 根据###O 用户###O 的###O 反馈###O 以及###O 运营###O 数据###O 的###O 分析###O ,###O 绘制###O 并###O 撰写###O 新版本###O 的###O 原型###B-Chunk 图###E-Chunk 和###O prd###O 文###O 档###O
14 | 根据###O 各###O 渠道###O 的###O 平台###B-Chunk 要求###E-Chunk ,###O 定制###O 个性化###B-Chunk 产品###E-Chunk ,###O 并###O 负责###O 所###O 发布###O 版本###O 的###O 跟踪###O 管理###O
15 | ###O 2005###O /###O 12###O --###O 2006###O /###O 05###O :###O 广州###O 南沣###O 电子###B-Chunk 有限公司###E-Chunk ###O 所属###O 行业###O :###O ###O 电子###B-Chunk 技术###E-Chunk /###O 半导体###O ###O 技术###O 部###O ###O 维修###B-Chunk 技术###I-Chunk 员###E-Chunk ###O 负责###O 调试###O 与###O 维修###O 线切割###B-Chunk 机床###E-Chunk 控制器###O ###O 2006###O /###O 03###O ###O 至今###O :###O 中国###O 电器###B-Chunk 科学研究###E-Chunk 园###O ###O 擎天###B-Chunk 实业###I-Chunk 有限公司###E-Chunk ###O 所属###O 行业###O :###O 电器###O ,###O 电子###O ###O 技术###O 部###O ###O 维修###B-Chunk 技术员###E-Chunk
16 | ###O 南京###B-Chunk 项目###E-Chunk 先期###O 股权###B-Chunk 融资###E-Chunk 置换###O
17 | 每月###O 核对###O 及###O 结算###B-Chunk 银行###I-Chunk 流水账###E-Chunk 和###O 现金###B-Chunk 日记账###E-Chunk ,###O 做到###O 账###O 实###O 相符###O ,###O 出具###O 汇总###O 对账###B-Chunk 表###E-Chunk
--------------------------------------------------------------------------------
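The file above is the 'basic' sequence-labeling layout handled by load_sl_data in utilities.py: one sentence per line, token###label items separated by tabs (the next file holds the same sentences in CoNLL layout). A minimal sketch, assuming the columns are tab-separated as the loader expects and the repository root is the working directory:

from nlp_toolkit.utilities import load_sl_data

texts, labels = load_sl_data('sample_data/cv_word_basic.txt', data_format='basic')
print(list(texts[0])[:3], list(labels[0])[:3])

# the CoNLL variant of the same data loads with data_format='conll'
texts_c, labels_c = load_sl_data('sample_data/cv_word_conll.txt', data_format='conll')
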
/sample_data/cv_word_conll.txt:
--------------------------------------------------------------------------------
1 | 主要 O
2 | 帮助 O
3 | 工地 B-Chunk
4 | 师傅 E-Chunk
5 | 一起 O
6 | 超平 O
7 | , O
8 | 防线 O
9 | 工作 O
10 |
11 | 协助 O
12 | 线 O
13 | 上 O
14 | 、 O
15 | 线 B-Chunk
16 | 下 I-Chunk
17 | 活动 E-Chunk
18 | 的 O
19 | 执行 O
20 |
21 | 执行 O
22 | 各项 O
23 | 培训 O
24 | 相关 O
25 | 的 O
26 | 各项 O
27 | 工作 B-Chunk
28 | 流程 E-Chunk
29 |
30 | 云南 O
31 | : O
32 | 曲靖 O
33 | 、 O
34 | 昭通 O
35 | 下属 O
36 | 的 O
37 | 5 O
38 | 个 O
39 | 县级 O
40 | 供电 B-Chunk
41 | 公司 E-Chunk
42 | 10 O
43 | 个 O
44 | 供电所 O
45 |
46 | 担任 O
47 | 培训 O
48 | 学校 B-Chunk
49 | 英语 I-Chunk
50 | 讲师 E-Chunk
51 | 一 O
52 | 职 O
53 | 和 O
54 | 学生 B-Chunk
55 | 管理 E-Chunk
56 |
57 | 搜寻 O
58 | 招标 B-Chunk
59 | 公告 E-Chunk
60 | , O
61 | 告知 O
62 | 领导 O
63 | 及 O
64 | 业务 B-Chunk
65 | 人员 E-Chunk
66 | , O
67 | 确认 O
68 | 是否 O
69 | 报名 O
70 |
71 | 2001 O
72 | / O
73 | 10 O
74 | -- O
75 | 2002 O
76 | / O
77 | 04 O
78 | : O
79 | 上海 O
80 | 润 O
81 | 宝 O
82 | 工贸 B-Chunk
83 | 公司 E-Chunk
84 | O
85 | 所属 O
86 | 行业 O
87 | : O
88 | O
89 | 环保 O
90 | O
91 | 销售部 O
92 | O
93 | 销售 B-Chunk
94 | 代表 E-Chunk
95 | O
96 | 负责 O
97 | 江浙 O
98 | 一带 O
99 | 工业 O
100 | 圆 O
101 | 区 O
102 | 的 O
103 | 空气过滤器 O
104 | 的 O
105 | 销售 O
106 | 和 O
107 | 维护 O
108 | , O
109 | 期间 O
110 | 昆山 O
111 | 翊 O
112 | 腾 O
113 | 电子 O
114 | 是 O
115 | 长期 O
116 | 的 O
117 | 客户 O
118 |
119 | O
120 | 仓库 B-Chunk
121 | 管理 E-Chunk
122 | : O
123 | 对 O
124 | 仓库 O
125 | 进行 O
126 | 合理 O
127 | 布局 O
128 | , O
129 | 为 O
130 | 方便 O
131 | 员工 B-Chunk
132 | 操作 E-Chunk
133 | 和 O
134 | 减少 O
135 | 失误 O
136 | , O
137 | 能够 O
138 | 独立 O
139 | 编排 O
140 | 库 O
141 | 位 O
142 | 图 O
143 | 和 O
144 | 货位 O
145 | 表 O
146 |
147 | O
148 | 档案 B-Chunk
149 | 管理 E-Chunk
150 | : O
151 | 能 O
152 | 独立 O
153 | 制定 O
154 | 仓库 B-Chunk
155 | 管理 E-Chunk
156 | 文档 O
157 | , O
158 | 专人 O
159 | 负责 O
160 | 仓库 O
161 | 资料 O
162 | 的 O
163 | 更新 O
164 | 归档 O
165 | 并 O
166 | 定期 O
167 | 检查 O
168 |
169 | 电子 B-Chunk
170 | 技术 E-Chunk
171 | / O
172 | 半导体 O
173 | / O
174 | 集成电路 O
175 |
176 | 手机 B-Chunk
177 | 射频 I-Chunk
178 | 信号 I-Chunk
179 | 测试 E-Chunk
180 |
181 | 在 O
182 | 金源 B-Chunk
183 | 集团 E-Chunk
184 | 的 O
185 | 世纪城 O
186 | 三期 O
187 | 担任 O
188 | 置业 B-Chunk
189 | 顾问 E-Chunk
190 | , O
191 | 负责 O
192 | 房地产 B-Chunk
193 | 销售 E-Chunk
194 |
195 | 根据 O
196 | 用户 O
197 | 的 O
198 | 反馈 O
199 | 以及 O
200 | 运营 O
201 | 数据 O
202 | 的 O
203 | 分析 O
204 | , O
205 | 绘制 O
206 | 并 O
207 | 撰写 O
208 | 新版本 O
209 | 的 O
210 | 原型 B-Chunk
211 | 图 E-Chunk
212 | 和 O
213 | prd O
214 | 文 O
215 | 档 O
216 |
217 | 根据 O
218 | 各 O
219 | 渠道 O
220 | 的 O
221 | 平台 B-Chunk
222 | 要求 E-Chunk
223 | , O
224 | 定制 O
225 | 个性化 B-Chunk
226 | 产品 E-Chunk
227 | , O
228 | 并 O
229 | 负责 O
230 | 所 O
231 | 发布 O
232 | 版本 O
233 | 的 O
234 | 跟踪 O
235 | 管理 O
236 |
237 | O
238 | 2005 O
239 | / O
240 | 12 O
241 | -- O
242 | 2006 O
243 | / O
244 | 05 O
245 | : O
246 | 广州 O
247 | 南沣 O
248 | 电子 B-Chunk
249 | 有限公司 E-Chunk
250 | O
251 | 所属 O
252 | 行业 O
253 | : O
254 | O
255 | 电子 B-Chunk
256 | 技术 E-Chunk
257 | / O
258 | 半导体 O
259 | O
260 | 技术 O
261 | 部 O
262 | O
263 | 维修 B-Chunk
264 | 技术 I-Chunk
265 | 员 E-Chunk
266 | O
267 | 负责 O
268 | 调试 O
269 | 与 O
270 | 维修 O
271 | 线切割 B-Chunk
272 | 机床 E-Chunk
273 | 控制器 O
274 | O
275 | 2006 O
276 | / O
277 | 03 O
278 | O
279 | 至今 O
280 | : O
281 | 中国 O
282 | 电器 B-Chunk
283 | 科学研究 E-Chunk
284 | 园 O
285 | O
286 | 擎天 B-Chunk
287 | 实业 I-Chunk
288 | 有限公司 E-Chunk
289 | O
290 | 所属 O
291 | 行业 O
292 | : O
293 | 电器 O
294 | , O
295 | 电子 O
296 | O
297 | 技术 O
298 | 部 O
299 | O
300 | 维修 B-Chunk
301 | 技术员 E-Chunk
302 |
303 | O
304 | 南京 B-Chunk
305 | 项目 E-Chunk
306 | 先期 O
307 | 股权 B-Chunk
308 | 融资 E-Chunk
309 | 置换 O
310 |
311 | 每月 O
312 | 核对 O
313 | 及 O
314 | 结算 B-Chunk
315 | 银行 I-Chunk
316 | 流水账 E-Chunk
317 | 和 O
318 | 现金 B-Chunk
319 | 日记账 E-Chunk
320 | , O
321 | 做到 O
322 | 账 O
323 | 实 O
324 | 相符 O
325 | , O
326 | 出具 O
327 | 汇总 O
328 | 对账 B-Chunk
329 | 表 E-Chunk
330 |
331 |
--------------------------------------------------------------------------------
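When basic_token is set to 'char' (as in reproduction/noun_phrases_detect.py), word-level rows like the ones above are converted with word2char from utilities.py: B/I/E chunk labels are re-assigned to the characters and optional 'seg' tags mark word boundaries. A small sketch; the commented outputs are what the logic above implies.

from nlp_toolkit.utilities import word2char

out = word2char(['工地', '师傅', '一起'], ['B-Chunk', 'E-Chunk', 'O'],
                task_type='sequence_labeling', use_seg=True)
print(out['token'])  # ('工', '地', '师', '傅', '一', '起')
print(out['label'])  # ('B-Chunk', 'I-Chunk', 'I-Chunk', 'E-Chunk', 'O', 'O')
print(out['seg'])    # ('B', 'E', 'B', 'E', 'B', 'E')
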
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | # runtime dependencies are listed explicitly below; the editable keras-contrib line in
4 | # requirements.txt cannot be expressed as an install_requires entry
5 | long_description = open('README.md', encoding='utf-8').read()
6 |
7 | REQUIREMENTS = ['seqeval>=0.0.5', 'Keras>=2.2.4',
8 | 'tensorflow>=1.9.0', 'jieba>=0.39',
9 | 'numpy>=1.14.3', 'scikit-learn>=0.19.1',
10 | 'hanziconv>=0.3.2', 'ruamel.yaml>=0.15.81']
11 |
12 | setup(
13 | name='nlp_toolkit',
14 | version='1.3.2',
15 | description='NLP Toolkit with easy model training and applications',
16 | long_description=long_description,
17 | long_description_content_type='text/markdown',
18 | author='yilei.wang',
19 | author_email='stevewyl@163.com',
20 | license='MIT',
21 | install_requires=REQUIREMENTS,
22 |     extras_require={
23 | 'tensorflow_gpu': ['tensorflow-gpu>=1.10.0'],
24 | 'GPUtil': ['GPUtil>=1.3.0'],
25 | },
26 | python_requires='>=3.6',
27 | packages=find_packages(),
28 | package_data={'nlp_toolkit': ['data/*.txt']},
29 | include_package_data=True,
30 | url='https://github.com/stevewyl/nlp_toolkit',
31 | classifiers=[
32 | 'Programming Language :: Python :: 3.6',
33 | 'License :: OSI Approved :: MIT License',
34 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
35 | ],
36 | keywords='nlp keras text classification sequence labeling',
37 | )
38 |
--------------------------------------------------------------------------------