├── .gitignore ├── LICENSE ├── README.md ├── dicts ├── core_char.dict ├── idiom.dict ├── ns.dict └── stop_words.dict ├── models ├── cws_label.txt ├── idiom_dat.bin ├── model_c_label.txt ├── ns_dat.bin ├── stop_dat.bin └── t2s.dat ├── pom.xml └── src ├── main └── java │ └── io │ └── github │ └── yizhiru │ └── thulac4j │ ├── POSTagger.java │ ├── SPChineseTokenizer.java │ ├── Segmenter.java │ ├── common │ ├── DoubleArrayTrie.java │ └── Nullable.java │ ├── perceptron │ ├── StructuredPerceptronClassifier.java │ └── StructuredPerceptronModel.java │ ├── process │ ├── LexiconCementer.java │ ├── RuleAnnotator.java │ └── SpecifiedWordCementer.java │ ├── term │ ├── AnnotatedTerms.java │ ├── CharType.java │ ├── POC.java │ └── TokenItem.java │ └── util │ ├── CharUtils.java │ ├── ChineseUtils.java │ ├── IOUtils.java │ └── ModelPaths.java └── test └── java └── io └── github └── yizhiru └── thulac4j ├── POSTaggerTest.java ├── SPChineseTokenizerTest.java ├── SegmenterTest.java ├── common └── DoubleArrayTrieTest.java ├── perceptron └── StructuredPerceptronModelTest.java ├── process ├── LexiconCementerTest.java ├── RuleAnnotatorTest.java └── SpecifiedWordCementerTest.java ├── term └── POCTest.java └── util ├── CharUtilsTest.java ├── ChineseUtilsTest.java └── IOUtilsTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target/ 4 | models/cws_dat.bin 5 | models/cws_model.bin 6 | models/model_c_dat.bin 7 | models/model_c_model.bin 8 | train/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {jyzheng} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # thulac4j 2 | 3 | thulac4j是[THULAC](http://thulac.thunlp.org/)的高效Java 8实现,具有分词速度快、准、强的特点;支持 4 | 5 | - 自定义词典 6 | - 繁体转简体 7 | - 停用词过滤 8 | 9 | 10 | ## 使用示例 11 | 12 | 在项目中使用thulac4j,添加依赖(请使用最新版本): 13 | 14 | ```xml 15 | 16 | io.github.yizhiru 17 | thulac4j 18 | 3.1.2 19 | 20 | ``` 21 | 22 | thulac4j支持中文分词与词性标注,使用示例如下: 23 | 24 | 25 | ```java 26 | String sentence = "滔滔的流水,向着波士顿湾无声逝去"; 27 | List words = Segmenter.segment(sentence); 28 | // [滔滔, 的, 流水, ,, 向着, 波士顿湾, 无声, 逝去] 29 | 30 | POSTagger pos = new POSTagger("models/model_c_model.bin", "models/model_c_dat.bin"); 31 | List words = pos.tagging(sentence); 32 | // [滔滔/a, 的/u, 流水/n, ,/w, 向着/p, 波士顿湾/ns, 无声/v, 逝去/v] 33 | ``` 34 | 35 | 模型数据较大,没有放在jar包与源码。训练模型下载及更多使用说明,请参看[Wiki](https://github.com/yizhiru/thulac4j/wiki). 36 | 37 | 38 | 最后感谢THUNLP实验室! 
39 | 40 | 41 | -------------------------------------------------------------------------------- /dicts/stop_words.dict: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | $ 5 | % 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | . 15 | / 16 | 0 17 | 1 18 | 2 19 | 3 20 | 4 21 | 5 22 | 6 23 | 7 24 | 8 25 | 9 26 | : 27 | ; 28 | < 29 | = 30 | > 31 | ? 32 | @ 33 | A 34 | [ 35 | \ 36 | ] 37 | ^ 38 | _ 39 | ` 40 | | 41 | ~ 42 | · 43 | — 44 | ‘ 45 | ’ 46 | “ 47 | ” 48 | … 49 | 、 50 | 。 51 | 〈 52 | 〉 53 | 《 54 | 》 55 | ︿ 56 | , 57 | ! 58 | # 59 | $ 60 | % 61 | & 62 | ( 63 | ) 64 | * 65 | + 66 | , 67 | 0 68 | 1 69 | 2 70 | 3 71 | 4 72 | 5 73 | 6 74 | 7 75 | 8 76 | 9 77 | : 78 | ; 79 | < 80 | > 81 | ? 82 | @ 83 | [ 84 | ] 85 | { 86 | | 87 | } 88 | ~ 89 | ¥ 90 | 『 91 | 』 92 | 【 93 | 】 94 | 〖 95 | 〗 96 | 「 97 | 」 98 | → 99 | ‖ 100 | º 101 | 造就 102 | 等到 103 | 其人 104 | 看得到 105 | 辅以 106 | 就是 107 | 谁知 108 | 看见 109 | 不顾 110 | 提出 111 | 举办 112 | 总能 113 | 比起 114 | 哪知 115 | 含有 116 | 接着 117 | 怎么回事 118 | 看出 119 | 此地 120 | 一手 121 | 发出 122 | 看得出 123 | 乃是 124 | 富于 125 | 来看 126 | 不肯 127 | 有利 128 | 回来 129 | 经由 130 | 加以 131 | 不如说 132 | 看不见 133 | 走去 134 | 有所 135 | 离不开 136 | 要知道 137 | 正当 138 | 接下来 139 | 为止 140 | 实行 141 | 有一次 142 | 做好 143 | 等于 144 | 看成 145 | 在于 146 | 提到 147 | 无所 148 | 开来 149 | 过来 150 | 没想 151 | 想不到 152 | 看到 153 | 近乎 154 | 包括 155 | 不想 156 | 饱受 157 | 怎么办 158 | 同在 159 | 回去 160 | 不能 161 | 诸如 162 | 可以说 163 | 什么样 164 | 收有 165 | 出来 166 | 一身 167 | 不甘 168 | 进一步 169 | 留给 170 | 共同 171 | 听来 172 | 听起来 173 | 还要 174 | 不够 175 | 仅仅是 176 | 分成 177 | 带到 178 | 如下 179 | 面对 180 | 所有 181 | 方面 182 | 不失为 183 | 怎会 184 | 终于 185 | 看起来 186 | 不失 187 | 能为 188 | 谈及 189 | 以期 190 | 号称 191 | 取决于 192 | 无人 193 | 一行人 194 | 想得到 195 | 不愿 196 | 可以 197 | 来得 198 | 想来 199 | 起来 200 | 来讲 201 | 听得 202 | 所在 203 | 迫使 204 | 几经 205 | 只得 206 | 位于 207 | 不免 208 | 做出 209 | 听完 210 | 仅有 211 | 有的人 212 | 时候 213 | 本身 214 | 可看 215 | 来去 216 | 做成 217 | 不敢 218 | 出现 219 | 感到 220 | 面向 221 
| 分为 222 | 身为 223 | 本人 224 | 相处 225 | 这里 226 | 这种 227 | 当时 228 | 出去 229 | 仍是 230 | 遍及 231 | 引起 232 | 更具 233 | 来过 234 | 搞好 235 | 未有 236 | 显得 237 | 当成 238 | 即是 239 | 遭受 240 | 当上 241 | 做到 242 | 不如 243 | 纳入 244 | 不要 245 | 来说 246 | 不料 247 | 适合于 248 | 却是 249 | 变成 250 | 受到 251 | 之初 252 | 展开 253 | 向着 254 | 抓好 255 | 还是 256 | 上下 257 | 得出 258 | 宛如 259 | 皆有 260 | 跟着 261 | 予以 262 | 现有 263 | 哪能 264 | 一体 265 | 所得 266 | 有着 267 | 一块 268 | 开展 269 | 这个 270 | 这般 271 | 道来 272 | 推向 273 | 变为 274 | 一面 275 | 怎么一回事 276 | 直至 277 | 得到 278 | 从事 279 | 相关 280 | 归于 281 | 算是 282 | 带给 283 | 并用 284 | 不无 285 | 历尽 286 | 四处 287 | 不出 288 | 亦即 289 | 不已 290 | 引出 291 | 才是 292 | 利于 293 | 结成 294 | 一定 295 | 不下 296 | 此类 297 | 怎知 298 | 看着 299 | 情况 300 | 这么 301 | 看似 302 | 同样 303 | 想尽 304 | 带有 305 | 分开 306 | 对应 307 | 化成 308 | 直到 309 | 哪敢 310 | 不论是 311 | 看来 312 | 更是 313 | 是不是 314 | 后者 315 | 看作 316 | 得了 317 | 举行 318 | 叫做 319 | 除去 320 | 提供 321 | 结为 322 | 不到 323 | 不是 324 | 带着 325 | 说起来 326 | 可知 327 | 去到 328 | 所谓 329 | 说来 330 | 造成 331 | 怎样 332 | 请看 333 | 犹如 334 | 不乏 335 | 度过 336 | 化为 337 | 看完 338 | 既定 339 | 带来 340 | 以求 341 | 样子 342 | 提及 343 | 四起 344 | 属于 345 | 一开始 346 | 掀起 347 | 好比 348 | 那是 349 | 象是 350 | 亦可 351 | 处于 352 | 达成 353 | 可谓 354 | 还给 355 | 自身 356 | 看过 357 | 打下 358 | 作出 359 | 奉为 360 | 极具 361 | 看去 362 | 附近 363 | 还有 364 | 比较 365 | 达到 366 | 列入 367 | 得以 368 | 成为 369 | 哪里 370 | 限于 371 | 此处 372 | 不应 373 | 将要 374 | 勾起 375 | 没人 376 | 哪知道 377 | 充满 378 | 多方面 379 | 有可能 380 | 一样 381 | 称为 382 | 一行 383 | 怎么说 384 | 别看 385 | 据说 386 | 自有 387 | 使出 388 | 早在 389 | 作为 390 | 实为 391 | 只能 392 | 一道 393 | 便是 394 | 到了 395 | 没想到 396 | 当作 397 | 争取 398 | 之余 399 | 用于 400 | 围绕 401 | 为什么 402 | 做得 403 | 这次 404 | 何在 405 | 原是 406 | 尽在 407 | 随着 408 | 没有 409 | 各方面 410 | 哪个 411 | 取得 412 | 相应 413 | 上来 414 | 称得上 415 | 更有 416 | 看尽 417 | 直指 418 | 看做 419 | 怎能 420 | 不会 421 | 充当 422 | 便于 423 | 促成 424 | 藉此 425 | 有必要 426 | 不休 427 | 处在 428 | 前来 429 | 用以 430 | 下来 431 | 表明 432 | 不怎么样 433 | 给予 434 | 如同 435 | 左右 436 | 列出 437 | 彷佛 438 | 该怎么办 439 | 或是 440 | 即可 
441 | 经过 442 | 受过 443 | 特别 444 | 只要 445 | 或者是 446 | 可能 447 | 形成 448 | 经受 449 | 东西 450 | 不住 451 | 至于 452 | 称之为 453 | 怎奈 454 | 看上去 455 | 上去 456 | 无法 457 | 快要 458 | 引来 459 | 进来 460 | 不止 461 | 采取 462 | 应有 463 | 有别于 464 | 前去 465 | 认为 466 | 列为 467 | 化作 468 | 这边 469 | 下去 470 | 此时 471 | 未能 472 | 听见 473 | 正是 474 | 想见 475 | 不得 476 | 会有 477 | 来自 478 | 上述 479 | 关乎 480 | 过上 481 | 用来 482 | 应当 483 | 应该说 484 | 整个 485 | 出自 486 | 一头 487 | 到来 488 | 竟是 489 | 论及 490 | 不容 491 | 怎料 492 | 为主 493 | 一系列 494 | 运用 495 | 本想 496 | 合乎 497 | 配有 498 | 进去 499 | 前者 500 | 不及 501 | 何谓 502 | 在内 503 | 引发 504 | 毫无 505 | 相当于 506 | 推出 507 | 例如 508 | 加上 509 | 同时 510 | 发生 511 | 及时 512 | 去过 513 | 相对于 514 | 来到 515 | 双方 516 | 不忍 517 | 依靠 518 | 想出 519 | 层面 520 | 当做 521 | 涉及 522 | 又是 523 | 遭到 524 | 就要 525 | 不只是 526 | 什么 527 | 有关 528 | 譬如 529 | 起到 530 | 不可 531 | 一如 532 | 或许是 533 | 听到 534 | 不说 535 | 广为 536 | 想到 537 | 有如 538 | 之类 539 | 感觉到 540 | 无关 541 | 不怕 542 | 极有 543 | 这么回事 544 | 身处 545 | 并存 546 | 此事 547 | 提起 548 | 而是 549 | 不少 550 | 一方 551 | 做法 552 | 不堪 553 | 一句话 554 | 也是 555 | 备受 556 | 特有 557 | 进行 558 | 至此 559 | 力图 560 | 发起 561 | 能够 562 | 相比 563 | 并不是 564 | 不行 565 | 必需 566 | 凭借 567 | 均为 568 | 不尽 569 | 实现 570 | 应该 571 | 此人 572 | 相反 573 | 出于 574 | 另有 575 | 感受到 576 | 怎么 577 | 使得 578 | 介入 579 | 带入 580 | 一方面 581 | 以为 582 | 至极 583 | 养成 584 | 多日 585 | 以前 586 | 日前 587 | 日子 588 | 前一天 589 | 时刻 590 | 大前 591 | 先前 592 | 目前 593 | 终日 594 | 当下 595 | 一会 596 | 现今 597 | 每月 598 | 半天 599 | 成天 600 | 今度 601 | 多时 602 | 个月 603 | 某日 604 | 几时 605 | 后来 606 | 一天 607 | 有的时候 608 | 当初 609 | 一个 610 | 那一天 611 | 近日 612 | 近年来 613 | 此后 614 | 以来 615 | 之后 616 | 而今 617 | 一刻 618 | 时年 619 | 以往 620 | 历年来 621 | 从前 622 | 每天 623 | 当天 624 | 十数年 625 | 眼下 626 | 现时 627 | 其时 628 | 一会儿 629 | 忽然间 630 | 当前 631 | 多年来 632 | 其后 633 | 一晚 634 | 这时候 635 | 原初 636 | 现下 637 | 某天 638 | 此刻 639 | 不久前 640 | 多久 641 | 前夕 642 | 此前 643 | 每晚 644 | 现世 645 | 之前 646 | 前后 647 | 会儿 648 | 没多久 649 | 往日 650 | 同一天 651 | 尔后 652 | 早先 653 | 前一刻 654 | 如今 655 | 现如今 656 | 往后 657 | 每年 658 | 当年 659 
| 今后 660 | 转眼间 661 | 一时间 662 | 多年 663 | 顷刻间 664 | 起初 665 | 许久 666 | 起先 667 | 来年 668 | 在此之前 669 | 近来 670 | 是时 671 | 日日 672 | 稍后 673 | 往常 674 | 期间 675 | 晚近 676 | 数小时 677 | 以后 678 | 日后 679 | 已往 680 | 他日 681 | 先后 682 | 在此期间 683 | 不久 684 | 近年 685 | 时下 686 | 两年 687 | 前不久 688 | 哪一天 689 | 当今 690 | 很早以前 691 | 最近 692 | 早些 693 | 同年 694 | 万世 695 | 一日 696 | 二十年 697 | 近些年 698 | 这一刻 699 | 彼时 700 | 于今 701 | 这些年 702 | 每日 703 | 往年 704 | 一时 -------------------------------------------------------------------------------- /models/cws_label.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 2 3 | 3 4 | 1 5 | -------------------------------------------------------------------------------- /models/idiom_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/idiom_dat.bin -------------------------------------------------------------------------------- /models/model_c_label.txt: -------------------------------------------------------------------------------- 1 | 0v 2 | 2v 3 | 3p 4 | 0n 5 | 2n 6 | 3v 7 | 1n 8 | 3w 9 | 0ns 10 | 1ns 11 | 2ns 12 | 0t 13 | 1t 14 | 2t 15 | 0f 16 | 2f 17 | 0d 18 | 2d 19 | 3f 20 | 3u 21 | 1v 22 | 0m 23 | 1m 24 | 2m 25 | 0q 26 | 2q 27 | 0r 28 | 2r 29 | 0j 30 | 1j 31 | 2j 32 | 0s 33 | 2s 34 | 3a 35 | 3c 36 | 3g 37 | 3m 38 | 3q 39 | 3d 40 | 3n 41 | 0a 42 | 2a 43 | 0id 44 | 1id 45 | 2id 46 | 3r 47 | 0ni 48 | 1ni 49 | 2ni 50 | 0p 51 | 2p 52 | 0c 53 | 1c 54 | 2c 55 | 0np 56 | 1np 57 | 2np 58 | 3j 59 | 1d 60 | 3np 61 | 1a 62 | 3x 63 | 0nz 64 | 2nz 65 | 1nz 66 | 0w 67 | 1w 68 | 2w 69 | 0u 70 | 2u 71 | 1q 72 | 1s 73 | 3k 74 | 1f 75 | 3o 76 | 0o 77 | 2o 78 | 1r 79 | 0x 80 | 1x 81 | 2x 82 | 3e 83 | 3h 84 | 3t 85 | 1o 86 | 1p 87 | 0e 88 | 1e 89 | 2e 90 | 3ni 91 | 3s 92 | 3nz 93 | 1u 94 | 0k 95 | 1k 96 | 2k 97 | -------------------------------------------------------------------------------- 
/models/ns_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/ns_dat.bin -------------------------------------------------------------------------------- /models/stop_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/stop_dat.bin -------------------------------------------------------------------------------- /models/t2s.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/t2s.dat -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.sonatype.oss 6 | oss-parent 7 | 7 8 | 9 | 10 | io.github.yizhiru 11 | thulac4j 12 | 3.1.2 13 | jar 14 | 15 | thulac4j 16 | https://github.com/yizhiru/thulac4j 17 | Java implementation of THULAC. 
18 | 19 | 20 | UTF-8 21 | 4.13.1 22 | 1.7.3 23 | 24 | 25 | 26 | 27 | The Apache Software License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | repo 30 | 31 | 32 | 33 | 34 | 35 | yizhiru 36 | Zheng Jiangyu 37 | j.y.zheng@qq.com 38 | 39 | 40 | 41 | scm:git:git@github.com:yizhiru/thulac4j.git 42 | scm:git:git@github.com:yizhiru/thulac4j.git 43 | git@github.com:yizhiru/thulac4j.git 44 | 45 | 46 | 47 | 48 | junit 49 | junit 50 | ${junit.version} 51 | test 52 | 53 | 54 | org.powermock 55 | powermock-module-junit4 56 | ${powermock.version} 57 | test 58 | 59 | 60 | org.powermock 61 | powermock-api-easymock 62 | ${powermock.version} 63 | test 64 | 65 | 66 | 67 | 68 | src/main/java 69 | src/test/java 70 | 71 | 72 | ./ 73 | 74 | models/*label.txt 75 | models/cws* 76 | models/*dat.bin 77 | models/t2s.dat 78 | dicts/core_char.dict 79 | 80 | 81 | models/model_c_dat.bin 82 | 83 | 84 | 85 | 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-compiler-plugin 90 | 3.1 91 | 92 | 1.8 93 | 1.8 94 | ${project.build.sourceEncoding} 95 | 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-surefire-plugin 100 | 2.12.4 101 | 102 | once 103 | -Dfile.encoding=UTF-8 104 | 105 | 106 | 107 | org.apache.maven.plugins 108 | maven-source-plugin 109 | 2.1.2 110 | 111 | 112 | package 113 | 114 | jar-no-fork 115 | 116 | 117 | 118 | 119 | 120 | models/ 121 | 122 | 123 | 124 | 125 | org.apache.maven.plugins 126 | maven-javadoc-plugin 127 | 2.9.1 128 | 129 | UTF-8 130 | UTF-8 131 | 132 | 133 | 134 | package 135 | 136 | jar 137 | 138 | 139 | 140 | 141 | 142 | org.apache.maven.plugins 143 | maven-gpg-plugin 144 | 145 | 146 | verify 147 | 148 | sign 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | release 160 | 161 | 162 | oss 163 | https://oss.sonatype.org/content/repositories/snapshots/ 164 | 165 | 166 | oss 167 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 168 | 169 | 170 | 171 | 172 | 173 | 
-------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/POSTagger.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | import static io.github.yizhiru.thulac4j.util.ModelPaths.POS_TAGGING_LABEL_PATH; 10 | 11 | /** 12 | * 中文词性标注. 13 | */ 14 | public class POSTagger extends SPChineseTokenizer { 15 | 16 | public POSTagger(String weightPath, String featurePath) throws IOException { 17 | super(new FileInputStream(weightPath), 18 | new FileInputStream(featurePath), 19 | POSTagger.class.getResourceAsStream(POS_TAGGING_LABEL_PATH)); 20 | } 21 | 22 | /** 23 | * 词性标注 24 | * 25 | * @param text 输入句子 26 | * @return 词与词性结对结果 27 | */ 28 | public List tagging(String text) { 29 | return tokenize(text); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/SPChineseTokenizer.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | import io.github.yizhiru.thulac4j.util.ModelPaths; 5 | import io.github.yizhiru.thulac4j.common.Nullable; 6 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronClassifier; 7 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel; 8 | import io.github.yizhiru.thulac4j.process.RuleAnnotator; 9 | import io.github.yizhiru.thulac4j.process.LexiconCementer; 10 | import io.github.yizhiru.thulac4j.process.SpecifiedWordCementer; 11 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 12 | import io.github.yizhiru.thulac4j.term.TokenItem; 13 | import io.github.yizhiru.thulac4j.util.ChineseUtils; 14 | 15 | 
import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.util.ArrayList; 18 | import java.util.LinkedList; 19 | import java.util.List; 20 | 21 | import static io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel.PocMark.*; 22 | 23 | public class SPChineseTokenizer { 24 | 25 | /** 26 | * 结构感知器模型 27 | */ 28 | private StructuredPerceptronClassifier classifier; 29 | 30 | /** 31 | * 前向Label 二维数组 32 | */ 33 | protected int[][] previousTrans; 34 | 35 | /** 36 | * 地名 ns 词典黏结. 37 | */ 38 | public final LexiconCementer nsCementer; 39 | 40 | /** 41 | * 习语 idiom 词典黏结. 42 | */ 43 | public final LexiconCementer idiomCementer; 44 | 45 | /** 46 | * 自定义词典,可为null 47 | */ 48 | @Nullable 49 | protected LexiconCementer uwCementer = null; 50 | 51 | private static final class Config { 52 | 53 | /** 54 | * 是否开启黏结书名号内的词. 55 | */ 56 | private static boolean isEnableTileWord = false; 57 | 58 | /** 59 | * 是否开启停用词过滤 60 | */ 61 | private static boolean isEnableFilterStopWords = false; 62 | 63 | /** 64 | * 是否开启转简体中文 65 | */ 66 | private static boolean isEnableConvertToSimplifiedCHN = false; 67 | 68 | } 69 | 70 | SPChineseTokenizer(InputStream weightInput, InputStream featureInput, InputStream labelInput) { 71 | try { 72 | this.classifier = new StructuredPerceptronClassifier( 73 | new StructuredPerceptronModel(weightInput, featureInput, labelInput)); 74 | this.nsCementer = new LexiconCementer( 75 | this.getClass().getResourceAsStream(ModelPaths.NS_BIN_PATH), "ns"); 76 | this.idiomCementer = new LexiconCementer( 77 | this.getClass().getResourceAsStream(ModelPaths.IDIOM_BIN_PATH), "i"); 78 | } catch (IOException e) { 79 | throw new RuntimeException(e); 80 | } 81 | this.previousTrans = setPreviousTransitions(classifier.getLabelValues()); 82 | } 83 | 84 | /** 85 | * Label 前向转移图 86 | * 87 | * @param labelValues label值 88 | * @return 前向转移二维数组,每行表示该label的所有前向label 89 | */ 90 | private int[][] setPreviousTransitions(String[] labelValues) { 91 | int labelSize = 
labelValues.length; 92 | List> labelTransitions = new ArrayList<>(); 93 | for (int i = 0; i < labelSize; i++) { 94 | labelTransitions.add(new LinkedList<>()); 95 | } 96 | for (int cur = 0; cur < labelSize; cur++) { 97 | for (int pre = 0; pre < labelSize; pre++) { 98 | String curString = labelValues[cur]; 99 | String preString = labelValues[pre]; 100 | char curPoc = curString.charAt(0); 101 | char prePoc = preString.charAt(0); 102 | // 如果有相同词性或者不带词性,按转移规则进行转移 103 | if (curString.substring(1).equals(preString.substring(1))) { 104 | // B 前面只能是E 或S 105 | if (curPoc == POS_B_CHAR) { 106 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 107 | labelTransitions.get(cur).add(pre); 108 | } 109 | } 110 | // M 前面只能是M 或 B 111 | else if (curPoc == POS_M_CHAR) { 112 | if (prePoc == POS_M_CHAR || prePoc == POS_B_CHAR) { 113 | labelTransitions.get(cur).add(pre); 114 | } 115 | } 116 | // E 前面只能是B 或 M 117 | else if (curPoc == POS_E_CHAR) { 118 | if (prePoc == POS_B_CHAR || prePoc == POS_M_CHAR) { 119 | labelTransitions.get(cur).add(pre); 120 | } 121 | } 122 | // S 前面只能是E 或 S 123 | else if (curPoc == POS_S_CHAR) { 124 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 125 | labelTransitions.get(cur).add(pre); 126 | } 127 | } 128 | } 129 | // 如果带有词性并且前后词性不相同,那么则按规则 130 | // B 前面只能是E 或S,S 前面只能是E 或S 进行转移 131 | else if (curString.length() > 1) { 132 | if (curPoc == POS_B_CHAR || curPoc == POS_S_CHAR) { 133 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 134 | labelTransitions.get(cur).add(pre); 135 | } 136 | } 137 | } 138 | } 139 | } 140 | // 将List 转成二维数组 141 | int[][] previousTrans = new int[labelSize][]; 142 | for (int i = 0; i < labelSize; i++) { 143 | previousTrans[i] = new int[labelTransitions.get(i).size()]; 144 | for (int j = 0; j < labelTransitions.get(i).size(); j++) { 145 | previousTrans[i][j] = labelTransitions.get(i).get(j); 146 | } 147 | } 148 | return previousTrans; 149 | } 150 | 151 | /** 152 | * 序列标注分词 153 | * 154 | * @param text 输入文本 155 | * @return 序列标注结果 
156 | */ 157 | public List tokenize(String text) { 158 | List tokenItems = new ArrayList<>(); 159 | if (text.length() == 0) { 160 | return tokenItems; 161 | } 162 | 163 | AnnotatedTerms annotatedTerms; 164 | // 若开启转简体 165 | if (Config.isEnableConvertToSimplifiedCHN) { 166 | String simplifiedSentence = ChineseUtils.simplified(text); 167 | annotatedTerms = RuleAnnotator.annotate(simplifiedSentence, Config.isEnableTileWord); 168 | } else { 169 | annotatedTerms = RuleAnnotator.annotate(text, Config.isEnableTileWord); 170 | } 171 | if (annotatedTerms.isEmpty()) { 172 | return tokenItems; 173 | } 174 | 175 | int[] labels = classifier.classify(annotatedTerms, previousTrans); 176 | 177 | char[] rawChars = annotatedTerms.getPreAnnotateChars(); 178 | String[] labelValues = classifier.getLabelValues(); 179 | for (int i = 0, offset = 0; i < rawChars.length; i++) { 180 | String label = labelValues[labels[i]]; 181 | char pocChar = label.charAt(0); 182 | if (pocChar == POS_E_CHAR || pocChar == POS_S_CHAR) { 183 | String word = new String(rawChars, offset, i + 1 - offset); 184 | if (label.length() >= 2) { 185 | tokenItems.add(new TokenItem(word, label.substring(1))); 186 | } else { 187 | tokenItems.add(new TokenItem(word, null)); 188 | } 189 | offset = i + 1; 190 | } 191 | } 192 | // 若开启停用词过滤 193 | if (Config.isEnableFilterStopWords) { 194 | filterStopWords(tokenItems); 195 | } 196 | // 地名词典黏结 197 | nsCementer.cement(tokenItems); 198 | // 习语词典黏结 199 | idiomCementer.cement(tokenItems); 200 | // 特定词语黏结 201 | SpecifiedWordCementer.cementWord(tokenItems); 202 | if (uwCementer != null) { 203 | uwCementer.cement(tokenItems); 204 | } 205 | return tokenItems; 206 | } 207 | 208 | /** 209 | * 添加自定义词典 210 | * 211 | * @param words 词典 212 | */ 213 | public void addUserWords(List words) { 214 | DoubleArrayTrie dat = DoubleArrayTrie.make(words); 215 | this.uwCementer = new LexiconCementer(dat, "uw"); 216 | } 217 | 218 | /** 219 | * 开启书名单独成词 220 | */ 221 | public void enableTitleWord() { 222 | 
Config.isEnableTileWord = true; 223 | } 224 | 225 | /** 226 | * 开启停用词过滤 227 | */ 228 | public void enableFilterStopWords() { 229 | Config.isEnableFilterStopWords = true; 230 | } 231 | 232 | /** 233 | * 开启转简写 234 | */ 235 | public void enableConvertToSimplifiedCHN() { 236 | Config.isEnableConvertToSimplifiedCHN = true; 237 | } 238 | 239 | /** 240 | * 过滤停用词 241 | * 242 | * @param tokenItems 解码结果 243 | */ 244 | private void filterStopWords(List tokenItems) { 245 | for (int i = 0; i < tokenItems.size(); ) { 246 | if (ChineseUtils.isStopWord(tokenItems.get(i).word)) { 247 | tokenItems.remove(i); 248 | } else { 249 | i++; 250 | } 251 | } 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/Segmenter.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import java.util.List; 4 | import java.util.stream.Collectors; 5 | 6 | import static io.github.yizhiru.thulac4j.util.ModelPaths.*; 7 | 8 | /** 9 | * 中文分词. 
10 | */ 11 | public final class Segmenter { 12 | 13 | private static final SPChineseTokenizer TOKENIZER = new SPChineseTokenizer( 14 | Segmenter.class.getResourceAsStream(SEGMENTER_WEIGHT_PATH), 15 | Segmenter.class.getResourceAsStream(SEGMENTER_FEATURE_PATH), 16 | Segmenter.class.getResourceAsStream(SEGMENTER_LABEL_PATH)); 17 | 18 | /** 19 | * 中文分词 20 | * 21 | * @param text 待分词文本 22 | * @return 分词结果 23 | */ 24 | public static List segment(String text) { 25 | return TOKENIZER.tokenize(text) 26 | .stream() 27 | .map(item -> (item.word)) 28 | .collect(Collectors.toList()); 29 | } 30 | 31 | /** 32 | * 添加自定义词典 33 | * 34 | * @param words 词典 35 | */ 36 | public static void addUserWords(List words) { 37 | TOKENIZER.addUserWords(words); 38 | } 39 | 40 | /** 41 | * 开启开启书名单独成词 42 | */ 43 | public static void enableTitleWord() { 44 | TOKENIZER.enableTitleWord(); 45 | } 46 | 47 | /** 48 | * 开启停用词过滤 49 | */ 50 | public static void enableFilterStopWords() { 51 | TOKENIZER.enableFilterStopWords(); 52 | } 53 | 54 | /** 55 | * 开启转简写 56 | */ 57 | public static void enableConvertToSimplifiedCHN() { 58 | TOKENIZER.enableConvertToSimplifiedCHN(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/common/DoubleArrayTrie.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import io.github.yizhiru.thulac4j.util.IOUtils; 4 | 5 | import java.io.*; 6 | import java.nio.ByteBuffer; 7 | import java.nio.ByteOrder; 8 | import java.nio.IntBuffer; 9 | import java.nio.channels.FileChannel; 10 | import java.util.Arrays; 11 | import java.util.LinkedList; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | 15 | /** 16 | * Double Array Trie (DAT). 
17 | */ 18 | public class DoubleArrayTrie implements Serializable { 19 | 20 | private static final long serialVersionUID = 8713857561296693244L; 21 | 22 | public static final int MATCH_FAILURE_INDEX = -1; 23 | 24 | /** 25 | * Base array. 26 | */ 27 | protected int[] baseArray; 28 | 29 | /** 30 | * Check array. 31 | */ 32 | protected int[] checkArray; 33 | 34 | 35 | /** 36 | * The size of DAT. 37 | */ 38 | protected int size; 39 | 40 | public DoubleArrayTrie(int[] baseArray, int[] checkArray) { 41 | if (baseArray.length != checkArray.length) { 42 | throw new IllegalArgumentException(String.format("The getAnnotatedLength of base array %s != the getAnnotatedLength of check " + 43 | "array %s", baseArray.length, checkArray.length)); 44 | } 45 | this.baseArray = baseArray; 46 | this.checkArray = checkArray; 47 | size = baseArray.length; 48 | } 49 | 50 | public DoubleArrayTrie(int[] baseArray, int[] checkArray, int size) { 51 | this.baseArray = Arrays.copyOf(baseArray, size); 52 | this.checkArray = Arrays.copyOf(checkArray, size); 53 | this.size = size; 54 | } 55 | 56 | private DoubleArrayTrie() { 57 | } 58 | 59 | /** 60 | * The size of DAT. 61 | * 62 | * @return size 63 | */ 64 | public int size() { 65 | return size; 66 | } 67 | 68 | /** 69 | * Ensure the index is not out bound. 70 | * 71 | * @param index the index value. 72 | */ 73 | private void ensureValidIndex(int index) { 74 | if (index >= size()) { 75 | throw new RuntimeException(String.format("The index %s is out of bound [%s].", 76 | index, size())); 77 | } 78 | } 79 | 80 | /** 81 | * Get base value by its index. 82 | * 83 | * @param index the index of base array. 84 | * @return the base value. 85 | */ 86 | public int getBaseByIndex(int index) { 87 | ensureValidIndex(index); 88 | return baseArray[index]; 89 | } 90 | 91 | /** 92 | * Get check value by its index. 93 | * 94 | * @param index the index of check array. 95 | * @return the check value. 
96 | */ 97 | public int getCheckByIndex(int index) { 98 | ensureValidIndex(index); 99 | return checkArray[index]; 100 | } 101 | 102 | /** 103 | * 序列化. 104 | * 105 | * @param path 文件路径 106 | */ 107 | public void serialize(String path) throws IOException { 108 | FileChannel channel = new FileOutputStream(path).getChannel(); 109 | ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4 * (2 * size() + 1)); 110 | IntBuffer intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 111 | .asIntBuffer(); 112 | intBuffer.put(size()); 113 | intBuffer.put(baseArray); 114 | intBuffer.put(checkArray); 115 | channel.write(byteBuffer); 116 | channel.close(); 117 | } 118 | 119 | /** 120 | * 加载序列化DAT模型 121 | * 122 | * @param path 文件目录 123 | * @return DAT模型 124 | */ 125 | public static DoubleArrayTrie loadDat(String path) throws IOException { 126 | return loadDat(new FileInputStream(path)); 127 | } 128 | 129 | /** 130 | * 加载序列化DAT模型 131 | * 132 | * @param inputStream 文件输入流 133 | * @return DAT模型 134 | */ 135 | public static DoubleArrayTrie loadDat(InputStream inputStream) { 136 | int[] array; 137 | try { 138 | array = IOUtils.toIntArray(inputStream); 139 | } catch (IOException e) { 140 | throw new RuntimeException(e); 141 | } 142 | int arrayLen = array[0]; 143 | int[] baseArr = Arrays.copyOfRange(array, 1, arrayLen + 1); 144 | int[] checkArr = Arrays.copyOfRange(array, arrayLen + 1, 2 * arrayLen + 1); 145 | return new DoubleArrayTrie(baseArr, checkArr); 146 | } 147 | 148 | /** 149 | * 按照DAT的转移方程进行转移: ROOT_PATH[r] + c = s, check[s] = r 150 | * 151 | * @param prefixIndex 前缀在DAT中的index 152 | * @param charValue 转移字符的int值 153 | * @return 在DAT中的index,若不在则为-1 154 | */ 155 | public int transition(int prefixIndex, int charValue) { 156 | if (prefixIndex < 0 || prefixIndex >= size()) { 157 | return MATCH_FAILURE_INDEX; 158 | } 159 | int index = baseArray[prefixIndex] + charValue; 160 | if (index >= size() || checkArray[index] != prefixIndex) { 161 | return MATCH_FAILURE_INDEX; 162 | } 163 | return 
index; 164 | } 165 | 166 | /** 167 | * 词是否在trie树中 168 | * 169 | * @param word 词 170 | * @return 若存在,则为true 171 | */ 172 | public boolean isWordMatched(String word) { 173 | return isWordMatched(-match(word)); 174 | } 175 | 176 | /** 177 | * 词是否在trie树中 178 | * 179 | * @param matchedIndex 已匹配上词前缀的index 180 | * @return 若存在,则为true 181 | */ 182 | public boolean isWordMatched(int matchedIndex) { 183 | if (matchedIndex <= 0) { 184 | return false; 185 | } 186 | int base = baseArray[matchedIndex]; 187 | return base < size() && checkArray[base] == matchedIndex; 188 | } 189 | 190 | /** 191 | * 前缀是否在trie树中 192 | * 193 | * @param prefix 前缀 194 | * @return 若存在,则为true 195 | */ 196 | public boolean isPrefixMatched(String prefix) { 197 | return match(prefix) < 0; 198 | } 199 | 200 | /** 201 | * 匹配字符串. 202 | * 203 | * @param str 字符串 204 | * @return 若匹配上,则为转移后index的负值;否则,则返回已匹配上的字符数 205 | */ 206 | protected int match(String str) { 207 | return match(0, str); 208 | } 209 | 210 | /** 211 | * 匹配字符串. 212 | * 213 | * @param startIndex DAT的开始index 214 | * @param str 字符串 215 | * @return 若匹配上,则为转移后index的负值;否则,则返回已匹配上的字符数 216 | */ 217 | public int match(int startIndex, String str) { 218 | int index = startIndex; 219 | for (int i = 0; i < str.length(); i++) { 220 | index = transition(index, str.charAt(i)); 221 | if (index == MATCH_FAILURE_INDEX) { 222 | return i; 223 | } 224 | } 225 | return -index; 226 | } 227 | 228 | private static class Builder extends DoubleArrayTrie { 229 | 230 | private static final long serialVersionUID = 1675990036852836829L; 231 | 232 | /** 233 | * 标记可用的base index值. 234 | */ 235 | private int availableBaseIndex; 236 | 237 | /** 238 | * Initial value. 239 | */ 240 | private static final int INITIAL_VALUE = -1; 241 | 242 | private Builder() { 243 | baseArray = new int[]{0}; 244 | checkArray = new int[]{INITIAL_VALUE}; 245 | size = 1; 246 | availableBaseIndex = 0; 247 | } 248 | 249 | /** 250 | * Expand two size. 
251 | */ 252 | private void expand() { 253 | int oldCapacity = size; 254 | int newCapacity = oldCapacity << 1; 255 | baseArray = Arrays.copyOf(baseArray, newCapacity); 256 | Arrays.fill(baseArray, oldCapacity, newCapacity, INITIAL_VALUE); 257 | checkArray = Arrays.copyOf(checkArray, newCapacity); 258 | Arrays.fill(checkArray, oldCapacity, newCapacity, INITIAL_VALUE); 259 | 260 | size = newCapacity; 261 | } 262 | 263 | /** 264 | * Remove useless base and check. 265 | */ 266 | private void shrink() { 267 | for (int i = checkArray.length - 1; i >= 0; i--) { 268 | if (checkArray[i] == INITIAL_VALUE) { 269 | size--; 270 | } else { 271 | break; 272 | } 273 | } 274 | } 275 | 276 | /** 277 | * 找到满足条件的baseIndex 278 | * 279 | * @param children 前缀的后一字符集合 280 | * @return baseIndex 281 | */ 282 | private int findBaseIndex(List children) { 283 | int cSize = children.size(); 284 | for (int bi = availableBaseIndex; ; bi++) { 285 | if (bi == size()) { 286 | expand(); 287 | } 288 | if (cSize > 0) { 289 | while (bi + children.get(cSize - 1) >= size()) { 290 | expand(); 291 | } 292 | } 293 | // baseIndex应满足条件: 294 | // 1. 未被使用 295 | // 2. 
满足所有children跳转到的node也未被使用 296 | if (checkArray[bi] >= 0) { 297 | continue; 298 | } 299 | boolean isValid = true; 300 | for (Integer c : children) { 301 | if (checkArray[bi + c] >= 0) { 302 | isValid = false; 303 | break; 304 | } 305 | } 306 | if (isValid) { 307 | return bi; 308 | } 309 | } 310 | } 311 | 312 | /** 313 | * 插入到Trie树 314 | * 315 | * @param prefixIndex 前缀对应的index 316 | * @param children 前缀的后一字符集合 317 | * @param isWord 前缀是否为词 318 | */ 319 | private void insert(int prefixIndex, List children, boolean isWord) { 320 | int bi = findBaseIndex(children); 321 | baseArray[prefixIndex] = bi; 322 | if (isWord) { 323 | checkArray[bi] = prefixIndex; 324 | availableBaseIndex = bi + 1; 325 | } 326 | for (int c : children) { 327 | baseArray[bi + c] = 0; 328 | checkArray[bi + c] = prefixIndex; 329 | } 330 | } 331 | 332 | /** 333 | * 给定前缀生成后一字符集合 334 | * 335 | * @param sortedLexicon 按字典序排序后的词典 336 | * @param startLexiconIndex 词典开始时的索引位置 337 | * @param prefix 前缀 338 | * @return 后一字符集合 339 | */ 340 | private List generateChildren(List sortedLexicon, 341 | int startLexiconIndex, 342 | String prefix) { 343 | List children = new LinkedList<>(); 344 | int prefixLen = prefix.length(); 345 | for (int i = startLexiconIndex; i < sortedLexicon.size(); i++) { 346 | String word = sortedLexicon.get(i); 347 | // 停止循环条件: 348 | // 1. 词的长度小于前缀长度 349 | // 2. 
词的前缀与给定前缀不一致 350 | if (word.length() < prefixLen 351 | || !word.substring(0, prefixLen).equals(prefix)) { 352 | return children; 353 | } else if (word.length() > prefixLen) { 354 | int charValue = (int) word.charAt(prefixLen); 355 | if (children.isEmpty() || charValue != children.get(children.size() - 1)) { 356 | children.add(charValue); 357 | } 358 | } 359 | } 360 | return children; 361 | } 362 | 363 | /** 364 | * 构建DAT 365 | * 366 | * @param lexicon 词典 367 | * @return 词典对应的DAT 368 | */ 369 | private DoubleArrayTrie build(List lexicon) { 370 | lexicon.sort(String::compareTo); 371 | String word, prefix; 372 | int preIndex; 373 | for (int i = 0; i < lexicon.size(); i++) { 374 | word = lexicon.get(i); 375 | int matched = match(word); 376 | matched = matched < 0 ? word.length() : matched; 377 | for (int j = matched; j <= word.length(); j++) { 378 | prefix = word.substring(0, j); 379 | preIndex = -match(prefix); 380 | List children = generateChildren(lexicon, i, prefix); 381 | insert(preIndex, children, j == word.length()); 382 | } 383 | matched = -match(word); 384 | baseArray[baseArray[matched]] = i; 385 | } 386 | shrink(); 387 | return new DoubleArrayTrie(baseArray, checkArray, size); 388 | } 389 | } 390 | 391 | /** 392 | * Make DAT. 393 | * 394 | * @param path file path. 395 | * @return DAT 396 | */ 397 | public static DoubleArrayTrie make(String path) throws FileNotFoundException { 398 | return make(new FileInputStream(path)); 399 | } 400 | 401 | /** 402 | * Make DAT. 
403 | * 404 | * @param inputStream input stream of file 405 | * @return DAT 406 | */ 407 | public static DoubleArrayTrie make(InputStream inputStream) { 408 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)); 409 | List lexicon = br.lines() 410 | .map(String::trim) 411 | .collect(Collectors.toList()); 412 | return make(lexicon); 413 | } 414 | 415 | public static DoubleArrayTrie make(List lexicon) { 416 | return new Builder().build(lexicon); 417 | } 418 | 419 | /** 420 | * 从DAT 还原成词典. 421 | * 422 | * @param dat DAT 423 | */ 424 | public static List restore(DoubleArrayTrie dat) { 425 | String word; 426 | LinkedList list = new LinkedList<>(); 427 | for (int i = 0; i < dat.size(); i++) { 428 | if (dat.getCheckByIndex(i) >= 0) { 429 | word = restoreWord(dat, i); 430 | if (dat.isWordMatched(word)) { 431 | list.add(word); 432 | } 433 | } 434 | } 435 | return list; 436 | } 437 | 438 | /** 439 | * Restore word by its last index. 440 | * 441 | * @param dat Double Array Trie 442 | * @param index the last index of word, i.e. 
its check >= 0 443 | * @return word 444 | */ 445 | private static String restoreWord(DoubleArrayTrie dat, int index) { 446 | int pre; 447 | int cur = index; 448 | StringBuilder sb = new StringBuilder(); 449 | while (cur > 0 && cur < dat.size()) { 450 | pre = dat.getCheckByIndex(cur); 451 | if (pre == cur || dat.getBaseByIndex(pre) >= cur) { 452 | break; 453 | } 454 | sb.insert(0, (char) (cur - dat.getBaseByIndex(pre))); 455 | cur = pre; 456 | } 457 | return sb.toString(); 458 | } 459 | } 460 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/common/Nullable.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import java.lang.annotation.Documented; 4 | import java.lang.annotation.ElementType; 5 | import java.lang.annotation.Retention; 6 | import java.lang.annotation.RetentionPolicy; 7 | import java.lang.annotation.Target; 8 | 9 | /** 10 | * Declares that null is a valid value for a Java type. May be applied to parameters, 11 | * fields and methods (to declare the return type). 12 | */ 13 | @Retention(RetentionPolicy.RUNTIME) 14 | @Target({ElementType.PARAMETER, ElementType.METHOD, ElementType.FIELD}) 15 | @Documented 16 | public @interface Nullable { 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronClassifier.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.term.POC; 4 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 5 | 6 | 7 | public final class StructuredPerceptronClassifier { 8 | 9 | /** 10 | * Null previous label. 11 | */ 12 | private static final int NULL_PREVIOUS_LABEL = -5; 13 | 14 | /** 15 | * Initial score. 
16 | */ 17 | private static final int INITIAL_SCORE = 0; 18 | 19 | /** 20 | * Initial previous label. 21 | */ 22 | private static final int INITIAL_PREVIOUS_LABEL = -1; 23 | 24 | /** 25 | * SP 模型. 26 | */ 27 | private StructuredPerceptronModel model; 28 | 29 | 30 | public StructuredPerceptronClassifier(StructuredPerceptronModel model) { 31 | this.model = model; 32 | } 33 | 34 | /** 35 | * 解码路径节点 36 | */ 37 | private static class PathNode { 38 | /** 39 | * Score. 40 | */ 41 | private int score; 42 | 43 | /** 44 | * Previous Label. 45 | */ 46 | private int previousLabel; 47 | 48 | public PathNode() { 49 | score = INITIAL_SCORE; 50 | previousLabel = NULL_PREVIOUS_LABEL; 51 | } 52 | 53 | @Override 54 | public String toString() { 55 | return score + ", " + previousLabel; 56 | } 57 | } 58 | 59 | /** 60 | * 结构感知器分类,采用Viterbi算法解码 61 | * 62 | * @param annotatedTerms 规则处理后的句子Label 类 63 | * @param previousTransition 前向转移label 64 | * @return 最优路径对应的label索引值 65 | */ 66 | public int[] classify( 67 | AnnotatedTerms annotatedTerms, 68 | int[][] previousTransition) { 69 | int len = annotatedTerms.getAnnotatedLength(); 70 | // 最优路径对应的label 71 | int[] bestPath = new int[len]; 72 | int labelSize = model.labelSize; 73 | int optimalLastScore = Integer.MIN_VALUE; 74 | int optimalLastLabel = 2; 75 | PathNode node; 76 | // 记录在位置i时类别为y的最优路径 77 | // [current index][current Label] -> PathNode(score, previousLabel) 78 | PathNode[][] pathTabular = new PathNode[len][]; 79 | for (int i = 0; i < len; i++) { 80 | pathTabular[i] = new PathNode[labelSize]; 81 | for (int j = 0; j < labelSize; j++) { 82 | pathTabular[i][j] = new PathNode(); 83 | } 84 | } 85 | 86 | char[] chars = annotatedTerms.appendBoundaryAround(); 87 | POC[] pocs = annotatedTerms.getPocs(); 88 | 89 | // DP求解 90 | for (int i = 0; i < len; i++) { 91 | int[] labelIndices = model.allowTabular[pocs[i].ordinal()]; 92 | int[] weights = model.evaluateCharWeights( 93 | chars[i], 94 | chars[i + 1], 95 | chars[i + 2], 96 | chars[i + 3], 97 | 
chars[i + 4], 98 | labelIndices); 99 | for (int labelIndex : labelIndices) { 100 | node = pathTabular[i][labelIndex]; 101 | if (i == 0) { 102 | node.previousLabel = INITIAL_PREVIOUS_LABEL; 103 | } else { 104 | int[] preLabels = previousTransition[labelIndex]; 105 | for (int pre : preLabels) { 106 | if (pathTabular[i - 1][pre].previousLabel == NULL_PREVIOUS_LABEL) { 107 | continue; 108 | } 109 | int score = pathTabular[i - 1][pre].score 110 | + model.llWeights[pre * model.labelSize + labelIndex]; 111 | if (node.previousLabel == NULL_PREVIOUS_LABEL || score > node.score) { 112 | node.score = score; 113 | node.previousLabel = pre; 114 | } 115 | } 116 | } 117 | node.score += weights[labelIndex]; 118 | if (i == len - 1 && optimalLastScore < node.score) { 119 | optimalLastScore = node.score; 120 | optimalLastLabel = labelIndex; 121 | } 122 | } 123 | } 124 | // 尾节点的最优label 125 | node = pathTabular[len - 1][optimalLastLabel]; 126 | bestPath[len - 1] = optimalLastLabel; 127 | // 回溯最优路径,保留label到数组 128 | for (int i = len - 2; i >= 0; i--) { 129 | bestPath[i] = node.previousLabel; 130 | node = pathTabular[i][node.previousLabel]; 131 | } 132 | return bestPath; 133 | } 134 | 135 | /** 136 | * 得到所有label. 
137 | */ 138 | public String[] getLabelValues() { 139 | return model.labelValues; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronModel.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | import io.github.yizhiru.thulac4j.util.IOUtils; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.Serializable; 9 | import java.nio.ByteBuffer; 10 | import java.nio.ByteOrder; 11 | import java.nio.IntBuffer; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static io.github.yizhiru.thulac4j.common.DoubleArrayTrie.MATCH_FAILURE_INDEX; 16 | import static io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel.NGramFeature.*; 17 | 18 | /** 19 | * 结构感知器模型. 20 | */ 21 | public final class StructuredPerceptronModel implements Serializable { 22 | 23 | private static final long serialVersionUID = -5324153272692800765L; 24 | 25 | /** 26 | * Label数量. 27 | */ 28 | public final int labelSize; 29 | 30 | /** 31 | * 特征数量. 32 | */ 33 | public final int featureSize; 34 | 35 | /** 36 | * label转移到label的权重. 37 | */ 38 | public final int[] llWeights; 39 | 40 | /** 41 | * 特征对应某label的权重. 42 | * Why use one-dimensional array but not two-dimensional array? Please Refer to 43 | * 44 | * https://stackoverflow.com/questions/2512082/java-multi-dimensional-array-vs-one-dimensional 45 | * 46 | */ 47 | public final int[] flWeights; 48 | 49 | /** 50 | * Feature DAT 51 | */ 52 | private final DoubleArrayTrie featureDat; 53 | 54 | /** 55 | * Label, 对应于cws_label.txt 或者 model_c_label.txt 56 | */ 57 | public final String[] labelValues; 58 | 59 | /** 60 | * 映射 enum POC 对应的所有label 索引值. 
61 | * 其中,行为POC的ordinal值,列为索引值 62 | */ 63 | public final int[][] allowTabular; 64 | 65 | /** 66 | * 加载训练模型 67 | * 68 | * @param weightInput label转移权重、特征label权重, cws_model.bin 69 | * * 或者model_c_model.bin 70 | * @param featureInput 特征DAT cws_dat.bin 或者 model_c_dat.bin 71 | * @param labelInput label 72 | * @throws IOException if an I/O error occurs 73 | */ 74 | public StructuredPerceptronModel(InputStream weightInput, InputStream featureInput, InputStream labelInput) throws IOException { 75 | // Load weights model 76 | ByteBuffer byteBuffer = ByteBuffer.wrap(IOUtils.toByteArray(weightInput)); 77 | IntBuffer intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 78 | .asIntBuffer(); 79 | labelSize = intBuffer.get(); 80 | featureSize = intBuffer.get(); 81 | llWeights = new int[labelSize * labelSize]; 82 | flWeights = new int[featureSize * labelSize]; 83 | intBuffer.get(llWeights); 84 | intBuffer.get(flWeights); 85 | 86 | // Load feature DAT 87 | byteBuffer = ByteBuffer.wrap(IOUtils.toByteArray(featureInput)); 88 | // int类型占4个字节 89 | int arrayLen = byteBuffer.remaining() / 4; 90 | int[] featureArray = new int[arrayLen]; 91 | intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 92 | .asIntBuffer(); 93 | intBuffer.get(featureArray); 94 | // convert feature DAT 95 | int[] baseArr = new int[arrayLen / 2]; 96 | int[] checkArr = new int[arrayLen / 2]; 97 | for (int i = 0; i < arrayLen / 2; i++) { 98 | baseArr[i] = featureArray[2 * i]; 99 | checkArr[i] = featureArray[2 * i + 1]; 100 | } 101 | featureDat = new DoubleArrayTrie(baseArr, checkArr); 102 | 103 | List labelList = IOUtils.readLines(labelInput); 104 | labelValues = new String[labelList.size()]; 105 | labelList.toArray(labelValues); 106 | 107 | // 记录label 集合,能与allowTabular 映射起来 108 | List> posTags = getPosTags(); 109 | 110 | // allowTabular 表示enum POC 对应的所有允许label,比如: 111 | // PUNCTUATION_POC 对应的允许label为 3 或 3w, 112 | // BEGIN_POC 对应的允许label为 0 或 0打头的label 113 | allowTabular = new int[12][]; 114 | for (int i = 0; i < 
labelValues.length; i++) { 115 | // punctuation 116 | if ("3".equals(labelValues[i]) || "3w".equals(labelValues[i])) { 117 | allowTabular[0] = new int[]{i}; 118 | } 119 | // single of numeral 120 | if ("3".equals(labelValues[i]) || "3m".equals(labelValues[i])) { 121 | allowTabular[4] = new int[]{i}; 122 | } 123 | // begin of numeral 124 | else if ("0".equals(labelValues[i]) || "0m".equals(labelValues[i])) { 125 | allowTabular[1] = new int[]{i}; 126 | } 127 | // middle of numeral 128 | else if ("1".equals(labelValues[i]) || "1m".equals(labelValues[i])) { 129 | allowTabular[2] = new int[]{i}; 130 | } 131 | // end of numeral 132 | else if ("2".equals(labelValues[i]) || "2m".equals(labelValues[i])) { 133 | allowTabular[3] = new int[]{i}; 134 | } 135 | } 136 | int[] indices = {1, 2, 4, 8, 9, 12, 15}; 137 | for (int i = 0; i < indices.length; i++) { 138 | allowTabular[i + 5] = posTags.get(indices[i]) 139 | .stream() 140 | .mapToInt(x -> x) 141 | .toArray(); 142 | } 143 | } 144 | 145 | /** 146 | * 计算所有可能label 索引值集合,以二维数组表示 147 | * 148 | * @return 索引值二维数组 149 | */ 150 | private List> getPosTags() { 151 | List> posTagsList = new ArrayList<>(); 152 | int defaultSize = 16; 153 | for (int i = 0; i < defaultSize; i++) { 154 | posTagsList.add(new ArrayList<>()); 155 | } 156 | for (int i = 0; i < labelValues.length; i++) { 157 | int segIndex = labelValues[i].charAt(0) - '0'; 158 | for (int j = 0; j < defaultSize; j++) { 159 | if (((1 << segIndex) & j) != 0) { 160 | posTagsList.get(j).add(i); 161 | } 162 | } 163 | } 164 | return posTagsList; 165 | } 166 | 167 | /** 168 | * 训练模型文件中POC对应的标识. 169 | */ 170 | public static final class PocMark { 171 | /** 172 | * 对应于 POC B 的char. 173 | */ 174 | public static final Character POS_B_CHAR = '0'; 175 | 176 | /** 177 | * 对应于 POC M 的char. 178 | */ 179 | public static final Character POS_M_CHAR = '1'; 180 | 181 | /** 182 | * 对应于 POC E 的char. 183 | */ 184 | public static final Character POS_E_CHAR = '2'; 185 | 186 | /** 187 | * 对应于 POC B 的char. 
188 | */ 189 | public static final Character POS_S_CHAR = '3'; 190 | } 191 | 192 | /** 193 | * N-gram 特征. 194 | * THULAC采用的分词模型为结构化感知器(Structured Perceptron, SP),以最大熵准则 195 | * 建模序列标注的得分函数. 196 | */ 197 | public static class NGramFeature { 198 | 199 | /** 200 | * 超越边界的统一字符'#' 201 | */ 202 | public static final char BOUNDARY = 65283; 203 | 204 | /** 205 | * feature的一部分 206 | */ 207 | public static final char SPACE = ' '; 208 | 209 | /** 210 | * Unigram 特征种类1,对应于特征 mid + SPACE + '1',即标注对应的当前字符 211 | */ 212 | public static final char UNIGRAM_FEATURE_1 = '1'; 213 | 214 | /** 215 | * Unigram 特征种类2,对应于特征 left + SPACE + '2',即标注的前一字符 216 | */ 217 | public static final char UNIGRAM_FEATURE_2 = '2'; 218 | 219 | /** 220 | * Unigram 特征种类3,对应于特征 right + SPACE + '3',即标注的后一字符 221 | */ 222 | public static final char UNIGRAM_FEATURE_3 = '3'; 223 | 224 | /** 225 | * Bigram 特征种类1,对应于特征 left + mid + SPACE + '1', 226 | * 即标注的前一字符加上当前字符 227 | */ 228 | public static final char BIGRAM_FEATURE_1 = '1'; 229 | 230 | /** 231 | * Bigram 特征种类2,对应于特征 mid + right + SPACE + '2', 232 | * 即标注对应的当前字符加上后一字符 233 | */ 234 | public static final char BIGRAM_FEATURE_2 = '2'; 235 | 236 | /** 237 | * Bigram 特征种类3,对应于特征 left2 + left1 + SPACE + '3', 238 | * 即标注的前二字符加上前一字符 239 | */ 240 | public static final char BIGRAM_FEATURE_3 = '3'; 241 | 242 | /** 243 | * Bigram 特征种类4,对应于特征 right + right2 + SPACE + '4', 244 | * 即标注的后一字符加上后二字符. 245 | */ 246 | public static final char BIGRAM_FEATURE_4 = '4'; 247 | } 248 | 249 | /** 250 | * 寻找Unigram特征对应于DAT中的base. 
    /**
     * Looks up the DAT base value of a unigram feature.
     *
     * @param ch   the feature character
     * @param mark feature-kind marker, one of '1', '2', '3'
     *             (current / previous / next character feature)
     * @return the base value if the feature exists, otherwise -1 (MATCH_FAILURE_INDEX)
     */
    private int findUnigramFeat(char ch, char mark) {
        int index = (int) ch;
        // Feature string is ch + SPACE + mark; walk the trie one char at a time.
        // NOTE(review): failure is only tested after the last transition —
        // presumably transition() propagates MATCH_FAILURE_INDEX; confirm in DoubleArrayTrie.
        index = featureDat.transition(index, SPACE);
        index = featureDat.transition(index, mark);
        if (index == MATCH_FAILURE_INDEX) {
            return MATCH_FAILURE_INDEX;
        }
        return featureDat.getBaseByIndex(index);
    }

    /**
     * Looks up the DAT base value of a bigram feature.
     *
     * @param c1   first character
     * @param c2   second character
     * @param mark feature-kind marker, one of '1', '2', '3', '4'
     * @return the base value if the feature exists, otherwise -1 (MATCH_FAILURE_INDEX)
     */
    private int findBigramFeat(char c1, char c2, char mark) {
        int index1 = (int) c1;
        int index2 = (int) c2;
        // Feature string is c1 + c2 + SPACE + mark.
        int index = featureDat.transition(index1, index2);
        index = featureDat.transition(index, SPACE);
        index = featureDat.transition(index, mark);
        if (index == MATCH_FAILURE_INDEX) {
            return MATCH_FAILURE_INDEX;
        }
        return featureDat.getBaseByIndex(index);
    }

    /**
     * Accumulates feature weights into the per-label weight array,
     * restricted to the allowed label indices.
     *
     * @param weights      per-label weight accumulator (mutated in place)
     * @param base         the feature's DAT base value
     * @param labelIndices allowed label indices
     */
    private void addWeights(int[] weights, int base, int[] labelIndices) {
        // flWeights is laid out as [feature base][label]: row stride == labelSize
        int offset = base * labelSize;
        for (int i : labelIndices) {
            weights[i] += flWeights[offset + i];
        }
    }

    /**
     * Computes the per-label summed feature weights for the current character,
     * using a 5-character window (two before, two after).
     *
     * @param left2        second character to the left
     * @param left1        character to the left
     * @param mid          current character
     * @param right1       character to the right
     * @param right2       second character to the right
     * @param labelIndices allowed label indices for this position
     * @return array of length labelSize with the summed weights per label
     */
    public int[] evaluateCharWeights(
            char left2,
            char left1,
            char mid,
            char right1,
            char right2,
            int[] labelIndices) {
        int[] weights = new int[labelSize];
        int base;
        // Three unigram features and four bigram features; each found feature
        // contributes its row of flWeights for the allowed labels.
        if ((base = findUnigramFeat(mid, UNIGRAM_FEATURE_1)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findUnigramFeat(left1, UNIGRAM_FEATURE_2)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findUnigramFeat(right1, UNIGRAM_FEATURE_3)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(left1, mid, BIGRAM_FEATURE_1)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(mid, right1, BIGRAM_FEATURE_2)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(left2, left1, BIGRAM_FEATURE_3)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(right1, right2, BIGRAM_FEATURE_4)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        return weights;
    }
/**
 * Cements (glues) consecutive tokens into a single word when their
 * concatenation matches an entry of a lexicon stored as a double-array trie.
 */
public final class LexiconCementer implements Serializable {

    private static final long serialVersionUID = 5479588292425956277L;

    /**
     * The lexicon as a double-array trie.
     */
    private final DoubleArrayTrie dat;

    /**
     * Part-of-speech tag assigned to every cemented word.
     */
    private final String pos;

    /**
     * Loads a serialized DAT model.
     *
     * @param inputStream input stream of the serialized DAT
     * @param pos         part-of-speech tag for cemented words
     * @throws IOException if the stream cannot be read
     */
    public LexiconCementer(InputStream inputStream, String pos) throws IOException {
        dat = DoubleArrayTrie.loadDat(inputStream);
        this.pos = pos;
    }

    /**
     * Constructs from an already-built DAT.
     *
     * @param dat the lexicon DAT
     * @param pos part-of-speech tag for cemented words
     */
    public LexiconCementer(DoubleArrayTrie dat, String pos) {
        this.dat = dat;
        this.pos = pos;
    }

    /**
     * Scans the token list left to right and merges maximal runs of tokens
     * whose concatenation is a complete lexicon word; the merged token gets
     * the {@link #pos} tag. The list is modified in place.
     *
     * @param tokenItems intermediate segmentation result
     */
    public void cement(List<TokenItem> tokenItems) {
        int index;
        int j;
        for (int i = 0; i < tokenItems.size(); i++) {
            // NOTE(review): match() appears to return a negated trie index on a
            // successful prefix match, hence the negation here and below —
            // confirm against DoubleArrayTrie.match.
            index = -dat.match(0, tokenItems.get(i).word);
            if (index <= 0) {
                continue;
            }
            StringBuilder builder = new StringBuilder(tokenItems.get(i).word);
            for (j = i + 1; j < tokenItems.size(); j++) {
                int preIndex = index;
                index = -dat.match(index, tokenItems.get(j).word);
                // the following token does not extend the lexicon match;
                // fall back to the last matching state
                if (index <= 0) {
                    index = preIndex;
                    break;
                }
                builder.append(tokenItems.get(j).word);
            }
            // only merge if the accumulated prefix is a complete lexicon word
            String word = builder.toString();
            if (dat.isWordMatched(index)) {
                tokenItems.set(i, new TokenItem(word, pos));
                // remove the absorbed tokens, from right to left so the
                // indices of not-yet-removed tokens stay valid
                for (j = j - 1; j > i; j--) {
                    tokenItems.remove(j);
                }
            }
        }
    }
}
    /**
     * Pre-annotates the possible POC of each character using punctuation,
     * numeral, letter and whitespace rules, narrowing the label search space
     * for the perceptron decoder.
     *
     * @param text             the text to segment
     * @param isEnableTileWord whether to treat text inside 《...》 title
     *                         quotation marks as a single word
     *                         (NOTE(review): parameter name looks like a typo
     *                         for "TitleWord" but is part of the public API)
     * @return the annotation result (cleaned chars plus candidate POCs)
     */
    public static AnnotatedTerms annotate(String text, boolean isEnableTileWord) {
        int len = text.length();
        AnnotatedTerms annotatedTerms = new AnnotatedTerms(text.toCharArray());
        boolean hasTitleBegin = false;
        int titleBegin = 0;
        for (int i = 0; i < len; ) {
            CharType charType = annotatedTerms.getCharTypeByIndex(i);
            // 1. Space or control character: skipped entirely, but it forces the
            //    previous char to end a word and the next char to begin one.
            if (charType == CharType.SPACE_OR_CONTROL_CHAR) {
                annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
                // consume the whole run of ignorable characters
                for (i++; i < len; i++) {
                    if (annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
                        break;
                    }
                }
                // constrain the following character, if any
                if (i < len) {
                    annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
                }
            }
            // 2. Punctuation that always forms its own word
            else if (charType == CharType.SINGLE_PUNCTUATION_CHAR) {
                annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
                annotatedTerms.append(i, POC.PUNCTUATION_POC);
                if (isEnableTileWord) {
                    // opening title quotation 《
                    char ch = annotatedTerms.getRawCharByIndex(i);
                    if (ch == LEFT_TITLE_QUOTATION_CHAR) {
                        hasTitleBegin = true;
                        titleBegin = i;
                    }
                    // closing title quotation 》: mark the enclosed span as one word
                    else if (hasTitleBegin && ch == RIGHT_TITLE_QUOTATION_CHAR) {
                        if (isPossibleTitle(annotatedTerms, titleBegin + 1, i - 1)) {
                            setTitleWordPoc(annotatedTerms,
                                    titleBegin + 1,
                                    i - 1,
                                    annotatedTerms.getAnnotatedLength() - 2);
                        }
                        hasTitleBegin = false;
                    }
                }
                i++;
                // constrain the following character
                if (i < len && annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
                    annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
                }
            }
            // 3. English letters (may combine with digits and some punctuation)
            else if (charType == CharType.ENGLISH_LETTER_CHAR) {
                i = processWord(annotatedTerms,
                        i,
                        RuleAnnotator::isPartOfLetterWord,
                        false);
            }
            // 4. Arabic numerals (annotated with numeral-specific POCs)
            else if (charType == CharType.ARABIC_NUMERAL_CHAR) {
                i = processWord(annotatedTerms,
                        i,
                        RuleAnnotator::isPartOfNumeral,
                        true);
            }
            // 5. Remaining punctuation kinds stand alone as punctuation words
            else if (charType == CharType.EX_SINGLE_PUNCTUATION_CHAR
                    || charType == CharType.NUMERAL_PUNCTUATION_CHAR) {
                setCurrentAsSingle(i, annotatedTerms, POC.PUNCTUATION_POC);
                i++;
            }
            // 6. Han characters and Chinese numerals: leave all labels open
            else if (charType == CharType.HAN_ZI_CHAR
                    || charType == CharType.CHINESE_NUMERAL_CHAR) {
                annotatedTerms.append(i, POC.DEFAULT_POC);
                i++;
            }
            // 7. Anything else forms a single-character word
            else {
                setCurrentAsSingle(i, annotatedTerms, POC.SINGLE_POC);
                i++;
            }
        }
        // sentence boundaries: first char begins a word, last char ends one
        annotatedTerms.intersectPocByIndex(0, POC.BEGIN_OR_SINGLE_POC);
        annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
        return annotatedTerms;
    }
112 | * 113 | * @param currentRawIndex 当前原字符串索引位置 114 | * @param annotatedTerms 标注结果 115 | * @param currentPoc 当前字符对应的POC 116 | */ 117 | private static void setCurrentAsSingle(int currentRawIndex, 118 | AnnotatedTerms annotatedTerms, 119 | POC currentPoc) { 120 | annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC); 121 | annotatedTerms.append(currentRawIndex, currentPoc); 122 | int nextIndex = currentRawIndex + 1; 123 | if (nextIndex < annotatedTerms.getRawCharsLength() 124 | && annotatedTerms.getCharTypeByIndex(nextIndex) != CharType.SPACE_OR_CONTROL_CHAR) { 125 | annotatedTerms.appendAhead(nextIndex, POC.BEGIN_OR_SINGLE_POC); 126 | } 127 | } 128 | 129 | /** 130 | * 判断前后书名号内的字符串是否为能成词 131 | * 132 | * @param annotatedTerms 标注结果 133 | * @param startIndex 前书名号《 后一个index 134 | * @param endIndex 后书名号》前一个index 135 | * @return 若能则true 136 | */ 137 | private static boolean isPossibleTitle(AnnotatedTerms annotatedTerms, int startIndex, int endIndex) { 138 | if (endIndex - startIndex > 8 || endIndex - startIndex <= 0) { 139 | return false; 140 | } 141 | for (int i = startIndex; i <= endIndex; i++) { 142 | CharType charType = annotatedTerms.getCharTypeByIndex(i); 143 | if (charType == CharType.SINGLE_PUNCTUATION_CHAR 144 | || charType == CharType.SPACE_OR_CONTROL_CHAR) { 145 | return false; 146 | } 147 | } 148 | return true; 149 | } 150 | 151 | /** 152 | * 设置书名号内为一个词. 
    /**
     * Marks the span inside a pair of title quotation marks as one word.
     *
     * @param annotatedTerms    annotation result
     * @param startRawIndex     start index of the word in the raw text
     * @param endRawIndex       end index of the word in the raw text
     * @param endAnnotatedIndex end index of the word in the annotated chars
     */
    private static void setTitleWordPoc(
            AnnotatedTerms annotatedTerms,
            int startRawIndex,
            int endRawIndex,
            int endAnnotatedIndex) {
        // a one-character title span is a single-char word
        if (startRawIndex == endRawIndex) {
            annotatedTerms.intersectPocByIndex(endAnnotatedIndex, POC.SINGLE_POC);
            return;
        }
        // corresponding start position in the annotated chars: the title span
        // contains no ignored chars, so raw and annotated offsets are parallel
        int startAnnotatedIndex = endAnnotatedIndex - endRawIndex + startRawIndex;
        annotatedTerms.setPocByIndex(startAnnotatedIndex, POC.BEGIN_POC);
        for (int i = startAnnotatedIndex + 1; i < endAnnotatedIndex; i++) {
            annotatedTerms.setPocByIndex(i, POC.MIDDLE_POC);
        }
        annotatedTerms.setPocByIndex(endAnnotatedIndex, POC.END_POC);
    }

    /**
     * Whether a character type may be part of a letter word
     * (letters combine with digits and combinable punctuation).
     *
     * @param charType the character type
     * @return true if the type can continue a letter word
     */
    public static boolean isPartOfLetterWord(CharType charType) {
        return charType == CharType.ENGLISH_LETTER_CHAR
                || charType == CharType.ARABIC_NUMERAL_CHAR
                || charType == CharType.EX_SINGLE_PUNCTUATION_CHAR;
    }


    /**
     * Whether a character type may be part of a numeral:
     * digit characters or numeral-specific punctuation.
     *
     * @param charType the character type
     * @return true if the type can continue a numeral
     */
    public static boolean isPartOfNumeral(CharType charType) {
        return charType == CharType.CHINESE_NUMERAL_CHAR
                || charType == CharType.ARABIC_NUMERAL_CHAR
                || charType == CharType.NUMERAL_PUNCTUATION_CHAR;
    }

    /**
     * Consumes a run of letter-word or numeral characters starting at
     * {@code startRawIndex}, annotating it as a single word with either
     * generic or numeral-specific B/M/E/S POCs.
     *
     * @param annotatedTerms annotation result
     * @param startRawIndex  start position in the raw text
     * @param condition      predicate deciding whether a char type continues the word
     * @param isNumeral      true for numerals, false for letter words
     * @return the raw index of the first character after the word
     */
    private static int processWord(
            AnnotatedTerms annotatedTerms,
            int startRawIndex,
            Predicate<CharType> condition,
            boolean isNumeral) {
        // choose the B/M/E/S POC set matching the word kind
        POC b, m, e, s;
        if (isNumeral) {
            b = POC.BEGIN_NUMERAL_POC;
            m = POC.MIDDLE_NUMERAL_POC;
            e = POC.END_NUMERAL_POC;
            s = POC.SINGLE_NUMERAL_POC;
        } else {
            b = POC.BEGIN_POC;
            m = POC.MIDDLE_POC;
            e = POC.END_POC;
            s = POC.SINGLE_POC;
        }

        // the previous annotated char must end a word
        annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);

        int len = annotatedTerms.getRawCharsLength();
        int i = startRawIndex;
        i++;
        // single-character word: the run ends immediately
        if (i == len
                || (i < len && !condition.test(annotatedTerms.getCharTypeByIndex(i)))) {
            annotatedTerms.append(i - 1, s);
        }
        // multi-character word: begin, middles, end
        else {
            annotatedTerms.append(i - 1, b);
            for (; i + 1 < len && condition.test(annotatedTerms.getCharTypeByIndex(i + 1)); i++) {
                annotatedTerms.append(i, m);
            }
            annotatedTerms.append(i, e);
            i++;
        }
        // the char following the word (if not ignorable) must begin a word
        if (i < len && annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
            annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
        }
        return i;
    }
/**
 * Cements specific word patterns after segmentation: numeral + time-unit
 * sequences (e.g. "2018" + "年") and runs of repeatable punctuation.
 */
public class SpecifiedWordCementer {

    /**
     * Time-unit words (year, month, day, ... second).
     */
    private static final Set<String> TIME_UNIT_WORDS = new HashSet<>(
            Arrays.asList("年", "月", "日", "号", "时", "点", "分", "秒"));

    /**
     * Year time unit.
     */
    private static final String YEAR_TIME_UNIT_WORD = "年";

    /**
     * Specific words that may repeat and be glued into one token.
     */
    private static final Set<String> CAN_FORM_REPEATED_WORDS = new HashSet<>(
            Arrays.asList("—", "…"));


    /**
     * Cements words in place, scanning right to left so removals do not
     * disturb the positions still to be visited.
     *
     * @param tokenItems intermediate segmentation result
     */
    public static void cementWord(List<TokenItem> tokenItems) {
        for (int i = tokenItems.size() - 1; i > 0; i--) {
            TokenItem item = tokenItems.get(i);
            String word = item.word;
            if (TIME_UNIT_WORDS.contains(word)) {
                i = cementTimeWord(tokenItems, item, i);
            } else if (CAN_FORM_REPEATED_WORDS.contains(word)) {
                i = cementRepeatedWord(tokenItems, item, i);
            }
        }
    }

    /**
     * Glues a run of numeral tokens onto a following time-unit token,
     * producing one token tagged "t". A "年" (year) unit is only glued when
     * the preceding numeral has at least 4 digits (a plausible year).
     *
     * @param tokenItems   intermediate segmentation result
     * @param timeUnitItem the time-unit token
     * @param endIndex     index of the time-unit token
     * @return index of the start of the cemented time word
     */
    private static int cementTimeWord(List<TokenItem> tokenItems,
                                      TokenItem timeUnitItem,
                                      int endIndex) {
        // NOTE(review): local name "timeUit" is a typo for "timeUnit"; kept as-is.
        String timeUit = timeUnitItem.word;
        if (endIndex - 1 >= 0) {
            String previousWord = tokenItems.get(endIndex - 1).word;
            if (isNumeralWord(previousWord)) {
                // reject short numerals before "年" (not a year)
                if (timeUit.equals(YEAR_TIME_UNIT_WORD) && previousWord.length() < 4) {
                    return endIndex;
                }
                tokenItems.remove(endIndex);
                StringBuilder builder = new StringBuilder(previousWord + timeUnitItem.word);
                // absorb any further numeral tokens to the left; each removal
                // targets j + 1, the slot of the already-absorbed neighbor
                int j = endIndex - 2;
                for (; j >= 0; j--) {
                    String w = tokenItems.get(j).word;
                    if (isNumeralWord(w)) {
                        tokenItems.remove(j + 1);
                        builder.insert(0, w);
                    } else {
                        break;
                    }
                }
                // j + 1 is the leftmost absorbed numeral's slot
                tokenItems.set(j + 1,
                        new TokenItem(builder.toString(), "t"));
                return j + 1;
            }
        }
        return endIndex;
    }

    /**
     * Glues adjacent repetitions of the same specific word (e.g. "……")
     * into one token, keeping the original POS tag.
     *
     * @param tokenItems   intermediate segmentation result
     * @param repeatedItem the repeated specific-word token
     * @param endIndex     index of the rightmost repetition
     * @return index of the start of the cemented word
     */
    private static int cementRepeatedWord(List<TokenItem> tokenItems,
                                          TokenItem repeatedItem,
                                          int endIndex) {
        String word = repeatedItem.word;
        int i = endIndex - 1;
        if (i >= 0 && tokenItems.get(i).word.equals(word)) {
            StringBuilder builder = new StringBuilder(word + word);
            tokenItems.remove(endIndex);
            // keep absorbing identical neighbors to the left
            for (i--; i >= 0 && tokenItems.get(i).word.equals(word); i--) {
                builder.insert(0, word);
                tokenItems.remove(i + 1);
            }
            tokenItems.set(i + 1,
                    new TokenItem(builder.toString(), repeatedItem.pos));
        }
        return i + 1;
    }

    /**
     * Whether a word consists entirely of numeral characters.
     *
     * @param word the word
     * @return true if every character is a numeral
     */
    private static boolean isNumeralWord(String word) {
        for (char ch : word.toCharArray()) {
            if (!CharUtils.isNumeral(ch)) {
                return false;
            }
        }
        return true;
    }
}
/**
 * The result of rule-based pre-annotation: the cleaned character sequence
 * (ignorable chars removed, half-width converted to full-width) plus the
 * candidate POC of each remaining character.
 */
public final class AnnotatedTerms {

    /**
     * Raw characters of the text to segment.
     */
    private char[] rawChars;

    /**
     * Character type of each raw character.
     */
    private CharType[] rawCharTypes;

    /**
     * Characters before half-width conversion; same length as annotatedChars.
     */
    private char[] preAnnotateChars;

    /**
     * Characters after cleaning (ignorable chars dropped, half-width
     * converted to full-width); parallel to preAnnotateChars.
     */
    private char[] annotatedChars;

    /**
     * Candidate POC of each annotated character.
     */
    public POC[] pocs;

    /**
     * Number of characters annotated so far.
     */
    private int annotatedLength;

    /**
     * Whether the last slot was already filled ahead of time by appendAhead;
     * when true, the next append only intersects the POC instead of adding.
     */
    private boolean isAppendAhead;

    /**
     * Constructor.
     *
     * @param rawChars characters of the text to segment
     */
    public AnnotatedTerms(char[] rawChars) {
        this.rawChars = rawChars;
        int textLength = rawChars.length;
        // precompute the type of every raw character
        this.rawCharTypes = new CharType[textLength];
        for (int i = 0; i < textLength; i++) {
            rawCharTypes[i] = CharUtils.getCharType(rawChars[i]);
        }
        this.preAnnotateChars = new char[textLength];
        this.annotatedChars = new char[textLength];
        this.pocs = new POC[textLength];
        this.annotatedLength = 0;
        this.isAppendAhead = false;
    }

    public char[] getPreAnnotateChars() {
        return Arrays.copyOfRange(preAnnotateChars, 0, annotatedLength);
    }

    public char[] getAnnotatedChars() {
        return Arrays.copyOfRange(annotatedChars, 0, annotatedLength);
    }

    /**
     * Length after rule annotation.
     *
     * @return the number of annotated characters
     */
    public int getAnnotatedLength() {
        return this.annotatedLength;
    }

    public POC[] getPocs() {
        return Arrays.copyOfRange(pocs, 0, annotatedLength);
    }

    /**
     * Returns the raw character at an index of the original text.
     * (The original javadoc wrongly said "character type".)
     *
     * @param rawIndex index into the raw text
     * @return the raw character
     */
    public char getRawCharByIndex(int rawIndex) {
        return rawChars[rawIndex];
    }

    /**
     * Returns the character type at an index of the original text.
     *
     * @param rawIndex index into the raw text
     * @return the character type
     */
    public CharType getCharTypeByIndex(int rawIndex) {
        return rawCharTypes[rawIndex];
    }

    /**
     * Length of the raw text.
     *
     * @return the raw character count
     */
    public int getRawCharsLength() {
        return rawChars.length;
    }

    /**
     * Whether the annotated result is empty.
     *
     * @return true if no character was annotated
     */
    public boolean isEmpty() {
        return annotatedLength == 0;
    }

    /**
     * Returns the annotated chars padded with two BOUNDARY characters on
     * each side, for 5-char windowed feature extraction.
     *
     * @return the padded character array (length annotatedLength + 4)
     */
    public char[] appendBoundaryAround() {
        char[] array = new char[annotatedLength + 4];
        System.arraycopy(annotatedChars, 0, array, 2, annotatedLength);
        array[0] = array[1] = array[annotatedLength + 2] = array[annotatedLength + 3] = BOUNDARY;
        return array;
    }

    /**
     * Intersects the POC at the given annotated index with {@code poc}.
     * Out-of-range indices are ignored.
     *
     * @param annotatedIndex index into the annotated chars
     * @param poc            POC to intersect with
     */
    public void intersectPocByIndex(int annotatedIndex, POC poc) {
        if (annotatedIndex < 0 || annotatedIndex >= annotatedLength) {
            return;
        }
        pocs[annotatedIndex] = pocs[annotatedIndex].intersect(poc);
    }

    /**
     * Intersects the POC of the last annotated character.
     *
     * @param poc POC to intersect with
     */
    public void intersectLastPoc(POC poc) {
        intersectPocByIndex(annotatedLength - 1, poc);
    }

    /**
     * Overwrites the POC at the given annotated index.
     * Out-of-range indices are ignored.
     *
     * @param annotatedIndex index into the annotated chars
     * @param poc            POC to set
     */
    public void setPocByIndex(int annotatedIndex, POC poc) {
        if (annotatedIndex < 0 || annotatedIndex >= annotatedLength) {
            return;
        }
        pocs[annotatedIndex] = poc;
    }

    /**
     * Appends a character and its POC. If the slot was already filled by
     * {@link #appendAhead}, only the POC is intersected.
     *
     * @param rawIndex index into the raw text
     * @param poc      candidate POC
     */
    public void append(int rawIndex, POC poc) {
        if (isAppendAhead) {
            intersectLastPoc(poc);
            isAppendAhead = false;
        } else {
            char ch = rawChars[rawIndex];
            preAnnotateChars[annotatedLength] = ch;
            annotatedChars[annotatedLength] = convertHalfWidth(ch);
            pocs[annotatedLength] = poc;
            annotatedLength++;
        }
    }

    /**
     * Appends a character ahead of its turn (used to constrain the character
     * following a boundary); the next {@link #append} for the same position
     * will intersect instead of appending again.
     *
     * @param rawIndex index into the raw text
     * @param poc      candidate POC
     */
    public void appendAhead(int rawIndex, POC poc) {
        char ch = rawChars[rawIndex];
        preAnnotateChars[annotatedLength] = ch;
        annotatedChars[annotatedLength] = convertHalfWidth(ch);
        pocs[annotatedLength] = poc;
        annotatedLength++;
        isAppendAhead = true;
    }
}
/**
 * Character categories used by the rule annotator, each keyed by the
 * abbreviation written in the core-character dictionary.
 */
public enum CharType {

    /** Punctuation that always forms a word on its own. */
    SINGLE_PUNCTUATION_CHAR("p"),

    /** Punctuation that may stand alone or combine with other characters. */
    EX_SINGLE_PUNCTUATION_CHAR("ep"),

    /** Space or control character. */
    SPACE_OR_CONTROL_CHAR("c"),

    /** Chinese numeral character. */
    CHINESE_NUMERAL_CHAR("cn"),

    /** Arabic numeral character. */
    ARABIC_NUMERAL_CHAR("an"),

    /** Punctuation that only appears inside numerals. */
    NUMERAL_PUNCTUATION_CHAR("np"),

    /** Han (Chinese) character. */
    HAN_ZI_CHAR("h"),

    /** English letter. */
    ENGLISH_LETTER_CHAR("e"),

    /** Any other character. */
    OTHER_CHAR("o"),
    ;

    /** Abbreviation as written in the dictionary file. */
    private final String abbreviation;

    /** Lookup table from abbreviation to enum constant. */
    private static final Map<String, CharType> LOOKUP = new HashMap<>(values().length, 1);

    static {
        for (CharType type : values()) {
            LOOKUP.put(type.abbreviation, type);
        }
    }

    CharType(String abbreviation) {
        this.abbreviation = abbreviation;
    }

    /**
     * Resolves an abbreviation to its enum constant.
     *
     * @param abbr the abbreviation
     * @return the matching CharType
     * @throws IllegalArgumentException if the abbreviation is unknown
     */
    public static CharType of(String abbr) {
        CharType resolved = LOOKUP.get(abbr);
        if (resolved == null) {
            throw new IllegalArgumentException("Invalid char type abbreviation: " + abbr);
        }
        return resolved;
    }
}
/**
 * POC (position of char): the candidate labeling of a character.
 * Declaration order matters: {@link #intersect(POC)} picks the constant with
 * the lower ordinal (the more constrained one), so DEFAULT_POC — the least
 * constrained — must stay last.
 */
public enum POC {

    /** Punctuation. */
    PUNCTUATION_POC,

    /** Begin of numeral. */
    BEGIN_NUMERAL_POC,

    /** Middle of numeral. */
    MIDDLE_NUMERAL_POC,

    /** End of numeral. */
    END_NUMERAL_POC,

    /** Single-character numeral. */
    SINGLE_NUMERAL_POC,

    /** Word begin. */
    BEGIN_POC,

    /** Word middle. */
    MIDDLE_POC,

    /** Word end. */
    END_POC,

    /** Single character as a word. */
    SINGLE_POC,

    /** Begin or single. */
    BEGIN_OR_SINGLE_POC,

    /** End or single. */
    END_OR_SINGLE_POC,

    /** Default (unconstrained) POC. */
    DEFAULT_POC;

    /**
     * Intersects two candidate labelings: the more constrained one (lower
     * ordinal) wins, except that BEGIN_OR_SINGLE ∩ END_OR_SINGLE narrows to
     * SINGLE (a char that must both begin and end a word stands alone).
     *
     * @param that the other candidate POC
     * @return the intersection POC
     */
    public POC intersect(POC that) {
        POC lower = this.ordinal() <= that.ordinal() ? this : that;
        POC upper = lower == this ? that : this;
        if (lower == BEGIN_OR_SINGLE_POC && upper == END_OR_SINGLE_POC) {
            return SINGLE_POC;
        }
        return lower;
    }
}
74 | */ 75 | public POC intersect(POC that) { 76 | if (this.ordinal() < that.ordinal()) { 77 | if (this == BEGIN_OR_SINGLE_POC && that == END_OR_SINGLE_POC) { 78 | return SINGLE_POC; 79 | } 80 | return this; 81 | } else if (this == END_OR_SINGLE_POC && that == BEGIN_OR_SINGLE_POC) { 82 | return SINGLE_POC; 83 | } 84 | return that; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/term/TokenItem.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.term; 2 | 3 | import io.github.yizhiru.thulac4j.common.Nullable; 4 | 5 | /** 6 | * Word Segment item. 7 | */ 8 | public final class TokenItem { 9 | 10 | /** 11 | * Tokenized word. 12 | */ 13 | public final String word; 14 | 15 | /** 16 | * Part-of-speech. 17 | */ 18 | @Nullable 19 | public final String pos; 20 | 21 | public TokenItem(String word, String pos) { 22 | this.word = word; 23 | this.pos = pos; 24 | } 25 | 26 | @Override 27 | public String toString() { 28 | if (pos == null) { 29 | return word; 30 | } 31 | return word + '/' + pos; 32 | } 33 | 34 | @Override 35 | public boolean equals(Object o) { 36 | if (this == o) { 37 | return true; 38 | } 39 | if (o == null || getClass() != o.getClass()) { 40 | return false; 41 | } 42 | 43 | TokenItem tokenItem = (TokenItem) o; 44 | return (word != null ? word.equals(tokenItem.word) : tokenItem.word == null) 45 | && (pos != null ? pos.equals(tokenItem.pos) : tokenItem.pos == null); 46 | } 47 | 48 | @Override 49 | public int hashCode() { 50 | int result = word != null ? word.hashCode() : 0; 51 | result = 31 * result + (pos != null ? 
pos.hashCode() : 0); 52 | return result; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/util/CharUtils.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.util; 2 | 3 | import io.github.yizhiru.thulac4j.term.CharType; 4 | 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | public final class CharUtils { 9 | 10 | 11 | private static final Map CORE_CHAR_TYPE_MAP = loadCharTypeMap(); 12 | 13 | /** 14 | * 空格字符,ASCII码值为32 15 | */ 16 | private static final char LATIN_SPACE_CHAR = ' '; 17 | 18 | /** 19 | * 前书名号 20 | */ 21 | public static final char LEFT_TITLE_QUOTATION_CHAR = '《'; 22 | 23 | /** 24 | * 后书名号 25 | */ 26 | public static final char RIGHT_TITLE_QUOTATION_CHAR = '》'; 27 | 28 | /** 29 | * 加载核心字符类型词典 30 | * 31 | * @return 核心字符映射到字符类型 Map 32 | */ 33 | private static Map loadCharTypeMap() { 34 | List lines; 35 | try { 36 | lines = IOUtils.readLines(CharUtils.class.getResourceAsStream(ModelPaths.CORE_CHAR_PATH)); 37 | } catch (IOException e) { 38 | throw new RuntimeException(e); 39 | } 40 | Map map = new HashMap<>(lines.size()); 41 | for (String line : lines) { 42 | String[] arr = line.split("\t"); 43 | map.put(arr[0].charAt(0), CharType.of(arr[1])); 44 | } 45 | return map; 46 | } 47 | 48 | /** 49 | * 映射字符类型 50 | * 51 | * @param ch 字符 52 | * @return 字符类型 53 | */ 54 | public static CharType getCharType(char ch) { 55 | if (isSpaceOrControl(ch)) { 56 | return CharType.SPACE_OR_CONTROL_CHAR; 57 | } 58 | return CORE_CHAR_TYPE_MAP.getOrDefault(ch, CharType.OTHER_CHAR); 59 | } 60 | 61 | 62 | /** 63 | * 是否为控制字符或空格字符,在分词过程中忽略这样的字符. 
64 | * 65 | * @param ch 字符 66 | * @return 布尔值,若是则返回true 67 | */ 68 | public static boolean isSpaceOrControl(char ch) { 69 | return (ch < LATIN_SPACE_CHAR) || Character.isSpaceChar(ch); 70 | } 71 | 72 | 73 | /** 74 | * 字符是否为数字 75 | * 76 | * @param ch 字符 77 | * @return 布尔值 78 | */ 79 | public static boolean isNumeral(char ch) { 80 | CharType charType = getCharType(ch); 81 | return charType == CharType.CHINESE_NUMERAL_CHAR 82 | || charType == CharType.ARABIC_NUMERAL_CHAR; 83 | } 84 | 85 | /** 86 | * 半角字符转全角字符. 87 | * 半角空格为32, 全角空格为12288; 88 | * 其他半角字符(33-126)与全角字符(65281-65374)均相差 65248. 89 | * 90 | * @param ch 字符 91 | * @return 半角转成的全角字符 92 | */ 93 | public static char convertHalfWidth(char ch) { 94 | if (ch == 32) { 95 | return (char) 12288; 96 | } else if (ch > 32 && ch < 127) { 97 | return (char) (ch + 65248); 98 | } 99 | return ch; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/util/ChineseUtils.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.util; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | 8 | public final class ChineseUtils { 9 | 10 | /** 11 | * 繁体字符映射到简体字符 12 | */ 13 | private static final HashMap T2S_MAP = parseT2sMap(); 14 | 15 | /** 16 | * 停用词表 17 | */ 18 | private static final DoubleArrayTrie STOP_WORDS_DAT = DoubleArrayTrie.loadDat( 19 | ChineseUtils.class.getResourceAsStream(ModelPaths.STOP_WORDS_BIN_PATH)); 20 | 21 | /** 22 | * 解析繁体简体映射Map文件. 
/**
 * IO utilities, modeled after the commons-io IOUtils class.
 */
public final class IOUtils {

    /**
     * Represents the end-of-file (or stream).
     */
    private static final int EOF = -1;

    /**
     * The default buffer size to use for copy.
     */
    private static final int DEFAULT_BUFFER_SIZE = 1024 * 4;

    /**
     * Maps a whole file read-only into memory.
     * The stream and channel are closed before returning; a MappedByteBuffer
     * remains valid after its channel is closed.
     *
     * @param inputPath the file path to read from, not null
     * @return the mapped byte buffer
     * @throws IOException if an I/O error occurs
     */
    public static MappedByteBuffer mapToByteBuffer(final String inputPath) throws IOException {
        // try-with-resources: the original leaked the FileInputStream/FileChannel
        try (FileInputStream stream = new FileInputStream(inputPath);
             FileChannel channel = stream.getChannel()) {
            return channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
        }
    }

    /**
     * Gets the contents of an InputStream as a list of Strings,
     * one entry per line.
     * NOTE(review): decodes with the platform default charset, as the
     * original did; consider adding an explicit-charset overload.
     *
     * @param input the InputStream to read from, not null
     * @return the list of Strings, never null
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static List<String> readLines(final InputStream input) throws IOException {
        final List<String> list = new ArrayList<>();
        // try-with-resources: the original only closed the reader on the happy path
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input))) {
            String line;
            while ((line = reader.readLine()) != null) {
                list.add(line);
            }
        }
        return list;
    }

    /**
     * Gets the contents of an InputStream as a byte[].
     * This method buffers the input internally, so there is no need to use a
     * BufferedInputStream.
     *
     * @param input the InputStream to read from
     * @return the requested byte array
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static byte[] toByteArray(final InputStream input) throws IOException {
        try (final ByteArrayOutputStream output = new ByteArrayOutputStream()) {
            final byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
            int n;
            while (EOF != (n = input.read(buffer))) {
                output.write(buffer, 0, n);
            }
            return output.toByteArray();
        }
    }

    /**
     * Gets the contents of an InputStream as little-endian 32-bit ints.
     * This method buffers the input internally, so there is no need to use a
     * BufferedInputStream.
     *
     * @param input the InputStream to read from
     * @return the requested int array; trailing bytes that do not fill a
     *         complete int are ignored
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static int[] toIntArray(final InputStream input) throws IOException {
        byte[] bytes = toByteArray(input);
        IntBuffer intBuffer = ByteBuffer.wrap(bytes)
                .order(ByteOrder.LITTLE_ENDIAN)
                .asIntBuffer();
        int[] array = new int[intBuffer.remaining()];
        intBuffer.get(array);
        return array;
    }
}
"/models/cws_dat.bin"; 46 | 47 | /** 48 | * 分词模块label 49 | */ 50 | public static final String SEGMENTER_LABEL_PATH = "/models/cws_label.txt"; 51 | 52 | /** 53 | * 词性标注模块label 54 | */ 55 | public static final String POS_TAGGING_LABEL_PATH = "/models/model_c_label.txt"; 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/POSTaggerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | import org.junit.Test; 5 | 6 | import java.io.IOException; 7 | import java.nio.charset.StandardCharsets; 8 | import java.util.stream.Collectors; 9 | 10 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.POS_FEATURES_PATH; 11 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.POS_WEIGHTS_PATH; 12 | import static io.github.yizhiru.thulac4j.SegmenterTest.SENTENCES; 13 | import static org.junit.Assert.assertEquals; 14 | 15 | public class POSTaggerTest { 16 | 17 | @Test 18 | public void tagging() throws IOException { 19 | String[] expectedResults = new String[]{ 20 | "因/p", 21 | "", 22 | "", 23 | "UTF/x -/w 8/m", 24 | "iphone5/x", 25 | "鲜芋仙/nz 3/m", 26 | "枪杆子/n 中/f 出/v 政权/n", 27 | "两/m 块/q 五/m 一/m 套/q ,/w 三/m 块/q 八/m 一/m 斤/q ,/w 四/m 块/q 七/m 一/m 本/q ,/w 五/m 块/q 六/m 一/m 条/q", 28 | "RT/x @/w laoshipukong/x :/w 27日/t ,/w", 29 | "AT&T/nz 是/v 一/m 件/q 不错/a 的/u 公司/n ,/w 给/p 你/r 发/v offer/x 了/u 吗/u ?/w", 30 | "4/m 个/q 月/n 赚/v 了/u 20%/m 多/m", 31 | "仅/d 1/m 只/q ,/w 为/v 0.9923/m 元/q", 32 | "Just/n one/nz space/x ,/w or/ns all/nz such/x spaces/x ?/w", 33 | "倒模/v ,/w 替身/v 算/v 什么/r ?/w 钟汉良/np 、/w ab/np 《/w 孤芳不自赏/id 》/w 抠图/n 来/v 充数/v", 34 | "奥迪/nz CEO/x 违规/v 遭批/v 大众/n 表示/v 不/d 会/v 解雇/v", 35 | "找/v 小姐/n", 36 | "找/v 小妹/n", 37 | "学生/n 妹/n", 38 | "职业/n 狐狸精/n", 39 | "男/a 公关/n", 40 | "上门/v", 41 | "抽獎/v", 42 | "好/a 声音/n", 43 | "好/a 聲音/n", 44 | "夢/n 之/u 声/g", 45 | 
"夢之聲/id", 46 | "訂票/n", 47 | "改簽/v", 48 | "熱线/n", 49 | "熱線/n", 50 | "热線/a", 51 | "電话/n", 52 | "電話/n", 53 | "醫院/n", 54 | "代刷/v", 55 | "撲剋牌/nz", 56 | "137-1234-1234/m", 57 | "这/r 是/v 一个/m 伸手不见五指/i 的/u 黑夜/n 。/w 我/r 叫/v 孙悟空/np ,/w 我/r 爱/v 北京/ns ,/w 我/r 爱/v Python/x 和/c C/x +/w" + 58 | " +/w 。/w", 59 | "我/r 不/d 喜欢/v 日本/ns 和服/n 。/w", 60 | "雷猴/v 回归/v 人间/n 。/w", 61 | "工信处/n 女/a 干事/n 每月/r 经过/p 下属/v 科室/n 都/d 要/v 亲口/d 交代/v 24/m 口/q 交换机/n 等/u 技术性/n 器件/n 的/u 安装/v 工作/v", 62 | "我/r 需要/v 廉/g 租/v 房/n", 63 | "永和/nz 服装/n 饰品/n 有限公司/n", 64 | "我/r 爱/v 北京/ns 天安门/ns", 65 | "abc/n", 66 | "隐马尔可夫/np", 67 | "雷猴/v 是/v 个/q 好/a 网站/n", 68 | "“/w ,/w ”/w 和/c “/w SOFTware/x (/w 软件/n )/w ”/w 两/m 部分/n 组成/v", 69 | "草泥马/n 和/c 欺/g 实马/n 是/v 今年/t 的/u 流行/v 词汇/n", 70 | "伊藤/nz 洋华堂/n 总府店/n", 71 | "中国/ns 科学院/n 计算/v 技术/n 研究所/n", 72 | "罗密欧/ns 与/c 朱丽叶/np", 73 | "我/r 购买/v 了/u 道具/n 和/c 服装/n", 74 | "PS/x :/w 我/r 觉得/v 开源/v 有/v 一个/m 好处/n ,/w 就/d 是/v 能够/v 敦促/v 自己/r 不断/d 改进/v ,/w 避免/v 敞帚自珍/id", 75 | "湖北省/ns 石首市/ns", 76 | "湖北省/ns 十堰市/ns", 77 | "总经理/n 完成/v 了/u 这/r 件/q 事情/n", 78 | "电脑/n 修好/v 了/u", 79 | "做好/v 了/u 这/r 件/q 事情/n 就/d 一了百了/i 了/u", 80 | "人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 81 | "我们/r 买/v 了/u 一个/m 美/a 的/u 空调/n", 82 | "线程/n 初始化/v 时/g 我们/r 要/v 注意/v", 83 | "一个/m 分子/n 是/v 由/p 好多/m 原子组/n 织成/v 的/u", 84 | "祝/v 你/r 马到功成/i", 85 | "他/r 掉/v 进/v 了/u 无/v 底洞/n 里/f", 86 | "中国/ns 的/u 首都/n 是/v 北京/ns", 87 | "孙君意/np", 88 | "外交部/ni 发言人/n 马朝旭/np", 89 | "领导人/n 会议/n 和/c 第四/m 届/q 东亚/ns 峰会/n", 90 | "在/p 过去/t 的/u 这/r 五/m 年/q", 91 | "还/d 需要/v 很/d 长/a 的/u 路/n 要/v 走/v", 92 | "60/m 周年/q 首都/n 阅兵/n", 93 | "你好/id 人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 94 | "买/v 水果/n 然后/c 来/v 世博园/j", 95 | "买/v 水果/n 然后/c 去/v 世博园/j", 96 | "但是/c 后来/t 我/r 才/d 知道/v 你/r 是/v 对/a 的/u", 97 | "存在/v 即/c 合理/a", 98 | "的/u 的/u 的/u 的/u 的/u 在/p 的/u 的/u 的/u 的/u 就/d 以/p 和和/nz 和/c", 99 | "I/v love/x 你/r ,/w 不以为耻/i ,/w 反/d 以为/v rong/x", 100 | "hello/x 你好/id 人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 101 | "很/d 好/a 但/c 主要/d 是/v 基于/p 网页/n 形式/n", 102 | "为什么/r 我/r 不/d 能/v 拥有/v 想/v 要/v 的/u 生活/v", 103 | 
"后来/t 我/r 才/d", 104 | "此次/r 来/v 中国/ns 是/v 为了/p", 105 | "使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 106 | ",/w 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 107 | "其实/d 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 108 | "好人/n 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 109 | "是/v 因为/p 和/p 国家/n", 110 | "老年/t 搜索/v 还/d 支持/v", 111 | "干脆/d 就/d 把/p 那/r 部/q 蒙/v 人/n 的/u 闲法/n 给/p 废/v 了/u 拉倒/v !/w RT/x @/w laoshipukong/x :/w 27日/t ,/w " + 112 | "全国/n 人大/j 常委会/j 第三/m 次/q 审议/v 侵权/v 责任法/n 草案/n ,/w 删除/v 了/u 有关/v 医疗/n 损害/v 责任/n “/w 举证/v 倒置/v" + 113 | " ”/w 的/u 规定/n 。/w 在/p 医患/n 纠纷/n 中/f 本/d 已/d 处于/v 弱势/n 地位/n 的/u 消费者/n 由此/d 将/d 陷入/v 万劫不复/i " + 114 | "的/u 境地/n 。/w", 115 | "他/r 说/v 的/u 确实/a 在理/a", 116 | "长春/ns 市长/n 春节/t 讲话/n", 117 | "结婚/v 的/u 和/c 尚未/d 结婚/v 的/u", 118 | "结合/v 成分子/n 时/g", 119 | "旅游/v 和/c 服务/v 是/v 最/d 好/a 的/u", 120 | "这/r 件/q 事情/n 的确/d 是/v 我/r 的/u 错/n", 121 | "供/v 大家/r 参考/v 指正/v", 122 | "哈尔滨/ns 政府/n 公布/v 塌/v 桥/n 原因/n", 123 | "我/r 在/p 机场/n 入口处/n", 124 | "邢永臣/np 摄影/v 报道/v", 125 | "BP/x 神经/n 网络/n 如何/r 训练/v 才/d 能/v 在/p 分类/v 时/g 增加/v 区/n 分度/n ?/w", 126 | "南京市/ns 长江/ns 大桥/n", 127 | "应/v 一些/m 使用者/n 的/u 建议/n ,/w 也/d 为了/p 便于/v 利用/v NiuTrans/x 用于/v SMT/x 研究/v", 128 | "长春市/ns 长春/ns 药店/n", 129 | "邓颖超/np 生前/t 最/d 喜欢/v 的/u 衣服/n", 130 | "胡锦涛/np 是/v 热爱/v 世界/n 和平/n 的/u 政治局/n 常委/n", 131 | "程序员/n 祝海林/np 和/c 朱会震/np 是/v 在/p 孙健/np 的/u 左面/f 和/c 右面/f ,/w 范凯/np 在/p 最/d 右面/f ./w 再/d 往/p 左/f 是/v " + 132 | "李松洪/np", 133 | "一次性/d 交/v 多少/r 钱/n", 134 | "小/a 和/c 尚/d 留/v 了/u 一个/m 像/p 大/a 和尚/n 一样/a 的/u 和尚/n 头/n", 135 | "我/r 是/v 中华人民共和国/ns 公民/n ;/w 我/r 爸爸/n 是/v 共和党/n 党员/n ;/w 地铁/n 和平/n 门站/n", 136 | "张晓梅/np 去/v 人民/n 医院/n 做/v 了/u 个/q B/x 超然/a 后/f 去/v 买/v 了/u 件/q T/m 恤/q", 137 | "C/x +/w +/w 和/c c/g #/w 是/v 什么/r 关系/n ?/w 11/m +/w 122/m =/w 133/m ,/w 是/v 吗/u ?/w PI/x =/w 3.14159/m", 138 | "你/r 认识/v 那个/r 和/c 主席/n 握手/v 的/u 的/u 哥/j 吗/u ?/w 他/r 开/v 一/m 辆/q 黑色/n 的士/n 。/w", 139 | "2017-10-13/m 给/p 你/r 发/v offer/x 了/u 吗/u ?/w 27日/t 发/v iphone5/x 了/u 吗/u", 140 | "本报/r 讯/g 深圳市/ns 海王/nz 生物/n 工程/n 股份/n 有限公司/n 二○○○/m 年度/n 增/v 发/v A/x 股/n 路/n 
演/v 推介会/n 日前/t 在/p 北京/ns 举行/v", 141 | "共同/d 创造/v 美好/a 的/u 新/a 世纪/n ——/w 2001年/t 新年/t 贺词/n", 142 | }; 143 | 144 | POSTagger posTagger = new POSTagger(POS_WEIGHTS_PATH, POS_FEATURES_PATH); 145 | posTagger.enableTitleWord(); 146 | for (int i = 0; i < SENTENCES.length; i++) { 147 | String actual = posTagger.tagging(SENTENCES[i]) 148 | .stream() 149 | .map(TokenItem::toString) 150 | .collect(Collectors.joining(" ")); 151 | assertEquals(expectedResults[i], actual); 152 | } 153 | 154 | long length = 0L; 155 | long start = System.currentTimeMillis(); 156 | for (int i = 0; i < 100; ++i) { 157 | for (String sentence : SENTENCES) { 158 | posTagger.tagging(sentence); 159 | length += sentence.getBytes(StandardCharsets.UTF_8).length; 160 | } 161 | } 162 | long elapsed = (System.currentTimeMillis() - start); 163 | System.out.println(String.format("time elapsed: %d ms, rate: %.2f kb/s.", 164 | elapsed, (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f))); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/SPChineseTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronClassifier; 4 | import org.junit.Test; 5 | import org.junit.runner.RunWith; 6 | import org.powermock.core.classloader.annotations.PrepareForTest; 7 | import org.powermock.modules.junit4.PowerMockRunner; 8 | import org.powermock.reflect.internal.WhiteboxImpl; 9 | 10 | import java.io.FileInputStream; 11 | import java.util.Arrays; 12 | 13 | import static org.junit.Assert.assertArrayEquals; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | @RunWith(PowerMockRunner.class) 17 | @PrepareForTest(StructuredPerceptronClassifier.class) 18 | public class SPChineseTokenizerTest { 19 | 20 | /** 21 | * Segmenter weights model path. 
22 | */ 23 | public static final String SEG_WEIGHTS_PATH = "models/cws_model.bin"; 24 | 25 | /** 26 | * Segmenter features path. 27 | */ 28 | public static final String SEG_FEATURES_PATH = "models/cws_dat.bin"; 29 | 30 | public static final String SEG_LABELS_PATH = "models/cws_label.txt"; 31 | 32 | /** 33 | * POSTagger weights model path. 34 | */ 35 | public static final String POS_WEIGHTS_PATH = "models/model_c_model.bin"; 36 | 37 | /** 38 | * POSTagger features path. 39 | */ 40 | public static final String POS_FEATURES_PATH = "models/model_c_dat.bin"; 41 | 42 | public static final String POS_LABELS_PATH = "models/model_c_label.txt"; 43 | 44 | @Test 45 | public void setPreviousTrans() throws Exception { 46 | SPChineseTokenizer tokenizer = new SPChineseTokenizer( 47 | new FileInputStream(SEG_WEIGHTS_PATH), 48 | new FileInputStream(SEG_FEATURES_PATH), 49 | new FileInputStream(SEG_LABELS_PATH)); 50 | StructuredPerceptronClassifier classifier = WhiteboxImpl.getInternalState(tokenizer, "classifier"); 51 | int[][] previousTrans = WhiteboxImpl.invokeMethod( 52 | tokenizer, 53 | "setPreviousTransitions", 54 | new Class[]{String[].class}, 55 | (Object) classifier.getLabelValues()); 56 | 57 | assertEquals("[[1, 2], [0, 3], [1, 2], [0, 3]]", 58 | Arrays.deepToString(previousTrans)); 59 | 60 | tokenizer = new SPChineseTokenizer( 61 | new FileInputStream(POS_WEIGHTS_PATH), 62 | new FileInputStream(POS_FEATURES_PATH), 63 | new FileInputStream(POS_LABELS_PATH)); 64 | classifier = WhiteboxImpl.getInternalState(tokenizer, "classifier"); 65 | previousTrans = WhiteboxImpl.invokeMethod( 66 | tokenizer, 67 | "setPreviousTransitions", 68 | new Class[]{String[].class}, 69 | (Object) classifier.getLabelValues()); 70 | assertEquals("[1, 2, 4, 5, 7, 10, 13, 15, 17, 18, 19, 23, 25, 27, " + 71 | "30, 32, 33, 34, 35, 36, 37, 38, 39, 41, 44, 45, 48, 50, 53, " + 72 | "56, 57, 59, 61, 63, 67, 69, 72, 74, 76, 80, 81, 82, 83, 88, " + 73 | "89, 90, 91, 95]", 74 | Arrays.toString(previousTrans[0])); 
75 | assertEquals("[0, 20]", Arrays.toString(previousTrans[1])); 76 | assertEquals("[54, 55]", Arrays.toString(previousTrans[56])); 77 | assertEquals("[93, 94]", Arrays.toString(previousTrans[95])); 78 | } 79 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/SegmenterTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import org.junit.FixMethodOrder; 4 | import org.junit.Test; 5 | import org.junit.runners.MethodSorters; 6 | 7 | import java.nio.charset.StandardCharsets; 8 | import java.util.Arrays; 9 | import java.util.stream.Collectors; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | 13 | @FixMethodOrder(MethodSorters.NAME_ASCENDING) 14 | public class SegmenterTest { 15 | 16 | static final String[] SENTENCES = new String[]{ 17 | "因", 18 | " ", 19 | "", 20 | "UTF-8", 21 | "iphone5", 22 | "鲜芋仙 3", 23 | "枪杆子中出政权", 24 | "两块五一套,三块八一斤,四块七一本,五块六一条", 25 | "RT @laoshipukong : 27日,", 26 | "AT&T是一件不错的公司,给你发offer了吗?", 27 | "4个月赚了20%多", 28 | "仅1只,为0.9923元", 29 | "Just one space, or all such spaces?", 30 | "倒模,替身算什么?钟汉良、ab《孤芳不自赏》抠图来充数", 31 | "奥迪CEO违规遭批 大众表示不会解雇", 32 | "找小姐", 33 | "找小妹", 34 | "学生妹", 35 | "职业狐狸精", 36 | "男公关", 37 | "上门", 38 | "抽獎", 39 | "好声音", 40 | "好聲音", 41 | "夢之声", 42 | "夢之聲", 43 | "訂票", 44 | "改簽", 45 | "熱线", 46 | "熱線", 47 | "热線", 48 | "電话", 49 | "電話", 50 | "醫院", 51 | "代刷", 52 | "撲剋牌", 53 | "137-1234-1234", 54 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", 55 | "我不喜欢日本和服。", 56 | "雷猴回归人间。", 57 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 58 | "我需要廉租房", 59 | "永和服装饰品有限公司", 60 | "我爱北京天安门", 61 | "abc", 62 | "隐马尔可夫", 63 | "雷猴是个好网站", 64 | "“,”和“SOFTware(软件)”两部分组成", 65 | "草泥马和欺实马是今年的流行词汇", 66 | "伊藤洋华堂总府店", 67 | "中国科学院计算技术研究所", 68 | "罗密欧与朱丽叶", 69 | "我购买了道具和服装", 70 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", 71 | "湖北省石首市", 72 | "湖北省十堰市", 73 | "总经理完成了这件事情", 74 | "电脑修好了", 75 | "做好了这件事情就一了百了了", 76 | 
"人们审美的观点是不同的", 77 | "我们买了一个美的空调", 78 | "线程初始化时我们要注意", 79 | "一个分子是由好多原子组织成的", 80 | "祝你马到功成", 81 | "他掉进了无底洞里", 82 | "中国的首都是北京", 83 | "孙君意", 84 | "外交部发言人马朝旭", 85 | "领导人会议和第四届东亚峰会", 86 | "在过去的这五年", 87 | "还需要很长的路要走", 88 | "60周年首都阅兵", 89 | "你好人们审美的观点是不同的", 90 | "买水果然后来世博园", 91 | "买水果然后去世博园", 92 | "但是后来我才知道你是对的", 93 | "存在即合理", 94 | "的的的的的在的的的的就以和和和", 95 | "I love你,不以为耻,反以为rong", 96 | "hello你好人们审美的观点是不同的", 97 | "很好但主要是基于网页形式", 98 | "为什么我不能拥有想要的生活", 99 | "后来我才", 100 | "此次来中国是为了", 101 | "使用了它就可以解决一些问题", 102 | ",使用了它就可以解决一些问题", 103 | "其实使用了它就可以解决一些问题", 104 | "好人使用了它就可以解决一些问题", 105 | "是因为和国家", 106 | "老年搜索还支持", 107 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议" + 108 | "侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于" + 109 | "弱势地位的消费者由此将陷入万劫不复的境地。 ", 110 | "他说的确实在理", 111 | "长春市长春节讲话", 112 | "结婚的和尚未结婚的", 113 | "结合成分子时", 114 | "旅游和服务是最好的", 115 | "这件事情的确是我的错", 116 | "供大家参考指正", 117 | "哈尔滨政府公布塌桥原因", 118 | "我在机场入口处", 119 | "邢永臣摄影报道", 120 | "BP神经网络如何训练才能在分类时增加区分度?", 121 | "南京市长江大桥", 122 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", 123 | "长春市长春药店", 124 | "邓颖超生前最喜欢的衣服", 125 | "胡锦涛是热爱世界和平的政治局常委", 126 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", 127 | "一次性交多少钱", 128 | "小和尚留了一个像大和尚一样的和尚头", 129 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", 130 | "张晓梅去人民医院做了个B超然后去买了件T恤", 131 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", 132 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", 133 | "2017-10-13给你发offer了吗?27日发iphone5了吗", 134 | "本报讯深圳市海王生物工程股份有限公司二○○○年度增发A股路演推介会日前在北京举行", 135 | "共同创造美好的新世纪——2001年新年贺词", 136 | }; 137 | 138 | @Test 139 | public void segment() { 140 | String[] expectedResults = new String[]{ 141 | "因", 142 | "", 143 | "", 144 | "UTF - 8", 145 | "iphone5", 146 | "鲜芋仙 3", 147 | "枪杆子 中 出 政权", 148 | "两 块 五一套 , 三 块 八一斤 , 四 块 七 一 本 , 五 块 六 一 条", 149 | "RT @ laoshipukong : 27日 ,", 150 | "AT&T 是 一 件 不错 的 公司 , 给 你 发 offer 了 吗 ?", 151 | "4 个 月 赚 了 20% 多", 152 | "仅 1 只 , 为 0.9923 元", 153 | "Just one space , or all such spaces ?", 154 | "倒模 , 替身 算 什么 ? 
钟汉良 、 ab 《 孤芳不自赏 》 抠 图 来 充数", 155 | "奥迪 CEO 违规 遭 批 大众 表示 不 会 解雇", 156 | "找 小姐", 157 | "找 小 妹", 158 | "学生 妹", 159 | "职业 狐狸 精", 160 | "男 公关", 161 | "上门", 162 | "抽獎", 163 | "好 声音", 164 | "好 聲音", 165 | "夢 之 声", 166 | "夢 之 聲", 167 | "訂票", 168 | "改簽", 169 | "熱线", 170 | "熱線", 171 | "热線", 172 | "電话", 173 | "電話", 174 | "醫院", 175 | "代刷", 176 | "撲剋牌", 177 | "137-1234-1234", 178 | "这 是 一个 伸手不见五指 的 黑夜 。 我 叫 孙悟空 , 我 爱 北京 , 我 爱 Python 和 C + + 。", 179 | "我 不 喜欢 日本 和服 。", 180 | "雷猴 回归 人间 。", 181 | "工信 处女 干事 每月 经过 下属 科室 都 要 亲口 交代 24 口 交换机 等 技术性 器件 的 安装 工作", 182 | "我 需要 廉 租 房", 183 | "永 和 服装 饰品 有限公司", 184 | "我 爱 北京 天安门", 185 | "abc", 186 | "隐马尔可夫", 187 | "雷猴 是 个 好 网站", 188 | "“ , ” 和 “ SOFTware ( 软件 ) ” 两 部分 组成", 189 | "草泥马 和 欺实马 是 今年 的 流行 词汇", 190 | "伊藤 洋华堂 总府 店", 191 | "中国 科学院 计算 技术 研究所", 192 | "罗密欧 与 朱丽叶", 193 | "我 购买 了 道具 和 服装", 194 | "PS : 我 觉得 开源 有 一个 好处 , 就是 能够 敦促 自己 不断 改进 , 避免 敞帚自珍", 195 | "湖北省 石首市", 196 | "湖北省 十堰市", 197 | "总经理 完成 了 这 件 事情", 198 | "电脑 修好 了", 199 | "做 好 了 这 件 事情 就 一了百了 了", 200 | "人们 审美 的 观点 是 不同 的", 201 | "我们 买 了 一个 美 的 空调", 202 | "线程 初始化 时 我们 要 注意", 203 | "一个 分子 是 由 好多 原子 组织 成 的", 204 | "祝 你 马到功成", 205 | "他 掉 进 了 无 底洞 里", 206 | "中国 的 首都 是 北京", 207 | "孙君意", 208 | "外交部 发言人 马朝旭", 209 | "领导人 会议 和 第四 届 东亚 峰会", 210 | "在 过去 的 这 五 年", 211 | "还 需要 很 长 的 路 要 走", 212 | "60 周年 首都 阅兵", 213 | "你好 人们 审美 的 观点 是 不同 的", 214 | "买 水 果然 后来 世博园", 215 | "买 水果 然后 去世 博园", 216 | "但是 后来 我 才 知道 你 是 对 的", 217 | "存在 即 合理", 218 | "的 的 的 的 的 在 的 的 的 的 就 以 和 和 和", 219 | "I love 你 , 不以为耻 , 反 以为 rong", 220 | "hello 你好 人们 审美 的 观点 是 不同 的", 221 | "很 好 但 主要 是 基于 网页 形式", 222 | "为什么 我 不 能 拥有 想 要 的 生活", 223 | "后来 我 才", 224 | "此次 来 中国 是 为了", 225 | "使用 了 它 就 可以 解决 一些 问题", 226 | ", 使用 了 它 就 可以 解决 一些 问题", 227 | "其实 使用 了 它 就 可以 解决 一些 问题", 228 | "好人 使用 了 它 就 可以 解决 一些 问题", 229 | "是 因为 和 国家", 230 | "老年 搜索 还 支持", 231 | "干脆 就 把 那 部 蒙人 的 闲法 给 废 了 拉倒 ! 
RT @ laoshipukong : 27日 , 全国 人大 常委会 第三 次 审议 侵权 责任法 草案 , 删除 了 有关 医疗 损害 " + 232 | "责任 “ 举证 倒置 ” 的 规定 。 在 医患 纠纷 中 本 已 处于 弱势 地位 的 消费者 由此 将 陷入 万劫不复 的 境地 。", 233 | "他 说 的 确实 在理", 234 | "长春 市长 春节 讲话", 235 | "结婚 的 和 尚未 结婚 的", 236 | "结合 成分子 时", 237 | "旅游 和 服务 是 最 好 的", 238 | "这 件 事情 的确 是 我 的 错", 239 | "供 大家 参考 指正", 240 | "哈尔滨 政府 公布 塌桥 原因", 241 | "我 在 机场 入口处", 242 | "邢永臣 摄影 报道", 243 | "BP 神经 网络 如何 训练 才 能 在 分类 时 增加 区 分度 ?", 244 | "南京市 长江 大桥", 245 | "应 一些 使用者 的 建议 , 也 为了 便于 利用 NiuTrans 用于 SMT 研究", 246 | "长春市 长春 药店", 247 | "邓颖超 生前 最 喜欢 的 衣服", 248 | "胡锦涛 是 热爱 世界 和平 的 政治局 常委", 249 | "程序员 祝海林 和 朱会震 是 在 孙健 的 左面 和 右面 , 范凯 在 最 右 面 . 再 往 左 是 李松洪", 250 | "一次性 交 多少 钱", 251 | "小 和尚 留 了 一个 像 大 和 尚 一样 的 和尚 头", 252 | "我 是 中华人民共和国 公民 ; 我 爸爸 是 共和党 党员 ; 地铁 和平门站", 253 | "张晓梅 去 人民 医院 做 了 个 B 超然 后 去 买 了 件 T 恤", 254 | "C + + 和 c # 是 什么 关系 ? 11 + 122 = 133 , 是 吗 ? PI = 3.14159", 255 | "你 认识 那个 和 主席 握手 的 的 哥 吗 ? 他 开 一 辆 黑色 的 士 。", 256 | "2017-10-13 给 你 发 offer 了 吗 ? 27日 发 iphone5 了 吗", 257 | "本报 讯 深圳市 海王 生物 工程 股份 有限公司 二○○○ 年度 增 发 A 股 路演 推介会 日前 在 北京 举行", 258 | "共同 创造 美好 的 新 世纪 —— 2001年 新年 贺词", 259 | }; 260 | 261 | Segmenter.enableTitleWord(); 262 | for (int i = 0; i < SENTENCES.length; i++) { 263 | String actual = String.join(" ", Segmenter.segment(SENTENCES[i])); 264 | assertEquals(expectedResults[i], actual); 265 | } 266 | 267 | long length = 0L; 268 | long start = System.currentTimeMillis(); 269 | for (int i = 0; i < 1000; ++i) { 270 | for (String sentence : SENTENCES) { 271 | Segmenter.segment(sentence); 272 | length += sentence.getBytes(StandardCharsets.UTF_8).length; 273 | } 274 | } 275 | long elapsed = (System.currentTimeMillis() - start); 276 | System.out.println(String.format("time elapsed: %d ms, rate: %.2f kb/s.", 277 | elapsed, (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f))); 278 | } 279 | 280 | @Test 281 | public void addUserWords() { 282 | Segmenter.addUserWords(Arrays.asList("中国风", "淡雅茗香")); 283 | assertEquals("浓浓的,中国风,淡雅茗香,古风", 284 | String.join(",", Segmenter.segment("浓浓的中国风 
淡雅茗香古风"))); 285 | } 286 | 287 | @Test 288 | public void zFilterStopWords() { 289 | Segmenter.enableFilterStopWords(); 290 | assertEquals("我,能,做,的,事,绝不,推诿,到,下,一", 291 | String.join(",", Segmenter.segment("此时我能做的事,绝不推诿到下一时刻;"))); 292 | assertEquals("H,歌,你,的,猎豹,要是,有,你,的,嘴,那么,硬,有,多,好", 293 | String.join(",", Segmenter.segment("【H歌】你的猎豹要是有你的嘴那么硬有多好"))); 294 | assertEquals("沿江,高铁,雏形,初,现,湖北,要,做,祖国,立交桥", 295 | String.join(",", Segmenter.segment("沿江高铁雏形初现:湖北要做“祖国立交桥”"))); 296 | assertEquals("学,得,好,却,总是,考,不好,是,回,事", 297 | String.join(",", Segmenter.segment("「学得好却总是考不好」是怎么回事?"))); 298 | } 299 | } 300 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/common/DoubleArrayTrieTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import io.github.yizhiru.thulac4j.util.ModelPaths; 4 | import org.junit.Test; 5 | 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.List; 10 | import java.util.Set; 11 | import java.util.stream.Collectors; 12 | 13 | import static org.junit.Assert.*; 14 | 15 | public class DoubleArrayTrieTest { 16 | 17 | @Test 18 | public void isMatched() throws IOException { 19 | DoubleArrayTrie dat = DoubleArrayTrie.loadDat("." 
+ ModelPaths.NS_BIN_PATH); 20 | assertTrue(dat.isPrefixMatched("黑龙江")); 21 | assertTrue(dat.isWordMatched("黑龙江")); 22 | assertTrue(dat.isWordMatched("齐齐哈尔")); 23 | assertTrue(dat.isWordMatched("名古屋")); 24 | assertTrue(dat.isWordMatched("克拉约瓦")); 25 | assertTrue(dat.isWordMatched("10月9日街")); 26 | assertTrue(dat.isWordMatched("鸡公?")); 27 | assertTrue(dat.isWordMatched("齐白石纪念馆")); 28 | assertTrue(dat.isWordMatched("龙格伦吉里")); 29 | assertTrue(dat.isWordMatched("特德本-圣玛丽")); 30 | assertFalse(dat.isWordMatched("首乌")); 31 | } 32 | 33 | @Test 34 | public void serialize() throws IOException { 35 | String[] dictPaths = new String[]{ 36 | ModelPaths.IDIOM_DICT_PATH, 37 | ModelPaths.NS_DICT_PATH, 38 | ModelPaths.STOP_WORDS_DICT_PATH, 39 | }; 40 | String[] binPaths = new String[]{ 41 | "." + ModelPaths.IDIOM_BIN_PATH, 42 | "." + ModelPaths.NS_BIN_PATH, 43 | "." + ModelPaths.STOP_WORDS_BIN_PATH, 44 | }; 45 | for (int i = 0; i < dictPaths.length; i++) { 46 | DoubleArrayTrie expect = DoubleArrayTrie.make(dictPaths[i]); 47 | expect.serialize(binPaths[i]); 48 | DoubleArrayTrie actual = DoubleArrayTrie.loadDat(binPaths[i]); 49 | 50 | assertEquals(expect.size(), actual.size()); 51 | for (int j = 0; j < expect.size(); j++) { 52 | assertEquals(expect.getBaseByIndex(j), actual.getBaseByIndex(j)); 53 | assertEquals(expect.getCheckByIndex(j), actual.getCheckByIndex(j)); 54 | } 55 | } 56 | } 57 | 58 | @Test 59 | public void make() throws IOException { 60 | String[] paths = new String[]{ 61 | ModelPaths.NS_DICT_PATH, 62 | ModelPaths.IDIOM_DICT_PATH, 63 | ModelPaths.STOP_WORDS_DICT_PATH 64 | }; 65 | for (String path : paths) { 66 | List lexicon = Files.lines(Paths.get(path)) 67 | .map(String::trim) 68 | .collect(Collectors.toList()); 69 | DoubleArrayTrie dat = DoubleArrayTrie.make(path); 70 | for (String word : lexicon) { 71 | if (word.length() > 1) { 72 | assertTrue(dat.isPrefixMatched(word.substring(0, word.length() - 1))); 73 | } 74 | assertTrue(dat.isWordMatched(word)); 75 | } 76 | } 77 | } 
78 | 79 | @Test 80 | public void restore() throws IOException { 81 | String[] binPaths = new String[]{ 82 | "." + ModelPaths.NS_BIN_PATH, 83 | "." + ModelPaths.IDIOM_BIN_PATH, 84 | "." + ModelPaths.STOP_WORDS_BIN_PATH 85 | }; 86 | String[] dictPaths = new String[]{ 87 | ModelPaths.NS_DICT_PATH, 88 | ModelPaths.IDIOM_DICT_PATH, 89 | ModelPaths.STOP_WORDS_DICT_PATH 90 | }; 91 | 92 | for (int i = 0; i < binPaths.length; i++) { 93 | DoubleArrayTrie dat = DoubleArrayTrie.loadDat(binPaths[i]); 94 | Set dict = Files.lines(Paths.get(dictPaths[i])) 95 | .map(String::trim) 96 | .collect(Collectors.toSet()); 97 | List words = DoubleArrayTrie.restore(dat); 98 | for (String word : words) { 99 | assertTrue(dict.contains(word)); 100 | } 101 | assertEquals(dict.size(), words.size()); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronModelTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.process.RuleAnnotator; 4 | import io.github.yizhiru.thulac4j.term.POC; 5 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 6 | import org.junit.Test; 7 | 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.util.Arrays; 11 | 12 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.*; 13 | import static org.junit.Assert.assertArrayEquals; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class StructuredPerceptronModelTest { 17 | 18 | @Test 19 | public void loadSegmenterModel() throws IOException { 20 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 21 | new FileInputStream(SEG_WEIGHTS_PATH), 22 | new FileInputStream(SEG_FEATURES_PATH), 23 | new FileInputStream(SEG_LABELS_PATH)); 24 | assertEquals(2453880, SPModel.featureSize); 25 | assertEquals(4, 
SPModel.labelSize); 26 | assertEquals(-42717, SPModel.llWeights[0]); 27 | assertEquals(-4958, SPModel.flWeights[0]); 28 | 29 | assertArrayEquals(new String[]{"3"}, getPosValue(POC.SINGLE_NUMERAL_POC, SPModel)); 30 | assertArrayEquals(new String[]{"0"}, getPosValue(POC.BEGIN_POC, SPModel)); 31 | assertArrayEquals(new String[]{"1"}, getPosValue(POC.MIDDLE_POC, SPModel)); 32 | assertArrayEquals(new String[]{"2"}, getPosValue(POC.END_POC, SPModel)); 33 | assertArrayEquals(new String[]{"3"}, getPosValue(POC.SINGLE_POC, SPModel)); 34 | assertArrayEquals(new String[]{"0", "3"}, getPosValue(POC.BEGIN_OR_SINGLE_POC, SPModel)); 35 | assertArrayEquals(new String[]{"2", "3"}, getPosValue(POC.END_OR_SINGLE_POC, SPModel)); 36 | assertArrayEquals( 37 | new String[]{"0", "2", "3", "1"}, 38 | getPosValue(POC.DEFAULT_POC, SPModel)); 39 | } 40 | 41 | @Test 42 | public void loadPosModel() throws IOException { 43 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 44 | new FileInputStream(POS_WEIGHTS_PATH), 45 | new FileInputStream(POS_FEATURES_PATH), 46 | new FileInputStream(POS_LABELS_PATH)); 47 | assertEquals(961470, SPModel.featureSize); 48 | assertEquals(96, SPModel.labelSize); 49 | assertEquals(-10615, SPModel.llWeights[0]); 50 | assertEquals(5481, SPModel.flWeights[0]); 51 | 52 | assertArrayEquals(new String[]{"3w"}, getPosValue(POC.PUNCTUATION_POC, SPModel)); 53 | assertArrayEquals(new String[]{"0m"}, getPosValue(POC.BEGIN_NUMERAL_POC, SPModel)); 54 | assertArrayEquals(new String[]{"1m"}, getPosValue(POC.MIDDLE_NUMERAL_POC, SPModel)); 55 | assertArrayEquals(new String[]{"2m"}, getPosValue(POC.END_NUMERAL_POC, SPModel)); 56 | assertArrayEquals(new String[]{"3m"}, getPosValue(POC.SINGLE_NUMERAL_POC, SPModel)); 57 | assertArrayEquals( 58 | new String[]{"0v", "0n", "0ns", "0t", "0f", "0d", "0m", "0q", "0r", "0j", "0s", "0a", 59 | "0id", "0ni", "0p", "0c", "0np", "0nz", "0w", "0u", "0o", "0x", "0e", "0k"}, 60 | getPosValue(POC.BEGIN_POC, SPModel)); 61 | 
assertArrayEquals( 62 | new String[]{"1n", "1ns", "1t", "1v", "1m", "1j", "1id", "1ni", "1c", "1np", "1d", "1a", 63 | "1nz", "1w", "1q", "1s", "1f", "1r", "1x", "1o", "1p", "1e", "1u", "1k"}, 64 | getPosValue(POC.MIDDLE_POC, SPModel)); 65 | assertArrayEquals( 66 | new String[]{"2v", "2n", "2ns", "2t", "2f", "2d", "2m", "2q", "2r", "2j", "2s", "2a", "2id", 67 | "2ni", "2p", "2c", "2np", "2nz", "2w", "2u", "2o", "2x", "2e", "2k"}, 68 | getPosValue(POC.END_POC, SPModel)); 69 | assertArrayEquals( 70 | new String[]{"3p", "3v", "3w", "3f", "3u", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "3r", 71 | "3j", "3np", "3x", "3k", "3o", "3e", "3h", "3t", "3ni", "3s", "3nz"}, 72 | getPosValue(POC.SINGLE_POC, SPModel)); 73 | assertArrayEquals( 74 | new String[]{"0v", "3p", "0n", "3v", "3w", "0ns", "0t", "0f", "0d", "3f", "3u", "0m", "0q", "0r", 75 | "0j", "0s", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "0a", "0id", "3r", "0ni", "0p", "0c", 76 | "0np", "3j", "3np", "3x", "0nz", "0w", "0u", "3k", "3o", "0o", "0x", "3e", "3h", "3t", "0e", 77 | "3ni", "3s", "3nz", "0k"}, 78 | getPosValue(POC.BEGIN_OR_SINGLE_POC, SPModel)); 79 | assertArrayEquals( 80 | new String[]{"2v", "3p", "2n", "3v", "3w", "2ns", "2t", "2f", "2d", "3f", "3u", "2m", "2q", "2r", 81 | "2j", "2s", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "2a", "2id", "3r", "2ni", "2p", "2c", 82 | "2np", "3j", "3np", "3x", "2nz", "2w", "2u", "3k", "3o", "2o", "2x", "3e", "3h", "3t", "2e", 83 | "3ni", "3s", "3nz", "2k"}, 84 | getPosValue(POC.END_OR_SINGLE_POC, SPModel)); 85 | } 86 | 87 | /** 88 | * 根据POS 得到对应的所有label 89 | * 90 | * @param pos enum POC 值 91 | * @param SPModel StructuredPerceptronModel 对象 92 | * @return pos 对应的所有label 93 | */ 94 | private String[] getPosValue(POC pos, StructuredPerceptronModel SPModel) { 95 | return Arrays.stream(SPModel.allowTabular[pos.ordinal()]) 96 | .mapToObj(t -> SPModel.labelValues[t]) 97 | .toArray(String[]::new); 98 | } 99 | 100 | @Test 101 | public void evaluateCharWeights() throws IOException { 
102 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 103 | new FileInputStream(SEG_WEIGHTS_PATH), 104 | new FileInputStream(SEG_FEATURES_PATH), 105 | new FileInputStream(SEG_LABELS_PATH)); 106 | String[] sentences = new String[]{ 107 | "鲜", 108 | "两块五一套,", 109 | "AT&T是" 110 | }; 111 | String[] expectedWeights = new String[]{ 112 | "[[0, 0, -2664, 0]]", 113 | "[[-4384, 0, 21415, 0], [-22789, 1568, 24039, -2808], [21771, -3627, -11546, -6585], [-13779, -3998, " + 114 | "7844, 9945], [0, 19768, 18906, 0], [0, 0, 40833, 0]]", 115 | "[[1857, 0, 0, 0], [0, 0, 0, 15367], [0, 0, 0, 8591], [0, 15227, 0, 0], [0, 0, 22574, 0]]", 116 | }; 117 | 118 | for (int i = 0; i < sentences.length; i++) { 119 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 120 | char[] chars = annotatedTerms.appendBoundaryAround(); 121 | 122 | POC[] pocs = annotatedTerms.getPocs(); 123 | 124 | int[][] weights = new int[annotatedTerms.getAnnotatedLength()][]; 125 | for (int j = 0; j < annotatedTerms.getAnnotatedLength(); j++) { 126 | int[] labelIndices = SPModel.allowTabular[pocs[j].ordinal()]; 127 | weights[j] = SPModel.evaluateCharWeights( 128 | chars[j], 129 | chars[j + 1], 130 | chars[j + 2], 131 | chars[j + 3], 132 | chars[j + 4], 133 | labelIndices); 134 | } 135 | 136 | assertEquals(expectedWeights[i], Arrays.deepToString(weights)); 137 | } 138 | 139 | SPModel = new StructuredPerceptronModel( 140 | new FileInputStream(POS_WEIGHTS_PATH), 141 | new FileInputStream(POS_FEATURES_PATH), 142 | new FileInputStream(POS_LABELS_PATH)); 143 | String[] expected0Weights = new String[]{ 144 | "[0, 0, -577, 0, 0, -6529, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -997, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0," + 145 | " 0, 0, 0, 12863, 0, 1074, -4387, 0, -1926, -2411, 0, 0, 0, 0, 0, -910, 0, 0, 0, 0, 0, 0, 0, " + 146 | "0, 0, 0, 0, -1841, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 147 | "0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 148 | "[-5754, 0, 
-1939, 813, 0, -4395, 0, 940, -3822, 0, 0, 166, 0, 0, 7178, 0, 979, 0, -1127, -639, 0, " + 149 | "9709, 0, 0, 3389, 0, 7075, 0, 6760, 0, 0, 3892, 0, 1710, -943, -7110, 31462, 5834, -472, " + 150 | "3806, -577, 0, 1626, 0, 0, -4558, -1971, 0, 0, 0, 0, -985, 0, 0, -1399, 0, 0, -1704, 0, " + 151 | "-775, 0, 0, -5751, 0, 0, 0, 0, 0, -147, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 152 | "0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 153 | "[-5660, 0, 0, -8860, 0, 0, 0, 0, 740, 0, 0, 0, 0, 0, 0, 0, -987, 0, 0, 0, 0, -2864, 0, 0, -953, 0, " + 154 | "0, 0, -1068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4410, 0, -2860, 0, 0, 0, 4055, 0, 0, 0, 0, " + 155 | "-996, 0, 0, 348, 0, 0, 0, 0, 0, 0, 0, 2363, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 156 | "13543, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 157 | }; 158 | for (int i = 0; i < sentences.length; i++) { 159 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 160 | char[] chars = annotatedTerms.appendBoundaryAround(); 161 | 162 | POC[] pocs = annotatedTerms.getPocs(); 163 | int[] labelIndices = SPModel.allowTabular[pocs[0].ordinal()]; 164 | int[] weights = SPModel.evaluateCharWeights( 165 | chars[0], 166 | chars[1], 167 | chars[2], 168 | chars[3], 169 | chars[4], 170 | labelIndices); 171 | assertEquals(expected0Weights[i], Arrays.toString(weights)); 172 | } 173 | } 174 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/LexiconCementerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.util.ModelPaths; 4 | import io.github.yizhiru.thulac4j.term.TokenItem; 5 | import org.junit.Test; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | public 
class LexiconCementerTest { 15 | 16 | @Test 17 | public void cement() throws IOException { 18 | LexiconCementer cementer = new LexiconCementer( 19 | this.getClass().getResourceAsStream(ModelPaths.NS_BIN_PATH), 20 | "ns"); 21 | List tokenItems = new ArrayList<>(Arrays.asList( 22 | new TokenItem("黑", null), 23 | new TokenItem("龙", "n"), 24 | new TokenItem("江", "j")) 25 | ); 26 | cementer.cement(tokenItems); 27 | assertEquals("[黑龙江/ns]", tokenItems.toString()); 28 | 29 | cementer = new LexiconCementer( 30 | this.getClass().getResourceAsStream(ModelPaths.IDIOM_BIN_PATH), 31 | "i"); 32 | tokenItems = new ArrayList<>(Arrays.asList( 33 | new TokenItem("掉", null), 34 | new TokenItem("进", "n"), 35 | new TokenItem("了", "j"), 36 | new TokenItem("无", "n"), 37 | new TokenItem("底洞", "j")) 38 | ); 39 | cementer.cement(tokenItems); 40 | assertEquals("[掉, 进/n, 了/j, 无/n, 底洞/j]", tokenItems.toString()); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/RuleAnnotatorTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.term.POC; 4 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 5 | import org.junit.Test; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.stream.Collectors; 10 | import java.util.stream.Stream; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | /** 15 | * RuleAnnotator Test. 
16 | */ 17 | public class RuleAnnotatorTest { 18 | 19 | @Test 20 | public void annotate() { 21 | Map pocStringHashMap = new HashMap<>(POC.values().length); 22 | pocStringHashMap.put(POC.PUNCTUATION_POC, "w"); 23 | pocStringHashMap.put(POC.BEGIN_NUMERAL_POC, "bm"); 24 | pocStringHashMap.put(POC.MIDDLE_NUMERAL_POC, "mm"); 25 | pocStringHashMap.put(POC.END_NUMERAL_POC, "em"); 26 | pocStringHashMap.put(POC.SINGLE_NUMERAL_POC, "sm"); 27 | pocStringHashMap.put(POC.BEGIN_POC, "b"); 28 | pocStringHashMap.put(POC.MIDDLE_POC, "m"); 29 | pocStringHashMap.put(POC.END_POC, "e"); 30 | pocStringHashMap.put(POC.SINGLE_POC, "s"); 31 | pocStringHashMap.put(POC.BEGIN_OR_SINGLE_POC, "bs"); 32 | pocStringHashMap.put(POC.END_OR_SINGLE_POC, "es"); 33 | pocStringHashMap.put(POC.DEFAULT_POC, "d"); 34 | 35 | String[] sentences = new String[]{ 36 | "4个月赚了20%多", 37 | "【开放式基金】", 38 | "大", 39 | "10大重仓股:厦门钨业……这些", 40 | "鲜芋仙 3", 41 | "仅1只,为0.9923元", 42 | "大河《地方的", 43 | "●会议》无否决", 44 | "AT&T是一家", 45 | "在2017-12-12 这一天", 46 | "UTF-8", 47 | "鲜芋仙 3", 48 | "最右面.再", 49 | "内容《》真实、、", 50 | "签定《供货协议书》的,", 51 | "昨日《上市公司证券发行管理办法》发布", 52 | "《21世纪》:", 53 | "《探索·发现》栏目", 54 | "《麦亚hee》", 55 | "日系&动漫", 56 | }; 57 | String[] expectedPocString = new String[]{ 58 | "sm,bs,d,d,es,bm,mm,em,s", 59 | "w,bs,d,d,d,es,w", 60 | "s", 61 | "bm,em,bs,d,d,es,w,bs,d,d,es,w,w,bs,es", 62 | "bs,d,es,sm", 63 | "s,sm,s,w,s,bm,mm,mm,mm,mm,em,s", 64 | "bs,es,w,bs,d,es", 65 | "w,bs,es,w,bs,d,es", 66 | "b,m,m,e,bs,d,es", 67 | "s,bm,mm,mm,mm,mm,mm,mm,mm,mm,em,bs,d,es", 68 | "b,m,e,w,sm", 69 | "bs,d,es,sm", 70 | "bs,d,es,w,s", 71 | "bs,es,w,w,bs,es,w,w", 72 | "bs,es,w,b,m,m,m,e,w,s,w", 73 | "bs,es,w,bs,d,d,d,d,d,d,d,d,d,d,es,w,bs,es", 74 | "w,b,m,m,e,w,w", 75 | "w,b,m,m,m,e,w,bs,es", 76 | "w,b,m,m,m,e,w", 77 | "bs,es,s,bs,es", 78 | }; 79 | 80 | for (int i = 0; i < sentences.length; i++) { 81 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 82 | String result = Stream.of(annotatedTerms.getPocs()) 83 | 
.map(pocStringHashMap::get) 84 | .collect(Collectors.joining(",")); 85 | if (!expectedPocString[i].equals(result)) { 86 | System.out.println(sentences[i]); 87 | } 88 | assertEquals(expectedPocString[i], result); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/SpecifiedWordCementerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | import org.junit.Test; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | 12 | public class SpecifiedWordCementerTest { 13 | 14 | @Test 15 | public void cement() { 16 | List tokenItems = new ArrayList<>(Arrays.asList( 17 | new TokenItem("二○○一", "m"), 18 | new TokenItem("年", "q"), 19 | new TokenItem("27", "m"), 20 | new TokenItem("日", "q")) 21 | ); 22 | SpecifiedWordCementer.cementWord(tokenItems); 23 | assertEquals(2, tokenItems.size()); 24 | assertEquals("二○○一年", tokenItems.get(0).word); 25 | assertEquals("27日", tokenItems.get(1).word); 26 | assertEquals("t", tokenItems.get(1).pos); 27 | 28 | tokenItems = new ArrayList<>(Arrays.asList( 29 | new TokenItem("盛典", "n"), 30 | new TokenItem("—", "w"), 31 | new TokenItem("—", "w"), 32 | new TokenItem("—", "w"), 33 | new TokenItem("2001", "m"), 34 | new TokenItem("年", "q")) 35 | ); 36 | SpecifiedWordCementer.cementWord(tokenItems); 37 | assertEquals(3, tokenItems.size()); 38 | assertEquals("———", tokenItems.get(1).word); 39 | assertEquals("2001年", tokenItems.get(2).word); 40 | } 41 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/term/POCTest.java: -------------------------------------------------------------------------------- 1 | package 
io.github.yizhiru.thulac4j.term;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests {@link POC#intersect}, the narrowing of two position-of-character
 * hypotheses. Qualified enum references are used instead of one static
 * import per constant.
 */
public class POCTest {

    @Test
    public void intersect() {
        // Punctuation dominates any word-position hypothesis.
        assertEquals(POC.PUNCTUATION_POC, POC.PUNCTUATION_POC.intersect(POC.BEGIN_POC));

        // A plain position combined with its numeral variant narrows to
        // the numeral variant, regardless of operand order.
        assertEquals(POC.BEGIN_NUMERAL_POC, POC.BEGIN_POC.intersect(POC.BEGIN_NUMERAL_POC));
        assertEquals(POC.END_NUMERAL_POC, POC.END_POC.intersect(POC.END_NUMERAL_POC));
        assertEquals(POC.MIDDLE_NUMERAL_POC, POC.MIDDLE_NUMERAL_POC.intersect(POC.MIDDLE_POC));
        assertEquals(POC.SINGLE_NUMERAL_POC, POC.SINGLE_NUMERAL_POC.intersect(POC.SINGLE_POC));

        // "begin or single" meets "end or single" only at "single";
        // the operation is symmetric.
        assertEquals(POC.SINGLE_POC, POC.BEGIN_OR_SINGLE_POC.intersect(POC.END_OR_SINGLE_POC));
        assertEquals(POC.SINGLE_POC, POC.END_OR_SINGLE_POC.intersect(POC.BEGIN_OR_SINGLE_POC));

        // DEFAULT_POC is the identity element: the other operand wins.
        assertEquals(POC.SINGLE_POC, POC.DEFAULT_POC.intersect(POC.SINGLE_POC));
        assertEquals(POC.BEGIN_NUMERAL_POC, POC.BEGIN_NUMERAL_POC.intersect(POC.DEFAULT_POC));
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/CharUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;


import io.github.yizhiru.thulac4j.term.CharType;
import org.junit.Test;

import static io.github.yizhiru.thulac4j.util.CharUtils.getCharType;
import static org.junit.Assert.assertSame;

/**
 * Tests {@link CharUtils#getCharType} over each character class. The seven
 * near-identical assertion loops of the original are collapsed into one
 * helper.
 */
public class CharUtilsTest {

    /** Asserts that every character in {@code chars} maps to {@code expected}. */
    private static void assertAllCharType(char[] chars, CharType expected) {
        for (char c : chars) {
            assertSame(expected, getCharType(c));
        }
    }

    @Test
    public void checkCharType() {
        // Sentence-level punctuation (mostly full-width CJK forms).
        assertAllCharType(new char[]{
                ',', '。', '?', '!', ':', ';', '‘', '’', '“', '”', '【', '】', '、',
                '《', '》', '@', '#', '(', ')', '"', '[', ']', '~', ':', '?', '◤',
                '☆', '★', '…', '\'', '!', '*', '+', '>', '(', ')', ';', '=',
                '℃', '℉',
        }, CharType.SINGLE_PUNCTUATION_CHAR);

        // "Extended" single punctuation — symbols treated separately.
        assertAllCharType(new char[]{
                '·', '—', '¥', '$', '&', '\\', '^', '_', '{', '|', '}'
        }, CharType.EX_SINGLE_PUNCTUATION_CHAR);

        // Chinese numerals 〇一二...九.
        assertAllCharType(new char[]{
                '〇', '一', '二', '三', '四', '五', '六', '七', '八', '九'
        }, CharType.CHINESE_NUMERAL_CHAR);

        // Arabic digits, both ASCII and full-width.
        assertAllCharType(new char[]{
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        }, CharType.ARABIC_NUMERAL_CHAR);

        // Punctuation that may appear inside numbers (percent, decimal
        // point, sign, per-mille, ...).
        assertAllCharType(new char[]{
                '%', '.', ',', '/', '%', '-', '±', '‰',
        }, CharType.NUMERAL_PUNCTUATION_CHAR);

        // Han ideographs, simplified and traditional.
        assertAllCharType(new char[]{
                '苟', '利', '国', '家', '生', '死', '以',
                '豈', '因', '禍', '福', '避', '趨', '之',
        }, CharType.HAN_ZI_CHAR);

        // Latin letters, both cases.
        assertAllCharType(new char[]{
                'a', 'b', 'c', 'd', 'h', 'l', 'o', 'r', 'u', 'z',
                'A', 'B', 'C', 'D', 'H', 'L', 'O', 'R', 'U', 'Z'
        }, CharType.ENGLISH_LETTER_CHAR);

        // Anything unclassified, e.g. the ASCII ampersand.
        assertAllCharType(new char[]{
                '&',
        }, CharType.OTHER_CHAR);
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/ChineseUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;

import org.junit.Test;

import static org.junit.Assert.*;

public class ChineseUtilsTest {

    /** Traditional-to-simplified conversion on full sentences. */
    @Test
    public void t2s() {
        String[] traditions = new String[]{
                "為何曾加入日軍的他,一生無法原諒日本人?日導演用7年走遍台灣,拍下時代淚水",
                "「那些人哪裡像軍隊?根本不知是蔣介石從哪撿來的流氓!」",
                "明明課本上都說光復節是台灣人熱烈歡迎「祖國」到來的時刻,為何有一群受過日本統治的台灣人,到現在都不能接受中華民國?",
                "鯛魚是低脂肪、高蛋白的健康食材, 肉質軟嫩細緻。",
                "世界商機大發現:抓住泰國工頭的需求 就是臺灣手工具產業的福氣啦!",
                "房市買氣還沒回春,房價也還在向下修正,但土地交易熱度卻是燒燙燙,替地方政府的國庫充實不少"
        };
        String[] simples = new String[]{
                "为何曾加入日军的他,一生无法原谅日本人?日导演用7年走遍台湾,拍下时代泪水",
                "「那些人哪里像军队?根本不知是蒋介石从哪捡来的流氓!」",
                "明明课本上都说光复节是台湾人热烈欢迎「祖国」到来的时刻,为何有一群受过日本统治的台湾人,到现在都不能接受中华民国?",
                "鲷鱼是低脂肪、高蛋白的健康食材, 肉质软嫩细致。",
                "世界商机大发现:抓住泰国工头的需求 就是台湾手工具产业的福气啦!",
                "房市买气还没回春,房价也还在向下修正,但土地交易热度却是烧烫烫,替地方政府的国库充实不少"
        };

        for (int i = 0; i < traditions.length; i++) {
            assertEquals(simples[i], ChineseUtils.simplified(traditions[i]));
        }
    }


    /** Stop-word lookup: whole entries match, substrings do not. */
    @Test
    public void isStopWords() {
        assertTrue(ChineseUtils.isStopWord("此时"));
        assertTrue(ChineseUtils.isStopWord(";"));
        assertTrue(ChineseUtils.isStopWord("一时"));
        assertFalse(ChineseUtils.isStopWord("刻"));
        assertFalse(ChineseUtils.isStopWord("到"));
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/IOUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;

import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;

public class IOUtilsTest {

    /**
     * Loads the bundled t2s mapping resource into an int array and
     * spot-checks its length plus the first, a middle, and the last entry.
     *
     * @throws IOException if the resource cannot be read.
     */
    @Test
    public void toIntArray() throws IOException {
        int[] array = IOUtils.toIntArray(
                this.getClass().getResourceAsStream(ModelPaths.T2S_PATH));
        assertEquals(5600, array.length);

        // {index, expected value} spot checks across the table.
        int[][] checks = {
                {0, 33836},
                {2789, 40800},
                {5599, 40863},
        };
        for (int[] check : checks) {
            assertEquals(check[1], array[check[0]]);
        }
    }
}