├── .gitignore ├── LICENSE ├── README.md ├── dicts ├── core_char.dict ├── idiom.dict ├── ns.dict └── stop_words.dict ├── models ├── cws_label.txt ├── idiom_dat.bin ├── model_c_label.txt ├── ns_dat.bin ├── stop_dat.bin └── t2s.dat ├── pom.xml └── src ├── main └── java │ └── io │ └── github │ └── yizhiru │ └── thulac4j │ ├── POSTagger.java │ ├── SPChineseTokenizer.java │ ├── Segmenter.java │ ├── common │ ├── DoubleArrayTrie.java │ └── Nullable.java │ ├── perceptron │ ├── StructuredPerceptronClassifier.java │ └── StructuredPerceptronModel.java │ ├── process │ ├── LexiconCementer.java │ ├── RuleAnnotator.java │ └── SpecifiedWordCementer.java │ ├── term │ ├── AnnotatedTerms.java │ ├── CharType.java │ ├── POC.java │ └── TokenItem.java │ └── util │ ├── CharUtils.java │ ├── ChineseUtils.java │ ├── IOUtils.java │ └── ModelPaths.java └── test └── java └── io └── github └── yizhiru └── thulac4j ├── POSTaggerTest.java ├── SPChineseTokenizerTest.java ├── SegmenterTest.java ├── common └── DoubleArrayTrieTest.java ├── perceptron └── StructuredPerceptronModelTest.java ├── process ├── LexiconCementerTest.java ├── RuleAnnotatorTest.java └── SpecifiedWordCementerTest.java ├── term └── POCTest.java └── util ├── CharUtilsTest.java ├── ChineseUtilsTest.java └── IOUtilsTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | target/ 4 | models/cws_dat.bin 5 | models/cws_model.bin 6 | models/model_c_dat.bin 7 | models/model_c_model.bin 8 | train/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {jyzheng} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # thulac4j 2 | 3 | thulac4j是[THULAC](http://thulac.thunlp.org/)的高效Java 8实现,具有分词速度快、准、强的特点;支持 4 | 5 | - 自定义词典 6 | - 繁体转简体 7 | - 停用词过滤 8 | 9 | 10 | ## 使用示例 11 | 12 | 在项目中使用thulac4j,添加依赖(请使用最新版本): 13 | 14 | ```xml 15 | 16 | io.github.yizhiru 17 | thulac4j 18 | 3.1.2 19 | 20 | ``` 21 | 22 | thulac4j支持中文分词与词性标注,使用示例如下: 23 | 24 | 25 | ```java 26 | String sentence = "滔滔的流水,向着波士顿湾无声逝去"; 27 | List words = Segmenter.segment(sentence); 28 | // [滔滔, 的, 流水, ,, 向着, 波士顿湾, 无声, 逝去] 29 | 30 | POSTagger pos = new POSTagger("models/model_c_model.bin", "models/model_c_dat.bin"); 31 | List words = pos.tagging(sentence); 32 | // [滔滔/a, 的/u, 流水/n, ,/w, 向着/p, 波士顿湾/ns, 无声/v, 逝去/v] 33 | ``` 34 | 35 | 模型数据较大,没有放在jar包与源码。训练模型下载及更多使用说明,请参看[Wiki](https://github.com/yizhiru/thulac4j/wiki). 36 | 37 | 38 | 最后感谢THUNLP实验室! 
39 | 40 | 41 | -------------------------------------------------------------------------------- /dicts/stop_words.dict: -------------------------------------------------------------------------------- 1 | ! 2 | " 3 | # 4 | $ 5 | % 6 | & 7 | ' 8 | ( 9 | ) 10 | * 11 | + 12 | , 13 | - 14 | . 15 | / 16 | 0 17 | 1 18 | 2 19 | 3 20 | 4 21 | 5 22 | 6 23 | 7 24 | 8 25 | 9 26 | : 27 | ; 28 | < 29 | = 30 | > 31 | ? 32 | @ 33 | A 34 | [ 35 | \ 36 | ] 37 | ^ 38 | _ 39 | ` 40 | | 41 | ~ 42 | · 43 | — 44 | ‘ 45 | ’ 46 | “ 47 | ” 48 | … 49 | 、 50 | 。 51 | 〈 52 | 〉 53 | 《 54 | 》 55 | ︿ 56 | , 57 | ! 58 | # 59 | $ 60 | % 61 | & 62 | ( 63 | ) 64 | * 65 | + 66 | , 67 | 0 68 | 1 69 | 2 70 | 3 71 | 4 72 | 5 73 | 6 74 | 7 75 | 8 76 | 9 77 | : 78 | ; 79 | < 80 | > 81 | ? 82 | @ 83 | [ 84 | ] 85 | { 86 | | 87 | } 88 | ~ 89 | ¥ 90 | 『 91 | 』 92 | 【 93 | 】 94 | 〖 95 | 〗 96 | 「 97 | 」 98 | → 99 | ‖ 100 | º 101 | 造就 102 | 等到 103 | 其人 104 | 看得到 105 | 辅以 106 | 就是 107 | 谁知 108 | 看见 109 | 不顾 110 | 提出 111 | 举办 112 | 总能 113 | 比起 114 | 哪知 115 | 含有 116 | 接着 117 | 怎么回事 118 | 看出 119 | 此地 120 | 一手 121 | 发出 122 | 看得出 123 | 乃是 124 | 富于 125 | 来看 126 | 不肯 127 | 有利 128 | 回来 129 | 经由 130 | 加以 131 | 不如说 132 | 看不见 133 | 走去 134 | 有所 135 | 离不开 136 | 要知道 137 | 正当 138 | 接下来 139 | 为止 140 | 实行 141 | 有一次 142 | 做好 143 | 等于 144 | 看成 145 | 在于 146 | 提到 147 | 无所 148 | 开来 149 | 过来 150 | 没想 151 | 想不到 152 | 看到 153 | 近乎 154 | 包括 155 | 不想 156 | 饱受 157 | 怎么办 158 | 同在 159 | 回去 160 | 不能 161 | 诸如 162 | 可以说 163 | 什么样 164 | 收有 165 | 出来 166 | 一身 167 | 不甘 168 | 进一步 169 | 留给 170 | 共同 171 | 听来 172 | 听起来 173 | 还要 174 | 不够 175 | 仅仅是 176 | 分成 177 | 带到 178 | 如下 179 | 面对 180 | 所有 181 | 方面 182 | 不失为 183 | 怎会 184 | 终于 185 | 看起来 186 | 不失 187 | 能为 188 | 谈及 189 | 以期 190 | 号称 191 | 取决于 192 | 无人 193 | 一行人 194 | 想得到 195 | 不愿 196 | 可以 197 | 来得 198 | 想来 199 | 起来 200 | 来讲 201 | 听得 202 | 所在 203 | 迫使 204 | 几经 205 | 只得 206 | 位于 207 | 不免 208 | 做出 209 | 听完 210 | 仅有 211 | 有的人 212 | 时候 213 | 本身 214 | 可看 215 | 来去 216 | 做成 217 | 不敢 218 | 出现 219 | 感到 220 | 面向 221 
| 分为 222 | 身为 223 | 本人 224 | 相处 225 | 这里 226 | 这种 227 | 当时 228 | 出去 229 | 仍是 230 | 遍及 231 | 引起 232 | 更具 233 | 来过 234 | 搞好 235 | 未有 236 | 显得 237 | 当成 238 | 即是 239 | 遭受 240 | 当上 241 | 做到 242 | 不如 243 | 纳入 244 | 不要 245 | 来说 246 | 不料 247 | 适合于 248 | 却是 249 | 变成 250 | 受到 251 | 之初 252 | 展开 253 | 向着 254 | 抓好 255 | 还是 256 | 上下 257 | 得出 258 | 宛如 259 | 皆有 260 | 跟着 261 | 予以 262 | 现有 263 | 哪能 264 | 一体 265 | 所得 266 | 有着 267 | 一块 268 | 开展 269 | 这个 270 | 这般 271 | 道来 272 | 推向 273 | 变为 274 | 一面 275 | 怎么一回事 276 | 直至 277 | 得到 278 | 从事 279 | 相关 280 | 归于 281 | 算是 282 | 带给 283 | 并用 284 | 不无 285 | 历尽 286 | 四处 287 | 不出 288 | 亦即 289 | 不已 290 | 引出 291 | 才是 292 | 利于 293 | 结成 294 | 一定 295 | 不下 296 | 此类 297 | 怎知 298 | 看着 299 | 情况 300 | 这么 301 | 看似 302 | 同样 303 | 想尽 304 | 带有 305 | 分开 306 | 对应 307 | 化成 308 | 直到 309 | 哪敢 310 | 不论是 311 | 看来 312 | 更是 313 | 是不是 314 | 后者 315 | 看作 316 | 得了 317 | 举行 318 | 叫做 319 | 除去 320 | 提供 321 | 结为 322 | 不到 323 | 不是 324 | 带着 325 | 说起来 326 | 可知 327 | 去到 328 | 所谓 329 | 说来 330 | 造成 331 | 怎样 332 | 请看 333 | 犹如 334 | 不乏 335 | 度过 336 | 化为 337 | 看完 338 | 既定 339 | 带来 340 | 以求 341 | 样子 342 | 提及 343 | 四起 344 | 属于 345 | 一开始 346 | 掀起 347 | 好比 348 | 那是 349 | 象是 350 | 亦可 351 | 处于 352 | 达成 353 | 可谓 354 | 还给 355 | 自身 356 | 看过 357 | 打下 358 | 作出 359 | 奉为 360 | 极具 361 | 看去 362 | 附近 363 | 还有 364 | 比较 365 | 达到 366 | 列入 367 | 得以 368 | 成为 369 | 哪里 370 | 限于 371 | 此处 372 | 不应 373 | 将要 374 | 勾起 375 | 没人 376 | 哪知道 377 | 充满 378 | 多方面 379 | 有可能 380 | 一样 381 | 称为 382 | 一行 383 | 怎么说 384 | 别看 385 | 据说 386 | 自有 387 | 使出 388 | 早在 389 | 作为 390 | 实为 391 | 只能 392 | 一道 393 | 便是 394 | 到了 395 | 没想到 396 | 当作 397 | 争取 398 | 之余 399 | 用于 400 | 围绕 401 | 为什么 402 | 做得 403 | 这次 404 | 何在 405 | 原是 406 | 尽在 407 | 随着 408 | 没有 409 | 各方面 410 | 哪个 411 | 取得 412 | 相应 413 | 上来 414 | 称得上 415 | 更有 416 | 看尽 417 | 直指 418 | 看做 419 | 怎能 420 | 不会 421 | 充当 422 | 便于 423 | 促成 424 | 藉此 425 | 有必要 426 | 不休 427 | 处在 428 | 前来 429 | 用以 430 | 下来 431 | 表明 432 | 不怎么样 433 | 给予 434 | 如同 435 | 左右 436 | 列出 437 | 彷佛 438 | 该怎么办 439 | 或是 440 | 即可 
441 | 经过 442 | 受过 443 | 特别 444 | 只要 445 | 或者是 446 | 可能 447 | 形成 448 | 经受 449 | 东西 450 | 不住 451 | 至于 452 | 称之为 453 | 怎奈 454 | 看上去 455 | 上去 456 | 无法 457 | 快要 458 | 引来 459 | 进来 460 | 不止 461 | 采取 462 | 应有 463 | 有别于 464 | 前去 465 | 认为 466 | 列为 467 | 化作 468 | 这边 469 | 下去 470 | 此时 471 | 未能 472 | 听见 473 | 正是 474 | 想见 475 | 不得 476 | 会有 477 | 来自 478 | 上述 479 | 关乎 480 | 过上 481 | 用来 482 | 应当 483 | 应该说 484 | 整个 485 | 出自 486 | 一头 487 | 到来 488 | 竟是 489 | 论及 490 | 不容 491 | 怎料 492 | 为主 493 | 一系列 494 | 运用 495 | 本想 496 | 合乎 497 | 配有 498 | 进去 499 | 前者 500 | 不及 501 | 何谓 502 | 在内 503 | 引发 504 | 毫无 505 | 相当于 506 | 推出 507 | 例如 508 | 加上 509 | 同时 510 | 发生 511 | 及时 512 | 去过 513 | 相对于 514 | 来到 515 | 双方 516 | 不忍 517 | 依靠 518 | 想出 519 | 层面 520 | 当做 521 | 涉及 522 | 又是 523 | 遭到 524 | 就要 525 | 不只是 526 | 什么 527 | 有关 528 | 譬如 529 | 起到 530 | 不可 531 | 一如 532 | 或许是 533 | 听到 534 | 不说 535 | 广为 536 | 想到 537 | 有如 538 | 之类 539 | 感觉到 540 | 无关 541 | 不怕 542 | 极有 543 | 这么回事 544 | 身处 545 | 并存 546 | 此事 547 | 提起 548 | 而是 549 | 不少 550 | 一方 551 | 做法 552 | 不堪 553 | 一句话 554 | 也是 555 | 备受 556 | 特有 557 | 进行 558 | 至此 559 | 力图 560 | 发起 561 | 能够 562 | 相比 563 | 并不是 564 | 不行 565 | 必需 566 | 凭借 567 | 均为 568 | 不尽 569 | 实现 570 | 应该 571 | 此人 572 | 相反 573 | 出于 574 | 另有 575 | 感受到 576 | 怎么 577 | 使得 578 | 介入 579 | 带入 580 | 一方面 581 | 以为 582 | 至极 583 | 养成 584 | 多日 585 | 以前 586 | 日前 587 | 日子 588 | 前一天 589 | 时刻 590 | 大前 591 | 先前 592 | 目前 593 | 终日 594 | 当下 595 | 一会 596 | 现今 597 | 每月 598 | 半天 599 | 成天 600 | 今度 601 | 多时 602 | 个月 603 | 某日 604 | 几时 605 | 后来 606 | 一天 607 | 有的时候 608 | 当初 609 | 一个 610 | 那一天 611 | 近日 612 | 近年来 613 | 此后 614 | 以来 615 | 之后 616 | 而今 617 | 一刻 618 | 时年 619 | 以往 620 | 历年来 621 | 从前 622 | 每天 623 | 当天 624 | 十数年 625 | 眼下 626 | 现时 627 | 其时 628 | 一会儿 629 | 忽然间 630 | 当前 631 | 多年来 632 | 其后 633 | 一晚 634 | 这时候 635 | 原初 636 | 现下 637 | 某天 638 | 此刻 639 | 不久前 640 | 多久 641 | 前夕 642 | 此前 643 | 每晚 644 | 现世 645 | 之前 646 | 前后 647 | 会儿 648 | 没多久 649 | 往日 650 | 同一天 651 | 尔后 652 | 早先 653 | 前一刻 654 | 如今 655 | 现如今 656 | 往后 657 | 每年 658 | 当年 659 
| 今后 660 | 转眼间 661 | 一时间 662 | 多年 663 | 顷刻间 664 | 起初 665 | 许久 666 | 起先 667 | 来年 668 | 在此之前 669 | 近来 670 | 是时 671 | 日日 672 | 稍后 673 | 往常 674 | 期间 675 | 晚近 676 | 数小时 677 | 以后 678 | 日后 679 | 已往 680 | 他日 681 | 先后 682 | 在此期间 683 | 不久 684 | 近年 685 | 时下 686 | 两年 687 | 前不久 688 | 哪一天 689 | 当今 690 | 很早以前 691 | 最近 692 | 早些 693 | 同年 694 | 万世 695 | 一日 696 | 二十年 697 | 近些年 698 | 这一刻 699 | 彼时 700 | 于今 701 | 这些年 702 | 每日 703 | 往年 704 | 一时 -------------------------------------------------------------------------------- /models/cws_label.txt: -------------------------------------------------------------------------------- 1 | 0 2 | 2 3 | 3 4 | 1 5 | -------------------------------------------------------------------------------- /models/idiom_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/idiom_dat.bin -------------------------------------------------------------------------------- /models/model_c_label.txt: -------------------------------------------------------------------------------- 1 | 0v 2 | 2v 3 | 3p 4 | 0n 5 | 2n 6 | 3v 7 | 1n 8 | 3w 9 | 0ns 10 | 1ns 11 | 2ns 12 | 0t 13 | 1t 14 | 2t 15 | 0f 16 | 2f 17 | 0d 18 | 2d 19 | 3f 20 | 3u 21 | 1v 22 | 0m 23 | 1m 24 | 2m 25 | 0q 26 | 2q 27 | 0r 28 | 2r 29 | 0j 30 | 1j 31 | 2j 32 | 0s 33 | 2s 34 | 3a 35 | 3c 36 | 3g 37 | 3m 38 | 3q 39 | 3d 40 | 3n 41 | 0a 42 | 2a 43 | 0id 44 | 1id 45 | 2id 46 | 3r 47 | 0ni 48 | 1ni 49 | 2ni 50 | 0p 51 | 2p 52 | 0c 53 | 1c 54 | 2c 55 | 0np 56 | 1np 57 | 2np 58 | 3j 59 | 1d 60 | 3np 61 | 1a 62 | 3x 63 | 0nz 64 | 2nz 65 | 1nz 66 | 0w 67 | 1w 68 | 2w 69 | 0u 70 | 2u 71 | 1q 72 | 1s 73 | 3k 74 | 1f 75 | 3o 76 | 0o 77 | 2o 78 | 1r 79 | 0x 80 | 1x 81 | 2x 82 | 3e 83 | 3h 84 | 3t 85 | 1o 86 | 1p 87 | 0e 88 | 1e 89 | 2e 90 | 3ni 91 | 3s 92 | 3nz 93 | 1u 94 | 0k 95 | 1k 96 | 2k 97 | -------------------------------------------------------------------------------- 
/models/ns_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/ns_dat.bin -------------------------------------------------------------------------------- /models/stop_dat.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/stop_dat.bin -------------------------------------------------------------------------------- /models/t2s.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yizhiru/thulac4j/a9d5b8405e71ff900be08f36b876c225daec63c7/models/t2s.dat -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | org.sonatype.oss 6 | oss-parent 7 | 7 8 | 9 | 10 | io.github.yizhiru 11 | thulac4j 12 | 3.1.2 13 | jar 14 | 15 | thulac4j 16 | https://github.com/yizhiru/thulac4j 17 | Java implementation of THULAC. 
18 | 19 | 20 | UTF-8 21 | 4.13.1 22 | 1.7.3 23 | 24 | 25 | 26 | 27 | The Apache Software License, Version 2.0 28 | http://www.apache.org/licenses/LICENSE-2.0.txt 29 | repo 30 | 31 | 32 | 33 | 34 | 35 | yizhiru 36 | Zheng Jiangyu 37 | j.y.zheng@qq.com 38 | 39 | 40 | 41 | scm:git:git@github.com:yizhiru/thulac4j.git 42 | scm:git:git@github.com:yizhiru/thulac4j.git 43 | git@github.com:yizhiru/thulac4j.git 44 | 45 | 46 | 47 | 48 | junit 49 | junit 50 | ${junit.version} 51 | test 52 | 53 | 54 | org.powermock 55 | powermock-module-junit4 56 | ${powermock.version} 57 | test 58 | 59 | 60 | org.powermock 61 | powermock-api-easymock 62 | ${powermock.version} 63 | test 64 | 65 | 66 | 67 | 68 | src/main/java 69 | src/test/java 70 | 71 | 72 | ./ 73 | 74 | models/*label.txt 75 | models/cws* 76 | models/*dat.bin 77 | models/t2s.dat 78 | dicts/core_char.dict 79 | 80 | 81 | models/model_c_dat.bin 82 | 83 | 84 | 85 | 86 | 87 | 88 | org.apache.maven.plugins 89 | maven-compiler-plugin 90 | 3.1 91 | 92 | 1.8 93 | 1.8 94 | ${project.build.sourceEncoding} 95 | 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-surefire-plugin 100 | 2.12.4 101 | 102 | once 103 | -Dfile.encoding=UTF-8 104 | 105 | 106 | 107 | org.apache.maven.plugins 108 | maven-source-plugin 109 | 2.1.2 110 | 111 | 112 | package 113 | 114 | jar-no-fork 115 | 116 | 117 | 118 | 119 | 120 | models/ 121 | 122 | 123 | 124 | 125 | org.apache.maven.plugins 126 | maven-javadoc-plugin 127 | 2.9.1 128 | 129 | UTF-8 130 | UTF-8 131 | 132 | 133 | 134 | package 135 | 136 | jar 137 | 138 | 139 | 140 | 141 | 142 | org.apache.maven.plugins 143 | maven-gpg-plugin 144 | 145 | 146 | verify 147 | 148 | sign 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | release 160 | 161 | 162 | oss 163 | https://oss.sonatype.org/content/repositories/snapshots/ 164 | 165 | 166 | oss 167 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 168 | 169 | 170 | 171 | 172 | 173 | 
-------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/POSTagger.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | 5 | import java.io.FileInputStream; 6 | import java.io.IOException; 7 | import java.util.List; 8 | 9 | import static io.github.yizhiru.thulac4j.util.ModelPaths.POS_TAGGING_LABEL_PATH; 10 | 11 | /** 12 | * 中文词性标注. 13 | */ 14 | public class POSTagger extends SPChineseTokenizer { 15 | 16 | public POSTagger(String weightPath, String featurePath) throws IOException { 17 | super(new FileInputStream(weightPath), 18 | new FileInputStream(featurePath), 19 | POSTagger.class.getResourceAsStream(POS_TAGGING_LABEL_PATH)); 20 | } 21 | 22 | /** 23 | * 词性标注 24 | * 25 | * @param text 输入句子 26 | * @return 词与词性结对结果 27 | */ 28 | public List tagging(String text) { 29 | return tokenize(text); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/SPChineseTokenizer.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | import io.github.yizhiru.thulac4j.util.ModelPaths; 5 | import io.github.yizhiru.thulac4j.common.Nullable; 6 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronClassifier; 7 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel; 8 | import io.github.yizhiru.thulac4j.process.RuleAnnotator; 9 | import io.github.yizhiru.thulac4j.process.LexiconCementer; 10 | import io.github.yizhiru.thulac4j.process.SpecifiedWordCementer; 11 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 12 | import io.github.yizhiru.thulac4j.term.TokenItem; 13 | import io.github.yizhiru.thulac4j.util.ChineseUtils; 14 | 15 | 
import java.io.IOException; 16 | import java.io.InputStream; 17 | import java.util.ArrayList; 18 | import java.util.LinkedList; 19 | import java.util.List; 20 | 21 | import static io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel.PocMark.*; 22 | 23 | public class SPChineseTokenizer { 24 | 25 | /** 26 | * 结构感知器模型 27 | */ 28 | private StructuredPerceptronClassifier classifier; 29 | 30 | /** 31 | * 前向Label 二维数组 32 | */ 33 | protected int[][] previousTrans; 34 | 35 | /** 36 | * 地名 ns 词典黏结. 37 | */ 38 | public final LexiconCementer nsCementer; 39 | 40 | /** 41 | * 习语 idiom 词典黏结. 42 | */ 43 | public final LexiconCementer idiomCementer; 44 | 45 | /** 46 | * 自定义词典,可为null 47 | */ 48 | @Nullable 49 | protected LexiconCementer uwCementer = null; 50 | 51 | private static final class Config { 52 | 53 | /** 54 | * 是否开启黏结书名号内的词. 55 | */ 56 | private static boolean isEnableTileWord = false; 57 | 58 | /** 59 | * 是否开启停用词过滤 60 | */ 61 | private static boolean isEnableFilterStopWords = false; 62 | 63 | /** 64 | * 是否开启转简体中文 65 | */ 66 | private static boolean isEnableConvertToSimplifiedCHN = false; 67 | 68 | } 69 | 70 | SPChineseTokenizer(InputStream weightInput, InputStream featureInput, InputStream labelInput) { 71 | try { 72 | this.classifier = new StructuredPerceptronClassifier( 73 | new StructuredPerceptronModel(weightInput, featureInput, labelInput)); 74 | this.nsCementer = new LexiconCementer( 75 | this.getClass().getResourceAsStream(ModelPaths.NS_BIN_PATH), "ns"); 76 | this.idiomCementer = new LexiconCementer( 77 | this.getClass().getResourceAsStream(ModelPaths.IDIOM_BIN_PATH), "i"); 78 | } catch (IOException e) { 79 | throw new RuntimeException(e); 80 | } 81 | this.previousTrans = setPreviousTransitions(classifier.getLabelValues()); 82 | } 83 | 84 | /** 85 | * Label 前向转移图 86 | * 87 | * @param labelValues label值 88 | * @return 前向转移二维数组,每行表示该label的所有前向label 89 | */ 90 | private int[][] setPreviousTransitions(String[] labelValues) { 91 | int labelSize = 
labelValues.length; 92 | List> labelTransitions = new ArrayList<>(); 93 | for (int i = 0; i < labelSize; i++) { 94 | labelTransitions.add(new LinkedList<>()); 95 | } 96 | for (int cur = 0; cur < labelSize; cur++) { 97 | for (int pre = 0; pre < labelSize; pre++) { 98 | String curString = labelValues[cur]; 99 | String preString = labelValues[pre]; 100 | char curPoc = curString.charAt(0); 101 | char prePoc = preString.charAt(0); 102 | // 如果有相同词性或者不带词性,按转移规则进行转移 103 | if (curString.substring(1).equals(preString.substring(1))) { 104 | // B 前面只能是E 或S 105 | if (curPoc == POS_B_CHAR) { 106 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 107 | labelTransitions.get(cur).add(pre); 108 | } 109 | } 110 | // M 前面只能是M 或 B 111 | else if (curPoc == POS_M_CHAR) { 112 | if (prePoc == POS_M_CHAR || prePoc == POS_B_CHAR) { 113 | labelTransitions.get(cur).add(pre); 114 | } 115 | } 116 | // E 前面只能是B 或 M 117 | else if (curPoc == POS_E_CHAR) { 118 | if (prePoc == POS_B_CHAR || prePoc == POS_M_CHAR) { 119 | labelTransitions.get(cur).add(pre); 120 | } 121 | } 122 | // S 前面只能是E 或 S 123 | else if (curPoc == POS_S_CHAR) { 124 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 125 | labelTransitions.get(cur).add(pre); 126 | } 127 | } 128 | } 129 | // 如果带有词性并且前后词性不相同,那么则按规则 130 | // B 前面只能是E 或S,S 前面只能是E 或S 进行转移 131 | else if (curString.length() > 1) { 132 | if (curPoc == POS_B_CHAR || curPoc == POS_S_CHAR) { 133 | if (prePoc == POS_E_CHAR || prePoc == POS_S_CHAR) { 134 | labelTransitions.get(cur).add(pre); 135 | } 136 | } 137 | } 138 | } 139 | } 140 | // 将List 转成二维数组 141 | int[][] previousTrans = new int[labelSize][]; 142 | for (int i = 0; i < labelSize; i++) { 143 | previousTrans[i] = new int[labelTransitions.get(i).size()]; 144 | for (int j = 0; j < labelTransitions.get(i).size(); j++) { 145 | previousTrans[i][j] = labelTransitions.get(i).get(j); 146 | } 147 | } 148 | return previousTrans; 149 | } 150 | 151 | /** 152 | * 序列标注分词 153 | * 154 | * @param text 输入文本 155 | * @return 序列标注结果 
156 | */ 157 | public List tokenize(String text) { 158 | List tokenItems = new ArrayList<>(); 159 | if (text.length() == 0) { 160 | return tokenItems; 161 | } 162 | 163 | AnnotatedTerms annotatedTerms; 164 | // 若开启转简体 165 | if (Config.isEnableConvertToSimplifiedCHN) { 166 | String simplifiedSentence = ChineseUtils.simplified(text); 167 | annotatedTerms = RuleAnnotator.annotate(simplifiedSentence, Config.isEnableTileWord); 168 | } else { 169 | annotatedTerms = RuleAnnotator.annotate(text, Config.isEnableTileWord); 170 | } 171 | if (annotatedTerms.isEmpty()) { 172 | return tokenItems; 173 | } 174 | 175 | int[] labels = classifier.classify(annotatedTerms, previousTrans); 176 | 177 | char[] rawChars = annotatedTerms.getPreAnnotateChars(); 178 | String[] labelValues = classifier.getLabelValues(); 179 | for (int i = 0, offset = 0; i < rawChars.length; i++) { 180 | String label = labelValues[labels[i]]; 181 | char pocChar = label.charAt(0); 182 | if (pocChar == POS_E_CHAR || pocChar == POS_S_CHAR) { 183 | String word = new String(rawChars, offset, i + 1 - offset); 184 | if (label.length() >= 2) { 185 | tokenItems.add(new TokenItem(word, label.substring(1))); 186 | } else { 187 | tokenItems.add(new TokenItem(word, null)); 188 | } 189 | offset = i + 1; 190 | } 191 | } 192 | // 若开启停用词过滤 193 | if (Config.isEnableFilterStopWords) { 194 | filterStopWords(tokenItems); 195 | } 196 | // 地名词典黏结 197 | nsCementer.cement(tokenItems); 198 | // 习语词典黏结 199 | idiomCementer.cement(tokenItems); 200 | // 特定词语黏结 201 | SpecifiedWordCementer.cementWord(tokenItems); 202 | if (uwCementer != null) { 203 | uwCementer.cement(tokenItems); 204 | } 205 | return tokenItems; 206 | } 207 | 208 | /** 209 | * 添加自定义词典 210 | * 211 | * @param words 词典 212 | */ 213 | public void addUserWords(List words) { 214 | DoubleArrayTrie dat = DoubleArrayTrie.make(words); 215 | this.uwCementer = new LexiconCementer(dat, "uw"); 216 | } 217 | 218 | /** 219 | * 开启书名单独成词 220 | */ 221 | public void enableTitleWord() { 222 | 
Config.isEnableTileWord = true; 223 | } 224 | 225 | /** 226 | * 开启停用词过滤 227 | */ 228 | public void enableFilterStopWords() { 229 | Config.isEnableFilterStopWords = true; 230 | } 231 | 232 | /** 233 | * 开启转简写 234 | */ 235 | public void enableConvertToSimplifiedCHN() { 236 | Config.isEnableConvertToSimplifiedCHN = true; 237 | } 238 | 239 | /** 240 | * 过滤停用词 241 | * 242 | * @param tokenItems 解码结果 243 | */ 244 | private void filterStopWords(List tokenItems) { 245 | for (int i = 0; i < tokenItems.size(); ) { 246 | if (ChineseUtils.isStopWord(tokenItems.get(i).word)) { 247 | tokenItems.remove(i); 248 | } else { 249 | i++; 250 | } 251 | } 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/Segmenter.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import java.util.List; 4 | import java.util.stream.Collectors; 5 | 6 | import static io.github.yizhiru.thulac4j.util.ModelPaths.*; 7 | 8 | /** 9 | * 中文分词. 
10 | */ 11 | public final class Segmenter { 12 | 13 | private static final SPChineseTokenizer TOKENIZER = new SPChineseTokenizer( 14 | Segmenter.class.getResourceAsStream(SEGMENTER_WEIGHT_PATH), 15 | Segmenter.class.getResourceAsStream(SEGMENTER_FEATURE_PATH), 16 | Segmenter.class.getResourceAsStream(SEGMENTER_LABEL_PATH)); 17 | 18 | /** 19 | * 中文分词 20 | * 21 | * @param text 待分词文本 22 | * @return 分词结果 23 | */ 24 | public static List segment(String text) { 25 | return TOKENIZER.tokenize(text) 26 | .stream() 27 | .map(item -> (item.word)) 28 | .collect(Collectors.toList()); 29 | } 30 | 31 | /** 32 | * 添加自定义词典 33 | * 34 | * @param words 词典 35 | */ 36 | public static void addUserWords(List words) { 37 | TOKENIZER.addUserWords(words); 38 | } 39 | 40 | /** 41 | * 开启开启书名单独成词 42 | */ 43 | public static void enableTitleWord() { 44 | TOKENIZER.enableTitleWord(); 45 | } 46 | 47 | /** 48 | * 开启停用词过滤 49 | */ 50 | public static void enableFilterStopWords() { 51 | TOKENIZER.enableFilterStopWords(); 52 | } 53 | 54 | /** 55 | * 开启转简写 56 | */ 57 | public static void enableConvertToSimplifiedCHN() { 58 | TOKENIZER.enableConvertToSimplifiedCHN(); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/common/DoubleArrayTrie.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import io.github.yizhiru.thulac4j.util.IOUtils; 4 | 5 | import java.io.*; 6 | import java.nio.ByteBuffer; 7 | import java.nio.ByteOrder; 8 | import java.nio.IntBuffer; 9 | import java.nio.channels.FileChannel; 10 | import java.util.Arrays; 11 | import java.util.LinkedList; 12 | import java.util.List; 13 | import java.util.stream.Collectors; 14 | 15 | /** 16 | * Double Array Trie (DAT). 
17 | */ 18 | public class DoubleArrayTrie implements Serializable { 19 | 20 | private static final long serialVersionUID = 8713857561296693244L; 21 | 22 | public static final int MATCH_FAILURE_INDEX = -1; 23 | 24 | /** 25 | * Base array. 26 | */ 27 | protected int[] baseArray; 28 | 29 | /** 30 | * Check array. 31 | */ 32 | protected int[] checkArray; 33 | 34 | 35 | /** 36 | * The size of DAT. 37 | */ 38 | protected int size; 39 | 40 | public DoubleArrayTrie(int[] baseArray, int[] checkArray) { 41 | if (baseArray.length != checkArray.length) { 42 | throw new IllegalArgumentException(String.format("The getAnnotatedLength of base array %s != the getAnnotatedLength of check " + 43 | "array %s", baseArray.length, checkArray.length)); 44 | } 45 | this.baseArray = baseArray; 46 | this.checkArray = checkArray; 47 | size = baseArray.length; 48 | } 49 | 50 | public DoubleArrayTrie(int[] baseArray, int[] checkArray, int size) { 51 | this.baseArray = Arrays.copyOf(baseArray, size); 52 | this.checkArray = Arrays.copyOf(checkArray, size); 53 | this.size = size; 54 | } 55 | 56 | private DoubleArrayTrie() { 57 | } 58 | 59 | /** 60 | * The size of DAT. 61 | * 62 | * @return size 63 | */ 64 | public int size() { 65 | return size; 66 | } 67 | 68 | /** 69 | * Ensure the index is not out bound. 70 | * 71 | * @param index the index value. 72 | */ 73 | private void ensureValidIndex(int index) { 74 | if (index >= size()) { 75 | throw new RuntimeException(String.format("The index %s is out of bound [%s].", 76 | index, size())); 77 | } 78 | } 79 | 80 | /** 81 | * Get base value by its index. 82 | * 83 | * @param index the index of base array. 84 | * @return the base value. 85 | */ 86 | public int getBaseByIndex(int index) { 87 | ensureValidIndex(index); 88 | return baseArray[index]; 89 | } 90 | 91 | /** 92 | * Get check value by its index. 93 | * 94 | * @param index the index of check array. 95 | * @return the check value. 
96 | */ 97 | public int getCheckByIndex(int index) { 98 | ensureValidIndex(index); 99 | return checkArray[index]; 100 | } 101 | 102 | /** 103 | * 序列化. 104 | * 105 | * @param path 文件路径 106 | */ 107 | public void serialize(String path) throws IOException { 108 | FileChannel channel = new FileOutputStream(path).getChannel(); 109 | ByteBuffer byteBuffer = ByteBuffer.allocateDirect(4 * (2 * size() + 1)); 110 | IntBuffer intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 111 | .asIntBuffer(); 112 | intBuffer.put(size()); 113 | intBuffer.put(baseArray); 114 | intBuffer.put(checkArray); 115 | channel.write(byteBuffer); 116 | channel.close(); 117 | } 118 | 119 | /** 120 | * 加载序列化DAT模型 121 | * 122 | * @param path 文件目录 123 | * @return DAT模型 124 | */ 125 | public static DoubleArrayTrie loadDat(String path) throws IOException { 126 | return loadDat(new FileInputStream(path)); 127 | } 128 | 129 | /** 130 | * 加载序列化DAT模型 131 | * 132 | * @param inputStream 文件输入流 133 | * @return DAT模型 134 | */ 135 | public static DoubleArrayTrie loadDat(InputStream inputStream) { 136 | int[] array; 137 | try { 138 | array = IOUtils.toIntArray(inputStream); 139 | } catch (IOException e) { 140 | throw new RuntimeException(e); 141 | } 142 | int arrayLen = array[0]; 143 | int[] baseArr = Arrays.copyOfRange(array, 1, arrayLen + 1); 144 | int[] checkArr = Arrays.copyOfRange(array, arrayLen + 1, 2 * arrayLen + 1); 145 | return new DoubleArrayTrie(baseArr, checkArr); 146 | } 147 | 148 | /** 149 | * 按照DAT的转移方程进行转移: ROOT_PATH[r] + c = s, check[s] = r 150 | * 151 | * @param prefixIndex 前缀在DAT中的index 152 | * @param charValue 转移字符的int值 153 | * @return 在DAT中的index,若不在则为-1 154 | */ 155 | public int transition(int prefixIndex, int charValue) { 156 | if (prefixIndex < 0 || prefixIndex >= size()) { 157 | return MATCH_FAILURE_INDEX; 158 | } 159 | int index = baseArray[prefixIndex] + charValue; 160 | if (index >= size() || checkArray[index] != prefixIndex) { 161 | return MATCH_FAILURE_INDEX; 162 | } 163 | return 
index; 164 | } 165 | 166 | /** 167 | * 词是否在trie树中 168 | * 169 | * @param word 词 170 | * @return 若存在,则为true 171 | */ 172 | public boolean isWordMatched(String word) { 173 | return isWordMatched(-match(word)); 174 | } 175 | 176 | /** 177 | * 词是否在trie树中 178 | * 179 | * @param matchedIndex 已匹配上词前缀的index 180 | * @return 若存在,则为true 181 | */ 182 | public boolean isWordMatched(int matchedIndex) { 183 | if (matchedIndex <= 0) { 184 | return false; 185 | } 186 | int base = baseArray[matchedIndex]; 187 | return base < size() && checkArray[base] == matchedIndex; 188 | } 189 | 190 | /** 191 | * 前缀是否在trie树中 192 | * 193 | * @param prefix 前缀 194 | * @return 若存在,则为true 195 | */ 196 | public boolean isPrefixMatched(String prefix) { 197 | return match(prefix) < 0; 198 | } 199 | 200 | /** 201 | * 匹配字符串. 202 | * 203 | * @param str 字符串 204 | * @return 若匹配上,则为转移后index的负值;否则,则返回已匹配上的字符数 205 | */ 206 | protected int match(String str) { 207 | return match(0, str); 208 | } 209 | 210 | /** 211 | * 匹配字符串. 212 | * 213 | * @param startIndex DAT的开始index 214 | * @param str 字符串 215 | * @return 若匹配上,则为转移后index的负值;否则,则返回已匹配上的字符数 216 | */ 217 | public int match(int startIndex, String str) { 218 | int index = startIndex; 219 | for (int i = 0; i < str.length(); i++) { 220 | index = transition(index, str.charAt(i)); 221 | if (index == MATCH_FAILURE_INDEX) { 222 | return i; 223 | } 224 | } 225 | return -index; 226 | } 227 | 228 | private static class Builder extends DoubleArrayTrie { 229 | 230 | private static final long serialVersionUID = 1675990036852836829L; 231 | 232 | /** 233 | * 标记可用的base index值. 234 | */ 235 | private int availableBaseIndex; 236 | 237 | /** 238 | * Initial value. 239 | */ 240 | private static final int INITIAL_VALUE = -1; 241 | 242 | private Builder() { 243 | baseArray = new int[]{0}; 244 | checkArray = new int[]{INITIAL_VALUE}; 245 | size = 1; 246 | availableBaseIndex = 0; 247 | } 248 | 249 | /** 250 | * Expand two size. 
251 | */ 252 | private void expand() { 253 | int oldCapacity = size; 254 | int newCapacity = oldCapacity << 1; 255 | baseArray = Arrays.copyOf(baseArray, newCapacity); 256 | Arrays.fill(baseArray, oldCapacity, newCapacity, INITIAL_VALUE); 257 | checkArray = Arrays.copyOf(checkArray, newCapacity); 258 | Arrays.fill(checkArray, oldCapacity, newCapacity, INITIAL_VALUE); 259 | 260 | size = newCapacity; 261 | } 262 | 263 | /** 264 | * Remove useless base and check. 265 | */ 266 | private void shrink() { 267 | for (int i = checkArray.length - 1; i >= 0; i--) { 268 | if (checkArray[i] == INITIAL_VALUE) { 269 | size--; 270 | } else { 271 | break; 272 | } 273 | } 274 | } 275 | 276 | /** 277 | * 找到满足条件的baseIndex 278 | * 279 | * @param children 前缀的后一字符集合 280 | * @return baseIndex 281 | */ 282 | private int findBaseIndex(List children) { 283 | int cSize = children.size(); 284 | for (int bi = availableBaseIndex; ; bi++) { 285 | if (bi == size()) { 286 | expand(); 287 | } 288 | if (cSize > 0) { 289 | while (bi + children.get(cSize - 1) >= size()) { 290 | expand(); 291 | } 292 | } 293 | // baseIndex应满足条件: 294 | // 1. 未被使用 295 | // 2. 
满足所有children跳转到的node也未被使用 296 | if (checkArray[bi] >= 0) { 297 | continue; 298 | } 299 | boolean isValid = true; 300 | for (Integer c : children) { 301 | if (checkArray[bi + c] >= 0) { 302 | isValid = false; 303 | break; 304 | } 305 | } 306 | if (isValid) { 307 | return bi; 308 | } 309 | } 310 | } 311 | 312 | /** 313 | * 插入到Trie树 314 | * 315 | * @param prefixIndex 前缀对应的index 316 | * @param children 前缀的后一字符集合 317 | * @param isWord 前缀是否为词 318 | */ 319 | private void insert(int prefixIndex, List children, boolean isWord) { 320 | int bi = findBaseIndex(children); 321 | baseArray[prefixIndex] = bi; 322 | if (isWord) { 323 | checkArray[bi] = prefixIndex; 324 | availableBaseIndex = bi + 1; 325 | } 326 | for (int c : children) { 327 | baseArray[bi + c] = 0; 328 | checkArray[bi + c] = prefixIndex; 329 | } 330 | } 331 | 332 | /** 333 | * 给定前缀生成后一字符集合 334 | * 335 | * @param sortedLexicon 按字典序排序后的词典 336 | * @param startLexiconIndex 词典开始时的索引位置 337 | * @param prefix 前缀 338 | * @return 后一字符集合 339 | */ 340 | private List generateChildren(List sortedLexicon, 341 | int startLexiconIndex, 342 | String prefix) { 343 | List children = new LinkedList<>(); 344 | int prefixLen = prefix.length(); 345 | for (int i = startLexiconIndex; i < sortedLexicon.size(); i++) { 346 | String word = sortedLexicon.get(i); 347 | // 停止循环条件: 348 | // 1. 词的长度小于前缀长度 349 | // 2. 
词的前缀与给定前缀不一致 350 | if (word.length() < prefixLen 351 | || !word.substring(0, prefixLen).equals(prefix)) { 352 | return children; 353 | } else if (word.length() > prefixLen) { 354 | int charValue = (int) word.charAt(prefixLen); 355 | if (children.isEmpty() || charValue != children.get(children.size() - 1)) { 356 | children.add(charValue); 357 | } 358 | } 359 | } 360 | return children; 361 | } 362 | 363 | /** 364 | * 构建DAT 365 | * 366 | * @param lexicon 词典 367 | * @return 词典对应的DAT 368 | */ 369 | private DoubleArrayTrie build(List lexicon) { 370 | lexicon.sort(String::compareTo); 371 | String word, prefix; 372 | int preIndex; 373 | for (int i = 0; i < lexicon.size(); i++) { 374 | word = lexicon.get(i); 375 | int matched = match(word); 376 | matched = matched < 0 ? word.length() : matched; 377 | for (int j = matched; j <= word.length(); j++) { 378 | prefix = word.substring(0, j); 379 | preIndex = -match(prefix); 380 | List children = generateChildren(lexicon, i, prefix); 381 | insert(preIndex, children, j == word.length()); 382 | } 383 | matched = -match(word); 384 | baseArray[baseArray[matched]] = i; 385 | } 386 | shrink(); 387 | return new DoubleArrayTrie(baseArray, checkArray, size); 388 | } 389 | } 390 | 391 | /** 392 | * Make DAT. 393 | * 394 | * @param path file path. 395 | * @return DAT 396 | */ 397 | public static DoubleArrayTrie make(String path) throws FileNotFoundException { 398 | return make(new FileInputStream(path)); 399 | } 400 | 401 | /** 402 | * Make DAT. 
403 | * 404 | * @param inputStream input stream of file 405 | * @return DAT 406 | */ 407 | public static DoubleArrayTrie make(InputStream inputStream) { 408 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream)); 409 | List lexicon = br.lines() 410 | .map(String::trim) 411 | .collect(Collectors.toList()); 412 | return make(lexicon); 413 | } 414 | 415 | public static DoubleArrayTrie make(List lexicon) { 416 | return new Builder().build(lexicon); 417 | } 418 | 419 | /** 420 | * 从DAT 还原成词典. 421 | * 422 | * @param dat DAT 423 | */ 424 | public static List restore(DoubleArrayTrie dat) { 425 | String word; 426 | LinkedList list = new LinkedList<>(); 427 | for (int i = 0; i < dat.size(); i++) { 428 | if (dat.getCheckByIndex(i) >= 0) { 429 | word = restoreWord(dat, i); 430 | if (dat.isWordMatched(word)) { 431 | list.add(word); 432 | } 433 | } 434 | } 435 | return list; 436 | } 437 | 438 | /** 439 | * Restore word by its last index. 440 | * 441 | * @param dat Double Array Trie 442 | * @param index the last index of word, i.e. 
its check >= 0 443 | * @return word 444 | */ 445 | private static String restoreWord(DoubleArrayTrie dat, int index) { 446 | int pre; 447 | int cur = index; 448 | StringBuilder sb = new StringBuilder(); 449 | while (cur > 0 && cur < dat.size()) { 450 | pre = dat.getCheckByIndex(cur); 451 | if (pre == cur || dat.getBaseByIndex(pre) >= cur) { 452 | break; 453 | } 454 | sb.insert(0, (char) (cur - dat.getBaseByIndex(pre))); 455 | cur = pre; 456 | } 457 | return sb.toString(); 458 | } 459 | } 460 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/common/Nullable.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import java.lang.annotation.Documented; 4 | import java.lang.annotation.ElementType; 5 | import java.lang.annotation.Retention; 6 | import java.lang.annotation.RetentionPolicy; 7 | import java.lang.annotation.Target; 8 | 9 | /** 10 | * Declares that null is a valid value for a Java type. May be applied to parameters, 11 | * fields and methods (to declare the return type). 12 | */ 13 | @Retention(RetentionPolicy.RUNTIME) 14 | @Target({ElementType.PARAMETER, ElementType.METHOD, ElementType.FIELD}) 15 | @Documented 16 | public @interface Nullable { 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronClassifier.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.term.POC; 4 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 5 | 6 | 7 | public final class StructuredPerceptronClassifier { 8 | 9 | /** 10 | * Null previous label. 11 | */ 12 | private static final int NULL_PREVIOUS_LABEL = -5; 13 | 14 | /** 15 | * Initial score. 
16 | */ 17 | private static final int INITIAL_SCORE = 0; 18 | 19 | /** 20 | * Initial previous label. 21 | */ 22 | private static final int INITIAL_PREVIOUS_LABEL = -1; 23 | 24 | /** 25 | * SP 模型. 26 | */ 27 | private StructuredPerceptronModel model; 28 | 29 | 30 | public StructuredPerceptronClassifier(StructuredPerceptronModel model) { 31 | this.model = model; 32 | } 33 | 34 | /** 35 | * 解码路径节点 36 | */ 37 | private static class PathNode { 38 | /** 39 | * Score. 40 | */ 41 | private int score; 42 | 43 | /** 44 | * Previous Label. 45 | */ 46 | private int previousLabel; 47 | 48 | public PathNode() { 49 | score = INITIAL_SCORE; 50 | previousLabel = NULL_PREVIOUS_LABEL; 51 | } 52 | 53 | @Override 54 | public String toString() { 55 | return score + ", " + previousLabel; 56 | } 57 | } 58 | 59 | /** 60 | * 结构感知器分类,采用Viterbi算法解码 61 | * 62 | * @param annotatedTerms 规则处理后的句子Label 类 63 | * @param previousTransition 前向转移label 64 | * @return 最优路径对应的label索引值 65 | */ 66 | public int[] classify( 67 | AnnotatedTerms annotatedTerms, 68 | int[][] previousTransition) { 69 | int len = annotatedTerms.getAnnotatedLength(); 70 | // 最优路径对应的label 71 | int[] bestPath = new int[len]; 72 | int labelSize = model.labelSize; 73 | int optimalLastScore = Integer.MIN_VALUE; 74 | int optimalLastLabel = 2; 75 | PathNode node; 76 | // 记录在位置i时类别为y的最优路径 77 | // [current index][current Label] -> PathNode(score, previousLabel) 78 | PathNode[][] pathTabular = new PathNode[len][]; 79 | for (int i = 0; i < len; i++) { 80 | pathTabular[i] = new PathNode[labelSize]; 81 | for (int j = 0; j < labelSize; j++) { 82 | pathTabular[i][j] = new PathNode(); 83 | } 84 | } 85 | 86 | char[] chars = annotatedTerms.appendBoundaryAround(); 87 | POC[] pocs = annotatedTerms.getPocs(); 88 | 89 | // DP求解 90 | for (int i = 0; i < len; i++) { 91 | int[] labelIndices = model.allowTabular[pocs[i].ordinal()]; 92 | int[] weights = model.evaluateCharWeights( 93 | chars[i], 94 | chars[i + 1], 95 | chars[i + 2], 96 | chars[i + 3], 97 | 
chars[i + 4], 98 | labelIndices); 99 | for (int labelIndex : labelIndices) { 100 | node = pathTabular[i][labelIndex]; 101 | if (i == 0) { 102 | node.previousLabel = INITIAL_PREVIOUS_LABEL; 103 | } else { 104 | int[] preLabels = previousTransition[labelIndex]; 105 | for (int pre : preLabels) { 106 | if (pathTabular[i - 1][pre].previousLabel == NULL_PREVIOUS_LABEL) { 107 | continue; 108 | } 109 | int score = pathTabular[i - 1][pre].score 110 | + model.llWeights[pre * model.labelSize + labelIndex]; 111 | if (node.previousLabel == NULL_PREVIOUS_LABEL || score > node.score) { 112 | node.score = score; 113 | node.previousLabel = pre; 114 | } 115 | } 116 | } 117 | node.score += weights[labelIndex]; 118 | if (i == len - 1 && optimalLastScore < node.score) { 119 | optimalLastScore = node.score; 120 | optimalLastLabel = labelIndex; 121 | } 122 | } 123 | } 124 | // 尾节点的最优label 125 | node = pathTabular[len - 1][optimalLastLabel]; 126 | bestPath[len - 1] = optimalLastLabel; 127 | // 回溯最优路径,保留label到数组 128 | for (int i = len - 2; i >= 0; i--) { 129 | bestPath[i] = node.previousLabel; 130 | node = pathTabular[i][node.previousLabel]; 131 | } 132 | return bestPath; 133 | } 134 | 135 | /** 136 | * 得到所有label. 
137 | */ 138 | public String[] getLabelValues() { 139 | return model.labelValues; 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronModel.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | import io.github.yizhiru.thulac4j.util.IOUtils; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.io.Serializable; 9 | import java.nio.ByteBuffer; 10 | import java.nio.ByteOrder; 11 | import java.nio.IntBuffer; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static io.github.yizhiru.thulac4j.common.DoubleArrayTrie.MATCH_FAILURE_INDEX; 16 | import static io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronModel.NGramFeature.*; 17 | 18 | /** 19 | * 结构感知器模型. 20 | */ 21 | public final class StructuredPerceptronModel implements Serializable { 22 | 23 | private static final long serialVersionUID = -5324153272692800765L; 24 | 25 | /** 26 | * Label数量. 27 | */ 28 | public final int labelSize; 29 | 30 | /** 31 | * 特征数量. 32 | */ 33 | public final int featureSize; 34 | 35 | /** 36 | * label转移到label的权重. 37 | */ 38 | public final int[] llWeights; 39 | 40 | /** 41 | * 特征对应某label的权重. 42 | * Why use one-dimensional array but not two-dimensional array? Please Refer to 43 | * 44 | * https://stackoverflow.com/questions/2512082/java-multi-dimensional-array-vs-one-dimensional 45 | * 46 | */ 47 | public final int[] flWeights; 48 | 49 | /** 50 | * Feature DAT 51 | */ 52 | private final DoubleArrayTrie featureDat; 53 | 54 | /** 55 | * Label, 对应于cws_label.txt 或者 model_c_label.txt 56 | */ 57 | public final String[] labelValues; 58 | 59 | /** 60 | * 映射 enum POC 对应的所有label 索引值. 
61 | * 其中,行为POC的ordinal值,列为索引值 62 | */ 63 | public final int[][] allowTabular; 64 | 65 | /** 66 | * 加载训练模型 67 | * 68 | * @param weightInput label转移权重、特征label权重, cws_model.bin 69 | * * 或者model_c_model.bin 70 | * @param featureInput 特征DAT cws_dat.bin 或者 model_c_dat.bin 71 | * @param labelInput label 72 | * @throws IOException if an I/O error occurs 73 | */ 74 | public StructuredPerceptronModel(InputStream weightInput, InputStream featureInput, InputStream labelInput) throws IOException { 75 | // Load weights model 76 | ByteBuffer byteBuffer = ByteBuffer.wrap(IOUtils.toByteArray(weightInput)); 77 | IntBuffer intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 78 | .asIntBuffer(); 79 | labelSize = intBuffer.get(); 80 | featureSize = intBuffer.get(); 81 | llWeights = new int[labelSize * labelSize]; 82 | flWeights = new int[featureSize * labelSize]; 83 | intBuffer.get(llWeights); 84 | intBuffer.get(flWeights); 85 | 86 | // Load feature DAT 87 | byteBuffer = ByteBuffer.wrap(IOUtils.toByteArray(featureInput)); 88 | // int类型占4个字节 89 | int arrayLen = byteBuffer.remaining() / 4; 90 | int[] featureArray = new int[arrayLen]; 91 | intBuffer = byteBuffer.order(ByteOrder.LITTLE_ENDIAN) 92 | .asIntBuffer(); 93 | intBuffer.get(featureArray); 94 | // convert feature DAT 95 | int[] baseArr = new int[arrayLen / 2]; 96 | int[] checkArr = new int[arrayLen / 2]; 97 | for (int i = 0; i < arrayLen / 2; i++) { 98 | baseArr[i] = featureArray[2 * i]; 99 | checkArr[i] = featureArray[2 * i + 1]; 100 | } 101 | featureDat = new DoubleArrayTrie(baseArr, checkArr); 102 | 103 | List labelList = IOUtils.readLines(labelInput); 104 | labelValues = new String[labelList.size()]; 105 | labelList.toArray(labelValues); 106 | 107 | // 记录label 集合,能与allowTabular 映射起来 108 | List> posTags = getPosTags(); 109 | 110 | // allowTabular 表示enum POC 对应的所有允许label,比如: 111 | // PUNCTUATION_POC 对应的允许label为 3 或 3w, 112 | // BEGIN_POC 对应的允许label为 0 或 0打头的label 113 | allowTabular = new int[12][]; 114 | for (int i = 0; i < 
labelValues.length; i++) { 115 | // punctuation 116 | if ("3".equals(labelValues[i]) || "3w".equals(labelValues[i])) { 117 | allowTabular[0] = new int[]{i}; 118 | } 119 | // single of numeral 120 | if ("3".equals(labelValues[i]) || "3m".equals(labelValues[i])) { 121 | allowTabular[4] = new int[]{i}; 122 | } 123 | // begin of numeral 124 | else if ("0".equals(labelValues[i]) || "0m".equals(labelValues[i])) { 125 | allowTabular[1] = new int[]{i}; 126 | } 127 | // middle of numeral 128 | else if ("1".equals(labelValues[i]) || "1m".equals(labelValues[i])) { 129 | allowTabular[2] = new int[]{i}; 130 | } 131 | // end of numeral 132 | else if ("2".equals(labelValues[i]) || "2m".equals(labelValues[i])) { 133 | allowTabular[3] = new int[]{i}; 134 | } 135 | } 136 | int[] indices = {1, 2, 4, 8, 9, 12, 15}; 137 | for (int i = 0; i < indices.length; i++) { 138 | allowTabular[i + 5] = posTags.get(indices[i]) 139 | .stream() 140 | .mapToInt(x -> x) 141 | .toArray(); 142 | } 143 | } 144 | 145 | /** 146 | * 计算所有可能label 索引值集合,以二维数组表示 147 | * 148 | * @return 索引值二维数组 149 | */ 150 | private List> getPosTags() { 151 | List> posTagsList = new ArrayList<>(); 152 | int defaultSize = 16; 153 | for (int i = 0; i < defaultSize; i++) { 154 | posTagsList.add(new ArrayList<>()); 155 | } 156 | for (int i = 0; i < labelValues.length; i++) { 157 | int segIndex = labelValues[i].charAt(0) - '0'; 158 | for (int j = 0; j < defaultSize; j++) { 159 | if (((1 << segIndex) & j) != 0) { 160 | posTagsList.get(j).add(i); 161 | } 162 | } 163 | } 164 | return posTagsList; 165 | } 166 | 167 | /** 168 | * 训练模型文件中POC对应的标识. 169 | */ 170 | public static final class PocMark { 171 | /** 172 | * 对应于 POC B 的char. 173 | */ 174 | public static final Character POS_B_CHAR = '0'; 175 | 176 | /** 177 | * 对应于 POC M 的char. 178 | */ 179 | public static final Character POS_M_CHAR = '1'; 180 | 181 | /** 182 | * 对应于 POC E 的char. 183 | */ 184 | public static final Character POS_E_CHAR = '2'; 185 | 186 | /** 187 | * 对应于 POC B 的char. 
188 | */ 189 | public static final Character POS_S_CHAR = '3'; 190 | } 191 | 192 | /** 193 | * N-gram 特征. 194 | * THULAC采用的分词模型为结构化感知器(Structured Perceptron, SP),以最大熵准则 195 | * 建模序列标注的得分函数. 196 | */ 197 | public static class NGramFeature { 198 | 199 | /** 200 | * 超越边界的统一字符'#' 201 | */ 202 | public static final char BOUNDARY = 65283; 203 | 204 | /** 205 | * feature的一部分 206 | */ 207 | public static final char SPACE = ' '; 208 | 209 | /** 210 | * Unigram 特征种类1,对应于特征 mid + SPACE + '1',即标注对应的当前字符 211 | */ 212 | public static final char UNIGRAM_FEATURE_1 = '1'; 213 | 214 | /** 215 | * Unigram 特征种类2,对应于特征 left + SPACE + '2',即标注的前一字符 216 | */ 217 | public static final char UNIGRAM_FEATURE_2 = '2'; 218 | 219 | /** 220 | * Unigram 特征种类3,对应于特征 right + SPACE + '3',即标注的后一字符 221 | */ 222 | public static final char UNIGRAM_FEATURE_3 = '3'; 223 | 224 | /** 225 | * Bigram 特征种类1,对应于特征 left + mid + SPACE + '1', 226 | * 即标注的前一字符加上当前字符 227 | */ 228 | public static final char BIGRAM_FEATURE_1 = '1'; 229 | 230 | /** 231 | * Bigram 特征种类2,对应于特征 mid + right + SPACE + '2', 232 | * 即标注对应的当前字符加上后一字符 233 | */ 234 | public static final char BIGRAM_FEATURE_2 = '2'; 235 | 236 | /** 237 | * Bigram 特征种类3,对应于特征 left2 + left1 + SPACE + '3', 238 | * 即标注的前二字符加上前一字符 239 | */ 240 | public static final char BIGRAM_FEATURE_3 = '3'; 241 | 242 | /** 243 | * Bigram 特征种类4,对应于特征 right + right2 + SPACE + '4', 244 | * 即标注的后一字符加上后二字符. 245 | */ 246 | public static final char BIGRAM_FEATURE_4 = '4'; 247 | } 248 | 249 | /** 250 | * 寻找Unigram特征对应于DAT中的base. 
    /**
     * Looks up the DAT base value of a unigram feature.
     *
     * @param ch   the feature character
     * @param mark feature-kind marker, one of '1', '2', '3'
     *             (current / previous / next character feature)
     * @return the base value if the feature exists, otherwise -1 (MATCH_FAILURE_INDEX)
     */
    private int findUnigramFeat(char ch, char mark) {
        int index = (int) ch;
        // Feature string is ch + SPACE + mark; walk the trie one char at a time.
        // NOTE(review): failure is only tested after the last transition —
        // presumably transition() propagates MATCH_FAILURE_INDEX; confirm in DoubleArrayTrie.
        index = featureDat.transition(index, SPACE);
        index = featureDat.transition(index, mark);
        if (index == MATCH_FAILURE_INDEX) {
            return MATCH_FAILURE_INDEX;
        }
        return featureDat.getBaseByIndex(index);
    }

    /**
     * Looks up the DAT base value of a bigram feature.
     *
     * @param c1   first character
     * @param c2   second character
     * @param mark feature-kind marker, one of '1', '2', '3', '4'
     * @return the base value if the feature exists, otherwise -1 (MATCH_FAILURE_INDEX)
     */
    private int findBigramFeat(char c1, char c2, char mark) {
        int index1 = (int) c1;
        int index2 = (int) c2;
        // Feature string is c1 + c2 + SPACE + mark.
        int index = featureDat.transition(index1, index2);
        index = featureDat.transition(index, SPACE);
        index = featureDat.transition(index, mark);
        if (index == MATCH_FAILURE_INDEX) {
            return MATCH_FAILURE_INDEX;
        }
        return featureDat.getBaseByIndex(index);
    }

    /**
     * Accumulates feature weights into the per-label weight array,
     * restricted to the allowed label indices.
     *
     * @param weights      per-label weight accumulator (mutated in place)
     * @param base         the feature's DAT base value
     * @param labelIndices allowed label indices
     */
    private void addWeights(int[] weights, int base, int[] labelIndices) {
        // flWeights is laid out as [feature base][label]: row stride == labelSize
        int offset = base * labelSize;
        for (int i : labelIndices) {
            weights[i] += flWeights[offset + i];
        }
    }

    /**
     * Computes the per-label summed feature weights for the current character,
     * using a 5-character window (two before, two after).
     *
     * @param left2        second character to the left
     * @param left1        character to the left
     * @param mid          current character
     * @param right1       character to the right
     * @param right2       second character to the right
     * @param labelIndices allowed label indices for this position
     * @return array of length labelSize with the summed weights per label
     */
    public int[] evaluateCharWeights(
            char left2,
            char left1,
            char mid,
            char right1,
            char right2,
            int[] labelIndices) {
        int[] weights = new int[labelSize];
        int base;
        // Three unigram features and four bigram features; each found feature
        // contributes its row of flWeights for the allowed labels.
        if ((base = findUnigramFeat(mid, UNIGRAM_FEATURE_1)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findUnigramFeat(left1, UNIGRAM_FEATURE_2)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findUnigramFeat(right1, UNIGRAM_FEATURE_3)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(left1, mid, BIGRAM_FEATURE_1)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(mid, right1, BIGRAM_FEATURE_2)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(left2, left1, BIGRAM_FEATURE_3)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        if ((base = findBigramFeat(right1, right2, BIGRAM_FEATURE_4)) != MATCH_FAILURE_INDEX) {
            addWeights(weights, base, labelIndices);
        }
        return weights;
    }
/**
 * Cements (glues) consecutive tokens into a single word when their
 * concatenation matches an entry of a lexicon stored as a double-array trie.
 */
public final class LexiconCementer implements Serializable {

    private static final long serialVersionUID = 5479588292425956277L;

    /**
     * The lexicon as a double-array trie.
     */
    private final DoubleArrayTrie dat;

    /**
     * Part-of-speech tag assigned to every cemented word.
     */
    private final String pos;

    /**
     * Loads a serialized DAT model.
     *
     * @param inputStream input stream of the serialized DAT
     * @param pos         part-of-speech tag for cemented words
     * @throws IOException if the stream cannot be read
     */
    public LexiconCementer(InputStream inputStream, String pos) throws IOException {
        dat = DoubleArrayTrie.loadDat(inputStream);
        this.pos = pos;
    }

    /**
     * Constructs from an already-built DAT.
     *
     * @param dat the lexicon DAT
     * @param pos part-of-speech tag for cemented words
     */
    public LexiconCementer(DoubleArrayTrie dat, String pos) {
        this.dat = dat;
        this.pos = pos;
    }

    /**
     * Scans the token list left to right and merges maximal runs of tokens
     * whose concatenation is a complete lexicon word; the merged token gets
     * the {@link #pos} tag. The list is modified in place.
     *
     * @param tokenItems intermediate segmentation result
     */
    public void cement(List<TokenItem> tokenItems) {
        int index;
        int j;
        for (int i = 0; i < tokenItems.size(); i++) {
            // NOTE(review): match() appears to return a negated trie index on a
            // successful prefix match, hence the negation here and below —
            // confirm against DoubleArrayTrie.match.
            index = -dat.match(0, tokenItems.get(i).word);
            if (index <= 0) {
                continue;
            }
            StringBuilder builder = new StringBuilder(tokenItems.get(i).word);
            for (j = i + 1; j < tokenItems.size(); j++) {
                int preIndex = index;
                index = -dat.match(index, tokenItems.get(j).word);
                // the following token does not extend the lexicon match;
                // fall back to the last matching state
                if (index <= 0) {
                    index = preIndex;
                    break;
                }
                builder.append(tokenItems.get(j).word);
            }
            // only merge if the accumulated prefix is a complete lexicon word
            String word = builder.toString();
            if (dat.isWordMatched(index)) {
                tokenItems.set(i, new TokenItem(word, pos));
                // remove the absorbed tokens, from right to left so the
                // indices of not-yet-removed tokens stay valid
                for (j = j - 1; j > i; j--) {
                    tokenItems.remove(j);
                }
            }
        }
    }
}
    /**
     * Pre-annotates the possible POC of each character using punctuation,
     * numeral, letter and whitespace rules, narrowing the label search space
     * for the perceptron decoder.
     *
     * @param text             the text to segment
     * @param isEnableTileWord whether to treat text inside 《...》 title
     *                         quotation marks as a single word
     *                         (NOTE(review): parameter name looks like a typo
     *                         for "TitleWord" but is part of the public API)
     * @return the annotation result (cleaned chars plus candidate POCs)
     */
    public static AnnotatedTerms annotate(String text, boolean isEnableTileWord) {
        int len = text.length();
        AnnotatedTerms annotatedTerms = new AnnotatedTerms(text.toCharArray());
        boolean hasTitleBegin = false;
        int titleBegin = 0;
        for (int i = 0; i < len; ) {
            CharType charType = annotatedTerms.getCharTypeByIndex(i);
            // 1. Space or control character: skipped entirely, but it forces the
            //    previous char to end a word and the next char to begin one.
            if (charType == CharType.SPACE_OR_CONTROL_CHAR) {
                annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
                // consume the whole run of ignorable characters
                for (i++; i < len; i++) {
                    if (annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
                        break;
                    }
                }
                // constrain the following character, if any
                if (i < len) {
                    annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
                }
            }
            // 2. Punctuation that always forms its own word
            else if (charType == CharType.SINGLE_PUNCTUATION_CHAR) {
                annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
                annotatedTerms.append(i, POC.PUNCTUATION_POC);
                if (isEnableTileWord) {
                    // opening title quotation 《
                    char ch = annotatedTerms.getRawCharByIndex(i);
                    if (ch == LEFT_TITLE_QUOTATION_CHAR) {
                        hasTitleBegin = true;
                        titleBegin = i;
                    }
                    // closing title quotation 》: mark the enclosed span as one word
                    else if (hasTitleBegin && ch == RIGHT_TITLE_QUOTATION_CHAR) {
                        if (isPossibleTitle(annotatedTerms, titleBegin + 1, i - 1)) {
                            setTitleWordPoc(annotatedTerms,
                                    titleBegin + 1,
                                    i - 1,
                                    annotatedTerms.getAnnotatedLength() - 2);
                        }
                        hasTitleBegin = false;
                    }
                }
                i++;
                // constrain the following character
                if (i < len && annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
                    annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
                }
            }
            // 3. English letters (may combine with digits and some punctuation)
            else if (charType == CharType.ENGLISH_LETTER_CHAR) {
                i = processWord(annotatedTerms,
                        i,
                        RuleAnnotator::isPartOfLetterWord,
                        false);
            }
            // 4. Arabic numerals (annotated with numeral-specific POCs)
            else if (charType == CharType.ARABIC_NUMERAL_CHAR) {
                i = processWord(annotatedTerms,
                        i,
                        RuleAnnotator::isPartOfNumeral,
                        true);
            }
            // 5. Remaining punctuation kinds stand alone as punctuation words
            else if (charType == CharType.EX_SINGLE_PUNCTUATION_CHAR
                    || charType == CharType.NUMERAL_PUNCTUATION_CHAR) {
                setCurrentAsSingle(i, annotatedTerms, POC.PUNCTUATION_POC);
                i++;
            }
            // 6. Han characters and Chinese numerals: leave all labels open
            else if (charType == CharType.HAN_ZI_CHAR
                    || charType == CharType.CHINESE_NUMERAL_CHAR) {
                annotatedTerms.append(i, POC.DEFAULT_POC);
                i++;
            }
            // 7. Anything else forms a single-character word
            else {
                setCurrentAsSingle(i, annotatedTerms, POC.SINGLE_POC);
                i++;
            }
        }
        // sentence boundaries: first char begins a word, last char ends one
        annotatedTerms.intersectPocByIndex(0, POC.BEGIN_OR_SINGLE_POC);
        annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);
        return annotatedTerms;
    }
112 | * 113 | * @param currentRawIndex 当前原字符串索引位置 114 | * @param annotatedTerms 标注结果 115 | * @param currentPoc 当前字符对应的POC 116 | */ 117 | private static void setCurrentAsSingle(int currentRawIndex, 118 | AnnotatedTerms annotatedTerms, 119 | POC currentPoc) { 120 | annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC); 121 | annotatedTerms.append(currentRawIndex, currentPoc); 122 | int nextIndex = currentRawIndex + 1; 123 | if (nextIndex < annotatedTerms.getRawCharsLength() 124 | && annotatedTerms.getCharTypeByIndex(nextIndex) != CharType.SPACE_OR_CONTROL_CHAR) { 125 | annotatedTerms.appendAhead(nextIndex, POC.BEGIN_OR_SINGLE_POC); 126 | } 127 | } 128 | 129 | /** 130 | * 判断前后书名号内的字符串是否为能成词 131 | * 132 | * @param annotatedTerms 标注结果 133 | * @param startIndex 前书名号《 后一个index 134 | * @param endIndex 后书名号》前一个index 135 | * @return 若能则true 136 | */ 137 | private static boolean isPossibleTitle(AnnotatedTerms annotatedTerms, int startIndex, int endIndex) { 138 | if (endIndex - startIndex > 8 || endIndex - startIndex <= 0) { 139 | return false; 140 | } 141 | for (int i = startIndex; i <= endIndex; i++) { 142 | CharType charType = annotatedTerms.getCharTypeByIndex(i); 143 | if (charType == CharType.SINGLE_PUNCTUATION_CHAR 144 | || charType == CharType.SPACE_OR_CONTROL_CHAR) { 145 | return false; 146 | } 147 | } 148 | return true; 149 | } 150 | 151 | /** 152 | * 设置书名号内为一个词. 
    /**
     * Marks the span inside a pair of title quotation marks as one word.
     *
     * @param annotatedTerms    annotation result
     * @param startRawIndex     start index of the word in the raw text
     * @param endRawIndex       end index of the word in the raw text
     * @param endAnnotatedIndex end index of the word in the annotated chars
     */
    private static void setTitleWordPoc(
            AnnotatedTerms annotatedTerms,
            int startRawIndex,
            int endRawIndex,
            int endAnnotatedIndex) {
        // a one-character title span is a single-char word
        if (startRawIndex == endRawIndex) {
            annotatedTerms.intersectPocByIndex(endAnnotatedIndex, POC.SINGLE_POC);
            return;
        }
        // corresponding start position in the annotated chars: the title span
        // contains no ignored chars, so raw and annotated offsets are parallel
        int startAnnotatedIndex = endAnnotatedIndex - endRawIndex + startRawIndex;
        annotatedTerms.setPocByIndex(startAnnotatedIndex, POC.BEGIN_POC);
        for (int i = startAnnotatedIndex + 1; i < endAnnotatedIndex; i++) {
            annotatedTerms.setPocByIndex(i, POC.MIDDLE_POC);
        }
        annotatedTerms.setPocByIndex(endAnnotatedIndex, POC.END_POC);
    }

    /**
     * Whether a character type may be part of a letter word
     * (letters combine with digits and combinable punctuation).
     *
     * @param charType the character type
     * @return true if the type can continue a letter word
     */
    public static boolean isPartOfLetterWord(CharType charType) {
        return charType == CharType.ENGLISH_LETTER_CHAR
                || charType == CharType.ARABIC_NUMERAL_CHAR
                || charType == CharType.EX_SINGLE_PUNCTUATION_CHAR;
    }


    /**
     * Whether a character type may be part of a numeral:
     * digit characters or numeral-specific punctuation.
     *
     * @param charType the character type
     * @return true if the type can continue a numeral
     */
    public static boolean isPartOfNumeral(CharType charType) {
        return charType == CharType.CHINESE_NUMERAL_CHAR
                || charType == CharType.ARABIC_NUMERAL_CHAR
                || charType == CharType.NUMERAL_PUNCTUATION_CHAR;
    }

    /**
     * Consumes a run of letter-word or numeral characters starting at
     * {@code startRawIndex}, annotating it as a single word with either
     * generic or numeral-specific B/M/E/S POCs.
     *
     * @param annotatedTerms annotation result
     * @param startRawIndex  start position in the raw text
     * @param condition      predicate deciding whether a char type continues the word
     * @param isNumeral      true for numerals, false for letter words
     * @return the raw index of the first character after the word
     */
    private static int processWord(
            AnnotatedTerms annotatedTerms,
            int startRawIndex,
            Predicate<CharType> condition,
            boolean isNumeral) {
        // choose the B/M/E/S POC set matching the word kind
        POC b, m, e, s;
        if (isNumeral) {
            b = POC.BEGIN_NUMERAL_POC;
            m = POC.MIDDLE_NUMERAL_POC;
            e = POC.END_NUMERAL_POC;
            s = POC.SINGLE_NUMERAL_POC;
        } else {
            b = POC.BEGIN_POC;
            m = POC.MIDDLE_POC;
            e = POC.END_POC;
            s = POC.SINGLE_POC;
        }

        // the previous annotated char must end a word
        annotatedTerms.intersectLastPoc(POC.END_OR_SINGLE_POC);

        int len = annotatedTerms.getRawCharsLength();
        int i = startRawIndex;
        i++;
        // single-character word: the run ends immediately
        if (i == len
                || (i < len && !condition.test(annotatedTerms.getCharTypeByIndex(i)))) {
            annotatedTerms.append(i - 1, s);
        }
        // multi-character word: begin, middles, end
        else {
            annotatedTerms.append(i - 1, b);
            for (; i + 1 < len && condition.test(annotatedTerms.getCharTypeByIndex(i + 1)); i++) {
                annotatedTerms.append(i, m);
            }
            annotatedTerms.append(i, e);
            i++;
        }
        // the char following the word (if not ignorable) must begin a word
        if (i < len && annotatedTerms.getCharTypeByIndex(i) != CharType.SPACE_OR_CONTROL_CHAR) {
            annotatedTerms.appendAhead(i, POC.BEGIN_OR_SINGLE_POC);
        }
        return i;
    }
/**
 * Cements specific word patterns after segmentation: numeral + time-unit
 * sequences (e.g. "2018" + "年") and runs of repeatable punctuation.
 */
public class SpecifiedWordCementer {

    /**
     * Time-unit words (year, month, day, ... second).
     */
    private static final Set<String> TIME_UNIT_WORDS = new HashSet<>(
            Arrays.asList("年", "月", "日", "号", "时", "点", "分", "秒"));

    /**
     * Year time unit.
     */
    private static final String YEAR_TIME_UNIT_WORD = "年";

    /**
     * Specific words that may repeat and be glued into one token.
     */
    private static final Set<String> CAN_FORM_REPEATED_WORDS = new HashSet<>(
            Arrays.asList("—", "…"));


    /**
     * Cements words in place, scanning right to left so removals do not
     * disturb the positions still to be visited.
     *
     * @param tokenItems intermediate segmentation result
     */
    public static void cementWord(List<TokenItem> tokenItems) {
        for (int i = tokenItems.size() - 1; i > 0; i--) {
            TokenItem item = tokenItems.get(i);
            String word = item.word;
            if (TIME_UNIT_WORDS.contains(word)) {
                i = cementTimeWord(tokenItems, item, i);
            } else if (CAN_FORM_REPEATED_WORDS.contains(word)) {
                i = cementRepeatedWord(tokenItems, item, i);
            }
        }
    }

    /**
     * Glues a run of numeral tokens onto a following time-unit token,
     * producing one token tagged "t". A "年" (year) unit is only glued when
     * the preceding numeral has at least 4 digits (a plausible year).
     *
     * @param tokenItems   intermediate segmentation result
     * @param timeUnitItem the time-unit token
     * @param endIndex     index of the time-unit token
     * @return index of the start of the cemented time word
     */
    private static int cementTimeWord(List<TokenItem> tokenItems,
                                      TokenItem timeUnitItem,
                                      int endIndex) {
        // NOTE(review): local name "timeUit" is a typo for "timeUnit"; kept as-is.
        String timeUit = timeUnitItem.word;
        if (endIndex - 1 >= 0) {
            String previousWord = tokenItems.get(endIndex - 1).word;
            if (isNumeralWord(previousWord)) {
                // reject short numerals before "年" (not a year)
                if (timeUit.equals(YEAR_TIME_UNIT_WORD) && previousWord.length() < 4) {
                    return endIndex;
                }
                tokenItems.remove(endIndex);
                StringBuilder builder = new StringBuilder(previousWord + timeUnitItem.word);
                // absorb any further numeral tokens to the left; each removal
                // targets j + 1, the slot of the already-absorbed neighbor
                int j = endIndex - 2;
                for (; j >= 0; j--) {
                    String w = tokenItems.get(j).word;
                    if (isNumeralWord(w)) {
                        tokenItems.remove(j + 1);
                        builder.insert(0, w);
                    } else {
                        break;
                    }
                }
                // j + 1 is the leftmost absorbed numeral's slot
                tokenItems.set(j + 1,
                        new TokenItem(builder.toString(), "t"));
                return j + 1;
            }
        }
        return endIndex;
    }

    /**
     * Glues adjacent repetitions of the same specific word (e.g. "……")
     * into one token, keeping the original POS tag.
     *
     * @param tokenItems   intermediate segmentation result
     * @param repeatedItem the repeated specific-word token
     * @param endIndex     index of the rightmost repetition
     * @return index of the start of the cemented word
     */
    private static int cementRepeatedWord(List<TokenItem> tokenItems,
                                          TokenItem repeatedItem,
                                          int endIndex) {
        String word = repeatedItem.word;
        int i = endIndex - 1;
        if (i >= 0 && tokenItems.get(i).word.equals(word)) {
            StringBuilder builder = new StringBuilder(word + word);
            tokenItems.remove(endIndex);
            // keep absorbing identical neighbors to the left
            for (i--; i >= 0 && tokenItems.get(i).word.equals(word); i--) {
                builder.insert(0, word);
                tokenItems.remove(i + 1);
            }
            tokenItems.set(i + 1,
                    new TokenItem(builder.toString(), repeatedItem.pos));
        }
        return i + 1;
    }

    /**
     * Whether a word consists entirely of numeral characters.
     *
     * @param word the word
     * @return true if every character is a numeral
     */
    private static boolean isNumeralWord(String word) {
        for (char ch : word.toCharArray()) {
            if (!CharUtils.isNumeral(ch)) {
                return false;
            }
        }
        return true;
    }
}
/**
 * The result of rule-based pre-annotation: the cleaned character sequence
 * (ignorable chars removed, half-width converted to full-width) plus the
 * candidate POC of each remaining character.
 */
public final class AnnotatedTerms {

    /**
     * Raw characters of the text to segment.
     */
    private char[] rawChars;

    /**
     * Character type of each raw character.
     */
    private CharType[] rawCharTypes;

    /**
     * Characters before half-width conversion; same length as annotatedChars.
     */
    private char[] preAnnotateChars;

    /**
     * Characters after cleaning (ignorable chars dropped, half-width
     * converted to full-width); parallel to preAnnotateChars.
     */
    private char[] annotatedChars;

    /**
     * Candidate POC of each annotated character.
     */
    public POC[] pocs;

    /**
     * Number of characters annotated so far.
     */
    private int annotatedLength;

    /**
     * Whether the last slot was already filled ahead of time by appendAhead;
     * when true, the next append only intersects the POC instead of adding.
     */
    private boolean isAppendAhead;

    /**
     * Constructor.
     *
     * @param rawChars characters of the text to segment
     */
    public AnnotatedTerms(char[] rawChars) {
        this.rawChars = rawChars;
        int textLength = rawChars.length;
        // precompute the type of every raw character
        this.rawCharTypes = new CharType[textLength];
        for (int i = 0; i < textLength; i++) {
            rawCharTypes[i] = CharUtils.getCharType(rawChars[i]);
        }
        this.preAnnotateChars = new char[textLength];
        this.annotatedChars = new char[textLength];
        this.pocs = new POC[textLength];
        this.annotatedLength = 0;
        this.isAppendAhead = false;
    }

    public char[] getPreAnnotateChars() {
        return Arrays.copyOfRange(preAnnotateChars, 0, annotatedLength);
    }

    public char[] getAnnotatedChars() {
        return Arrays.copyOfRange(annotatedChars, 0, annotatedLength);
    }

    /**
     * Length after rule annotation.
     *
     * @return the number of annotated characters
     */
    public int getAnnotatedLength() {
        return this.annotatedLength;
    }

    public POC[] getPocs() {
        return Arrays.copyOfRange(pocs, 0, annotatedLength);
    }

    /**
     * Returns the raw character at an index of the original text.
     * (The original javadoc wrongly said "character type".)
     *
     * @param rawIndex index into the raw text
     * @return the raw character
     */
    public char getRawCharByIndex(int rawIndex) {
        return rawChars[rawIndex];
    }

    /**
     * Returns the character type at an index of the original text.
     *
     * @param rawIndex index into the raw text
     * @return the character type
     */
    public CharType getCharTypeByIndex(int rawIndex) {
        return rawCharTypes[rawIndex];
    }

    /**
     * Length of the raw text.
     *
     * @return the raw character count
     */
    public int getRawCharsLength() {
        return rawChars.length;
    }

    /**
     * Whether the annotated result is empty.
     *
     * @return true if no character was annotated
     */
    public boolean isEmpty() {
        return annotatedLength == 0;
    }

    /**
     * Returns the annotated chars padded with two BOUNDARY characters on
     * each side, for 5-char windowed feature extraction.
     *
     * @return the padded character array (length annotatedLength + 4)
     */
    public char[] appendBoundaryAround() {
        char[] array = new char[annotatedLength + 4];
        System.arraycopy(annotatedChars, 0, array, 2, annotatedLength);
        array[0] = array[1] = array[annotatedLength + 2] = array[annotatedLength + 3] = BOUNDARY;
        return array;
    }

    /**
     * Intersects the POC at the given annotated index with {@code poc}.
     * Out-of-range indices are ignored.
     *
     * @param annotatedIndex index into the annotated chars
     * @param poc            POC to intersect with
     */
    public void intersectPocByIndex(int annotatedIndex, POC poc) {
        if (annotatedIndex < 0 || annotatedIndex >= annotatedLength) {
            return;
        }
        pocs[annotatedIndex] = pocs[annotatedIndex].intersect(poc);
    }

    /**
     * Intersects the POC of the last annotated character.
     *
     * @param poc POC to intersect with
     */
    public void intersectLastPoc(POC poc) {
        intersectPocByIndex(annotatedLength - 1, poc);
    }

    /**
     * Overwrites the POC at the given annotated index.
     * Out-of-range indices are ignored.
     *
     * @param annotatedIndex index into the annotated chars
     * @param poc            POC to set
     */
    public void setPocByIndex(int annotatedIndex, POC poc) {
        if (annotatedIndex < 0 || annotatedIndex >= annotatedLength) {
            return;
        }
        pocs[annotatedIndex] = poc;
    }

    /**
     * Appends a character and its POC. If the slot was already filled by
     * {@link #appendAhead}, only the POC is intersected.
     *
     * @param rawIndex index into the raw text
     * @param poc      candidate POC
     */
    public void append(int rawIndex, POC poc) {
        if (isAppendAhead) {
            intersectLastPoc(poc);
            isAppendAhead = false;
        } else {
            char ch = rawChars[rawIndex];
            preAnnotateChars[annotatedLength] = ch;
            annotatedChars[annotatedLength] = convertHalfWidth(ch);
            pocs[annotatedLength] = poc;
            annotatedLength++;
        }
    }

    /**
     * Appends a character ahead of its turn (used to constrain the character
     * following a boundary); the next {@link #append} for the same position
     * will intersect instead of appending again.
     *
     * @param rawIndex index into the raw text
     * @param poc      candidate POC
     */
    public void appendAhead(int rawIndex, POC poc) {
        char ch = rawChars[rawIndex];
        preAnnotateChars[annotatedLength] = ch;
        annotatedChars[annotatedLength] = convertHalfWidth(ch);
        pocs[annotatedLength] = poc;
        annotatedLength++;
        isAppendAhead = true;
    }
}
/**
 * Character categories used by the rule annotator, each keyed by the
 * abbreviation written in the core-character dictionary.
 */
public enum CharType {

    /** Punctuation that always forms a word on its own. */
    SINGLE_PUNCTUATION_CHAR("p"),

    /** Punctuation that may stand alone or combine with other characters. */
    EX_SINGLE_PUNCTUATION_CHAR("ep"),

    /** Space or control character. */
    SPACE_OR_CONTROL_CHAR("c"),

    /** Chinese numeral character. */
    CHINESE_NUMERAL_CHAR("cn"),

    /** Arabic numeral character. */
    ARABIC_NUMERAL_CHAR("an"),

    /** Punctuation that only appears inside numerals. */
    NUMERAL_PUNCTUATION_CHAR("np"),

    /** Han (Chinese) character. */
    HAN_ZI_CHAR("h"),

    /** English letter. */
    ENGLISH_LETTER_CHAR("e"),

    /** Any other character. */
    OTHER_CHAR("o"),
    ;

    /** Abbreviation as written in the dictionary file. */
    private final String abbreviation;

    /** Lookup table from abbreviation to enum constant. */
    private static final Map<String, CharType> LOOKUP = new HashMap<>(values().length, 1);

    static {
        for (CharType type : values()) {
            LOOKUP.put(type.abbreviation, type);
        }
    }

    CharType(String abbreviation) {
        this.abbreviation = abbreviation;
    }

    /**
     * Resolves an abbreviation to its enum constant.
     *
     * @param abbr the abbreviation
     * @return the matching CharType
     * @throws IllegalArgumentException if the abbreviation is unknown
     */
    public static CharType of(String abbr) {
        CharType resolved = LOOKUP.get(abbr);
        if (resolved == null) {
            throw new IllegalArgumentException("Invalid char type abbreviation: " + abbr);
        }
        return resolved;
    }
}
/**
 * POC (position of char): the candidate labeling of a character.
 * Declaration order matters: {@link #intersect(POC)} picks the constant with
 * the lower ordinal (the more constrained one), so DEFAULT_POC — the least
 * constrained — must stay last.
 */
public enum POC {

    /** Punctuation. */
    PUNCTUATION_POC,

    /** Begin of numeral. */
    BEGIN_NUMERAL_POC,

    /** Middle of numeral. */
    MIDDLE_NUMERAL_POC,

    /** End of numeral. */
    END_NUMERAL_POC,

    /** Single-character numeral. */
    SINGLE_NUMERAL_POC,

    /** Word begin. */
    BEGIN_POC,

    /** Word middle. */
    MIDDLE_POC,

    /** Word end. */
    END_POC,

    /** Single character as a word. */
    SINGLE_POC,

    /** Begin or single. */
    BEGIN_OR_SINGLE_POC,

    /** End or single. */
    END_OR_SINGLE_POC,

    /** Default (unconstrained) POC. */
    DEFAULT_POC;

    /**
     * Intersects two candidate labelings: the more constrained one (lower
     * ordinal) wins, except that BEGIN_OR_SINGLE ∩ END_OR_SINGLE narrows to
     * SINGLE (a char that must both begin and end a word stands alone).
     *
     * @param that the other candidate POC
     * @return the intersection POC
     */
    public POC intersect(POC that) {
        POC lower = this.ordinal() <= that.ordinal() ? this : that;
        POC upper = lower == this ? that : this;
        if (lower == BEGIN_OR_SINGLE_POC && upper == END_OR_SINGLE_POC) {
            return SINGLE_POC;
        }
        return lower;
    }
}
74 | */ 75 | public POC intersect(POC that) { 76 | if (this.ordinal() < that.ordinal()) { 77 | if (this == BEGIN_OR_SINGLE_POC && that == END_OR_SINGLE_POC) { 78 | return SINGLE_POC; 79 | } 80 | return this; 81 | } else if (this == END_OR_SINGLE_POC && that == BEGIN_OR_SINGLE_POC) { 82 | return SINGLE_POC; 83 | } 84 | return that; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/term/TokenItem.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.term; 2 | 3 | import io.github.yizhiru.thulac4j.common.Nullable; 4 | 5 | /** 6 | * Word Segment item. 7 | */ 8 | public final class TokenItem { 9 | 10 | /** 11 | * Tokenized word. 12 | */ 13 | public final String word; 14 | 15 | /** 16 | * Part-of-speech. 17 | */ 18 | @Nullable 19 | public final String pos; 20 | 21 | public TokenItem(String word, String pos) { 22 | this.word = word; 23 | this.pos = pos; 24 | } 25 | 26 | @Override 27 | public String toString() { 28 | if (pos == null) { 29 | return word; 30 | } 31 | return word + '/' + pos; 32 | } 33 | 34 | @Override 35 | public boolean equals(Object o) { 36 | if (this == o) { 37 | return true; 38 | } 39 | if (o == null || getClass() != o.getClass()) { 40 | return false; 41 | } 42 | 43 | TokenItem tokenItem = (TokenItem) o; 44 | return (word != null ? word.equals(tokenItem.word) : tokenItem.word == null) 45 | && (pos != null ? pos.equals(tokenItem.pos) : tokenItem.pos == null); 46 | } 47 | 48 | @Override 49 | public int hashCode() { 50 | int result = word != null ? word.hashCode() : 0; 51 | result = 31 * result + (pos != null ? 
pos.hashCode() : 0); 52 | return result; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/util/CharUtils.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.util; 2 | 3 | import io.github.yizhiru.thulac4j.term.CharType; 4 | 5 | import java.io.IOException; 6 | import java.util.*; 7 | 8 | public final class CharUtils { 9 | 10 | 11 | private static final Map CORE_CHAR_TYPE_MAP = loadCharTypeMap(); 12 | 13 | /** 14 | * 空格字符,ASCII码值为32 15 | */ 16 | private static final char LATIN_SPACE_CHAR = ' '; 17 | 18 | /** 19 | * 前书名号 20 | */ 21 | public static final char LEFT_TITLE_QUOTATION_CHAR = '《'; 22 | 23 | /** 24 | * 后书名号 25 | */ 26 | public static final char RIGHT_TITLE_QUOTATION_CHAR = '》'; 27 | 28 | /** 29 | * 加载核心字符类型词典 30 | * 31 | * @return 核心字符映射到字符类型 Map 32 | */ 33 | private static Map loadCharTypeMap() { 34 | List lines; 35 | try { 36 | lines = IOUtils.readLines(CharUtils.class.getResourceAsStream(ModelPaths.CORE_CHAR_PATH)); 37 | } catch (IOException e) { 38 | throw new RuntimeException(e); 39 | } 40 | Map map = new HashMap<>(lines.size()); 41 | for (String line : lines) { 42 | String[] arr = line.split("\t"); 43 | map.put(arr[0].charAt(0), CharType.of(arr[1])); 44 | } 45 | return map; 46 | } 47 | 48 | /** 49 | * 映射字符类型 50 | * 51 | * @param ch 字符 52 | * @return 字符类型 53 | */ 54 | public static CharType getCharType(char ch) { 55 | if (isSpaceOrControl(ch)) { 56 | return CharType.SPACE_OR_CONTROL_CHAR; 57 | } 58 | return CORE_CHAR_TYPE_MAP.getOrDefault(ch, CharType.OTHER_CHAR); 59 | } 60 | 61 | 62 | /** 63 | * 是否为控制字符或空格字符,在分词过程中忽略这样的字符. 
64 | * 65 | * @param ch 字符 66 | * @return 布尔值,若是则返回true 67 | */ 68 | public static boolean isSpaceOrControl(char ch) { 69 | return (ch < LATIN_SPACE_CHAR) || Character.isSpaceChar(ch); 70 | } 71 | 72 | 73 | /** 74 | * 字符是否为数字 75 | * 76 | * @param ch 字符 77 | * @return 布尔值 78 | */ 79 | public static boolean isNumeral(char ch) { 80 | CharType charType = getCharType(ch); 81 | return charType == CharType.CHINESE_NUMERAL_CHAR 82 | || charType == CharType.ARABIC_NUMERAL_CHAR; 83 | } 84 | 85 | /** 86 | * 半角字符转全角字符. 87 | * 半角空格为32, 全角空格为12288; 88 | * 其他半角字符(33-126)与全角字符(65281-65374)均相差 65248. 89 | * 90 | * @param ch 字符 91 | * @return 半角转成的全角字符 92 | */ 93 | public static char convertHalfWidth(char ch) { 94 | if (ch == 32) { 95 | return (char) 12288; 96 | } else if (ch > 32 && ch < 127) { 97 | return (char) (ch + 65248); 98 | } 99 | return ch; 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /src/main/java/io/github/yizhiru/thulac4j/util/ChineseUtils.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.util; 2 | 3 | import io.github.yizhiru.thulac4j.common.DoubleArrayTrie; 4 | 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | 8 | public final class ChineseUtils { 9 | 10 | /** 11 | * 繁体字符映射到简体字符 12 | */ 13 | private static final HashMap T2S_MAP = parseT2sMap(); 14 | 15 | /** 16 | * 停用词表 17 | */ 18 | private static final DoubleArrayTrie STOP_WORDS_DAT = DoubleArrayTrie.loadDat( 19 | ChineseUtils.class.getResourceAsStream(ModelPaths.STOP_WORDS_BIN_PATH)); 20 | 21 | /** 22 | * 解析繁体简体映射Map文件. 
/**
 * IO utilities, modeled after the commons-io IOUtils class.
 */
public final class IOUtils {

    /**
     * Represents the end-of-file (or stream).
     */
    private static final int EOF = -1;

    /**
     * The default buffer size to use for copy.
     */
    private static final int DEFAULT_BUFFER_SIZE = 1024 * 4;

    /**
     * Maps a whole file read-only into memory.
     * The stream and channel are closed before returning; a MappedByteBuffer
     * remains valid after its channel is closed.
     *
     * @param inputPath the file path to read from, not null
     * @return the mapped byte buffer
     * @throws IOException if an I/O error occurs
     */
    public static MappedByteBuffer mapToByteBuffer(final String inputPath) throws IOException {
        // try-with-resources: the original leaked the FileInputStream/FileChannel
        try (FileInputStream stream = new FileInputStream(inputPath);
             FileChannel channel = stream.getChannel()) {
            return channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
        }
    }

    /**
     * Gets the contents of an InputStream as a list of Strings,
     * one entry per line.
     * NOTE(review): decodes with the platform default charset, as the
     * original did; consider adding an explicit-charset overload.
     *
     * @param input the InputStream to read from, not null
     * @return the list of Strings, never null
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static List<String> readLines(final InputStream input) throws IOException {
        final List<String> list = new ArrayList<>();
        // try-with-resources: the original only closed the reader on the happy path
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input))) {
            String line;
            while ((line = reader.readLine()) != null) {
                list.add(line);
            }
        }
        return list;
    }

    /**
     * Gets the contents of an InputStream as a byte[].
     * This method buffers the input internally, so there is no need to use a
     * BufferedInputStream.
     *
     * @param input the InputStream to read from
     * @return the requested byte array
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static byte[] toByteArray(final InputStream input) throws IOException {
        try (final ByteArrayOutputStream output = new ByteArrayOutputStream()) {
            final byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
            int n;
            while (EOF != (n = input.read(buffer))) {
                output.write(buffer, 0, n);
            }
            return output.toByteArray();
        }
    }

    /**
     * Gets the contents of an InputStream as little-endian 32-bit ints.
     * This method buffers the input internally, so there is no need to use a
     * BufferedInputStream.
     *
     * @param input the InputStream to read from
     * @return the requested int array; trailing bytes that do not fill a
     *         complete int are ignored
     * @throws NullPointerException if the input is null
     * @throws IOException if an I/O error occurs
     */
    public static int[] toIntArray(final InputStream input) throws IOException {
        byte[] bytes = toByteArray(input);
        IntBuffer intBuffer = ByteBuffer.wrap(bytes)
                .order(ByteOrder.LITTLE_ENDIAN)
                .asIntBuffer();
        int[] array = new int[intBuffer.remaining()];
        intBuffer.get(array);
        return array;
    }
}
"/models/cws_dat.bin"; 46 | 47 | /** 48 | * 分词模块label 49 | */ 50 | public static final String SEGMENTER_LABEL_PATH = "/models/cws_label.txt"; 51 | 52 | /** 53 | * 词性标注模块label 54 | */ 55 | public static final String POS_TAGGING_LABEL_PATH = "/models/model_c_label.txt"; 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/POSTaggerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | import org.junit.Test; 5 | 6 | import java.io.IOException; 7 | import java.nio.charset.StandardCharsets; 8 | import java.util.stream.Collectors; 9 | 10 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.POS_FEATURES_PATH; 11 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.POS_WEIGHTS_PATH; 12 | import static io.github.yizhiru.thulac4j.SegmenterTest.SENTENCES; 13 | import static org.junit.Assert.assertEquals; 14 | 15 | public class POSTaggerTest { 16 | 17 | @Test 18 | public void tagging() throws IOException { 19 | String[] expectedResults = new String[]{ 20 | "因/p", 21 | "", 22 | "", 23 | "UTF/x -/w 8/m", 24 | "iphone5/x", 25 | "鲜芋仙/nz 3/m", 26 | "枪杆子/n 中/f 出/v 政权/n", 27 | "两/m 块/q 五/m 一/m 套/q ,/w 三/m 块/q 八/m 一/m 斤/q ,/w 四/m 块/q 七/m 一/m 本/q ,/w 五/m 块/q 六/m 一/m 条/q", 28 | "RT/x @/w laoshipukong/x :/w 27日/t ,/w", 29 | "AT&T/nz 是/v 一/m 件/q 不错/a 的/u 公司/n ,/w 给/p 你/r 发/v offer/x 了/u 吗/u ?/w", 30 | "4/m 个/q 月/n 赚/v 了/u 20%/m 多/m", 31 | "仅/d 1/m 只/q ,/w 为/v 0.9923/m 元/q", 32 | "Just/n one/nz space/x ,/w or/ns all/nz such/x spaces/x ?/w", 33 | "倒模/v ,/w 替身/v 算/v 什么/r ?/w 钟汉良/np 、/w ab/np 《/w 孤芳不自赏/id 》/w 抠图/n 来/v 充数/v", 34 | "奥迪/nz CEO/x 违规/v 遭批/v 大众/n 表示/v 不/d 会/v 解雇/v", 35 | "找/v 小姐/n", 36 | "找/v 小妹/n", 37 | "学生/n 妹/n", 38 | "职业/n 狐狸精/n", 39 | "男/a 公关/n", 40 | "上门/v", 41 | "抽獎/v", 42 | "好/a 声音/n", 43 | "好/a 聲音/n", 44 | "夢/n 之/u 声/g", 45 | 
"夢之聲/id", 46 | "訂票/n", 47 | "改簽/v", 48 | "熱线/n", 49 | "熱線/n", 50 | "热線/a", 51 | "電话/n", 52 | "電話/n", 53 | "醫院/n", 54 | "代刷/v", 55 | "撲剋牌/nz", 56 | "137-1234-1234/m", 57 | "这/r 是/v 一个/m 伸手不见五指/i 的/u 黑夜/n 。/w 我/r 叫/v 孙悟空/np ,/w 我/r 爱/v 北京/ns ,/w 我/r 爱/v Python/x 和/c C/x +/w" + 58 | " +/w 。/w", 59 | "我/r 不/d 喜欢/v 日本/ns 和服/n 。/w", 60 | "雷猴/v 回归/v 人间/n 。/w", 61 | "工信处/n 女/a 干事/n 每月/r 经过/p 下属/v 科室/n 都/d 要/v 亲口/d 交代/v 24/m 口/q 交换机/n 等/u 技术性/n 器件/n 的/u 安装/v 工作/v", 62 | "我/r 需要/v 廉/g 租/v 房/n", 63 | "永和/nz 服装/n 饰品/n 有限公司/n", 64 | "我/r 爱/v 北京/ns 天安门/ns", 65 | "abc/n", 66 | "隐马尔可夫/np", 67 | "雷猴/v 是/v 个/q 好/a 网站/n", 68 | "“/w ,/w ”/w 和/c “/w SOFTware/x (/w 软件/n )/w ”/w 两/m 部分/n 组成/v", 69 | "草泥马/n 和/c 欺/g 实马/n 是/v 今年/t 的/u 流行/v 词汇/n", 70 | "伊藤/nz 洋华堂/n 总府店/n", 71 | "中国/ns 科学院/n 计算/v 技术/n 研究所/n", 72 | "罗密欧/ns 与/c 朱丽叶/np", 73 | "我/r 购买/v 了/u 道具/n 和/c 服装/n", 74 | "PS/x :/w 我/r 觉得/v 开源/v 有/v 一个/m 好处/n ,/w 就/d 是/v 能够/v 敦促/v 自己/r 不断/d 改进/v ,/w 避免/v 敞帚自珍/id", 75 | "湖北省/ns 石首市/ns", 76 | "湖北省/ns 十堰市/ns", 77 | "总经理/n 完成/v 了/u 这/r 件/q 事情/n", 78 | "电脑/n 修好/v 了/u", 79 | "做好/v 了/u 这/r 件/q 事情/n 就/d 一了百了/i 了/u", 80 | "人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 81 | "我们/r 买/v 了/u 一个/m 美/a 的/u 空调/n", 82 | "线程/n 初始化/v 时/g 我们/r 要/v 注意/v", 83 | "一个/m 分子/n 是/v 由/p 好多/m 原子组/n 织成/v 的/u", 84 | "祝/v 你/r 马到功成/i", 85 | "他/r 掉/v 进/v 了/u 无/v 底洞/n 里/f", 86 | "中国/ns 的/u 首都/n 是/v 北京/ns", 87 | "孙君意/np", 88 | "外交部/ni 发言人/n 马朝旭/np", 89 | "领导人/n 会议/n 和/c 第四/m 届/q 东亚/ns 峰会/n", 90 | "在/p 过去/t 的/u 这/r 五/m 年/q", 91 | "还/d 需要/v 很/d 长/a 的/u 路/n 要/v 走/v", 92 | "60/m 周年/q 首都/n 阅兵/n", 93 | "你好/id 人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 94 | "买/v 水果/n 然后/c 来/v 世博园/j", 95 | "买/v 水果/n 然后/c 去/v 世博园/j", 96 | "但是/c 后来/t 我/r 才/d 知道/v 你/r 是/v 对/a 的/u", 97 | "存在/v 即/c 合理/a", 98 | "的/u 的/u 的/u 的/u 的/u 在/p 的/u 的/u 的/u 的/u 就/d 以/p 和和/nz 和/c", 99 | "I/v love/x 你/r ,/w 不以为耻/i ,/w 反/d 以为/v rong/x", 100 | "hello/x 你好/id 人们/n 审美/v 的/u 观点/n 是/v 不同/a 的/u", 101 | "很/d 好/a 但/c 主要/d 是/v 基于/p 网页/n 形式/n", 102 | "为什么/r 我/r 不/d 能/v 拥有/v 想/v 要/v 的/u 生活/v", 103 | 
"后来/t 我/r 才/d", 104 | "此次/r 来/v 中国/ns 是/v 为了/p", 105 | "使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 106 | ",/w 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 107 | "其实/d 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 108 | "好人/n 使用/v 了/u 它/r 就/d 可以/v 解决/v 一些/m 问题/n", 109 | "是/v 因为/p 和/p 国家/n", 110 | "老年/t 搜索/v 还/d 支持/v", 111 | "干脆/d 就/d 把/p 那/r 部/q 蒙/v 人/n 的/u 闲法/n 给/p 废/v 了/u 拉倒/v !/w RT/x @/w laoshipukong/x :/w 27日/t ,/w " + 112 | "全国/n 人大/j 常委会/j 第三/m 次/q 审议/v 侵权/v 责任法/n 草案/n ,/w 删除/v 了/u 有关/v 医疗/n 损害/v 责任/n “/w 举证/v 倒置/v" + 113 | " ”/w 的/u 规定/n 。/w 在/p 医患/n 纠纷/n 中/f 本/d 已/d 处于/v 弱势/n 地位/n 的/u 消费者/n 由此/d 将/d 陷入/v 万劫不复/i " + 114 | "的/u 境地/n 。/w", 115 | "他/r 说/v 的/u 确实/a 在理/a", 116 | "长春/ns 市长/n 春节/t 讲话/n", 117 | "结婚/v 的/u 和/c 尚未/d 结婚/v 的/u", 118 | "结合/v 成分子/n 时/g", 119 | "旅游/v 和/c 服务/v 是/v 最/d 好/a 的/u", 120 | "这/r 件/q 事情/n 的确/d 是/v 我/r 的/u 错/n", 121 | "供/v 大家/r 参考/v 指正/v", 122 | "哈尔滨/ns 政府/n 公布/v 塌/v 桥/n 原因/n", 123 | "我/r 在/p 机场/n 入口处/n", 124 | "邢永臣/np 摄影/v 报道/v", 125 | "BP/x 神经/n 网络/n 如何/r 训练/v 才/d 能/v 在/p 分类/v 时/g 增加/v 区/n 分度/n ?/w", 126 | "南京市/ns 长江/ns 大桥/n", 127 | "应/v 一些/m 使用者/n 的/u 建议/n ,/w 也/d 为了/p 便于/v 利用/v NiuTrans/x 用于/v SMT/x 研究/v", 128 | "长春市/ns 长春/ns 药店/n", 129 | "邓颖超/np 生前/t 最/d 喜欢/v 的/u 衣服/n", 130 | "胡锦涛/np 是/v 热爱/v 世界/n 和平/n 的/u 政治局/n 常委/n", 131 | "程序员/n 祝海林/np 和/c 朱会震/np 是/v 在/p 孙健/np 的/u 左面/f 和/c 右面/f ,/w 范凯/np 在/p 最/d 右面/f ./w 再/d 往/p 左/f 是/v " + 132 | "李松洪/np", 133 | "一次性/d 交/v 多少/r 钱/n", 134 | "小/a 和/c 尚/d 留/v 了/u 一个/m 像/p 大/a 和尚/n 一样/a 的/u 和尚/n 头/n", 135 | "我/r 是/v 中华人民共和国/ns 公民/n ;/w 我/r 爸爸/n 是/v 共和党/n 党员/n ;/w 地铁/n 和平/n 门站/n", 136 | "张晓梅/np 去/v 人民/n 医院/n 做/v 了/u 个/q B/x 超然/a 后/f 去/v 买/v 了/u 件/q T/m 恤/q", 137 | "C/x +/w +/w 和/c c/g #/w 是/v 什么/r 关系/n ?/w 11/m +/w 122/m =/w 133/m ,/w 是/v 吗/u ?/w PI/x =/w 3.14159/m", 138 | "你/r 认识/v 那个/r 和/c 主席/n 握手/v 的/u 的/u 哥/j 吗/u ?/w 他/r 开/v 一/m 辆/q 黑色/n 的士/n 。/w", 139 | "2017-10-13/m 给/p 你/r 发/v offer/x 了/u 吗/u ?/w 27日/t 发/v iphone5/x 了/u 吗/u", 140 | "本报/r 讯/g 深圳市/ns 海王/nz 生物/n 工程/n 股份/n 有限公司/n 二○○○/m 年度/n 增/v 发/v A/x 股/n 路/n 
演/v 推介会/n 日前/t 在/p 北京/ns 举行/v", 141 | "共同/d 创造/v 美好/a 的/u 新/a 世纪/n ——/w 2001年/t 新年/t 贺词/n", 142 | }; 143 | 144 | POSTagger posTagger = new POSTagger(POS_WEIGHTS_PATH, POS_FEATURES_PATH); 145 | posTagger.enableTitleWord(); 146 | for (int i = 0; i < SENTENCES.length; i++) { 147 | String actual = posTagger.tagging(SENTENCES[i]) 148 | .stream() 149 | .map(TokenItem::toString) 150 | .collect(Collectors.joining(" ")); 151 | assertEquals(expectedResults[i], actual); 152 | } 153 | 154 | long length = 0L; 155 | long start = System.currentTimeMillis(); 156 | for (int i = 0; i < 100; ++i) { 157 | for (String sentence : SENTENCES) { 158 | posTagger.tagging(sentence); 159 | length += sentence.getBytes(StandardCharsets.UTF_8).length; 160 | } 161 | } 162 | long elapsed = (System.currentTimeMillis() - start); 163 | System.out.println(String.format("time elapsed: %d ms, rate: %.2f kb/s.", 164 | elapsed, (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f))); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/SPChineseTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import io.github.yizhiru.thulac4j.perceptron.StructuredPerceptronClassifier; 4 | import org.junit.Test; 5 | import org.junit.runner.RunWith; 6 | import org.powermock.core.classloader.annotations.PrepareForTest; 7 | import org.powermock.modules.junit4.PowerMockRunner; 8 | import org.powermock.reflect.internal.WhiteboxImpl; 9 | 10 | import java.io.FileInputStream; 11 | import java.util.Arrays; 12 | 13 | import static org.junit.Assert.assertArrayEquals; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | @RunWith(PowerMockRunner.class) 17 | @PrepareForTest(StructuredPerceptronClassifier.class) 18 | public class SPChineseTokenizerTest { 19 | 20 | /** 21 | * Segmenter weights model path. 
22 | */ 23 | public static final String SEG_WEIGHTS_PATH = "models/cws_model.bin"; 24 | 25 | /** 26 | * Segmenter features path. 27 | */ 28 | public static final String SEG_FEATURES_PATH = "models/cws_dat.bin"; 29 | 30 | public static final String SEG_LABELS_PATH = "models/cws_label.txt"; 31 | 32 | /** 33 | * POSTagger weights model path. 34 | */ 35 | public static final String POS_WEIGHTS_PATH = "models/model_c_model.bin"; 36 | 37 | /** 38 | * POSTagger features path. 39 | */ 40 | public static final String POS_FEATURES_PATH = "models/model_c_dat.bin"; 41 | 42 | public static final String POS_LABELS_PATH = "models/model_c_label.txt"; 43 | 44 | @Test 45 | public void setPreviousTrans() throws Exception { 46 | SPChineseTokenizer tokenizer = new SPChineseTokenizer( 47 | new FileInputStream(SEG_WEIGHTS_PATH), 48 | new FileInputStream(SEG_FEATURES_PATH), 49 | new FileInputStream(SEG_LABELS_PATH)); 50 | StructuredPerceptronClassifier classifier = WhiteboxImpl.getInternalState(tokenizer, "classifier"); 51 | int[][] previousTrans = WhiteboxImpl.invokeMethod( 52 | tokenizer, 53 | "setPreviousTransitions", 54 | new Class[]{String[].class}, 55 | (Object) classifier.getLabelValues()); 56 | 57 | assertEquals("[[1, 2], [0, 3], [1, 2], [0, 3]]", 58 | Arrays.deepToString(previousTrans)); 59 | 60 | tokenizer = new SPChineseTokenizer( 61 | new FileInputStream(POS_WEIGHTS_PATH), 62 | new FileInputStream(POS_FEATURES_PATH), 63 | new FileInputStream(POS_LABELS_PATH)); 64 | classifier = WhiteboxImpl.getInternalState(tokenizer, "classifier"); 65 | previousTrans = WhiteboxImpl.invokeMethod( 66 | tokenizer, 67 | "setPreviousTransitions", 68 | new Class[]{String[].class}, 69 | (Object) classifier.getLabelValues()); 70 | assertEquals("[1, 2, 4, 5, 7, 10, 13, 15, 17, 18, 19, 23, 25, 27, " + 71 | "30, 32, 33, 34, 35, 36, 37, 38, 39, 41, 44, 45, 48, 50, 53, " + 72 | "56, 57, 59, 61, 63, 67, 69, 72, 74, 76, 80, 81, 82, 83, 88, " + 73 | "89, 90, 91, 95]", 74 | Arrays.toString(previousTrans[0])); 
75 | assertEquals("[0, 20]", Arrays.toString(previousTrans[1])); 76 | assertEquals("[54, 55]", Arrays.toString(previousTrans[56])); 77 | assertEquals("[93, 94]", Arrays.toString(previousTrans[95])); 78 | } 79 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/SegmenterTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j; 2 | 3 | import org.junit.FixMethodOrder; 4 | import org.junit.Test; 5 | import org.junit.runners.MethodSorters; 6 | 7 | import java.nio.charset.StandardCharsets; 8 | import java.util.Arrays; 9 | import java.util.stream.Collectors; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | 13 | @FixMethodOrder(MethodSorters.NAME_ASCENDING) 14 | public class SegmenterTest { 15 | 16 | static final String[] SENTENCES = new String[]{ 17 | "因", 18 | " ", 19 | "", 20 | "UTF-8", 21 | "iphone5", 22 | "鲜芋仙 3", 23 | "枪杆子中出政权", 24 | "两块五一套,三块八一斤,四块七一本,五块六一条", 25 | "RT @laoshipukong : 27日,", 26 | "AT&T是一件不错的公司,给你发offer了吗?", 27 | "4个月赚了20%多", 28 | "仅1只,为0.9923元", 29 | "Just one space, or all such spaces?", 30 | "倒模,替身算什么?钟汉良、ab《孤芳不自赏》抠图来充数", 31 | "奥迪CEO违规遭批 大众表示不会解雇", 32 | "找小姐", 33 | "找小妹", 34 | "学生妹", 35 | "职业狐狸精", 36 | "男公关", 37 | "上门", 38 | "抽獎", 39 | "好声音", 40 | "好聲音", 41 | "夢之声", 42 | "夢之聲", 43 | "訂票", 44 | "改簽", 45 | "熱线", 46 | "熱線", 47 | "热線", 48 | "電话", 49 | "電話", 50 | "醫院", 51 | "代刷", 52 | "撲剋牌", 53 | "137-1234-1234", 54 | "这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。", 55 | "我不喜欢日本和服。", 56 | "雷猴回归人间。", 57 | "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 58 | "我需要廉租房", 59 | "永和服装饰品有限公司", 60 | "我爱北京天安门", 61 | "abc", 62 | "隐马尔可夫", 63 | "雷猴是个好网站", 64 | "“,”和“SOFTware(软件)”两部分组成", 65 | "草泥马和欺实马是今年的流行词汇", 66 | "伊藤洋华堂总府店", 67 | "中国科学院计算技术研究所", 68 | "罗密欧与朱丽叶", 69 | "我购买了道具和服装", 70 | "PS: 我觉得开源有一个好处,就是能够敦促自己不断改进,避免敞帚自珍", 71 | "湖北省石首市", 72 | "湖北省十堰市", 73 | "总经理完成了这件事情", 74 | "电脑修好了", 75 | "做好了这件事情就一了百了了", 76 | 
"人们审美的观点是不同的", 77 | "我们买了一个美的空调", 78 | "线程初始化时我们要注意", 79 | "一个分子是由好多原子组织成的", 80 | "祝你马到功成", 81 | "他掉进了无底洞里", 82 | "中国的首都是北京", 83 | "孙君意", 84 | "外交部发言人马朝旭", 85 | "领导人会议和第四届东亚峰会", 86 | "在过去的这五年", 87 | "还需要很长的路要走", 88 | "60周年首都阅兵", 89 | "你好人们审美的观点是不同的", 90 | "买水果然后来世博园", 91 | "买水果然后去世博园", 92 | "但是后来我才知道你是对的", 93 | "存在即合理", 94 | "的的的的的在的的的的就以和和和", 95 | "I love你,不以为耻,反以为rong", 96 | "hello你好人们审美的观点是不同的", 97 | "很好但主要是基于网页形式", 98 | "为什么我不能拥有想要的生活", 99 | "后来我才", 100 | "此次来中国是为了", 101 | "使用了它就可以解决一些问题", 102 | ",使用了它就可以解决一些问题", 103 | "其实使用了它就可以解决一些问题", 104 | "好人使用了它就可以解决一些问题", 105 | "是因为和国家", 106 | "老年搜索还支持", 107 | "干脆就把那部蒙人的闲法给废了拉倒!RT @laoshipukong : 27日,全国人大常委会第三次审议" + 108 | "侵权责任法草案,删除了有关医疗损害责任“举证倒置”的规定。在医患纠纷中本已处于" + 109 | "弱势地位的消费者由此将陷入万劫不复的境地。 ", 110 | "他说的确实在理", 111 | "长春市长春节讲话", 112 | "结婚的和尚未结婚的", 113 | "结合成分子时", 114 | "旅游和服务是最好的", 115 | "这件事情的确是我的错", 116 | "供大家参考指正", 117 | "哈尔滨政府公布塌桥原因", 118 | "我在机场入口处", 119 | "邢永臣摄影报道", 120 | "BP神经网络如何训练才能在分类时增加区分度?", 121 | "南京市长江大桥", 122 | "应一些使用者的建议,也为了便于利用NiuTrans用于SMT研究", 123 | "长春市长春药店", 124 | "邓颖超生前最喜欢的衣服", 125 | "胡锦涛是热爱世界和平的政治局常委", 126 | "程序员祝海林和朱会震是在孙健的左面和右面, 范凯在最右面.再往左是李松洪", 127 | "一次性交多少钱", 128 | "小和尚留了一个像大和尚一样的和尚头", 129 | "我是中华人民共和国公民;我爸爸是共和党党员; 地铁和平门站", 130 | "张晓梅去人民医院做了个B超然后去买了件T恤", 131 | "C++和c#是什么关系?11+122=133,是吗?PI=3.14159", 132 | "你认识那个和主席握手的的哥吗?他开一辆黑色的士。", 133 | "2017-10-13给你发offer了吗?27日发iphone5了吗", 134 | "本报讯深圳市海王生物工程股份有限公司二○○○年度增发A股路演推介会日前在北京举行", 135 | "共同创造美好的新世纪——2001年新年贺词", 136 | }; 137 | 138 | @Test 139 | public void segment() { 140 | String[] expectedResults = new String[]{ 141 | "因", 142 | "", 143 | "", 144 | "UTF - 8", 145 | "iphone5", 146 | "鲜芋仙 3", 147 | "枪杆子 中 出 政权", 148 | "两 块 五一套 , 三 块 八一斤 , 四 块 七 一 本 , 五 块 六 一 条", 149 | "RT @ laoshipukong : 27日 ,", 150 | "AT&T 是 一 件 不错 的 公司 , 给 你 发 offer 了 吗 ?", 151 | "4 个 月 赚 了 20% 多", 152 | "仅 1 只 , 为 0.9923 元", 153 | "Just one space , or all such spaces ?", 154 | "倒模 , 替身 算 什么 ? 
钟汉良 、 ab 《 孤芳不自赏 》 抠 图 来 充数", 155 | "奥迪 CEO 违规 遭 批 大众 表示 不 会 解雇", 156 | "找 小姐", 157 | "找 小 妹", 158 | "学生 妹", 159 | "职业 狐狸 精", 160 | "男 公关", 161 | "上门", 162 | "抽獎", 163 | "好 声音", 164 | "好 聲音", 165 | "夢 之 声", 166 | "夢 之 聲", 167 | "訂票", 168 | "改簽", 169 | "熱线", 170 | "熱線", 171 | "热線", 172 | "電话", 173 | "電話", 174 | "醫院", 175 | "代刷", 176 | "撲剋牌", 177 | "137-1234-1234", 178 | "这 是 一个 伸手不见五指 的 黑夜 。 我 叫 孙悟空 , 我 爱 北京 , 我 爱 Python 和 C + + 。", 179 | "我 不 喜欢 日本 和服 。", 180 | "雷猴 回归 人间 。", 181 | "工信 处女 干事 每月 经过 下属 科室 都 要 亲口 交代 24 口 交换机 等 技术性 器件 的 安装 工作", 182 | "我 需要 廉 租 房", 183 | "永 和 服装 饰品 有限公司", 184 | "我 爱 北京 天安门", 185 | "abc", 186 | "隐马尔可夫", 187 | "雷猴 是 个 好 网站", 188 | "“ , ” 和 “ SOFTware ( 软件 ) ” 两 部分 组成", 189 | "草泥马 和 欺实马 是 今年 的 流行 词汇", 190 | "伊藤 洋华堂 总府 店", 191 | "中国 科学院 计算 技术 研究所", 192 | "罗密欧 与 朱丽叶", 193 | "我 购买 了 道具 和 服装", 194 | "PS : 我 觉得 开源 有 一个 好处 , 就是 能够 敦促 自己 不断 改进 , 避免 敞帚自珍", 195 | "湖北省 石首市", 196 | "湖北省 十堰市", 197 | "总经理 完成 了 这 件 事情", 198 | "电脑 修好 了", 199 | "做 好 了 这 件 事情 就 一了百了 了", 200 | "人们 审美 的 观点 是 不同 的", 201 | "我们 买 了 一个 美 的 空调", 202 | "线程 初始化 时 我们 要 注意", 203 | "一个 分子 是 由 好多 原子 组织 成 的", 204 | "祝 你 马到功成", 205 | "他 掉 进 了 无 底洞 里", 206 | "中国 的 首都 是 北京", 207 | "孙君意", 208 | "外交部 发言人 马朝旭", 209 | "领导人 会议 和 第四 届 东亚 峰会", 210 | "在 过去 的 这 五 年", 211 | "还 需要 很 长 的 路 要 走", 212 | "60 周年 首都 阅兵", 213 | "你好 人们 审美 的 观点 是 不同 的", 214 | "买 水 果然 后来 世博园", 215 | "买 水果 然后 去世 博园", 216 | "但是 后来 我 才 知道 你 是 对 的", 217 | "存在 即 合理", 218 | "的 的 的 的 的 在 的 的 的 的 就 以 和 和 和", 219 | "I love 你 , 不以为耻 , 反 以为 rong", 220 | "hello 你好 人们 审美 的 观点 是 不同 的", 221 | "很 好 但 主要 是 基于 网页 形式", 222 | "为什么 我 不 能 拥有 想 要 的 生活", 223 | "后来 我 才", 224 | "此次 来 中国 是 为了", 225 | "使用 了 它 就 可以 解决 一些 问题", 226 | ", 使用 了 它 就 可以 解决 一些 问题", 227 | "其实 使用 了 它 就 可以 解决 一些 问题", 228 | "好人 使用 了 它 就 可以 解决 一些 问题", 229 | "是 因为 和 国家", 230 | "老年 搜索 还 支持", 231 | "干脆 就 把 那 部 蒙人 的 闲法 给 废 了 拉倒 ! 
RT @ laoshipukong : 27日 , 全国 人大 常委会 第三 次 审议 侵权 责任法 草案 , 删除 了 有关 医疗 损害 " + 232 | "责任 “ 举证 倒置 ” 的 规定 。 在 医患 纠纷 中 本 已 处于 弱势 地位 的 消费者 由此 将 陷入 万劫不复 的 境地 。", 233 | "他 说 的 确实 在理", 234 | "长春 市长 春节 讲话", 235 | "结婚 的 和 尚未 结婚 的", 236 | "结合 成分子 时", 237 | "旅游 和 服务 是 最 好 的", 238 | "这 件 事情 的确 是 我 的 错", 239 | "供 大家 参考 指正", 240 | "哈尔滨 政府 公布 塌桥 原因", 241 | "我 在 机场 入口处", 242 | "邢永臣 摄影 报道", 243 | "BP 神经 网络 如何 训练 才 能 在 分类 时 增加 区 分度 ?", 244 | "南京市 长江 大桥", 245 | "应 一些 使用者 的 建议 , 也 为了 便于 利用 NiuTrans 用于 SMT 研究", 246 | "长春市 长春 药店", 247 | "邓颖超 生前 最 喜欢 的 衣服", 248 | "胡锦涛 是 热爱 世界 和平 的 政治局 常委", 249 | "程序员 祝海林 和 朱会震 是 在 孙健 的 左面 和 右面 , 范凯 在 最 右 面 . 再 往 左 是 李松洪", 250 | "一次性 交 多少 钱", 251 | "小 和尚 留 了 一个 像 大 和 尚 一样 的 和尚 头", 252 | "我 是 中华人民共和国 公民 ; 我 爸爸 是 共和党 党员 ; 地铁 和平门站", 253 | "张晓梅 去 人民 医院 做 了 个 B 超然 后 去 买 了 件 T 恤", 254 | "C + + 和 c # 是 什么 关系 ? 11 + 122 = 133 , 是 吗 ? PI = 3.14159", 255 | "你 认识 那个 和 主席 握手 的 的 哥 吗 ? 他 开 一 辆 黑色 的 士 。", 256 | "2017-10-13 给 你 发 offer 了 吗 ? 27日 发 iphone5 了 吗", 257 | "本报 讯 深圳市 海王 生物 工程 股份 有限公司 二○○○ 年度 增 发 A 股 路演 推介会 日前 在 北京 举行", 258 | "共同 创造 美好 的 新 世纪 —— 2001年 新年 贺词", 259 | }; 260 | 261 | Segmenter.enableTitleWord(); 262 | for (int i = 0; i < SENTENCES.length; i++) { 263 | String actual = String.join(" ", Segmenter.segment(SENTENCES[i])); 264 | assertEquals(expectedResults[i], actual); 265 | } 266 | 267 | long length = 0L; 268 | long start = System.currentTimeMillis(); 269 | for (int i = 0; i < 1000; ++i) { 270 | for (String sentence : SENTENCES) { 271 | Segmenter.segment(sentence); 272 | length += sentence.getBytes(StandardCharsets.UTF_8).length; 273 | } 274 | } 275 | long elapsed = (System.currentTimeMillis() - start); 276 | System.out.println(String.format("time elapsed: %d ms, rate: %.2f kb/s.", 277 | elapsed, (length * 1.0) / 1024.0f / (elapsed * 1.0 / 1000.0f))); 278 | } 279 | 280 | @Test 281 | public void addUserWords() { 282 | Segmenter.addUserWords(Arrays.asList("中国风", "淡雅茗香")); 283 | assertEquals("浓浓的,中国风,淡雅茗香,古风", 284 | String.join(",", Segmenter.segment("浓浓的中国风 
淡雅茗香古风"))); 285 | } 286 | 287 | @Test 288 | public void zFilterStopWords() { 289 | Segmenter.enableFilterStopWords(); 290 | assertEquals("我,能,做,的,事,绝不,推诿,到,下,一", 291 | String.join(",", Segmenter.segment("此时我能做的事,绝不推诿到下一时刻;"))); 292 | assertEquals("H,歌,你,的,猎豹,要是,有,你,的,嘴,那么,硬,有,多,好", 293 | String.join(",", Segmenter.segment("【H歌】你的猎豹要是有你的嘴那么硬有多好"))); 294 | assertEquals("沿江,高铁,雏形,初,现,湖北,要,做,祖国,立交桥", 295 | String.join(",", Segmenter.segment("沿江高铁雏形初现:湖北要做“祖国立交桥”"))); 296 | assertEquals("学,得,好,却,总是,考,不好,是,回,事", 297 | String.join(",", Segmenter.segment("「学得好却总是考不好」是怎么回事?"))); 298 | } 299 | } 300 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/common/DoubleArrayTrieTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.common; 2 | 3 | import io.github.yizhiru.thulac4j.util.ModelPaths; 4 | import org.junit.Test; 5 | 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.List; 10 | import java.util.Set; 11 | import java.util.stream.Collectors; 12 | 13 | import static org.junit.Assert.*; 14 | 15 | public class DoubleArrayTrieTest { 16 | 17 | @Test 18 | public void isMatched() throws IOException { 19 | DoubleArrayTrie dat = DoubleArrayTrie.loadDat("." 
+ ModelPaths.NS_BIN_PATH); 20 | assertTrue(dat.isPrefixMatched("黑龙江")); 21 | assertTrue(dat.isWordMatched("黑龙江")); 22 | assertTrue(dat.isWordMatched("齐齐哈尔")); 23 | assertTrue(dat.isWordMatched("名古屋")); 24 | assertTrue(dat.isWordMatched("克拉约瓦")); 25 | assertTrue(dat.isWordMatched("10月9日街")); 26 | assertTrue(dat.isWordMatched("鸡公?")); 27 | assertTrue(dat.isWordMatched("齐白石纪念馆")); 28 | assertTrue(dat.isWordMatched("龙格伦吉里")); 29 | assertTrue(dat.isWordMatched("特德本-圣玛丽")); 30 | assertFalse(dat.isWordMatched("首乌")); 31 | } 32 | 33 | @Test 34 | public void serialize() throws IOException { 35 | String[] dictPaths = new String[]{ 36 | ModelPaths.IDIOM_DICT_PATH, 37 | ModelPaths.NS_DICT_PATH, 38 | ModelPaths.STOP_WORDS_DICT_PATH, 39 | }; 40 | String[] binPaths = new String[]{ 41 | "." + ModelPaths.IDIOM_BIN_PATH, 42 | "." + ModelPaths.NS_BIN_PATH, 43 | "." + ModelPaths.STOP_WORDS_BIN_PATH, 44 | }; 45 | for (int i = 0; i < dictPaths.length; i++) { 46 | DoubleArrayTrie expect = DoubleArrayTrie.make(dictPaths[i]); 47 | expect.serialize(binPaths[i]); 48 | DoubleArrayTrie actual = DoubleArrayTrie.loadDat(binPaths[i]); 49 | 50 | assertEquals(expect.size(), actual.size()); 51 | for (int j = 0; j < expect.size(); j++) { 52 | assertEquals(expect.getBaseByIndex(j), actual.getBaseByIndex(j)); 53 | assertEquals(expect.getCheckByIndex(j), actual.getCheckByIndex(j)); 54 | } 55 | } 56 | } 57 | 58 | @Test 59 | public void make() throws IOException { 60 | String[] paths = new String[]{ 61 | ModelPaths.NS_DICT_PATH, 62 | ModelPaths.IDIOM_DICT_PATH, 63 | ModelPaths.STOP_WORDS_DICT_PATH 64 | }; 65 | for (String path : paths) { 66 | List lexicon = Files.lines(Paths.get(path)) 67 | .map(String::trim) 68 | .collect(Collectors.toList()); 69 | DoubleArrayTrie dat = DoubleArrayTrie.make(path); 70 | for (String word : lexicon) { 71 | if (word.length() > 1) { 72 | assertTrue(dat.isPrefixMatched(word.substring(0, word.length() - 1))); 73 | } 74 | assertTrue(dat.isWordMatched(word)); 75 | } 76 | } 77 | } 
78 | 79 | @Test 80 | public void restore() throws IOException { 81 | String[] binPaths = new String[]{ 82 | "." + ModelPaths.NS_BIN_PATH, 83 | "." + ModelPaths.IDIOM_BIN_PATH, 84 | "." + ModelPaths.STOP_WORDS_BIN_PATH 85 | }; 86 | String[] dictPaths = new String[]{ 87 | ModelPaths.NS_DICT_PATH, 88 | ModelPaths.IDIOM_DICT_PATH, 89 | ModelPaths.STOP_WORDS_DICT_PATH 90 | }; 91 | 92 | for (int i = 0; i < binPaths.length; i++) { 93 | DoubleArrayTrie dat = DoubleArrayTrie.loadDat(binPaths[i]); 94 | Set dict = Files.lines(Paths.get(dictPaths[i])) 95 | .map(String::trim) 96 | .collect(Collectors.toSet()); 97 | List words = DoubleArrayTrie.restore(dat); 98 | for (String word : words) { 99 | assertTrue(dict.contains(word)); 100 | } 101 | assertEquals(dict.size(), words.size()); 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/perceptron/StructuredPerceptronModelTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.perceptron; 2 | 3 | import io.github.yizhiru.thulac4j.process.RuleAnnotator; 4 | import io.github.yizhiru.thulac4j.term.POC; 5 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 6 | import org.junit.Test; 7 | 8 | import java.io.FileInputStream; 9 | import java.io.IOException; 10 | import java.util.Arrays; 11 | 12 | import static io.github.yizhiru.thulac4j.SPChineseTokenizerTest.*; 13 | import static org.junit.Assert.assertArrayEquals; 14 | import static org.junit.Assert.assertEquals; 15 | 16 | public class StructuredPerceptronModelTest { 17 | 18 | @Test 19 | public void loadSegmenterModel() throws IOException { 20 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 21 | new FileInputStream(SEG_WEIGHTS_PATH), 22 | new FileInputStream(SEG_FEATURES_PATH), 23 | new FileInputStream(SEG_LABELS_PATH)); 24 | assertEquals(2453880, SPModel.featureSize); 25 | assertEquals(4, 
SPModel.labelSize); 26 | assertEquals(-42717, SPModel.llWeights[0]); 27 | assertEquals(-4958, SPModel.flWeights[0]); 28 | 29 | assertArrayEquals(new String[]{"3"}, getPosValue(POC.SINGLE_NUMERAL_POC, SPModel)); 30 | assertArrayEquals(new String[]{"0"}, getPosValue(POC.BEGIN_POC, SPModel)); 31 | assertArrayEquals(new String[]{"1"}, getPosValue(POC.MIDDLE_POC, SPModel)); 32 | assertArrayEquals(new String[]{"2"}, getPosValue(POC.END_POC, SPModel)); 33 | assertArrayEquals(new String[]{"3"}, getPosValue(POC.SINGLE_POC, SPModel)); 34 | assertArrayEquals(new String[]{"0", "3"}, getPosValue(POC.BEGIN_OR_SINGLE_POC, SPModel)); 35 | assertArrayEquals(new String[]{"2", "3"}, getPosValue(POC.END_OR_SINGLE_POC, SPModel)); 36 | assertArrayEquals( 37 | new String[]{"0", "2", "3", "1"}, 38 | getPosValue(POC.DEFAULT_POC, SPModel)); 39 | } 40 | 41 | @Test 42 | public void loadPosModel() throws IOException { 43 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 44 | new FileInputStream(POS_WEIGHTS_PATH), 45 | new FileInputStream(POS_FEATURES_PATH), 46 | new FileInputStream(POS_LABELS_PATH)); 47 | assertEquals(961470, SPModel.featureSize); 48 | assertEquals(96, SPModel.labelSize); 49 | assertEquals(-10615, SPModel.llWeights[0]); 50 | assertEquals(5481, SPModel.flWeights[0]); 51 | 52 | assertArrayEquals(new String[]{"3w"}, getPosValue(POC.PUNCTUATION_POC, SPModel)); 53 | assertArrayEquals(new String[]{"0m"}, getPosValue(POC.BEGIN_NUMERAL_POC, SPModel)); 54 | assertArrayEquals(new String[]{"1m"}, getPosValue(POC.MIDDLE_NUMERAL_POC, SPModel)); 55 | assertArrayEquals(new String[]{"2m"}, getPosValue(POC.END_NUMERAL_POC, SPModel)); 56 | assertArrayEquals(new String[]{"3m"}, getPosValue(POC.SINGLE_NUMERAL_POC, SPModel)); 57 | assertArrayEquals( 58 | new String[]{"0v", "0n", "0ns", "0t", "0f", "0d", "0m", "0q", "0r", "0j", "0s", "0a", 59 | "0id", "0ni", "0p", "0c", "0np", "0nz", "0w", "0u", "0o", "0x", "0e", "0k"}, 60 | getPosValue(POC.BEGIN_POC, SPModel)); 61 | 
assertArrayEquals( 62 | new String[]{"1n", "1ns", "1t", "1v", "1m", "1j", "1id", "1ni", "1c", "1np", "1d", "1a", 63 | "1nz", "1w", "1q", "1s", "1f", "1r", "1x", "1o", "1p", "1e", "1u", "1k"}, 64 | getPosValue(POC.MIDDLE_POC, SPModel)); 65 | assertArrayEquals( 66 | new String[]{"2v", "2n", "2ns", "2t", "2f", "2d", "2m", "2q", "2r", "2j", "2s", "2a", "2id", 67 | "2ni", "2p", "2c", "2np", "2nz", "2w", "2u", "2o", "2x", "2e", "2k"}, 68 | getPosValue(POC.END_POC, SPModel)); 69 | assertArrayEquals( 70 | new String[]{"3p", "3v", "3w", "3f", "3u", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "3r", 71 | "3j", "3np", "3x", "3k", "3o", "3e", "3h", "3t", "3ni", "3s", "3nz"}, 72 | getPosValue(POC.SINGLE_POC, SPModel)); 73 | assertArrayEquals( 74 | new String[]{"0v", "3p", "0n", "3v", "3w", "0ns", "0t", "0f", "0d", "3f", "3u", "0m", "0q", "0r", 75 | "0j", "0s", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "0a", "0id", "3r", "0ni", "0p", "0c", 76 | "0np", "3j", "3np", "3x", "0nz", "0w", "0u", "3k", "3o", "0o", "0x", "3e", "3h", "3t", "0e", 77 | "3ni", "3s", "3nz", "0k"}, 78 | getPosValue(POC.BEGIN_OR_SINGLE_POC, SPModel)); 79 | assertArrayEquals( 80 | new String[]{"2v", "3p", "2n", "3v", "3w", "2ns", "2t", "2f", "2d", "3f", "3u", "2m", "2q", "2r", 81 | "2j", "2s", "3a", "3c", "3g", "3m", "3q", "3d", "3n", "2a", "2id", "3r", "2ni", "2p", "2c", 82 | "2np", "3j", "3np", "3x", "2nz", "2w", "2u", "3k", "3o", "2o", "2x", "3e", "3h", "3t", "2e", 83 | "3ni", "3s", "3nz", "2k"}, 84 | getPosValue(POC.END_OR_SINGLE_POC, SPModel)); 85 | } 86 | 87 | /** 88 | * 根据POS 得到对应的所有label 89 | * 90 | * @param pos enum POC 值 91 | * @param SPModel StructuredPerceptronModel 对象 92 | * @return pos 对应的所有label 93 | */ 94 | private String[] getPosValue(POC pos, StructuredPerceptronModel SPModel) { 95 | return Arrays.stream(SPModel.allowTabular[pos.ordinal()]) 96 | .mapToObj(t -> SPModel.labelValues[t]) 97 | .toArray(String[]::new); 98 | } 99 | 100 | @Test 101 | public void evaluateCharWeights() throws IOException { 
102 | StructuredPerceptronModel SPModel = new StructuredPerceptronModel( 103 | new FileInputStream(SEG_WEIGHTS_PATH), 104 | new FileInputStream(SEG_FEATURES_PATH), 105 | new FileInputStream(SEG_LABELS_PATH)); 106 | String[] sentences = new String[]{ 107 | "鲜", 108 | "两块五一套,", 109 | "AT&T是" 110 | }; 111 | String[] expectedWeights = new String[]{ 112 | "[[0, 0, -2664, 0]]", 113 | "[[-4384, 0, 21415, 0], [-22789, 1568, 24039, -2808], [21771, -3627, -11546, -6585], [-13779, -3998, " + 114 | "7844, 9945], [0, 19768, 18906, 0], [0, 0, 40833, 0]]", 115 | "[[1857, 0, 0, 0], [0, 0, 0, 15367], [0, 0, 0, 8591], [0, 15227, 0, 0], [0, 0, 22574, 0]]", 116 | }; 117 | 118 | for (int i = 0; i < sentences.length; i++) { 119 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 120 | char[] chars = annotatedTerms.appendBoundaryAround(); 121 | 122 | POC[] pocs = annotatedTerms.getPocs(); 123 | 124 | int[][] weights = new int[annotatedTerms.getAnnotatedLength()][]; 125 | for (int j = 0; j < annotatedTerms.getAnnotatedLength(); j++) { 126 | int[] labelIndices = SPModel.allowTabular[pocs[j].ordinal()]; 127 | weights[j] = SPModel.evaluateCharWeights( 128 | chars[j], 129 | chars[j + 1], 130 | chars[j + 2], 131 | chars[j + 3], 132 | chars[j + 4], 133 | labelIndices); 134 | } 135 | 136 | assertEquals(expectedWeights[i], Arrays.deepToString(weights)); 137 | } 138 | 139 | SPModel = new StructuredPerceptronModel( 140 | new FileInputStream(POS_WEIGHTS_PATH), 141 | new FileInputStream(POS_FEATURES_PATH), 142 | new FileInputStream(POS_LABELS_PATH)); 143 | String[] expected0Weights = new String[]{ 144 | "[0, 0, -577, 0, 0, -6529, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -997, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0," + 145 | " 0, 0, 0, 12863, 0, 1074, -4387, 0, -1926, -2411, 0, 0, 0, 0, 0, -910, 0, 0, 0, 0, 0, 0, 0, " + 146 | "0, 0, 0, 0, -1841, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 147 | "0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 148 | "[-5754, 0, 
-1939, 813, 0, -4395, 0, 940, -3822, 0, 0, 166, 0, 0, 7178, 0, 979, 0, -1127, -639, 0, " + 149 | "9709, 0, 0, 3389, 0, 7075, 0, 6760, 0, 0, 3892, 0, 1710, -943, -7110, 31462, 5834, -472, " + 150 | "3806, -577, 0, 1626, 0, 0, -4558, -1971, 0, 0, 0, 0, -985, 0, 0, -1399, 0, 0, -1704, 0, " + 151 | "-775, 0, 0, -5751, 0, 0, 0, 0, 0, -147, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 152 | "0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 153 | "[-5660, 0, 0, -8860, 0, 0, 0, 0, 740, 0, 0, 0, 0, 0, 0, 0, -987, 0, 0, 0, 0, -2864, 0, 0, -953, 0, " + 154 | "0, 0, -1068, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -4410, 0, -2860, 0, 0, 0, 4055, 0, 0, 0, 0, " + 155 | "-996, 0, 0, 348, 0, 0, 0, 0, 0, 0, 0, 2363, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, " + 156 | "13543, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]", 157 | }; 158 | for (int i = 0; i < sentences.length; i++) { 159 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 160 | char[] chars = annotatedTerms.appendBoundaryAround(); 161 | 162 | POC[] pocs = annotatedTerms.getPocs(); 163 | int[] labelIndices = SPModel.allowTabular[pocs[0].ordinal()]; 164 | int[] weights = SPModel.evaluateCharWeights( 165 | chars[0], 166 | chars[1], 167 | chars[2], 168 | chars[3], 169 | chars[4], 170 | labelIndices); 171 | assertEquals(expected0Weights[i], Arrays.toString(weights)); 172 | } 173 | } 174 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/LexiconCementerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.util.ModelPaths; 4 | import io.github.yizhiru.thulac4j.term.TokenItem; 5 | import org.junit.Test; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayList; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | public 
class LexiconCementerTest { 15 | 16 | @Test 17 | public void cement() throws IOException { 18 | LexiconCementer cementer = new LexiconCementer( 19 | this.getClass().getResourceAsStream(ModelPaths.NS_BIN_PATH), 20 | "ns"); 21 | List tokenItems = new ArrayList<>(Arrays.asList( 22 | new TokenItem("黑", null), 23 | new TokenItem("龙", "n"), 24 | new TokenItem("江", "j")) 25 | ); 26 | cementer.cement(tokenItems); 27 | assertEquals("[黑龙江/ns]", tokenItems.toString()); 28 | 29 | cementer = new LexiconCementer( 30 | this.getClass().getResourceAsStream(ModelPaths.IDIOM_BIN_PATH), 31 | "i"); 32 | tokenItems = new ArrayList<>(Arrays.asList( 33 | new TokenItem("掉", null), 34 | new TokenItem("进", "n"), 35 | new TokenItem("了", "j"), 36 | new TokenItem("无", "n"), 37 | new TokenItem("底洞", "j")) 38 | ); 39 | cementer.cement(tokenItems); 40 | assertEquals("[掉, 进/n, 了/j, 无/n, 底洞/j]", tokenItems.toString()); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/RuleAnnotatorTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.term.POC; 4 | import io.github.yizhiru.thulac4j.term.AnnotatedTerms; 5 | import org.junit.Test; 6 | 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | import java.util.stream.Collectors; 10 | import java.util.stream.Stream; 11 | 12 | import static org.junit.Assert.assertEquals; 13 | 14 | /** 15 | * RuleAnnotator Test. 
16 | */ 17 | public class RuleAnnotatorTest { 18 | 19 | @Test 20 | public void annotate() { 21 | Map pocStringHashMap = new HashMap<>(POC.values().length); 22 | pocStringHashMap.put(POC.PUNCTUATION_POC, "w"); 23 | pocStringHashMap.put(POC.BEGIN_NUMERAL_POC, "bm"); 24 | pocStringHashMap.put(POC.MIDDLE_NUMERAL_POC, "mm"); 25 | pocStringHashMap.put(POC.END_NUMERAL_POC, "em"); 26 | pocStringHashMap.put(POC.SINGLE_NUMERAL_POC, "sm"); 27 | pocStringHashMap.put(POC.BEGIN_POC, "b"); 28 | pocStringHashMap.put(POC.MIDDLE_POC, "m"); 29 | pocStringHashMap.put(POC.END_POC, "e"); 30 | pocStringHashMap.put(POC.SINGLE_POC, "s"); 31 | pocStringHashMap.put(POC.BEGIN_OR_SINGLE_POC, "bs"); 32 | pocStringHashMap.put(POC.END_OR_SINGLE_POC, "es"); 33 | pocStringHashMap.put(POC.DEFAULT_POC, "d"); 34 | 35 | String[] sentences = new String[]{ 36 | "4个月赚了20%多", 37 | "【开放式基金】", 38 | "大", 39 | "10大重仓股:厦门钨业……这些", 40 | "鲜芋仙 3", 41 | "仅1只,为0.9923元", 42 | "大河《地方的", 43 | "●会议》无否决", 44 | "AT&T是一家", 45 | "在2017-12-12 这一天", 46 | "UTF-8", 47 | "鲜芋仙 3", 48 | "最右面.再", 49 | "内容《》真实、、", 50 | "签定《供货协议书》的,", 51 | "昨日《上市公司证券发行管理办法》发布", 52 | "《21世纪》:", 53 | "《探索·发现》栏目", 54 | "《麦亚hee》", 55 | "日系&动漫", 56 | }; 57 | String[] expectedPocString = new String[]{ 58 | "sm,bs,d,d,es,bm,mm,em,s", 59 | "w,bs,d,d,d,es,w", 60 | "s", 61 | "bm,em,bs,d,d,es,w,bs,d,d,es,w,w,bs,es", 62 | "bs,d,es,sm", 63 | "s,sm,s,w,s,bm,mm,mm,mm,mm,em,s", 64 | "bs,es,w,bs,d,es", 65 | "w,bs,es,w,bs,d,es", 66 | "b,m,m,e,bs,d,es", 67 | "s,bm,mm,mm,mm,mm,mm,mm,mm,mm,em,bs,d,es", 68 | "b,m,e,w,sm", 69 | "bs,d,es,sm", 70 | "bs,d,es,w,s", 71 | "bs,es,w,w,bs,es,w,w", 72 | "bs,es,w,b,m,m,m,e,w,s,w", 73 | "bs,es,w,bs,d,d,d,d,d,d,d,d,d,d,es,w,bs,es", 74 | "w,b,m,m,e,w,w", 75 | "w,b,m,m,m,e,w,bs,es", 76 | "w,b,m,m,m,e,w", 77 | "bs,es,s,bs,es", 78 | }; 79 | 80 | for (int i = 0; i < sentences.length; i++) { 81 | AnnotatedTerms annotatedTerms = RuleAnnotator.annotate(sentences[i], true); 82 | String result = Stream.of(annotatedTerms.getPocs()) 83 | 
.map(pocStringHashMap::get) 84 | .collect(Collectors.joining(",")); 85 | if (!expectedPocString[i].equals(result)) { 86 | System.out.println(sentences[i]); 87 | } 88 | assertEquals(expectedPocString[i], result); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/process/SpecifiedWordCementerTest.java: -------------------------------------------------------------------------------- 1 | package io.github.yizhiru.thulac4j.process; 2 | 3 | import io.github.yizhiru.thulac4j.term.TokenItem; 4 | import org.junit.Test; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | import static org.junit.Assert.assertEquals; 11 | 12 | public class SpecifiedWordCementerTest { 13 | 14 | @Test 15 | public void cement() { 16 | List tokenItems = new ArrayList<>(Arrays.asList( 17 | new TokenItem("二○○一", "m"), 18 | new TokenItem("年", "q"), 19 | new TokenItem("27", "m"), 20 | new TokenItem("日", "q")) 21 | ); 22 | SpecifiedWordCementer.cementWord(tokenItems); 23 | assertEquals(2, tokenItems.size()); 24 | assertEquals("二○○一年", tokenItems.get(0).word); 25 | assertEquals("27日", tokenItems.get(1).word); 26 | assertEquals("t", tokenItems.get(1).pos); 27 | 28 | tokenItems = new ArrayList<>(Arrays.asList( 29 | new TokenItem("盛典", "n"), 30 | new TokenItem("—", "w"), 31 | new TokenItem("—", "w"), 32 | new TokenItem("—", "w"), 33 | new TokenItem("2001", "m"), 34 | new TokenItem("年", "q")) 35 | ); 36 | SpecifiedWordCementer.cementWord(tokenItems); 37 | assertEquals(3, tokenItems.size()); 38 | assertEquals("———", tokenItems.get(1).word); 39 | assertEquals("2001年", tokenItems.get(2).word); 40 | } 41 | } -------------------------------------------------------------------------------- /src/test/java/io/github/yizhiru/thulac4j/term/POCTest.java: -------------------------------------------------------------------------------- 1 | package 
io.github.yizhiru.thulac4j.term;

import org.junit.Test;

import static org.junit.Assert.assertEquals;

/**
 * Tests {@link POC#intersect}, the narrowing of two position-of-character
 * hypotheses. Qualified enum references are used instead of one static
 * import per constant.
 */
public class POCTest {

    @Test
    public void intersect() {
        // Punctuation dominates any word-position hypothesis.
        assertEquals(POC.PUNCTUATION_POC, POC.PUNCTUATION_POC.intersect(POC.BEGIN_POC));

        // A plain position combined with its numeral variant narrows to
        // the numeral variant, regardless of operand order.
        assertEquals(POC.BEGIN_NUMERAL_POC, POC.BEGIN_POC.intersect(POC.BEGIN_NUMERAL_POC));
        assertEquals(POC.END_NUMERAL_POC, POC.END_POC.intersect(POC.END_NUMERAL_POC));
        assertEquals(POC.MIDDLE_NUMERAL_POC, POC.MIDDLE_NUMERAL_POC.intersect(POC.MIDDLE_POC));
        assertEquals(POC.SINGLE_NUMERAL_POC, POC.SINGLE_NUMERAL_POC.intersect(POC.SINGLE_POC));

        // "begin or single" meets "end or single" only at "single";
        // the operation is symmetric.
        assertEquals(POC.SINGLE_POC, POC.BEGIN_OR_SINGLE_POC.intersect(POC.END_OR_SINGLE_POC));
        assertEquals(POC.SINGLE_POC, POC.END_OR_SINGLE_POC.intersect(POC.BEGIN_OR_SINGLE_POC));

        // DEFAULT_POC is the identity element: the other operand wins.
        assertEquals(POC.SINGLE_POC, POC.DEFAULT_POC.intersect(POC.SINGLE_POC));
        assertEquals(POC.BEGIN_NUMERAL_POC, POC.BEGIN_NUMERAL_POC.intersect(POC.DEFAULT_POC));
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/CharUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;


import io.github.yizhiru.thulac4j.term.CharType;
import org.junit.Test;

import static io.github.yizhiru.thulac4j.util.CharUtils.getCharType;
import static org.junit.Assert.assertSame;

/**
 * Tests {@link CharUtils#getCharType} over each character class. The seven
 * near-identical assertion loops of the original are collapsed into one
 * helper.
 */
public class CharUtilsTest {

    /** Asserts that every character in {@code chars} maps to {@code expected}. */
    private static void assertAllCharType(char[] chars, CharType expected) {
        for (char c : chars) {
            assertSame(expected, getCharType(c));
        }
    }

    @Test
    public void checkCharType() {
        // Sentence-level punctuation (mostly full-width CJK forms).
        assertAllCharType(new char[]{
                ',', '。', '?', '!', ':', ';', '‘', '’', '“', '”', '【', '】', '、',
                '《', '》', '@', '#', '(', ')', '"', '[', ']', '~', ':', '?', '◤',
                '☆', '★', '…', '\'', '!', '*', '+', '>', '(', ')', ';', '=',
                '℃', '℉',
        }, CharType.SINGLE_PUNCTUATION_CHAR);

        // "Extended" single punctuation — symbols treated separately.
        assertAllCharType(new char[]{
                '·', '—', '¥', '$', '&', '\\', '^', '_', '{', '|', '}'
        }, CharType.EX_SINGLE_PUNCTUATION_CHAR);

        // Chinese numerals 〇一二...九.
        assertAllCharType(new char[]{
                '〇', '一', '二', '三', '四', '五', '六', '七', '八', '九'
        }, CharType.CHINESE_NUMERAL_CHAR);

        // Arabic digits, both ASCII and full-width.
        assertAllCharType(new char[]{
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
        }, CharType.ARABIC_NUMERAL_CHAR);

        // Punctuation that may appear inside numbers (percent, decimal
        // point, sign, per-mille, ...).
        assertAllCharType(new char[]{
                '%', '.', ',', '/', '%', '-', '±', '‰',
        }, CharType.NUMERAL_PUNCTUATION_CHAR);

        // Han ideographs, simplified and traditional.
        assertAllCharType(new char[]{
                '苟', '利', '国', '家', '生', '死', '以',
                '豈', '因', '禍', '福', '避', '趨', '之',
        }, CharType.HAN_ZI_CHAR);

        // Latin letters, both cases.
        assertAllCharType(new char[]{
                'a', 'b', 'c', 'd', 'h', 'l', 'o', 'r', 'u', 'z',
                'A', 'B', 'C', 'D', 'H', 'L', 'O', 'R', 'U', 'Z'
        }, CharType.ENGLISH_LETTER_CHAR);

        // Anything unclassified, e.g. the ASCII ampersand.
        assertAllCharType(new char[]{
                '&',
        }, CharType.OTHER_CHAR);
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/ChineseUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;

import org.junit.Test;

import static org.junit.Assert.*;

public class ChineseUtilsTest {

    /** Traditional-to-simplified conversion on full sentences. */
    @Test
    public void t2s() {
        String[] traditions = new String[]{
                "為何曾加入日軍的他,一生無法原諒日本人?日導演用7年走遍台灣,拍下時代淚水",
                "「那些人哪裡像軍隊?根本不知是蔣介石從哪撿來的流氓!」",
                "明明課本上都說光復節是台灣人熱烈歡迎「祖國」到來的時刻,為何有一群受過日本統治的台灣人,到現在都不能接受中華民國?",
                "鯛魚是低脂肪、高蛋白的健康食材, 肉質軟嫩細緻。",
                "世界商機大發現:抓住泰國工頭的需求 就是臺灣手工具產業的福氣啦!",
                "房市買氣還沒回春,房價也還在向下修正,但土地交易熱度卻是燒燙燙,替地方政府的國庫充實不少"
        };
        String[] simples = new String[]{
                "为何曾加入日军的他,一生无法原谅日本人?日导演用7年走遍台湾,拍下时代泪水",
                "「那些人哪里像军队?根本不知是蒋介石从哪捡来的流氓!」",
                "明明课本上都说光复节是台湾人热烈欢迎「祖国」到来的时刻,为何有一群受过日本统治的台湾人,到现在都不能接受中华民国?",
                "鲷鱼是低脂肪、高蛋白的健康食材, 肉质软嫩细致。",
                "世界商机大发现:抓住泰国工头的需求 就是台湾手工具产业的福气啦!",
                "房市买气还没回春,房价也还在向下修正,但土地交易热度却是烧烫烫,替地方政府的国库充实不少"
        };

        for (int i = 0; i < traditions.length; i++) {
            assertEquals(simples[i], ChineseUtils.simplified(traditions[i]));
        }
    }


    /** Stop-word lookup: whole entries match, substrings do not. */
    @Test
    public void isStopWords() {
        assertTrue(ChineseUtils.isStopWord("此时"));
        assertTrue(ChineseUtils.isStopWord(";"));
        assertTrue(ChineseUtils.isStopWord("一时"));
        assertFalse(ChineseUtils.isStopWord("刻"));
        assertFalse(ChineseUtils.isStopWord("到"));
    }
}
// --------------------------------------------------------------------------
// src/test/java/io/github/yizhiru/thulac4j/util/IOUtilsTest.java
// --------------------------------------------------------------------------
package io.github.yizhiru.thulac4j.util;

import org.junit.Test;

import java.io.IOException;

import static org.junit.Assert.assertEquals;

public class IOUtilsTest {

    /**
     * Loads the bundled t2s mapping resource into an int array and
     * spot-checks its length plus the first, a middle, and the last entry.
     *
     * @throws IOException if the resource cannot be read.
     */
    @Test
    public void toIntArray() throws IOException {
        int[] array = IOUtils.toIntArray(
                this.getClass().getResourceAsStream(ModelPaths.T2S_PATH));
        assertEquals(5600, array.length);

        // {index, expected value} spot checks across the table.
        int[][] checks = {
                {0, 33836},
                {2789, 40800},
                {5599, 40863},
        };
        for (int[] check : checks) {
            assertEquals(check[1], array[check[0]]);
        }
    }
}