├── .gitignore
├── LICENSE
├── README.md
├── java
│   ├── build.gradle
│   ├── gradle
│   │   └── wrapper
│   │       ├── gradle-wrapper.jar
│   │       └── gradle-wrapper.properties
│   ├── gradlew
│   ├── gradlew.bat
│   ├── settings.gradle
│   └── src
│       └── main
│           ├── java
│           │   └── indi
│           │       └── tiandi
│           │           └── nlp
│           │               ├── Seg.java
│           │               ├── Sentence.java
│           │               ├── Term.java
│           │               ├── evaluation
│           │               │   ├── SegEvaluation.java
│           │               │   └── impl
│           │               │       ├── AnsjImpl.java
│           │               │       ├── FNLPImpl.java
│           │               │       ├── HanLPImpl.java
│           │               │       ├── JcsegImpl.java
│           │               │       ├── JiebaAnalysisImpl.java
│           │               │       ├── MMSeg4jImpl.java
│           │               │       ├── MYNLPImpl.java
│           │               │       ├── PaodingImpl.java
│           │               │       ├── StanfordCoreNLPImpl.java
│           │               │       ├── ThulacImpl.java
│           │               │       └── WordImpl.java
│           │               └── tool
│           │                   ├── HttpRequest.java
│           │                   └── ZipUtil.java
│           └── resources
│               ├── ansj
│               │   └── library
│               │       ├── ambiguity.dic
│               │       ├── default.dic
│               │       ├── regex.dic
│               │       ├── stop.dic
│               │       └── synonyms.dic
│               └── logback.xml
└── python
    ├── indi.tiandi.nlp.evaluation
    │   ├── SegEvaluation.py
    │   └── metric.py
    └── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | ### Java template
108 | # Compiled class file
109 | *.class
110 |
111 | # Log file
112 | *.log
113 |
114 | # BlueJ files
115 | *.ctxt
116 |
117 | # Mobile Tools for Java (J2ME)
118 | .mtj.tmp/
119 |
120 | # Package Files #
121 | *.jar
122 | *.war
123 | *.nar
124 | *.ear
125 | *.zip
126 | *.tar.gz
127 | *.rar
128 |
129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
130 | hs_err_pid*
131 |
132 | !/java/gradle/wrapper/gradle-wrapper.jar
133 | ### Example user template template
134 | ### Example user template
135 |
136 | # IntelliJ project files
137 | .idea
138 | *.iml
139 | .gradle
140 | out
141 | gen
142 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Contents
2 | - java
3 | - python
4 | - Summary
5 | ## java
6 | #### Requirement
7 | Java 8
8 |
9 | #### Steps
10 |
11 | 1. `git clone https://github.com/tiandiweizun/nlp-evaluation.git`
12 | 2. `cd nlp-evaluation/java`
13 | 3. (Windows) `.\gradlew.bat build` (Linux) `./gradlew build`
14 | 4. `java -Dfile.encoding=utf-8 -jar build/libs/nlp-evaluation-java-1.0.1.jar`
15 |
16 |
17 | #### Notes
18 | 1. java -jar nlp-evaluation-java-1.0.1.jar accepts four options; run java -jar nlp-evaluation-java-1.0.1.jar -h to list them
19 |    -i  input file to segment, default data/seg_data_big.txt; one sentence per line with words separated by spaces; you can point it at your own test set
20 |    -o  path to store the segmentation results; by default nothing is stored
21 |    -n  maximum number of lines to read from the input file
22 |    -c  names of the segmenters to evaluate, separated by commas, default HanLP,jieba,thulac,mynlp; example: -c=HanLP
23 |
24 | 2. Since [Stanford CoreNLP](https://github.com/stanfordnlp/CoreNLP) segmentation is mediocre, extremely slow, and ships a huge model, it is excluded from the packaged jar (this does not affect testing it inside the IDE);
25 |    to include it in the package, edit build.gradle and comment out exclude(dependency('edu.stanford.nlp:stanford-corenlp'))
26 | 3. [Word](https://github.com/ysc/word), [Ansj](https://github.com/NLPchina/ansj_seg), [Jcseg](https://github.com/lionsoul2014/jcseg), and [MMSeg4j](https://github.com/chenlb/mmseg4j-core) have a bug (joining the output words does not reproduce the original sentence), so they are commented out in the code and not tested.
27 | 4. All tested dependencies are available in the Maven central repository; segmenters that are not, such as Paoding and the Fudan segmenter, are not tested here
28 |
29 |
30 | #### Results
31 |
32 | Total lines: 2533709  Total characters: 28374490
33 |
34 | |segmentor|precision| recall | f1 | speed(chars/ms)_windows | speed(chars/ms)_linux |
35 | | -- | -- | ------ | --- | --- | --- |
36 | |[HanLP](https://github.com/hankcs/HanLP) | 0.900433 | 0.910614 | 0.905495 | 1034.470451 | 797.596346 |
37 | |[jieba](https://github.com/huaban/jieba-analysis) | 0.852657 | 0.803263 | 0.827223 | 1774.181830 | 980.865943 |
38 | |[thulac](https://github.com/yizhiru/thulac4j) | 0.884405 | 0.901930 | 0.893082 | 1449.749131 | 939.832732 |
39 | |[mynlp](https://github.com/mayabot/mynlp) | 0.901661 | 0.900246 | 0.900953 | 1739.272404 | 1178.930115|
40 |
41 | Repeated runs show that on Linux the first segmenter measured tends to score low, and that thulac's speed on Linux is not particularly stable; at its fastest it is roughly on par with jieba
42 |
43 | #### Developers
44 |
45 | - We recommend opening or importing the java directory in IntelliJ IDEA and copying the data directory into the java directory; SegEvaluation can then be run and debugged directly.
46 | - The Stanford segmenter and the other disabled segmenters can be re-enabled.
47 | - To evaluate a custom segmenter, extend the Seg class, implement the segment method, and add the segmenter to evaluators, as sketched below.
48 |
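As a concrete illustration of the last bullet, here is a minimal sketch of a custom segmenter, using the hypothetical class name MySegImpl (everything else comes from this repository). Because SegEvaluation discovers segmenters by scanning the indi.tiandi.nlp.evaluation.impl package and matching class-name prefixes case-insensitively, placing the class in that package and running the jar with -c=MySegImpl is enough to evaluate it.

```java
package indi.tiandi.nlp.evaluation.impl;

import indi.tiandi.nlp.Seg;
import indi.tiandi.nlp.Term;

import java.util.ArrayList;
import java.util.List;

/**
 * Hypothetical demo segmenter: emits one Term per character.
 * Joining its terms reproduces the input, so Evaluator's init check passes.
 */
public class MySegImpl extends Seg {
    @Override
    public List<Term> segment(String sentence) {
        List<Term> terms = new ArrayList<>();
        for (char c : sentence.toCharArray()) {
            terms.add(new Term(String.valueOf(c)));
        }
        return terms;
    }
}
```
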
49 | ## python
50 |
51 | #### Requirement
52 |
53 | Python 3
54 | see requirements.txt for the other dependencies
55 |
56 | #### Steps
57 |
58 | 1. git clone https://github.com/tiandiweizun/nlp-evaluation.git
59 | 2. cd nlp-evaluation
60 | 3. pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
61 | 4. cd python/indi.tiandi.nlp.evaluation
62 | 5. python3 SegEvaluation.py
63 |
64 | #### Notes
65 | 1. python3 SegEvaluation.py accepts four options; run python3 SegEvaluation.py -h to list them
66 |    -i  input file to segment, default data/seg_data_big.txt; one sentence per line with words separated by spaces; you can point it at your own test set
67 |    -o  path to store the segmentation results; by default nothing is stored
68 |    -n  maximum number of lines to read; recommended, since Python is slow
69 |    -c  names of the segmenters to evaluate, separated by commas, default pkuseg,jieba_fast,thulac
70 |
71 | 2. [pynlpir](https://github.com/tsroten/pynlpir) has a bug (joining the output words does not reproduce the original sentence) and [pyltp](https://github.com/HIT-SCIR/pyltp) is hard to install on Windows, so neither is tested here; the slower segmenters are not tested either
72 |
73 | #### Results
74 |
75 | Total lines: 2533709  Total characters: 28374490
76 |
77 | |segmentor|precision| recall | f1 | speed(chars/ms)_windows | speed(chars/ms)_linux |
78 | | -- | -- | ------ | --- | --- | --- |
79 | |[pkuseg](https://github.com/lancopku/pkuseg-python) | 0.890170 | 0.886405 | 0.888284 | 34.077104 | 19.826954 |
80 | |[jieba](https://github.com/fxsjy/jieba) | 0.855293 | 0.808204 | 0.831082 | 169.651694 | 104.554222 |
81 | |[jieba_fast](https://github.com/deepcs233/jieba_fast) | 0.855299 | 0.808182 | 0.831073 | 408.241520 | 203.815985 |
82 | |[thulac](https://github.com/thunlp/THULAC-Python) | 0.848839 | 0.883031 | 0.865597 | 28.831738 | 16.565779 |
83 | |[pyltp](https://github.com/HIT-SCIR/pyltp) | 0.894885 | 0.908761 | 0.901770 | --------- | 52.371131 |
84 | |[snownlp](https://github.com/isnowfy/snownlp) | 0.811029 | 0.864835 | 0.837069 | --------- | 1.947430 |
85 |
86 | #### Developers
87 |
88 | - We recommend opening the python directory in PyCharm; it then runs as-is
89 | - To use pynlpir, change pynlpir_path to its installation directory
90 | - To use pyltp, change ltp_data_dir to its segmentation model directory
91 | - To evaluate a custom segmenter, implement a segment method and append the segmenter to evaluators.
92 |
93 | ## Summary
94 | - Performance: Java is far faster than Python, by at least an order of magnitude.
95 | - Accuracy: jieba and thulac score differently in Python and Java; finding the cause needs more time, and note that the Java thulac4j is not provided by the official THULAC team.
96 | - Data: the default dataset comes from [cws_evaluation](https://github.com/ysc/cws_evaluation), a project for benchmarking the speed and accuracy of Chinese word segmenters. For accuracy that project uses the perfect-line rate, which becomes unsuitable for long sentences: a single wrongly segmented word makes the whole line count as wrong, so two algorithms with different word-level error rates can look identical, and the metric cannot separate their precision; see the sketch below.
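
A minimal, self-contained sketch (not part of this repository) makes the contrast concrete. It scores one sentence with a single mis-split word both ways, reusing the offset-matching idea of calcScore in SegEvaluation.java: the perfect-line rate gives 0, while word-level precision/recall still credit the correctly segmented words.

```java
import java.util.Arrays;
import java.util.List;

public class MetricDemo {
    // count predicted words whose start offset, end offset, and content match gold
    static int correct(List<String> gold, List<String> pred) {
        int gi = 0, pi = 0, gOff = 0, pOff = 0, right = 0;
        while (gi < gold.size() && pi < pred.size()) {
            if (gOff == pOff) {                        // both words start at the same offset
                if (gold.get(gi).equals(pred.get(pi))) right++;
                gOff += gold.get(gi++).length();
                pOff += pred.get(pi++).length();
            } else if (gOff < pOff) {
                gOff += gold.get(gi++).length();       // gold side lags, advance it
            } else {
                pOff += pred.get(pi++).length();       // predicted side lags, advance it
            }
        }
        return right;
    }

    public static void main(String[] args) {
        List<String> gold = Arrays.asList("我们", "在", "野生动物园", "玩");
        List<String> pred = Arrays.asList("我们", "在", "野生", "动物园", "玩");
        int right = correct(gold, pred);              // 3 words segmented correctly
        System.out.printf("line-perfect: %b, precision: %.2f, recall: %.2f%n",
                gold.equals(pred),                    // false -> perfect-line rate 0
                right * 1.0 / pred.size(),            // 3/5 = 0.60
                right * 1.0 / gold.size());           // 3/4 = 0.75
    }
}
```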
97 |
--------------------------------------------------------------------------------
/java/build.gradle:
--------------------------------------------------------------------------------
1 | buildscript {
2 | repositories {
3 | jcenter()
4 | }
5 |
6 | dependencies {
7 | classpath "com.github.jengelman.gradle.plugins:shadow:2.0.2"
8 | }
9 | }
10 |
11 | apply plugin: 'idea'
12 | apply plugin: 'java'
13 | apply plugin: "com.github.johnrengelman.shadow"
14 |
15 | group 'indi.nlp'
16 | version '1.0-SNAPSHOT'
17 |
18 | sourceCompatibility = 1.8
19 |
20 | tasks.withType(JavaCompile) {
21 | options.encoding = "UTF-8"
22 | }
23 |
24 | // disable the plain jar task; shadowJar builds the artifact
25 | jar.enabled = false
26 | shadowJar {
27 | baseName = "nlp-evaluation-java"
28 | // classifier is the suffix appended to the generated jar name
29 | classifier = null
30 | version = '1.0.1'
31 | manifest {
32 | attributes 'Main-Class': 'indi.tiandi.nlp.evaluation.SegEvaluation'
33 | }
34 |
35 | from("../") {
36 | include 'data/seg_data_big.txt'
37 | }
38 | }
39 |
40 | repositories {
41 | maven{ url 'http://maven.aliyun.com/nexus/content/groups/public/'}
42 | }
43 |
44 | dependencies {
45 | implementation 'org.ansj:ansj_seg:5.1.6'
46 | implementation 'com.hankcs:hanlp:portable-1.6.8'
47 | implementation 'org.apdplat:word:1.3.1'
48 | implementation 'io.github.yizhiru:thulac4j:3.1.2'
49 | implementation 'com.chenlb.mmseg4j:mmseg4j-core:1.10.0'
50 | implementation 'org.lionsoul:jcseg-core:2.4.0'
51 | implementation 'com.huaban:jieba-analysis:1.0.2'
52 | implementation 'com.mayabot.mynlp:mynlp-segment:3.0.1'
53 | implementation 'org.apache.commons:commons-lang3:3.11'
54 |
55 | testImplementation 'junit:junit:4.2'
56 | }
57 |
58 |
59 | artifacts {
60 | shadowJar;
61 | }
62 |
63 | build.dependsOn(shadowJar);
--------------------------------------------------------------------------------
/java/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/java/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.8-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/java/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/java/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/java/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'java'
2 |
3 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Seg.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | import java.util.List;
4 | /**
5 | * Abstract base class for segmenters
6 | *
7 | * @date 2019/3/4
8 | * @author tiandi
9 | */
10 | public abstract class Seg {
11 | private String name;
12 |
13 | public abstract List<Term> segment(String sentence);
14 |
15 | public String getName() {
16 | return this.name;
17 | }
18 |
19 | public void setName(String name) {
20 | this.name = name;
21 | }
22 |
23 | public Seg(String name) {
24 | this.name = name;
25 | }
26 |
27 | public Seg() {
28 | this.name = this.getClass().getSimpleName();
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Sentence.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * A sentence
7 | *
8 | * @author tiandi
9 | * @date 2019/3/4
10 | */
11 | public class Sentence {
12 | private List<Term> terms;
13 |
14 | public Sentence(List<Term> terms) {
15 | this.terms = terms;
16 | }
17 |
18 | public String getString() {
19 | StringBuilder sb = new StringBuilder();
20 | for (Term term : terms) {
21 | sb.append(term.getWord());
22 | }
23 | return sb.toString();
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Term.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | /**
4 | * A word and its part of speech
5 | *
6 | * @date 2019/3/4
7 | * @author tiandi
8 | */
9 | public class Term {
10 | /**
11 | * the word
12 | */
13 | private String word;
14 | /**
15 | * the part-of-speech tag
16 | */
17 | private String pos;
18 |
19 | public Term(String word) {
20 | this.word = word;
21 | }
22 |
23 | public Term(String word, String pos) {
24 | this.word = word;
25 | this.pos = pos;
26 | }
27 |
28 | public String getWord() {
29 | return word;
30 | }
31 |
32 | public void setWord(String word) {
33 | this.word = word;
34 | }
35 |
36 | public String getPos() {
37 | return pos;
38 | }
39 |
40 | public void setPos(String pos) {
41 | this.pos = pos;
42 | }
43 |
44 | @Override
45 | public String toString() {
46 | return word;
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/SegEvaluation.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import indi.tiandi.nlp.tool.HttpRequest;
6 | import indi.tiandi.nlp.tool.ZipUtil;
7 | import org.apache.commons.lang3.StringUtils;
8 |
9 | import java.io.*;
10 | import java.net.JarURLConnection;
11 | import java.net.URL;
12 | import java.net.URLDecoder;
13 | import java.util.ArrayList;
14 | import java.util.Arrays;
15 | import java.util.Enumeration;
16 | import java.util.List;
17 | import java.util.jar.JarEntry;
18 | import java.util.jar.JarFile;
19 |
20 | /**
21 | * SegEvaluation
22 | * Segmentation evaluator
23 | *
24 | * @author tiandi
25 | * @date 2019/3/4
26 | */
27 | public class SegEvaluation {
28 | public static final String testSentence = "这是一个测试句子";
29 |
30 | public static final String helpMessage;
31 |
32 | static {
33 | StringBuilder sb = new StringBuilder();
34 | sb.append("thank you for using nlp-evaluation\n");
35 | sb.append("github: https://github.com/tiandiweizun/chinese-segmentation-evaluation\n");
36 | sb.append("\t-i or -input\t\t\tfile to segment; the jar defaults to ./data/seg_data_big.txt, debug mode to chinese-segmentation-evaluation/data/seg_data_big.txt\n");
37 | sb.append("\t-o or -output\t\t\tpath to save the result, default is not saving\n");
38 | sb.append("\t-n or -max_line_number\t\tmaximum number of lines to read, default is to read all\n");
39 | sb.append("\t-c or -contains\t\t\tsegmenters to evaluate, comma separated, default HanLP,jieba,thulac,mynlp\n");
40 | sb.append("\t-h or -help\t\t\tmessage for help\n");
41 | sb.append("\n");
42 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.1.jar -n=10\n");
43 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.1.jar nlp-evaluation/data/seg_data_big.txt -n=10\n");
44 | helpMessage = sb.toString();
45 | }
46 |
47 | public static String getFileNameWithExtension(File file) {
48 | String fileName = file.getName();
49 | int i = fileName.lastIndexOf(".");
50 | if (i <= 0) {
51 | i = fileName.length();
52 | }
53 | return fileName.substring(0, i);
54 | }
55 |
56 | public static void main(String[] args) throws Exception {
57 | Config config = parseParams(args);
58 |
59 | String rFileName = config.rFileName;
60 | String wFilePath = config.wFilePath;
61 | boolean writeResult = false;
62 | int maxLineCount = config.maxLineCount;
63 |
64 | InputStream inputStream = null;
65 | File file = new File(rFileName);
66 | if (!file.exists()) {
67 | URL resource = SegEvaluation.class.getClassLoader().getResource(rFileName);
68 | if (resource != null) {
69 | // load the file from inside the jar
70 | inputStream = SegEvaluation.class.getClassLoader().getResourceAsStream((rFileName));
71 | } else {
72 | File tempZipFile = new File(Config.zipFileName);
73 | if (getFileNameWithExtension(file).equals(getFileNameWithExtension(tempZipFile))) {
74 | boolean download = true;
75 | if (tempZipFile.exists()) {
76 | try {
77 | // unzip
78 | ZipUtil.unZip(tempZipFile, tempZipFile.getParent());
79 | download = false;
80 | } catch (Exception e) {
81 | // delete the corrupted zip file
82 | tempZipFile.delete();
83 | }
84 | }
85 | // download from the internet and unzip
86 | if (download) {
87 | System.out.println(String.format("Downloading from %s; if the download is slow, you can also download the file manually and save it to %s", Config.url, tempZipFile.getAbsolutePath()));
88 | try {
89 | // download
90 | HttpRequest.download(Config.url, Config.zipFileName);
91 | System.out.println("download finished");
92 | // unzip
93 | ZipUtil.unZip(tempZipFile, tempZipFile.getParent());
94 | } catch (IOException e) {
95 | System.out.println(String.format("download or unzip error: %s", e.getMessage()));
96 | System.exit(1);
97 | }
98 | }
99 | } else {
100 | // the user-specified file was not found
101 | System.out.println("file not found locally or inside the jar: " + rFileName);
102 | System.exit(1);
103 | }
104 | }
105 | }
106 | if (file.exists()) {
107 | inputStream = new FileInputStream(rFileName);
108 | System.out.println("reading input file from: " + file.getAbsolutePath());
109 | }
110 | if (wFilePath.length() > 0) {
111 | writeResult = true;
112 | System.out.println("writing segmentation results to: " + new File(wFilePath).getAbsolutePath());
113 | }
114 | calcPFRScore(inputStream, wFilePath, writeResult, maxLineCount, config.segmentorNames);
115 | }
116 |
117 | public static void calcPFRScore(InputStream inputStream, String wFilePath, boolean writeResult, int maxLineCount,
118 | List<String> segmentorNames) {
119 | String line = "";
120 | try {
121 | List<Evaluator> evaluators = new ArrayList<>();
122 | List<String> classesFromPackage = getClassNames("indi.tiandi.nlp.evaluation.impl");
123 | for (String segmentorName : segmentorNames) {
124 | for (String className : classesFromPackage) {
125 | int i = className.lastIndexOf(".") + 1;
126 | String simpleClassName = className.substring(i);
127 | if (!simpleClassName.toLowerCase().startsWith(segmentorName.toLowerCase())) {
128 | continue;
129 | }
130 | Class<?> aClass = Class.forName(className);
131 | if (Seg.class.isAssignableFrom(aClass)) {
132 | evaluators.add(new Evaluator(aClass.asSubclass(Seg.class), segmentorName));
133 | break;
134 | }
135 | }
136 | }
137 |
138 | // evaluators.add(new Evaluator(JiebaAnalysisImpl.class));
139 | // evaluators.add(new Evaluator(ThulacImpl.class));
140 | // too slow to segment
141 | // evaluators.add(new Evaluator(new StanfordCoreNLPImpl()));
142 | // the segmenters below have a bug: the joined output differs from the input sentence
143 | // evaluators.add(new Evaluator(WordImpl.class));
144 | // evaluators.add(new Evaluator(AnsjImpl.class));
145 | // evaluators.add(new Evaluator(JcsegImpl.class));
146 | // evaluators.add(new Evaluator(MMSeg4jImpl.class));
147 |
148 | if (evaluators.size() == 0) {
149 | System.out.println("no segmenter to evaluate");
150 | System.exit(-1);
151 | }
152 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "utf-8"));
153 | List<List<String>> gold = new ArrayList<>();
154 | List<String> test = new ArrayList<>();
155 | int charCount = 0;
156 | boolean calcScore = true;
157 | int lineCount = 0;
158 | // a maxLineCount of -1 means read all lines
159 | while ((line = br.readLine()) != null) {
160 | if (line.trim().length() == 0) {
161 | continue;
162 | }
163 | String[] s = line.split(" ");
164 | gold.add(Arrays.asList(s));
165 | test.add(line.replace(" ", ""));
166 | charCount += test.get(test.size() - 1).length();
167 |
168 | lineCount += 1;
169 | if (maxLineCount > 0 && lineCount >= maxLineCount) {
170 | break;
171 | }
172 | }
173 | System.out.println();
174 | System.out.println(String.format("total lines: %d\ttotal characters: %d", gold.size(), charCount));
175 | for (Evaluator item : evaluators) {
176 |
177 | System.out.println();
178 | System.out.println(item.seg.getName() + " evaluation started");
179 | if (!item.init) {
180 | System.out.println(item.seg.getName() + " initialization failed, skipping");
181 | continue;
182 | }
183 | BufferedWriter bw = null;
184 | if (writeResult) {
185 | wFilePath = wFilePath.replace("\\", "/");
186 | if (!wFilePath.endsWith("/")) {
187 | wFilePath += "/";
188 | }
189 | String wFileName = wFilePath + item.seg.getName();
190 | bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(wFileName), "utf-8"));
191 | }
192 | long start = System.currentTimeMillis();
193 | int right_num = 0;
194 | int gold_num = 0;
195 | int predict_num = 0;
196 | for (int i = 0; i < test.size(); i++) {
197 | line = test.get(i);
198 | List<Term> segment = item.seg.segment(line);
199 | List<String> predict = new ArrayList<>();
200 | for (Term term : segment) {
201 | predict.add(term.getWord());
202 | }
203 | if (calcScore) {
204 | if (!StringUtils.join(predict, "").equals(line)) {
205 | System.out.println(item.seg.getName() + "\t" + line);
206 | continue;
207 | }
208 | int[] result = calcScore(gold.get(i), predict);
209 | right_num += result[0];
210 | gold_num += result[1];
211 | predict_num += result[2];
212 | }
213 | if (writeResult) {
214 | bw.write(StringUtils.join(predict, " ") + "\n");
215 | }
216 | }
217 | if (writeResult) {
218 | bw.close();
219 | }
220 | item.time = System.currentTimeMillis() - start;
221 | double precision = 0;
222 | double recall = 0;
223 | double f = 0;
224 | if (predict_num != 0) {
225 | precision = right_num * 1.0 / predict_num;
226 | }
227 | if (gold_num != 0) {
228 | recall = right_num * 1.0 / gold_num;
229 | }
230 | if (precision + recall > 0) {
231 | f = 2 * precision * recall / (precision + recall);
232 | }
233 | System.out.println(String.format("precision:%f \t recall:%f \t f1:%f", precision, recall, f));
234 | System.out.println(String.format("time: %d ms,\tspeed: %f chars/ms", item.time, charCount * 1.0 / item.time));
235 | }
236 | } catch (Exception e) {
237 | System.out.println(line);
238 | e.printStackTrace();
239 | }
240 | }
241 |
242 | public static int[] calcScore(List<String> gold, List<String> predict) {
243 | int gold_offset = 0;
244 | int predict_offset = 0;
245 |
246 | int gold_term_index = 0;
247 | int predict_term_index = 0;
248 |
249 | int right = 0;
250 | int total = gold.size();
251 | int right_and_wrong = predict.size();
252 | while (gold_term_index < total || predict_term_index < right_and_wrong) { // walk both segmentations by character offset
253 | if (gold_offset == predict_offset) { // both current words start at the same offset
254 | if (gold.get(gold_term_index).equals(predict.get(predict_term_index))) {
255 | right += 1;
256 | }
257 | int[] result = update(gold_offset, gold_term_index, gold);
258 | gold_offset = result[0];
259 | gold_term_index = result[1];
260 | result = update(predict_offset, predict_term_index, predict);
261 | predict_offset = result[0];
262 | predict_term_index = result[1];
263 | } else if (gold_offset < predict_offset) {
264 | int[] result = update(gold_offset, gold_term_index, gold);
265 | gold_offset = result[0];
266 | gold_term_index = result[1];
267 | } else {
268 | int[] result = update(predict_offset, predict_term_index, predict);
269 | predict_offset = result[0];
270 | predict_term_index = result[1];
271 | }
272 | }
273 | int[] result = {right, total, right_and_wrong};
274 | return result;
275 | }
276 |
277 | public static int[] update(int offset, int index, List<String> terms) {
278 | offset += terms.get(index).length();
279 | index += 1;
280 | int[] result = {offset, index};
281 | return result;
282 | }
283 |
284 | public static Config parseParams(String[] args) {
285 | // message for help
286 | try {
287 |
288 | Config config = new Config();
289 | if (args.length == 1) {
290 | if (args[0].equalsIgnoreCase("-h") || args[0].equalsIgnoreCase("-help")) {
291 | System.out.println(helpMessage);
292 | System.exit(0);
293 | }
294 | }
295 | boolean containsOption = false;
296 | for (int i = 0; i < args.length; i++) {
297 | String arg = args[i];
298 | if (arg.startsWith("-") && arg.contains("=")) {
299 | containsOption = true;
300 | String[] split = arg.split("=");
301 | String paramName = split[0].trim().substring(1).toLowerCase();
302 | String paramValue = split[1].trim();
303 | switch (paramName) {
304 | case "i":
305 | case "input":
306 | config.rFileName = paramValue;
307 | break;
308 | case "o":
309 | case "output":
310 | config.wFilePath = paramValue;
311 | break;
312 | case "n":
313 | case "max_line_number":
314 | config.maxLineCount = Integer.parseInt(paramValue);
315 | break;
316 | case "c":
317 | case "contains":
318 | String[] segmentorNames = paramValue.split(",");
319 | config.segmentorNames = Arrays.asList(segmentorNames);
320 | break;
321 | }
322 | } else if (containsOption) {
323 | System.out.println("positional argument follows keyword argument");
324 | } else {
325 | if (i == 0) {
326 | config.rFileName = args[0].trim();
327 | } else if (i == 1) {
328 | config.wFilePath = args[1].trim();
329 | } else if (i == 2) {
330 | config.maxLineCount = Integer.parseInt(args[2].trim());
331 | }
332 | }
333 | }
334 | return config;
335 | } catch (Exception e) {
336 | System.out.println("parameter error: " + e.getMessage());
337 | System.out.println(helpMessage);
338 | System.exit(0);
339 | }
340 | return null;
341 | }
342 |
343 | /**
344 | * Collects the names of all classes under the given package
345 | *
346 | * @param packageName
347 | * @return
348 | */
349 |
350 | public static List<String> getClassNames(String packageName) {
351 | // collected class names
352 | List<String> classes = new ArrayList<>();
353 | // whether to recurse into subpackages
354 | boolean recursive = true;
355 | // turn the package name into a directory path
356 | String packageDirName = packageName.replace('.', '/');
357 | // enumerate every classpath resource under this directory
358 | Enumeration<URL> dirs;
359 | try {
360 | dirs = Thread.currentThread().getContextClassLoader().getResources(packageDirName);
361 | // iterate over the matching resources
362 | while (dirs.hasMoreElements()) {
363 | // take the next element
364 | URL url = dirs.nextElement();
365 | // protocol of the resource
366 | String protocol = url.getProtocol();
367 | // resource stored as plain files on disk
368 | if ("file".equals(protocol)) {
369 | // physical path of the package
370 | String filePath = URLDecoder.decode(url.getFile(), "UTF-8");
371 | // scan the package directory on disk and collect the classes found
372 | findAndAddClassesInPackageByFile(packageName, filePath, recursive, classes);
373 | } else if ("jar".equals(protocol)) {
374 | // resource packed inside a jar
375 | // define a JarFile
376 | JarFile jar;
377 | try {
378 | // open the jar
379 | jar = ((JarURLConnection) url.openConnection()).getJarFile();
380 | // enumerate the jar's entries
381 | Enumeration<JarEntry> entries = jar.entries();
382 | // iterate over the entries
383 | while (entries.hasMoreElements()) {
384 | // an entry may be a directory or any other file in the jar, such as META-INF
385 | JarEntry entry = entries.nextElement();
386 | String name = entry.getName();
387 | // strip a leading '/'
388 | if (name.charAt(0) == '/') {
389 | // keep the rest of the string
390 | name = name.substring(1);
391 | }
392 | // only consider entries under the requested package
393 | if (name.startsWith(packageDirName)) {
394 | int idx = name.lastIndexOf('/');
395 | // entries containing '/' sit inside a package
396 | if (idx != -1) {
397 | // derive the package name, replacing '/' with '.'
398 | packageName = name.substring(0, idx).replace('/', '.');
399 | }
400 | // descend if it is a package, or if recursion is enabled
401 | if ((idx != -1) || recursive) {
402 | // a .class file that is not a directory
403 | if (name.endsWith(".class") && !entry.isDirectory()) {
404 | // drop the trailing ".class" to get the class name
405 | String className = name.substring(packageName.length() + 1, name.length() - 6);
406 | // record it
407 | classes.add(packageName + '.' + className);
408 | }
409 | }
410 | }
411 | }
412 | } catch (IOException e) {
413 | e.printStackTrace();
414 | }
415 | }
416 | }
417 | } catch (IOException e) {
418 | e.printStackTrace();
419 | }
420 | return classes;
421 | }
422 |
423 | /**
424 | * Collects all classes under a package by scanning the file system
425 | *
426 | * @param packageName
427 | * @param packagePath
428 | * @param recursive
429 | * @param classes
430 | */
431 |
432 | public static void findAndAddClassesInPackageByFile(String packageName, String packagePath, final boolean recursive,
433 | List<String> classes) {
434 | // locate the directory of this package
435 | File dir = new File(packagePath);
436 | // return directly if it does not exist or is not a directory
437 | if (!dir.exists() || !dir.isDirectory()) {
438 | return;
439 | }
440 | // otherwise list everything under the package, directories included
441 | File[] dirfiles = dir.listFiles(new FileFilter() {
442 | // filter: accept subdirectories (for recursion) and compiled .class files
443 | public boolean accept(File file) {
444 | return (recursive && file.isDirectory()) || (file.getName().endsWith(".class"));
445 | }
446 | });
447 | // walk all entries
448 | for (File file : dirfiles) {
449 | // recurse into subdirectories
450 | if (file.isDirectory()) {
451 | findAndAddClassesInPackageByFile(packageName + "." + file.getName(), file.getAbsolutePath(), recursive, classes);
452 | } else {
453 | // a class file: drop the trailing .class to get the class name
454 | String className = file.getName().substring(0, file.getName().length() - 6);
455 | classes.add(packageName + '.' + className);
456 | }
457 | }
458 | }
459 | }
460 |
461 | class Evaluator {
462 |
463 | public Seg seg;
464 | public long time = 0;
465 | public boolean init = false;
466 |
467 | public Evaluator(Class<? extends Seg> segClass) {
468 | this(segClass, segClass.getSimpleName());
469 | }
470 |
471 | public Evaluator(Class<? extends Seg> segClass, String name) {
472 | try {
473 | long start = System.currentTimeMillis();
474 | System.out.println(name + " initialization started");
475 | this.seg = segClass.newInstance();
476 | this.seg.setName(name);
477 | List<Term> terms = this.seg.segment(SegEvaluation.testSentence);
478 | StringBuilder sb = new StringBuilder();
479 | for (Term term : terms) {
480 | sb.append(term.getWord());
481 | }
482 | long end = System.currentTimeMillis();
483 | long cost = end - start;
484 | if (!sb.toString().equals(SegEvaluation.testSentence)) {
485 | System.out.println(name + " initialization error, sentence: " + SegEvaluation.testSentence + ", segmented as: " + terms);
486 | } else {
487 | this.init = true;
488 | System.out.println(name + " initialization finished, took " + cost + " ms");
489 | }
490 | } catch (InstantiationException e) {
491 | e.printStackTrace();
492 | } catch (IllegalAccessException e) {
493 | e.printStackTrace();
494 | }
495 | }
496 | }
497 |
498 | class Config {
499 | public static final String zipFileName = "data/seg_data_big.zip";
500 | public static final String url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip";
501 | public String rFileName = "data/seg_data_big.txt";
502 | public String wFilePath = "";
503 | public boolean writeResult = false;
504 | public int maxLineCount = -1;
505 | public List<String> segmentorNames = new ArrayList<>(Arrays.asList("HanLP", "Jieba", "Thulac", "mynlp"));
506 | }
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/AnsjImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.ansj.domain.Result;
6 | import org.ansj.library.AmbiguityLibrary;
7 | import org.ansj.library.DicLibrary;
8 | import org.ansj.splitWord.analysis.ToAnalysis;
9 | import org.ansj.util.MyStaticValue;
10 |
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | /**
14 | * Ansj segmenter
15 | *
16 | * @date 2019/3/4
17 | * @author tiandi
18 | */
19 | public class AnsjImpl extends Seg {
20 | static {
21 | // enabling these custom dictionaries slows segmentation down by about 25%
22 | // MyStaticValue.ENV.put(DicLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/default.dic").getPath());
23 | // MyStaticValue.ENV.put(AmbiguityLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/ambiguity.dic").getPath());
24 | }
25 |
26 | @Override
27 | public List<Term> segment(String sentence) {
28 | Result result = ToAnalysis.parse(sentence);
29 | List<Term> terms = new ArrayList<>();
30 | for (org.ansj.domain.Term term : result) {
31 | terms.add(new Term(term.getName()));
32 | }
33 | return terms;
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/FNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import indi.nlp.Seg;
4 |
5 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/HanLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import indi.tiandi.nlp.Seg;
5 | import indi.tiandi.nlp.Term;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | /**
10 | * HanLP segmenter
11 | *
12 | * @date 2019/3/4
13 | * @author tiandi
14 | */
15 | public class HanLPImpl extends Seg {
16 |
17 | @Override
18 | public List<Term> segment(String sentence) {
19 | // List<com.hankcs.hanlp.seg.common.Term> segment = BasicTokenizer.segment(sentence);
20 | List<com.hankcs.hanlp.seg.common.Term> segment = HanLP.segment(sentence);
21 | List<Term> terms = new ArrayList<>();
22 | for (com.hankcs.hanlp.seg.common.Term term : segment) {
23 | terms.add(new Term(term.word));
24 | }
25 | return terms;
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/JcsegImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.lionsoul.jcseg.tokenizer.core.*;
6 |
7 | import java.io.IOException;
8 | import java.io.StringReader;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | /**
12 | * Jcseg segmenter
13 | *
14 | * @date 2019/3/4
15 | * @author tiandi
16 | */
17 | public class JcsegImpl extends Seg {
18 |
19 | private final static JcsegTaskConfig config = new JcsegTaskConfig(true);
20 | private final static ADictionary dic = DictionaryFactory.createSingletonDictionary(config);
21 | private static ISegment seg;
22 |
23 | static {
24 | try {
25 | seg = SegmentFactory.createJcseg(JcsegTaskConfig.COMPLEX_MODE, new Object[]{config, dic});
26 | }
27 | catch (JcsegException e) {
28 | e.printStackTrace();
29 | }
30 | }
31 |
32 | @Override
33 | public List<Term> segment(String sentence) {
34 | List<Term> terms = new ArrayList<>();
35 | try {
36 | seg.reset(new StringReader(sentence));
37 | IWord word = null;
38 | while ((word = seg.next()) != null) {
39 | terms.add(new Term(word.getValue()));
40 | }
41 | }
42 | catch (IOException e) {
43 | e.printStackTrace();
44 | }
45 | return terms;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/JiebaAnalysisImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.huaban.analysis.jieba.JiebaSegmenter;
4 | import indi.tiandi.nlp.Seg;
5 | import indi.tiandi.nlp.Term;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | /**
11 | * Jieba segmenter
12 | *
13 | * @author tiandi
14 | * @date 2019/3/7
15 | */
16 | public class JiebaAnalysisImpl extends Seg {
17 | private static final JiebaSegmenter seg = new JiebaSegmenter();
18 |
19 | @Override
20 | public List<Term> segment(String sentence) {
21 | List<String> strings = seg.sentenceProcess(sentence);
22 | List<Term> terms = new ArrayList<>();
23 | for (String string : strings) {
24 | terms.add(new Term(string));
25 | }
26 | return terms;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/MMSeg4jImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.chenlb.mmseg4j.ComplexSeg;
4 | import com.chenlb.mmseg4j.Dictionary;
5 | import com.chenlb.mmseg4j.MMSeg;
6 | import com.chenlb.mmseg4j.Word;
7 | import indi.tiandi.nlp.Seg;
8 | import indi.tiandi.nlp.Term;
9 |
10 | import java.io.IOException;
11 | import java.io.StringReader;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | /**
15 | * MMSeg segmenter
16 | *
17 | * @date 2019/3/4
18 | * @author tiandi
19 | */
20 | public class MMSeg4jImpl extends Seg {
21 | private static final Dictionary dic = Dictionary.getInstance();
22 | private static final ComplexSeg seg = new ComplexSeg(dic);
23 | private static final MMSeg mmSeg = new MMSeg(new StringReader(""), seg);
24 |
25 | @Override
26 | public List<Term> segment(String sentence) {
27 | mmSeg.reset(new StringReader(sentence));
28 | Word word = null;
29 | List<Term> terms = new ArrayList<>();
30 | try {
31 | while ((word = mmSeg.next()) != null) {
32 | if (word != null) {
33 | terms.add(new Term(word.getString()));
34 | }
35 | }
36 | }
37 | catch (IOException e) {
38 | System.out.println(sentence);
39 | e.printStackTrace();
40 | }
41 | return terms;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/MYNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.mayabot.nlp.segment.Lexer;
4 | import com.mayabot.nlp.segment.Lexers;
5 | import com.mayabot.nlp.segment.Sentence;
6 | import com.mayabot.nlp.segment.WordTerm;
7 | import indi.tiandi.nlp.Seg;
8 | import indi.tiandi.nlp.Term;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | /**
13 | * MYNLP segmentation
14 | *
15 | * @date 2019/3/4
16 | * @author tiandi
17 | */
18 | public class MYNLPImpl extends Seg {
19 | Lexer lexer = Lexers.builder().basic().core().keepOriCharOutput().build();
20 |
21 | @Override
22 | public List<Term> segment(String sentence) {
23 | Sentence result = lexer.scan(sentence);
24 | List<Term> terms = new ArrayList<>();
25 | for (WordTerm term : result.toList()) {
26 | terms.add(new Term(term.getWord()));
27 | }
28 | return terms;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/PaodingImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import indi.nlp.Seg;
4 | //import indi.nlp.Term;
5 | //import org.apache.lucene.analysis.TokenStream;
6 | //
7 | //import java.io.IOException;
8 | //import java.util.ArrayList;
9 | //import java.util.List;
10 | //
11 | ///**
12 | // * Paoding segmentation
13 | // *
14 | // * @author tiandi
15 | // * @date 2019/3/4
16 | // */
17 | //public class PaodingImpl implements Seg {
18 | // public static PaodingAnalyzer paodingAnalyzer = new PaodingAnalyzer();
19 | //
20 | // @Override
21 | // public List<Term> segment(String sentence) {
22 | // List<Term> terms = new ArrayList<>();
23 | // try {
24 | // TokenStream tokenStream = paodingAnalyzer.tokenStream("", sentence);
25 | // System.out.println(tokenStream.toString());
26 | // } catch (IOException e) {
27 | // e.printStackTrace();
28 | // }
29 | // return terms;
30 | // }
31 | //
32 | // public static void main(String[] args) {
33 | // new PaodingImpl().segment("我是中国人");
34 | // }
35 | //}
36 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/StanfordCoreNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import edu.stanford.nlp.ling.CoreLabel;
4 | //import edu.stanford.nlp.pipeline.CoreDocument;
5 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP;
6 | //import indi.tiandi.nlp.Seg;
7 | //import indi.tiandi.nlp.Term;
8 | //
9 | //import java.util.ArrayList;
10 | //import java.util.List;
11 | //import java.util.Properties;
12 | //
13 | ///**
14 | // * Stanford CoreNLP segmentation
15 | // *
16 | // * @date 2019/3/4
17 | // * @author tiandi
18 | // */
19 | //public class StanfordCoreNLPImpl extends Seg {
20 | // public static StanfordCoreNLP stanfordCoreNLP;
21 | //
22 | // static {
23 | // try {
24 | // Properties props = new Properties();
25 | // props.load(StanfordCoreNLPImpl.class.getClassLoader().getResourceAsStream("StanfordCoreNLP-chinese.properties"));
26 | // props.setProperty("annotators", "tokenize,ssplit,pos");
27 | // stanfordCoreNLP = new StanfordCoreNLP(props);
28 | // } catch (Exception e) {
29 | // e.printStackTrace();
30 | // }
31 | // }
32 | //
33 | // @Override
34 | // public List<Term> segment(String sentence) {
35 | // CoreDocument exampleDocument = new CoreDocument(sentence);
36 | // // annotate document
37 | // stanfordCoreNLP.annotate(exampleDocument);
38 | // // access tokens from a CoreDocument
39 | // // a token is represented by a CoreLabel
40 | // List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens();
41 | // // collect each token's word and POS tag into Terms
42 | // List<Term> terms = new ArrayList<>();
43 | // for (CoreLabel token : firstSentenceTokens) {
44 | // terms.add(new Term(token.word(), token.tag()));
45 | // }
46 | // return terms;
47 | // }
48 | //}
49 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/ThulacImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import io.github.yizhiru.thulac4j.Segmenter;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | /**
10 | * THULAC segmentation (Tsinghua)
11 | *
12 | * @date 2019/3/4
13 | * @author tiandi
14 | */
15 | public class ThulacImpl extends Seg {
16 | @Override
17 | public List<Term> segment(String sentence) {
18 | List<String> segment = Segmenter.segment(sentence);
19 | List<Term> terms = new ArrayList<>();
20 | for (String s : segment) {
21 | terms.add(new Term(s));
22 | }
23 | return terms;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/WordImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.apdplat.word.segmentation.Segmentation;
6 | import org.apdplat.word.segmentation.SegmentationAlgorithm;
7 | import org.apdplat.word.segmentation.SegmentationFactory;
8 | import org.apdplat.word.segmentation.Word;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | /**
13 | * word segmentation (apdplat word)
14 | *
15 | * @date 2019/3/4
16 | * @author tiandi
17 | */
18 | public class WordImpl extends Seg {
19 | public static final Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
20 |
21 | @Override
22 | public List<Term> segment(String sentence) {
23 | List<Word> words = segmentation.seg(sentence);
24 | List<Term> terms = new ArrayList<>();
25 | for (Word word : words) {
26 | terms.add(new Term(word.getText()));
27 | }
28 |
29 | return terms;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/tool/HttpRequest.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.tool;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.net.URL;
8 | import java.net.URLConnection;
9 |
10 | /**
11 | * HTTP file download
12 | *
13 | * @author tiandi
14 | * @date 2021/2/18
15 | */
16 | public class HttpRequest {
17 | /**
18 | * Download a file from a network URL
19 | *
20 | * @param httpUrl the URL to download from
21 | * @param saveFile path and file name to save to
22 | * @throws IOException
23 | */
24 | public static boolean download(String httpUrl, String saveFile) throws IOException {
25 | // download the remote file
26 | int byteread;
27 | URL url = new URL(httpUrl);
28 | URLConnection conn = url.openConnection();
29 | File file = new File(saveFile);
30 | if (!file.getParentFile().exists()) {
31 | file.getParentFile().mkdirs();
32 | }
33 | try (InputStream inStream = conn.getInputStream();
34 | FileOutputStream fs = new FileOutputStream(saveFile)) {
35 | // copy in 1 KB chunks until EOF
36 | byte[] buffer = new byte[1024];
37 | while ((byteread = inStream.read(buffer)) != -1) {
38 | fs.write(buffer, 0, byteread);
39 | }
40 | return true;
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/tool/ZipUtil.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.tool;
2 |
3 | import java.io.*;
4 | import java.nio.charset.Charset;
5 | import java.nio.file.Paths;
6 | import java.util.Enumeration;
7 | import java.util.zip.ZipEntry;
8 | import java.util.zip.ZipFile;
9 |
10 | /**
11 | * Zip compression and extraction (https://www.bbsmax.com/A/x9J2bZLMJ6/)
12 | * For more archive formats, see https://www.bookstack.cn/read/hutool/a56da94bbb16617b.md
13 | *
14 | * @author tiandi
15 | * @date 2021/2/1
16 | */
17 | public class ZipUtil {
18 | public static boolean unZip(File zipFile, String descDir) throws IOException {
19 | // specify the encoding, otherwise archives with Chinese entry names cannot be read
20 | // try-with-resources closes the ZipFile; if it stays open the zip file cannot be deleted later
21 | try (ZipFile zip = new ZipFile(zipFile, Charset.forName("gbk"))) {
22 | for (Enumeration<? extends ZipEntry> entries = zip.entries(); entries.hasMoreElements(); ) {
23 | ZipEntry entry = entries.nextElement();
24 | String zipEntryName = entry.getName();
25 | File file = Paths.get(descDir, zipEntryName).toFile();
26 | File dir = file;
27 | if (!zipEntryName.endsWith("/") && !zipEntryName.endsWith("\\")) {
28 | // for regular files, create the parent directory
29 | dir = file.getParentFile();
30 | }
31 | if (!dir.exists()) {
32 | dir.mkdirs();
33 | }
34 | if (file.isDirectory()) {
35 | continue;
36 | }
37 | try (InputStream in = zip.getInputStream(entry);
38 | OutputStream out = new FileOutputStream(file)) {
39 | byte[] buf = new byte[2048];
40 | int len;
41 | while ((len = in.read(buf)) > 0) {
42 | out.write(buf, 0, len);
43 | }
44 | }
45 | }
46 | }
47 | return true;
48 | }
49 | }
50 |
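Together, HttpRequest and ZipUtil implement the fetch-and-extract step for the evaluation corpus. A hedged usage sketch (the URL and paths are placeholders, not the project's real ones):

    // hypothetical caller; both helpers throw IOException on failure
    File zip = new File("data/seg_data_big.zip");
    if (HttpRequest.download("https://example.com/seg_data_big.zip", zip.getPath())) {
        ZipUtil.unZip(zip, "data");
    }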
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/ambiguity.dic:
--------------------------------------------------------------------------------
1 | 习近平 nr
2 | 李民 nr 工作 vn
3 | 三个 m 和尚 n
4 | 的确 d 定 v 不 v
5 | 大 a 和尚 n
6 | 张三 nr 和 c
7 | 动漫 n 游戏 n
8 | 邓颖超 nr 生前 t
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/regex.dic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/src/main/resources/ansj/library/regex.dic
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/stop.dic:
--------------------------------------------------------------------------------
1 | ?
2 | :
3 | .
4 | ,
5 | is
6 | a
7 | #
8 | v nature
9 | .*了 regex
--------------------------------------------------------------------------------
/java/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 | <encoder>
5 | <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
6 | </encoder>
7 | </appender>
8 | <root level="INFO">
9 | <appender-ref ref="STDOUT"/>
10 | </root>
11 | </configuration>
--------------------------------------------------------------------------------
/python/indi.tiandi.nlp.evaluation/SegEvaluation.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import time
5 | import zipfile
6 |
7 | import jieba
8 | import jieba_fast
9 | import pkuseg
10 | import pynlpir
11 | import requests
12 | import thulac
13 | import wget
14 | from snownlp import SnowNLP
15 |
16 |
17 | class Seg:
18 | def __init__(self):
19 | pass
20 |
21 | def segment(self, sentence):
22 | pass
23 |
24 |
25 | # CAS NLPIR segmentation; the Python wrapper is buggy and its output differs from the demo page
26 | class pynlpir_impl:
27 | def __init__(self):
28 | # automatically refresh the license from code
29 | pynlpir_impl.update_pynlpir_license()
30 | pynlpir.open()
31 |
32 | @staticmethod
33 | def update_pynlpir_license():
34 | # download location for the pynlpir segmentation license
35 | # https://github.com/NLPIR-team/NLPIR/tree/master/License
36 | url = 'https://raw.githubusercontent.com/NLPIR-team/NLPIR/master/License/license%20for%20a%20month/NLPIR-ICTCLAS%E5%88%86%E8%AF%8D%E7%B3%BB%E7%BB%9F%E6%8E%88%E6%9D%83/NLPIR.user'
37 | r = requests.get(url)
38 | # copy it into the corresponding pynlpir directory
39 | with open(os.path.join(pynlpir.__path__[0], "Data", "NLPIR.user"), "wb") as code:
40 | code.write(r.content)
41 |
42 | def segment(self, sentence):
43 | return pynlpir.segment(sentence, pos_tagging=False)
44 |
45 |
46 | # PKU segmentation (pkuseg)
47 | class pkuseg_impl(Seg):
48 | def __init__(self):
49 | self.pku_seg = pkuseg.pkuseg()
50 |
51 | def segment(self, sentence):
52 | return self.pku_seg.cut(sentence)
53 |
54 |
55 | # Jieba segmentation
56 | class jieba_impl(Seg):
57 | def segment(self, sentence):
58 | return jieba.lcut(sentence)
59 |
60 |
61 | # jieba_fast segmentation
62 | class jieba_fast_impl(Seg):
63 | def segment(self, sentence):
64 | return jieba_fast.lcut(sentence)
65 |
66 |
67 | # SnowNLP segmentation
68 | class snownlp_impl(Seg):
69 | def segment(self, sentence):
70 | return SnowNLP(sentence).words
71 |
72 |
73 | # THULAC segmentation (Tsinghua)
74 | class thulac_impl(Seg):
75 | def __init__(self):
76 | self.thu1 = thulac.thulac(seg_only=True)  # default mode, segmentation only
77 |
78 | def segment(self, sentence):
79 | return self.thu1.cut(sentence, text=True).split()  # segment a single sentence
80 |
81 |
82 | # HIT LTP segmentation
83 | class pyltp_impl(Seg):
84 |
85 | def __init__(self):
86 | ltp_data_dir = '/home/work/tiandi/ltp.model/ltp_data'  # path of the LTP model directory; the cws model file is large and must be downloaded manually from http://model.scir.yunfutech.com/model/ltp_data_v3.4.0.zip
87 | cws_model_path = os.path.join(ltp_data_dir, 'cws.model')  # path of the segmentation model, file name `cws.model`
88 | from pyltp import Segmentor
89 | self.segmentor = Segmentor()
90 | self.segmentor.load(cws_model_path)
91 |
92 | def segment(self, sentence):
93 | return self.segmentor.segment(sentence)
94 |
95 |
96 | test_sentence = "这是一个测试句子"
97 |
98 |
99 | # segmenter evaluator
100 | class Evaluator:
101 | def __init__(self, seg_tool, name=None):
102 | '''
103 | :param seg_tool: segmenter class
104 | :param name: segmenter name (could be refactored into the segmenter itself)
105 | '''
106 | if not name:
107 | name = seg_tool.__name__
108 | self.name = name
109 | time_start = time.time()
110 | print("%s 初始化开始" % self.name)
111 | self.seg = seg_tool()
112 | self.time = 0
113 | self.init = False
114 | result = self.seg.segment(test_sentence)
115 | # print(result)
116 | time_end = time.time()
117 | time_cost = (time_end - time_start) * 1000
118 | if "".join(result) != test_sentence:
119 | print("%s 初始化错误,句子:%s,分词结果:%s" % (self.name, test_sentence, result))
120 | else:
121 | self.init = True
122 | print("%s 初始化结束,耗时:%d ms" % (self.name, time_cost))
123 |
124 |
125 | # tag segmented words with the BMES scheme
126 | def get_ner(terms):
127 | temp_gold = []
128 | for term in terms:
129 | if len(term) == 1:
130 | temp_gold.append("S-Null")
131 | else:
132 | term_item = ["M-Null"] * len(term)
133 | term_item[0] = "B-Null"
134 | term_item[-1] = "E-Null"
135 | temp_gold.extend(term_item)
136 | return temp_gold
137 |
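# For illustration, on a hypothetical three-word segmentation the tagging above gives:
#   get_ner(["他", "来到", "北京"])
#   -> ["S-Null", "B-Null", "E-Null", "B-Null", "E-Null"]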
138 |
139 | # evaluate each segmenter
140 | def evaluate(input, output, max_line_count, include):
141 | # segmentation corpus location
142 | if len(input) == 0:
143 | # project root directory
144 | root = os.getcwd()[:os.getcwd().rindex("python")]
145 | data_dir = os.path.join(root, "data")
146 | input = os.path.join(data_dir, "seg_data_big.txt")
147 | download_seg_file = True
148 | if not os.path.exists(input):
149 | temp_zip_file = os.path.join(data_dir, "seg_data_big.zip")
150 | if os.path.exists(temp_zip_file):
151 | try:
152 | zipfile.ZipFile(temp_zip_file).extractall(data_dir)
153 | download_seg_file = False
154 | except Exception as e:
155 | os.remove(temp_zip_file)
156 | if download_seg_file:
157 | url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip"
158 | print("从 %s 下载文件,如果下载较慢,亦可手动下载,保存到 %s 即可" % (url, temp_zip_file))
159 | try:
160 | if not os.path.exists(os.path.dirname(temp_zip_file)):
161 | os.makedirs(os.path.dirname(temp_zip_file))
162 | wget.download(url, out=temp_zip_file)
163 | print("下载完成")
164 | zipfile.ZipFile(temp_zip_file).extractall(data_dir)
165 | except Exception as e:
166 | print("下载或解压错误:%s" % e)
167 | sys.exit(1)
168 |
169 | if not os.path.exists(input):
170 | print("未从本地到文件:%s" % input)
171 | sys.exit(1)
172 | print("读入分词文件地址:" + input)
173 | write_result = False
174 | if len(output) > 0:
175 | print("分词结果写入地址:" + output)
176 | write_result = True
177 | max_line_count = int(max_line_count)
178 | if max_line_count > 0:
179 | print("最大读取行数:" + str(max_line_count))
180 |
181 | evaluators = []
182 | for name in include.split(","):
183 | evaluators.append(Evaluator(globals()[name + "_impl"], name))
184 | # evaluators.append(Evaluator(pynlpir_impl))
185 | # evaluators.append(Evaluator(pkuseg_impl))
186 | # evaluators.append(Evaluator(jieba_impl))
187 | # evaluators.append(Evaluator(snownlp_impl))
188 | # evaluators.append(Evaluator(thulac_impl))
189 | # evaluators.append(Evaluator(pyltp_impl))
190 | time_start = time.time()
191 | print("读入分词文件开始")
192 | with open(input, encoding="utf-8") as f:
193 | lines = f.readlines()
194 | time_end = time.time()
195 | time_cost = (time_end - time_start) * 1000
196 | print("读取文件结束,耗时:%d ms" % (time_cost))
197 |
198 | gold = []
199 | test = []
200 | char_count = 0
201 | # max_line_count = 100000
202 | if max_line_count <= 0:
203 | max_line_count = len(lines)
204 | line_count = 0
205 | for line in lines:
206 | gold.append(line.strip().split())
207 | test.append("".join(gold[-1]))
208 | char_count += len(test[-1])
209 | line_count += 1
210 | if line_count >= max_line_count:
211 | break
212 | print("总行数:%d\t总字符数:%d" % (line_count, char_count))
213 | calcScore = True
214 | for item in evaluators:
215 | print()
216 | print("%s 评测开始" % item.name)
217 | if not item.init:
218 | print("%s 初始化错误,跳过" % (item.name))
219 | continue
220 | if write_result:
221 | file = open(os.path.join(output, item.name), mode="w", encoding="utf-8")
222 | time_start = time.time()
223 | right_num = 0
224 | gold_num = 0
225 | predict_num = 0
226 |
227 | for i in range(line_count):
228 | line = test[i]
229 | predict = item.seg.segment(line)
230 |
231 | if calcScore:
232 | # temp_gold = get_ner(gold[i])
233 | # temp_predict = get_ner(predict)
234 | if len("".join(predict)) != len(line):
235 | print(item.name + "\t" + line + "\t")
236 | continue
237 | # accuracy, precision, recall, f_measure, right_num, golden_num, predict_num = get_ner_fmeasure([temp_gold], [temp_predict])
238 | right_num_local, golden_num_local, predict_num_local = calc_score(gold[i], predict)
239 |
240 | right_num += right_num_local
241 | gold_num += golden_num_local
242 | predict_num += predict_num_local
243 |
244 | # if (right_num != right_num_local or golden_num != golden_num_local or predict_num != predict_num_local):
245 | # print("badcase:" % line)
246 |
247 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num)
248 | # print()
249 | if write_result:
250 | file.write(" ".join(predict) + "\n")
251 |
252 | time_end = time.time()
253 | item.time = (time_end - time_start) * 1000
254 | precision = 0
255 | recall = 0
256 | f = 0
257 | if write_result:
258 | file.close()
259 | if predict_num != 0:
260 | precision = right_num * 1.0 / predict_num
261 | if gold_num != 0:
262 | recall = right_num * 1.0 / gold_num
263 | if precision + recall > 0:
264 | f = 2 * precision * recall / (precision + recall)
265 |
266 | print("precision:%f \t recall:%f \t f1:%f" % (precision, recall, f))
267 | if item.time == 0:
268 | print("耗时太少,速度无法评估")
269 | else:
270 | print("耗时:%d ms,\t速度:%f 字符/毫秒" % (item.time, char_count * 1.0 / item.time))
271 |
272 |
273 | def update(offset, index, terms):
274 | offset += len(terms[index])
275 | index += 1
276 | return offset, index
277 |
278 |
279 | def calc_score(gold, predict):
280 | gold_offset = 0
281 | predict_offset = 0
282 |
283 | gold_term_index = 0
284 | predict_term_index = 0
285 |
286 | right = 0
287 | total = len(gold)
288 | right_and_wrong = len(predict)
289 | while (gold_term_index < len(gold) or predict_term_index < len(predict)):
290 | if gold_offset == predict_offset:
291 | if gold[gold_term_index] == predict[predict_term_index]:
292 | right += 1
293 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold)
294 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict)
295 | elif gold_offset < predict_offset:
296 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold)
297 | else:
298 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict)
299 | return right, total, right_and_wrong
300 |
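# Worked example of calc_score (hypothetical input):
#   gold    = ["他", "来到", "北京"]      # word starts at offsets 0, 1, 3
#   predict = ["他", "来", "到", "北京"]  # word starts at offsets 0, 1, 2, 3
# only "他" and "北京" match on both boundaries, so
#   calc_score(gold, predict) == (2, 3, 4)  # right, gold total, predict total
# giving precision 2/4 and recall 2/3.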
301 |
302 | if __name__ == '__main__':
303 | parser = argparse.ArgumentParser(description='Chinese word segmentation comparison')
304 | parser.add_argument('-i',
305 | help='file to segment, default using the file in chinese-segmentation-evaluation/data/seg_data_big.txt',
306 | default="")
307 | parser.add_argument('-o', help='path to save the result, default is not saving', default="")
308 | parser.add_argument("-n", help='maximum number of read rows, default reading all', default="-1")
309 | parser.add_argument("-c", help='segmentor to evaluate', default="pkuseg,jieba_fast,thulac")
310 | args = parser.parse_args()
311 | evaluate(args.i, args.o, args.n, args.c)
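# Illustrative invocations (paths are placeholders):
#   python SegEvaluation.py -n 10000 -c pkuseg,jieba,thulac
#   python SegEvaluation.py -i ../data/seg_data_big.txt -o ./result -c jieba_fast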
312 |
--------------------------------------------------------------------------------
/python/indi.tiandi.nlp.evaluation/metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Jie
3 | # @Date: 2017-02-16 09:53:19
4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com
5 | # @Last Modified time: 2017-12-19 15:23:12
6 |
7 | # from operator import add
8 | #
9 | from __future__ import print_function
10 |
11 |
12 | ## input as sentence level labels
13 | def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"):
14 | sent_num = len(golden_lists)
15 | golden_full = []
16 | predict_full = []
17 | right_full = []
18 | right_tag = 0
19 | all_tag = 0
20 | for idx in range(0, sent_num):
21 | # word_list = sentence_lists[idx]
22 | golden_list = golden_lists[idx]
23 | predict_list = predict_lists[idx]
24 | for idy in range(len(golden_list)):
25 | if golden_list[idy] == predict_list[idy]:
26 | right_tag += 1
27 | all_tag += len(golden_list)
28 | if label_type == "BMES":
29 | gold_matrix = get_ner_BMES(golden_list)
30 | pred_matrix = get_ner_BMES(predict_list)
31 | else:
32 | gold_matrix = get_ner_BIO(golden_list)
33 | pred_matrix = get_ner_BIO(predict_list)
34 | # print "gold", gold_matrix
35 | # print "pred", pred_matrix
36 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix)))
37 | golden_full += gold_matrix
38 | predict_full += pred_matrix
39 | right_full += right_ner
40 | right_num = len(right_full)
41 | golden_num = len(golden_full)
42 | predict_num = len(predict_full)
43 | if predict_num == 0:
44 | precision = -1
45 | else:
46 | precision = (right_num + 0.0) / predict_num
47 | if golden_num == 0:
48 | recall = -1
49 | else:
50 | recall = (right_num + 0.0) / golden_num
51 | if (precision == -1) or (recall == -1) or (precision + recall) <= 0.:
52 | f_measure = -1
53 | else:
54 | f_measure = 2 * precision * recall / (precision + recall)
55 | accuracy = (right_tag + 0.0) / all_tag
56 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy
57 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num)
58 | return accuracy, precision, recall, f_measure, right_num, golden_num, predict_num
59 |
60 |
61 | def reverse_style(input_string):
62 | target_position = input_string.index('[')
63 | input_len = len(input_string)
64 | output_string = input_string[target_position:input_len] + input_string[0:target_position]
65 | return output_string
66 |
67 |
68 | def get_ner_BMES(label_list):
69 | # list_len = len(word_list)
70 | # assert(list_len == len(label_list)), "word list size unmatch with label list"
71 | list_len = len(label_list)
72 | begin_label = 'B-'
73 | end_label = 'E-'
74 | single_label = 'S-'
75 | whole_tag = ''
76 | index_tag = ''
77 | tag_list = []
78 | stand_matrix = []
79 | for i in range(0, list_len):
80 | # wordlabel = word_list[i]
81 | current_label = label_list[i].upper()
82 | if begin_label in current_label:
83 | if index_tag != '':
84 | tag_list.append(whole_tag + ',' + str(i - 1))
85 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
86 | index_tag = current_label.replace(begin_label, "", 1)
87 |
88 | elif single_label in current_label:
89 | if index_tag != '':
90 | tag_list.append(whole_tag + ',' + str(i - 1))
91 | whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i)
92 | tag_list.append(whole_tag)
93 | whole_tag = ""
94 | index_tag = ""
95 | elif end_label in current_label:
96 | if index_tag != '':
97 | tag_list.append(whole_tag + ',' + str(i))
98 | whole_tag = ''
99 | index_tag = ''
100 | else:
101 | continue
102 | if (whole_tag != '') & (index_tag != ''):
103 | tag_list.append(whole_tag)
104 | tag_list_len = len(tag_list)
105 |
106 | for i in range(0, tag_list_len):
107 | if len(tag_list[i]) > 0:
108 | tag_list[i] = tag_list[i] + ']'
109 | insert_list = reverse_style(tag_list[i])
110 | stand_matrix.append(insert_list)
111 | # print stand_matrix
112 | return stand_matrix
113 |
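# For illustration, a hypothetical BMES label sequence decodes to spans as follows:
#   get_ner_BMES(["B-Null", "E-Null", "S-Null"]) -> ["[0,1]NULL", "[2]NULL"]
# i.e. a two-character span at positions 0-1 and a single-character span at 2.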
114 |
115 | def get_ner_BIO(label_list):
116 | # list_len = len(word_list)
117 | # assert(list_len == len(label_list)), "word list size unmatch with label list"
118 | list_len = len(label_list)
119 | begin_label = 'B-'
120 | inside_label = 'I-'
121 | whole_tag = ''
122 | index_tag = ''
123 | tag_list = []
124 | stand_matrix = []
125 | for i in range(0, list_len):
126 | # wordlabel = word_list[i]
127 | current_label = label_list[i].upper()
128 | if begin_label in current_label:
129 | if index_tag == '':
130 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
131 | index_tag = current_label.replace(begin_label, "", 1)
132 | else:
133 | tag_list.append(whole_tag + ',' + str(i - 1))
134 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
135 | index_tag = current_label.replace(begin_label, "", 1)
136 |
137 | elif inside_label in current_label:
138 | if current_label.replace(inside_label, "", 1) == index_tag:
139 | whole_tag = whole_tag
140 | else:
141 | if (whole_tag != '') & (index_tag != ''):
142 | tag_list.append(whole_tag + ',' + str(i - 1))
143 | whole_tag = ''
144 | index_tag = ''
145 | else:
146 | if (whole_tag != '') & (index_tag != ''):
147 | tag_list.append(whole_tag + ',' + str(i - 1))
148 | whole_tag = ''
149 | index_tag = ''
150 |
151 | if (whole_tag != '') & (index_tag != ''):
152 | tag_list.append(whole_tag)
153 | tag_list_len = len(tag_list)
154 |
155 | for i in range(0, tag_list_len):
156 | if len(tag_list[i]) > 0:
157 | tag_list[i] = tag_list[i] + ']'
158 | insert_list = reverse_style(tag_list[i])
159 | stand_matrix.append(insert_list)
160 | return stand_matrix
161 |
162 |
163 | def readSentence(input_file):
164 | in_lines = open(input_file, 'r', encoding="utf-8").readlines()
165 | sentences = []
166 | labels = []
167 | sentence = []
168 | label = []
169 | for line in in_lines:
170 | if len(line) < 2:
171 | sentences.append(sentence)
172 | labels.append(label)
173 | sentence = []
174 | label = []
175 | else:
176 | pair = line.strip('\n').split(' ')
177 | sentence.append(pair[0])
178 | label.append(pair[-1])
179 | return sentences, labels
180 |
181 |
182 | def readTwoLabelSentence(input_file, pred_col=-1):
183 | in_lines = open(input_file, 'r', encoding="utf-8").readlines()
184 | sentences = []
185 | predict_labels = []
186 | golden_labels = []
187 | sentence = []
188 | predict_label = []
189 | golden_label = []
190 | for line in in_lines:
191 | if "##score##" in line:
192 | continue
193 | if len(line) < 2:
194 | sentences.append(sentence)
195 | golden_labels.append(golden_label)
196 | predict_labels.append(predict_label)
197 | sentence = []
198 | golden_label = []
199 | predict_label = []
200 | else:
201 | pair = line.strip('\n').split()
202 | sentence.append(pair[0])
203 | golden_label.append(pair[1])
204 | predict_label.append(pair[pred_col])
205 |
206 | return sentences, golden_labels, predict_labels
207 |
208 |
209 | def fmeasure_from_file(golden_file, predict_file, label_type="BMES"):
210 | print("Get f measure from file:", golden_file, predict_file)
211 | print("Label format:", label_type)
212 | golden_sent, golden_labels = readSentence(golden_file)
213 | predict_sent, predict_labels = readSentence(predict_file)
214 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)[:4]
215 | print("P:%s, R:%s, F:%s" % (P, R, F))
216 |
217 |
218 | def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1):
219 | sent, golden_labels, predict_labels = readTwoLabelSentence(twolabel_file, pred_col)
220 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)[:4]
221 | print("P:%s, R:%s, F:%s" % (P, R, F))
222 |
223 |
224 | if __name__ == '__main__':
225 | # print "sys:",len(sys.argv)
226 | gold = [["B-Null", "M-Null"]]
227 | predict = [["B-Null", "M-Null", "E-Null", "S-Null"]]
228 | print(get_ner_fmeasure(gold, predict))
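# expected: (1.0, 0.0, 0.0, -1, 0, 1, 2) -- the tags agree position-wise
# (accuracy 1.0), but gold's only span [0]NULL matches neither predicted span,
# so precision and recall are 0 and f is reported as -1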
229 |
230 | # sys.argv.append("result")
231 | # if len(sys.argv) == 3:
232 | # fmeasure_from_singlefile(sys.argv[1], "BMES", int(sys.argv[2]))
233 | # else:
234 | # fmeasure_from_singlefile(sys.argv[1], "BMES")
235 |
--------------------------------------------------------------------------------
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | jieba_fast
3 | pkuseg
4 | pynlpir
5 | thulac
6 | snownlp
7 | requests
8 | pyltp
9 | wget
--------------------------------------------------------------------------------