├── .gitignore
├── LICENSE
├── README.md
├── java
│   ├── build.gradle
│   ├── gradle
│   │   └── wrapper
│   │       ├── gradle-wrapper.jar
│   │       └── gradle-wrapper.properties
│   ├── gradlew
│   ├── gradlew.bat
│   ├── settings.gradle
│   └── src
│       └── main
│           ├── java
│           │   └── indi
│           │       └── tiandi
│           │           └── nlp
│           │               ├── Seg.java
│           │               ├── Sentence.java
│           │               ├── Term.java
│           │               ├── evaluation
│           │               │   ├── SegEvaluation.java
│           │               │   └── impl
│           │               │       ├── AnsjImpl.java
│           │               │       ├── FNLPImpl.java
│           │               │       ├── HanLPImpl.java
│           │               │       ├── JcsegImpl.java
│           │               │       ├── JiebaAnalysisImpl.java
│           │               │       ├── MMSeg4jImpl.java
│           │               │       ├── MYNLPImpl.java
│           │               │       ├── PaodingImpl.java
│           │               │       ├── StanfordCoreNLPImpl.java
│           │               │       ├── ThulacImpl.java
│           │               │       └── WordImpl.java
│           │               └── tool
│           │                   ├── HttpRequest.java
│           │                   └── ZipUtil.java
│           └── resources
│               ├── ansj
│               │   └── library
│               │       ├── ambiguity.dic
│               │       ├── default.dic
│               │       ├── regex.dic
│               │       ├── stop.dic
│               │       └── synonyms.dic
│               └── logback.xml
└── python
    ├── indi.tiandi.nlp.evaluation
    │   ├── SegEvaluation.py
    │   └── metric.py
    └── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 | ### Java template
108 | # Compiled class file
109 | *.class
110 |
111 | # Log file
112 | *.log
113 |
114 | # BlueJ files
115 | *.ctxt
116 |
117 | # Mobile Tools for Java (J2ME)
118 | .mtj.tmp/
119 |
120 | # Package Files #
121 | *.jar
122 | *.war
123 | *.nar
124 | *.ear
125 | *.zip
126 | *.tar.gz
127 | *.rar
128 |
129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
130 | hs_err_pid*
131 |
132 | !/java/gradle/wrapper/gradle-wrapper.jar
133 | ### Example user template template
134 | ### Example user template
135 |
136 | # IntelliJ project files
137 | .idea
138 | *.iml
139 | .gradle
140 | out
141 | gen
142 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Contents
2 | - java
3 | - python
4 | - Summary
5 | ## java
6 | #### Requirement
7 | Java 8
8 |
9 | #### Steps
10 |
11 | 1. `git clone https://github.com/tiandiweizun/nlp-evaluation.git`
12 | 2. `cd nlp-evaluation/java`
13 | 3. (Windows) `.\gradlew.bat build` (Linux) `./gradlew build`
14 | 4. `java -Dfile.encoding=utf-8 -jar build/libs/nlp-evaluation-java-1.0.1.jar`
15 |
16 |
17 | #### Notes
18 | 1. java -jar nlp-evaluation-java-1.0.1.jar accepts four options; run java -jar nlp-evaluation-java-1.0.1.jar -h to list them
19 |    -i  input file to segment, default data/seg_data_big.txt; one sentence per line with words separated by spaces; you can point it at your own test set
20 |    -o  path to store the segmentation results; by default nothing is stored
21 |    -n  maximum number of lines to read from the input file
22 |    -c  names of the segmenters to evaluate, separated by commas, default HanLP,jieba,thulac,mynlp; example: -c=HanLP
23 |
24 | 2. Since [Stanford CoreNLP](https://github.com/stanfordnlp/CoreNLP) segmentation is mediocre, extremely slow, and ships a huge model, it is excluded from the packaged jar (this does not affect testing it inside the IDE);
25 |    to include it in the package, edit build.gradle and comment out exclude(dependency('edu.stanford.nlp:stanford-corenlp'))
26 | 3. [Word](https://github.com/ysc/word), [Ansj](https://github.com/NLPchina/ansj_seg), [Jcseg](https://github.com/lionsoul2014/jcseg), and [MMSeg4j](https://github.com/chenlb/mmseg4j-core) have a bug (joining the output words does not reproduce the original sentence), so they are commented out in the code and not tested.
27 | 4. All tested dependencies are available in the Maven central repository; segmenters that are not, such as Paoding and the Fudan segmenter, are not tested here
28 |
29 |
30 | #### Results
31 |
32 | Total lines: 2533709  Total characters: 28374490
33 |
34 | |segmentor|precision| recall | f1 | speed(chars/ms)_windows | speed(chars/ms)_linux |
35 | | -- | -- | ------ | --- | --- | --- |
36 | |[HanLP](https://github.com/hankcs/HanLP) | 0.900433 | 0.910614 | 0.905495 | 1034.470451 | 797.596346 |
37 | |[jieba](https://github.com/huaban/jieba-analysis) | 0.852657 | 0.803263 | 0.827223 | 1774.181830 | 980.865943 |
38 | |[thulac](https://github.com/yizhiru/thulac4j) | 0.884405 | 0.901930 | 0.893082 | 1449.749131 | 939.832732 |
39 | |[mynlp](https://github.com/mayabot/mynlp) | 0.901661 | 0.900246 | 0.900953 | 1739.272404 | 1178.930115|
40 |
41 | Repeated runs show that on Linux the first segmenter measured tends to score low, and that thulac's speed on Linux is not particularly stable; at its fastest it is roughly on par with jieba
42 |
43 | #### Developers
44 |
45 | - We recommend opening or importing the java directory in IntelliJ IDEA and copying the data directory into the java directory; SegEvaluation can then be run and debugged directly.
46 | - The Stanford segmenter and the other disabled segmenters can be re-enabled.
47 | - To evaluate a custom segmenter, extend the Seg class, implement the segment method, and add the segmenter to evaluators, as sketched below.
48 |
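As a concrete illustration of the last bullet, here is a minimal sketch of a custom segmenter, using the hypothetical class name MySegImpl (everything else comes from this repository). Because SegEvaluation discovers segmenters by scanning the indi.tiandi.nlp.evaluation.impl package and matching class-name prefixes case-insensitively, placing the class in that package and running the jar with -c=MySegImpl is enough to evaluate it.

```java
package indi.tiandi.nlp.evaluation.impl;

import indi.tiandi.nlp.Seg;
import indi.tiandi.nlp.Term;

import java.util.ArrayList;
import java.util.List;

/**
 * Hypothetical demo segmenter: emits one Term per character.
 * Joining its terms reproduces the input, so Evaluator's init check passes.
 */
public class MySegImpl extends Seg {
    @Override
    public List<Term> segment(String sentence) {
        List<Term> terms = new ArrayList<>();
        for (char c : sentence.toCharArray()) {
            terms.add(new Term(String.valueOf(c)));
        }
        return terms;
    }
}
```
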
49 | ## python
50 |
51 | #### Requirement
52 |
53 | Python 3
54 | see requirements.txt for the other dependencies
55 |
56 | #### Steps
57 |
58 | 1. git clone https://github.com/tiandiweizun/nlp-evaluation.git
59 | 2. cd nlp-evaluation
60 | 3. pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
61 | 4. cd python/indi.tiandi.nlp.evaluation
62 | 5. python3 SegEvaluation.py
63 |
64 | #### Notes
65 | 1. python3 SegEvaluation.py accepts four options; run python3 SegEvaluation.py -h to list them
66 |    -i  input file to segment, default data/seg_data_big.txt; one sentence per line with words separated by spaces; you can point it at your own test set
67 |    -o  path to store the segmentation results; by default nothing is stored
68 |    -n  maximum number of lines to read; recommended, since Python is slow
69 |    -c  names of the segmenters to evaluate, separated by commas, default pkuseg,jieba_fast,thulac
70 |
71 | 2. [pynlpir](https://github.com/tsroten/pynlpir) has a bug (joining the output words does not reproduce the original sentence) and [pyltp](https://github.com/HIT-SCIR/pyltp) is hard to install on Windows, so neither is tested here; the slower segmenters are not tested either
72 |
73 | #### Results
74 |
75 | Total lines: 2533709  Total characters: 28374490
76 |
77 | |segmentor|precision| recall | f1 | speed(chars/ms)_windows | speed(chars/ms)_linux |
78 | | -- | -- | ------ | --- | --- | --- |
79 | |[pkuseg](https://github.com/lancopku/pkuseg-python) | 0.890170 | 0.886405 | 0.888284 | 34.077104 | 19.826954 |
80 | |[jieba](https://github.com/fxsjy/jieba) | 0.855293 | 0.808204 | 0.831082 | 169.651694 | 104.554222 |
81 | |[jieba_fast](https://github.com/deepcs233/jieba_fast) | 0.855299 | 0.808182 | 0.831073 | 408.241520 | 203.815985 |
82 | |[thulac](https://github.com/thunlp/THULAC-Python) | 0.848839 | 0.883031 | 0.865597 | 28.831738 | 16.565779 |
83 | |[pyltp](https://github.com/HIT-SCIR/pyltp) | 0.894885 | 0.908761 | 0.901770 | --------- | 52.371131 |
84 | |[snownlp](https://github.com/isnowfy/snownlp) | 0.811029 | 0.864835 | 0.837069 | --------- | 1.947430 |
85 |
86 | #### Developers
87 |
88 | - We recommend opening the python directory in PyCharm; it then runs as-is
89 | - To use pynlpir, change pynlpir_path to its installation directory
90 | - To use pyltp, change ltp_data_dir to its segmentation model directory
91 | - To evaluate a custom segmenter, implement a segment method and append the segmenter to evaluators.
92 |
93 | ## Summary
94 | - Performance: Java is far faster than Python, by at least an order of magnitude.
95 | - Accuracy: jieba and thulac score differently in Python and Java; finding the cause needs more time, and note that the Java thulac4j is not provided by the official THULAC team.
96 | - Data: the default dataset comes from [cws_evaluation](https://github.com/ysc/cws_evaluation), a project for benchmarking the speed and accuracy of Chinese word segmenters. For accuracy that project uses the perfect-line rate, which becomes unsuitable for long sentences: a single wrongly segmented word makes the whole line count as wrong, so two algorithms with different word-level error rates can look identical, and the metric cannot separate their precision; see the sketch below.
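
A minimal, self-contained sketch (not part of this repository) makes the contrast concrete. It scores one sentence with a single mis-split word both ways, reusing the offset-matching idea of calcScore in SegEvaluation.java: the perfect-line rate gives 0, while word-level precision/recall still credit the correctly segmented words.

```java
import java.util.Arrays;
import java.util.List;

public class MetricDemo {
    // count predicted words whose start offset, end offset, and content match gold
    static int correct(List<String> gold, List<String> pred) {
        int gi = 0, pi = 0, gOff = 0, pOff = 0, right = 0;
        while (gi < gold.size() && pi < pred.size()) {
            if (gOff == pOff) {                        // both words start at the same offset
                if (gold.get(gi).equals(pred.get(pi))) right++;
                gOff += gold.get(gi++).length();
                pOff += pred.get(pi++).length();
            } else if (gOff < pOff) {
                gOff += gold.get(gi++).length();       // gold side lags, advance it
            } else {
                pOff += pred.get(pi++).length();       // predicted side lags, advance it
            }
        }
        return right;
    }

    public static void main(String[] args) {
        List<String> gold = Arrays.asList("我们", "在", "野生动物园", "玩");
        List<String> pred = Arrays.asList("我们", "在", "野生", "动物园", "玩");
        int right = correct(gold, pred);              // 3 words segmented correctly
        System.out.printf("line-perfect: %b, precision: %.2f, recall: %.2f%n",
                gold.equals(pred),                    // false -> perfect-line rate 0
                right * 1.0 / pred.size(),            // 3/5 = 0.60
                right * 1.0 / gold.size());           // 3/4 = 0.75
    }
}
```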
97 |
--------------------------------------------------------------------------------
/java/build.gradle:
--------------------------------------------------------------------------------
1 | buildscript {
2 | repositories {
3 | jcenter()
4 | }
5 |
6 | dependencies {
7 | classpath "com.github.jengelman.gradle.plugins:shadow:2.0.2"
8 | }
9 | }
10 |
11 | apply plugin: 'idea'
12 | apply plugin: 'java'
13 | apply plugin: "com.github.johnrengelman.shadow"
14 |
15 | group 'indi.nlp'
16 | version '1.0-SNAPSHOT'
17 |
18 | sourceCompatibility = 1.8
19 |
20 | tasks.withType(JavaCompile) {
21 | options.encoding = "UTF-8"
22 | }
23 |
24 | // disable the plain jar task; shadowJar builds the artifact
25 | jar.enabled = false
26 | shadowJar {
27 | baseName = "nlp-evaluation-java"
28 | // classifier is the suffix appended to the generated jar name
29 | classifier = null
30 | version = '1.0.1'
31 | manifest {
32 | attributes 'Main-Class': 'indi.tiandi.nlp.evaluation.SegEvaluation'
33 | }
34 |
35 | from("../") {
36 | include 'data/seg_data_big.txt'
37 | }
38 | }
39 |
40 | repositories {
41 | maven{ url 'http://maven.aliyun.com/nexus/content/groups/public/'}
42 | }
43 |
44 | dependencies {
45 | implementation 'org.ansj:ansj_seg:5.1.6'
46 | implementation 'com.hankcs:hanlp:portable-1.6.8'
47 | implementation 'org.apdplat:word:1.3.1'
48 | implementation 'io.github.yizhiru:thulac4j:3.1.2'
49 | implementation 'com.chenlb.mmseg4j:mmseg4j-core:1.10.0'
50 | implementation 'org.lionsoul:jcseg-core:2.4.0'
51 | implementation 'com.huaban:jieba-analysis:1.0.2'
52 | implementation 'com.mayabot.mynlp:mynlp-segment:3.0.1'
53 | implementation 'org.apache.commons:commons-lang3:3.11'
54 |
55 | testImplementation 'junit:junit:4.2'
56 | }
57 |
58 |
59 | artifacts {
60 | shadowJar;
61 | }
62 |
63 | build.dependsOn(shadowJar);
--------------------------------------------------------------------------------
/java/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/java/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.8-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/java/gradlew:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | ##############################################################################
4 | ##
5 | ## Gradle start up script for UN*X
6 | ##
7 | ##############################################################################
8 |
9 | # Attempt to set APP_HOME
10 | # Resolve links: $0 may be a link
11 | PRG="$0"
12 | # Need this for relative symlinks.
13 | while [ -h "$PRG" ] ; do
14 | ls=`ls -ld "$PRG"`
15 | link=`expr "$ls" : '.*-> \(.*\)$'`
16 | if expr "$link" : '/.*' > /dev/null; then
17 | PRG="$link"
18 | else
19 | PRG=`dirname "$PRG"`"/$link"
20 | fi
21 | done
22 | SAVED="`pwd`"
23 | cd "`dirname \"$PRG\"`/" >/dev/null
24 | APP_HOME="`pwd -P`"
25 | cd "$SAVED" >/dev/null
26 |
27 | APP_NAME="Gradle"
28 | APP_BASE_NAME=`basename "$0"`
29 |
30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
31 | DEFAULT_JVM_OPTS=""
32 |
33 | # Use the maximum available, or set MAX_FD != -1 to use that value.
34 | MAX_FD="maximum"
35 |
36 | warn () {
37 | echo "$*"
38 | }
39 |
40 | die () {
41 | echo
42 | echo "$*"
43 | echo
44 | exit 1
45 | }
46 |
47 | # OS specific support (must be 'true' or 'false').
48 | cygwin=false
49 | msys=false
50 | darwin=false
51 | nonstop=false
52 | case "`uname`" in
53 | CYGWIN* )
54 | cygwin=true
55 | ;;
56 | Darwin* )
57 | darwin=true
58 | ;;
59 | MINGW* )
60 | msys=true
61 | ;;
62 | NONSTOP* )
63 | nonstop=true
64 | ;;
65 | esac
66 |
67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
68 |
69 | # Determine the Java command to use to start the JVM.
70 | if [ -n "$JAVA_HOME" ] ; then
71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
72 | # IBM's JDK on AIX uses strange locations for the executables
73 | JAVACMD="$JAVA_HOME/jre/sh/java"
74 | else
75 | JAVACMD="$JAVA_HOME/bin/java"
76 | fi
77 | if [ ! -x "$JAVACMD" ] ; then
78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
79 |
80 | Please set the JAVA_HOME variable in your environment to match the
81 | location of your Java installation."
82 | fi
83 | else
84 | JAVACMD="java"
85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
86 |
87 | Please set the JAVA_HOME variable in your environment to match the
88 | location of your Java installation."
89 | fi
90 |
91 | # Increase the maximum file descriptors if we can.
92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
93 | MAX_FD_LIMIT=`ulimit -H -n`
94 | if [ $? -eq 0 ] ; then
95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
96 | MAX_FD="$MAX_FD_LIMIT"
97 | fi
98 | ulimit -n $MAX_FD
99 | if [ $? -ne 0 ] ; then
100 | warn "Could not set maximum file descriptor limit: $MAX_FD"
101 | fi
102 | else
103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
104 | fi
105 | fi
106 |
107 | # For Darwin, add options to specify how the application appears in the dock
108 | if $darwin; then
109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
110 | fi
111 |
112 | # For Cygwin, switch paths to Windows format before running java
113 | if $cygwin ; then
114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"`
115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
116 | JAVACMD=`cygpath --unix "$JAVACMD"`
117 |
118 | # We build the pattern for arguments to be converted via cygpath
119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
120 | SEP=""
121 | for dir in $ROOTDIRSRAW ; do
122 | ROOTDIRS="$ROOTDIRS$SEP$dir"
123 | SEP="|"
124 | done
125 | OURCYGPATTERN="(^($ROOTDIRS))"
126 | # Add a user-defined pattern to the cygpath arguments
127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then
128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
129 | fi
130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh
131 | i=0
132 | for arg in "$@" ; do
133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option
135 |
136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
138 | else
139 | eval `echo args$i`="\"$arg\""
140 | fi
141 | i=$((i+1))
142 | done
143 | case $i in
144 | (0) set -- ;;
145 | (1) set -- "$args0" ;;
146 | (2) set -- "$args0" "$args1" ;;
147 | (3) set -- "$args0" "$args1" "$args2" ;;
148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;;
149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
154 | esac
155 | fi
156 |
157 | # Escape application args
158 | save () {
159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
160 | echo " "
161 | }
162 | APP_ARGS=$(save "$@")
163 |
164 | # Collect all arguments for the java command, following the shell quoting and substitution rules
165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
166 |
167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong
168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then
169 | cd "$(dirname "$0")"
170 | fi
171 |
172 | exec "$JAVACMD" "$@"
173 |
--------------------------------------------------------------------------------
/java/gradlew.bat:
--------------------------------------------------------------------------------
1 | @if "%DEBUG%" == "" @echo off
2 | @rem ##########################################################################
3 | @rem
4 | @rem Gradle startup script for Windows
5 | @rem
6 | @rem ##########################################################################
7 |
8 | @rem Set local scope for the variables with windows NT shell
9 | if "%OS%"=="Windows_NT" setlocal
10 |
11 | set DIRNAME=%~dp0
12 | if "%DIRNAME%" == "" set DIRNAME=.
13 | set APP_BASE_NAME=%~n0
14 | set APP_HOME=%DIRNAME%
15 |
16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
17 | set DEFAULT_JVM_OPTS=
18 |
19 | @rem Find java.exe
20 | if defined JAVA_HOME goto findJavaFromJavaHome
21 |
22 | set JAVA_EXE=java.exe
23 | %JAVA_EXE% -version >NUL 2>&1
24 | if "%ERRORLEVEL%" == "0" goto init
25 |
26 | echo.
27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
28 | echo.
29 | echo Please set the JAVA_HOME variable in your environment to match the
30 | echo location of your Java installation.
31 |
32 | goto fail
33 |
34 | :findJavaFromJavaHome
35 | set JAVA_HOME=%JAVA_HOME:"=%
36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe
37 |
38 | if exist "%JAVA_EXE%" goto init
39 |
40 | echo.
41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
42 | echo.
43 | echo Please set the JAVA_HOME variable in your environment to match the
44 | echo location of your Java installation.
45 |
46 | goto fail
47 |
48 | :init
49 | @rem Get command-line arguments, handling Windows variants
50 |
51 | if not "%OS%" == "Windows_NT" goto win9xME_args
52 |
53 | :win9xME_args
54 | @rem Slurp the command line arguments.
55 | set CMD_LINE_ARGS=
56 | set _SKIP=2
57 |
58 | :win9xME_args_slurp
59 | if "x%~1" == "x" goto execute
60 |
61 | set CMD_LINE_ARGS=%*
62 |
63 | :execute
64 | @rem Setup the command line
65 |
66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
67 |
68 | @rem Execute Gradle
69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS%
70 |
71 | :end
72 | @rem End local scope for the variables with windows NT shell
73 | if "%ERRORLEVEL%"=="0" goto mainEnd
74 |
75 | :fail
76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
77 | rem the _cmd.exe /c_ return code!
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
79 | exit /b 1
80 |
81 | :mainEnd
82 | if "%OS%"=="Windows_NT" endlocal
83 |
84 | :omega
85 |
--------------------------------------------------------------------------------
/java/settings.gradle:
--------------------------------------------------------------------------------
1 | rootProject.name = 'java'
2 |
3 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Seg.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | import java.util.List;
4 | /**
5 | * Abstract base class for segmenters
6 | *
7 | * @date 2019/3/4
8 | * @author tiandi
9 | */
10 | public abstract class Seg {
11 | private String name;
12 |
13 | public abstract List<Term> segment(String sentence);
14 |
15 | public String getName() {
16 | return this.name;
17 | }
18 |
19 | public void setName(String name) {
20 | this.name = name;
21 | }
22 |
23 | public Seg(String name) {
24 | this.name = name;
25 | }
26 |
27 | public Seg() {
28 | this.name = this.getClass().getSimpleName();
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Sentence.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * A sentence
7 | *
8 | * @author tiandi
9 | * @date 2019/3/4
10 | */
11 | public class Sentence {
12 | private List<Term> terms;
13 |
14 | public Sentence(List<Term> terms) {
15 | this.terms = terms;
16 | }
17 |
18 | public String getString() {
19 | StringBuilder sb = new StringBuilder();
20 | for (Term term : terms) {
21 | sb.append(term.getWord());
22 | }
23 | return sb.toString();
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/Term.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp;
2 |
3 | /**
4 | * A word and its part of speech
5 | *
6 | * @date 2019/3/4
7 | * @author tiandi
8 | */
9 | public class Term {
10 | /**
11 | * the word
12 | */
13 | private String word;
14 | /**
15 | * the part-of-speech tag
16 | */
17 | private String pos;
18 |
19 | public Term(String word) {
20 | this.word = word;
21 | }
22 |
23 | public Term(String word, String pos) {
24 | this.word = word;
25 | this.pos = pos;
26 | }
27 |
28 | public String getWord() {
29 | return word;
30 | }
31 |
32 | public void setWord(String word) {
33 | this.word = word;
34 | }
35 |
36 | public String getPos() {
37 | return pos;
38 | }
39 |
40 | public void setPos(String pos) {
41 | this.pos = pos;
42 | }
43 |
44 | @Override
45 | public String toString() {
46 | return word;
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/SegEvaluation.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import indi.tiandi.nlp.tool.HttpRequest;
6 | import indi.tiandi.nlp.tool.ZipUtil;
7 | import org.apache.commons.lang3.StringUtils;
8 |
9 | import java.io.*;
10 | import java.net.JarURLConnection;
11 | import java.net.URL;
12 | import java.net.URLDecoder;
13 | import java.util.ArrayList;
14 | import java.util.Arrays;
15 | import java.util.Enumeration;
16 | import java.util.List;
17 | import java.util.jar.JarEntry;
18 | import java.util.jar.JarFile;
19 |
20 | /**
21 | * SegEvaluation
22 | * Segmentation evaluator
23 | *
24 | * @author tiandi
25 | * @date 2019/3/4
26 | */
27 | public class SegEvaluation {
28 | public static final String testSentence = "这是一个测试句子";
29 |
30 | public static final String helpMessage;
31 |
32 | static {
33 | StringBuilder sb = new StringBuilder();
34 | sb.append("thank you for using nlp-evaluation\n");
35 | sb.append("github: https://github.com/tiandiweizun/chinese-segmentation-evaluation\n");
36 | sb.append("\t-i or -input\t\t\tfile to segment; the jar defaults to ./data/seg_data_big.txt, debug mode to chinese-segmentation-evaluation/data/seg_data_big.txt\n");
37 | sb.append("\t-o or -output\t\t\tpath to save the result, default is not saving\n");
38 | sb.append("\t-n or -max_line_number\t\tmaximum number of lines to read, default is to read all\n");
39 | sb.append("\t-c or -contains\t\t\tsegmenters to evaluate, comma separated, default HanLP,jieba,thulac,mynlp\n");
40 | sb.append("\t-h or -help\t\t\tmessage for help\n");
41 | sb.append("\n");
42 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.1.jar -n=10\n");
43 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.1.jar nlp-evaluation/data/seg_data_big.txt -n=10\n");
44 | helpMessage = sb.toString();
45 | }
46 |
47 | public static String getFileNameWithExtension(File file) {
48 | String fileName = file.getName();
49 | int i = fileName.lastIndexOf(".");
50 | if (i <= 0) {
51 | i = fileName.length();
52 | }
53 | return fileName.substring(0, i);
54 | }
55 |
56 | public static void main(String[] args) throws Exception {
57 | Config config = parseParams(args);
58 |
59 | String rFileName = config.rFileName;
60 | String wFilePath = config.wFilePath;
61 | boolean writeResult = false;
62 | int maxLineCount = config.maxLineCount;
63 |
64 | InputStream inputStream = null;
65 | File file = new File(rFileName);
66 | if (!file.exists()) {
67 | URL resource = SegEvaluation.class.getClassLoader().getResource(rFileName);
68 | if (resource != null) {
69 | // load the file from inside the jar
70 | inputStream = SegEvaluation.class.getClassLoader().getResourceAsStream((rFileName));
71 | } else {
72 | File tempZipFile = new File(Config.zipFileName);
73 | if (getFileNameWithExtension(file).equals(getFileNameWithExtension(tempZipFile))) {
74 | boolean download = true;
75 | if (tempZipFile.exists()) {
76 | try {
77 | // unzip
78 | ZipUtil.unZip(tempZipFile, tempZipFile.getParent());
79 | download = false;
80 | } catch (Exception e) {
81 | // delete the corrupted zip file
82 | tempZipFile.delete();
83 | }
84 | }
85 | // download from the internet and unzip
86 | if (download) {
87 | System.out.println(String.format("Downloading from %s; if the download is slow, you can also download the file manually and save it to %s", Config.url, tempZipFile.getAbsolutePath()));
88 | try {
89 | // download
90 | HttpRequest.download(Config.url, Config.zipFileName);
91 | System.out.println("download finished");
92 | // unzip
93 | ZipUtil.unZip(tempZipFile, tempZipFile.getParent());
94 | } catch (IOException e) {
95 | System.out.println(String.format("download or unzip error: %s", e.getMessage()));
96 | System.exit(1);
97 | }
98 | }
99 | } else {
100 | // the user-specified file was not found
101 | System.out.println("file not found locally or inside the jar: " + rFileName);
102 | System.exit(1);
103 | }
104 | }
105 | }
106 | if (file.exists()) {
107 | inputStream = new FileInputStream(rFileName);
108 | System.out.println("reading input file from: " + file.getAbsolutePath());
109 | }
110 | if (wFilePath.length() > 0) {
111 | writeResult = true;
112 | System.out.println("writing segmentation results to: " + new File(wFilePath).getAbsolutePath());
113 | }
114 | calcPFRScore(inputStream, wFilePath, writeResult, maxLineCount, config.segmentorNames);
115 | }
116 |
117 | public static void calcPFRScore(InputStream inputStream, String wFilePath, boolean writeResult, int maxLineCount,
118 | List<String> segmentorNames) {
119 | String line = "";
120 | try {
121 | List<Evaluator> evaluators = new ArrayList<>();
122 | List<String> classesFromPackage = getClassNames("indi.tiandi.nlp.evaluation.impl");
123 | for (String segmentorName : segmentorNames) {
124 | for (String className : classesFromPackage) {
125 | int i = className.lastIndexOf(".") + 1;
126 | String simpleClassName = className.substring(i);
127 | if (!simpleClassName.toLowerCase().startsWith(segmentorName.toLowerCase())) {
128 | continue;
129 | }
130 | Class<?> aClass = Class.forName(className);
131 | if (Seg.class.isAssignableFrom(aClass)) {
132 | evaluators.add(new Evaluator(aClass.asSubclass(Seg.class), segmentorName));
133 | break;
134 | }
135 | }
136 | }
137 |
138 | // evaluators.add(new Evaluator(JiebaAnalysisImpl.class));
139 | // evaluators.add(new Evaluator(ThulacImpl.class));
140 | // too slow to segment
141 | // evaluators.add(new Evaluator(new StanfordCoreNLPImpl()));
142 | // the segmenters below have a bug: the joined output differs from the input sentence
143 | // evaluators.add(new Evaluator(WordImpl.class));
144 | // evaluators.add(new Evaluator(AnsjImpl.class));
145 | // evaluators.add(new Evaluator(JcsegImpl.class));
146 | // evaluators.add(new Evaluator(MMSeg4jImpl.class));
147 |
148 | if (evaluators.size() == 0) {
149 | System.out.println("no segmenter to evaluate");
150 | System.exit(-1);
151 | }
152 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "utf-8"));
153 | List<List<String>> gold = new ArrayList<>();
154 | List<String> test = new ArrayList<>();
155 | int charCount = 0;
156 | boolean calcScore = true;
157 | int lineCount = 0;
158 | // a maxLineCount of -1 means read all lines
159 | while ((line = br.readLine()) != null) {
160 | if (line.trim().length() == 0) {
161 | continue;
162 | }
163 | String[] s = line.split(" ");
164 | gold.add(Arrays.asList(s));
165 | test.add(line.replace(" ", ""));
166 | charCount += test.get(test.size() - 1).length();
167 |
168 | lineCount += 1;
169 | if (maxLineCount > 0 && lineCount >= maxLineCount) {
170 | break;
171 | }
172 | }
173 | System.out.println();
174 | System.out.println(String.format("total lines: %d\ttotal characters: %d", gold.size(), charCount));
175 | for (Evaluator item : evaluators) {
176 |
177 | System.out.println();
178 | System.out.println(item.seg.getName() + " evaluation started");
179 | if (!item.init) {
180 | System.out.println(item.seg.getName() + " initialization failed, skipping");
181 | continue;
182 | }
183 | BufferedWriter bw = null;
184 | if (writeResult) {
185 | wFilePath = wFilePath.replace("\\", "/");
186 | if (!wFilePath.endsWith("/")) {
187 | wFilePath += "/";
188 | }
189 | String wFileName = wFilePath + item.seg.getName();
190 | bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(wFileName), "utf-8"));
191 | }
192 | long start = System.currentTimeMillis();
193 | int right_num = 0;
194 | int gold_num = 0;
195 | int predict_num = 0;
196 | for (int i = 0; i < test.size(); i++) {
197 | line = test.get(i);
198 | List<Term> segment = item.seg.segment(line);
199 | List<String> predict = new ArrayList<>();
200 | for (Term term : segment) {
201 | predict.add(term.getWord());
202 | }
203 | if (calcScore) {
204 | if (!StringUtils.join(predict, "").equals(line)) {
205 | System.out.println(item.seg.getName() + "\t" + line);
206 | continue;
207 | }
208 | int[] result = calcScore(gold.get(i), predict);
209 | right_num += result[0];
210 | gold_num += result[1];
211 | predict_num += result[2];
212 | }
213 | if (writeResult) {
214 | bw.write(StringUtils.join(predict, " ") + "\n");
215 | }
216 | }
217 | if (writeResult) {
218 | bw.close();
219 | }
220 | item.time = System.currentTimeMillis() - start;
221 | double precision = 0;
222 | double recall = 0;
223 | double f = 0;
224 | if (predict_num != 0) {
225 | precision = right_num * 1.0 / predict_num;
226 | }
227 | if (gold_num != 0) {
228 | recall = right_num * 1.0 / gold_num;
229 | }
230 | if (precision + recall > 0) {
231 | f = 2 * precision * recall / (precision + recall);
232 | }
233 | System.out.println(String.format("precision:%f \t recall:%f \t f1:%f", precision, recall, f));
234 | System.out.println(String.format("time: %d ms,\tspeed: %f chars/ms", item.time, charCount * 1.0 / item.time));
235 | }
236 | } catch (Exception e) {
237 | System.out.println(line);
238 | e.printStackTrace();
239 | }
240 | }
241 |
242 | public static int[] calcScore(List<String> gold, List<String> predict) {
243 | int gold_offset = 0;
244 | int predict_offset = 0;
245 |
246 | int gold_term_index = 0;
247 | int predict_term_index = 0;
248 |
249 | int right = 0;
250 | int total = gold.size();
251 | int right_and_wrong = predict.size();
252 | while (gold_term_index < total || predict_term_index < right_and_wrong) { // walk both segmentations by character offset
253 | if (gold_offset == predict_offset) { // both current words start at the same offset
254 | if (gold.get(gold_term_index).equals(predict.get(predict_term_index))) {
255 | right += 1;
256 | }
257 | int[] result = update(gold_offset, gold_term_index, gold);
258 | gold_offset = result[0];
259 | gold_term_index = result[1];
260 | result = update(predict_offset, predict_term_index, predict);
261 | predict_offset = result[0];
262 | predict_term_index = result[1];
263 | } else if (gold_offset < predict_offset) {
264 | int[] result = update(gold_offset, gold_term_index, gold);
265 | gold_offset = result[0];
266 | gold_term_index = result[1];
267 | } else {
268 | int[] result = update(predict_offset, predict_term_index, predict);
269 | predict_offset = result[0];
270 | predict_term_index = result[1];
271 | }
272 | }
273 | int[] result = {right, total, right_and_wrong};
274 | return result;
275 | }
276 |
277 | public static int[] update(int offset, int index, List<String> terms) {
278 | offset += terms.get(index).length();
279 | index += 1;
280 | int[] result = {offset, index};
281 | return result;
282 | }
283 |
284 | public static Config parseParams(String[] args) {
285 | // message for help
286 | try {
287 |
288 | Config config = new Config();
289 | if (args.length == 1) {
290 | if (args[0].equalsIgnoreCase("-h") || args[0].equalsIgnoreCase("-help")) {
291 | System.out.println(helpMessage);
292 | System.exit(0);
293 | }
294 | }
295 | boolean containsOption = false;
296 | for (int i = 0; i < args.length; i++) {
297 | String arg = args[i];
298 | if (arg.startsWith("-") && arg.contains("=")) {
299 | containsOption = true;
300 | String[] split = arg.split("=");
301 | String paramName = split[0].trim().substring(1).toLowerCase();
302 | String paramValue = split[1].trim();
303 | switch (paramName) {
304 | case "i":
305 | case "input":
306 | config.rFileName = paramValue;
307 | break;
308 | case "o":
309 | case "output":
310 | config.wFilePath = paramValue;
311 | break;
312 | case "n":
313 | case "max_line_number":
314 | config.maxLineCount = Integer.parseInt(paramValue);
315 | break;
316 | case "c":
317 | case "contains":
318 | String[] segmentorNames = paramValue.split(",");
319 | config.segmentorNames = Arrays.asList(segmentorNames);
320 | break;
321 | }
322 | } else if (containsOption) {
323 | System.out.println("positional argument follows keyword argument");
324 | } else {
325 | if (i == 0) {
326 | config.rFileName = args[0].trim();
327 | } else if (i == 1) {
328 | config.wFilePath = args[1].trim();
329 | } else if (i == 2) {
330 | config.maxLineCount = Integer.parseInt(args[2].trim());
331 | }
332 | }
333 | }
334 | return config;
335 | } catch (Exception e) {
336 | System.out.println("parameter error: " + e.getMessage());
337 | System.out.println(helpMessage);
338 | System.exit(0);
339 | }
340 | return null;
341 | }
342 |
343 | /**
344 | * Collects the names of all classes under the given package
345 | *
346 | * @param packageName
347 | * @return
348 | */
349 |
350 | public static List<String> getClassNames(String packageName) {
351 | // collected class names
352 | List<String> classes = new ArrayList<>();
353 | // whether to recurse into subpackages
354 | boolean recursive = true;
355 | // turn the package name into a directory path
356 | String packageDirName = packageName.replace('.', '/');
357 | // enumerate every classpath resource under this directory
358 | Enumeration<URL> dirs;
359 | try {
360 | dirs = Thread.currentThread().getContextClassLoader().getResources(packageDirName);
361 | // iterate over the matching resources
362 | while (dirs.hasMoreElements()) {
363 | // take the next element
364 | URL url = dirs.nextElement();
365 | // protocol of the resource
366 | String protocol = url.getProtocol();
367 | // resource stored as plain files on disk
368 | if ("file".equals(protocol)) {
369 | // physical path of the package
370 | String filePath = URLDecoder.decode(url.getFile(), "UTF-8");
371 | // scan the package directory on disk and collect the classes found
372 | findAndAddClassesInPackageByFile(packageName, filePath, recursive, classes);
373 | } else if ("jar".equals(protocol)) {
374 | // resource packed inside a jar
375 | // define a JarFile
376 | JarFile jar;
377 | try {
378 | // open the jar
379 | jar = ((JarURLConnection) url.openConnection()).getJarFile();
380 | // enumerate the jar's entries
381 | Enumeration<JarEntry> entries = jar.entries();
382 | // iterate over the entries
383 | while (entries.hasMoreElements()) {
384 | // an entry may be a directory or any other file in the jar, such as META-INF
385 | JarEntry entry = entries.nextElement();
386 | String name = entry.getName();
387 | // strip a leading '/'
388 | if (name.charAt(0) == '/') {
389 | // keep the rest of the string
390 | name = name.substring(1);
391 | }
392 | // only consider entries under the requested package
393 | if (name.startsWith(packageDirName)) {
394 | int idx = name.lastIndexOf('/');
395 | // entries containing '/' sit inside a package
396 | if (idx != -1) {
397 | // derive the package name, replacing '/' with '.'
398 | packageName = name.substring(0, idx).replace('/', '.');
399 | }
400 | // descend if it is a package, or if recursion is enabled
401 | if ((idx != -1) || recursive) {
402 | // a .class file that is not a directory
403 | if (name.endsWith(".class") && !entry.isDirectory()) {
404 | // drop the trailing ".class" to get the class name
405 | String className = name.substring(packageName.length() + 1, name.length() - 6);
406 | // record it
407 | classes.add(packageName + '.' + className);
408 | }
409 | }
410 | }
411 | }
412 | } catch (IOException e) {
413 | e.printStackTrace();
414 | }
415 | }
416 | }
417 | } catch (IOException e) {
418 | e.printStackTrace();
419 | }
420 | return classes;
421 | }
422 |
423 | /**
424 | * Collects all classes under a package by scanning the file system
425 | *
426 | * @param packageName
427 | * @param packagePath
428 | * @param recursive
429 | * @param classes
430 | */
431 |
432 | public static void findAndAddClassesInPackageByFile(String packageName, String packagePath, final boolean recursive,
433 | List<String> classes) {
434 | // locate the directory of this package
435 | File dir = new File(packagePath);
436 | // return directly if it does not exist or is not a directory
437 | if (!dir.exists() || !dir.isDirectory()) {
438 | return;
439 | }
440 | // otherwise list everything under the package, directories included
441 | File[] dirfiles = dir.listFiles(new FileFilter() {
442 | // filter: accept subdirectories (for recursion) and compiled .class files
443 | public boolean accept(File file) {
444 | return (recursive && file.isDirectory()) || (file.getName().endsWith(".class"));
445 | }
446 | });
447 | // walk all entries
448 | for (File file : dirfiles) {
449 | // recurse into subdirectories
450 | if (file.isDirectory()) {
451 | findAndAddClassesInPackageByFile(packageName + "." + file.getName(), file.getAbsolutePath(), recursive, classes);
452 | } else {
453 | // a class file: drop the trailing .class to get the class name
454 | String className = file.getName().substring(0, file.getName().length() - 6);
455 | classes.add(packageName + '.' + className);
456 | }
457 | }
458 | }
459 | }
460 |
461 | class Evaluator {
462 |
463 | public Seg seg;
464 | public long time = 0;
465 | public boolean init = false;
466 |
467 | public Evaluator(Class<? extends Seg> segClass) {
468 | this(segClass, segClass.getSimpleName());
469 | }
470 |
471 | public Evaluator(Class<? extends Seg> segClass, String name) {
472 | try {
473 | long start = System.currentTimeMillis();
474 | System.out.println(name + " initialization started");
475 | this.seg = segClass.newInstance();
476 | this.seg.setName(name);
477 | List<Term> terms = this.seg.segment(SegEvaluation.testSentence);
478 | StringBuilder sb = new StringBuilder();
479 | for (Term term : terms) {
480 | sb.append(term.getWord());
481 | }
482 | long end = System.currentTimeMillis();
483 | long cost = end - start;
484 | if (!sb.toString().equals(SegEvaluation.testSentence)) {
485 | System.out.println(name + " initialization error, sentence: " + SegEvaluation.testSentence + ", segmented as: " + terms);
486 | } else {
487 | this.init = true;
488 | System.out.println(name + " initialization finished, took " + cost + " ms");
489 | }
490 | } catch (InstantiationException e) {
491 | e.printStackTrace();
492 | } catch (IllegalAccessException e) {
493 | e.printStackTrace();
494 | }
495 | }
496 | }
497 |
498 | class Config {
499 | public static final String zipFileName = "data/seg_data_big.zip";
500 | public static final String url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip";
501 | public String rFileName = "data/seg_data_big.txt";
502 | public String wFilePath = "";
503 | public boolean writeResult = false;
504 | public int maxLineCount = -1;
505 | public List<String> segmentorNames = new ArrayList<>(Arrays.asList("HanLP", "Jieba", "Thulac", "mynlp"));
506 | }
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/AnsjImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.ansj.domain.Result;
6 | import org.ansj.library.AmbiguityLibrary;
7 | import org.ansj.library.DicLibrary;
8 | import org.ansj.splitWord.analysis.ToAnalysis;
9 | import org.ansj.util.MyStaticValue;
10 |
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | /**
14 | * Ansj segmenter
15 | *
16 | * @date 2019/3/4
17 | * @author tiandi
18 | */
19 | public class AnsjImpl extends Seg {
20 | static {
21 | // enabling these custom dictionaries slows segmentation down by about 25%
22 | // MyStaticValue.ENV.put(DicLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/default.dic").getPath());
23 | // MyStaticValue.ENV.put(AmbiguityLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/ambiguity.dic").getPath());
24 | }
25 |
26 | @Override
27 | public List<Term> segment(String sentence) {
28 | Result result = ToAnalysis.parse(sentence);
29 | List<Term> terms = new ArrayList<>();
30 | for (org.ansj.domain.Term term : result) {
31 | terms.add(new Term(term.getName()));
32 | }
33 | return terms;
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/FNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import indi.nlp.Seg;
4 |
5 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/HanLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.hankcs.hanlp.HanLP;
4 | import indi.tiandi.nlp.Seg;
5 | import indi.tiandi.nlp.Term;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | /**
10 | * HanLP segmenter
11 | *
12 | * @date 2019/3/4
13 | * @author tiandi
14 | */
15 | public class HanLPImpl extends Seg {
16 |
17 | @Override
18 | public List<Term> segment(String sentence) {
19 | // List<com.hankcs.hanlp.seg.common.Term> segment = BasicTokenizer.segment(sentence);
20 | List<com.hankcs.hanlp.seg.common.Term> segment = HanLP.segment(sentence);
21 | List<Term> terms = new ArrayList<>();
22 | for (com.hankcs.hanlp.seg.common.Term term : segment) {
23 | terms.add(new Term(term.word));
24 | }
25 | return terms;
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/JcsegImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.lionsoul.jcseg.tokenizer.core.*;
6 |
7 | import java.io.IOException;
8 | import java.io.StringReader;
9 | import java.util.ArrayList;
10 | import java.util.List;
11 | /**
12 | * Jcseg segmenter
13 | *
14 | * @date 2019/3/4
15 | * @author tiandi
16 | */
17 | public class JcsegImpl extends Seg {
18 |
19 | private final static JcsegTaskConfig config = new JcsegTaskConfig(true);
20 | private final static ADictionary dic = DictionaryFactory.createSingletonDictionary(config);
21 | private static ISegment seg;
22 |
23 | static {
24 | try {
25 | seg = SegmentFactory.createJcseg(JcsegTaskConfig.COMPLEX_MODE, new Object[]{config, dic});
26 | }
27 | catch (JcsegException e) {
28 | e.printStackTrace();
29 | }
30 | }
31 |
32 | @Override
33 | public List<Term> segment(String sentence) {
34 | List<Term> terms = new ArrayList<>();
35 | try {
36 | seg.reset(new StringReader(sentence));
37 | IWord word = null;
38 | while ((word = seg.next()) != null) {
39 | terms.add(new Term(word.getValue()));
40 | }
41 | }
42 | catch (IOException e) {
43 | e.printStackTrace();
44 | }
45 | return terms;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/JiebaAnalysisImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.huaban.analysis.jieba.JiebaSegmenter;
4 | import indi.tiandi.nlp.Seg;
5 | import indi.tiandi.nlp.Term;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 |
10 | /**
11 | * Jieba segmenter
12 | *
13 | * @author tiandi
14 | * @date 2019/3/7
15 | */
16 | public class JiebaAnalysisImpl extends Seg {
17 | private static final JiebaSegmenter seg = new JiebaSegmenter();
18 |
19 | @Override
20 | public List<Term> segment(String sentence) {
21 | List<String> strings = seg.sentenceProcess(sentence);
22 | List<Term> terms = new ArrayList<>();
23 | for (String string : strings) {
24 | terms.add(new Term(string));
25 | }
26 | return terms;
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/MMSeg4jImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.chenlb.mmseg4j.ComplexSeg;
4 | import com.chenlb.mmseg4j.Dictionary;
5 | import com.chenlb.mmseg4j.MMSeg;
6 | import com.chenlb.mmseg4j.Word;
7 | import indi.tiandi.nlp.Seg;
8 | import indi.tiandi.nlp.Term;
9 |
10 | import java.io.IOException;
11 | import java.io.StringReader;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | /**
15 | * MMSeg segmenter
16 | *
17 | * @date 2019/3/4
18 | * @author tiandi
19 | */
20 | public class MMSeg4jImpl extends Seg {
21 | private static final Dictionary dic = Dictionary.getInstance();
22 | private static final ComplexSeg seg = new ComplexSeg(dic);
23 | private static final MMSeg mmSeg = new MMSeg(new StringReader(""), seg);
24 |
25 | @Override
26 | public List<Term> segment(String sentence) {
27 | mmSeg.reset(new StringReader(sentence));
28 | Word word = null;
29 | List<Term> terms = new ArrayList<>();
30 | try {
31 | while ((word = mmSeg.next()) != null) {
32 | if (word != null) {
33 | terms.add(new Term(word.getString()));
34 | }
35 | }
36 | }
37 | catch (IOException e) {
38 | System.out.println(sentence);
39 | e.printStackTrace();
40 | }
41 | return terms;
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/MYNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import com.mayabot.nlp.segment.Lexer;
4 | import com.mayabot.nlp.segment.Lexers;
5 | import com.mayabot.nlp.segment.Sentence;
6 | import com.mayabot.nlp.segment.WordTerm;
7 | import indi.tiandi.nlp.Seg;
8 | import indi.tiandi.nlp.Term;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | /**
13 | * MYNLP segmentation
14 | *
15 | * @date 2019/3/4
16 | * @author tiandi
17 | */
18 | public class MYNLPImpl extends Seg {
19 | Lexer lexer = Lexers.builder().basic().core().keepOriCharOutput().build();
20 |
21 | @Override
22 | public List<Term> segment(String sentence) {
23 | Sentence result = lexer.scan(sentence);
24 | List<Term> terms = new ArrayList<>();
25 | for (WordTerm term : result.toList()) {
26 | terms.add(new Term(term.getWord()));
27 | }
28 | return terms;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/PaodingImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import indi.nlp.Seg;
4 | //import indi.nlp.Term;
5 | //import org.apache.lucene.analysis.TokenStream;
6 | //
7 | //import java.io.IOException;
8 | //import java.util.ArrayList;
9 | //import java.util.List;
10 | //
11 | ///**
12 | // * Paoding segmentation
13 | // *
14 | // * @author tiandi
15 | // * @date 2019/3/4
16 | // */
17 | //public class PaodingImpl implements Seg {
18 | // public static PaodingAnalyzer paodingAnalyzer = new PaodingAnalyzer();
19 | //
20 | // @Override
21 | // public List<Term> segment(String sentence) {
22 | // List<Term> terms = new ArrayList<>();
23 | // try {
24 | // TokenStream tokenStream = paodingAnalyzer.tokenStream("", sentence);
25 | // System.out.println(tokenStream.toString());
26 | // } catch (IOException e) {
27 | // e.printStackTrace();
28 | // }
29 | // return terms;
30 | // }
31 | //
32 | // public static void main(String[] args) {
33 | // new PaodingImpl().segment("我是中国人");
34 | // }
35 | //}
36 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/StanfordCoreNLPImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | //import edu.stanford.nlp.ling.CoreLabel;
4 | //import edu.stanford.nlp.pipeline.CoreDocument;
5 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP;
6 | //import indi.tiandi.nlp.Seg;
7 | //import indi.tiandi.nlp.Term;
8 | //
9 | //import java.util.ArrayList;
10 | //import java.util.List;
11 | //import java.util.Properties;
12 | //
13 | ///**
14 | // * Stanford CoreNLP segmentation
15 | // *
16 | // * @date 2019/3/4
17 | // * @author tiandi
18 | // */
19 | //public class StanfordCoreNLPImpl extends Seg {
20 | // public static StanfordCoreNLP stanfordCoreNLP;
21 | //
22 | // static {
23 | // try {
24 | // Properties props = new Properties();
25 | // props.load(StanfordCoreNLPImpl.class.getClassLoader().getResourceAsStream("StanfordCoreNLP-chinese.properties"));
26 | // props.setProperty("annotators", "tokenize,ssplit,pos");
27 | // stanfordCoreNLP = new StanfordCoreNLP(props);
28 | // } catch (Exception e) {
29 | // e.printStackTrace();
30 | // }
31 | // }
32 | //
33 | // @Override
34 | // public List<Term> segment(String sentence) {
35 | // CoreDocument exampleDocument = new CoreDocument(sentence);
36 | // // annotate document
37 | // stanfordCoreNLP.annotate(exampleDocument);
38 | // // access tokens from a CoreDocument
39 | // // a token is represented by a CoreLabel
40 | // List<CoreLabel> firstSentenceTokens = exampleDocument.sentences().get(0).tokens();
41 | // // collect each token's word and POS tag into Terms
42 | // List<Term> terms = new ArrayList<>();
43 | // for (CoreLabel token : firstSentenceTokens) {
44 | // terms.add(new Term(token.word(), token.tag()));
45 | // }
46 | // return terms;
47 | // }
48 | //}
49 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/ThulacImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import io.github.yizhiru.thulac4j.Segmenter;
6 |
7 | import java.util.ArrayList;
8 | import java.util.List;
9 | /**
10 | * THULAC segmentation (Tsinghua)
11 | *
12 | * @date 2019/3/4
13 | * @author tiandi
14 | */
15 | public class ThulacImpl extends Seg {
16 | @Override
17 | public List<Term> segment(String sentence) {
18 | List<String> segment = Segmenter.segment(sentence);
19 | List<Term> terms = new ArrayList<>();
20 | for (String s : segment) {
21 | terms.add(new Term(s));
22 | }
23 | return terms;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/evaluation/impl/WordImpl.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.evaluation.impl;
2 |
3 | import indi.tiandi.nlp.Seg;
4 | import indi.tiandi.nlp.Term;
5 | import org.apdplat.word.segmentation.Segmentation;
6 | import org.apdplat.word.segmentation.SegmentationAlgorithm;
7 | import org.apdplat.word.segmentation.SegmentationFactory;
8 | import org.apdplat.word.segmentation.Word;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 | /**
13 | * word segmentation (apdplat word)
14 | *
15 | * @date 2019/3/4
16 | * @author tiandi
17 | */
18 | public class WordImpl extends Seg {
19 | public static final Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore);
20 |
21 | @Override
22 | public List<Term> segment(String sentence) {
23 | List<Word> words = segmentation.seg(sentence);
24 | List<Term> terms = new ArrayList<>();
25 | for (Word word : words) {
26 | terms.add(new Term(word.getText()));
27 | }
28 |
29 | return terms;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/tool/HttpRequest.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.tool;
2 |
3 | import java.io.File;
4 | import java.io.FileOutputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.net.URL;
8 | import java.net.URLConnection;
9 |
10 | /**
11 | * HTTP file download
12 | *
13 | * @author tiandi
14 | * @date 2021/2/18
15 | */
16 | public class HttpRequest {
17 | /**
18 | * Download a file from a network URL
19 | *
20 | * @param httpUrl the URL to download from
21 | * @param saveFile path and file name to save to
22 | * @throws IOException
23 | */
24 | public static boolean download(String httpUrl, String saveFile) throws IOException {
25 | // download the remote file
26 | int byteread;
27 | URL url = new URL(httpUrl);
28 | URLConnection conn = url.openConnection();
29 | File file = new File(saveFile);
30 | if (!file.getParentFile().exists()) {
31 | file.getParentFile().mkdirs();
32 | }
33 | try (InputStream inStream = conn.getInputStream();
34 | FileOutputStream fs = new FileOutputStream(saveFile)) {
35 | // copy in 1 KB chunks until EOF
36 | byte[] buffer = new byte[1024];
37 | while ((byteread = inStream.read(buffer)) != -1) {
38 | fs.write(buffer, 0, byteread);
39 | }
40 | return true;
41 | }
42 | }
43 | }
--------------------------------------------------------------------------------
/java/src/main/java/indi/tiandi/nlp/tool/ZipUtil.java:
--------------------------------------------------------------------------------
1 | package indi.tiandi.nlp.tool;
2 |
3 | import java.io.*;
4 | import java.nio.charset.Charset;
5 | import java.nio.file.Paths;
6 | import java.util.Enumeration;
7 | import java.util.zip.ZipEntry;
8 | import java.util.zip.ZipFile;
9 |
10 | /**
11 | * Zip compression and extraction (https://www.bbsmax.com/A/x9J2bZLMJ6/)
12 | * For more archive formats, see https://www.bookstack.cn/read/hutool/a56da94bbb16617b.md
13 | *
14 | * @author tiandi
15 | * @date 2021/2/1
16 | */
17 | public class ZipUtil {
18 | public static boolean unZip(File zipFile, String descDir) throws IOException {
19 | // specify the encoding, otherwise archives with Chinese entry names cannot be read
20 | // try-with-resources closes the ZipFile; if it stays open the zip file cannot be deleted later
21 | try (ZipFile zip = new ZipFile(zipFile, Charset.forName("gbk"))) {
22 | for (Enumeration<? extends ZipEntry> entries = zip.entries(); entries.hasMoreElements(); ) {
23 | ZipEntry entry = entries.nextElement();
24 | String zipEntryName = entry.getName();
25 | File file = Paths.get(descDir, zipEntryName).toFile();
26 | File dir = file;
27 | if (!zipEntryName.endsWith("/") && !zipEntryName.endsWith("\\")) {
28 | // for regular files, create the parent directory
29 | dir = file.getParentFile();
30 | }
31 | if (!dir.exists()) {
32 | dir.mkdirs();
33 | }
34 | if (file.isDirectory()) {
35 | continue;
36 | }
37 | try (InputStream in = zip.getInputStream(entry);
38 | OutputStream out = new FileOutputStream(file)) {
39 | byte[] buf = new byte[2048];
40 | int len;
41 | while ((len = in.read(buf)) > 0) {
42 | out.write(buf, 0, len);
43 | }
44 | }
45 | }
46 | }
47 | return true;
48 | }
49 | }
50 |
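Together, HttpRequest and ZipUtil implement the fetch-and-extract step for the evaluation corpus. A hedged usage sketch (the URL and paths are placeholders, not the project's real ones):

    // hypothetical caller; both helpers throw IOException on failure
    File zip = new File("data/seg_data_big.zip");
    if (HttpRequest.download("https://example.com/seg_data_big.zip", zip.getPath())) {
        ZipUtil.unZip(zip, "data");
    }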
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/ambiguity.dic:
--------------------------------------------------------------------------------
1 | 习近平 nr
2 | 李民 nr 工作 vn
3 | 三个 m 和尚 n
4 | 的确 d 定 v 不 v
5 | 大 a 和尚 n
6 | 张三 nr 和 c
7 | 动漫 n 游戏 n
8 | 邓颖超 nr 生前 t
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/regex.dic:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/src/main/resources/ansj/library/regex.dic
--------------------------------------------------------------------------------
/java/src/main/resources/ansj/library/stop.dic:
--------------------------------------------------------------------------------
1 | ?
2 | :
3 | .
4 | ,
5 | is
6 | a
7 | #
8 | v nature
9 | .*了 regex
--------------------------------------------------------------------------------
/java/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
4 | <encoder>
5 | <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
6 | </encoder>
7 | </appender>
8 | <root level="INFO">
9 | <appender-ref ref="STDOUT"/>
10 | </root>
11 | </configuration>
--------------------------------------------------------------------------------
/python/indi.tiandi.nlp.evaluation/SegEvaluation.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import time
5 | import zipfile
6 |
7 | import jieba
8 | import jieba_fast
9 | import pkuseg
10 | import pynlpir
11 | import requests
12 | import thulac
13 | import wget
14 | from snownlp import SnowNLP
15 |
16 |
17 | class Seg:
18 | def __init__(self):
19 | pass
20 |
21 | def segment(self, sentence):
22 | pass
23 |
24 |
25 | # CAS NLPIR segmentation; the Python wrapper is buggy and its output differs from the demo page
26 | class pynlpir_impl:
27 | def __init__(self):
28 | # automatically refresh the license from code
29 | pynlpir_impl.update_pynlpir_license()
30 | pynlpir.open()
31 |
32 | @staticmethod
33 | def update_pynlpir_license():
34 | # download location for the pynlpir segmentation license
35 | # https://github.com/NLPIR-team/NLPIR/tree/master/License
36 | url = 'https://raw.githubusercontent.com/NLPIR-team/NLPIR/master/License/license%20for%20a%20month/NLPIR-ICTCLAS%E5%88%86%E8%AF%8D%E7%B3%BB%E7%BB%9F%E6%8E%88%E6%9D%83/NLPIR.user'
37 | r = requests.get(url)
38 | # copy it into the corresponding pynlpir directory
39 | with open(os.path.join(pynlpir.__path__[0], "Data", "NLPIR.user"), "wb") as code:
40 | code.write(r.content)
41 |
42 | def segment(self, sentence):
43 | return pynlpir.segment(sentence, pos_tagging=False)
44 |
45 |
46 | # PKU segmentation (pkuseg)
47 | class pkuseg_impl(Seg):
48 | def __init__(self):
49 | self.pku_seg = pkuseg.pkuseg()
50 |
51 | def segment(self, sentence):
52 | return self.pku_seg.cut(sentence)
53 |
54 |
55 | # Jieba segmentation
56 | class jieba_impl(Seg):
57 | def segment(self, sentence):
58 | return jieba.lcut(sentence)
59 |
60 |
61 | # jieba_fast segmentation
62 | class jieba_fast_impl(Seg):
63 | def segment(self, sentence):
64 | return jieba_fast.lcut(sentence)
65 |
66 |
67 | # SnowNLP segmentation
68 | class snownlp_impl(Seg):
69 | def segment(self, sentence):
70 | return SnowNLP(sentence).words
71 |
72 |
73 | # THULAC segmentation (Tsinghua)
74 | class thulac_impl(Seg):
75 | def __init__(self):
76 | self.thu1 = thulac.thulac(seg_only=True)  # default mode, segmentation only
77 |
78 | def segment(self, sentence):
79 | return self.thu1.cut(sentence, text=True).split()  # segment a single sentence
80 |
81 |
82 | # HIT LTP segmentation
83 | class pyltp_impl(Seg):
84 |
85 | def __init__(self):
86 | ltp_data_dir = '/home/work/tiandi/ltp.model/ltp_data'  # path of the LTP model directory; the cws model file is large and must be downloaded manually from http://model.scir.yunfutech.com/model/ltp_data_v3.4.0.zip
87 | cws_model_path = os.path.join(ltp_data_dir, 'cws.model')  # path of the segmentation model, file name `cws.model`
88 | from pyltp import Segmentor
89 | self.segmentor = Segmentor()
90 | self.segmentor.load(cws_model_path)
91 |
92 | def segment(self, sentence):
93 | return self.segmentor.segment(sentence)
94 |
95 |
96 | test_sentence = "这是一个测试句子"
97 |
98 |
99 | # segmenter evaluator
100 | class Evaluator:
101 | def __init__(self, seg_tool, name=None):
102 | '''
103 | :param seg_tool: segmenter class
104 | :param name: segmenter name (could be refactored into the segmenter itself)
105 | '''
106 | if not name:
107 | name = seg_tool.__name__
108 | self.name = name
109 | time_start = time.time()
110 | print("%s 初始化开始" % self.name)
111 | self.seg = seg_tool()
112 | self.time = 0
113 | self.init = False
114 | result = self.seg.segment(test_sentence)
115 | # print(result)
116 | time_end = time.time()
117 | time_cost = (time_end - time_start) * 1000
118 | if "".join(result) != test_sentence:
119 | print("%s 初始化错误,句子:%s,分词结果:%s" % (self.name, test_sentence, result))
120 | else:
121 | self.init = True
122 | print("%s 初始化结束,耗时:%d ms" % (self.name, time_cost))
123 |
124 |
125 | # tag segmented words with the BMES scheme
126 | def get_ner(terms):
127 | temp_gold = []
128 | for term in terms:
129 | if len(term) == 1:
130 | temp_gold.append("S-Null")
131 | else:
132 | term_item = ["M-Null"] * len(term)
133 | term_item[0] = "B-Null"
134 | term_item[-1] = "E-Null"
135 | temp_gold.extend(term_item)
136 | return temp_gold
137 |
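# For illustration, on a hypothetical three-word segmentation the tagging above gives:
#   get_ner(["他", "来到", "北京"])
#   -> ["S-Null", "B-Null", "E-Null", "B-Null", "E-Null"]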
138 |
139 | # evaluate each segmenter
140 | def evaluate(input, output, max_line_count, include):
141 | # segmentation corpus location
142 | if len(input) == 0:
143 | # project root directory
144 | root = os.getcwd()[:os.getcwd().rindex("python")]
145 | data_dir = os.path.join(root, "data")
146 | input = os.path.join(data_dir, "seg_data_big.txt")
147 | download_seg_file = True
148 | if not os.path.exists(input):
149 | temp_zip_file = os.path.join(data_dir, "seg_data_big.zip")
150 | if os.path.exists(temp_zip_file):
151 | try:
152 | zipfile.ZipFile(temp_zip_file).extractall(data_dir)
153 | download_seg_file = False
154 | except Exception as e:
155 | os.remove(temp_zip_file)
156 | if download_seg_file:
157 | url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip"
158 | print("从 %s 下载文件,如果下载较慢,亦可手动下载,保存到 %s 即可" % (url, temp_zip_file))
159 | try:
160 | if not os.path.exists(os.path.dirname(temp_zip_file)):
161 | os.makedirs(os.path.dirname(temp_zip_file))
162 | wget.download(url, out=temp_zip_file)
163 | print("下载完成")
164 | zipfile.ZipFile(temp_zip_file).extractall(data_dir)
165 | except Exception as e:
166 | print("下载或解压错误:%s" % e)
167 | sys.exit(1)
168 |
169 | if not os.path.exists(input):
170 | print("未从本地到文件:%s" % input)
171 | sys.exit(1)
172 | print("读入分词文件地址:" + input)
173 | write_result = False
174 | if len(output) > 0:
175 | print("分词结果写入地址:" + output)
176 | write_result = True
177 | max_line_count = int(max_line_count)
178 | if max_line_count > 0:
179 | print("最大读取行数:" + str(max_line_count))
180 |
181 | evaluators = []
182 | for name in include.split(","):
183 | evaluators.append(Evaluator(globals()[name + "_impl"], name))
184 | # evaluators.append(Evaluator(pynlpir_impl))
185 | # evaluators.append(Evaluator(pkuseg_impl))
186 | # evaluators.append(Evaluator(jieba_impl))
187 | # evaluators.append(Evaluator(snownlp_impl))
188 | # evaluators.append(Evaluator(thulac_impl))
189 | # evaluators.append(Evaluator(pyltp_impl))
190 | time_start = time.time()
191 | print("读入分词文件开始")
192 | with open(input, encoding="utf-8") as f:
193 | lines = f.readlines()
194 | time_end = time.time()
195 | time_cost = (time_end - time_start) * 1000
196 | print("读取文件结束,耗时:%d ms" % (time_cost))
197 |
198 | gold = []
199 | test = []
200 | char_count = 0
201 | # max_line_count = 100000
202 | if max_line_count <= 0:
203 | max_line_count = len(lines)
204 | line_count = 0
205 | for line in lines:
206 | gold.append(line.strip().split())
207 | test.append("".join(gold[-1]))
208 | char_count += len(test[-1])
209 | line_count += 1
210 | if line_count >= max_line_count:
211 | break
212 | print("总行数:%d\t总字符数:%d" % (line_count, char_count))
213 | calcScore = True
214 | for item in evaluators:
215 | print()
216 | print("%s 评测开始" % item.name)
217 | if not item.init:
218 | print("%s 初始化错误,跳过" % (item.name))
219 | continue
220 | if write_result:
221 | file = open(os.path.join(output, item.name), mode="w", encoding="utf-8")
222 | time_start = time.time()
223 | right_num = 0
224 | gold_num = 0
225 | predict_num = 0
226 |
227 | for i in range(line_count):
228 | line = test[i]
229 | predict = item.seg.segment(line)
230 |
231 | if calcScore:
232 | # temp_gold = get_ner(gold[i])
233 | # temp_predict = get_ner(predict)
234 | if len("".join(predict)) != len(line):
235 | print(item.name + "\t" + line + "\t")
236 | continue
237 | # accuracy, precision, recall, f_measure, right_num, golden_num, predict_num = get_ner_fmeasure([temp_gold], [temp_predict])
238 | right_num_local, golden_num_local, predict_num_local = calc_score(gold[i], predict)
239 |
240 | right_num += right_num_local
241 | gold_num += golden_num_local
242 | predict_num += predict_num_local
243 |
244 | # if (right_num != right_num_local or golden_num != golden_num_local or predict_num != predict_num_local):
245 | # print("badcase:" % line)
246 |
247 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num)
248 | # print()
249 | if write_result:
250 | file.write(" ".join(predict) + "\n")
251 |
252 | time_end = time.time()
253 | item.time = (time_end - time_start) * 1000
254 | precision = 0
255 | recall = 0
256 | f = 0
257 | if write_result:
258 | file.close()
259 | if predict_num != 0:
260 | precision = right_num * 1.0 / predict_num
261 | if gold_num != 0:
262 | recall = right_num * 1.0 / gold_num
263 | if precision + recall > 0:
264 | f = 2 * precision * recall / (precision + recall)
265 |
266 | print("precision:%f \t recall:%f \t f1:%f" % (precision, recall, f))
267 | if item.time == 0:
268 | print("耗时太少,速度无法评估")
269 | else:
270 | print("耗时:%d ms,\t速度:%f 字符/毫秒" % (item.time, char_count * 1.0 / item.time))
271 |
272 |
273 | def update(offset, index, terms):
274 | offset += len(terms[index])
275 | index += 1
276 | return offset, index
277 |
278 |
279 | def calc_score(gold, predict):
280 | gold_offset = 0
281 | predict_offset = 0
282 |
283 | gold_term_index = 0
284 | predict_term_index = 0
285 |
286 | right = 0
287 | total = len(gold)
288 | right_and_wrong = len(predict)
289 | while (gold_term_index < len(gold) or predict_term_index < len(predict)):
290 | if gold_offset == predict_offset:
291 | if gold[gold_term_index] == predict[predict_term_index]:
292 | right += 1
293 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold)
294 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict)
295 | elif gold_offset < predict_offset:
296 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold)
297 | else:
298 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict)
299 | return right, total, right_and_wrong
300 |
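# Worked example of calc_score (hypothetical input):
#   gold    = ["他", "来到", "北京"]      # word starts at offsets 0, 1, 3
#   predict = ["他", "来", "到", "北京"]  # word starts at offsets 0, 1, 2, 3
# only "他" and "北京" match on both boundaries, so
#   calc_score(gold, predict) == (2, 3, 4)  # right, gold total, predict total
# giving precision 2/4 and recall 2/3.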
301 |
302 | if __name__ == '__main__':
303 | parser = argparse.ArgumentParser(description='Chinese word segmentation comparison')
304 | parser.add_argument('-i',
305 | help='file to segment, default using the file in chinese-segmentation-evaluation/data/seg_data_big.txt',
306 | default="")
307 | parser.add_argument('-o', help='path to save the result, default is not saving', default="")
308 | parser.add_argument("-n", help='maximum number of read rows, default reading all', default="-1")
309 | parser.add_argument("-c", help='segmentor to evaluate', default="pkuseg,jieba_fast,thulac")
310 | args = parser.parse_args()
311 | evaluate(args.i, args.o, args.n, args.c)
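# Illustrative invocations (paths are placeholders):
#   python SegEvaluation.py -n 10000 -c pkuseg,jieba,thulac
#   python SegEvaluation.py -i ../data/seg_data_big.txt -o ./result -c jieba_fast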
312 |
--------------------------------------------------------------------------------
/python/indi.tiandi.nlp.evaluation/metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: Jie
3 | # @Date: 2017-02-16 09:53:19
4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com
5 | # @Last Modified time: 2017-12-19 15:23:12
6 |
7 | # from operator import add
8 | #
9 | from __future__ import print_function
10 |
11 |
12 | ## input as sentence level labels
13 | def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"):
14 | sent_num = len(golden_lists)
15 | golden_full = []
16 | predict_full = []
17 | right_full = []
18 | right_tag = 0
19 | all_tag = 0
20 | for idx in range(0, sent_num):
21 | # word_list = sentence_lists[idx]
22 | golden_list = golden_lists[idx]
23 | predict_list = predict_lists[idx]
24 | for idy in range(len(golden_list)):
25 | if golden_list[idy] == predict_list[idy]:
26 | right_tag += 1
27 | all_tag += len(golden_list)
28 | if label_type == "BMES":
29 | gold_matrix = get_ner_BMES(golden_list)
30 | pred_matrix = get_ner_BMES(predict_list)
31 | else:
32 | gold_matrix = get_ner_BIO(golden_list)
33 | pred_matrix = get_ner_BIO(predict_list)
34 | # print "gold", gold_matrix
35 | # print "pred", pred_matrix
36 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix)))
37 | golden_full += gold_matrix
38 | predict_full += pred_matrix
39 | right_full += right_ner
40 | right_num = len(right_full)
41 | golden_num = len(golden_full)
42 | predict_num = len(predict_full)
43 | if predict_num == 0:
44 | precision = -1
45 | else:
46 | precision = (right_num + 0.0) / predict_num
47 | if golden_num == 0:
48 | recall = -1
49 | else:
50 | recall = (right_num + 0.0) / golden_num
51 | if (precision == -1) or (recall == -1) or (precision + recall) <= 0.:
52 | f_measure = -1
53 | else:
54 | f_measure = 2 * precision * recall / (precision + recall)
55 | accuracy = (right_tag + 0.0) / all_tag
56 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy
57 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num)
58 | return accuracy, precision, recall, f_measure, right_num, golden_num, predict_num
59 |
60 |
61 | def reverse_style(input_string):
62 | target_position = input_string.index('[')
63 | input_len = len(input_string)
64 | output_string = input_string[target_position:input_len] + input_string[0:target_position]
65 | return output_string
66 |
67 |
68 | def get_ner_BMES(label_list):
69 | # list_len = len(word_list)
70 | # assert(list_len == len(label_list)), "word list size unmatch with label list"
71 | list_len = len(label_list)
72 | begin_label = 'B-'
73 | end_label = 'E-'
74 | single_label = 'S-'
75 | whole_tag = ''
76 | index_tag = ''
77 | tag_list = []
78 | stand_matrix = []
79 | for i in range(0, list_len):
80 | # wordlabel = word_list[i]
81 | current_label = label_list[i].upper()
82 | if begin_label in current_label:
83 | if index_tag != '':
84 | tag_list.append(whole_tag + ',' + str(i - 1))
85 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
86 | index_tag = current_label.replace(begin_label, "", 1)
87 |
88 | elif single_label in current_label:
89 | if index_tag != '':
90 | tag_list.append(whole_tag + ',' + str(i - 1))
91 | whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i)
92 | tag_list.append(whole_tag)
93 | whole_tag = ""
94 | index_tag = ""
95 | elif end_label in current_label:
96 | if index_tag != '':
97 | tag_list.append(whole_tag + ',' + str(i))
98 | whole_tag = ''
99 | index_tag = ''
100 | else:
101 | continue
102 | if (whole_tag != '') & (index_tag != ''):
103 | tag_list.append(whole_tag)
104 | tag_list_len = len(tag_list)
105 |
106 | for i in range(0, tag_list_len):
107 | if len(tag_list[i]) > 0:
108 | tag_list[i] = tag_list[i] + ']'
109 | insert_list = reverse_style(tag_list[i])
110 | stand_matrix.append(insert_list)
111 | # print stand_matrix
112 | return stand_matrix
113 |
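# For illustration, a hypothetical BMES label sequence decodes to spans as follows:
#   get_ner_BMES(["B-Null", "E-Null", "S-Null"]) -> ["[0,1]NULL", "[2]NULL"]
# i.e. a two-character span at positions 0-1 and a single-character span at 2.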
114 |
115 | def get_ner_BIO(label_list):
116 | # list_len = len(word_list)
117 | # assert(list_len == len(label_list)), "word list size unmatch with label list"
118 | list_len = len(label_list)
119 | begin_label = 'B-'
120 | inside_label = 'I-'
121 | whole_tag = ''
122 | index_tag = ''
123 | tag_list = []
124 | stand_matrix = []
125 | for i in range(0, list_len):
126 | # wordlabel = word_list[i]
127 | current_label = label_list[i].upper()
128 | if begin_label in current_label:
129 | if index_tag == '':
130 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
131 | index_tag = current_label.replace(begin_label, "", 1)
132 | else:
133 | tag_list.append(whole_tag + ',' + str(i - 1))
134 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i)
135 | index_tag = current_label.replace(begin_label, "", 1)
136 |
137 | elif inside_label in current_label:
138 | if current_label.replace(inside_label, "", 1) == index_tag:
139 | whole_tag = whole_tag
140 | else:
141 | if (whole_tag != '') & (index_tag != ''):
142 | tag_list.append(whole_tag + ',' + str(i - 1))
143 | whole_tag = ''
144 | index_tag = ''
145 | else:
146 | if (whole_tag != '') & (index_tag != ''):
147 | tag_list.append(whole_tag + ',' + str(i - 1))
148 | whole_tag = ''
149 | index_tag = ''
150 |
151 | if (whole_tag != '') & (index_tag != ''):
152 | tag_list.append(whole_tag)
153 | tag_list_len = len(tag_list)
154 |
155 | for i in range(0, tag_list_len):
156 | if len(tag_list[i]) > 0:
157 | tag_list[i] = tag_list[i] + ']'
158 | insert_list = reverse_style(tag_list[i])
159 | stand_matrix.append(insert_list)
160 | return stand_matrix
161 |
162 |
163 | def readSentence(input_file):
164 | in_lines = open(input_file, 'r', encoding="utf-8").readlines()
165 | sentences = []
166 | labels = []
167 | sentence = []
168 | label = []
169 | for line in in_lines:
170 | if len(line) < 2:
171 | sentences.append(sentence)
172 | labels.append(label)
173 | sentence = []
174 | label = []
175 | else:
176 | pair = line.strip('\n').split(' ')
177 | sentence.append(pair[0])
178 | label.append(pair[-1])
179 | return sentences, labels
180 |
181 |
182 | def readTwoLabelSentence(input_file, pred_col=-1):
183 | in_lines = open(input_file, 'r', encoding="utf-8").readlines()
184 | sentences = []
185 | predict_labels = []
186 | golden_labels = []
187 | sentence = []
188 | predict_label = []
189 | golden_label = []
190 | for line in in_lines:
191 | if "##score##" in line:
192 | continue
193 | if len(line) < 2:
194 | sentences.append(sentence)
195 | golden_labels.append(golden_label)
196 | predict_labels.append(predict_label)
197 | sentence = []
198 | golden_label = []
199 | predict_label = []
200 | else:
201 | pair = line.strip('\n').split()
202 | sentence.append(pair[0])
203 | golden_label.append(pair[1])
204 | predict_label.append(pair[pred_col])
205 |
206 | return sentences, golden_labels, predict_labels
207 |
208 |
209 | def fmeasure_from_file(golden_file, predict_file, label_type="BMES"):
210 | print("Get f measure from file:", golden_file, predict_file)
211 | print("Label format:", label_type)
212 | golden_sent, golden_labels = readSentence(golden_file)
213 | predict_sent, predict_labels = readSentence(predict_file)
214 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)[:4]
215 | print("P:%s, R:%s, F:%s" % (P, R, F))
216 |
217 |
218 | def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1):
219 | sent, golden_labels, predict_labels = readTwoLabelSentence(twolabel_file, pred_col)
220 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)[:4]
221 | print("P:%s, R:%s, F:%s" % (P, R, F))
222 |
223 |
224 | if __name__ == '__main__':
225 | # print "sys:",len(sys.argv)
226 | gold = [["B-Null", "M-Null"]]
227 | predict = [["B-Null", "M-Null", "E-Null", "S-Null"]]
228 | print(get_ner_fmeasure(gold, predict))
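# expected: (1.0, 0.0, 0.0, -1, 0, 1, 2) -- the tags agree position-wise
# (accuracy 1.0), but gold's only span [0]NULL matches neither predicted span,
# so precision and recall are 0 and f is reported as -1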
229 |
230 | # sys.argv.append("result")
231 | # if len(sys.argv) == 3:
232 | # fmeasure_from_singlefile(sys.argv[1], "BMES", int(sys.argv[2]))
233 | # else:
234 | # fmeasure_from_singlefile(sys.argv[1], "BMES")
235 |
--------------------------------------------------------------------------------
/python/requirements.txt:
--------------------------------------------------------------------------------
1 | jieba
2 | jieba_fast
3 | pkuseg
4 | pynlpir
5 | thulac
6 | snownlp
7 | requests
8 | pyltp
9 | wget
--------------------------------------------------------------------------------