├── .gitignore ├── LICENSE ├── README.md ├── java ├── build.gradle ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── settings.gradle └── src │ └── main │ ├── java │ └── indi │ │ └── tiandi │ │ └── nlp │ │ ├── Seg.java │ │ ├── Sentence.java │ │ ├── Term.java │ │ ├── evaluation │ │ ├── SegEvaluation.java │ │ └── impl │ │ │ ├── AnsjImpl.java │ │ │ ├── FNLPImpl.java │ │ │ ├── HanLPImpl.java │ │ │ ├── JcsegImpl.java │ │ │ ├── JiebaAnalysisImpl.java │ │ │ ├── MMSeg4jImpl.java │ │ │ ├── MYNLPImpl.java │ │ │ ├── PaodingImpl.java │ │ │ ├── StanfordCoreNLPImpl.java │ │ │ ├── ThulacImpl.java │ │ │ └── WordImpl.java │ │ └── tool │ │ ├── HttpRequest.java │ │ └── ZipUtil.java │ └── resources │ ├── ansj │ └── library │ │ ├── ambiguity.dic │ │ ├── default.dic │ │ ├── regex.dic │ │ ├── stop.dic │ │ └── synonyms.dic │ └── logback.xml └── python ├── indi.tiandi.nlp.evaluation ├── SegEvaluation.py └── metric.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | ### Java template 108 | # Compiled class file 109 | *.class 110 | 111 | # Log file 112 | *.log 113 | 114 | # BlueJ files 115 | *.ctxt 116 | 117 | # Mobile Tools for Java (J2ME) 118 | .mtj.tmp/ 119 | 120 | # Package Files # 121 | *.jar 122 | *.war 123 | *.nar 124 | *.ear 125 | *.zip 126 | *.tar.gz 127 | *.rar 128 | 129 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 130 | hs_err_pid* 131 | 132 | !/java/gradle/wrapper/gradle-wrapper.jar 133 | ### Example user template template 134 | ### Example user template 135 | 136 | # IntelliJ project files 137 | .idea 138 | *.iml 139 | .gradle 140 | out 141 | gen 142 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 
2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 目录 2 | - java 3 | - python 4 | - 总结 5 | ## java 6 | #### Requirement 7 | java8 8 | 9 | #### 步骤 10 | 11 | 1. `git clone https://github.com/tiandiweizun/nlp-evaluation.git` 12 | 2. `cd nlp-evaluation/java` 13 | 3. (windows)  `.\gradlew.bat build`   (linux)  `./gradlew build` 14 | 4. `java -Dfile.encoding=utf-8 -jar build/libs/nlp-evaluation-java-1.0.0.jar` 15 | 16 | 17 | #### 说明 18 | 1. java -jar nlp-evaluation-java-1.0.0.jar 有3个参数,可以执行 java -jar nlp-evaluation-java-1.0.0.jar -h 查看 19 |
-i 分词文件,默认为data/seg_data_big.txt文件,每行一个句子,每个词用空格分开,可以指定自己的测试集 
-o 分词结果存储路径,默认不存储 21 |
-n 最大读取分词文件行数 22 |
-c 需要评估的分词器名称,用英文逗号隔开,默认HanLP,jieba,thulac,mynlp,示例: -c=HanLP 23 | 24 | 2. 由于[斯坦福分词](https://github.com/stanfordnlp/CoreNLP)效果一般,速度极慢,且模型巨大,在打包的时候已经排除(不影响在IDE里面测试), 25 | 打包如果要包含斯坦福分词,修改build.gradle,注释掉exclude(dependency('edu.stanford.nlp:stanford-corenlp')) 26 | 3. 由于[Word](https://github.com/ysc/word)、[Ansj](https://github.com/NLPchina/ansj_seg)、[Jcseg](https://github.com/lionsoul2014/jcseg)、[MMSeg4j](https://github.com/chenlb/mmseg4j-core)存在bug(把词语拼接起来和原始句子不一样),在代码里面已经注释掉了,不进行测试。 27 | 4. 依赖的库均存在于maven中心仓库,像庖丁、复旦分词等找不到的,这里没有测试 28 | 29 | 30 | #### 测试效果 31 | 32 | 总行数:2533709 总字符数:28374490 33 | 34 | |segmentor|precision| recall | f1 | speed(字符/ms)_windows | speed(字符/ms)_linux | 35 | | --| -- | ------ | --- | --- | --- | 36 | |[HanLP](https://github.com/hankcs/HanLP) | 0.900433 | 0.910614 | 0.905495 | 1034.470451 | 797.596346 | 37 | |[jieba](https://github.com/huaban/jieba-analysis) | 0.852657 | 0.803263 | 0.827223 | 1774.181830 | 980.865943 | 38 | |[thulac](https://github.com/yizhiru/thulac4j) | 0.884405 | 0.901930 | 0.893082 | 1449.749131 | 939.832732 | 39 | |[mynlp](https://github.com/mayabot/mynlp) | 0.901661 | 0.900246 | 0.900953 | 1739.272404 | 1178.930115| 40 | 41 | 经过多次测试发现,linux第一个性能偏低,thulac在linux上速度不是特别稳定,最快与jieba差不多 42 | 43 | #### 开发者 44 | 45 | - 建议使用idea打开或者导入java目录,把data目录拷贝到java目录,直接可以运行SegEvaluation调试。 46 | - 可以打开stanford和其他分词器 47 | - 评测自定义分词器:继承Seg类并实现segment方法,添加到evaluators即可。 48 | 49 | ## python 50 | 51 | #### Requirement 52 | 53 | Python:3 54 | 其他参见 requirements.txt 55 | 56 | #### 步骤 57 | 58 | 1. git clone https://github.com/tiandiweizun/nlp-evaluation.git 59 | 2. cd nlp-evaluation 60 | 3. pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 61 | 4. cd python/indi.tiandi.nlp.evaluation 62 | 5. python3 SegEvaluation.py 63 | 64 | #### 说明 65 | 1. python3 SegEvaluation.py 有3个参数,可以执行 python3 SegEvaluation.py -h 查看 66 |
-i 分词文件,默认为data/seg.data_big文件,每行一个句子,每个词用空格分开,可以指定自己的测试集 67 |
-o 分词结果存储路径,默认不存储 68 |
-n 最大读取分词文件行数,由于python速度太慢,建议设置 69 |
-c 需要评估的分词器名称,用英文逗号隔开,默认pkuseg,jieba_fast,thulac 70 | 71 | 2. [pynlpir](https://github.com/tsroten/pynlpir)存在bug(把词语拼接起来和原始句子不一样),[pyltp](https://github.com/HIT-SCIR/pyltp)在windows上不易安装,这里都没有进行测试,比较慢的也没有测试 72 | 73 | ##### 测试效果 74 | 75 | 总行数:2533709 总字符数:28374490 76 | 77 | |segmentor|precision| recall | f1 | speed(字符/ms)_windows | speed(字符/ms)_linux | 78 | | --| -- | ------ | --- | --- |--- | 79 | |[pkuseg](https://github.com/lancopku/pkuseg-python) | 0.890170 | 0.886405 | 0.888284 | 34.077104 | 19.826954 | 80 | |[jieba](https://github.com/fxsjy/jieba) | 0.855293 | 0.808204 | 0.831082 | 169.651694 | 104.554222 | 81 | |[jieba_fast](https://github.com/deepcs233/jieba_fast) | 0.855299 | 0.808182 | 0.831073 | 408.241520 | 203.815985 | 82 | |[thulac](https://github.com/thunlp/THULAC-Python) | 0.848839 | 0.883031 | 0.865597 | 28.831738 | 16.565779 | 83 | |[pyltp](https://github.com/HIT-SCIR/pyltp) | 0.894885 | 0.908761 | 0.901770 | --------- | 52.371131 | 84 | |[snownlp](https://github.com/isnowfy/snownlp) | 0.811029 | 0.864835 | 0.837069 | --------- | 1.947430 | 85 | 86 | #### 开发者 87 | 88 | - 建议使用pycharm打开python目录,即可运行 89 | - 如果需要使用pynlpir,需要修改pynlpir_path的安装目录 90 | - 如果需要使用pyltp,需要修改ltp_data_dir的模型分词目录 91 | - 评测自定义分词器:只要实现segment方法和向evaluators追加即可。 92 | 93 | ## 总结 94 | - 性能:java 远高于python,至少多了一个数量级。 95 | - 效果:对于jieba和thulac,在python和java上表现的不同,需要更多的时间去寻找原因,且java的thulac4j非官方提供。 96 | - 数据:默认数据集来源于[cws_evaluation](https://github.com/ysc/cws_evaluation),该项目为评估中文分词的性能与效果,对于效果该项目采用的是行完美率这个指标,但是对于长句,这个指标会变的不合适,如果不同算法的错误率不一样,但是如果有一个错的词,会导致整个句子都是错的,不能很好的区分算法的precision 97 | -------------------------------------------------------------------------------- /java/build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | jcenter() 4 | } 5 | 6 | dependencies { 7 | classpath "com.github.jengelman.gradle.plugins:shadow:2.0.2" 8 | } 9 | } 10 | 11 | apply plugin: 'idea' 12 | apply plugin: 'java' 13 | apply plugin: 
"com.github.johnrengelman.shadow" 14 | 15 | group 'indi.nlp' 16 | version '1.0-SNAPSHOT' 17 | 18 | sourceCompatibility = 1.8 19 | 20 | tasks.withType(JavaCompile) { 21 | options.encoding = "UTF-8" 22 | } 23 | 24 | //禁掉jar task 25 | jar.enabled = false 26 | shadowJar { 27 | baseName = "nlp-evaluation-java" 28 | //classifier是生成jar包的后缀 29 | classifier = null 30 | version = '1.0.1' 31 | manifest { 32 | attributes 'Main-Class': 'indi.tiandi.nlp.evaluation.SegEvaluation' 33 | } 34 | 35 | from("../") { 36 | include 'data/seg_data_big.txt' 37 | } 38 | } 39 | 40 | repositories { 41 | maven{ url 'http://maven.aliyun.com/nexus/content/groups/public/'} 42 | } 43 | 44 | dependencies { 45 | implementation 'org.ansj:ansj_seg:5.1.6' 46 | implementation 'com.hankcs:hanlp:portable-1.6.8' 47 | implementation 'org.apdplat:word:1.3.1' 48 | implementation 'io.github.yizhiru:thulac4j:3.1.2' 49 | implementation 'com.chenlb.mmseg4j:mmseg4j-core:1.10.0' 50 | implementation 'org.lionsoul:jcseg-core:2.4.0' 51 | implementation 'com.huaban:jieba-analysis:1.0.2' 52 | implementation 'com.mayabot.mynlp:mynlp-segment:3.0.1' 53 | implementation 'org.apache.commons:commons-lang3:3.11' 54 | 55 | testImplementation 'junit:junit:4.2' 56 | } 57 | 58 | 59 | artifacts { 60 | shadowJar; 61 | } 62 | 63 | build.dependsOn(shadowJar); -------------------------------------------------------------------------------- /java/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /java/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | 
distributionUrl=https\://services.gradle.org/distributions/gradle-4.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /java/gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 
70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" 
"$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /java/gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /java/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'java' 2 | 3 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/Seg.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp; 2 | 3 | import java.util.List; 4 | /** 5 | * 分词器抽象类 6 | * 7 | * @date 2019/3/4 8 | * @author tiandi 9 | */ 10 | public abstract class Seg { 11 | private String name; 12 | 13 | public abstract List segment(String sentence); 14 | 15 | public String getName() { 16 | return this.name; 17 | } 18 | 19 | public void setName(String name) { 20 | this.name = name; 21 | } 22 | 23 | public Seg(String name) { 24 | this.name = name; 25 | } 26 | 27 | public Seg() { 28 | this.name = this.getClass().getSimpleName(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/Sentence.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp; 2 | 3 | import java.util.List; 4 | 5 | /** 6 | * 句子 7 | * 8 | * @author tiandi 9 | * @date 2019/3/4 10 | */ 11 | public class Sentence { 12 | private List terms; 13 | 14 | public Sentence(List terms) { 15 | this.terms = terms; 16 | } 17 | 18 | public String getString() { 19 | StringBuilder sb = new StringBuilder(); 20 | for (Term term : terms) { 21 | sb.append(term.getWord()); 22 | } 23 | return sb.toString(); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/Term.java: 
-------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp; 2 | 3 | /** 4 | * 词和词性 5 | * 6 | * @date 2019/3/4 7 | * @author tiandi 8 | */ 9 | public class Term { 10 | /** 11 | * 词 12 | */ 13 | private String word; 14 | /** 15 | * 词性 16 | */ 17 | private String pos; 18 | 19 | public Term(String word) { 20 | this.word = word; 21 | } 22 | 23 | public Term(String word, String pos) { 24 | this.word = word; 25 | this.pos = pos; 26 | } 27 | 28 | public String getWord() { 29 | return word; 30 | } 31 | 32 | public void setWord(String word) { 33 | this.word = word; 34 | } 35 | 36 | public String getPos() { 37 | return pos; 38 | } 39 | 40 | public void setPos(String pos) { 41 | this.pos = pos; 42 | } 43 | 44 | @Override 45 | public String toString() { 46 | return word; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/SegEvaluation.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation; 2 | 3 | import indi.tiandi.nlp.Seg; 4 | import indi.tiandi.nlp.Term; 5 | import indi.tiandi.nlp.tool.HttpRequest; 6 | import indi.tiandi.nlp.tool.ZipUtil; 7 | import org.apache.commons.lang3.StringUtils; 8 | 9 | import java.io.*; 10 | import java.net.JarURLConnection; 11 | import java.net.URL; 12 | import java.net.URLDecoder; 13 | import java.util.ArrayList; 14 | import java.util.Arrays; 15 | import java.util.Enumeration; 16 | import java.util.List; 17 | import java.util.jar.JarEntry; 18 | import java.util.jar.JarFile; 19 | 20 | /** 21 | * SegEvaluation 22 | * 分词评估器 23 | * 24 | * @author tiandi 25 | * @date 2019/3/4 26 | */ 27 | public class SegEvaluation { 28 | public static final String testSentence = "这是一个测试句子"; 29 | 30 | public static final String helpMessage; 31 | 32 | static { 33 | StringBuilder sb = new StringBuilder(); 34 | sb.append("thank you for using 
nlp-evaluation\n"); 35 | sb.append("github: https://github.com/tiandiweizun/chinese-segmentation-evaluation\n"); 36 | sb.append("\t-i or -input\t\t\tfile to segment,jar default using the file in ./data/seg_data_big.txt and debug model using chinese-segmentation-evaluation/data/seg_data_big.txt\n"); 37 | sb.append("\t-o or -output\t\t\tpath to save the result, default is not saving\n"); 38 | sb.append("\t-n or -max_line_number\t\tmaximum number of read rows, default reading all\n"); 39 | sb.append("\t-c or -contains\t\t\t segmentor to evaluate,default contains HanLP,jieba,thulac\n"); 40 | sb.append("\t-h or -help\t\t\tmessage for help\n"); 41 | sb.append("\n"); 42 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.0.jar -n=10\n"); 43 | sb.append("\te.g., java -jar nlp-evaluation-java-1.0.0.jar nlp-evaluation/data/seg.data_big -n=10\n"); 44 | helpMessage = sb.toString(); 45 | } 46 | 47 | public static String getFileNameWithExtension(File file) { 48 | String fileName = file.getName(); 49 | int i = fileName.lastIndexOf("."); 50 | if (i <= 0) { 51 | i = fileName.length(); 52 | } 53 | return fileName.substring(0, i); 54 | } 55 | 56 | public static void main(String[] args) throws Exception { 57 | Config config = parseParams(args); 58 | 59 | String rFileName = config.rFileName; 60 | String wFilePath = config.wFilePath; 61 | boolean writeResult = false; 62 | int maxLineCount = config.maxLineCount; 63 | 64 | InputStream inputStream = null; 65 | File file = new File(rFileName); 66 | if (!file.exists()) { 67 | URL resource = SegEvaluation.class.getClassLoader().getResource(rFileName); 68 | if (resource != null) { 69 | // 从jar包内部加载 70 | inputStream = SegEvaluation.class.getClassLoader().getResourceAsStream((rFileName)); 71 | } else { 72 | File tempZipFile = new File(Config.zipFileName); 73 | if (getFileNameWithExtension(file).equals(getFileNameWithExtension(tempZipFile))) { 74 | boolean download = true; 75 | if (tempZipFile.exists()) { 76 | try { 77 | //解压 78 | 
ZipUtil.unZip(tempZipFile, tempZipFile.getParent()); 79 | download = false; 80 | } catch (Exception e) { 81 | //删除错误zip文件 82 | tempZipFile.delete(); 83 | } 84 | } 85 | // 从互联网下载并解压 86 | if (download) { 87 | System.out.println(String.format("从 %s 下载文件,如果下载较慢,亦可手动下载,保存到 %s 即可", Config.url, tempZipFile.getAbsolutePath())); 88 | try { 89 | // 下载 90 | HttpRequest.download(Config.url, Config.zipFileName); 91 | System.out.println("下载完成"); 92 | // 解压 93 | ZipUtil.unZip(tempZipFile, tempZipFile.getParent()); 94 | } catch (IOException e) { 95 | System.out.println(String.format("下载或解压错误:%s", e.getMessage())); 96 | System.exit(1); 97 | } 98 | } 99 | } else { 100 | // 自定义的文件未找到 101 | System.out.println("未从本地和jar包内找到文件:" + rFileName); 102 | System.exit(1); 103 | } 104 | } 105 | } 106 | if (file.exists()) { 107 | inputStream = new FileInputStream(rFileName); 108 | System.out.println("读入分词文件地址:" + file.getAbsolutePath()); 109 | } 110 | if (wFilePath.length() > 0) { 111 | writeResult = true; 112 | System.out.println("分词结果写入地址:" + new File(wFilePath).getAbsolutePath()); 113 | } 114 | calcPFRScore(inputStream, wFilePath, writeResult, maxLineCount, config.segmentorNames); 115 | } 116 | 117 | public static void calcPFRScore(InputStream inputStream, String wFilePath, boolean writeResult, int maxLineCount, 118 | List segmentorNames) { 119 | String line = ""; 120 | try { 121 | List evaluators = new ArrayList<>(); 122 | List classesFromPackage = getClassNames("indi.tiandi.nlp.evaluation.impl"); 123 | for (String segmentorName : segmentorNames) { 124 | for (String className : classesFromPackage) { 125 | int i = className.lastIndexOf(".") + 1; 126 | String simpleClassName = className.substring(i); 127 | if (!simpleClassName.toLowerCase().startsWith(segmentorName.toLowerCase())) { 128 | continue; 129 | } 130 | Class aClass = Class.forName(className); 131 | if (Seg.class.isAssignableFrom(aClass)) { 132 | evaluators.add(new Evaluator(aClass.asSubclass(Seg.class), segmentorName)); 133 | break; 
134 | } 135 | } 136 | } 137 | 138 | // evaluators.add(new Evaluator(JiebaAnalysisImpl.class)); 139 | // evaluators.add(new Evaluator(ThulacImpl.class)); 140 | // 分词太慢 141 | // evaluators.add(new Evaluator(new StanfordCoreNLPImpl())); 142 | // 以下分词都存在bug,导致分词后的句子与分词前的句子不一样 143 | // evaluators.add(new Evaluator(WordImpl.class)); 144 | // evaluators.add(new Evaluator(AnsjImpl.class)); 145 | // evaluators.add(new Evaluator(JcsegImpl.class)); 146 | // evaluators.add(new Evaluator(MMSeg4jImpl.class)); 147 | // 获得项目根目录的绝对路径 148 | if (evaluators.size() == 0) { 149 | System.out.println("没有任何待评测分词器"); 150 | System.exit(-1); 151 | } 152 | BufferedReader br = new BufferedReader(new InputStreamReader(inputStream, "utf-8")); 153 | List> gold = new ArrayList<>(); 154 | List test = new ArrayList<>(); 155 | int charCount = 0; 156 | boolean calcScore = true; 157 | int lineCount = 0; 158 | // -1 表示读取所有数据 159 | while ((line = br.readLine()) != null) { 160 | if (line.trim().length() == 0) { 161 | continue; 162 | } 163 | String[] s = line.split(" "); 164 | gold.add(Arrays.asList(s)); 165 | test.add(line.replace(" ", "")); 166 | charCount += test.get(test.size() - 1).length(); 167 | 168 | lineCount += 1; 169 | if (maxLineCount > 0 && lineCount >= maxLineCount) { 170 | break; 171 | } 172 | } 173 | System.out.println(); 174 | System.out.println(String.format("总行数:%d\t总字符数:%d", gold.size(), charCount)); 175 | for (Evaluator item : evaluators) { 176 | 177 | System.out.println(); 178 | System.out.println(item.seg.getName() + " 评测开始"); 179 | if (!item.init) { 180 | System.out.println(item.seg.getName() + " 初始化错误,跳过"); 181 | continue; 182 | } 183 | BufferedWriter bw = null; 184 | if (writeResult) { 185 | wFilePath = wFilePath.replace("\\", "/"); 186 | if (!wFilePath.endsWith("/")) { 187 | wFilePath += "/"; 188 | } 189 | String wFileName = wFilePath + item.seg.getName(); 190 | bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(wFileName), "utf-8")); 191 | } 192 | long start = 
System.currentTimeMillis(); 193 | int right_num = 0; 194 | int gold_num = 0; 195 | int predict_num = 0; 196 | for (int i = 0; i < test.size(); i++) { 197 | line = test.get(i); 198 | List segment = item.seg.segment(line); 199 | List predict = new ArrayList<>(); 200 | for (Term term : segment) { 201 | predict.add(term.getWord()); 202 | } 203 | if (calcScore) { 204 | if (!StringUtils.join(predict, "").equals(line)) { 205 | System.out.println(item.seg.getName() + "\t" + line); 206 | continue; 207 | } 208 | int result[] = calcScore(gold.get(i), predict); 209 | right_num += result[0]; 210 | gold_num += result[1]; 211 | predict_num += result[2]; 212 | } 213 | if (writeResult) { 214 | bw.write(StringUtils.join(predict, " ") + "\n"); 215 | } 216 | } 217 | if (writeResult) { 218 | bw.close(); 219 | } 220 | item.time = System.currentTimeMillis() - start; 221 | double precision = 0; 222 | double recall = 0; 223 | double f = 0; 224 | if (predict_num != 0) { 225 | precision = right_num * 1.0 / predict_num; 226 | } 227 | if (gold_num != 0) { 228 | recall = right_num * 1.0 / gold_num; 229 | } 230 | if (precision + recall > 0) { 231 | f = 2 * precision * recall / (precision + recall); 232 | } 233 | System.out.println(String.format("precision:%f \t recall:%f \t f1:%f", precision, recall, f)); 234 | System.out.println(String.format("耗时:%d ms,\t速度:%f 字符/毫秒", item.time, charCount * 1.0 / item.time)); 235 | } 236 | } catch (Exception e) { 237 | System.out.println(line); 238 | e.printStackTrace(); 239 | } 240 | } 241 | 242 | public static int[] calcScore(List gold, List predict) { 243 | int gold_offset = 0; 244 | int predict_offset = 0; 245 | 246 | int gold_term_index = 0; 247 | int predict_term_index = 0; 248 | 249 | int right = 0; 250 | int total = gold.size(); 251 | int right_and_wrong = predict.size(); 252 | while (gold_term_index < total || predict_term_index < right_and_wrong) { 253 | if (gold_offset == predict_offset) { 254 | if 
(gold.get(gold_term_index).equals(predict.get(predict_term_index))) { 255 | right += 1; 256 | } 257 | int[] result = update(gold_offset, gold_term_index, gold); 258 | gold_offset = result[0]; 259 | gold_term_index = result[1]; 260 | result = update(predict_offset, predict_term_index, predict); 261 | predict_offset = result[0]; 262 | predict_term_index = result[1]; 263 | } else if (gold_offset < predict_offset) { 264 | int[] result = update(gold_offset, gold_term_index, gold); 265 | gold_offset = result[0]; 266 | gold_term_index = result[1]; 267 | } else { 268 | int[] result = update(predict_offset, predict_term_index, predict); 269 | predict_offset = result[0]; 270 | predict_term_index = result[1]; 271 | } 272 | } 273 | int[] result = {right, total, right_and_wrong}; 274 | return result; 275 | } 276 | 277 | public static int[] update(int offset, int index, List terms) { 278 | offset += terms.get(index).length(); 279 | index += 1; 280 | int[] result = {offset, index}; 281 | return result; 282 | } 283 | 284 | public static Config parseParams(String[] args) { 285 | // message for help 286 | try { 287 | 288 | Config config = new Config(); 289 | if (args.length == 1) { 290 | if (args[0].equalsIgnoreCase("-h") || args[0].equalsIgnoreCase("-help")) { 291 | System.out.println(helpMessage); 292 | System.exit(0); 293 | } 294 | } 295 | boolean containsOption = false; 296 | for (int i = 0; i < args.length; i++) { 297 | String arg = args[i]; 298 | if (arg.startsWith("-") && arg.contains("=")) { 299 | containsOption = true; 300 | String[] split = arg.split("="); 301 | String paramName = split[0].trim().substring(1).toLowerCase(); 302 | String paramValue = split[1].trim(); 303 | switch (paramName) { 304 | case "i": 305 | case "input": 306 | config.rFileName = paramValue; 307 | break; 308 | case "o": 309 | case "output": 310 | config.wFilePath = paramValue; 311 | break; 312 | case "n": 313 | case "max_line_number": 314 | config.maxLineCount = Integer.parseInt(paramValue); 315 | 
break; 316 | case "c": 317 | case "contains": 318 | String[] segmentorNames = paramValue.split(","); 319 | config.segmentorNames = Arrays.asList(segmentorNames); 320 | break; 321 | } 322 | } else if (containsOption) { 323 | System.out.println("optional argument follows keyword argument"); 324 | } else { 325 | if (i == 0) { 326 | config.rFileName = args[0].trim(); 327 | } else if (i == 1) { 328 | config.wFilePath = args[1].trim(); 329 | } else if (i == 2) { 330 | config.maxLineCount = Integer.parseInt(args[2].trim()); 331 | } 332 | } 333 | } 334 | return config; 335 | } catch (Exception e) { 336 | System.out.println("参数错误:" + e.getMessage()); 337 | System.out.println(helpMessage); 338 | System.exit(0); 339 | } 340 | return null; 341 | } 342 | 343 | /** 344 | * 从包package中获取所有的Class 345 | * 346 | * @param packageName 347 | * @return 348 | */ 349 | 350 | public static List getClassNames(String packageName) { 351 | //第一个class类的集合 352 | List classes = new ArrayList<>(); 353 | //是否循环迭代 354 | boolean recursive = true; 355 | //获取包的名字 并进行替换 356 | String packageDirName = packageName.replace('.', '/'); 357 | //定义一个枚举的集合 并进行循环来处理这个目录下的things 358 | Enumeration dirs; 359 | try { 360 | dirs = Thread.currentThread().getContextClassLoader().getResources(packageDirName); 361 | //循环迭代下去 362 | while (dirs.hasMoreElements()) { 363 | //获取下一个元素 364 | URL url = dirs.nextElement(); 365 | //得到协议的名称 366 | String protocol = url.getProtocol(); 367 | //如果是以文件的形式保存在服务器上 368 | if ("file".equals(protocol)) { 369 | //获取包的物理路径 370 | String filePath = URLDecoder.decode(url.getFile(), "UTF-8"); 371 | //以文件的方式扫描整个包下的文件 并添加到集合中 372 | findAndAddClassesInPackageByFile(packageName, filePath, recursive, classes); 373 | } else if ("jar".equals(protocol)) { 374 | //如果是jar包文件 375 | //定义一个JarFile 376 | JarFile jar; 377 | try { 378 | //获取jar 379 | jar = ((JarURLConnection) url.openConnection()).getJarFile(); 380 | //从此jar包 得到一个枚举类 381 | Enumeration entries = jar.entries(); 382 | //同样的进行循环迭代 383 | while 
(entries.hasMoreElements()) { 384 | //获取jar里的一个实体 可以是目录 和一些jar包里的其他文件 如META-INF等文件 385 | JarEntry entry = entries.nextElement(); 386 | String name = entry.getName(); 387 | //如果是以/开头的 388 | if (name.charAt(0) == '/') { 389 | //获取后面的字符串 390 | name = name.substring(1); 391 | } 392 | //如果前半部分和定义的包名相同 393 | if (name.startsWith(packageDirName)) { 394 | int idx = name.lastIndexOf('/'); 395 | //如果以"/"结尾 是一个包 396 | if (idx != -1) { 397 | //获取包名 把"/"替换成"." 398 | packageName = name.substring(0, idx).replace('/', '.'); 399 | } 400 | //如果可以迭代下去 并且是一个包 401 | if ((idx != -1) || recursive) { 402 | //如果是一个.class文件 而且不是目录 403 | if (name.endsWith(".class") && !entry.isDirectory()) { 404 | //去掉后面的".class" 获取真正的类名 405 | String className = name.substring(packageName.length() + 1, name.length() - 6); 406 | //添加到classes 407 | classes.add(packageName + '.' + className); 408 | } 409 | } 410 | } 411 | } 412 | } catch (IOException e) { 413 | e.printStackTrace(); 414 | } 415 | } 416 | } 417 | } catch (IOException e) { 418 | e.printStackTrace(); 419 | } 420 | return classes; 421 | } 422 | 423 | /** 424 | * 以文件的形式来获取包下的所有Class 425 | * 426 | * @param packageName 427 | * @param packagePath 428 | * @param recursive 429 | * @param classes 430 | */ 431 | 432 | public static void findAndAddClassesInPackageByFile(String packageName, String packagePath, final boolean recursive, 433 | List classes) { 434 | //获取此包的目录 建立一个File 435 | File dir = new File(packagePath); 436 | //如果不存在或者 也不是目录就直接返回 437 | if (!dir.exists() || !dir.isDirectory()) { 438 | return; 439 | } 440 | //如果存在 就获取包下的所有文件 包括目录 441 | File[] dirfiles = dir.listFiles(new FileFilter() { 442 | //自定义过滤规则 如果可以循环(包含子目录) 或则是以.class结尾的文件(编译好的java类文件) 443 | public boolean accept(File file) { 444 | return (recursive && file.isDirectory()) || (file.getName().endsWith(".class")); 445 | } 446 | }); 447 | //循环所有文件 448 | for (File file : dirfiles) { 449 | //如果是目录 则继续扫描 450 | if (file.isDirectory()) { 451 | findAndAddClassesInPackageByFile(packageName + "." 
+ file.getName(), file.getAbsolutePath(), recursive, classes); 452 | } else { 453 | //如果是java类文件 去掉后面的.class 只留下类名 454 | String className = file.getName().substring(0, file.getName().length() - 6); 455 | classes.add(packageName + '.' + className); 456 | } 457 | } 458 | } 459 | } 460 | 461 | class Evaluator { 462 | 463 | public Seg seg; 464 | public long time = 0; 465 | public boolean init = false; 466 | 467 | public Evaluator(Class segClass) { 468 | this(segClass, segClass.getSimpleName()); 469 | } 470 | 471 | public Evaluator(Class segClass, String name) { 472 | try { 473 | long start = System.currentTimeMillis(); 474 | System.out.println(name + " 初始化开始"); 475 | this.seg = segClass.newInstance(); 476 | this.seg.setName(name); 477 | List terms = this.seg.segment(SegEvaluation.testSentence); 478 | StringBuilder sb = new StringBuilder(); 479 | for (Term term : terms) { 480 | sb.append(term.getWord()); 481 | } 482 | long end = System.currentTimeMillis(); 483 | long cost = end - start; 484 | if (!sb.toString().equals(SegEvaluation.testSentence)) { 485 | System.out.println(name + " 初始化错误,句子:" + SegEvaluation.testSentence + ",分词结果:" + terms); 486 | } else { 487 | this.init = true; 488 | System.out.println(name + " 初始化结束,耗时:" + cost + " ms"); 489 | } 490 | } catch (InstantiationException e) { 491 | e.printStackTrace(); 492 | } catch (IllegalAccessException e) { 493 | e.printStackTrace(); 494 | } 495 | } 496 | } 497 | 498 | class Config { 499 | public static final String zipFileName = "data/seg_data_big.zip"; 500 | public static final String url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip"; 501 | public String rFileName = "data/seg_data_big.txt"; 502 | public String wFilePath = ""; 503 | public boolean writeResult = false; 504 | public int maxLineCount = -1; 505 | public List segmentorNames = new ArrayList<>(Arrays.asList("HanLP", "Jieba", "Thulac", "mynlp")); 506 | } 
-------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/AnsjImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import indi.tiandi.nlp.Seg; 4 | import indi.tiandi.nlp.Term; 5 | import org.ansj.domain.Result; 6 | import org.ansj.library.AmbiguityLibrary; 7 | import org.ansj.library.DicLibrary; 8 | import org.ansj.splitWord.analysis.ToAnalysis; 9 | import org.ansj.util.MyStaticValue; 10 | 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | /** 14 | * Ansj分词 15 | * 16 | * @date 2019/3/4 17 | * @author tiandi 18 | */ 19 | public class AnsjImpl extends Seg { 20 | static { 21 | // 设置后速度会慢25% 左右 22 | // MyStaticValue.ENV.put(DicLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/default.dic").getPath()); 23 | // MyStaticValue.ENV.put(AmbiguityLibrary.DEFAULT, AnsjImpl.class.getClassLoader().getResource("ansj/library/ambiguity.dic").getPath()); 24 | } 25 | 26 | @Override 27 | public List segment(String sentence) { 28 | Result result = ToAnalysis.parse(sentence); 29 | List terms = new ArrayList<>(); 30 | for (org.ansj.domain.Term term : result) { 31 | terms.add(new Term(term.getName())); 32 | } 33 | return terms; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/FNLPImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | //import indi.nlp.Seg; 4 | 5 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/HanLPImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import com.hankcs.hanlp.HanLP; 4 | import 
indi.tiandi.nlp.Seg; 5 | import indi.tiandi.nlp.Term; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | /** 10 | * HanLP分词 11 | * 12 | * @date 2019/3/4 13 | * @author tiandi 14 | */ 15 | public class HanLPImpl extends Seg { 16 | 17 | @Override 18 | public List segment(String sentence) { 19 | // List segment = BasicTokenizer.segment(sentence); 20 | List segment = HanLP.segment(sentence); 21 | List terms = new ArrayList<>(); 22 | for (com.hankcs.hanlp.seg.common.Term term : segment) { 23 | terms.add(new Term(term.word)); 24 | } 25 | return terms; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/JcsegImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import indi.tiandi.nlp.Seg; 4 | import indi.tiandi.nlp.Term; 5 | import org.lionsoul.jcseg.tokenizer.core.*; 6 | 7 | import java.io.IOException; 8 | import java.io.StringReader; 9 | import java.util.ArrayList; 10 | import java.util.List; 11 | /** 12 | * Jcseg分词 13 | * 14 | * @date 2019/3/4 15 | * @author tiandi 16 | */ 17 | public class JcsegImpl extends Seg { 18 | 19 | private final static JcsegTaskConfig config = new JcsegTaskConfig(true); 20 | private final static ADictionary dic = DictionaryFactory.createSingletonDictionary(config); 21 | private static ISegment seg; 22 | 23 | static { 24 | try { 25 | seg = SegmentFactory.createJcseg(JcsegTaskConfig.COMPLEX_MODE, new Object[]{config, dic}); 26 | } 27 | catch (JcsegException e) { 28 | e.printStackTrace(); 29 | } 30 | } 31 | 32 | @Override 33 | public List segment(String sentence) { 34 | List terms = new ArrayList<>(); 35 | try { 36 | seg.reset(new StringReader(sentence)); 37 | IWord word = null; 38 | while ((word = seg.next()) != null) { 39 | terms.add(new Term(word.getValue())); 40 | } 41 | } 42 | catch (IOException e) { 43 | e.printStackTrace(); 44 | } 45 | 
return terms; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/JiebaAnalysisImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import com.huaban.analysis.jieba.JiebaSegmenter; 4 | import indi.tiandi.nlp.Seg; 5 | import indi.tiandi.nlp.Term; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | 10 | /** 11 | * 结巴分词 12 | * 13 | * @author tiandi 14 | * @date 2019/3/7 15 | */ 16 | public class JiebaAnalysisImpl extends Seg { 17 | private static final JiebaSegmenter seg = new JiebaSegmenter(); 18 | 19 | @Override 20 | public List segment(String sentence) { 21 | List strings = seg.sentenceProcess(sentence); 22 | List terms = new ArrayList<>(); 23 | for (String string : strings) { 24 | terms.add(new Term(string)); 25 | } 26 | return terms; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/MMSeg4jImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import com.chenlb.mmseg4j.ComplexSeg; 4 | import com.chenlb.mmseg4j.Dictionary; 5 | import com.chenlb.mmseg4j.MMSeg; 6 | import com.chenlb.mmseg4j.Word; 7 | import indi.tiandi.nlp.Seg; 8 | import indi.tiandi.nlp.Term; 9 | 10 | import java.io.IOException; 11 | import java.io.StringReader; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | /** 15 | * MMSeg分词 16 | * 17 | * @date 2019/3/4 18 | * @author tiandi 19 | */ 20 | public class MMSeg4jImpl extends Seg { 21 | private static final Dictionary dic = Dictionary.getInstance(); 22 | private static final ComplexSeg seg = new ComplexSeg(dic); 23 | private static final MMSeg mmSeg = new MMSeg(new StringReader(""), seg); 24 | 25 | @Override 26 | public List segment(String 
sentence) { 27 | mmSeg.reset(new StringReader(sentence)); 28 | Word word = null; 29 | List terms = new ArrayList<>(); 30 | try { 31 | while ((word = mmSeg.next()) != null) { 32 | if (word != null) { 33 | terms.add(new Term(word.getString())); 34 | } 35 | } 36 | } 37 | catch (IOException e) { 38 | System.out.println(sentence); 39 | e.printStackTrace(); 40 | } 41 | return terms; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/MYNLPImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.Sentence; 6 | import com.mayabot.nlp.segment.WordTerm; 7 | import indi.tiandi.nlp.Seg; 8 | import indi.tiandi.nlp.Term; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | /** 13 | * MYNLP分词 14 | * 15 | * @date 2019/3/4 16 | * @author tiandi 17 | */ 18 | public class MYNLPImpl extends Seg { 19 | Lexer lexer = Lexers.builder().basic().core().keepOriCharOutput().build(); 20 | 21 | @Override 22 | public List segment(String sentence) { 23 | Sentence result = lexer.scan(sentence); 24 | List terms = new ArrayList<>(); 25 | for (WordTerm term : result.toList()) { 26 | terms.add(new Term(term.getWord())); 27 | } 28 | return terms; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/PaodingImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | //import indi.nlp.Seg; 4 | //import indi.nlp.Term; 5 | //import org.apache.lucene.analysis.TokenStream; 6 | // 7 | //import java.io.IOException; 8 | //import java.util.ArrayList; 9 | //import java.util.List; 10 | // 11 | ///** 12 | // * 庖丁分词 13 | 
// * 14 | // * @author tiandi 15 | // * @date 2019/3/4 16 | // */ 17 | //public class PaodingImpl implements Seg { 18 | // public static PaodingAnalyzer paodingAnalyzer = new PaodingAnalyzer(); 19 | // 20 | // @Override 21 | // public List segment(String sentence) { 22 | // List terms = new ArrayList<>(); 23 | // try { 24 | // TokenStream tokenStream = paodingAnalyzer.tokenStream("", sentence); 25 | // System.out.println(tokenStream.toString()); 26 | // } catch (IOException e) { 27 | // e.printStackTrace(); 28 | // } 29 | // return terms; 30 | // } 31 | // 32 | // public static void main(String[] args) { 33 | // new PaodingImpl().segment("我是中国人"); 34 | // } 35 | //} 36 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/StanfordCoreNLPImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | //import edu.stanford.nlp.ling.CoreLabel; 4 | //import edu.stanford.nlp.pipeline.CoreDocument; 5 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP; 6 | //import indi.tiandi.nlp.Seg; 7 | //import indi.tiandi.nlp.Term; 8 | // 9 | //import java.util.ArrayList; 10 | //import java.util.List; 11 | //import java.util.Properties; 12 | // 13 | ///** 14 | // * 斯坦福CoreNLP分词 15 | // * 16 | // * @date 2019/3/4 17 | // * @author tiandi 18 | // */ 19 | //public class StanfordCoreNLPImpl extends Seg { 20 | // public static StanfordCoreNLP stanfordCoreNLP; 21 | // 22 | // static { 23 | // try { 24 | // Properties props = new Properties(); 25 | // props.load(StanfordCoreNLPImpl.class.getClassLoader().getResourceAsStream("StanfordCoreNLP-chinese.properties")); 26 | // props.setProperty("annotators", "tokenize,ssplit,pos"); 27 | // stanfordCoreNLP = new StanfordCoreNLP(props); 28 | // } catch (Exception e) { 29 | // e.printStackTrace(); 30 | // } 31 | // } 32 | // 33 | // @Override 34 | // public List segment(String sentence) { 
35 | // CoreDocument exampleDocument = new CoreDocument(sentence); 36 | // // annotate document 37 | // stanfordCoreNLP.annotate(exampleDocument); 38 | // // access tokens from a CoreDocument 39 | // // a token is represented by a CoreLabel 40 | // List firstSentenceTokens = exampleDocument.sentences().get(0).tokens(); 41 | // // this for loop will print out all of the tokens and the character offset info 42 | // List terms = new ArrayList<>(); 43 | // for (CoreLabel token : firstSentenceTokens) { 44 | // terms.add(new Term(token.word(), token.tag())); 45 | // } 46 | // return terms; 47 | // } 48 | //} 49 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/ThulacImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import indi.tiandi.nlp.Seg; 4 | import indi.tiandi.nlp.Term; 5 | import io.github.yizhiru.thulac4j.Segmenter; 6 | 7 | import java.util.ArrayList; 8 | import java.util.List; 9 | /** 10 | * 清华分词 11 | * 12 | * @date 2019/3/4 13 | * @author tiandi 14 | */ 15 | public class ThulacImpl extends Seg { 16 | @Override 17 | public List segment(String sentence) { 18 | List segment = Segmenter.segment(sentence); 19 | List terms=new ArrayList<>(); 20 | for (String s : segment) { 21 | terms.add(new Term(s)); 22 | } 23 | return terms; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/evaluation/impl/WordImpl.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.evaluation.impl; 2 | 3 | import indi.tiandi.nlp.Seg; 4 | import indi.tiandi.nlp.Term; 5 | import org.apdplat.word.segmentation.Segmentation; 6 | import org.apdplat.word.segmentation.SegmentationAlgorithm; 7 | import org.apdplat.word.segmentation.SegmentationFactory; 8 | import 
org.apdplat.word.segmentation.Word; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | /** 13 | * word分词 14 | * 15 | * @date 2019/3/4 16 | * @author tiandi 17 | */ 18 | public class WordImpl extends Seg { 19 | public static final Segmentation segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.MaxNgramScore); 20 | 21 | @Override 22 | public List segment(String sentence) { 23 | List words = segmentation.seg(sentence); 24 | List terms = new ArrayList<>(); 25 | for (Word word : words) { 26 | terms.add(new Term(word.getText())); 27 | } 28 | 29 | return terms; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/tool/HttpRequest.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.tool; 2 | 3 | import java.io.File; 4 | import java.io.FileOutputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.net.URL; 8 | import java.net.URLConnection; 9 | 10 | /** 11 | * Http下载 12 | * 13 | * @author tiandi 14 | * @date 2021/2/18 15 | */ 16 | public class HttpRequest { 17 | /** 18 | * 从网络Url中下载文件 19 | * 20 | * @param httpUrl url地址 21 | * @param saveFile 保存文件路径和名称 22 | * @throws IOException 23 | */ 24 | public static boolean download(String httpUrl, String saveFile) throws IOException { 25 | // 下载网络文件 26 | int bytesum = 0; 27 | int byteread = 0; 28 | URL url = new URL(httpUrl); 29 | URLConnection conn = url.openConnection(); 30 | File file = new File(saveFile); 31 | if (!file.getParentFile().exists()) { 32 | file.getParentFile().mkdirs(); 33 | } 34 | try (InputStream inStream = conn.getInputStream(); 35 | FileOutputStream fs = new FileOutputStream(saveFile); 36 | ) { 37 | byte[] buffer = new byte[1204]; 38 | while ((byteread = inStream.read(buffer)) != -1) { 39 | bytesum += byteread; 40 | fs.write(buffer, 0, byteread); 41 | } 42 | return true; 43 | } catch (IOException 
e) { 44 | throw e; 45 | } 46 | } 47 | } -------------------------------------------------------------------------------- /java/src/main/java/indi/tiandi/nlp/tool/ZipUtil.java: -------------------------------------------------------------------------------- 1 | package indi.tiandi.nlp.tool; 2 | 3 | import java.io.*; 4 | import java.nio.charset.Charset; 5 | import java.nio.file.Paths; 6 | import java.util.Enumeration; 7 | import java.util.zip.ZipEntry; 8 | import java.util.zip.ZipFile; 9 | 10 | /** 11 | * 压缩和解压缩(https://www.bbsmax.com/A/x9J2bZLMJ6/) 12 | * 更多压缩文件格式可以参见https://www.bookstack.cn/read/hutool/a56da94bbb16617b.md 13 | * 14 | * @author tiandi 15 | * @date 2021/2/1 16 | */ 17 | public class ZipUtil { 18 | public static boolean unZip(File zipFile, String descDir) throws IOException { 19 | boolean flag = false; 20 | // 指定编码,否则压缩包里面不能有中文目录 21 | InputStream in = null; 22 | OutputStream out = null; 23 | ZipFile zip = new ZipFile(zipFile, Charset.forName("gbk")); 24 | try { 25 | for (Enumeration entries = zip.entries(); entries.hasMoreElements(); ) { 26 | ZipEntry entry = (ZipEntry) entries.nextElement(); 27 | String zipEntryName = entry.getName(); 28 | File file = Paths.get(descDir, zipEntryName).toFile(); 29 | File dir = file; 30 | if (!zipEntryName.endsWith("/") && !zipEntryName.endsWith("\\")) { 31 | // 非文件夹获取父文件 32 | dir = file.getParentFile(); 33 | } 34 | if (!dir.exists()) { 35 | dir.mkdirs(); 36 | } 37 | if (file.isDirectory()) { 38 | continue; 39 | } 40 | in = zip.getInputStream(entry); 41 | out = new FileOutputStream(file); 42 | byte[] buf1 = new byte[2048]; 43 | int len; 44 | while ((len = in.read(buf1)) > 0) { 45 | out.write(buf1, 0, len); 46 | } 47 | in.close(); 48 | out.close(); 49 | } 50 | flag = true; 51 | // 必须关闭,否则无法删除该zip文件 52 | } catch (IOException exception) { 53 | zip.close(); 54 | if (in != null) { 55 | in.close(); 56 | } 57 | if (out != null) { 58 | out.close(); 59 | } 60 | throw exception; 61 | } 62 | return flag; 63 | } 64 | } 65 | 
-------------------------------------------------------------------------------- /java/src/main/resources/ansj/library/ambiguity.dic: -------------------------------------------------------------------------------- 1 | 习近平 nr 2 | 李民 nr 工作 vn 3 | 三个 m 和尚 n 4 | 的确 d 定 v 不 v 5 | 大 a 和尚 n 6 | 张三 nr 和 c 7 | 动漫 n 游戏 n 8 | 邓颖超 nr 生前 t 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /java/src/main/resources/ansj/library/regex.dic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tiandiweizun/chinese-segmentation-evaluation/d0c96997bbe39fe73a114b8f380e50d4af6d5741/java/src/main/resources/ansj/library/regex.dic -------------------------------------------------------------------------------- /java/src/main/resources/ansj/library/stop.dic: -------------------------------------------------------------------------------- 1 | ? 2 | : 3 | . 4 | , 5 | is 6 | a 7 | # 8 | v nature 9 | .*了 regex -------------------------------------------------------------------------------- /java/src/main/resources/logback.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /python/indi.tiandi.nlp.evaluation/SegEvaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import time 5 | import zipfile 6 | 7 | import jieba 8 | import jieba_fast 9 | import pkuseg 10 | import pynlpir 11 | import requests 12 | import thulac 13 | import wget 14 | from snownlp import SnowNLP 15 | 16 | 17 | class Seg: 18 | def __init__(self): 19 | pass 20 | 21 | def segment(self, sentence): 22 | pass 23 | 24 | 25 | # 中科院分词,python版本有bug和demo页面不一致 26 | class pynlpir_impl: 27 | def __init__(self): 28 
| # 通过代码自动更新lience 29 | pynlpir_impl.update_pynlpir_lience() 30 | pynlpir.open() 31 | 32 | @staticmethod 33 | def update_pynlpir_lience(): 34 | # pynlpir 分词授权下载地址 35 | # https://github.com/NLPIR-team/NLPIR/tree/master/License 36 | url = 'https://raw.githubusercontent.com/NLPIR-team/NLPIR/master/License/license%20for%20a%20month/NLPIR-ICTCLAS%E5%88%86%E8%AF%8D%E7%B3%BB%E7%BB%9F%E6%8E%88%E6%9D%83/NLPIR.user' 37 | r = requests.get(url) 38 | # 拷贝到pynlpir相应的目录 39 | with open(os.path.join(pynlpir.__path__[0], "Data", "NLPIR.user"), "wb") as code: 40 | code.write(r.content) 41 | 42 | def segment(self, sentence): 43 | return pynlpir.segment(sentence, pos_tagging=False) 44 | 45 | 46 | # 北大分词 47 | class pkuseg_impl(Seg): 48 | def __init__(self): 49 | self.pku_seg = pkuseg.pkuseg() 50 | 51 | def segment(self, sentence): 52 | return self.pku_seg.cut(sentence) 53 | 54 | 55 | # 结巴分词 56 | class jieba_impl(Seg): 57 | def segment(self, sentence): 58 | return jieba.lcut(sentence) 59 | 60 | 61 | # jieba_fast分词 62 | class jieba_fast_impl(Seg): 63 | def segment(self, sentence): 64 | return jieba_fast.lcut(sentence) 65 | 66 | 67 | # snownlp分词 68 | class snownlp_impl(Seg): 69 | def segment(self, sentence): 70 | return SnowNLP(sentence).words 71 | 72 | 73 | # 清华分词 74 | class thulac_impl(Seg): 75 | def __init__(self): 76 | self.thu1 = thulac.thulac(seg_only=True) # 默认模式 77 | 78 | def segment(self, sentence): 79 | return self.thu1.cut(sentence, text=True).split() # 进行一句话分词 80 | 81 | 82 | # 哈工大分词 83 | class pyltp_impl(Seg): 84 | 85 | def __init__(self): 86 | ltp_data_dir = '/home/work/tiandi/ltp.model/ltp_data' # ltp模型目录的路径,由于cws文件比较大,需要自行下载 http://model.scir.yunfutech.com/model/ltp_data_v3.4.0.zip 87 | cws_model_path = os.path.join(ltp_data_dir, 'cws.model') # 分词模型路径,模型名称为`cws.model` 88 | from pyltp import Segmentor 89 | self.segmentor = Segmentor() 90 | self.segmentor.load(cws_model_path) 91 | 92 | def segment(self, sentence): 93 | return self.segmentor.segment(sentence); 94 | 95 | 96 | 
test_sentence = "这是一个测试句子" 97 | 98 | 99 | # 分词评估器 100 | class Evaluator: 101 | def __init__(self, seg_tool, name=None): 102 | ''' 103 | :param seg_tool: 分词器 104 | :param name: 分词器名称(可以直接重构到分词器里面去) 105 | ''' 106 | if not name: 107 | name = seg_tool.__name__ 108 | self.name = name 109 | time_start = time.time() 110 | print("%s 初始化开始" % self.name) 111 | self.seg = seg_tool() 112 | self.time = 0 113 | self.init = False 114 | result = self.seg.segment(test_sentence) 115 | # print(result) 116 | time_end = time.time() 117 | time_cost = (time_end - time_start) * 1000 118 | if "".join(result) != test_sentence: 119 | print("%s 初始化错误,句子:%s,分词结果:%s" % (self.name, test_sentence, result)) 120 | else: 121 | self.init = True 122 | print("%s 初始化结束,耗时:%d ms" % (self.name, time_cost)) 123 | 124 | 125 | # 用BMES标记分词结果 126 | def get_ner(terms): 127 | temp_gold = [] 128 | for term in terms: 129 | if len(term) == 1: 130 | temp_gold.append("S-Null") 131 | else: 132 | term_item = ["M-Null"] * len(term) 133 | term_item[0] = "B-Null" 134 | term_item[-1] = "E-Null" 135 | temp_gold.extend(term_item) 136 | return temp_gold 137 | 138 | 139 | # 评估各个分词器 140 | def evaluate(input, output, max_line_count, include): 141 | # 分词文件目录 142 | if len(input) == 0: 143 | # 项目root目录 144 | root = os.getcwd()[:os.getcwd().rindex("python")] 145 | data_dir = os.path.join(root, "data") 146 | input = os.path.join(data_dir, "seg_data_big.txt") 147 | download_seg_file = True 148 | if not os.path.exists(input): 149 | temp_zip_file = os.path.join(data_dir, "seg_data_big.zip") 150 | if os.path.exists(temp_zip_file): 151 | try: 152 | zipfile.ZipFile(temp_zip_file).extractall(data_dir) 153 | download_seg_file = False 154 | except Exception as e: 155 | os.remove(temp_zip_file) 156 | if download_seg_file: 157 | url = "https://github.com/tiandiweizun/chinese-segmentation-evaluation/releases/download/v1.0.1/seg_data_big.zip" 158 | print("从 %s 下载文件,如果下载较慢,亦可手动下载,保存到 %s 即可" % (url, temp_zip_file)) 159 | try: 160 | if not 
os.path.exists(os.path.dirname(temp_zip_file)): 161 | os.makedirs(os.path.dirname(temp_zip_file)) 162 | wget.download(url, out=temp_zip_file) 163 | print("下载完成") 164 | zipfile.ZipFile(temp_zip_file).extractall(data_dir) 165 | except Exception as e: 166 | print("下载或解压错误:%s" % e) 167 | sys.exit(1) 168 | 169 | if not os.path.exists(input): 170 | print("未从本地到文件:%s" % input) 171 | sys.exit(1) 172 | print("读入分词文件地址:" + input) 173 | write_result = False 174 | if len(output) > 0: 175 | print("分词结果写入地址:" + output) 176 | write_result = True 177 | max_line_count = int(max_line_count) 178 | if max_line_count > 0: 179 | print("最大读取行数:" + str(max_line_count)) 180 | 181 | evaluators = [] 182 | for name in include.split(","): 183 | evaluators.append(Evaluator(globals()[name + "_impl"], name)) 184 | # evaluators.append(Evaluator(pynlpir_impl)) 185 | # evaluators.append(Evaluator(pkuseg_impl)) 186 | # evaluators.append(Evaluator(jieba_impl)) 187 | # evaluators.append(Evaluator(snownlp_impl)) 188 | # evaluators.append(Evaluator(thulac_impl)) 189 | # evaluators.append(Evaluator(pyltp_impl)) 190 | time_start = time.time() 191 | print("读入分词文件开始") 192 | with open(input, encoding="utf-8") as f: 193 | lines = f.readlines() 194 | time_end = time.time() 195 | time_cost = (time_end - time_start) * 1000 196 | print("读取文件结束,耗时:%d ms" % (time_cost)) 197 | 198 | gold = [] 199 | test = [] 200 | char_count = 0 201 | # max_line_count = 100000 202 | if max_line_count <= 0: 203 | max_line_count = len(lines) 204 | line_count = 0 205 | for line in lines: 206 | gold.append(line.strip().split()) 207 | test.append("".join(gold[-1])) 208 | char_count += len(test[-1]) 209 | line_count += 1 210 | if line_count > max_line_count: 211 | break 212 | print("总行数:%d\t总字符数:%d" % (line_count, char_count)) 213 | calcScore = True 214 | for item in evaluators: 215 | print() 216 | print("%s 评测开始" % item.name) 217 | if not item.init: 218 | print("%s 初始化错误,跳过" % (item.name)) 219 | continue 220 | if write_result: 221 | file 
= open(os.path.join(output, item.name), mode="w", encoding="utf-8") 222 | time_start = time.time() 223 | right_num = 0 224 | gold_num = 0 225 | predict_num = 0 226 | 227 | for i in range(line_count): 228 | line = test[i] 229 | predict = item.seg.segment(line) 230 | 231 | if (calcScore): 232 | # temp_gold = get_ner(gold[i]) 233 | # temp_predict = get_ner(predict) 234 | if len("".join(predict)) != len(line): 235 | print(item.name + "\t" + line + "\t") 236 | continue 237 | # accuracy, precision, recall, f_measure, right_num, golden_num, predict_num = get_ner_fmeasure([temp_gold], [temp_predict]) 238 | right_num_local, golden_num_local, predict_num_local = calc_score(gold[i], predict) 239 | 240 | right_num += right_num_local 241 | gold_num += golden_num_local 242 | predict_num += predict_num_local 243 | 244 | # if (right_num != right_num_local or golden_num != golden_num_local or predict_num != predict_num_local): 245 | # print("badcase:" % line) 246 | 247 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num) 248 | # print() 249 | if write_result: 250 | file.write(" ".join(predict) + "\n") 251 | 252 | time_end = time.time() 253 | item.time = (time_end - time_start) * 1000 254 | precision = 0 255 | recall = 0 256 | f = 0 257 | if write_result: 258 | file.close() 259 | if predict_num != 0: 260 | precision = right_num * 1.0 / predict_num 261 | if gold_num != 0: 262 | recall = right_num * 1.0 / gold_num 263 | if precision + recall > 0: 264 | f = 2 * precision * recall / (precision + recall) 265 | 266 | print("precision:%f \t recall:%f \t f1:%f" % (precision, recall, f)) 267 | if item.time == 0: 268 | print("耗时太少,速度无法评估") 269 | else: 270 | print("耗时:%d ms,\t速度:%f 字符/毫秒" % (item.time, char_count * 1.0 / item.time)) 271 | 272 | 273 | def update(offset, index, terms): 274 | offset += len(terms[index]) 275 | index += 1 276 | return offset, index 277 | 278 | 279 | def calc_score(gold, predict): 280 | gold_offset = 0 281 | predict_offset = 
0 282 | 283 | gold_term_index = 0 284 | predict_term_index = 0 285 | 286 | right = 0 287 | total = len(gold) 288 | right_and_wrong = len(predict) 289 | while (gold_term_index < len(gold) or predict_term_index < len(predict)): 290 | if gold_offset == predict_offset: 291 | if gold[gold_term_index] == predict[predict_term_index]: 292 | right += 1 293 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold) 294 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict) 295 | elif gold_offset < predict_offset: 296 | gold_offset, gold_term_index = update(gold_offset, gold_term_index, gold) 297 | else: 298 | predict_offset, predict_term_index = update(predict_offset, predict_term_index, predict) 299 | return right, total, right_and_wrong 300 | 301 | 302 | if __name__ == '__main__': 303 | parser = argparse.ArgumentParser(description='中文分词对比测试') 304 | parser.add_argument('-i', 305 | help='file to segment, default using the file in chinese-segmentation-evaluation/data/seg_data_big.txt', 306 | default="") 307 | parser.add_argument('-o', help='path to save the result, default is not saving', default="") 308 | parser.add_argument("-n", help='maximum number of read rows, default reading all', default="-1") 309 | parser.add_argument("-c", help='segmentor to evaluate', default="pkuseg,jieba_fast,thulac") 310 | args = parser.parse_args() 311 | evaluate(args.i, args.o, args.n, args.c) 312 | -------------------------------------------------------------------------------- /python/indi.tiandi.nlp.evaluation/metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: Jie 3 | # @Date: 2017-02-16 09:53:19 4 | # @Last Modified by: Jie Yang, Contact: jieynlp@gmail.com 5 | # @Last Modified time: 2017-12-19 15:23:12 6 | 7 | # from operator import add 8 | # 9 | from __future__ import print_function 10 | 11 | 12 | ## input as sentence level labels 13 | def 
get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"): 14 | sent_num = len(golden_lists) 15 | golden_full = [] 16 | predict_full = [] 17 | right_full = [] 18 | right_tag = 0 19 | all_tag = 0 20 | for idx in range(0, sent_num): 21 | # word_list = sentence_lists[idx] 22 | golden_list = golden_lists[idx] 23 | predict_list = predict_lists[idx] 24 | for idy in range(len(golden_list)): 25 | if golden_list[idy] == predict_list[idy]: 26 | right_tag += 1 27 | all_tag += len(golden_list) 28 | if label_type == "BMES": 29 | gold_matrix = get_ner_BMES(golden_list) 30 | pred_matrix = get_ner_BMES(predict_list) 31 | else: 32 | gold_matrix = get_ner_BIO(golden_list) 33 | pred_matrix = get_ner_BIO(predict_list) 34 | # print "gold", gold_matrix 35 | # print "pred", pred_matrix 36 | right_ner = list(set(gold_matrix).intersection(set(pred_matrix))) 37 | golden_full += gold_matrix 38 | predict_full += pred_matrix 39 | right_full += right_ner 40 | right_num = len(right_full) 41 | golden_num = len(golden_full) 42 | predict_num = len(predict_full) 43 | if predict_num == 0: 44 | precision = -1 45 | else: 46 | precision = (right_num + 0.0) / predict_num 47 | if golden_num == 0: 48 | recall = -1 49 | else: 50 | recall = (right_num + 0.0) / golden_num 51 | if (precision == -1) or (recall == -1) or (precision + recall) <= 0.: 52 | f_measure = -1 53 | else: 54 | f_measure = 2 * precision * recall / (precision + recall) 55 | accuracy = (right_tag + 0.0) / all_tag 56 | # print "Accuracy: ", right_tag,"/",all_tag,"=",accuracy 57 | # print("gold_num = ", golden_num, " pred_num = ", predict_num, " right_num = ", right_num) 58 | return accuracy, precision, recall, f_measure, right_num, golden_num, predict_num 59 | 60 | 61 | def reverse_style(input_string): 62 | target_position = input_string.index('[') 63 | input_len = len(input_string) 64 | output_string = input_string[target_position:input_len] + input_string[0:target_position] 65 | return output_string 66 | 67 | 68 | def 
get_ner_BMES(label_list): 69 | # list_len = len(word_list) 70 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 71 | list_len = len(label_list) 72 | begin_label = 'B-' 73 | end_label = 'E-' 74 | single_label = 'S-' 75 | whole_tag = '' 76 | index_tag = '' 77 | tag_list = [] 78 | stand_matrix = [] 79 | for i in range(0, list_len): 80 | # wordlabel = word_list[i] 81 | current_label = label_list[i].upper() 82 | if begin_label in current_label: 83 | if index_tag != '': 84 | tag_list.append(whole_tag + ',' + str(i - 1)) 85 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) 86 | index_tag = current_label.replace(begin_label, "", 1) 87 | 88 | elif single_label in current_label: 89 | if index_tag != '': 90 | tag_list.append(whole_tag + ',' + str(i - 1)) 91 | whole_tag = current_label.replace(single_label, "", 1) + '[' + str(i) 92 | tag_list.append(whole_tag) 93 | whole_tag = "" 94 | index_tag = "" 95 | elif end_label in current_label: 96 | if index_tag != '': 97 | tag_list.append(whole_tag + ',' + str(i)) 98 | whole_tag = '' 99 | index_tag = '' 100 | else: 101 | continue 102 | if (whole_tag != '') & (index_tag != ''): 103 | tag_list.append(whole_tag) 104 | tag_list_len = len(tag_list) 105 | 106 | for i in range(0, tag_list_len): 107 | if len(tag_list[i]) > 0: 108 | tag_list[i] = tag_list[i] + ']' 109 | insert_list = reverse_style(tag_list[i]) 110 | stand_matrix.append(insert_list) 111 | # print stand_matrix 112 | return stand_matrix 113 | 114 | 115 | def get_ner_BIO(label_list): 116 | # list_len = len(word_list) 117 | # assert(list_len == len(label_list)), "word list size unmatch with label list" 118 | list_len = len(label_list) 119 | begin_label = 'B-' 120 | inside_label = 'I-' 121 | whole_tag = '' 122 | index_tag = '' 123 | tag_list = [] 124 | stand_matrix = [] 125 | for i in range(0, list_len): 126 | # wordlabel = word_list[i] 127 | current_label = label_list[i].upper() 128 | if begin_label in current_label: 129 | if 
index_tag == '': 130 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) 131 | index_tag = current_label.replace(begin_label, "", 1) 132 | else: 133 | tag_list.append(whole_tag + ',' + str(i - 1)) 134 | whole_tag = current_label.replace(begin_label, "", 1) + '[' + str(i) 135 | index_tag = current_label.replace(begin_label, "", 1) 136 | 137 | elif inside_label in current_label: 138 | if current_label.replace(inside_label, "", 1) == index_tag: 139 | whole_tag = whole_tag 140 | else: 141 | if (whole_tag != '') & (index_tag != ''): 142 | tag_list.append(whole_tag + ',' + str(i - 1)) 143 | whole_tag = '' 144 | index_tag = '' 145 | else: 146 | if (whole_tag != '') & (index_tag != ''): 147 | tag_list.append(whole_tag + ',' + str(i - 1)) 148 | whole_tag = '' 149 | index_tag = '' 150 | 151 | if (whole_tag != '') & (index_tag != ''): 152 | tag_list.append(whole_tag) 153 | tag_list_len = len(tag_list) 154 | 155 | for i in range(0, tag_list_len): 156 | if len(tag_list[i]) > 0: 157 | tag_list[i] = tag_list[i] + ']' 158 | insert_list = reverse_style(tag_list[i]) 159 | stand_matrix.append(insert_list) 160 | return stand_matrix 161 | 162 | 163 | def readSentence(input_file): 164 | in_lines = open(input_file, 'r', encoding="utf-8").readlines() 165 | sentences = [] 166 | labels = [] 167 | sentence = [] 168 | label = [] 169 | for line in in_lines: 170 | if len(line) < 2: 171 | sentences.append(sentence) 172 | labels.append(label) 173 | sentence = [] 174 | label = [] 175 | else: 176 | pair = line.strip('\n').split(' ') 177 | sentence.append(pair[0]) 178 | label.append(pair[-1]) 179 | return sentences, labels 180 | 181 | 182 | def readTwoLabelSentence(input_file, pred_col=-1): 183 | in_lines = open(input_file, 'r', encoding="utf-8").readlines() 184 | sentences = [] 185 | predict_labels = [] 186 | golden_labels = [] 187 | sentence = [] 188 | predict_label = [] 189 | golden_label = [] 190 | for line in in_lines: 191 | if "##score##" in line: 192 | continue 193 | if 
len(line) < 2: 194 | sentences.append(sentence) 195 | golden_labels.append(golden_label) 196 | predict_labels.append(predict_label) 197 | sentence = [] 198 | golden_label = [] 199 | predict_label = [] 200 | else: 201 | pair = line.strip('\n').split() 202 | sentence.append(pair[0]) 203 | golden_label.append(pair[1]) 204 | predict_label.append(pair[pred_col]) 205 | 206 | return sentences, golden_labels, predict_labels 207 | 208 | 209 | def fmeasure_from_file(golden_file, predict_file, label_type="BMES"): 210 | print("Get f measure from file:", golden_file, predict_file) 211 | print("Label format:", label_type) 212 | golden_sent, golden_labels = readSentence(golden_file) 213 | predict_sent, predict_labels = readSentence(predict_file) 214 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 215 | print("P:%sm R:%s, F:%s" % (P, R, F)) 216 | 217 | 218 | def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1): 219 | sent, golden_labels, predict_labels = readTwoLabelSentence(twolabel_file, pred_col) 220 | A, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type) 221 | print("P:%s, R:%s, F:%s" % (P, R, F)) 222 | 223 | 224 | if __name__ == '__main__': 225 | # print "sys:",len(sys.argv) 226 | gold = [["B-Null", "M-Null"]] 227 | predict = [["B-Null", "M-Null", "E-Null", "S-Null"]] 228 | print(get_ner_fmeasure(gold, predict)) 229 | 230 | # sys.argv.append("result") 231 | # if len(sys.argv) == 3: 232 | # fmeasure_from_singlefile(sys.argv[1], "BMES", int(sys.argv[2])) 233 | # else: 234 | # fmeasure_from_singlefile(sys.argv[1], "BMES") 235 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | jieba 2 | jieba_fast 3 | pkuseg 4 | pynlpir 5 | thulac 6 | snownlp 7 | requests 8 | pyltp 9 | --------------------------------------------------------------------------------