├── .travis.yml
├── LICENSE.txt
├── README.md
├── config
│   ├── extra_main.dic
│   ├── extra_single_word.dic
│   ├── extra_single_word_full.dic
│   ├── extra_single_word_low_freq.dic
│   ├── extra_stopword.dic
│   ├── extra_test.dic
│   ├── main.dic
│   ├── preposition.dic
│   ├── quantifier.dic
│   ├── stopword.dic
│   ├── suffix.dic
│   └── surname.dic
├── index
│   ├── _1.cfe
│   ├── _1.cfs
│   ├── _1.si
│   ├── _3.cfe
│   ├── _3.cfs
│   ├── _3.si
│   ├── segments_2
│   ├── segments_4
│   └── write.lock
├── licenses
│   ├── lucene-LICENSE.txt
│   └── lucene-NOTICE.txt
├── pom.xml
└── src
    ├── main
    │   ├── assemblies
    │   │   └── plugin.xml
    │   ├── java
    │   │   └── org
    │   │       ├── elasticsearch
    │   │       │   ├── index
    │   │       │   │   └── analysis
    │   │       │   │       ├── IkAnalyzerProvider.java
    │   │       │   │       └── IkTokenizerFactory.java
    │   │       │   └── plugin
    │   │       │       └── analysis
    │   │       │           └── ik
    │   │       │               └── AnalysisIkPlugin.java
    │   │       └── wltea
    │   │           └── analyzer
    │   │               ├── cfg
    │   │               │   └── Configuration.java
    │   │               ├── core
    │   │               │   ├── AnalyzeContext.java
    │   │               │   ├── CharacterUtil.java
    │   │               │   ├── IKArbitrator.java
    │   │               │   ├── IKSegmenter.java
    │   │               │   ├── Lexeme.java
    │   │               │   ├── LexemePath.java
    │   │               │   ├── QuickSortSet.java
    │   │               │   └── segmenter
    │   │               │       ├── CJKSegmenter.java
    │   │               │       ├── CN_QuantifierSegmenter.java
    │   │               │       ├── ISegmenter.java
    │   │               │       └── LetterSegmenter.java
    │   │               ├── dic
    │   │               │   ├── DicFile.java
    │   │               │   ├── DictSegment.java
    │   │               │   ├── Dictionary.java
    │   │               │   ├── Hit.java
    │   │               │   └── RemoteDicMonitor.java
    │   │               ├── help
    │   │               │   ├── CharacterHelper.java
    │   │               │   ├── ESPluginLoggerFactory.java
    │   │               │   ├── PrefixPluginLogger.java
    │   │               │   └── Sleep.java
    │   │               └── lucene
    │   │                   ├── IKAnalyzer.java
    │   │                   └── IKTokenizer.java
    │   └── resources
    │       ├── plugin-descriptor.properties
    │       └── plugin-security.policy
    └── test
        └── java
            └── org
                └── wltea
                    └── analyzer
                        └── TokenizerTest.java

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: java
sudo: required
jdk:
  - oraclejdk8
install: true
script:
  - java -version
  - mvn clean package

--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | 
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 | 
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 | 
8 | 1. Definitions.
9 | 
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 | 
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 | 
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 | 
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 | 
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files. 
30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
IK Analysis for Elasticsearch
=============================

The IK Analysis plugin integrates the Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into elasticsearch and supports custom dictionaries.

Analyzer: `ik_smart`, `ik_max_word`; Tokenizer: `ik_smart`, `ik_max_word`

About this fork
-----
This tokenizer is adapted from medcl's analyzer on GitHub (https://github.com/medcl/elasticsearch-analysis-ik).
The changes are:

1. Before the rework, all indices shared a single dictionary, and there was no way to give different indices different dictionaries.
After it, the dictionaries to load are determined by the dictionary files set when the index's custom analyzer is configured,
so indices for different businesses can use different dictionaries.

2. Restructured the `Dictionary` class for clearer logic, cutting the original 600 lines of code down to 300,
and replaced the rigid dictionary-loading mechanism: `IKAnalyzer.cfg.xml` is no longer read; dictionaries are configured directly when the user creates the index analyzer.

3. Improved the loading mechanism for remote dictionaries.

4. Removed unnecessary `synchronized` locks from the tokenizer, improving performance.

5. Dictionary file lookup order: dictionary files are read from elasticsearch's `config/analysis-ik/` directory first;
if not found there, they are read from the tokenizer's own directory under `plugins`.


### Dictionary Configuration

The `IKAnalyzer.cfg.xml` configuration file is no longer used; all custom extension dictionaries are set when the tokenizer is defined, for example:
```
{
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "analysis": {
            "tokenizer": {
                "my_tokenizer": {
                    "type": "ik_max_word",
                    "ext_dic_main": [
                        "https://xxx.com/sss/ssss.dic",  // a remote dictionary: the path starts with http or https
                        "dddd.dic"                       // a local file on the ES server: it must be placed in IK's config directory
                    ]
                }
            },
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "my_tokenizer",
                    "filter": ["lowercase", "my_stemmer"]
                }
            }
        }
    }
}
```
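For a minimal end-to-end sketch of the per-index dictionary feature (the names here are illustrative: an index `my_index` and a local dictionary `dddd.dic` that you have already placed in IK's config directory):

```bash
# Create an index whose ik_max_word tokenizer loads its own dictionary
curl -XPUT http://localhost:9200/my_index -H 'Content-Type: application/json' -d'
{
  "settings": {
    "analysis": {
      "tokenizer": {
        "my_tokenizer": {
          "type": "ik_max_word",
          "ext_dic_main": ["dddd.dic"]
        }
      }
    }
  }
}'

# Exercise the index-local tokenizer
curl -XGET http://localhost:9200/my_index/_analyze -H 'Content-Type: application/json' -d'
{"text": "中华人民共和国国歌", "tokenizer": "my_tokenizer"}'
```

Because the tokenizer exists only in that index's settings, the `_analyze` call has to go through the index (see also FAQ item 3 below).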
{"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"} 135 | ' 136 | ``` 137 | 138 | 4.query with highlighting 139 | 140 | ```bash 141 | curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d' 142 | { 143 | "query" : { "match" : { "content" : "中国" }}, 144 | "highlight" : { 145 | "pre_tags" : ["", ""], 146 | "post_tags" : ["", ""], 147 | "fields" : { 148 | "content" : {} 149 | } 150 | } 151 | } 152 | ' 153 | ``` 154 | 155 | Result 156 | 157 | ```json 158 | { 159 | "took": 14, 160 | "timed_out": false, 161 | "_shards": { 162 | "total": 5, 163 | "successful": 5, 164 | "failed": 0 165 | }, 166 | "hits": { 167 | "total": 2, 168 | "max_score": 2, 169 | "hits": [ 170 | { 171 | "_index": "index", 172 | "_type": "fulltext", 173 | "_id": "4", 174 | "_score": 2, 175 | "_source": { 176 | "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首" 177 | }, 178 | "highlight": { 179 | "content": [ 180 | "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首 " 181 | ] 182 | } 183 | }, 184 | { 185 | "_index": "index", 186 | "_type": "fulltext", 187 | "_id": "3", 188 | "_score": 2, 189 | "_source": { 190 | "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船" 191 | }, 192 | "highlight": { 193 | "content": [ 194 | "均每天扣1艘中国渔船 " 195 | ] 196 | } 197 | } 198 | ] 199 | } 200 | } 201 | ``` 202 | 203 | 204 | 205 | ### 热更新 IK 分词使用方法 206 | 207 | 208 | 满足上面两点要求就可以实现热更新分词了,不需要重启 ES 实例。 209 | 210 | 可以将需自动更新的热词放在一个 UTF-8 编码的 .txt 文件里,放在 nginx 或其他简易 http server 下,当 .txt 文件修改时,http server 会在客户端请求该文件时自动返回相应的 Last-Modified 和 ETag。可以另外做一个工具来从业务系统提取相关词汇,并更新这个 .txt 文件。 211 | 212 | have fun. 213 | 214 | 常见问题 215 | ------- 216 | 217 | 1.自定义词典为什么没有生效? 218 | 219 | 请确保你的扩展词典的文本格式为 UTF8 编码 220 | 221 | 2.如何手动安装? 222 | 223 | 224 | ```bash 225 | git clone https://github.com/medcl/elasticsearch-analysis-ik 226 | cd elasticsearch-analysis-ik 227 | git checkout tags/{version} 228 | mvn clean 229 | mvn compile 230 | mvn package 231 | ``` 232 | 233 | 拷贝和解压release下的文件: #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip 到你的 elasticsearch 插件目录, 如: plugins/ik 234 | 重启elasticsearch 235 | 236 | 3.分词测试失败 237 | 请在某个索引下调用analyze接口测试,而不是直接调用analyze接口 238 | 如: 239 | ```bash 240 | curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d' 241 | { 242 | "text":"中华人民共和国MN","tokenizer": "my_ik" 243 | }' 244 | ``` 245 | 246 | 247 | 4. ik_max_word 和 ik_smart 什么区别? 248 | 249 | 250 | ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query; 251 | 252 | ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询。 253 | 254 | Changes 255 | ------ 256 | *自 v5.0.0 起* 257 | 258 | - 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word` 259 | 260 | 261 | Thanks 262 | ------ 263 | YourKit supports IK Analysis for ElasticSearch project with its full-featured Java Profiler. 264 | YourKit, LLC is the creator of innovative and intelligent tools for profiling 265 | Java and .NET applications. Take a look at YourKit's leading software products: 266 | YourKit Java Profiler and 267 | YourKit .NET Profiler. 
268 | -------------------------------------------------------------------------------- /config/extra_single_word_low_freq.dic: -------------------------------------------------------------------------------- 1 | 踧 2 | 覢 3 | 觓 4 | 覛 5 | 覅 6 | 覟 7 | 覗 8 | 覣 9 | 覭 10 | 覂 11 | 觡 12 | 覝 13 | 觟 14 | 褱 15 | 褰 16 | 襒 17 | 覞 18 | 袨 19 | 觏 20 | 赒 21 | 觇 22 | 謍 23 | 讙 24 | 襦 25 | 袤 26 | 誸 27 | 诮 28 | 衩 29 | 茷 30 | 趒 31 | 襌 32 | 诰 33 | 譠 34 | 袄 35 | 聱 36 | 豸 37 | 蠓 38 | 讵 39 | 袅 40 | 诂 41 | 裞 42 | 訄 43 | 荺 44 | 褂 45 | 蠡 46 | 裐 47 | 諴 48 | 芫 49 | 赧 50 | 触 51 | 跫 52 | 褫 53 | 赝 54 | 褡 55 | 衪 56 | 裎 57 | 豜 58 | 褶 59 | 裟 60 | 跏 61 | 袪 62 | 袈 63 | 觐 64 | 跄 65 | 坏 66 | 肱 67 | 裾 68 | 考 69 | 豝 70 | 踰 71 | 覃 72 | 蹓 73 | 黾 74 | 褴 75 | 轲 76 | 裨 77 | 蜇 78 | 鮆 79 | 褥 80 | 誊 81 | 貉 82 | 褊 83 | 蜉 84 | 衔 85 | 詄 86 | 豋 87 | 胼 88 | 荞 89 | 踫 90 | 谗 91 | 耦 92 | 誏 93 | 衮 94 | 胝 95 | 幔 96 | 轭 97 | 赈 98 | 贲 99 | 蓼 100 | 褛 101 | 迵 102 | 觊 103 | 蚜 104 | 讫 105 | 颢 106 | 葄 107 | 觎 108 | 诎 109 | 謢 110 | 蹧 111 | 邬 112 | 芊 113 | 赣 114 | 囱 115 | 蝎 116 | 夆 117 | 蠋 118 | 蠕 119 | 蹼 120 | 臊 121 | 蛭 122 | 颚 123 | 讴 124 | 踽 125 | 菫 126 | 臾 127 | 薮 128 | 蹒 129 | 谀 130 | 菀 131 | 佶 132 | 摀 133 | 佚 134 | 邸 135 | 跺 136 | 豊 137 | 荔 138 | 锌 139 | 诿 140 | 蕤 141 | 诳 142 | 芩 143 | 蹴 144 | 褉 145 | 觔 146 | 舴 147 | 腋 148 | 颍 149 | 膊 150 | 脯 151 | 荪 152 | 郢 153 | 坛 154 | 轫 155 | 醺 156 | 捺 157 | 姝 158 | 胭 159 | 饷 160 | 谪 161 | 驮 162 | 僮 163 | 踯 164 | 忪 165 | 驷 166 | 躅 167 | 忑 168 | 彧 169 | 衲 170 | 唠 171 | 跚 172 | 吃 173 | 诩 174 | 褓 175 | 诤 176 | 豨 177 | 诋 178 | 菈 179 | 逖 180 | 荟 181 | 裆 182 | 喋 183 | 忖 184 | 闾 185 | 诌 186 | 啻 187 | 铀 188 | 菡 189 | 胱 190 | 蹬 191 | 隹 192 | 鹬 193 | 诒 194 | 轧 195 | 萏 196 | 舶 197 | 鳅 198 | 药 199 | 酯 200 | 夯 201 | 偬 202 | 酝 203 | 跻 204 | 咤 205 | 掬 206 | 呆 207 | 蹶 208 | 踞 209 | 蝌 210 | 咋 211 | 谧 212 | 舫 213 | 啐 214 | 茸 215 | 谟 216 | 嵌 217 | 蜿 218 | 魇 219 | 帷 220 | 觑 221 | 鳍 222 | 谏 223 | 哽 224 | 乓 225 | 蚌 226 | 嗙 227 | 巿 228 | 刽 229 | 踱 230 | 腆 231 | 薏 232 | 蜃 233 | 谑 234 | 躄 235 | 鸾 236 | 齁 237 | 腼 238 | 呷 239 | 吆 240 | 荀 241 | 裱 242 | 辇 243 | 睫 244 | 伎 245 | 妲 246 | 菠 247 | 鼐 248 | 麾 249 | 芮 250 | 鲑 251 | 辉 252 | 啜 253 | 苞 254 | 踼 255 | 荃 256 | 杞 257 | 浣 258 | 沬 259 | 胤 260 | 恿 261 | 驭 262 | 逵 263 | 钛 264 | 徕 265 | 贮 266 | 蔫 267 | 锚 268 | 衙 269 | 肄 270 | 豺 271 | 闸 272 | 隋 273 | 腑 274 | 脐 275 | 脓 276 | 叱 277 | 迥 278 | 踝 279 | 馥 280 | 佣 281 | 喳 282 | 迩 283 | 贻 284 | 诙 285 | 椭 286 | 琬 287 | 赂 288 | 诧 289 | 苯 290 | 怂 291 | 蟆 292 | 龊 293 | 漳 294 | 迭 295 | 垛 296 | 铲 297 | 馊 298 | 娓 299 | 葆 300 | 赑 301 | 卍 302 | 遽 303 | 谯 304 | 賏 305 | 蛹 306 | 锤 307 | 粟 308 | 衿 309 | 渥 310 | 铳 311 | 刍 312 | 镳 313 | 匮 314 | 万 315 | 骁 316 | 酣 317 | 酉 318 | 骥 319 | 寨 320 | 蓁 321 | 诽 322 | 钡 323 | 浙 324 | 酗 325 | 跩 326 | 拗 327 | 坷 328 | 雱 329 | 闺 330 | 喈 331 | 晔 332 | 螳 333 | 谙 334 | 蹂 335 | 鞑 336 | 蔗 337 | 账 338 | 垚 339 | 瞩 340 | 谩 341 | 掳 342 | 媲 343 | 葾 344 | 鳗 345 | 钣 346 | 檀 347 | 阕 348 | 聿 349 | 蜍 350 | 仆 351 | 嗅 352 | 峥 353 | 蜈 354 | 垠 355 | 蚓 356 | 麓 357 | 殉 358 | 弩 359 | 朴 360 | 胥 361 | 瘴 362 | 篑 363 | 镍 364 | 鹂 365 | 暐 366 | 榷 367 | 咀 368 | 佯 369 | 蚣 370 | 荻 371 | 鬓 372 | 仝 373 | 裴 374 | 讷 375 | 孺 376 | 咨 377 | 俑 378 | 遴 379 | 吽 380 | 笋 381 | 耀 382 | 霾 383 | 绎 384 | 咿 385 | 骸 386 | 霭 387 | 昕 388 | 漩 389 | 浒 390 | 轼 391 | 婿 392 | 嗳 393 | 钙 394 | 谲 395 | 蛾 396 | 跛 397 | 惺 398 | 翎 399 | 炽 400 | 晒 401 | 钳 402 | 鞘 403 | 谚 404 | 钊 405 | 背 406 | 瀛 407 | 槌 408 | 臀 409 | 跋 410 | 窒 411 | 藤 412 | 噬 413 | 蓊 414 | 褐 415 | 蔺 416 | 鲍 417 | 鲨 418 | 舔 419 | 箔 420 | 萦 421 | 诏 422 | 褔 423 | 咄 424 | 俘 425 | 彪 426 | 饪 427 | 嘱 428 | 诬 429 | 踮 430 | 囝 431 | 佢 432 | 
汶 433 | 讹 434 | 踅 435 | 咐 436 | 讼 437 | 玟 438 | 迂 439 | 亵 440 | 婵 441 | 馁 442 | 崭 443 | 惦 444 | 蠹 445 | 濒 446 | 匈 447 | 蟋 448 | 谕 449 | 酪 450 | 眛 451 | 煦 452 | 甭 453 | 谄 454 | 妾 455 | 梧 456 | 芜 457 | 蛎 458 | 颐 459 | 雌 460 | 褒 461 | 臼 462 | 圳 463 | 剔 464 | 噶 465 | 耨 466 | 嗈 467 | 勋 468 | 冶 469 | 扑 470 | 膺 471 | 腺 472 | 荤 473 | 坞 474 | 羲 475 | 栾 476 | 傌 477 | 幌 478 | 噗 479 | 蛀 480 | 觞 481 | 塾 482 | 耙 483 | 枭 484 | 擞 485 | 缅 486 | 踌 487 | 蟀 488 | 侥 489 | 诣 490 | 姜 491 | 甸 492 | 俭 493 | 泠 494 | 躇 495 | 萌 496 | 虏 497 | 匕 498 | 藩 499 | 嗽 500 | 蜻 501 | 咛 502 | 艹 503 | 跎 504 | 蔬 505 | 鸠 506 | 跆 507 | 肋 508 | 巅 509 | 芯 510 | 荐 511 | 荼 512 | 慵 513 | 咸 514 | 杭 515 | 樟 516 | 夸 517 | 戮 518 | 吱 519 | 模 520 | 葔 521 | 迢 522 | 砰 523 | 须 524 | 蒜 525 | 骐 526 | 茱 527 | 痊 528 | 蛤 529 | 蜴 530 | 诟 531 | 俾 532 | 疮 533 | 悴 534 | 袒 535 | 蒹 536 | 镖 537 | 娥 538 | 鹉 539 | 婊 540 | 噫 541 | 矜 542 | 岳 543 | 鹦 544 | 葭 545 | 褚 546 | 嵩 547 | 丫 548 | 凛 549 | 峦 550 | 惚 551 | 懊 552 | 韶 553 | 憋 554 | 聋 555 | 讪 556 | 瘫 557 | 霓 558 | 哺 559 | 蝙 560 | 靥 561 | 堇 562 | 铺 563 | 趾 564 | 褪 565 | 缆 566 | 媛 567 | 胧 568 | 肛 569 | 珈 570 | 畴 571 | 驹 572 | 熔 573 | 臆 574 | 肘 575 | 豁 576 | 冕 577 | 吊 578 | 韧 579 | 炜 580 | 舱 581 | 恁 582 | 巳 583 | 舵 584 | 臻 585 | 戊 586 | 稽 587 | 诲 588 | 隽 589 | 铐 590 | 鲫 591 | 畸 592 | 饥 593 | 茉 594 | 蒲 595 | 矶 596 | 峨 597 | 蚵 598 | 蔼 599 | 诛 600 | 焰 601 | 偈 602 | 蚱 603 | 骯 604 | 盔 605 | 巩 606 | 折 607 | 偕 608 | 嗓 609 | 辙 610 | 鸶 611 | 酵 612 | 莘 613 | 耘 614 | 汹 615 | 楞 616 | 陡 617 | 裳 618 | 憎 619 | 讳 620 | 荆 621 | 笃 622 | 屉 623 | 霈 624 | 恬 625 | 蹦 626 | 扬 627 | 侃 628 | 艳 629 | 璇 630 | 韬 631 | 烬 632 | 傀 633 | 铮 634 | 曦 635 | 搂 636 | 蝠 637 | 霄 638 | 胺 639 | 遐 640 | 飨 641 | 郡 642 | 困 643 | 呎 644 | 墅 645 | 鞠 646 | 瘤 647 | 藻 648 | 咆 649 | 踹 650 | 狷 651 | 镀 652 | 桐 653 | 赘 654 | 揽 655 | 炬 656 | 氢 657 | 膛 658 | 搪 659 | 湿 660 | 唆 661 | 兑 662 | 暸 663 | 厮 664 | 懈 665 | 媳 666 | 塘 667 | 靡 668 | 鹭 669 | 祟 670 | 冀 671 | 豚 672 | 蹄 673 | 橙 674 | 阎 675 | 硫 676 | 埠 677 | 噱 678 | 妃 679 | 搓 680 | 啃 681 | 俞 682 | 龚 683 | 橄 684 | 嚎 685 | 椎 686 | 蓦 687 | 朔 688 | 痘 689 | 鳞 690 | 铠 691 | 叽 692 | 跤 693 | 裔 694 | 诃 695 | 岫 696 | 怯 697 | 讥 698 | 聂 699 | 垢 700 | 藐 701 | 濑 702 | 莒 703 | 淇 704 | 毯 705 | 礁 706 | 赃 707 | 庐 708 | 辕 709 | 瞌 710 | 锯 711 | 莓 712 | 涡 713 | 昼 714 | 捌 715 | 嗡 716 | 倌 717 | 禹 718 | 蹋 719 | 卯 720 | 粪 721 | 耽 722 | 闰 723 | 曳 724 | 苔 725 | 诵 726 | 菇 727 | 斟 728 | 芥 729 | 莅 730 | 喀 731 | 麒 732 | 颊 733 | 扛 734 | 曜 735 | 咎 736 | 缮 737 | 诫 738 | 躁 739 | 茜 740 | 缤 741 | 暧 742 | 郄 743 | 酥 744 | 僻 745 | 躬 746 | 峙 747 | 驯 748 | 噎 749 | 厦 750 | 澜 751 | 杏 752 | 樽 753 | 勘 754 | 煤 755 | 茎 756 | 嚷 757 | 昆 758 | 铸 759 | 烘 760 | 邹 761 | 廓 762 | 拚 763 | 俐 764 | 裘 765 | 饵 766 | 恃 767 | 蔓 768 | 笙 769 | 茁 770 | 楷 771 | 嚼 772 | 锻 773 | 蕊 774 | 脖 775 | 茍 776 | 壤 777 | 琮 778 | 莽 779 | 塌 780 | 蚤 781 | 膳 782 | 磋 783 | 蓓 784 | 澈 785 | 萎 786 | 擒 787 | 禄 788 | 儡 789 | 懦 790 | 瞻 791 | 虔 792 | 粥 793 | 赦 794 | 畜 795 | 彷 796 | 寥 797 | 揣 798 | 嫖 799 | 朽 800 | 挂 801 | 啄 802 | 浇 803 | 崖 804 | 棠 805 | 禽 806 | 台 807 | 邂 808 | 矫 809 | 茅 810 | 惫 811 | 吠 812 | 苟 813 | 叩 814 | 徊 815 | 巍 816 | 舆 817 | 邵 818 | 彗 819 | 萃 820 | 拱 821 | 嘶 822 | 貂 823 | 趴 824 | 愿 825 | 脊 826 | 冗 827 | 杆 828 | 蕙 829 | 铎 830 | 囚 831 | 啼 832 | 谤 833 | 徘 834 | 芹 835 | 骆 836 | 夭 837 | 饺 838 | 馒 839 | 溺 840 | 咫 841 | 屐 842 | 绅 843 | 诅 844 | 缉 845 | 渣 846 | 敞 847 | 萱 848 | 丰 849 | 俏 850 | 螃 851 | 蜀 852 | 徽 853 | 逞 854 | 跪 855 | 虞 856 | 隙 857 | 匀 858 | 憧 859 | 辄 860 | 鸳 861 | 疵 862 | 跷 863 | 呱 864 | 穆 865 | 阑 866 | 搏 867 | 肾 868 | 靶 869 | 阱 870 | 囡 871 | 寰 872 | 庄 873 | 蟾 874 | 怠 875 | 腕 876 | 烟 
877 | 巾 878 | 奢 879 | 垄 880 | 姨 881 | 躯 882 | 肺 883 | 钰 884 | 佰 885 | 阙 886 | 雏 887 | 溉 888 | 焚 889 | 丑 890 | 锥 891 | 诘 892 | 瞪 893 | 茹 894 | 绊 895 | 蚀 896 | 袱 897 | 煽 898 | 窕 899 | 掷 900 | 沮 901 | 钞 902 | 涕 903 | 浏 904 | 仄 905 | 孰 906 | 峻 907 | 皱 908 | 芦 909 | 膏 910 | 晰 911 | 衬 912 | 谍 913 | 丞 914 | 绽 915 | 蔽 916 | 呕 917 | 轿 918 | 隶 919 | 楠 920 | 匣 921 | 葵 922 | 沫 923 | 刃 924 | 禧 925 | 晦 926 | 哔 927 | 晖 928 | 绣 929 | 仟 930 | 窟 931 | 谛 932 | 瀚 933 | 黛 934 | 忿 935 | 姚 936 | 蜘 937 | 耸 938 | 捍 939 | 斐 940 | 卜 941 | 辗 942 | 刁 943 | 涅 944 | 泓 945 | 梵 946 | 扳 947 | 暇 948 | 袜 949 | 柠 950 | 傍 951 | 逮 952 | 呃 953 | 蜗 954 | 窍 955 | 琉 956 | 喃 957 | 溢 958 | 抉 959 | 旷 960 | 卅 961 | 亟 962 | 膝 963 | 伶 964 | 闇 965 | 莺 966 | 蔚 967 | 醋 968 | 瑛 969 | 拭 970 | 绮 971 | 鑫 972 | 圭 973 | 脂 974 | 酿 975 | 诈 976 | 膨 977 | 隧 978 | 惭 979 | 庚 980 | 衅 981 | 哨 982 | 凋 983 | 里 984 | 祯 985 | 撼 986 | 谭 987 | 稻 988 | 迋 989 | 碌 990 | 罕 991 | 逾 992 | 嗜 993 | 蹲 994 | 檬 995 | 肖 996 | 辖 997 | 襟 998 | 扎 999 | 槟 1000 | 缔 1001 | 袂 1002 | 敷 1003 | 腥 1004 | 喘 1005 | 簿 1006 | 鳖 1007 | 出 1008 | 噢 1009 | 炫 1010 | 佑 1011 | 贷 1012 | 粮 1013 | 荳 1014 | 桦 1015 | 颉 1016 | 哑 1017 | 倪 1018 | 颤 1019 | 御 1020 | 芽 1021 | 朦 1022 | 裹 1023 | 贬 1024 | 蕉 1025 | 蝉 1026 | 赎 1027 | 崔 1028 | 滔 1029 | 茵 1030 | 径 1031 | 克 1032 | 啤 1033 | 拯 1034 | 坟 1035 | 葱 1036 | 芋 1037 | 瞒 1038 | 掠 1039 | 绳 1040 | 蛛 1041 | 匠 1042 | 凸 1043 | 苛 1044 | 押 1045 | 楣 1046 | 芙 1047 | 酌 1048 | 俺 1049 | 掏 1050 | 倡 1051 | 唾 1052 | 瞄 1053 | 磊 1054 | 吼 1055 | 搅 1056 | 溃 1057 | 聆 1058 | 沌 1059 | 蝇 1060 | 鸥 1061 | 妒 1062 | 焕 1063 | 拙 1064 | 夷 1065 | 迄 1066 | 绰 1067 | 锵 1068 | 耿 1069 | 祺 1070 | 吶 1071 | 惶 1072 | 廊 1073 | 兜 1074 | 倩 1075 | 杖 1076 | 窄 1077 | 僚 1078 | 竖 1079 | 芷 1080 | 咚 1081 | 鲢 1082 | 沛 1083 | 挪 1084 | 柄 1085 | 顷 1086 | 璞 1087 | 裸 1088 | 鵰 1089 | 郊 1090 | 屿 1091 | 仕 1092 | 艘 1093 | 铅 1094 | 铝 1095 | 饲 1096 | 黯 1097 | 疫 1098 | 栽 1099 | 喉 1100 | 逗 1101 | 祇 1102 | 阪 1103 | 侍 1104 | 抒 1105 | 弗 1106 | 尬 1107 | 浦 1108 | 鄙 1109 | 盏 1110 | 喽 1111 | 炳 1112 | 卵 1113 | 肌 1114 | 迦 1115 | 擅 1116 | 豹 1117 | 胏 1118 | 炼 1119 | 悸 1120 | 谴 1121 | 贾 1122 | 胀 1123 | 疋 1124 | 矿 1125 | 梨 1126 | 碑 1127 | 髓 1128 | 巢 1129 | 叹 1130 | 屡 1131 | 滩 1132 | 侮 1133 | 橘 1134 | 嘲 1135 | 酬 1136 | 枚 1137 | 氓 1138 | 菌 1139 | 颁 1140 | 萝 1141 | 谘 1142 | 曝 1143 | 薯 1144 | 襄 1145 | 辽 1146 | 萄 1147 | 寇 1148 | 舜 1149 | 颂 1150 | 撰 1151 | 腻 1152 | 崩 1153 | 咕 1154 | 癌 1155 | 歇 1156 | 汰 1157 | 烁 1158 | 撇 1159 | 宴 1160 | 惩 1161 | 烛 1162 | 贰 1163 | 呻 1164 | 呒 1165 | 翩 1166 | 绑 1167 | 捞 1168 | 爹 1169 | 秉 1170 | 棉 1171 | 妓 1172 | 尉 1173 | 霍 1174 | 甫 1175 | 尝 1176 | 葡 1177 | 蒸 1178 | 鸦 1179 | 挚 1180 | 奸 1181 | 纬 1182 | 艰 1183 | 履 1184 | 葬 1185 | 滨 1186 | 耕 1187 | 婴 1188 | 醇 1189 | 堵 1190 | 钉 1191 | 喧 1192 | 遂 1193 | 锣 1194 | 垮 1195 | 蓬 1196 | 薛 1197 | 虐 1198 | 睁 1199 | 厨 1200 | 娶 1201 | 浆 1202 | 挨 1203 | 矢 1204 | 蕾 1205 | 伺 1206 | 券 1207 | 鹏 1208 | 削 1209 | 蓄 1210 | 琦 1211 | 熄 1212 | 湘 1213 | 慌 1214 | 枕 1215 | 衍 1216 | 薇 1217 | 囊 1218 | 喂 1219 | 蕴 1220 | 倘 1221 | 峡 1222 | 浊 1223 | 窃 1224 | 颈 1225 | 裙 1226 | 晕 1227 | 缚 1228 | 获 1229 | 帕 1230 | 脾 1231 | 莹 1232 | 逍 1233 | 姬 1234 | 韦 1235 | 畔 1236 | 伐 1237 | 霞 1238 | 嘘 1239 | 盐 1240 | 摧 1241 | 债 1242 | 佩 1243 | 畏 1244 | 驴 1245 | 氧 1246 | 奴 1247 | 瘦 1248 | 菊 1249 | 廿 1250 | 狭 1251 | 赴 1252 | 碳 1253 | 坊 1254 | 盆 1255 | 趟 1256 | 匿 1257 | 肇 1258 | 溶 1259 | 揭 1260 | 剥 1261 | 沦 1262 | 秃 1263 | 郝 1264 | 唔 1265 | 锡 1266 | 娇 1267 | 抚 1268 | 屎 1269 | 甩 1270 | 娱 1271 | 表 1272 | 犬 1273 | 魁 1274 | 蒂 1275 | 皓 1276 | 祷 1277 | 瞎 1278 | 瘾 1279 | 煎 1280 | 螺 1281 | 遮 1282 | 坠 1283 | 剎 1284 | 筝 1285 
| 棵 1286 | 冤 1287 | 崎 1288 | 昔 1289 | 驼 1290 | 竿 1291 | 甄 1292 | 斑 1293 | 歹 1294 | 骏 1295 | 缝 1296 | 鞭 1297 | 垫 1298 | 淹 1299 | 并 1300 | 遨 1301 | 宠 1302 | 掰 1303 | 枯 1304 | 艇 1305 | 豫 1306 | 募 1307 | 郁 1308 | 稚 1309 | 懿 1310 | 辐 1311 | 酱 1312 | 恕 1313 | 范 1314 | 涂 1315 | 滤 1316 | 肃 1317 | 膜 1318 | 佬 1319 | 哼 1320 | 慨 1321 | 穗 1322 | 辰 1323 | 雁 1324 | 瑟 1325 | 帆 1326 | 拢 1327 | 汁 1328 | 蝴 1329 | 冈 1330 | 诠 1331 | 蹈 1332 | 黏 1333 | 痞 1334 | 屑 1335 | 潇 1336 | 觅 1337 | 钧 1338 | 挣 1339 | 谐 1340 | 霜 1341 | 诊 1342 | 熬 1343 | 讽 1344 | 歧 1345 | 戈 1346 | 闯 1347 | 饶 1348 | 斤 1349 | 婉 1350 | 致 1351 | 贿 1352 | 苑 1353 | 矮 1354 | 毋 1355 | 詹 1356 | 祈 1357 | 咳 1358 | 昱 1359 | 佐 1360 | 帖 1361 | 猩 1362 | 尹 1363 | 诇 1364 | 肆 1365 | 亭 1366 | 丘 1367 | 淘 1368 | 颠 1369 | 勃 1370 | 讶 1371 | 抖 1372 | 袁 1373 | 柱 1374 | 僧 1375 | 蚊 1376 | 匹 1377 | 辣 1378 | 螂 1379 | 澡 1380 | 昧 1381 | 诡 1382 | 槽 1383 | 穴 1384 | 斩 1385 | 聘 1386 | 扶 1387 | 熙 1388 | 驰 1389 | 棍 1390 | 兆 1391 | 蟑 1392 | 矩 1393 | 谬 1394 | 贫 1395 | 鼎 1396 | 践 1397 | 盲 1398 | 眷 1399 | 尿 1400 | 伫 1401 | 饿 1402 | 砸 1403 | 妄 1404 | 荡 1405 | 炒 1406 | 冥 1407 | 偿 1408 | 墓 1409 | 骄 1410 | 毙 1411 | 淋 1412 | 芝 1413 | 胃 1414 | 宅 1415 | 董 1416 | 梭 1417 | 凑 1418 | 宰 1419 | 卑 1420 | 丛 1421 | 纠 1422 | 肢 1423 | 闽 1424 | 铜 1425 | 寺 1426 | 瞬 1427 | 澳 1428 | 庞 1429 | 腔 1430 | 泼 1431 | 昂 1432 | 梁 1433 | 躺 1434 | 姻 1435 | 潭 1436 | 吋 1437 | 撤 1438 | 殖 1439 | 轴 1440 | 颖 1441 | 冻 1442 | 琼 1443 | 恳 1444 | 衫 1445 | 譬 1446 | 猎 1447 | 衰 1448 | 桶 1449 | 辜 1450 | 筒 1451 | 赫 1452 | 仗 1453 | 膀 1454 | 乳 1455 | 嚣 1456 | 划 1457 | 玮 1458 | 卿 1459 | 枉 1460 | 埃 1461 | 跨 1462 | 粹 1463 | 猴 1464 | 愤 1465 | 壹 1466 | 卢 1467 | 尧 1468 | 翰 1469 | 叮 1470 | 媚 1471 | 钮 1472 | 袖 1473 | 斌 1474 | 卓 1475 | 粽 1476 | 雀 1477 | 谦 1478 | 傅 1479 | 殿 1480 | 睹 1481 | 菁 1482 | 桂 1483 | 诱 1484 | 舌 1485 | 惟 1486 | 岗 1487 | 衷 1488 | 屈 1489 | 陋 1490 | 陌 1491 | 宵 1492 | 麟 1493 | 魏 1494 | 贸 1495 | 几 1496 | 埔 1497 | 谎 1498 | 袍 1499 | 卸 1500 | 仓 1501 | 匪 1502 | 叛 1503 | 肠 1504 | 肝 1505 | 俄 1506 | 孕 1507 | 庙 1508 | 嫁 1509 | 肤 1510 | 拦 1511 | 羯 1512 | 匙 1513 | 咏 1514 | 蠢 1515 | 纽 1516 | 拘 1517 | 旨 1518 | 胁 1519 | 馨 1520 | 珊 1521 | 签 1522 | 赔 1523 | 秩 1524 | 喻 1525 | 谜 1526 | 翠 1527 | 芭 1528 | 摊 1529 | 侣 1530 | 灿 1531 | 寡 1532 | 罐 1533 | 贼 1534 | 叙 1535 | 谨 1536 | 体 1537 | 敲 1538 | 浴 1539 | 吻 1540 | 臂 1541 | 袭 1542 | 煮 1543 | 腹 1544 | 暮 1545 | 曹 1546 | 虹 1547 | 抑 1548 | 贩 1549 | 踩 1550 | 澎 1551 | 糖 1552 | 催 1553 | 萍 1554 | 垂 1555 | 斥 1556 | 侬 1557 | 拷 1558 | 唤 1559 | 匆 1560 | 阮 1561 | 飙 1562 | 柴 1563 | 剂 1564 | 妖 1565 | 添 1566 | 畅 1567 | 汗 1568 | 鸭 1569 | 稀 1570 | 晋 1571 | 埋 1572 | 弊 1573 | 返 1574 | 叡 1575 | 娟 1576 | 玻 1577 | 腾 1578 | 栋 1579 | 歪 1580 | 邓 1581 | 渴 1582 | 粒 1583 | 泣 1584 | 疾 1585 | 蓉 1586 | 塑 1587 | 祂 1588 | 储 1589 | 劣 1590 | 柯 1591 | 陶 1592 | 患 1593 | 蛇 1594 | 腐 1595 | 琳 1596 | 慎 1597 | 泊 1598 | 牢 1599 | 呈 1600 | 趁 1601 | 恶 1602 | 浑 1603 | 扮 1604 | 樱 1605 | 臣 1606 | 遵 1607 | 缠 1608 | 虫 1609 | 撒 1610 | 叉 1611 | 刑 1612 | 苗 1613 | 脉 1614 | 盈 1615 | 津 1616 | 愧 1617 | 摔 1618 | 盒 1619 | 丧 1620 | 鹤 1621 | 呦 1622 | 厕 1623 | 斜 1624 | 芒 1625 | 翅 1626 | 悄 1627 | 晃 1628 | 茂 1629 | 寸 1630 | 杉 1631 | 旺 1632 | 俩 1633 | 雯 1634 | 霖 1635 | 递 1636 | 胶 1637 | 氛 1638 | 谣 1639 | 捉 1640 | 虾 1641 | 秘 1642 | 漠 1643 | 扭 1644 | 贞 1645 | 陵 1646 | 叔 1647 | 轨 1648 | 鹅 1649 | 液 1650 | 妥 1651 | 贱 1652 | 涨 1653 | 滥 1654 | 痕 1655 | 沿 1656 | 秤 1657 | 措 1658 | 巡 1659 | 丈 1660 | 魅 1661 | 欲 1662 | 缸 1663 | 鹿 1664 | 汝 1665 | 迁 1666 | 矣 1667 | 肩 1668 | 烤 1669 | 笛 1670 | 迅 1671 | 劫 1672 | 趋 1673 | 披 1674 | 荷 1675 | 卒 1676 | 丙 1677 | 碗 1678 | 伙 1679 | 椅 1680 
| 赞 1681 | 侦 1682 | 灾 1683 | 秦 1684 | 蛙 1685 | 禅 1686 | 慰 1687 | 余 1688 | 朗 1689 | 辱 1690 | 征 1691 | 愚 1692 | 抛 1693 | 挺 1694 | 彭 1695 | 允 1696 | 靖 1697 | 滋 1698 | 凝 1699 | 赠 1700 | 莎 1701 | 顽 1702 | 狠 1703 | 堕 1704 | 翘 1705 | 惹 1706 | 纲 1707 | 贯 1708 | 饼 1709 | 抬 1710 | 逆 1711 | 堪 1712 | 坤 1713 | 斗 1714 | 钦 1715 | 疏 1716 | 羞 1717 | 扇 1718 | 蜂 1719 | 赌 1720 | 驻 1721 | 屏 1722 | 爵 1723 | 轰 1724 | 契 1725 | 悦 1726 | 邻 1727 | 哉 1728 | 陀 1729 | 裂 1730 | 刷 1731 | 毅 1732 | 拾 1733 | 疼 1734 | 阔 1735 | 耍 1736 | 亏 1737 | 吟 1738 | 锐 1739 | 惧 1740 | 锅 1741 | 蝶 1742 | 壳 1743 | 糕 1744 | 舟 1745 | 牧 1746 | 妮 1747 | 粗 1748 | 仇 1749 | 驶 1750 | 促 1751 | 孝 1752 | 裤 1753 | 誉 1754 | 家 1755 | 迈 1756 | 姿 1757 | 踪 1758 | 兔 1759 | 综 1760 | 旭 1761 | 韵 1762 | 齿 1763 | 乔 1764 | 怖 1765 | 晴 1766 | 闷 1767 | 墨 1768 | 咬 1769 | 侧 1770 | 狱 1771 | 琪 1772 | 梯 1773 | 宾 1774 | 枫 1775 | 锦 1776 | 瑜 1777 | 敦 1778 | 矛 1779 | 弘 1780 | 玛 1781 | 茫 1782 | 迪 1783 | 览 1784 | 挤 1785 | 雳 1786 | 岚 1787 | 卷 1788 | 黎 1789 | 薄 1790 | 柳 1791 | 咦 1792 | 廷 1793 | 瞧 1794 | 幅 1795 | 挖 1796 | 唬 1797 | 侯 1798 | 祸 1799 | 饰 1800 | 儒 1801 | 捡 1802 | 筋 1803 | 融 1804 | 耗 1805 | 铃 1806 | 奉 1807 | 鼻 1808 | 坜 1809 | 曼 1810 | 贡 1811 | 嗨 1812 | 炎 1813 | 啡 1814 | 捐 1815 | 炮 1816 | 霹 1817 | 貌 1818 | 鸣 1819 | 饱 1820 | 廉 1821 | 绘 1822 | 咪 1823 | 吝 1824 | 肚 1825 | 云 1826 | 翼 1827 | 氏 1828 | 骚 1829 | 爷 1830 | 寿 1831 | 绕 1832 | 唷 1833 | 牺 1834 | 屠 1835 | 谋 1836 | 彻 1837 | 俱 1838 | 粉 1839 | 雾 1840 | 涵 1841 | 侨 1842 | 础 1843 | 疗 1844 | 署 1845 | 稿 1846 | 涉 1847 | 稣 1848 | 誓 1849 | 箭 1850 | 涯 1851 | 锺 1852 | 迹 1853 | 抄 1854 | 踢 1855 | 贪 1856 | 咖 1857 | 莱 1858 | 夺 1859 | 勉 1860 | 焦 1861 | 蒋 1862 | 桑 1863 | 沧 1864 | 恰 1865 | 泳 1866 | 牲 1867 | 戒 1868 | 恼 1869 | 夕 1870 | 棚 1871 | 爬 1872 | 菲 1873 | 翁 1874 | 奔 1875 | 滴 1876 | 玄 1877 | 捷 1878 | 曰 1879 | 愉 1880 | 逊 1881 | 憾 1882 | 钓 1883 | 壁 1884 | 躲 1885 | 嫌 1886 | 姆 1887 | 乏 1888 | 洛 1889 | 逼 1890 | 磨 1891 | 剪 1892 | 逝 1893 | 亨 1894 | 盼 1895 | 杯 1896 | 敝 1897 | 碍 1898 | 痴 1899 | 植 1900 | 瑰 1901 | 勤 1902 | 悟 1903 | 彬 1904 | 删 1905 | 薪 1906 | 悠 1907 | 胎 1908 | 侵 1909 | 坪 1910 | 赋 1911 | 弯 1912 | 丹 1913 | 巫 1914 | 轩 1915 | 辨 1916 | 吐 1917 | 么 1918 | 盾 1919 | 扯 1920 | 割 1921 | 艾 1922 | 幼 1923 | 捕 1924 | 召 1925 | 怒 1926 | 坡 1927 | 缓 1928 | 猛 1929 | 驾 1930 | 莉 1931 | 彦 1932 | 韩 1933 | 鞋 1934 | 碧 1935 | 泽 1936 | 泉 1937 | 缴 1938 | 跃 1939 | 喇 1940 | 腿 1941 | 糟 1942 | 胆 1943 | 摘 1944 | 朵 1945 | 逛 1946 | 甜 1947 | 拔 1948 | 劲 1949 | 悉 1950 | 穷 1951 | 汤 1952 | 唐 1953 | 臭 1954 | 玲 1955 | 怡 1956 | 舍 1957 | 欺 1958 | 蜜 1959 | 耻 1960 | 坦 1961 | 叭 1962 | 亿 1963 | 忌 1964 | 鲁 1965 | 繁 1966 | 泥 1967 | 伸 1968 | 壮 1969 | 串 1970 | 圾 1971 | 币 1972 | 荒 1973 | 垃 1974 | 妇 1975 | 旦 1976 | 截 1977 | 喷 1978 | 碎 1979 | 吕 1980 | 犹 1981 | 抹 1982 | 脆 1983 | 煞 1984 | 胞 1985 | 晶 1986 | 潜 1987 | 玫 1988 | 妻 1989 | 估 1990 | 陷 1991 | 孔 1992 | 娃 1993 | 兽 1994 | 肥 1995 | 凉 1996 | 岂 1997 | 逻 1998 | 胸 1999 | 杜 2000 | 袋 2001 | 甘 2002 | 邀 2003 | 培 2004 | 龄 2005 | 辆 2006 | 廖 2007 | 冲 2008 | 渡 2009 | 羽 2010 | 秒 2011 | 辞 2012 | 倾 2013 | 窝 2014 | 柏 2015 | 淑 2016 | 诞 2017 | 漏 2018 | 姑 2019 | 托 2020 | 吾 2021 | 纷 2022 | 拆 2023 | 浩 2024 | 税 2025 | 邱 2026 | 迟 2027 | 筹 2028 | 监 2029 | 汪 2030 | 擎 2031 | 衡 2032 | 狐 2033 | 灰 2034 | 尖 2035 | 番 2036 | 罚 2037 | 证 2038 | 盗 2039 | 祥 2040 | 毫 2041 | 彰 2042 | 扩 2043 | 幽 2044 | 阐 2045 | 喊 2046 | 菩 2047 | 赐 2048 | 奋 2049 | 鲜 2050 | 劝 2051 | 栏 2052 | 慈 2053 | 扫 2054 | 尽 2055 | 穹 2056 | 丌 2057 | 绪 2058 | 砂 2059 | 勿 2060 | 抢 2061 | 啪 2062 | 庸 2063 | 赤 2064 | 饮 2065 | 萨 2066 | 兼 2067 | 访 2068 | 舒 2069 | 裕 2070 | 逸 2071 | 宙 2072 | 丸 2073 | 准 2074 | 魂 2075 
| 厚 2076 | 励 2077 | 仰 2078 | 糊 2079 | 顿 2080 | 闭 2081 | 塔 2082 | 枪 2083 | 睛 2084 | 斋 2085 | 奥 2086 | 恭 2087 | 翔 2088 | 遥 2089 | 航 2090 | 孟 2091 | 昌 2092 | 卧 2093 | 颇 2094 | 革 2095 | 邪 2096 | 阻 2097 | 蟹 2098 | 裁 2099 | 后 2100 | 函 2101 | 于 2102 | 拳 2103 | 宽 2104 | 锋 2105 | 州 2106 | 葛 2107 | 拒 2108 | 池 2109 | 镇 2110 | 芬 2111 | 岸 2112 | 寞 2113 | 凭 2114 | 姊 2115 | 殊 2116 | 板 2117 | 勒 2118 | 慕 2119 | 跌 2120 | 踏 2121 | 填 2122 | 陪 2123 | 逐 2124 | 洽 2125 | 描 2126 | 妨 2127 | 仪 2128 | 摄 2129 | 紫 2130 | 谅 2131 | 阅 2132 | 邦 2133 | 麦 2134 | 莲 2135 | 闪 2136 | 纵 2137 | 庭 2138 | 圈 2139 | 榜 2140 | 滑 2141 | 舰 2142 | 面 2143 | 献 2144 | 浅 2145 | 飘 2146 | 宋 2147 | 俗 2148 | 沟 2149 | 巷 2150 | 眠 2151 | 帽 2152 | 惑 2153 | 羊 2154 | 牵 2155 | 净 2156 | 厉 2157 | 撞 2158 | 崇 2159 | 竞 2160 | 回 2161 | 乙 2162 | 聪 2163 | 桃 2164 | 伍 2165 | 役 2166 | 潮 2167 | 损 2168 | 凯 2169 | 锁 2170 | 震 2171 | 醉 2172 | 屁 2173 | 牠 2174 | 孙 2175 | 酷 2176 | 染 2177 | 尺 2178 | 摸 2179 | 盛 2180 | 闹 2181 | 棋 2182 | 吓 2183 | 迫 2184 | 瓜 2185 | 松 2186 | 搬 2187 | 戴 2188 | 瞭 2189 | 乌 2190 | 谱 2191 | 滚 2192 | 赚 2193 | 障 2194 | 逃 2195 | 齐 2196 | 牙 2197 | 怨 2198 | 拖 2199 | 皇 2200 | 贺 2201 | 横 2202 | 塞 2203 | 摆 2204 | 农 2205 | 倍 2206 | 额 2207 | 乘 2208 | 户 2209 | 奈 2210 | 川 2211 | 徐 2212 | 井 2213 | 寝 2214 | 洞 2215 | 劳 2216 | 船 2217 | 域 2218 | 屋 2219 | 胖 2220 | 藉 2221 | 销 2222 | 拼 2223 | 桌 2224 | 忧 2225 | 违 2226 | 拟 2227 | 吵 2228 | 媒 2229 | 辩 2230 | 妙 2231 | 鸿 2232 | 恩 2233 | 映 2234 | 耳 2235 | 傻 2236 | 京 2237 | 搭 2238 | 残 2239 | 稍 2240 | 颜 2241 | 固 2242 | 眉 2243 | 龟 2244 | 哀 2245 | 发 2246 | 沈 2247 | 拨 2248 | 丁 2249 | 愁 2250 | 耐 2251 | 宪 2252 | 覆 2253 | 盟 2254 | 昭 2255 | 握 2256 | 萧 2257 | 延 2258 | 豆 2259 | 弱 2260 | 隆 2261 | 页 2262 | 烧 2263 | 遍 2264 | 距 2265 | 摩 2266 | 祖 2267 | 探 2268 | 倚 2269 | 寂 2270 | 阴 2271 | 悔 2272 | 库 2273 | 嘴 2274 | 沉 2275 | 伊 2276 | 暂 2277 | 霸 2278 | 喵 2279 | 频 2280 | 鼓 2281 | 冒 2282 | 鼠 2283 | 企 2284 | 副 2285 | 菜 2286 | 款 2287 | 忽 2288 | 尾 2289 | 租 2290 | 椰 2291 | 隔 2292 | 狼 2293 | 浮 2294 | 惠 2295 | 峰 2296 | 索 2297 | 芳 2298 | 摇 2299 | 洪 2300 | 伦 2301 | 骨 2302 | 吹 2303 | 郑 2304 | 哩 2305 | 珍 2306 | 纳 2307 | 零 2308 | 哲 2309 | 遭 2310 | 瓶 2311 | 亡 2312 | 振 2313 | 予 2314 | 村 2315 | 旅 2316 | 惨 2317 | 汽 2318 | 爸 2319 | 隐 2320 | 械 2321 | 寒 2322 | 危 2323 | 邮 2324 | 贝 2325 | 阶 2326 | 赖 2327 | 茶 2328 | 谊 2329 | 涛 2330 | 惯 2331 | 尘 2332 | 丝 2333 | 森 2334 | 询 2335 | 露 2336 | 稳 2337 | 桥 2338 | 夏 2339 | 哭 2340 | 坚 2341 | 籍 2342 | 厌 2343 | 苍 2344 | 析 2345 | 冰 2346 | 仙 2347 | 布 2348 | 箱 2349 | 脱 2350 | 贤 2351 | 途 2352 | 订 2353 | 财 2354 | 欧 2355 | 赢 2356 | 枢 2357 | 泪 2358 | 废 2359 | 钢 2360 | 渐 2361 | 泡 2362 | 刊 2363 | 肯 2364 | 恨 2365 | 砍 2366 | 抽 2367 | 股 2368 | 咧 2369 | 婆 2370 | 禁 2371 | 郎 2372 | 默 2373 | 符 2374 | 缩 2375 | 童 2376 | 绿 2377 | 骗 2378 | 辈 2379 | 尼 2380 | 届 2381 | 彼 2382 | 兮 2383 | 聚 2384 | 宇 2385 | 辛 2386 | 疯 2387 | 减 2388 | 米 2389 | 念 2390 | 降 2391 | 街 2392 | 临 2393 | 敏 2394 | 洗 2395 | 玉 2396 | 伴 2397 | 辅 2398 | 诺 2399 | 鸡 2400 | 侠 2401 | 健 2402 | 熊 2403 | 顶 2404 | 挑 2405 | 替 2406 | 豪 2407 | 掌 2408 | 饭 2409 | 银 2410 | 圆 2411 | 志 2412 | 休 2413 | 材 2414 | 灭 2415 | 烈 2416 | 爆 2417 | 透 2418 | 遗 2419 | 虚 2420 | 醒 2421 | 货 2422 | 雅 2423 | 宏 2424 | 帅 2425 | 宫 2426 | 港 2427 | 偶 2428 | 丢 2429 | 篮 2430 | 凡 2431 | 瑞 2432 | 硕 2433 | 雪 2434 | 忠 2435 | 蔡 2436 | 插 2437 | 积 2438 | 乖 2439 | 挥 2440 | 抗 2441 | 察 2442 | 末 2443 | 盖 2444 | 厅 2445 | 移 2446 | 吸 2447 | 括 2448 | 笨 2449 | 孤 2450 | 译 2451 | 避 2452 | 秀 2453 | 富 2454 | 漂 2455 | 柔 2456 | 私 2457 | 围 2458 | 狮 2459 | 祝 2460 | 庆 2461 | 序 2462 | 拥 2463 | 洲 2464 | 徒 2465 | 借 2466 | 晓 2467 | 嘉 2468 | 诗 2469 | 淡 2470 
| 束 2471 | 姓 2472 | 颗 2473 | 勇 2474 | 犯 2475 | 喝 2476 | 食 2477 | 镜 2478 | 偏 2479 | 猜 2480 | 层 2481 | 帐 2482 | 仅 2483 | 购 2484 | 衣 2485 | 申 2486 | 伯 2487 | 紧 2488 | 县 2489 | 婚 2490 | 季 2491 | 敬 2492 | 弃 2493 | 尊 2494 | 蛋 2495 | 鹰 2496 | 熟 2497 | 冠 2498 | 唯 2499 | 混 2500 | 藏 2501 | 河 2502 | 忍 2503 | 窗 2504 | 朝 2505 | 轮 2506 | 册 2507 | 乡 2508 | 敌 2509 | 散 2510 | 沙 2511 | 幻 2512 | 短 2513 | 略 2514 | 批 2515 | 游 2516 | 奖 2517 | 岛 2518 | 逢 2519 | 脸 2520 | 顾 2521 | 督 2522 | 协 2523 | 雷 2524 | 详 2525 | 穿 2526 | 慧 2527 | 巧 2528 | 罢 2529 | 呼 2530 | 暗 2531 | 贴 2532 | 纸 2533 | 歉 2534 | 郭 2535 | 努 2536 | 担 2537 | 蓝 2538 | 训 2539 | 享 2540 | 架 2541 | 济 2542 | 猪 2543 | 派 2544 | 均 2545 | 妈 2546 | 哦 2547 | 宣 2548 | 检 2549 | 鬼 2550 | 灯 2551 | 策 2552 | 梅 2553 | 启 2554 | 嘿 2555 | 洋 2556 | 伟 2557 | 萤 2558 | 磁 2559 | 啰 2560 | 付 2561 | 弄 2562 | 寄 2563 | 钟 2564 | 播 2565 | 险 2566 | 载 2567 | 赏 2568 | 汉 2569 | 块 2570 | 刀 2571 | 铭 2572 | 施 2573 | 卫 2574 | 弹 2575 | 售 2576 | 叶 2577 | 皆 2578 | 罪 2579 | 虎 2580 | 归 2581 | 毛 2582 | 昨 2583 | 荣 2584 | 律 2585 | 树 2586 | 奏 2587 | 注 2588 | 扁 2589 | 笔 2590 | 旁 2591 | 键 2592 | 制 2593 | 莫 2594 | 堆 2595 | 射 2596 | 承 2597 | 波 2598 | 皮 2599 | 释 2600 | 判 2601 | 含 2602 | 既 2603 | 退 2604 | 纪 2605 | 刻 2606 | 肉 2607 | 靠 2608 | 麻 2609 | 湖 2610 | 继 2611 | 诚 2612 | 姐 2613 | 益 2614 | 置 2615 | 惜 2616 | 艺 2617 | 尚 2618 | 纯 2619 | 骂 2620 | 琴 2621 | 漫 2622 | 援 2623 | 缺 2624 | 诸 2625 | 尤 2626 | 忆 2627 | 景 2628 | 府 2629 | 委 2630 | 刘 2631 | 绍 2632 | 虑 2633 | 暴 2634 | 草 2635 | 充 2636 | 授 2637 | 防 2638 | 素 2639 | 房 2640 | 搞 2641 | 典 2642 | 仔 2643 | 父 2644 | 吉 2645 | 招 2646 | 剑 2647 | 脚 2648 | 突 2649 | 牌 2650 | 餐 2651 | 仁 2652 | 酒 2653 | 礼 2654 | 巴 2655 | 丽 2656 | 亮 2657 | 恐 2658 | 述 2659 | 周 2660 | 杂 2661 | 旧 2662 | 套 2663 | 赵 2664 | 堂 2665 | 创 2666 | 母 2667 | 辑 2668 | 络 2669 | 俊 2670 | 毒 2671 | 威 2672 | 冷 2673 | 蛮 2674 | 普 2675 | 登 2676 | 微 2677 | 控 2678 | 爽 2679 | 香 2680 | 坐 2681 | 缘 2682 | 幕 2683 | 兰 2684 | 悲 2685 | 势 2686 | 午 2687 | 睡 2688 | 密 2689 | 垒 2690 | 警 2691 | 宗 2692 | 严 2693 | 阵 2694 | 江 2695 | 亚 2696 | 攻 2697 | 静 2698 | 抱 2699 | 啥 2700 | 急 2701 | 宿 2702 | 剧 2703 | 词 2704 | 忙 2705 | 牛 2706 | 吴 2707 | 陆 2708 | 维 2709 | 激 2710 | 增 2711 | 聊 2712 | 浪 2713 | 状 2714 | 良 -------------------------------------------------------------------------------- /config/extra_stopword.dic: -------------------------------------------------------------------------------- 1 | 也 2 | 了 3 | 仍 4 | 从 5 | 以 6 | 使 7 | 则 8 | 却 9 | 又 10 | 及 11 | 对 12 | 就 13 | 并 14 | 很 15 | 或 16 | 把 17 | 是 18 | 的 19 | 着 20 | 给 21 | 而 22 | 被 23 | 让 24 | 在 25 | 还 26 | 比 27 | 等 28 | 当 29 | 与 30 | 于 31 | 但 -------------------------------------------------------------------------------- /config/extra_test.dic: -------------------------------------------------------------------------------- 1 | 我是中国人 -------------------------------------------------------------------------------- /config/preposition.dic: -------------------------------------------------------------------------------- 1 | 不 2 | 也 3 | 了 4 | 仍 5 | 从 6 | 以 7 | 使 8 | 则 9 | 却 10 | 又 11 | 及 12 | 对 13 | 就 14 | 并 15 | 很 16 | 或 17 | 把 18 | 是 19 | 的 20 | 着 21 | 给 22 | 而 23 | 被 24 | 让 25 | 但 -------------------------------------------------------------------------------- /config/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 
公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /config/stopword.dic: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with -------------------------------------------------------------------------------- /config/suffix.dic: -------------------------------------------------------------------------------- 1 | 乡 2 | 井 3 | 亭 4 | 党 5 | 区 6 | 厅 7 | 县 8 | 园 9 | 塔 10 | 家 11 | 寺 12 | 局 13 | 巷 14 | 市 15 | 弄 16 | 所 17 | 斯基 18 | 楼 19 | 江 20 | 河 21 | 海 22 | 湖 23 | 省 24 | 维奇 25 | 署 26 | 苑 27 | 街 28 | 觀 29 | 观 30 | 诺夫 31 | 路 32 | 部 33 | 镇 34 | 阁 35 | 山 36 | 子 37 | 娃 -------------------------------------------------------------------------------- /config/surname.dic: -------------------------------------------------------------------------------- 1 | 丁 2 | 万 3 | 万俟 4 | 上官 5 | 东方 6 | 乔 7 | 于 8 | 令狐 9 | 仲孙 10 | 任 11 | 何 12 | 余 13 | 候 14 | 傅 15 | 公冶 16 | 公孙 17 | 公羊 18 | 冯 19 | 刘 20 | 单 21 | 单于 22 | 卢 23 | 史 24 | 叶 25 | 司徒 26 | 司空 27 
| 司马 28 | 吕 29 | 吴 30 | 周 31 | 唐 32 | 夏 33 | 夏侯 34 | 太叔 35 | 姚 36 | 姜 37 | 孔 38 | 孙 39 | 孟 40 | 宇文 41 | 宋 42 | 宗政 43 | 尉迟 44 | 尹 45 | 崔 46 | 常 47 | 康 48 | 廖 49 | 张 50 | 彭 51 | 徐 52 | 慕容 53 | 戴 54 | 文 55 | 方 56 | 易 57 | 曹 58 | 曾 59 | 朱 60 | 李 61 | 杜 62 | 杨 63 | 林 64 | 梁 65 | 欧阳 66 | 武 67 | 段 68 | 毛 69 | 江 70 | 汤 71 | 沈 72 | 淳于 73 | 潘 74 | 澹台 75 | 濮阳 76 | 熊 77 | 王 78 | 田 79 | 申屠 80 | 白 81 | 皇甫 82 | 石 83 | 秦 84 | 程 85 | 罗 86 | 肖 87 | 胡 88 | 苏 89 | 范 90 | 董 91 | 蒋 92 | 薛 93 | 袁 94 | 许 95 | 诸葛 96 | 谢 97 | 谭 98 | 贺 99 | 贾 100 | 赖 101 | 赫连 102 | 赵 103 | 轩辕 104 | 邓 105 | 邱 106 | 邵 107 | 邹 108 | 郑 109 | 郝 110 | 郭 111 | 金 112 | 钟 113 | 钟离 114 | 钱 115 | 长孙 116 | 闻人 117 | 闾丘 118 | 阎 119 | 陆 120 | 陈 121 | 雷 122 | 韩 123 | 顾 124 | 马 125 | 高 126 | 魏 127 | 鲜于 128 | 黄 129 | 黎 130 | 龙 131 | 龚 -------------------------------------------------------------------------------- /index/_1.cfe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.cfe -------------------------------------------------------------------------------- /index/_1.cfs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.cfs -------------------------------------------------------------------------------- /index/_1.si: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.si -------------------------------------------------------------------------------- /index/_3.cfe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.cfe -------------------------------------------------------------------------------- /index/_3.cfs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.cfs -------------------------------------------------------------------------------- /index/_3.si: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.si -------------------------------------------------------------------------------- /index/segments_2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/segments_2 -------------------------------------------------------------------------------- /index/segments_4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/segments_4 -------------------------------------------------------------------------------- /index/write.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/write.lock 
-------------------------------------------------------------------------------- /licenses/lucene-LICENSE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 
62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 
180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | 204 | 205 | 206 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 207 | derived from unicode conversion examples available at 208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright 209 | from those sources: 210 | 211 | /* 212 | * Copyright 2001-2004 Unicode, Inc. 213 | * 214 | * Disclaimer 215 | * 216 | * This source code is provided as is by Unicode, Inc. No claims are 217 | * made as to fitness for any particular purpose. No warranties of any 218 | * kind are expressed or implied. The recipient agrees to determine 219 | * applicability of information provided. If this file has been 220 | * purchased on magnetic or optical media from Unicode, Inc., the 221 | * sole remedy for any claim will be exchange of defective media 222 | * within 90 days of receipt. 223 | * 224 | * Limitations on Rights to Redistribute This Code 225 | * 226 | * Unicode, Inc. hereby grants the right to freely use the information 227 | * supplied in this file in the creation of products supporting the 228 | * Unicode Standard, and to make copies of this file in any form 229 | * for internal or external distribution as long as this notice 230 | * remains attached. 231 | */ 232 | 233 | 234 | Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was 235 | derived from Python 2.4.2 sources available at 236 | http://www.python.org. Full license is here: 237 | 238 | http://www.python.org/download/releases/2.4.2/license/ 239 | 240 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 241 | derived from Python 3.1.2 sources available at 242 | http://www.python.org. Full license is here: 243 | 244 | http://www.python.org/download/releases/3.1.2/license/ 245 | 246 | Some code in core/src/java/org/apache/lucene/util/automaton was 247 | derived from Brics automaton sources available at 248 | www.brics.dk/automaton/. Here is the copyright from those sources: 249 | 250 | /* 251 | * Copyright (c) 2001-2009 Anders Moeller 252 | * All rights reserved. 253 | * 254 | * Redistribution and use in source and binary forms, with or without 255 | * modification, are permitted provided that the following conditions 256 | * are met: 257 | * 1. Redistributions of source code must retain the above copyright 258 | * notice, this list of conditions and the following disclaimer. 259 | * 2. 
Redistributions in binary form must reproduce the above copyright 260 | * notice, this list of conditions and the following disclaimer in the 261 | * documentation and/or other materials provided with the distribution. 262 | * 3. The name of the author may not be used to endorse or promote products 263 | * derived from this software without specific prior written permission. 264 | * 265 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 266 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 267 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 268 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 269 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 270 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 271 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 272 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 273 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 274 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 275 | */ 276 | 277 | The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton 278 | were automatically generated with the moman/finenight FSA package. 279 | Here is the copyright for those sources: 280 | 281 | # Copyright (c) 2010, Jean-Philippe Barrette-LaPierre, 282 | # 283 | # Permission is hereby granted, free of charge, to any person 284 | # obtaining a copy of this software and associated documentation 285 | # files (the "Software"), to deal in the Software without 286 | # restriction, including without limitation the rights to use, 287 | # copy, modify, merge, publish, distribute, sublicense, and/or sell 288 | # copies of the Software, and to permit persons to whom the 289 | # Software is furnished to do so, subject to the following 290 | # conditions: 291 | # 292 | # The above copyright notice and this permission notice shall be 293 | # included in all copies or substantial portions of the Software. 294 | # 295 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 296 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 297 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 298 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 299 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 300 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 301 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 302 | # OTHER DEALINGS IN THE SOFTWARE. 303 | 304 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was 305 | derived from ICU (http://www.icu-project.org) 306 | The full license is available here: 307 | http://source.icu-project.org/repos/icu/icu/trunk/license.html 308 | 309 | /* 310 | * Copyright (C) 1999-2010, International Business Machines 311 | * Corporation and others. All Rights Reserved. 
312 | * 313 | * Permission is hereby granted, free of charge, to any person obtaining a copy 314 | * of this software and associated documentation files (the "Software"), to deal 315 | * in the Software without restriction, including without limitation the rights 316 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the 317 | * Software, and to permit persons to whom the Software is furnished to do so, 318 | * provided that the above copyright notice(s) and this permission notice appear 319 | * in all copies of the Software and that both the above copyright notice(s) and 320 | * this permission notice appear in supporting documentation. 321 | * 322 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 323 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 324 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 325 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE 326 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR 327 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER 328 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT 329 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 330 | * 331 | * Except as contained in this notice, the name of a copyright holder shall not 332 | * be used in advertising or otherwise to promote the sale, use or other 333 | * dealings in this Software without prior written authorization of the 334 | * copyright holder. 335 | */ 336 | 337 | The following license applies to the Snowball stemmers: 338 | 339 | Copyright (c) 2001, Dr Martin Porter 340 | Copyright (c) 2002, Richard Boulton 341 | All rights reserved. 342 | 343 | Redistribution and use in source and binary forms, with or without 344 | modification, are permitted provided that the following conditions are met: 345 | 346 | * Redistributions of source code must retain the above copyright notice, 347 | * this list of conditions and the following disclaimer. 348 | * Redistributions in binary form must reproduce the above copyright 349 | * notice, this list of conditions and the following disclaimer in the 350 | * documentation and/or other materials provided with the distribution. 351 | * Neither the name of the copyright holders nor the names of its contributors 352 | * may be used to endorse or promote products derived from this software 353 | * without specific prior written permission. 354 | 355 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 356 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 357 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 358 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE 359 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 360 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 361 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 362 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 363 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 364 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
365 | 366 | The following license applies to the KStemmer: 367 | 368 | Copyright © 2003, 369 | Center for Intelligent Information Retrieval, 370 | University of Massachusetts, Amherst. 371 | All rights reserved. 372 | 373 | Redistribution and use in source and binary forms, with or without modification, 374 | are permitted provided that the following conditions are met: 375 | 376 | 1. Redistributions of source code must retain the above copyright notice, this 377 | list of conditions and the following disclaimer. 378 | 379 | 2. Redistributions in binary form must reproduce the above copyright notice, 380 | this list of conditions and the following disclaimer in the documentation 381 | and/or other materials provided with the distribution. 382 | 383 | 3. The names "Center for Intelligent Information Retrieval" and 384 | "University of Massachusetts" must not be used to endorse or promote products 385 | derived from this software without prior written permission. To obtain 386 | permission, contact info@ciir.cs.umass.edu. 387 | 388 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS 389 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 390 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 391 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE 392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 394 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 395 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 396 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 397 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 398 | SUCH DAMAGE. 399 | 400 | The following license applies to the Morfologik project: 401 | 402 | Copyright (c) 2006 Dawid Weiss 403 | Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski 404 | All rights reserved. 405 | 406 | Redistribution and use in source and binary forms, with or without modification, 407 | are permitted provided that the following conditions are met: 408 | 409 | * Redistributions of source code must retain the above copyright notice, 410 | this list of conditions and the following disclaimer. 411 | 412 | * Redistributions in binary form must reproduce the above copyright notice, 413 | this list of conditions and the following disclaimer in the documentation 414 | and/or other materials provided with the distribution. 415 | 416 | * Neither the name of Morfologik nor the names of its contributors 417 | may be used to endorse or promote products derived from this software 418 | without specific prior written permission. 419 | 420 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 421 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 422 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 423 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 424 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 425 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 426 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 427 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 428 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 429 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 430 | 431 | --- 432 | 433 | The dictionary comes from Morfologik project. Morfologik uses data from 434 | Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and 435 | is licenced on the terms of (inter alia) LGPL and Creative Commons 436 | ShareAlike. The part-of-speech tags were added in Morfologik project and 437 | are not found in the data from sjp.pl. The tagset is similar to IPI PAN 438 | tagset. 439 | 440 | --- 441 | 442 | The following license applies to the Morfeusz project, 443 | used by org.apache.lucene.analysis.morfologik. 444 | 445 | BSD-licensed dictionary of Polish (SGJP) 446 | http://sgjp.pl/morfeusz/ 447 | 448 | Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński, 449 | Marcin Woliński, Robert Wołosz 450 | 451 | All rights reserved. 452 | 453 | Redistribution and use in source and binary forms, with or without 454 | modification, are permitted provided that the following conditions are 455 | met: 456 | 457 | 1. Redistributions of source code must retain the above copyright 458 | notice, this list of conditions and the following disclaimer. 459 | 460 | 2. Redistributions in binary form must reproduce the above copyright 461 | notice, this list of conditions and the following disclaimer in the 462 | documentation and/or other materials provided with the 463 | distribution. 464 | 465 | THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS 466 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 467 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 468 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE 469 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 470 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 471 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 472 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 473 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 474 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN 475 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 476 | -------------------------------------------------------------------------------- /licenses/lucene-NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Lucene 2 | Copyright 2014 The Apache Software Foundation 3 | 4 | This product includes software developed at 5 | The Apache Software Foundation (http://www.apache.org/). 
6 | 7 | Includes software from other Apache Software Foundation projects, 8 | including, but not limited to: 9 | - Apache Ant 10 | - Apache Jakarta Regexp 11 | - Apache Commons 12 | - Apache Xerces 13 | 14 | ICU4J, (under analysis/icu) is licensed under an MIT styles license 15 | and Copyright (c) 1995-2008 International Business Machines Corporation and others 16 | 17 | Some data files (under analysis/icu/src/data) are derived from Unicode data such 18 | as the Unicode Character Database. See http://unicode.org/copyright.html for more 19 | details. 20 | 21 | Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is 22 | BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/ 23 | 24 | The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were 25 | automatically generated with the moman/finenight FSA library, created by 26 | Jean-Philippe Barrette-LaPierre. This library is available under an MIT license, 27 | see http://sites.google.com/site/rrettesite/moman and 28 | http://bitbucket.org/jpbarrette/moman/overview/ 29 | 30 | The class org.apache.lucene.util.WeakIdentityMap was derived from 31 | the Apache CXF project and is Apache License 2.0. 32 | 33 | The Google Code Prettify is Apache License 2.0. 34 | See http://code.google.com/p/google-code-prettify/ 35 | 36 | JUnit (junit-4.10) is licensed under the Common Public License v. 1.0 37 | See http://junit.sourceforge.net/cpl-v10.html 38 | 39 | This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin 40 | g Package (jaspell): http://jaspell.sourceforge.net/ 41 | License: The BSD License (http://www.opensource.org/licenses/bsd-license.php) 42 | 43 | The snowball stemmers in 44 | analysis/common/src/java/net/sf/snowball 45 | were developed by Martin Porter and Richard Boulton. 46 | The snowball stopword lists in 47 | analysis/common/src/resources/org/apache/lucene/analysis/snowball 48 | were developed by Martin Porter and Richard Boulton. 49 | The full snowball package is available from 50 | http://snowball.tartarus.org/ 51 | 52 | The KStem stemmer in 53 | analysis/common/src/org/apache/lucene/analysis/en 54 | was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst) 55 | under the BSD-license. 56 | 57 | The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default 58 | stopword list that is BSD-licensed created by Jacques Savoy. These files reside in: 59 | analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt, 60 | analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt, 61 | analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt, 62 | analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt, 63 | analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt 64 | See http://members.unine.ch/jacques.savoy/clef/index.html. 65 | 66 | The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers 67 | (common) are based on BSD-licensed reference implementations created by Jacques Savoy and 68 | Ljiljana Dolamic. 
These files reside in: 69 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java 70 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java 71 | analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java 72 | analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java 73 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java 74 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java 75 | analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java 76 | analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java 77 | analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java 78 | analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java 79 | analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java 80 | 81 | The Stempel analyzer (stempel) includes BSD-licensed software developed 82 | by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil, 83 | and Edmond Nolan. 84 | 85 | The Polish analyzer (stempel) comes with a default 86 | stopword list that is BSD-licensed created by the Carrot2 project. The file resides 87 | in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt. 88 | See http://project.carrot2.org/license.html. 89 | 90 | The SmartChineseAnalyzer source code (smartcn) was 91 | provided by Xiaoping Gao and copyright 2009 by www.imdict.net. 92 | 93 | WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/) 94 | is derived from Unicode data such as the Unicode Character Database. 95 | See http://unicode.org/copyright.html for more details. 96 | 97 | The Morfologik analyzer (morfologik) includes BSD-licensed software 98 | developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/). 99 | 100 | Morfologik uses data from Polish ispell/myspell dictionary 101 | (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia) 102 | LGPL and Creative Commons ShareAlike. 
103 | 104 | Morfologic includes data from BSD-licensed dictionary of Polish (SGJP) 105 | (http://sgjp.pl/morfeusz/) 106 | 107 | Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original 108 | source code for this can be found at http://www.eclipse.org/jetty/downloads.php 109 | 110 | =========================================================================== 111 | Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration 112 | =========================================================================== 113 | 114 | This software includes a binary and/or source version of data from 115 | 116 | mecab-ipadic-2.7.0-20070801 117 | 118 | which can be obtained from 119 | 120 | http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz 121 | 122 | or 123 | 124 | http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz 125 | 126 | =========================================================================== 127 | mecab-ipadic-2.7.0-20070801 Notice 128 | =========================================================================== 129 | 130 | Nara Institute of Science and Technology (NAIST), 131 | the copyright holders, disclaims all warranties with regard to this 132 | software, including all implied warranties of merchantability and 133 | fitness, in no event shall NAIST be liable for 134 | any special, indirect or consequential damages or any damages 135 | whatsoever resulting from loss of use, data or profits, whether in an 136 | action of contract, negligence or other tortuous action, arising out 137 | of or in connection with the use or performance of this software. 138 | 139 | A large portion of the dictionary entries 140 | originate from ICOT Free Software. The following conditions for ICOT 141 | Free Software applies to the current dictionary as well. 142 | 143 | Each User may also freely distribute the Program, whether in its 144 | original form or modified, to any third party or parties, PROVIDED 145 | that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear 146 | on, or be attached to, the Program, which is distributed substantially 147 | in the same form as set out herein and that such intended 148 | distribution, if actually made, will neither violate or otherwise 149 | contravene any of the laws and regulations of the countries having 150 | jurisdiction over the User or the intended distribution itself. 151 | 152 | NO WARRANTY 153 | 154 | The program was produced on an experimental basis in the course of the 155 | research and development conducted during the project and is provided 156 | to users as so produced on an experimental basis. Accordingly, the 157 | program is provided without any warranty whatsoever, whether express, 158 | implied, statutory or otherwise. The term "warranty" used herein 159 | includes, but is not limited to, any warranty of the quality, 160 | performance, merchantability and fitness for a particular purpose of 161 | the program and the nonexistence of any infringement or violation of 162 | any right of any third party. 163 | 164 | Each user of the program will agree and understand, and be deemed to 165 | have agreed and understood, that there is no warranty whatsoever for 166 | the program and, accordingly, the entire risk arising from or 167 | otherwise connected with the program is assumed by the user. 
168 | 169 | Therefore, neither ICOT, the copyright holder, or any other 170 | organization that participated in or was otherwise related to the 171 | development of the program and their respective officials, directors, 172 | officers and other employees shall be held liable for any and all 173 | damages, including, without limitation, general, special, incidental 174 | and consequential damages, arising out of or otherwise in connection 175 | with the use or inability to use the program or any product, material 176 | or result produced or otherwise obtained by using the program, 177 | regardless of whether they have been advised of, or otherwise had 178 | knowledge of, the possibility of such damages at any time during the 179 | project or thereafter. Each user will be deemed to have agreed to the 180 | foregoing by his or her commencement of use of the program. The term 181 | "use" as used herein includes, but is not limited to, the use, 182 | modification, copying and distribution of the program and the 183 | production of secondary products from the program. 184 | 185 | In the case where the program, whether in its original form or 186 | modified, was distributed or delivered to or received by a user from 187 | any person, organization or entity other than ICOT, unless it makes or 188 | grants independently of ICOT any specific warranty to the user in 189 | writing, such person, organization or entity, will also be exempted 190 | from and not be held liable to the user for any such damages as noted 191 | above as far as the program is concerned. 192 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | elasticsearch-analysis-ik 6 | 4.0.0 7 | org.elasticsearch 8 | elasticsearch-analysis-ik 9 | ${elasticsearch.version} 10 | jar 11 | IK Analyzer for Elasticsearch 12 | 2011 13 | 14 | 15 | 6.5.4 16 | 1.8 17 | ${project.basedir}/src/main/assemblies/plugin.xml 18 | analysis-ik 19 | org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin 20 | true 21 | false 22 | true 23 | 4E899B30 24 | true 25 | 26 | 27 | 28 | 29 | The Apache Software License, Version 2.0 30 | http://www.apache.org/licenses/LICENSE-2.0.txt 31 | repo 32 | 33 | 34 | 35 | 36 | 37 | Medcl 38 | medcl@elastic.co 39 | elastic 40 | http://www.elastic.co 41 | 42 | 43 | 44 | 45 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git 46 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git 47 | 48 | http://github.com/medcl/elasticsearch-analysis-ik 49 | 50 | 51 | 52 | org.sonatype.oss 53 | oss-parent 54 | 9 55 | 56 | 57 | 58 | 59 | oss.sonatype.org 60 | https://oss.sonatype.org/content/repositories/snapshots 61 | 62 | 63 | oss.sonatype.org 64 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 65 | 66 | 67 | 68 | 69 | 70 | oss.sonatype.org 71 | OSS Sonatype 72 | true 73 | true 74 | http://oss.sonatype.org/content/repositories/releases/ 75 | 76 | 77 | 78 | 79 | 80 | org.elasticsearch 81 | elasticsearch 82 | ${elasticsearch.version} 83 | compile 84 | 85 | 86 | 87 | 88 | org.apache.httpcomponents 89 | httpclient 90 | 4.5.2 91 | 92 | 93 | 94 | org.apache.logging.log4j 95 | log4j-api 96 | 2.3 97 | 98 | 99 | 100 | org.hamcrest 101 | hamcrest-core 102 | 1.3 103 | test 104 | 105 | 106 | 107 | org.hamcrest 108 | hamcrest-library 109 | 1.3 110 | test 111 | 112 | 113 | junit 114 | junit 115 | 4.12 116 | test 117 | 118 | 119 | 120 | 121 | 122 | 123 | org.apache.maven.plugins 124 | 
maven-compiler-plugin 125 | 3.5.1 126 | 127 | ${maven.compiler.target} 128 | ${maven.compiler.target} 129 | 130 | 131 | 132 | org.apache.maven.plugins 133 | maven-surefire-plugin 134 | 2.11 135 | 136 | 137 | **/*Tests.java 138 | 139 | 140 | 141 | 142 | org.apache.maven.plugins 143 | maven-source-plugin 144 | 2.1.2 145 | 146 | 147 | attach-sources 148 | 149 | jar 150 | 151 | 152 | 153 | 154 | 155 | maven-assembly-plugin 156 | 157 | 158 | false 159 | ${project.build.directory}/releases/ 160 | 161 | ${basedir}/src/main/assemblies/plugin.xml 162 | 163 | 164 | 165 | fully.qualified.MainClass 166 | 167 | 168 | 169 | 170 | 171 | package 172 | 173 | single 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | disable-java8-doclint 183 | 184 | [1.8,) 185 | 186 | 187 | -Xdoclint:none 188 | 189 | 190 | 191 | release 192 | 193 | 194 | 195 | org.sonatype.plugins 196 | nexus-staging-maven-plugin 197 | 1.6.3 198 | true 199 | 200 | oss 201 | https://oss.sonatype.org/ 202 | true 203 | 204 | 205 | 206 | org.apache.maven.plugins 207 | maven-release-plugin 208 | 2.1 209 | 210 | true 211 | false 212 | release 213 | deploy 214 | 215 | 216 | 217 | org.apache.maven.plugins 218 | maven-compiler-plugin 219 | 3.5.1 220 | 221 | ${maven.compiler.target} 222 | ${maven.compiler.target} 223 | 224 | 225 | 226 | org.apache.maven.plugins 227 | maven-gpg-plugin 228 | 1.5 229 | 230 | 231 | sign-artifacts 232 | verify 233 | 234 | sign 235 | 236 | 237 | 238 | 239 | 240 | org.apache.maven.plugins 241 | maven-source-plugin 242 | 2.2.1 243 | 244 | 245 | attach-sources 246 | 247 | jar-no-fork 248 | 249 | 250 | 251 | 252 | 253 | org.apache.maven.plugins 254 | maven-javadoc-plugin 255 | 2.9 256 | 257 | 258 | attach-javadocs 259 | 260 | jar 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | -------------------------------------------------------------------------------- /src/main/assemblies/plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | analysis-ik-release 4 | 5 | zip 6 | 7 | false 8 | 9 | 10 | ${project.basedir}/config 11 | config 12 | 13 | 14 | 15 | 16 | 17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties 18 | 19 | true 20 | 21 | 22 | ${project.basedir}/src/main/resources/plugin-security.policy 23 | 24 | true 25 | 26 | 27 | 28 | 29 | 30 | true 31 | true 32 | 33 | org.elasticsearch:elasticsearch 34 | 35 | 36 | 37 | 38 | true 39 | true 40 | 41 | org.apache.httpcomponents:httpclient 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.elasticsearch.index.analysis; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | import org.wltea.analyzer.cfg.Configuration; 7 | import org.wltea.analyzer.lucene.IKAnalyzer; 8 | 9 | public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider { 10 | private final IKAnalyzer analyzer; 11 | 12 | public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) { 13 | super(indexSettings, name, settings); 14 | 15 | Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart); 16 | analyzer=new IKAnalyzer(configuration); 17 | } 18 | 19 | public static IkAnalyzerProvider getIkSmartAnalyzerProvider(IndexSettings 
indexSettings, Environment env, String name, Settings settings) {
20 | return new IkAnalyzerProvider(indexSettings,env,name,settings,true);
21 | }
22 |
23 | public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
24 | return new IkAnalyzerProvider(indexSettings,env,name,settings,false);
25 | }
26 |
27 | @Override public IKAnalyzer get() {
28 | return this.analyzer;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.wltea.analyzer.cfg.Configuration;
8 | import org.wltea.analyzer.lucene.IKTokenizer;
9 |
10 | public class IkTokenizerFactory extends AbstractTokenizerFactory {
11 | private Configuration configuration;
12 |
13 | public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
14 | super(indexSettings, name, settings);
15 | configuration=new Configuration(env,settings);
16 | }
17 |
18 | public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
19 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false);
20 | }
21 |
22 | public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
23 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(true);
24 | }
25 |
26 | public IkTokenizerFactory setSmart(boolean smart){
27 | this.configuration.setUseSmart(smart);
28 | return this;
29 | }
30 |
31 | @Override
32 | public Tokenizer create() {
33 | return new IKTokenizer(configuration); }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.ik;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.elasticsearch.index.analysis.AnalyzerProvider;
5 | import org.elasticsearch.index.analysis.IkAnalyzerProvider;
6 | import org.elasticsearch.index.analysis.IkTokenizerFactory;
7 | import org.elasticsearch.index.analysis.TokenizerFactory;
8 | import org.elasticsearch.indices.analysis.AnalysisModule;
9 | import org.elasticsearch.plugins.AnalysisPlugin;
10 | import org.elasticsearch.plugins.Plugin;
11 |
12 | import java.util.HashMap;
13 | import java.util.Map;
14 |
15 |
16 | public class AnalysisIkPlugin extends Plugin implements AnalysisPlugin {
17 |
18 | public static String PLUGIN_NAME = "analysis-ik";
19 |
20 | @Override
21 | public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
22 | Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
23 |
24 |
25 | extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory);
26 | extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory);
27 |
28 | return extra;
29 | }
30 |
31 | @Override
32 | public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
33 | Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
34 |
35 | extra.put("ik_smart", IkAnalyzerProvider::getIkSmartAnalyzerProvider);
36 | extra.put("ik_max_word", IkAnalyzerProvider::getIkAnalyzerProvider);
37 |
38 | return extra;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
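Taken together, the two factories above expose one switch under two names: ik_max_word constructs its Configuration with useSmart=false, while ik_smart flips it to true. Below is a minimal, hypothetical sketch (not a file from this repository) of driving the analyzer outside a running node. It assumes Elasticsearch 6.5.4 APIs, in particular an Environment(Settings, Path) constructor, and it assumes the dictionaries from config/ have been copied to the illustrative location /tmp/es/config/analysis-ik.

import java.nio.file.Paths;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IkDemo {
    public static void main(String[] args) throws Exception {
        // path.home is required by Environment; both paths are assumptions of this sketch
        Settings settings = Settings.builder()
                .put("path.home", "/tmp/es")
                .put("use_smart", "false")   // ik_max_word behaviour; "true" gives ik_smart
                .build();
        Environment env = new Environment(settings, Paths.get("/tmp/es/config"));
        Configuration cfg = new Configuration(env, settings);
        try (IKAnalyzer analyzer = new IKAnalyzer(cfg);
             TokenStream ts = analyzer.tokenStream("field", "中华人民共和国国歌")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());   // one token per line
            }
            ts.end();
        }
    }
}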
/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package org.wltea.analyzer.cfg;
5 |
6 | import org.elasticsearch.common.inject.Inject;
7 | import org.elasticsearch.common.io.PathUtils;
8 | import org.elasticsearch.common.settings.Settings;
9 | import org.elasticsearch.env.Environment;
10 | import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
11 | import org.wltea.analyzer.dic.DicFile;
12 |
13 | import java.io.File;
14 | import java.io.UnsupportedEncodingException;
15 | import java.nio.file.Path;
16 | import java.security.MessageDigest;
17 | import java.security.NoSuchAlgorithmException;
18 | import java.util.ArrayList;
19 | import java.util.List;
20 |
21 | public class Configuration {
22 |
23 |
24 | private static final String PATH_DIC_MAIN = "main.dic";
25 | private static final String PATH_DIC_SURNAME = "surname.dic";
26 | private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
27 | private static final String PATH_DIC_SUFFIX = "suffix.dic";
28 | private static final String PATH_DIC_PREP = "preposition.dic";
29 | private static final String PATH_DIC_STOP = "stopword.dic";
30 | // Dictionary files to be used
31 | private List<DicFile> dicFiles = new ArrayList<>();
32 |
33 | // Whether smart (coarse-grained) segmentation is enabled
34 | private boolean useSmart;
35 |
36 | // Whether remote dictionary loading is enabled
37 | private boolean enableRemoteDict=false;
38 |
39 | // Whether lowercase folding is enabled
40 | private boolean enableLowercase=true;
41 | // Used to read files under the plugin's absolute path
42 | private String absolutePath;
43 |
44 | /**
45 | * settings holds the configuration given when the analyzer is defined
46 | * */
47 | @Inject
48 | public Configuration(Environment env,Settings settings) {
49 | this.absolutePath = env.configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME).toAbsolutePath().toString();
50 | // this.absolutePath = "C:\\Users\\jm005113\\Desktop\\workspace\\elasticsearch-analysis-ik\\config";
51 | this.useSmart = settings.get("use_smart", "false").equals("true");
52 | this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
53 | this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
54 |
55 | // The rest of the constructor sets up the dictionary files configured for this analyzer
56 | // Base words (required dictionary file)
57 | DicFile mainDic = new DicFile(absolutePath);
58 | mainDic.setDicName("main");
59 | mainDic.setDicPath(PATH_DIC_MAIN);
60 | mainDic.setRemote(false);
61 | mainDic.setDictType(DicFile.DictType.INTACT_WORDS);
62 | this.dicFiles.add(mainDic);
63 |
64 | // Base quantifiers (required dictionary file)
65 | DicFile quantifierDic = new DicFile(absolutePath);
66 | quantifierDic.setDicName("quantifier");
67 | quantifierDic.setDicPath(PATH_DIC_QUANTIFIER);
68 | quantifierDic.setRemote(false);
69 | quantifierDic.setDictType(DicFile.DictType.QUANTIFIER);
70 | this.dicFiles.add(quantifierDic);
71 |
72 | // Base stopwords (required dictionary file)
73 | DicFile stopwordsDic = new DicFile(absolutePath);
74 | stopwordsDic.setDicName("stopwords");
75 | stopwordsDic.setDicPath(PATH_DIC_STOP);
76 | stopwordsDic.setRemote(false);
77 | stopwordsDic.setDictType(DicFile.DictType.STOPWORDS);
78 | this.dicFiles.add(stopwordsDic);
79 |
80 | // Base suffix words (required dictionary file)
81 | DicFile suffixDic = new DicFile(absolutePath);
82 | suffixDic.setDicName("suffix");
83 | suffixDic.setDicPath(PATH_DIC_SUFFIX);
84 | suffixDic.setRemote(false);
85 | suffixDic.setDictType(DicFile.DictType.SUFFIX);
86 | this.dicFiles.add(suffixDic);
87 |
88 | // Base surnames (required dictionary file)
89 | DicFile surnameDic = new DicFile(absolutePath);
90 | surnameDic.setDicName("surname");
91 | surnameDic.setDicPath(PATH_DIC_SURNAME);
92 | surnameDic.setRemote(false);
93 | surnameDic.setDictType(DicFile.DictType.SURNAME);
94 | this.dicFiles.add(surnameDic);
95 |
96 | // Register the main dictionary files configured by the user
97 | List<String> mainDics = settings.getAsList("ext_dic_main");
98 | if(mainDics != null && mainDics.size() > 0 ){
99 | mainDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.INTACT_WORDS)));
100 | }
101 | // Register the stopword dictionary files configured by the user
102 | List<String> stopDics = settings.getAsList("ext_dic_stop");
103 | if(stopDics != null && stopDics.size() > 0 ){
104 | stopDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.STOPWORDS)));
105 | }
106 | // Register the quantifier dictionary files configured by the user
107 | List<String> quantifierDics = settings.getAsList("ext_dic_quantifier");
108 | if(quantifierDics != null && quantifierDics.size() > 0 ){
109 | quantifierDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.QUANTIFIER)));
110 | }
111 | }
112 |
113 | /**
114 | * Parse a configured dictionary-file entry, e.g.: #dicName$extra#dicPath$extra_test.dic#isRemote$false
115 | * Parsing convention: # starts a key, $ starts its value
116 | * */
117 | private static DicFile str2DicFile(String absolutePath, String dicPath){
118 | DicFile dicFile = new DicFile(absolutePath);
119 | dicFile.setRemote(dicPath.startsWith("http:") || dicPath.startsWith("https:") || dicPath.startsWith("ftp:"));
120 | dicFile.setDicName(getMD5(dicPath));
121 | dicFile.setDicPath(dicPath);
122 | return dicFile;
123 | }
124 |
125 | public static String getMD5(String string) {
126 | byte[] hash;
127 | try {
128 | // Create an MD5 digest and obtain the hash bytes, 16*8=128 bits
129 | hash = MessageDigest.getInstance("MD5").digest(string.getBytes("UTF-8"));
130 | } catch (NoSuchAlgorithmException e) {
131 | throw new RuntimeException("Huh, MD5 should be supported?", e);
132 | } catch (UnsupportedEncodingException e) {
133 | throw new RuntimeException("Huh, UTF-8 should be supported?", e);
134 | }
135 |
136 | // Convert to a hexadecimal string
137 | StringBuilder hex = new StringBuilder(hash.length * 2);
138 | for (byte b : hash) {
139 | if ((b & 0xFF) < 0x10) hex.append("0");
140 | hex.append(Integer.toHexString(b & 0xFF));
141 | }
142 | return hex.toString().toLowerCase();
143 | }
144 |
145 | public Path getConfigInPluginDir() {
146 | return PathUtils
147 | .get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
148 | .getParent(), "config")
149 | .toAbsolutePath();
150 | }
151 |
152 | public boolean isUseSmart() {
153 | return useSmart;
154 | }
155 |
156 | public Configuration setUseSmart(boolean useSmart) {
157 | this.useSmart = useSmart;
158 | return this;
159 | }
160 |
161 | public boolean isEnableRemoteDict() {
162 | return enableRemoteDict;
163 | }
164 |
165 | public boolean isEnableLowercase() {
166 | return enableLowercase;
167 | }
168 |
169 | public List<DicFile> getDicFiles() {
170 | return dicFiles;
171 | }
172 |
173 | public void addDic(List<DicFile> dicFiles) {
174 | this.dicFiles.addAll(dicFiles);
175 | }
176 | }
177 |
--------------------------------------------------------------------------------
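For reference, a hedged sketch of the analyzer-definition settings the constructor above consumes. The keys (use_smart, enable_lowercase, enable_remote_dict, ext_dic_main, ext_dic_stop, ext_dic_quantifier) are read verbatim by the code; the dictionary file names and the env variable are illustrative. A plain entry is resolved against <config>/analysis-ik, while an http(s)/ftp entry is marked remote by str2DicFile and gets the MD5 hex of its path as its dicName.

static Configuration buildIkConfiguration(org.elasticsearch.env.Environment env) {
    Settings analyzerSettings = Settings.builder()
            .put("use_smart", "false")
            .put("enable_lowercase", "true")
            .put("enable_remote_dict", "true")
            // a URL entry is treated as a remote dictionary
            .putList("ext_dic_main", "extra_test.dic", "https://example.com/hot_words.dic")
            .putList("ext_dic_stop", "my_stopwords.dic")
            .putList("ext_dic_quantifier", "my_quantifiers.dic")
            .build();
    return new Configuration(env, analyzerSettings);
}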
/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese word segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright notice: 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.io.IOException;
28 | import java.io.Reader;
29 | import java.util.*;
30 | import java.util.stream.Collectors;
31 |
32 | import org.wltea.analyzer.cfg.Configuration;
33 | import org.wltea.analyzer.dic.DicFile;
34 | import org.wltea.analyzer.dic.Dictionary;
35 |
36 | /**
37 | *
38 | * Context state of the segmenter
39 | *
40 | */
41 | public class AnalyzeContext {
42 |
43 | // Default buffer size
44 | private static final int BUFF_SIZE = 4096;
45 | // Threshold at which the buffer is considered nearly exhausted
46 | private static final int BUFF_EXHAUST_CRITICAL = 100;
47 |
48 |
49 | // Character read buffer
50 | private char[] segmentBuff;
51 | // Character type array
52 | private int[] charTypes;
53 |
54 |
55 | // Total length of the text from the Reader that has already been analyzed;
56 | // when analyzing in several passes, accumulates the offset of the current segmentBuff relative to the start of the reader
57 | private int buffOffset;
58 | // Current position pointer within the buffer
59 | private int cursor;
60 | // Length of the most recently read, processable text
61 | private int available;
62 |
63 |
64 | // Sub-segmenter locks;
65 | // a non-empty set means some sub-segmenter is still occupying segmentBuff
66 | private Set<String> buffLocker;
67 |
68 | // Raw segmentation results, before ambiguity resolution
69 | private QuickSortSet orgLexemes;
70 | // Index of LexemePath by start position
71 | private Map<Integer, LexemePath> pathMap;
72 | // Final segmentation result set
73 | private LinkedList<Lexeme> results;
74 | // Whether lowercase folding is applied
75 | private boolean isEnableLowerCase;
76 | private boolean isUseSmart;
77 | private List<String> mainDicNames;
78 | private List<String> quantifierNames;
79 | private List<String> stopwordDicFile;
80 |
81 | public AnalyzeContext(Configuration configuration){
82 | this.isUseSmart = configuration.isUseSmart();
83 | this.isEnableLowerCase = configuration.isEnableLowercase();
84 | this.segmentBuff = new char[BUFF_SIZE];
85 | this.charTypes = new int[BUFF_SIZE];
86 | this.buffLocker = new HashSet<String>();
87 | this.orgLexemes = new QuickSortSet();
88 | this.pathMap = new HashMap<Integer, LexemePath>();
89 | this.results = new LinkedList<Lexeme>();
90 | this.mainDicNames = new ArrayList<>();
91 | this.quantifierNames = new ArrayList<>();
92 | this.stopwordDicFile = new ArrayList<>();
93 | // Copy the dictionary list defined for this analyzer into its context; at analysis time the
94 | // dictionaries are read from the context, so different IK analyzers can use different dictionaries
95 | configuration.getDicFiles().forEach(dicFile -> {
96 | if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){
97 | mainDicNames.add(dicFile.getDicName());
98 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){
99 | quantifierNames.add(dicFile.getDicName());
100 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){
101 | stopwordDicFile.add(dicFile.getDicName());
102 | }
103 | });
104 | mainDicNames = mainDicNames.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
105 | quantifierNames = quantifierNames.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
106 | stopwordDicFile = stopwordDicFile.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
107 | }
108 |
109 | public int getCursor(){
110 | return this.cursor;
111 | }
112 |
113 | public char[] getSegmentBuff(){
114 | return this.segmentBuff;
115 | }
116 |
117 | public char getCurrentChar(){
118 | return this.segmentBuff[this.cursor];
119 | }
120 |
121 | public int getCurrentCharType(){
122 | return this.charTypes[this.cursor];
123 | }
124 |
125 | public int getBufferOffset(){
126 | return this.buffOffset;
127 | }
128 |
129 | public List<String> getMainDicNames() {
130 | return mainDicNames;
131 | }
132 |
133 | public List<String> getQuantifierNames() {
134 | return quantifierNames;
135 | }
136 |
137 | /**
138 | * Fill segmentBuff according to the current context state
139 | * @param reader
140 | * @return the length of the (valid) text awaiting analysis
141 | * @throws java.io.IOException
142 | */
143 | public int fillBuffer(Reader reader) throws IOException{
144 | int readCount = 0;
145 | if(this.buffOffset == 0){
146 | // First read from the reader
147 | readCount = reader.read(segmentBuff);
148 | }else{
149 | int offset = this.available - this.cursor;
150 | if(offset > 0){
151 | // More was read than processed last time; copy the unprocessed text to the head of segmentBuff
152 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
153 | readCount = offset;
154 | }
155 | // Keep reading from the reader, filling the rest of segmentBuff starting at onceReadIn - onceAnalyzed
156 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
157 | }
158 | // Record the number of usable characters read from the Reader this time
159 | this.available = readCount;
160 | // Reset the current pointer
161 | this.cursor = 0;
162 | return readCount;
163 | }
164 |
165 | /**
166 | * Initialize the buffer pointer and process the first character
167 | */
168 | public void initCursor(){
169 | this.cursor = 0;
170 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],this.isEnableLowerCase);
171 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
172 | }
173 |
174 | /**
175 | * Advance the pointer by one and process the character there.
176 | * Returns true on success; returns false if the pointer is already
177 | * at the end of the buffer and cannot advance
178 | */
179 | public boolean moveCursor(){
180 | if(this.cursor < this.available - 1){
181 | this.cursor++;
182 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],this.isEnableLowerCase);
183 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
184 | return true;
185 | }else{
186 | return false;
187 | }
188 | }
189 |
190 | /**
191 | * Mark the current segmentBuff as locked by adding the name of the
192 | * sub-segmenter that is occupying it
193 | * @param segmenterName
194 | */
195 | public void lockBuffer(String segmenterName){
196 | this.buffLocker.add(segmenterName);
197 | }
198 |
199 | /**
200 | * Remove the given sub-segmenter name, releasing its hold on segmentBuff
201 | * @param segmenterName
202 | */
203 | public void unlockBuffer(String segmenterName){
204 | this.buffLocker.remove(segmenterName);
205 | }
206 |
207 | /**
208 | * The buffer counts as locked as long as buffLocker
209 | * contains any segmenter name
210 | * @return boolean whether the buffer is locked
211 | */
212 | boolean isBufferLocked(){
213 | return this.buffLocker.size() > 0;
214 | }
215 |
216 | /**
217 | * Check whether the current segmentBuff has been fully consumed,
218 | * i.e. the cursor has reached the end of segmentBuff, this.available - 1
219 | * @return
220 | */
221 | public boolean isBufferConsumed(){
222 | return this.cursor == this.available - 1;
223 | }
224 |
225 | /**
226 | * Check whether segmentBuff needs to read new data.
227 | *
228 | * The current loop should be interrupted (the buffer shifted and new data read) when all of the following hold:
229 | * 1. available == BUFF_SIZE, meaning the buffer is fully loaded
230 | * 2. buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL, meaning the pointer is inside the critical zone
231 | * 3. !context.isBufferLocked(), meaning no segmenter is occupying the buffer
232 | *
233 | * @return
234 | */
235 | boolean needRefillBuffer(){
236 | return this.available == BUFF_SIZE
237 | && this.cursor < this.available - 1
238 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
239 | && !this.isBufferLocked();
240 | }
241 |
242 | /**
243 | * Accumulate the offset of the current segmentBuff relative to the start of the reader
244 | */
245 | void markBufferOffset(){
246 | this.buffOffset += this.cursor;
247 | }
248 |
249 | /**
250 | * Add a lexeme to the raw segmentation result set
251 | * @param lexeme
252 | */
253 | public void addLexeme(Lexeme lexeme){
254 | this.orgLexemes.addLexeme(lexeme);
255 | }
256 |
257 | /**
258 | * Add a segmentation result path to the
259 | * path start position ---> path map
260 | * @param path
261 | */
262 | public void addLexemePath(LexemePath path){
263 | if(path != null){
264 | this.pathMap.put(path.getPathBegin(), path);
265 | }
266 | }
267 |
268 |
269 | /**
270 | * Return the raw segmentation results
271 | * @return
272 | */
273 | public QuickSortSet getOrgLexemes(){
274 | return this.orgLexemes;
275 | }
276 |
277 | /**
278 | * Push segmentation results into the result collection:
279 | * 1. walk from the head of the buffer to the processed position this.cursor;
280 | * 2. push results present in the path map into results;
281 | * 3. push CJK characters not covered by the map into results as single characters
282 | */
283 | public void outputToResult(){
284 | int index = 0;
285 | for( ; index <= this.cursor ;){
286 | // Skip non-CJK characters
287 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
288 | index++;
289 | continue;
290 | }
291 | // Look up the LexemePath for this index in pathMap
292 | LexemePath path = this.pathMap.get(index);
293 | if(path != null){
294 | // Output the lexemes of the LexemePath into the results collection
295 | Lexeme l = path.pollFirst();
296 | while(l != null){
297 | this.results.add(l);
298 | // No single-character dictionary entry, but the lexemes cross; emit the single characters of the earlier of the crossing lexemes
299 | /*int innerIndex = index + 1;
300 | for (; innerIndex < index + l.getLength(); innerIndex++) {
301 | Lexeme innerL = path.peekFirst();
302 | if (innerL != null && innerIndex == innerL.getBegin()) {
303 | this.outputSingleCJK(innerIndex - 1);
304 | }
305 | }*/
306 |
307 | // Move index past this lexeme
308 | index = l.getBegin() + l.getLength();
309 | l = path.pollFirst();
310 | if(l != null){
311 | // Output the single characters missed between lexemes inside the path
312 | for(;index < l.getBegin();index++){
313 | this.outputSingleCJK(index);
314 | }
315 | }
316 | }
317 | }else{// pathMap has no LexemePath for this index
318 | // Output as a single character
319 | this.outputSingleCJK(index);
320 | index++;
321 | }
322 | }
323 | // Clear the current map
324 | this.pathMap.clear();
325 | }
326 |
327 | /**
328 | * Output a CJK character as a single-character lexeme
329 | * @param index
330 | */
331 | private void outputSingleCJK(int index){
332 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){
333 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
334 | this.results.add(singleCharLexeme);
335 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
336 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
337 | this.results.add(singleCharLexeme);
338 | }
339 | }
340 |
341 | /**
342 | * Return the next lexeme,
343 | *
344 | * merging compounds along the way
345 | * @return
346 | */
347 | Lexeme getNextLexeme(){
348 | // Take and remove the first lexeme from the result set
349 | Lexeme result = this.results.pollFirst();
350 | while(result != null){
351 | // Merge numerals and quantifiers
352 | this.compound(result);
353 | if(Dictionary.getSingleton().isStopWord(this.stopwordDicFile, this.segmentBuff , result.getBegin() , result.getLength())){
354 | // It is a stopword; take the next one from the list
355 | result = this.results.pollFirst();
356 | }else{
357 | // Not a stopword; generate the lexeme text and output it
358 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
359 | break;
360 | }
361 | }
362 | return result;
363 | }
364 |
365 | /**
366 | * Reset the segmentation context state
367 | */
368 | void reset(){
369 | this.buffLocker.clear();
370 | this.orgLexemes = new QuickSortSet();
371 | this.available =0;
372 | this.buffOffset = 0;
373 | this.charTypes = new int[BUFF_SIZE];
374 | this.cursor = 0;
375 | this.results.clear();
376 | this.segmentBuff = new char[BUFF_SIZE];
377 | this.pathMap.clear();
378 | }
379 |
380 | /**
381 | * Combine lexemes
382 | */
383 | private void compound(Lexeme result){
384 |
385 | if(!this.isUseSmart){
386 | return ;
387 | }
388 | // Numeral/quantifier merging
389 | if(!this.results.isEmpty()){
390 |
391 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
392 | Lexeme nextLexeme = this.results.peekFirst();
393 | boolean appendOk = false;
394 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
395 | // Merge an Arabic numeral with a Chinese numeral
396 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
397 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
398 | // Merge an Arabic numeral with a Chinese quantifier
399 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
400 | }
401 | if(appendOk){
402 | // Pop it
403 | this.results.pollFirst();
404 | }
405 | }
406 |
407 | // A second round of merging may be possible
408 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
409 | Lexeme nextLexeme = this.results.peekFirst();
410 | boolean appendOk = false;
411 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
412 | // Merge a Chinese numeral with a Chinese quantifier
413 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
414 | }
415 | if(appendOk){
416 | // Pop it
417 | this.results.pollFirst();
418 | }
419 | }
420 |
421 | }
422 | }
423 |
424 | }
425 |
--------------------------------------------------------------------------------
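The compound() method above only runs in smart mode, where it folds an Arabic numeral into a following Chinese numeral or quantifier, and a Chinese numeral into a following quantifier. A hedged sketch of observing this, using the same hypothetical env and settings as the earlier IkDemo; the exact tokens depend on the shipped dictionaries:

for (boolean smart : new boolean[]{ false, true }) {
    Configuration cfg = new Configuration(env, settings).setUseSmart(smart);
    try (IKAnalyzer analyzer = new IKAnalyzer(cfg);
         TokenStream ts = analyzer.tokenStream("f", "一百个人")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        StringBuilder line = new StringBuilder(smart ? "ik_smart:    " : "ik_max_word: ");
        while (ts.incrementToken()) {
            line.append(term).append(' ');   // smart mode is expected to keep 一百个 merged
        }
        ts.end();
        System.out.println(line);
    }
}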
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | public class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | 45 | /** 46 | * 识别字符类型 47 | * @param input 48 | * @return int CharacterUtil定义的字符类型常量 49 | */ 50 | static int identifyCharType(char input){ 51 | if(input >= '0' && input <= '9'){ 52 | return CHAR_ARABIC; 53 | 54 | }else if((input >= 'a' && input <= 'z') 55 | || (input >= 'A' && input <= 'Z')){ 56 | return CHAR_ENGLISH; 57 | 58 | }else { 59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 60 | 61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 64 | //目前已知的中文字符UTF-8集合 65 | return CHAR_CHINESE; 66 | 67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 68 | //韩文字符集 69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 70 | || ub == Character.UnicodeBlock.HANGUL_JAMO 71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 72 | //日文字符集 73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 74 | || ub == Character.UnicodeBlock.KATAKANA //片假名 75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 76 | return CHAR_OTHER_CJK; 77 | 78 | } 79 | } 80 | //其他的不做处理的字符 81 | return CHAR_USELESS; 82 | } 83 | 84 | /** 85 | * 进行字符规格化(全角转半角,大写转小写处理) 86 | * @param input 87 | * @return char 88 | */ 89 | static char regularize(char input,boolean lowercase){ 90 | if (input == 12288) { 91 | input = (char) 32; 92 | 93 | }else if (input > 65280 && input < 65375) { 94 | input = (char) (input - 65248); 95 | 96 | }else if (input >= 'A' && input <= 'Z' && lowercase) { 97 | input += 32; 98 | } 99 | 100 | return input; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
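Both helpers above are package-private, so the sketch below simply replays the arithmetic of regularize() with plain char literals: the ideographic space U+3000 (12288) maps to ASCII 32, full-width forms in the open range (65280, 65375) shift down by 65248 onto their ASCII counterparts, and the lowercase step adds 32 to A-Z:

    public class RegularizeSketch {
        public static void main(String[] args) {
            char fullWidthA = '\uFF21';               //full-width 'A' (65313)
            char half  = (char) (fullWidthA - 65248); //65313 - 65248 = 65 -> 'A'
            char lower = (char) (half + 32);          //the lowercase step -> 'a'
            char space = (char) 32;                   //what U+3000 (12288) is mapped to
            System.out.println(half + " " + lower + " " + (int) space); //A a 32
        }
    }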
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | // * @param orgLexemes 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context , boolean useSmart){ 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while(orgLexeme != null){ 50 | if(!crossPath.addCrossLexeme(orgLexeme)){ 51 | //找到与crossPath不相交的下一个crossPath 52 | if(crossPath.size() == 1 || !useSmart){ 53 | //crossPath没有歧义 或者 不做歧义处理 54 | //直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | }else{ 57 | //对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 60 | //输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | //把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | 72 | //处理最后的path 73 | if(crossPath.size() == 1 || !useSmart){ 74 | //crossPath没有歧义 或者 不做歧义处理 75 | //直接输出当前crossPath 76 | context.addLexemePath(crossPath); 77 | }else{ 78 | //对当前的crossPath进行歧义处理 79 | QuickSortSet.Cell headCell = crossPath.getHead(); 80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 81 | //输出歧义处理结果judgeResult 82 | context.addLexemePath(judgeResult); 83 | } 84 | } 85 | 86 | /** 87 | * 歧义识别 88 | * @param lexemeCell 歧义路径链表头 89 | * @param fullTextLength 歧义路径文本长度 90 | * @return 91 | */ 92 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 93 | //候选路径集合 94 | TreeSet pathOptions = new TreeSet(); 95 | //候选结果路径 96 | LexemePath option = new LexemePath(); 97 | 98 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 99 | Stack lexemeStack = this.forwardPath(lexemeCell , option); 100 | 101 | //当前词元链并非最理想的,加入候选路径集合 102 | pathOptions.add(option.copy()); 103 | 104 | //存在歧义词,处理 105 | QuickSortSet.Cell c = null; 106 | while(!lexemeStack.isEmpty()){ 107 | c = lexemeStack.pop(); 108 | //回滚词元链 109 | this.backPath(c.getLexeme() , option); 110 | //从歧义词位置开始,递归,生成可选方案 111 | this.forwardPath(c , option); 112 | pathOptions.add(option.copy()); 113 | } 114 | 115 | //返回集合中的最优方案 116 | return pathOptions.first(); 117 | 118 | } 119 | 120 | /** 121 | * 向前遍历,添加词元,构造一个无歧义词元组合 122 | // * @param LexemePath path 123 | * @return 124 | */ 125 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 126 | //发生冲突的Lexeme栈 127 | Stack conflictStack = new Stack(); 128 | QuickSortSet.Cell c = lexemeCell; 129 | //迭代遍历Lexeme链表 130 | while(c != null && c.getLexeme() != null){ 131 | if(!option.addNotCrossLexeme(c.getLexeme())){ 132 | //词元交叉,添加失败则加入lexemeStack栈 133 | conflictStack.push(c); 134 | } 135 | c = c.getNext(); 136 | } 137 | return conflictStack; 138 | } 139 | 140 | /** 141 | * 回滚词元链,直到它能够接受指定的词元 142 | // * @param lexeme 143 | * @param l 144 | */ 145 | private void backPath(Lexeme l , LexemePath option){ 146 | while(option.checkCross(l)){ 147 | option.removeTail(); 148 | } 149 | 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- 
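In smart mode, judge() explores alternatives with a forward greedy pass (forwardPath) that stacks every conflicting lexeme, then for each stacked lexeme rolls the path back until it fits (backPath) and re-runs the greedy pass from there; every variant lands in the TreeSet so that LexemePath.compareTo picks the winner. A schematic trace, assuming a dictionary that produced the crossing lexemes 中华 / 华人 / 人民 / 共和国 for 中华人民共和国:

    option 0 (greedy)   : 中华 + 人民 + 共和国   //华人 crosses 中华, pushed onto the conflict stack
    option 1 (pop 华人) : 华人 + 共和国          //backPath removed everything 华人 crossed
    pathOptions.first() : option 0               //larger payloadLength (7 vs 5) wins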
/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import org.wltea.analyzer.cfg.Configuration; 27 | import org.wltea.analyzer.core.segmenter.CJKSegmenter; 28 | import org.wltea.analyzer.core.segmenter.CN_QuantifierSegmenter; 29 | import org.wltea.analyzer.core.segmenter.ISegmenter; 30 | import org.wltea.analyzer.core.segmenter.LetterSegmenter; 31 | 32 | import java.io.IOException; 33 | import java.io.Reader; 34 | import java.util.ArrayList; 35 | import java.util.List; 36 | 37 | /** 38 | * IK分词器主类 39 | * 40 | */ 41 | public final class IKSegmenter { 42 | 43 | //字符窜reader 44 | private Reader input; 45 | //分词器上下文 46 | private AnalyzeContext context; 47 | //分词处理器列表 48 | private List segmenters; 49 | //分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | private Configuration configuration; 52 | 53 | 54 | /** 55 | * IK分词器构造函数 56 | * @param input 57 | */ 58 | public IKSegmenter(Reader input ,Configuration configuration){ 59 | this.input = input; 60 | this.configuration = configuration; 61 | this.init(); 62 | } 63 | 64 | 65 | /** 66 | * 初始化 67 | */ 68 | private void init(){ 69 | //初始化分词上下文 70 | this.context = new AnalyzeContext(configuration); 71 | //加载子分词器 72 | this.segmenters = this.loadSegmenters(); 73 | //加载歧义裁决器 74 | this.arbitrator = new IKArbitrator(); 75 | } 76 | 77 | /** 78 | * 初始化词典,加载子分词器实现 79 | * @return List 80 | */ 81 | private List loadSegmenters(){ 82 | List segmenters = new ArrayList(4); 83 | //处理字母的子分词器 84 | segmenters.add(new LetterSegmenter()); 85 | //处理中文数量词的子分词器 86 | segmenters.add(new CN_QuantifierSegmenter()); 87 | //处理中文词的子分词器 88 | segmenters.add(new CJKSegmenter()); 89 | return segmenters; 90 | } 91 | 92 | /** 93 | * 分词,获取下一个词元 94 | * @return Lexeme 词元对象 95 | * @throws java.io.IOException 96 | */ 97 | // TODO 待测试 该锁没有必要,tokenizer重用策略是被缓存在ThreadLocal里,所以,不同线程使用的是不同的tokenizer,不会有多线程问题 98 | public Lexeme next()throws IOException{ 99 | Lexeme l = null; 100 | while((l = context.getNextLexeme()) == null ){ 101 | /* 102 | * 从reader中读取数据,填充buffer 103 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 104 | * 移位处理上次读入的但未处理的数据 105 | */ 106 | int available = context.fillBuffer(this.input); 107 | if(available <= 0){ 108 | //reader已经读完 109 | context.reset(); 110 | return null; 111 | 112 | }else{ 113 | //初始化指针 114 | context.initCursor(); 115 | do{ 116 | //遍历子分词器 117 | for(ISegmenter segmenter : segmenters){ 118 | 
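//every sub-segmenter examines the same cursor position on each pass; overlapping
//or duplicate emissions are reconciled later by QuickSortSet ordering and the arbitrator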
segmenter.analyze(context); 119 | } 120 | //字符缓冲区接近读完,需要读入新的字符 121 | if(context.needRefillBuffer()){ 122 | break; 123 | } 124 | //向前移动指针 125 | }while(context.moveCursor()); 126 | //重置子分词器,为下轮循环进行初始化 127 | for(ISegmenter segmenter : segmenters){ 128 | segmenter.reset(); 129 | } 130 | } 131 | //对分词进行歧义处理 132 | this.arbitrator.process(context, configuration.isUseSmart()); 133 | //将分词结果输出到结果集,并处理未切分的单个CJK字符 134 | context.outputToResult(); 135 | //记录本次分词的缓冲区位移 136 | context.markBufferOffset(); 137 | } 138 | return l; 139 | } 140 | 141 | /** 142 | * 重置分词器到初始状态 143 | * @param input 144 | */ 145 | public synchronized void reset(Reader input) { 146 | this.input = input; 147 | context.reset(); 148 | for(ISegmenter segmenter : segmenters){ 149 | segmenter.reset(); 150 | } 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
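Taken together, next() drives the whole pipeline: fill the buffer, run the sub-segmenters, arbitrate, flush to results. A minimal caller sketch; the Configuration instance is environment-specific (plugin settings, dictionary files) and its construction is not shown:

    import java.io.IOException;
    import java.io.StringReader;

    import org.wltea.analyzer.cfg.Configuration;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class SegmenterSketch {
        public static void segment(String text, Configuration cfg) throws IOException {
            IKSegmenter seg = new IKSegmenter(new StringReader(text), cfg);
            for (Lexeme l = seg.next(); l != null; l = seg.next()) {
                System.out.println(l); //Lexeme.toString(): beginPos-endPos : text : type
            }
        }
    }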
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 | 
27 | /**
28 | * IK词元对象
29 | */
30 | public class Lexeme implements Comparable<Lexeme>{
31 | //lexemeType常量
32 | //未知
33 | public static final int TYPE_UNKNOWN = 0;
34 | //英文
35 | public static final int TYPE_ENGLISH = 1;
36 | //数字
37 | public static final int TYPE_ARABIC = 2;
38 | //英文数字混合
39 | public static final int TYPE_LETTER = 3;
40 | //中文词元
41 | public static final int TYPE_CNWORD = 4;
42 | //中文单字
43 | public static final int TYPE_CNCHAR = 64;
44 | //日韩文字
45 | public static final int TYPE_OTHER_CJK = 8;
46 | //中文数词
47 | public static final int TYPE_CNUM = 16;
48 | //中文量词
49 | public static final int TYPE_COUNT = 32;
50 | //中文数量词
51 | public static final int TYPE_CQUAN = 48;
52 | 
53 | //词元的起始位移
54 | private int offset;
55 | //词元的相对起始位置
56 | private int begin;
57 | //词元的长度
58 | private int length;
59 | //词元文本
60 | private String lexemeText;
61 | //词元类型
62 | private int lexemeType;
63 | 
64 | 
65 | public Lexeme(int offset , int begin , int length , int lexemeType){
66 | this.offset = offset;
67 | this.begin = begin;
68 | if(length < 0){
69 | throw new IllegalArgumentException("length < 0");
70 | }
71 | this.length = length;
72 | this.lexemeType = lexemeType;
73 | }
74 | 
75 | /*
76 | * 判断词元相等算法
77 | * 起始位置偏移、起始位置、终止位置相同
78 | * @see java.lang.Object#equals(Object o)
79 | */
80 | public boolean equals(Object o){
81 | if(o == null){
82 | return false;
83 | }
84 | 
85 | if(this == o){
86 | return true;
87 | }
88 | 
89 | if(o instanceof Lexeme){
90 | Lexeme other = (Lexeme)o;
91 | if(this.offset == other.getOffset()
92 | && this.begin == other.getBegin()
93 | && this.length == other.getLength()){
94 | return true;
95 | }else{
96 | return false;
97 | }
98 | }else{
99 | return false;
100 | }
101 | }
102 | 
103 | /*
104 | * 词元哈希编码算法
105 | * @see java.lang.Object#hashCode()
106 | */
107 | public int hashCode(){
108 | int absBegin = getBeginPosition();
109 | int absEnd = getEndPosition();
110 | return (absBegin * 37) + (absEnd * 31) + (this.length == 0 ? 0 : ((absBegin * absEnd) % this.length) * 11); //length may be 0 after setLexemeText(null); avoid division by zero
111 | }
112 | 
113 | /*
114 | * 词元在排序集合中的比较算法
115 | * @see java.lang.Comparable#compareTo(java.lang.Object)
116 | */
117 | public int compareTo(Lexeme other) {
118 | //起始位置优先
119 | if(this.begin < other.getBegin()){
120 | return -1;
121 | }else if(this.begin == other.getBegin()){
122 | //词元长度优先
123 | if(this.length > other.getLength()){
124 | return -1;
125 | }else if(this.length == other.getLength()){
126 | return 0;
127 | }else {//this.length < other.getLength()
128 | return 1;
129 | }
130 | 
131 | }else{//this.begin > other.getBegin()
132 | return 1;
133 | }
134 | }
135 | 
136 | public int getOffset() {
137 | return offset;
138 | }
139 | 
140 | public void setOffset(int offset) {
141 | this.offset = offset;
142 | }
143 | 
144 | public int getBegin() {
145 | return begin;
146 | }
147 | /**
148 | * 获取词元在文本中的起始位置
149 | * @return int
150 | */
151 | public int getBeginPosition(){
152 | return offset + begin;
153 | }
154 | 
155 | public void setBegin(int begin) {
156 | this.begin = begin;
157 | }
158 | 
159 | /**
160 | * 获取词元在文本中的结束位置
161 | * @return int
162 | */
163 | public int getEndPosition(){
164 | return offset + begin + length;
165 | }
166 | 
167 | /**
168 | * 获取词元的字符长度
169 | * @return int
170 | */
171 | public int getLength(){
172 | return this.length;
173 | }
174 | 
175 | public void setLength(int length) {
176 | if(length < 0){ //validate the incoming value; the field itself can never be negative here
177 | throw
new IllegalArgumentException("length < 0");
178 | }
179 | this.length = length;
180 | }
181 | 
182 | /**
183 | * 获取词元的文本内容
184 | * @return String
185 | */
186 | public String getLexemeText() {
187 | if(lexemeText == null){
188 | return "";
189 | }
190 | return lexemeText;
191 | }
192 | 
193 | public void setLexemeText(String lexemeText) {
194 | if(lexemeText == null){
195 | this.lexemeText = "";
196 | this.length = 0;
197 | }else{
198 | this.lexemeText = lexemeText;
199 | this.length = lexemeText.length();
200 | }
201 | }
202 | 
203 | /**
204 | * 获取词元类型
205 | * @return int
206 | */
207 | public int getLexemeType() {
208 | return lexemeType;
209 | }
210 | 
211 | /**
212 | * 获取词元类型标示字符串
213 | * @return String
214 | */
215 | public String getLexemeTypeString(){
216 | switch(lexemeType) {
217 | 
218 | case TYPE_ENGLISH :
219 | return "ENGLISH";
220 | 
221 | case TYPE_ARABIC :
222 | return "ARABIC";
223 | 
224 | case TYPE_LETTER :
225 | return "LETTER";
226 | 
227 | case TYPE_CNWORD :
228 | return "CN_WORD";
229 | 
230 | case TYPE_CNCHAR :
231 | return "CN_CHAR";
232 | 
233 | case TYPE_OTHER_CJK :
234 | return "OTHER_CJK";
235 | 
236 | case TYPE_COUNT :
237 | return "COUNT";
238 | 
239 | case TYPE_CNUM :
240 | return "TYPE_CNUM";
241 | 
242 | case TYPE_CQUAN:
243 | return "TYPE_CQUAN";
244 | 
245 | default :
246 | return "UNKNOWN";
247 | }
248 | }
249 | 
250 | 
251 | public void setLexemeType(int lexemeType) {
252 | this.lexemeType = lexemeType;
253 | }
254 | 
255 | /**
256 | * 合并两个相邻的词元
257 | * @param l
258 | * @param lexemeType
259 | * @return boolean 词元是否成功合并
260 | */
261 | public boolean append(Lexeme l , int lexemeType){
262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){
263 | this.length += l.getLength();
264 | this.lexemeType = lexemeType;
265 | return true;
266 | }else {
267 | return false;
268 | }
269 | }
270 | 
271 | 
272 | /**
273 | *
274 | */
275 | public String toString(){
276 | StringBuffer strbuf = new StringBuffer();
277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
279 | strbuf.append(this.getLexemeTypeString());
280 | return strbuf.toString();
281 | }
282 | 
283 | 
284 | }
285 | 
-------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/LexemePath.java: -------------------------------------------------------------------------------- 1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
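A Lexeme's begin is relative to the current segmentBuff, while offset is that buffer's displacement from the start of the stream (markBufferOffset() accumulates it), so absolute stream positions are always offset + begin. A quick check of the arithmetic with made-up numbers:

    import org.wltea.analyzer.core.Lexeme;

    public class PositionSketch {
        public static void main(String[] args) {
            //a 2-char word found at buffer position 5, in a buffer starting at stream offset 4096
            Lexeme l = new Lexeme(4096, 5, 2, Lexeme.TYPE_CNWORD);
            System.out.println(l.getBeginPosition() + "-" + l.getEndPosition()); //4101-4103
        }
    }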
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * Lexeme链(路径) 30 | */ 31 | class LexemePath extends QuickSortSet implements Comparable{ 32 | 33 | //起始位置 34 | private int pathBegin; 35 | //结束 36 | private int pathEnd; 37 | //词元链的有效字符长度 38 | private int payloadLength; 39 | 40 | LexemePath(){ 41 | this.pathBegin = -1; 42 | this.pathEnd = -1; 43 | this.payloadLength = 0; 44 | } 45 | 46 | /** 47 | * 向LexemePath追加相交的Lexeme 48 | * @param lexeme 49 | * @return 50 | */ 51 | boolean addCrossLexeme(Lexeme lexeme){ 52 | if(this.isEmpty()){ 53 | this.addLexeme(lexeme); 54 | this.pathBegin = lexeme.getBegin(); 55 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 56 | this.payloadLength += lexeme.getLength(); 57 | return true; 58 | 59 | }else if(this.checkCross(lexeme)){ 60 | this.addLexeme(lexeme); 61 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){ 62 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 63 | } 64 | this.payloadLength = this.pathEnd - this.pathBegin; 65 | return true; 66 | 67 | }else{ 68 | return false; 69 | 70 | } 71 | } 72 | 73 | /** 74 | * 向LexemePath追加不相交的Lexeme 75 | * @param lexeme 76 | * @return 77 | */ 78 | boolean addNotCrossLexeme(Lexeme lexeme){ 79 | if(this.isEmpty()){ 80 | this.addLexeme(lexeme); 81 | this.pathBegin = lexeme.getBegin(); 82 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 83 | this.payloadLength += lexeme.getLength(); 84 | return true; 85 | 86 | }else if(this.checkCross(lexeme)){ 87 | return false; 88 | 89 | }else{ 90 | this.addLexeme(lexeme); 91 | this.payloadLength += lexeme.getLength(); 92 | Lexeme head = this.peekFirst(); 93 | this.pathBegin = head.getBegin(); 94 | Lexeme tail = this.peekLast(); 95 | this.pathEnd = tail.getBegin() + tail.getLength(); 96 | return true; 97 | 98 | } 99 | } 100 | 101 | /** 102 | * 移除尾部的Lexeme 103 | * @return 104 | */ 105 | Lexeme removeTail(){ 106 | Lexeme tail = this.pollLast(); 107 | if(this.isEmpty()){ 108 | this.pathBegin = -1; 109 | this.pathEnd = -1; 110 | this.payloadLength = 0; 111 | }else{ 112 | this.payloadLength -= tail.getLength(); 113 | Lexeme newTail = this.peekLast(); 114 | this.pathEnd = newTail.getBegin() + newTail.getLength(); 115 | } 116 | return tail; 117 | } 118 | 119 | /** 120 | * 检测词元位置交叉(有歧义的切分) 121 | * @param lexeme 122 | * @return 123 | */ 124 | boolean checkCross(Lexeme lexeme){ 125 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 126 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength()); 127 | } 128 | 129 | int getPathBegin() { 130 | return pathBegin; 131 | } 132 | 133 | int getPathEnd() { 134 | return pathEnd; 135 | } 136 | 137 | /** 138 | * 获取Path的有效词长 139 | * @return 140 | */ 141 | int getPayloadLength(){ 142 | return this.payloadLength; 143 | } 144 | 145 | /** 146 | * 获取LexemePath的路径长度 147 | * @return 148 | */ 149 | int getPathLength(){ 150 | return this.pathEnd - this.pathBegin; 151 | } 152 | 153 | 154 | /** 155 | * X权重(词元长度积) 156 | * @return 157 | */ 158 | int getXWeight(){ 159 | int product = 1; 160 | Cell c = this.getHead(); 161 | while( c != null && c.getLexeme() != null){ 162 | product *= c.getLexeme().getLength(); 163 | c = c.getNext(); 164 | } 165 | return product; 166 | } 167 | 168 | /** 169 | * 词元位置权重 170 | * @return 171 | */ 172 | int getPWeight(){ 173 | int pWeight = 0; 174 | int p = 0; 175 | Cell c = 
this.getHead(); 176 | while( c != null && c.getLexeme() != null){ 177 | p++; 178 | pWeight += p * c.getLexeme().getLength() ; 179 | c = c.getNext(); 180 | } 181 | return pWeight; 182 | } 183 | 184 | LexemePath copy(){ 185 | LexemePath theCopy = new LexemePath(); 186 | theCopy.pathBegin = this.pathBegin; 187 | theCopy.pathEnd = this.pathEnd; 188 | theCopy.payloadLength = this.payloadLength; 189 | Cell c = this.getHead(); 190 | while( c != null && c.getLexeme() != null){ 191 | theCopy.addLexeme(c.getLexeme()); 192 | c = c.getNext(); 193 | } 194 | return theCopy; 195 | } 196 | 197 | public int compareTo(LexemePath o) { 198 | //比较有效文本长度 199 | if(this.payloadLength > o.payloadLength){ 200 | return -1; 201 | }else if(this.payloadLength < o.payloadLength){ 202 | return 1; 203 | }else{ 204 | //比较词元个数,越少越好 205 | if(this.size() < o.size()){ 206 | return -1; 207 | }else if (this.size() > o.size()){ 208 | return 1; 209 | }else{ 210 | //路径跨度越大越好 211 | if(this.getPathLength() > o.getPathLength()){ 212 | return -1; 213 | }else if(this.getPathLength() < o.getPathLength()){ 214 | return 1; 215 | }else { 216 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 217 | if(this.pathEnd > o.pathEnd){ 218 | return -1; 219 | }else if(pathEnd < o.pathEnd){ 220 | return 1; 221 | }else{ 222 | //词长越平均越好 223 | if(this.getXWeight() > o.getXWeight()){ 224 | return -1; 225 | }else if(this.getXWeight() < o.getXWeight()){ 226 | return 1; 227 | }else { 228 | //词元位置权重比较 229 | if(this.getPWeight() > o.getPWeight()){ 230 | return -1; 231 | }else if(this.getPWeight() < o.getPWeight()){ 232 | return 1; 233 | } 234 | 235 | } 236 | } 237 | } 238 | } 239 | } 240 | return 0; 241 | } 242 | 243 | public String toString(){ 244 | StringBuffer sb = new StringBuffer(); 245 | sb.append("pathBegin : ").append(pathBegin).append("\r\n"); 246 | sb.append("pathEnd : ").append(pathEnd).append("\r\n"); 247 | sb.append("payloadLength : ").append(payloadLength).append("\r\n"); 248 | Cell head = this.getHead(); 249 | while(head != null){ 250 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); 251 | head = head.getNext(); 252 | } 253 | return sb.toString(); 254 | } 255 | 256 | } 257 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/QuickSortSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
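compareTo() above encodes the smart-mode selection heuristics as a strict tie-break chain: greater payloadLength (more text actually covered) first, then fewer lexemes, then greater path span, then the later pathEnd (per the comment, reverse-direction segmentation is statistically more often correct for Chinese), then the larger length product getXWeight() (for a fixed sum, a product is maximised by equal factors, so this favours evenly sized words), and finally the larger getPWeight() = Σ position_i × length_i, which rewards paths whose longer words sit later. For 中华人民共和国, for instance, 中华+人民+共和国 and a hypothetical 中华+人民+共和+国 tie on payload (7 characters each), but the first wins on word count, 3 lexemes against 4.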
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK分词器专用的Lexem快速排序集合 29 | */ 30 | public class QuickSortSet { 31 | //链表头 32 | private Cell head; 33 | //链表尾 34 | private Cell tail; 35 | //链表的实际大小 36 | private int size; 37 | 38 | QuickSortSet(){ 39 | this.size = 0; 40 | } 41 | 42 | /** 43 | * 向链表集合添加词元 44 | * @param lexeme 45 | */ 46 | public boolean addLexeme(Lexeme lexeme){ 47 | Cell newCell = new Cell(lexeme); 48 | if(this.size == 0){ 49 | this.head = newCell; 50 | this.tail = newCell; 51 | this.size++; 52 | return true; 53 | 54 | }else{ 55 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合 56 | return false; 57 | 58 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部 59 | this.tail.next = newCell; 60 | newCell.prev = this.tail; 61 | this.tail = newCell; 62 | this.size++; 63 | return true; 64 | 65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部 66 | this.head.prev = newCell; 67 | newCell.next = this.head; 68 | this.head = newCell; 69 | this.size++; 70 | return true; 71 | 72 | }else{ 73 | //从尾部上逆 74 | Cell index = this.tail; 75 | while(index != null && index.compareTo(newCell) > 0){ 76 | index = index.prev; 77 | } 78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合 79 | return false; 80 | 81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置 82 | newCell.prev = index; 83 | newCell.next = index.next; 84 | index.next.prev = newCell; 85 | index.next = newCell; 86 | this.size++; 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | 94 | /** 95 | * 返回链表头部元素 96 | * @return 97 | */ 98 | public Lexeme peekFirst(){ 99 | if(this.head != null){ 100 | return this.head.lexeme; 101 | } 102 | return null; 103 | } 104 | 105 | /** 106 | * 取出链表集合的第一个元素 107 | * @return Lexeme 108 | */ 109 | public Lexeme pollFirst(){ 110 | if(this.size == 1){ 111 | Lexeme first = this.head.lexeme; 112 | this.head = null; 113 | this.tail = null; 114 | this.size--; 115 | return first; 116 | }else if(this.size > 1){ 117 | Lexeme first = this.head.lexeme; 118 | this.head = this.head.next; 119 | this.size --; 120 | return first; 121 | }else{ 122 | return null; 123 | } 124 | } 125 | 126 | /** 127 | * 返回链表尾部元素 128 | * @return 129 | */ 130 | public Lexeme peekLast(){ 131 | if(this.tail != null){ 132 | return this.tail.lexeme; 133 | } 134 | return null; 135 | } 136 | 137 | /** 138 | * 取出链表集合的最后一个元素 139 | * @return Lexeme 140 | */ 141 | public Lexeme pollLast(){ 142 | if(this.size == 1){ 143 | Lexeme last = this.head.lexeme; 144 | this.head = null; 145 | this.tail = null; 146 | this.size--; 147 | return last; 148 | 149 | }else if(this.size > 1){ 150 | Lexeme last = this.tail.lexeme; 151 | this.tail = this.tail.prev; 152 | this.size--; 153 | return last; 154 | 155 | }else{ 156 | return null; 157 | } 158 | } 159 | 160 | /** 161 | * 返回集合大小 162 | * @return 163 | */ 164 | public int size(){ 165 | return this.size; 166 | } 167 | 168 | /** 169 | * 判断集合是否为空 170 | * @return 171 | */ 172 | public boolean isEmpty(){ 173 | return this.size == 0; 174 | } 175 | 176 | /** 177 | * 返回lexeme链的头部 178 | * @return 179 | */ 180 | public Cell getHead(){ 181 | return this.head; 182 | } 183 | 184 | /** 185 | * 186 | * IK 中文分词 版本 5.0 187 | * IK Analyzer release 5.0 188 | * 189 | * Licensed to the Apache Software Foundation (ASF) under one or more 190 | * contributor license agreements. 
See the NOTICE file distributed with 191 | * this work for additional information regarding copyright ownership. 192 | * The ASF licenses this file to You under the Apache License, Version 2.0 193 | * (the "License"); you may not use this file except in compliance with 194 | * the License. You may obtain a copy of the License at 195 | * 196 | * http://www.apache.org/licenses/LICENSE-2.0 197 | * 198 | * Unless required by applicable law or agreed to in writing, software 199 | * distributed under the License is distributed on an "AS IS" BASIS, 200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | * See the License for the specific language governing permissions and 202 | * limitations under the License. 203 | * 204 | * 源代码由林良益(linliangyi2005@gmail.com)提供 205 | * 版权声明 2012,乌龙茶工作室 206 | * provided by Linliangyi and copyright 2012 by Oolong studio 207 | * 208 | * QuickSortSet集合单元 209 | * 210 | */ 211 | public class Cell implements Comparable{ 212 | private Cell prev; 213 | private Cell next; 214 | private Lexeme lexeme; 215 | 216 | Cell(Lexeme lexeme){ 217 | if(lexeme == null){ 218 | throw new IllegalArgumentException("lexeme must not be null"); 219 | } 220 | this.lexeme = lexeme; 221 | } 222 | 223 | public int compareTo(Cell o) { 224 | return this.lexeme.compareTo(o.lexeme); 225 | } 226 | 227 | public Cell getPrev(){ 228 | return this.prev; 229 | } 230 | 231 | public Cell getNext(){ 232 | return this.next; 233 | } 234 | 235 | public Lexeme getLexeme(){ 236 | return this.lexeme; 237 | } 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/segmenter/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
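QuickSortSet is a sorted doubly linked list specialised for Lexeme.compareTo ordering: inserts walk back from the tail and silently drop exact positional duplicates (same begin and length, regardless of type). A small sketch; it must live in the org.wltea.analyzer.core package because the constructor is package-private:

    package org.wltea.analyzer.core;

    public class QuickSortSetSketch {
        public static void main(String[] args) {
            QuickSortSet set = new QuickSortSet();
            set.addLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD));                 //e.g. 人民
            set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD));                 //e.g. 中华, inserted at the head
            boolean added = set.addLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNCHAR)); //same position: rejected
            System.out.println(set.size() + " " + added);                           //2 false
            System.out.println(set.pollFirst().getBegin());                         //0, ordered by begin
        }
    }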
20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.core.segmenter; 27 | 28 | import org.wltea.analyzer.core.AnalyzeContext; 29 | import org.wltea.analyzer.core.CharacterUtil; 30 | import org.wltea.analyzer.core.Lexeme; 31 | import org.wltea.analyzer.dic.Dictionary; 32 | import org.wltea.analyzer.dic.Hit; 33 | 34 | import java.util.LinkedList; 35 | import java.util.List; 36 | 37 | 38 | /** 39 | * 中文-日韩文子分词器 40 | */ 41 | public class CJKSegmenter implements ISegmenter { 42 | 43 | //子分词器标签 44 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 45 | //待处理的分词hit队列 46 | private List tmpHits; 47 | 48 | 49 | public CJKSegmenter(){ 50 | this.tmpHits = new LinkedList(); 51 | } 52 | 53 | /* (non-Javadoc) 54 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 55 | */ 56 | public void analyze(AnalyzeContext context) { 57 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 58 | 59 | //优先处理tmpHits中的hit 60 | if(!this.tmpHits.isEmpty()){ 61 | //处理词段队列 62 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 63 | for(Hit hit : tmpArray){ 64 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 65 | if(hit.isMatch()){ 66 | //输出当前的词 67 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 68 | context.addLexeme(newLexeme); 69 | 70 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 71 | this.tmpHits.remove(hit); 72 | } 73 | 74 | }else if(hit.isUnmatch()){ 75 | //hit不是词,移除 76 | this.tmpHits.remove(hit); 77 | } 78 | } 79 | } 80 | 81 | //********************************* 82 | //再对当前指针位置的字符进行单字匹配 83 | // 分词器选择的词典文件是该分词器实例化时,configuration里的字典文件列表 84 | List singleCharHits = Dictionary.getSingleton().matchInMainDict(context.getMainDicNames(), context.getSegmentBuff(), context.getCursor(), 1); 85 | for(Hit singleCharHit : singleCharHits){ 86 | if(singleCharHit.isMatch()){//首字成词 87 | //输出当前的词 88 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 89 | context.addLexeme(newLexeme); 90 | 91 | //同时也是词前缀 92 | if(singleCharHit.isPrefix()){ 93 | //前缀匹配则放入hit列表 94 | this.tmpHits.add(singleCharHit); 95 | } 96 | }else if(singleCharHit.isPrefix()){//首字为词前缀 97 | //前缀匹配则放入hit列表 98 | this.tmpHits.add(singleCharHit); 99 | } 100 | } 101 | 102 | }else{ 103 | //遇到CHAR_USELESS字符 104 | //清空队列 105 | this.tmpHits.clear(); 106 | } 107 | 108 | //判断缓冲区是否已经读完 109 | if(context.isBufferConsumed()){ 110 | //清空队列 111 | this.tmpHits.clear(); 112 | } 113 | 114 | //判断是否锁定缓冲区 115 | if(this.tmpHits.size() == 0){ 116 | context.unlockBuffer(SEGMENTER_NAME); 117 | 118 | }else{ 119 | context.lockBuffer(SEGMENTER_NAME); 120 | } 121 | } 122 | 123 | /* (non-Javadoc) 124 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#reset() 125 | */ 126 | public void reset() { 127 | //清空队列 128 | this.tmpHits.clear(); 129 | } 130 | 131 | } 132 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/segmenter/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. 
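CJKSegmenter keeps every dictionary hit that is still a valid prefix in tmpHits and re-matches each one character further on every cursor step, so multi-character words are recognised incrementally without rescanning. Assuming a main dictionary containing both 中华 and 中华人民, the pass over 中华人民 looks like:

    cursor 中 : matchInMainDict("中")  -> prefix             tmpHits = {中…}
    cursor 华 : matchWithHit(hit, 华)  -> match + prefix     emit 中华, keep the hit
    cursor 人 : matchWithHit(hit, 人)  -> prefix             keep the hit
    cursor 民 : matchWithHit(hit, 民)  -> match, no prefix   emit 中华人民, drop the hit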
See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core.segmenter; 26 | 27 | import java.util.HashSet; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Set; 31 | 32 | import org.wltea.analyzer.core.AnalyzeContext; 33 | import org.wltea.analyzer.core.CharacterUtil; 34 | import org.wltea.analyzer.core.Lexeme; 35 | import org.wltea.analyzer.dic.Dictionary; 36 | import org.wltea.analyzer.dic.Hit; 37 | 38 | /** 39 | * 40 | * 中文数量词子分词器 41 | */ 42 | public class CN_QuantifierSegmenter implements ISegmenter{ 43 | 44 | //子分词器标签 45 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 46 | 47 | //中文数词 48 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum 49 | private static Set ChnNumberChars = new HashSet(); 50 | static{ 51 | char[] ca = Chn_Num.toCharArray(); 52 | for(char nChar : ca){ 53 | ChnNumberChars.add(nChar); 54 | } 55 | } 56 | 57 | /* 58 | * 词元的开始位置, 59 | * 同时作为子分词器状态标识 60 | * 当start > -1 时,标识当前的分词器正在处理字符 61 | */ 62 | private int nStart; 63 | /* 64 | * 记录词元结束位置 65 | * end记录的是在词元中最后一个出现的合理的数词结束 66 | */ 67 | private int nEnd; 68 | 69 | //待处理的量词hit队列 70 | private List countHits; 71 | 72 | 73 | public CN_QuantifierSegmenter(){ 74 | nStart = -1; 75 | nEnd = -1; 76 | this.countHits = new LinkedList(); 77 | } 78 | 79 | /** 80 | * 分词 81 | */ 82 | public void analyze(AnalyzeContext context) { 83 | //处理中文数词 84 | this.processCNumber(context); 85 | //处理中文量词 86 | this.processCount(context); 87 | 88 | //判断是否锁定缓冲区 89 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){ 90 | //对缓冲区解锁 91 | context.unlockBuffer(SEGMENTER_NAME); 92 | }else{ 93 | context.lockBuffer(SEGMENTER_NAME); 94 | } 95 | } 96 | 97 | 98 | /** 99 | * 重置子分词器状态 100 | */ 101 | public void reset() { 102 | nStart = -1; 103 | nEnd = -1; 104 | countHits.clear(); 105 | } 106 | 107 | /** 108 | * 处理数词 109 | */ 110 | private void processCNumber(AnalyzeContext context){ 111 | if(nStart == -1 && nEnd == -1){//初始状态 112 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 113 | && ChnNumberChars.contains(context.getCurrentChar())){ 114 | //记录数词的起始、结束位置 115 | nStart = context.getCursor(); 116 | nEnd = context.getCursor(); 117 | } 118 | }else{//正在处理状态 119 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 120 | && ChnNumberChars.contains(context.getCurrentChar())){ 121 | //记录数词的结束位置 122 | nEnd = context.getCursor(); 123 | }else{ 124 | //输出数词 125 | this.outputNumLexeme(context); 126 | //重置头尾指针 127 | nStart = -1; 128 | nEnd = -1; 129 | } 130 | } 131 | 132 | //缓冲区已经用完,还有尚未输出的数词 133 | if(context.isBufferConsumed() && (nStart != -1 && nEnd != -1)){ 134 | //输出数词 135 | outputNumLexeme(context); 136 | //重置头尾指针 
137 | nStart = -1; 138 | nEnd = -1; 139 | } 140 | } 141 | 142 | /** 143 | * 处理中文量词 144 | * @param context 145 | */ 146 | private void processCount(AnalyzeContext context){ 147 | // 判断是否需要启动量词扫描 148 | if(!this.needCountScan(context)){ 149 | return; 150 | } 151 | 152 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){ 153 | 154 | //优先处理countHits中的hit 155 | if(!this.countHits.isEmpty()){ 156 | //处理词段队列 157 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 158 | for(Hit hit : tmpArray){ 159 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 160 | if(hit.isMatch()){ 161 | //输出当前的词 162 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT); 163 | context.addLexeme(newLexeme); 164 | 165 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 166 | this.countHits.remove(hit); 167 | } 168 | 169 | }else if(hit.isUnmatch()){ 170 | //hit不是词,移除 171 | this.countHits.remove(hit); 172 | } 173 | } 174 | } 175 | 176 | //********************************* 177 | //对当前指针位置的字符进行单字匹配 178 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getQuantifierNames(), context.getSegmentBuff(), context.getCursor(), 1); 179 | if(singleCharHit.isMatch()){//首字成量词词 180 | //输出当前的词 181 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT); 182 | context.addLexeme(newLexeme); 183 | 184 | //同时也是词前缀 185 | if(singleCharHit.isPrefix()){ 186 | //前缀匹配则放入hit列表 187 | this.countHits.add(singleCharHit); 188 | } 189 | }else if(singleCharHit.isPrefix()){//首字为量词前缀 190 | //前缀匹配则放入hit列表 191 | this.countHits.add(singleCharHit); 192 | } 193 | 194 | 195 | }else{ 196 | //输入的不是中文字符 197 | //清空未成形的量词 198 | this.countHits.clear(); 199 | } 200 | 201 | //缓冲区数据已经读完,还有尚未输出的量词 202 | if(context.isBufferConsumed()){ 203 | //清空未成形的量词 204 | this.countHits.clear(); 205 | } 206 | } 207 | 208 | /** 209 | * 判断是否需要扫描量词 210 | * @return 211 | */ 212 | private boolean needCountScan(AnalyzeContext context){ 213 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){ 214 | //正在处理中文数词,或者正在处理量词 215 | return true; 216 | }else{ 217 | //找到一个相邻的数词 218 | if(!context.getOrgLexemes().isEmpty()){ 219 | Lexeme l = context.getOrgLexemes().peekLast(); 220 | if((Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) 221 | && (l.getBegin() + l.getLength() == context.getCursor())){ 222 | return true; 223 | } 224 | } 225 | } 226 | return false; 227 | } 228 | 229 | /** 230 | * 添加数词词元到结果集 231 | * @param context 232 | */ 233 | private void outputNumLexeme(AnalyzeContext context){ 234 | if(nStart > -1 && nEnd > -1){ 235 | //输出数词 236 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM); 237 | context.addLexeme(newLexeme); 238 | 239 | } 240 | } 241 | 242 | } 243 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/segmenter/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 
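The quantifier pass above is deliberately gated: needCountScan() only returns true while a Chinese numeral is being consumed, or when the last lexeme in the original result set is a Chinese or Arabic numeral ending exactly at the cursor. So 个 is scanned as a quantifier in 三个人 but not in a bare 个人:

    cursor 三 : numeral char        -> nStart = nEnd = cursor
    cursor 个 : not a numeral       -> emit 三 as TYPE_CNUM, reset nStart/nEnd;
                needCountScan(): 三 ends at the cursor -> true -> emit 个 as TYPE_COUNT
    cursor 人 : no adjacent numeral -> quantifier scan stays off
                (assuming 个 is not also a prefix of a longer quantifier)

With smart mode on, AnalyzeContext.compound() later merges the adjacent CNUM + COUNT pair into a single TYPE_CQUAN lexeme 三个.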
8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core.segmenter; 26 | 27 | 28 | import org.wltea.analyzer.core.AnalyzeContext; 29 | 30 | /** 31 | * 32 | * 子分词器接口 33 | */ 34 | public interface ISegmenter { 35 | 36 | /** 37 | * 从分析器读取下一个可能分解的词元对象 38 | * @param context 分词算法上下文 39 | */ 40 | void analyze(AnalyzeContext context); 41 | 42 | 43 | /** 44 | * 重置子分析器状态 45 | */ 46 | void reset(); 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/segmenter/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core.segmenter; 26 | 27 | import org.wltea.analyzer.core.AnalyzeContext; 28 | import org.wltea.analyzer.core.CharacterUtil; 29 | import org.wltea.analyzer.core.Lexeme; 30 | 31 | import java.util.Arrays; 32 | 33 | /** 34 | * 35 | * 英文字符及阿拉伯数字子分词器 36 | */ 37 | public class LetterSegmenter implements ISegmenter { 38 | 39 | //子分词器标签 40 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 41 | //链接符号 42 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' 
, '@' , '_'}; 43 | 44 | //数字符号 45 | private static final char[] Num_Connector = new char[]{',' , '.'}; 46 | 47 | /* 48 | * 词元的开始位置, 49 | * 同时作为子分词器状态标识 50 | * 当start > -1 时,标识当前的分词器正在处理字符 51 | */ 52 | private int start; 53 | /* 54 | * 记录词元结束位置 55 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 56 | */ 57 | private int end; 58 | 59 | /* 60 | * 字母起始位置 61 | */ 62 | private int englishStart; 63 | 64 | /* 65 | * 字母结束位置 66 | */ 67 | private int englishEnd; 68 | 69 | /* 70 | * 阿拉伯数字起始位置 71 | */ 72 | private int arabicStart; 73 | 74 | /* 75 | * 阿拉伯数字结束位置 76 | */ 77 | private int arabicEnd; 78 | 79 | public LetterSegmenter(){ 80 | Arrays.sort(Letter_Connector); 81 | Arrays.sort(Num_Connector); 82 | this.start = -1; 83 | this.end = -1; 84 | this.englishStart = -1; 85 | this.englishEnd = -1; 86 | this.arabicStart = -1; 87 | this.arabicEnd = -1; 88 | } 89 | 90 | 91 | /* (non-Javadoc) 92 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 93 | */ 94 | public void analyze(AnalyzeContext context) { 95 | boolean bufferLockFlag = false; 96 | //处理英文字母 97 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 98 | //处理阿拉伯字母 99 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 100 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 101 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 102 | 103 | //判断是否锁定缓冲区 104 | if(bufferLockFlag){ 105 | context.lockBuffer(SEGMENTER_NAME); 106 | }else{ 107 | //对缓冲区解锁 108 | context.unlockBuffer(SEGMENTER_NAME); 109 | } 110 | } 111 | 112 | /* (non-Javadoc) 113 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#reset() 114 | */ 115 | public void reset() { 116 | this.start = -1; 117 | this.end = -1; 118 | this.englishStart = -1; 119 | this.englishEnd = -1; 120 | this.arabicStart = -1; 121 | this.arabicEnd = -1; 122 | } 123 | 124 | /** 125 | * 处理数字字母混合输出 126 | * 如:windos2000 | linliangyi2005@gmail.com 127 | // * @param input 128 | * @param context 129 | * @return 130 | */ 131 | private boolean processMixLetter(AnalyzeContext context){ 132 | boolean needLock = false; 133 | 134 | if(this.start == -1){//当前的分词器尚未开始处理字符 135 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 136 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 137 | //记录起始指针的位置,标明分词器进入处理状态 138 | this.start = context.getCursor(); 139 | this.end = start; 140 | } 141 | 142 | }else{//当前的分词器正在处理字符 143 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 144 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 145 | //记录下可能的结束位置 146 | this.end = context.getCursor(); 147 | 148 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 149 | && this.isLetterConnector(context.getCurrentChar())){ 150 | //记录下可能的结束位置 151 | this.end = context.getCursor(); 152 | }else{ 153 | //遇到非Letter字符,输出词元 154 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 155 | context.addLexeme(newLexeme); 156 | this.start = -1; 157 | this.end = -1; 158 | } 159 | } 160 | 161 | //判断缓冲区是否已经读完 162 | if(context.isBufferConsumed() && (this.start != -1 && this.end != -1)){ 163 | //缓冲以读完,输出词元 164 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER); 165 | context.addLexeme(newLexeme); 166 | this.start = -1; 167 | this.end = -1; 168 | } 169 | 170 | //判断是否锁定缓冲区 171 | if(this.start == -1 && this.end == -1){ 172 | //对缓冲区解锁 173 | needLock = false; 174 | }else{ 175 | 
needLock = true; 176 | } 177 | return needLock; 178 | } 179 | 180 | /** 181 | * 处理纯英文字母输出 182 | * @param context 183 | * @return 184 | */ 185 | private boolean processEnglishLetter(AnalyzeContext context){ 186 | boolean needLock = false; 187 | 188 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符 189 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 190 | //记录起始指针的位置,标明分词器进入处理状态 191 | this.englishStart = context.getCursor(); 192 | this.englishEnd = this.englishStart; 193 | } 194 | }else {//当前的分词器正在处理英文字符 195 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){ 196 | //记录当前指针位置为结束位置 197 | this.englishEnd = context.getCursor(); 198 | }else{ 199 | //遇到非English字符,输出词元 200 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 201 | context.addLexeme(newLexeme); 202 | this.englishStart = -1; 203 | this.englishEnd= -1; 204 | } 205 | } 206 | 207 | //判断缓冲区是否已经读完 208 | if(context.isBufferConsumed() && (this.englishStart != -1 && this.englishEnd != -1)){ 209 | //缓冲以读完,输出词元 210 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH); 211 | context.addLexeme(newLexeme); 212 | this.englishStart = -1; 213 | this.englishEnd= -1; 214 | } 215 | 216 | //判断是否锁定缓冲区 217 | if(this.englishStart == -1 && this.englishEnd == -1){ 218 | //对缓冲区解锁 219 | needLock = false; 220 | }else{ 221 | needLock = true; 222 | } 223 | return needLock; 224 | } 225 | 226 | /** 227 | * 处理阿拉伯数字输出 228 | * @param context 229 | * @return 230 | */ 231 | private boolean processArabicLetter(AnalyzeContext context){ 232 | boolean needLock = false; 233 | 234 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符 235 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 236 | //记录起始指针的位置,标明分词器进入处理状态 237 | this.arabicStart = context.getCursor(); 238 | this.arabicEnd = this.arabicStart; 239 | } 240 | }else {//当前的分词器正在处理数字字符 241 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){ 242 | //记录当前指针位置为结束位置 243 | this.arabicEnd = context.getCursor(); 244 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 245 | && this.isNumConnector(context.getCurrentChar())){ 246 | //不输出数字,但不标记结束 247 | }else{ 248 | ////遇到非Arabic字符,输出词元 249 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 250 | context.addLexeme(newLexeme); 251 | this.arabicStart = -1; 252 | this.arabicEnd = -1; 253 | } 254 | } 255 | 256 | //判断缓冲区是否已经读完 257 | if(context.isBufferConsumed() && (this.arabicStart != -1 && this.arabicEnd != -1)){ 258 | //生成已切分的词元 259 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC); 260 | context.addLexeme(newLexeme); 261 | this.arabicStart = -1; 262 | this.arabicEnd = -1; 263 | } 264 | 265 | //判断是否锁定缓冲区 266 | if(this.arabicStart == -1 && this.arabicEnd == -1){ 267 | //对缓冲区解锁 268 | needLock = false; 269 | }else{ 270 | needLock = true; 271 | } 272 | return needLock; 273 | } 274 | 275 | /** 276 | * 判断是否是字母连接符号 277 | * @param input 278 | * @return 279 | */ 280 | private boolean isLetterConnector(char input){ 281 | int index = Arrays.binarySearch(Letter_Connector, input); 282 | return index >= 0; 283 | } 284 | 285 | /** 286 | * 判断是否是数字连接符号 287 | * @param input 288 | * @return 289 | */ 290 | private boolean isNumConnector(char input){ 291 | int index = 
Arrays.binarySearch(Num_Connector, input); 292 | return index >= 0; 293 | } 294 | } 295 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/DicFile.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.dic; 2 | 3 | /** 4 | * 字典信息描述 5 | * */ 6 | public class DicFile { 7 | 8 | /** 字典名称 */ 9 | private String dicName; 10 | 11 | /** 字典文件路径*/ 12 | private String dicPath; 13 | 14 | /** 是远程文件还是本地字典文件, 默认为本地字典文件*/ 15 | private Boolean isRemote = false; 16 | 17 | private DictType dictType; 18 | 19 | private String absolutePath; 20 | 21 | public DicFile(String absolutePath){ 22 | this.absolutePath = absolutePath; 23 | } 24 | 25 | public String getAbsolutePath() { 26 | return absolutePath; 27 | } 28 | public String getDicName() { 29 | return dicName; 30 | } 31 | 32 | public void setDicName(String dicName) { 33 | this.dicName = dicName; 34 | } 35 | 36 | public String getDicPath() { 37 | return dicPath; 38 | } 39 | 40 | public void setDicPath(String dicPath) { 41 | this.dicPath = dicPath; 42 | } 43 | 44 | public Boolean isRemote() { 45 | return isRemote; 46 | } 47 | 48 | public void setRemote(Boolean remote) { 49 | isRemote = remote; 50 | } 51 | 52 | public DictType getDictType() { 53 | return dictType; 54 | } 55 | 56 | public DicFile setDictType(DictType dictType) { 57 | this.dictType = dictType; 58 | return this; 59 | } 60 | 61 | public enum DictType{ 62 | /**整词*/ 63 | INTACT_WORDS, 64 | /**量词*/ 65 | QUANTIFIER, 66 | /**停词*/ 67 | STOPWORDS, 68 | SUFFIX, 69 | SURNAME; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
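LetterSegmenter runs three independent passes per position: pure English, pure Arabic digits, and the mixed pass, which additionally tolerates the connectors # & + - . @ _ between alphanumerics. For an input like linliangyi2005@gmail.com the emissions are, schematically:

    linliangyi                (TYPE_ENGLISH)
    2005                      (TYPE_ARABIC)
    gmail, com                (TYPE_ENGLISH)
    linliangyi2005@gmail.com  (TYPE_LETTER, from the mixed pass)

In non-smart mode all of them survive; in smart mode the arbitrator keeps the mixed lexeme, since its path covers the most characters (the connectors count toward payloadLength).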
20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Arrays; 29 | import java.util.Map; 30 | import java.util.concurrent.ConcurrentHashMap; 31 | 32 | /** 33 | * 词典树分段,表示词典树的一个分枝 34 | */ 35 | class DictSegment implements Comparable{ 36 | 37 | //公用字典表,存储汉字 38 | private static final Map charMap = new ConcurrentHashMap(16 , 0.95f); 39 | //数组大小上限 40 | private static final int ARRAY_LENGTH_LIMIT = 3; 41 | 42 | 43 | //Map存储结构 44 | private Map childrenMap; 45 | //数组方式存储结构 46 | private DictSegment[] childrenArray; 47 | 48 | 49 | //当前节点上存储的字符 50 | private Character nodeChar; 51 | //当前节点存储的Segment数目 52 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 53 | private int storeSize = 0; 54 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 55 | private int nodeState = 0; 56 | 57 | 58 | DictSegment(Character nodeChar){ 59 | if(nodeChar == null){ 60 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 61 | } 62 | this.nodeChar = nodeChar; 63 | } 64 | 65 | Character getNodeChar() { 66 | return nodeChar; 67 | } 68 | 69 | /* 70 | * 判断是否有下一个节点 71 | */ 72 | boolean hasNextNode(){ 73 | return this.storeSize > 0; 74 | } 75 | 76 | /** 77 | * 匹配词段 78 | * @param charArray 79 | * @return Hit 80 | */ 81 | Hit match(char[] charArray){ 82 | return this.match(charArray , 0 , charArray.length , null); 83 | } 84 | 85 | /** 86 | * 匹配词段 87 | * @param charArray 88 | * @param begin 89 | * @param length 90 | * @return Hit 91 | */ 92 | Hit match(char[] charArray , int begin , int length){ 93 | return this.match(charArray , begin , length , null); 94 | } 95 | 96 | /** 97 | * 匹配词段 98 | * @param charArray 99 | * @param begin 100 | * @param length 101 | * @param searchHit 102 | * @return Hit 103 | */ 104 | Hit match(char[] charArray , int begin , int length , Hit searchHit){ 105 | 106 | if(searchHit == null){ 107 | //如果hit为空,新建 108 | searchHit= new Hit(); 109 | //设置hit的其实文本位置 110 | searchHit.setBegin(begin); 111 | }else{ 112 | //否则要将HIT状态重置 113 | searchHit.setUnmatch(); 114 | } 115 | //设置hit的当前处理位置 116 | searchHit.setEnd(begin); 117 | 118 | Character keyChar = Character.valueOf(charArray[begin]); 119 | DictSegment ds = null; 120 | 121 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题 122 | DictSegment[] segmentArray = this.childrenArray; 123 | Map segmentMap = this.childrenMap; 124 | 125 | //STEP1 在节点中查找keyChar对应的DictSegment 126 | if(segmentArray != null){ 127 | //在数组中查找 128 | DictSegment keySegment = new DictSegment(keyChar); 129 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment); 130 | if(position >= 0){ 131 | ds = segmentArray[position]; 132 | } 133 | 134 | }else if(segmentMap != null){ 135 | //在map中查找 136 | ds = (DictSegment)segmentMap.get(keyChar); 137 | } 138 | 139 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 140 | if(ds != null){ 141 | if(length > 1){ 142 | //词未匹配完,继续往下搜索 143 | return ds.match(charArray, begin + 1 , length - 1 , searchHit); 144 | }else if (length == 1){ 145 | 146 | //搜索最后一个char 147 | if(ds.nodeState == 1){ 148 | //添加HIT状态为完全匹配 149 | searchHit.setMatch(); 150 | } 151 | if(ds.hasNextNode()){ 152 | //添加HIT状态为前缀匹配 153 | searchHit.setPrefix(); 154 | //记录当前位置的DictSegment 155 | searchHit.setMatchedDictSegment(ds); 156 | } 157 | return searchHit; 158 | } 159 | 160 | } 161 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配 162 | return searchHit; 163 | } 164 | 165 | /** 166 | * 加载填充词典片段 167 | * @param charArray 168 | 
*/ 169 | void fillSegment(char[] charArray){ 170 | this.fillSegment(charArray, 0 , charArray.length , 1); 171 | } 172 | 173 | /** 174 | * 屏蔽词典中的一个词 175 | * @param charArray 176 | */ 177 | void disableSegment(char[] charArray){ 178 | this.fillSegment(charArray, 0 , charArray.length , 0); 179 | } 180 | 181 | /** 182 | * 加载填充词典片段 183 | * @param charArray 184 | * @param begin 185 | * @param length 186 | * @param enabled 187 | */ 188 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){ 189 | //获取字典表中的汉字对象 190 | Character beginChar = Character.valueOf(charArray[begin]); 191 | Character keyChar = charMap.get(beginChar); 192 | //字典中没有该字,则将其添加入字典 193 | if(keyChar == null){ 194 | charMap.put(beginChar, beginChar); 195 | keyChar = beginChar; 196 | } 197 | 198 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 199 | DictSegment ds = lookforSegment(keyChar , enabled); 200 | if(ds != null){ 201 | //处理keyChar对应的segment 202 | if(length > 1){ 203 | //词元还没有完全加入词典树 204 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled); 205 | }else if (length == 1){ 206 | //已经是词元的最后一个char,设置当前节点状态为enabled, 207 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 208 | ds.nodeState = enabled; 209 | } 210 | } 211 | 212 | } 213 | 214 | /** 215 | * 查找本节点下对应的keyChar的segment * 216 | * @param keyChar 217 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null 218 | * @return 219 | */ 220 | private DictSegment lookforSegment(Character keyChar , int create){ 221 | 222 | DictSegment ds = null; 223 | 224 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){ 225 | //获取数组容器,如果数组未创建则创建数组 226 | DictSegment[] segmentArray = getChildrenArray(); 227 | //搜寻数组 228 | DictSegment keySegment = new DictSegment(keyChar); 229 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment); 230 | if(position >= 0){ 231 | ds = segmentArray[position]; 232 | } 233 | 234 | //遍历数组后没有找到对应的segment 235 | if(ds == null && create == 1){ 236 | ds = keySegment; 237 | if(this.storeSize < ARRAY_LENGTH_LIMIT){ 238 | //数组容量未满,使用数组存储 239 | segmentArray[this.storeSize] = ds; 240 | //segment数目+1 241 | this.storeSize++; 242 | Arrays.sort(segmentArray , 0 , this.storeSize); 243 | 244 | }else{ 245 | //数组容量已满,切换Map存储 246 | //获取Map容器,如果Map未创建,则创建Map 247 | Map segmentMap = getChildrenMap(); 248 | //将数组中的segment迁移到Map中 249 | migrate(segmentArray , segmentMap); 250 | //存储新的segment 251 | segmentMap.put(keyChar, ds); 252 | //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组 253 | this.storeSize++; 254 | //释放当前的数组引用 255 | this.childrenArray = null; 256 | } 257 | 258 | } 259 | 260 | }else{ 261 | //获取Map容器,如果Map未创建,则创建Map 262 | Map segmentMap = getChildrenMap(); 263 | //搜索Map 264 | ds = (DictSegment)segmentMap.get(keyChar); 265 | if(ds == null && create == 1){ 266 | //构造新的segment 267 | ds = new DictSegment(keyChar); 268 | segmentMap.put(keyChar , ds); 269 | //当前节点存储segment数目+1 270 | this.storeSize ++; 271 | } 272 | } 273 | 274 | return ds; 275 | } 276 | 277 | 278 | /** 279 | * 获取数组容器 280 | * 线程同步方法 281 | */ 282 | private DictSegment[] getChildrenArray(){ 283 | synchronized(this){ 284 | if(this.childrenArray == null){ 285 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 286 | } 287 | } 288 | return this.childrenArray; 289 | } 290 | 291 | /** 292 | * 获取Map容器 293 | * 线程同步方法 294 | */ 295 | private Map getChildrenMap(){ 296 | synchronized(this){ 297 | if(this.childrenMap == null){ 298 | this.childrenMap = new ConcurrentHashMap(ARRAY_LENGTH_LIMIT * 2,0.8f); 299 | } 300 | } 301 | return this.childrenMap; 302 | } 303 | 304 | 
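    /*
     * The children of a node are stored in a hybrid structure: a sorted
     * DictSegment[] while storeSize <= ARRAY_LENGTH_LIMIT, migrated to a
     * ConcurrentHashMap once that limit is crossed. The array path depends on
     * re-sorting after every insert so that Arrays.binarySearch in match()
     * and lookforSegment() stays correct. A minimal sketch of that invariant
     * (hypothetical helper, not referenced anywhere in the original code):
     */
    private static boolean sortedInsertSketch(DictSegment[] children, int storeSize, DictSegment child) {
        if (storeSize >= children.length) {
            return false;                            // array full: caller migrates to the Map
        }
        children[storeSize] = child;                 // append at the tail
        Arrays.sort(children, 0, storeSize + 1);     // restore the order binarySearch relies on
        return true;
    }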
/** 305 | * 将数组中的segment迁移到Map中 306 | * @param segmentArray 307 | */ 308 | private void migrate(DictSegment[] segmentArray , Map segmentMap){ 309 | for(DictSegment segment : segmentArray){ 310 | if(segment != null){ 311 | segmentMap.put(segment.nodeChar, segment); 312 | } 313 | } 314 | } 315 | 316 | /** 317 | * 实现Comparable接口 318 | * @param o 319 | * @return int 320 | */ 321 | public int compareTo(DictSegment o) { 322 | //对当前节点存储的char进行比较 323 | return this.nodeChar.compareTo(o.nodeChar); 324 | } 325 | 326 | } 327 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.io.*; 29 | import java.nio.file.Files; 30 | import java.nio.file.Path; 31 | import java.nio.file.Paths; 32 | import java.util.*; 33 | import java.util.concurrent.Executors; 34 | import java.util.concurrent.ScheduledExecutorService; 35 | import java.util.concurrent.TimeUnit; 36 | import org.apache.logging.log4j.Logger; 37 | import org.elasticsearch.common.io.PathUtils; 38 | import org.wltea.analyzer.help.ESPluginLoggerFactory; 39 | 40 | 41 | /** 42 | * 词典管理类,单子模式 43 | */ 44 | public class Dictionary { 45 | 46 | /* 47 | * 词典单子实例 48 | */ 49 | private static Dictionary singleton; 50 | /* 51 | * 主词典对象 52 | */ 53 | private Map _MainDict = new HashMap<>(4); 54 | /* 55 | * 量词词典 56 | */ 57 | private Map _QuantifierDict = new HashMap<>(4); 58 | /* 59 | * 停止词集合 60 | */ 61 | private Map _StopWords = new HashMap<>(4); 62 | /* 63 | * 姓氏词典 64 | */ 65 | private Map _SurnameDict = new HashMap<>(4); 66 | /* 67 | * 后缀词典 68 | */ 69 | private Map _SuffixDict = new HashMap<>(4); 70 | /* 71 | * 副词,介词词典 72 | */ 73 | private Map _PrepDict = new HashMap<>(4); 74 | 75 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDicMonitor.class.getName()); 76 | 77 | private static ScheduledExecutorService pool; 78 | 79 | private RemoteDicMonitor dicMonitor; 80 | 81 | private Dictionary(){} 82 | 83 | public void loadAllDictFiles(List dicFiles) { 84 | dicFiles.forEach(dicFile -> { 85 | if(needLoad(dicFile)){ 86 | DictSegment dictSegment; 87 | if(dicFile.isRemote()){ 88 | // 从远程加载 89 | dictSegment = RemoteDicMonitor.loadRemoteDic(dicFile); 90 | // 添加监控任务 91 | addMonitorTask(dicFile); 92 | } else { 93 | dictSegment = loadLocalDictFile(dicFile); 94 | } 95 | 
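                // Route the freshly built trie into the per-type registry, keyed by
                // dictionary name, so the match methods below can address it later.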
if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){ 96 | _MainDict.put(dicFile.getDicName(), dictSegment); 97 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){ 98 | _QuantifierDict.put(dicFile.getDicName(), dictSegment); 99 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){ 100 | _StopWords.put(dicFile.getDicName(), dictSegment); 101 | } else if(dicFile.getDictType() == DicFile.DictType.SUFFIX){ 102 | _SuffixDict.put(dicFile.getDicName(), dictSegment); 103 | } else if(dicFile.getDictType() == DicFile.DictType.SURNAME){ 104 | _SurnameDict.put(dicFile.getDicName(), dictSegment); 105 | } 106 | } 107 | }); 108 | } 109 | 110 | private void addMonitorTask(DicFile dicFile) { 111 | if(pool == null){ 112 | synchronized (Dictionary.class){ 113 | if(pool == null){ 114 | // 初始化监控任务 115 | initRemoteMoniter(); 116 | } 117 | } 118 | } 119 | RemoteDicMonitor.RemoteDicFile remoteDicFile = new RemoteDicMonitor.RemoteDicFile(dicFile.getAbsolutePath()); 120 | remoteDicFile.setDicName(dicFile.getDicName()); 121 | remoteDicFile.setDicPath(dicFile.getDicPath()); 122 | remoteDicFile.setDictType(dicFile.getDictType()); 123 | remoteDicFile.setRemote(true); 124 | this.dicMonitor.addFile(remoteDicFile); 125 | } 126 | 127 | private boolean needLoad(DicFile dicFile){ 128 | if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){ 129 | return _MainDict.get(dicFile.getDicName()) == null; 130 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){ 131 | return _QuantifierDict.get(dicFile.getDicName()) == null; 132 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){ 133 | return _StopWords.get(dicFile.getDicName()) == null; 134 | } else if(dicFile.getDictType() == DicFile.DictType.SUFFIX){ 135 | return _SuffixDict.get(dicFile.getDicName()) == null; 136 | } else if(dicFile.getDictType() == DicFile.DictType.SURNAME){ 137 | return _SurnameDict.get(dicFile.getDicName()) == null; 138 | } 139 | return false; 140 | } 141 | 142 | private static DictSegment loadLocalDictFile(DicFile dicFile) { 143 | DictSegment dictSegment = new DictSegment((char) 0); 144 | 145 | // check file exist 146 | // 读取字典文件路径顺序:优先从es的config/analysis-ik/下读取字典文件, 147 | // 如未找到,则从plugin下,分词器对应的目录读取 148 | Path dicFilePath = Paths.get(dicFile.getAbsolutePath(), dicFile.getDicPath()); 149 | if(!Files.exists(dicFilePath)){ 150 | Path configInPluginDir = PathUtils.get(new File(Dictionary.class.getProtectionDomain().getCodeSource().getLocation().getPath()) 151 | .getParent(), "config").toAbsolutePath(); 152 | dicFilePath = configInPluginDir.resolve(dicFile.getDicPath()); 153 | } 154 | // 读取词典文件 155 | try (InputStream is = new FileInputStream(dicFilePath.toFile()); 156 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)){ 157 | String word = br.readLine(); 158 | if (word != null) { 159 | if (word.startsWith("\uFEFF")) 160 | word = word.substring(1); 161 | for (; word != null; word = br.readLine()) { 162 | word = word.trim(); 163 | if (word.isEmpty()) continue; 164 | dictSegment.fillSegment(word.toCharArray()); 165 | } 166 | } 167 | } catch (FileNotFoundException e) { 168 | logger.error("ik-analyzer: " + dicFile.getDicName() + " not found", e); 169 | throw new RuntimeException("ik-analyzer: " + dicFile.getDicName() + " not found!!!", e); 170 | } catch (IOException e) { 171 | logger.error("ik-analyzer: " + dicFile.getDicName() + " loading failed", e); 172 | } 173 | return dictSegment; 174 | } 175 | 176 | /** 177 | * 获取词典单子实例 178 | * 179 | * @return Dictionary 单例对象 180 | */ 
	 */
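	// NOTE: 'singleton' is not volatile, so this double-checked locking can, under
	// the Java memory model, publish a partially constructed instance to another
	// thread; declaring the field volatile (or using an initialization-on-demand
	// holder class) makes the pattern safe.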
181 | public static Dictionary getSingleton() { 182 | if (singleton == null) { 183 | synchronized (Dictionary.class){ 184 | if(singleton == null){ 185 | singleton = new Dictionary(); 186 | } 187 | } 188 | } 189 | return singleton; 190 | } 191 | 192 | public static void initRemoteMoniter(){ 193 | // 开启远程词典文件监控任务 194 | singleton.dicMonitor = new RemoteDicMonitor(); 195 | pool = Executors.newScheduledThreadPool(1); 196 | pool.scheduleAtFixedRate(singleton.dicMonitor, 10, 60, TimeUnit.SECONDS); 197 | } 198 | 199 | 200 | /** 201 | * 批量加载新词条 202 | * 203 | * @param words 204 | * Collection词条列表 205 | */ 206 | public void addWords(String fileName, Collection words) { 207 | if (words != null) { 208 | for (String word : words) { 209 | if (word != null) { 210 | // 批量加载词条到主内存词典中 211 | singleton._MainDict.get(fileName).fillSegment(word.trim().toCharArray()); 212 | } 213 | } 214 | } 215 | } 216 | 217 | /** 218 | * 批量移除(屏蔽)词条 219 | */ 220 | public void disableWords(String fileName, Collection words) { 221 | if (words != null) { 222 | for (String word : words) { 223 | if (word != null) { 224 | // 批量屏蔽词条 225 | singleton._MainDict.get(fileName).disableSegment(word.trim().toCharArray()); 226 | } 227 | } 228 | } 229 | } 230 | 231 | /** 232 | * 检索匹配主词典 233 | * 234 | * @return Hit 匹配结果描述 235 | */ 236 | public Hit matchInMainDict(String fileName, char[] charArray) { 237 | return singleton._MainDict.get(fileName).match(charArray); 238 | } 239 | 240 | /** 241 | * 检索匹配主词典 242 | * 243 | * @return Hit 匹配结果描述 244 | */ 245 | public List matchInMainDict(List dicNames, char[] charArray, int begin, int length) { 246 | ArrayList tmpHits = new ArrayList(dicNames.size()); 247 | for(String dicName : dicNames){ 248 | // 成词优先级比前缀优先级高 249 | Hit tmpHit = singleton._MainDict.get(dicName).match(charArray, begin, length); 250 | if(tmpHit.isMatch() || tmpHit.isPrefix()) tmpHits.add(tmpHit); 251 | } 252 | return tmpHits; 253 | } 254 | 255 | /** 256 | * 检索匹配量词词典 257 | * 258 | * @return Hit 匹配结果描述 259 | */ 260 | public Hit matchInQuantifierDict(List fileNames, char[] charArray, int begin, int length) { 261 | Hit tmpHit = new Hit(); 262 | for(String fileName : fileNames){ 263 | // 成词优先级比前缀优先级高 264 | tmpHit = singleton._QuantifierDict.get(fileName).match(charArray, begin, length); 265 | if(tmpHit.isMatch() || tmpHit.isPrefix()) return tmpHit; 266 | } 267 | return tmpHit; 268 | } 269 | 270 | /** 271 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 272 | * 273 | * @return Hit 274 | */ 275 | public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { 276 | DictSegment ds = matchedHit.getMatchedDictSegment(); 277 | return ds.match(charArray, currentIndex, 1, matchedHit); 278 | } 279 | 280 | /** 281 | * 判断是否是停止词 282 | * 283 | * @return boolean 284 | */ 285 | public boolean isStopWord(List fileNames, char[] charArray, int begin, int length) { 286 | for(String fileName : fileNames){ 287 | // 满足任意词典里的停词,则认为是停词,都不满足,则不是停词 288 | if(singleton._StopWords.get(fileName).match(charArray, begin, length).isMatch()) 289 | return true; 290 | } 291 | return false; 292 | } 293 | 294 | /** 295 | * 检索匹配姓氏词典 296 | * @param charArray 297 | * @param begin 298 | * @param length 299 | * @return Hit 匹配结果描述 300 | */ 301 | public static Hit matchInSurnameDict(String fileName, char[] charArray , int begin, int length){ 302 | return singleton._SurnameDict.get(fileName).match(charArray, begin, length); 303 | } 304 | 305 | /** 306 | * 检索匹配在后缀词典 307 | * @param charArray 308 | * @param begin 309 | * @param length 310 | * @return Hit 匹配结果描述 311 | */ 312 | public static 
Hit matchInSuffixDict(String fileName, char[] charArray , int begin, int length){ 313 | return singleton._SuffixDict.get(fileName).match(charArray, begin, length); 314 | } 315 | 316 | /** 317 | * 检索匹配介词、副词词典 318 | * @param charArray 319 | * @param begin 320 | * @param length 321 | * @return Hit 匹配结果描述 322 | */ 323 | public static Hit matchInPrepDict(String fileName, char[] charArray , int begin, int length){ 324 | return singleton._PrepDict.get(fileName).match(charArray, begin, length); 325 | } 326 | } 327 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | //Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | //Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | //Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //该HIT当前状态,默认未匹配 41 | private int hitState = UNMATCH; 42 | 43 | //记录词典匹配过程中,当前匹配到的词典分支节点 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * 词段开始位置 47 | */ 48 | private int begin; 49 | /* 50 | * 词段的结束位置 51 | */ 52 | private int end; 53 | 54 | 55 | /** 56 | * 判断是否完全匹配 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /** 62 | * 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /** 69 | * 判断是否是词的前缀 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /** 81 | * 判断是否是不匹配 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /** 87 | * 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | 
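
Hit stores its state as independent bit flags (MATCH = 0x01, PREFIX = 0x10), so a single lookup can report that a word is simultaneously a complete entry and the prefix of a longer one, which is what happens when a dictionary contains, say, both 中华 and 中华人民. A minimal sketch of how the flags combine (hypothetical standalone usage, not part of the source tree):

    Hit hit = new Hit();
    hit.setMatch();                          // hitState = 0x01
    hit.setPrefix();                         // hitState = 0x01 | 0x10 = 0x11
    assert hit.isMatch() && hit.isPrefix();  // both conditions hold at once
    hit.setUnmatch();                        // resets the whole state, not a single bit
    assert hit.isUnmatch();
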
-------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/RemoteDicMonitor.java: --------------------------------------------------------------------------------
1 | package org.wltea.analyzer.dic;
2 | 
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStreamReader;
6 | import java.security.AccessController;
7 | import java.security.PrivilegedAction;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.concurrent.ConcurrentLinkedQueue;
11 | 
12 | import org.apache.http.Header;
13 | import org.apache.http.HttpEntity;
14 | import org.apache.http.client.config.RequestConfig;
15 | import org.apache.http.client.methods.CloseableHttpResponse;
16 | import org.apache.http.client.methods.HttpGet;
17 | import org.apache.http.client.methods.HttpHead;
18 | import org.apache.http.impl.client.CloseableHttpClient;
19 | import org.apache.http.impl.client.HttpClients;
20 | import org.apache.logging.log4j.Logger;
21 | import org.elasticsearch.SpecialPermission;
22 | import org.wltea.analyzer.help.ESPluginLoggerFactory;
23 | 
24 | public class RemoteDicMonitor implements Runnable {
25 | 
26 |     private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDicMonitor.class.getName());
27 | 
28 |     private static CloseableHttpClient httpclient = HttpClients.createDefault();
29 | 
30 |     public static class RemoteDicFile extends DicFile {
31 |         /** 上次更改时间 */
32 |         private String last_modified;
33 |         /** 资源属性 */
34 |         private String eTags;
35 | 
36 |         public RemoteDicFile(String absolutePath) {
37 |             super(absolutePath);
38 |         }
39 | 
40 |         public String getLast_modified() {
41 |             return last_modified;
42 |         }
43 | 
44 |         public void setLast_modified(String last_modified) {
45 |             this.last_modified = last_modified;
46 |         }
47 | 
48 |         public String getETags() {
49 |             return eTags;
50 |         }
51 | 
52 |         public void setETags(String eTags) {
53 |             this.eTags = eTags;
54 |         }
55 |     }
56 | 
57 |     /*
58 |      * 监控的远程词典文件队列
59 |      */
60 |     private ConcurrentLinkedQueue<RemoteDicFile> monitorFiles = new ConcurrentLinkedQueue<>();
61 | 
62 |     public void addFile(RemoteDicFile dicFile){
63 |         boolean hasAdd = monitorFiles.stream().anyMatch(r -> r.getDicName().equals(dicFile.getDicName()));
64 |         if(!hasAdd) {
65 |             monitorFiles.offer(dicFile);
66 |         }
67 |     }
68 | 
69 |     public void run() {
70 |         SpecialPermission.check();
71 |         monitorFiles.forEach(dicFile -> {
72 |             AccessController.doPrivileged((PrivilegedAction<Void>) () -> {
73 |                 this.runUnprivileged(dicFile);
74 |                 return null;
75 |             });
76 |         });
77 |     }
78 | 
79 |     /**
80 |      * 监控流程:
81 |      * ①向词库服务器发送Head请求
82 |      * ②从响应中获取Last-Modified、ETag字段值,判断是否变化
83 |      * ③如果未变化,休眠1min,返回第①步
84 |      * ④如果有变化,重新加载词典
85 |      * ⑤休眠1min,返回第①步
86 |      */
87 | 
88 |     public void runUnprivileged(RemoteDicFile dicFile) {
89 | 
90 |         //超时设置
91 |         RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000)
92 |                 .setConnectTimeout(10 * 1000).setSocketTimeout(15 * 1000).build();
93 | 
94 |         HttpHead httpHead = new HttpHead(dicFile.getDicPath());
95 |         httpHead.setConfig(rc);
96 | 
97 |         //设置请求头
98 |         if (dicFile.getLast_modified() != null) {
99 |             httpHead.setHeader("If-Modified-Since", dicFile.getLast_modified());
100 |         }
101 |         if (dicFile.getETags() != null) {
102 |             httpHead.setHeader("If-None-Match", dicFile.getETags());
103 |         }
104 | 
105 |         CloseableHttpResponse response = null;
106 |         try {
107 | 
108 |             response = httpclient.execute(httpHead);
109 | 
110 |             //返回200 才做操作
111 |             if (response.getStatusLine().getStatusCode() == 200) {
112 | 
113 |                 if (((response.getLastHeader("Last-Modified") != null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(dicFile.getLast_modified()))
114 |                         || ((response.getLastHeader("ETag") != null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(dicFile.eTags))) {
115 | 
116 |                     // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
117 |                     List<String> words = getRemoteWords(dicFile.getDicPath());
118 |                     Dictionary.getSingleton().addWords(dicFile.getDicName(), words);
119 |                     dicFile.setLast_modified(response.getLastHeader("Last-Modified") == null ? null : response.getLastHeader("Last-Modified").getValue());
120 |                     dicFile.setETags(response.getLastHeader("ETag") == null ? null : response.getLastHeader("ETag").getValue());
121 |                 }
122 |             } else if (response.getStatusLine().getStatusCode() == 304) {
123 |                 //没有修改,不做操作
124 |                 //noop
125 |             } else {
126 |                 logger.info("remote_ext_dict {} return bad code {}", dicFile.getDicPath(), response.getStatusLine().getStatusCode());
127 |             }
128 | 
129 |         } catch (Exception e) {
130 |             logger.error("remote_ext_dict {} error!", dicFile.getDicPath(), e);
131 |         } finally {
132 |             try {
133 |                 if (response != null) {
134 |                     response.close();
135 |                 }
136 |             } catch (IOException e) {
137 |                 logger.error(e.getMessage(), e);
138 |             }
139 |         }
140 |     }
141 | 
142 |     public static DictSegment loadRemoteDic(DicFile dicFile){
143 |         logger.info("[Dict Loading] " + dicFile.getDicPath());
144 |         DictSegment dictSegment = new DictSegment((char) 0);
145 |         List<String> lists = getRemoteWords(dicFile.getDicPath());
146 |         // 如果找不到扩展的字典,则忽略
147 |         if (lists == null) {
148 |             logger.error("[Dict Loading] " + dicFile.getDicPath() + "加载失败");
149 |             return dictSegment;
150 |         }
151 |         for (String theWord : lists) {
152 |             if (theWord != null && !"".equals(theWord.trim())) {
153 |                 logger.info(theWord);
154 |                 dictSegment.fillSegment(theWord.trim().toLowerCase().toCharArray());
155 |             }
156 |         }
157 |         return dictSegment;
158 |     }
159 | 
160 |     private static List<String> getRemoteWords(String location) {
161 |         SpecialPermission.check();
162 |         return AccessController.doPrivileged((PrivilegedAction<List<String>>) () -> {
163 |             return getRemoteWordsUnprivileged(location);
164 |         });
165 |     }
166 | 
167 | 
168 |     /**
169 |      * 从远程服务器上下载自定义词条
170 |      */
171 |     public static List<String> getRemoteWordsUnprivileged(String location) {
172 | 
173 |         List<String> buffer = new ArrayList<>();
174 |         RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
175 |                 .setSocketTimeout(60 * 1000).build();
176 |         CloseableHttpClient httpclient = HttpClients.createDefault();
177 |         CloseableHttpResponse response;
178 |         BufferedReader in;
179 |         HttpGet get = new HttpGet(location);
180 |         get.setConfig(rc);
181 |         try {
182 |             response = httpclient.execute(get);
183 |             if (response.getStatusLine().getStatusCode() == 200) {
184 | 
185 |                 String charset = "UTF-8";
186 |                 // 获取编码,默认为utf-8
187 |                 HttpEntity entity = response.getEntity();
188 |                 if (entity != null) {
189 |                     Header contentType = entity.getContentType();
190 |                     if (contentType != null && contentType.getValue() != null) {
191 |                         String typeValue = contentType.getValue();
192 |                         if (typeValue.contains("charset=")) {
193 |                             charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
194 |                         }
195 |                     }
196 | 
197 |                     if (entity.getContentLength() > 0) {
198 |                         in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
199 |                         String line;
200 |                         while ((line = in.readLine()) != null) {
201 |                             buffer.add(line);
202 |                         }
203 |                         in.close();
204 |                         response.close();
205 |                         return buffer;
206 |                     }
207 |                 }
208 |             }
209 | 
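            // This close covers the fall-through paths (non-200 status, missing entity,
            // or unknown content length). In the success path above, the reader and
            // response are closed only after a complete read, so an exception while
            // reading can leak both; try-with-resources would make this airtight.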
            response.close();
210 |         } catch (IllegalStateException | IOException e) {
211 |             logger.error("getRemoteWords {} error", location, e);
212 |         }
213 |         return buffer;
214 |     }
215 | }
216 | 
-------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/help/CharacterHelper.java: --------------------------------------------------------------------------------
1 | 
2 | package org.wltea.analyzer.help;
3 | 
4 | public class CharacterHelper {
5 | 
6 |     public static boolean isSpaceLetter(char input){
7 |         return input == 8 || input == 9
8 |                 || input == 10 || input == 13
9 |                 || input == 32 || input == 160;
10 |     }
11 | 
12 |     public static boolean isEnglishLetter(char input){
13 |         return (input >= 'a' && input <= 'z')
14 |                 || (input >= 'A' && input <= 'Z');
15 |     }
16 | 
17 |     public static boolean isArabicNumber(char input){
18 |         return input >= '0' && input <= '9';
19 |     }
20 | 
21 |     public static boolean isCJKCharacter(char input){
22 |         Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
23 |         return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
24 |                 || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
25 |                 || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
26 |                 || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
27 |                 || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
28 |                 || ub == Character.UnicodeBlock.HANGUL_JAMO
29 |                 || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
30 |                 || ub == Character.UnicodeBlock.HIRAGANA
31 |                 || ub == Character.UnicodeBlock.KATAKANA
32 |                 || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
33 |     }
34 | 
35 |     public static char regularize(char input){
36 |         if (input == 12288) {                        // full-width space -> ASCII space
37 |             input = (char) 32;
38 |         } else if (input > 65280 && input < 65375) { // full-width form -> half-width form
39 |             input = (char) (input - 65248);
40 |         } else if (input >= 'A' && input <= 'Z') {   // upper case -> lower case
41 |             input += 32;
42 |         }
43 |         return input;
44 |     }
45 | }
-------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/help/ESPluginLoggerFactory.java: --------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 | 
3 | import org.apache.logging.log4j.LogManager;
4 | import org.apache.logging.log4j.Logger;
5 | import org.apache.logging.log4j.spi.ExtendedLogger;
6 | 
7 | public class ESPluginLoggerFactory {
8 | 
9 |     private ESPluginLoggerFactory() {
10 |     }
11 | 
12 |     public static Logger getLogger(String name) {
13 |         return getLogger("", LogManager.getLogger(name));
14 |     }
15 | 
16 |     public static Logger getLogger(String prefix, String name) {
17 |         return getLogger(prefix, LogManager.getLogger(name));
18 |     }
19 | 
20 |     public static Logger getLogger(String prefix, Class<?> clazz) {
21 |         return getLogger(prefix, LogManager.getLogger(clazz.getName()));
22 |     }
23 | 
24 |     public static Logger getLogger(String prefix, Logger logger) {
25 |         return (Logger)(prefix != null && prefix.length() != 0 ?
new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/help/PrefixPluginLogger.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.help; 2 | 3 | import org.apache.logging.log4j.Level; 4 | import org.apache.logging.log4j.Marker; 5 | import org.apache.logging.log4j.MarkerManager; 6 | import org.apache.logging.log4j.message.Message; 7 | import org.apache.logging.log4j.message.MessageFactory; 8 | import org.apache.logging.log4j.spi.ExtendedLogger; 9 | import org.apache.logging.log4j.spi.ExtendedLoggerWrapper; 10 | 11 | import java.util.WeakHashMap; 12 | 13 | public class PrefixPluginLogger extends ExtendedLoggerWrapper { 14 | private static final WeakHashMap markers = new WeakHashMap(); 15 | private final Marker marker; 16 | 17 | static int markersSize() { 18 | return markers.size(); 19 | } 20 | 21 | public String prefix() { 22 | return this.marker.getName(); 23 | } 24 | 25 | PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) { 26 | super(logger, name, (MessageFactory) null); 27 | String actualPrefix = prefix == null ? "" : prefix; 28 | WeakHashMap var6 = markers; 29 | MarkerManager.Log4jMarker actualMarker; 30 | synchronized (markers) { 31 | MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker) markers.get(actualPrefix); 32 | if (maybeMarker == null) { 33 | actualMarker = new MarkerManager.Log4jMarker(actualPrefix); 34 | markers.put(new String(actualPrefix), actualMarker); 35 | } else { 36 | actualMarker = maybeMarker; 37 | } 38 | } 39 | 40 | this.marker = (Marker) actualMarker; 41 | } 42 | 43 | public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) { 44 | assert marker == null; 45 | 46 | super.logMessage(fqcn, level, this.marker, message, t); 47 | } 48 | } -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/help/Sleep.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer.help; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | 5 | public class Sleep { 6 | 7 | private static final Logger logger = ESPluginLoggerFactory.getLogger(Sleep.class.getName()); 8 | 9 | public enum Type {MSEC, SEC, MIN, HOUR} 10 | 11 | ; 12 | 13 | public static void sleep(Type type, int num) { 14 | try { 15 | switch (type) { 16 | case MSEC: 17 | Thread.sleep(num); 18 | return; 19 | case SEC: 20 | Thread.sleep(num * 1000); 21 | return; 22 | case MIN: 23 | Thread.sleep(num * 60 * 1000); 24 | return; 25 | case HOUR: 26 | Thread.sleep(num * 60 * 60 * 1000); 27 | return; 28 | default: 29 | System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一"); 30 | return; 31 | } 32 | } catch (InterruptedException e) { 33 | logger.error(e.getMessage(), e); 34 | } 35 | } 36 | 37 | 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 
8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.lucene; 26 | 27 | import org.apache.lucene.analysis.Analyzer; 28 | import org.apache.lucene.analysis.Tokenizer; 29 | import org.wltea.analyzer.cfg.Configuration; 30 | 31 | /** 32 | * IK分词器,Lucene Analyzer接口实现 33 | * 兼容Lucene 4.0版本 34 | */ 35 | public final class IKAnalyzer extends Analyzer{ 36 | 37 | private Configuration configuration; 38 | 39 | /** 40 | * IK分词器Lucene Analyzer接口实现类 41 | * 42 | * 默认细粒度切分算法 43 | */ 44 | private IKAnalyzer(){ 45 | } 46 | 47 | /** 48 | * IK分词器Lucene Analyzer接口实现类 49 | * 50 | * @param configuration IK配置 51 | */ 52 | public IKAnalyzer(Configuration configuration){ 53 | super(); 54 | this.configuration = configuration; 55 | } 56 | 57 | 58 | /** 59 | * 重载Analyzer接口,构造分词组件 60 | */ 61 | @Override 62 | protected TokenStreamComponents createComponents(String fieldName) { 63 | Tokenizer _IKTokenizer = new IKTokenizer(configuration); 64 | return new TokenStreamComponents(_IKTokenizer); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import org.apache.lucene.analysis.Tokenizer; 30 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 31 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 32 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 33 | import org.wltea.analyzer.cfg.Configuration; 34 | import org.wltea.analyzer.core.IKSegmenter; 35 | import org.wltea.analyzer.core.Lexeme; 36 | 37 | import java.io.IOException; 38 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 39 | import org.wltea.analyzer.dic.Dictionary; 40 | 41 | /** 42 | * IK分词器 Lucene Tokenizer适配器类 43 | * 兼容Lucene 4.0版本 44 | */ 45 | public final class IKTokenizer extends Tokenizer { 46 | 47 | //IK分词器实现 48 | private IKSegmenter _IKImplement; 49 | 50 | //词元文本属性 51 | private final CharTermAttribute termAtt; 52 | //词元位移属性 53 | private final OffsetAttribute offsetAtt; 54 | //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 55 | private final TypeAttribute typeAtt; 56 | //记录最后一个词元的结束位置 57 | private int endPosition; 58 | 59 | private int skippedPositions; 60 | 61 | private PositionIncrementAttribute posIncrAtt; 62 | 63 | 64 | /** 65 | * Lucene 4.0 Tokenizer适配器类构造函数 66 | */ 67 | public IKTokenizer(Configuration configuration){ 68 | super(); 69 | offsetAtt = addAttribute(OffsetAttribute.class); 70 | termAtt = addAttribute(CharTermAttribute.class); 71 | typeAtt = addAttribute(TypeAttribute.class); 72 | posIncrAtt = addAttribute(PositionIncrementAttribute.class); 73 | // 初始化词典 74 | Dictionary.getSingleton().loadAllDictFiles(configuration.getDicFiles()); 75 | // 创建分词器 76 | _IKImplement = new IKSegmenter(input,configuration); 77 | } 78 | 79 | /* (non-Javadoc) 80 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 81 | */ 82 | @Override 83 | public boolean incrementToken() throws IOException { 84 | //清除所有的词元属性 85 | clearAttributes(); 86 | skippedPositions = 0; 87 | 88 | Lexeme nextLexeme = _IKImplement.next(); 89 | if(nextLexeme != null){ 90 | posIncrAtt.setPositionIncrement(skippedPositions +1 ); 91 | 92 | //将Lexeme转成Attributes 93 | //设置词元文本 94 | termAtt.append(nextLexeme.getLexemeText()); 95 | //设置词元长度 96 | termAtt.setLength(nextLexeme.getLength()); 97 | //设置词元位移 98 | offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition())); 99 | 100 | //记录分词的最后位置 101 | endPosition = nextLexeme.getEndPosition(); 102 | //记录词元分类 103 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 104 | //返会true告知还有下个词元 105 | return true; 106 | } 107 | //返会false告知词元输出完毕 108 | return false; 109 | } 110 | 111 | /* 112 | * (non-Javadoc) 113 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 114 | */ 115 | @Override 116 | public void reset() throws IOException { 117 | super.reset(); 118 | _IKImplement.reset(input); 119 | skippedPositions = 0; 120 | } 121 | 122 | @Override 123 | public final void end() throws IOException { 124 | super.end(); 125 | // set final offset 126 | int finalOffset = correctOffset(this.endPosition); 127 | offsetAtt.setOffset(finalOffset, finalOffset); 128 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/resources/plugin-descriptor.properties: 
-------------------------------------------------------------------------------- 1 | # Elasticsearch plugin descriptor file 2 | # This file must exist as 'plugin-descriptor.properties' at 3 | # the root directory of all plugins. 4 | # 5 | # A plugin can be 'site', 'jvm', or both. 6 | # 7 | ### example site plugin for "foo": 8 | # 9 | # foo.zip <-- zip file for the plugin, with this structure: 10 | # _site/ <-- the contents that will be served 11 | # plugin-descriptor.properties <-- example contents below: 12 | # 13 | # site=true 14 | # description=My cool plugin 15 | # version=1.0 16 | # 17 | ### example jvm plugin for "foo" 18 | # 19 | # foo.zip <-- zip file for the plugin, with this structure: 20 | # .jar <-- classes, resources, dependencies 21 | # .jar <-- any number of jars 22 | # plugin-descriptor.properties <-- example contents below: 23 | # 24 | # jvm=true 25 | # classname=foo.bar.BazPlugin 26 | # description=My cool plugin 27 | # version=2.0.0-rc1 28 | # elasticsearch.version=2.0 29 | # java.version=1.7 30 | # 31 | ### mandatory elements for all plugins: 32 | # 33 | # 'description': simple summary of the plugin 34 | description=${project.description} 35 | # 36 | # 'version': plugin's version 37 | version=${project.version} 38 | # 39 | # 'name': the plugin name 40 | name=${elasticsearch.plugin.name} 41 | # 42 | # 'classname': the name of the class to load, fully-qualified. 43 | classname=${elasticsearch.plugin.classname} 44 | # 45 | # 'java.version' version of java the code is built against 46 | # use the system property java.specification.version 47 | # version string must be a sequence of nonnegative decimal integers 48 | # separated by "."'s and may have leading zeros 49 | java.version=${maven.compiler.target} 50 | # 51 | # 'elasticsearch.version' version of elasticsearch compiled against 52 | # You will have to release a new version of the plugin for each new 53 | # elasticsearch release. This version is checked when the plugin 54 | # is loaded so Elasticsearch will refuse to start in the presence of 55 | # plugins with the incorrect elasticsearch.version. 
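# (The check is strict: for example, a plugin descriptor carrying
# elasticsearch.version=6.8.2 will be refused at load time by any other
# Elasticsearch release.)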
56 | elasticsearch.version=${elasticsearch.version} 57 | -------------------------------------------------------------------------------- /src/main/resources/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant { 2 | // needed because of the hot reload functionality 3 | permission java.net.SocketPermission "*", "connect,resolve"; 4 | }; -------------------------------------------------------------------------------- /src/test/java/org/wltea/analyzer/TokenizerTest.java: -------------------------------------------------------------------------------- 1 | package org.wltea.analyzer; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.junit.Test; 7 | import org.wltea.analyzer.cfg.Configuration; 8 | import org.wltea.analyzer.dic.RemoteDicMonitor; 9 | import org.wltea.analyzer.lucene.IKAnalyzer; 10 | 11 | import java.io.IOException; 12 | import java.io.StringReader; 13 | import java.util.Arrays; 14 | 15 | public class TokenizerTest { 16 | 17 | @Test 18 | public void testAnalyzer() throws IOException { 19 | Settings settings = Settings.builder() 20 | .put("use_smart", false) 21 | .put("enable_lowercase", false) 22 | .put("enable_remote_dict", false) 23 | .putList("ext_dic_main", Arrays.asList("http://intact.dic")) 24 | .build(); 25 | Configuration configuration=new Configuration(null,settings) ; 26 | 27 | IKAnalyzer ik =new IKAnalyzer(configuration); 28 | 29 | 30 | // String t = "连身裙"; 31 | // String t = "分词器"; 32 | String t = "双肩包"; 33 | TokenStream tokenStream = ik.tokenStream("", new StringReader(t)); 34 | tokenStream.reset(); 35 | CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); 36 | while(tokenStream.incrementToken()){ 37 | System.out.println(termAtt); 38 | } 39 | tokenStream.end(); 40 | tokenStream.close(); 41 | } 42 | 43 | @Test 44 | public void testRemoteFileLoad(){ 45 | 46 | RemoteDicMonitor.RemoteDicFile remoteDicFile = new RemoteDicMonitor.RemoteDicFile(""); 47 | remoteDicFile.setDicPath("http://intact.dic"); 48 | 49 | RemoteDicMonitor monitor = new RemoteDicMonitor(); 50 | System.out.println(monitor.getRemoteWordsUnprivileged(remoteDicFile.getDicPath())); 51 | 52 | monitor.runUnprivileged(remoteDicFile); 53 | } 54 | } 55 | --------------------------------------------------------------------------------
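
The HEAD-then-GET protocol that RemoteDicMonitor implements (If-Modified-Since / If-None-Match on the HEAD poll, a full download when the validators change) is easy to exercise locally. Below is a minimal sketch of a word-list server built on the JDK's com.sun.net.httpserver; the class name, port, path, and word list are hypothetical and not part of the plugin:

    import com.sun.net.httpserver.HttpServer;
    import java.io.OutputStream;
    import java.net.InetSocketAddress;
    import java.nio.charset.StandardCharsets;

    public class RemoteDicFixture {
        public static void main(String[] args) throws Exception {
            byte[] words = "双肩包\n连身裙\n".getBytes(StandardCharsets.UTF_8);
            HttpServer server = HttpServer.create(new InetSocketAddress(8080), 0);
            server.createContext("/intact.dic", exchange -> {
                // Fixed validators; bump these to make the monitor detect a change.
                exchange.getResponseHeaders().add("Last-Modified", "Wed, 01 Jan 2020 00:00:00 GMT");
                exchange.getResponseHeaders().add("ETag", "\"v1\"");
                exchange.getResponseHeaders().add("Content-Type", "text/plain; charset=UTF-8");
                if ("HEAD".equals(exchange.getRequestMethod())) {
                    exchange.sendResponseHeaders(200, -1);      // headers only, no body
                } else {
                    // An explicit length matters: getRemoteWordsUnprivileged skips
                    // bodies whose content length is not known to be positive.
                    exchange.sendResponseHeaders(200, words.length);
                    try (OutputStream os = exchange.getResponseBody()) {
                        os.write(words);
                    }
                }
                exchange.close();
            });
            server.start();  // point a RemoteDicFile's dicPath at http://localhost:8080/intact.dic
        }
    }
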