├── .travis.yml
├── LICENSE.txt
├── README.md
├── config
├── extra_main.dic
├── extra_single_word.dic
├── extra_single_word_full.dic
├── extra_single_word_low_freq.dic
├── extra_stopword.dic
├── extra_test.dic
├── main.dic
├── preposition.dic
├── quantifier.dic
├── stopword.dic
├── suffix.dic
└── surname.dic
├── index
├── _1.cfe
├── _1.cfs
├── _1.si
├── _3.cfe
├── _3.cfs
├── _3.si
├── segments_2
├── segments_4
└── write.lock
├── licenses
├── lucene-LICENSE.txt
└── lucene-NOTICE.txt
├── pom.xml
└── src
├── main
├── assemblies
│ └── plugin.xml
├── java
│ └── org
│ │ ├── elasticsearch
│ │ ├── index
│ │ │ └── analysis
│ │ │ │ ├── IkAnalyzerProvider.java
│ │ │ │ └── IkTokenizerFactory.java
│ │ └── plugin
│ │ │ └── analysis
│ │ │ └── ik
│ │ │ └── AnalysisIkPlugin.java
│ │ └── wltea
│ │ └── analyzer
│ │ ├── cfg
│ │ └── Configuration.java
│ │ ├── core
│ │ ├── AnalyzeContext.java
│ │ ├── CharacterUtil.java
│ │ ├── IKArbitrator.java
│ │ ├── IKSegmenter.java
│ │ ├── Lexeme.java
│ │ ├── LexemePath.java
│ │ ├── QuickSortSet.java
│ │ └── segmenter
│ │ │ ├── CJKSegmenter.java
│ │ │ ├── CN_QuantifierSegmenter.java
│ │ │ ├── ISegmenter.java
│ │ │ └── LetterSegmenter.java
│ │ ├── dic
│ │ ├── DicFile.java
│ │ ├── DictSegment.java
│ │ ├── Dictionary.java
│ │ ├── Hit.java
│ │ └── RemoteDicMonitor.java
│ │ ├── help
│ │ ├── CharacterHelper.java
│ │ ├── ESPluginLoggerFactory.java
│ │ ├── PrefixPluginLogger.java
│ │ └── Sleep.java
│ │ └── lucene
│ │ ├── IKAnalyzer.java
│ │ └── IKTokenizer.java
└── resources
│ ├── plugin-descriptor.properties
│ └── plugin-security.policy
└── test
└── java
└── org
└── wltea
└── analyzer
└── TokenizerTest.java
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 | jdk:
3 | - oraclejdk8
4 | install: true
5 | script:
6 | - sudo apt-get update && sudo apt-get install oracle-java8-installer
7 | - java -version
8 | - mvn clean package
9 | language: java
10 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | IK Analysis for Elasticsearch
2 | =============================
3 |
4 | The IK Analysis plugin integrates Lucene IK analyzer (http://code.google.com/p/ik-analyzer/) into elasticsearch, and supports customized dictionaries.
5 |
6 | Analyzer: `ik_smart` , `ik_max_word` , Tokenizer: `ik_smart` , `ik_max_word`
7 |
8 | 说明
9 | -----
10 | 该分词器是基于github medcl的分词器(https://github.com/medcl/elasticsearch-analysis-ik)改造而来,
11 | 改造点如下:
12 |
13 | 1、改造前,所有索引使用一个词库,没办法针对不同索引添加不同词库,
14 | 改造后,词库的加载由索引中自定义的analyzer配置时,设置的词库而决定
15 | 从而实现了,不同业务的索引使用不同的词库
16 |
17 | 2、优化了Dictionary类的代码结构,使得逻辑更清晰,将原来600行的代码缩减到300行,
18 | 优化比较死板的字典加载机制,不再读取IKAnalyzer.cfg.xml,而直接由用户索引analyzer创建时配置
19 |
20 | 3、优化了Remote Dictionary的加载机制
21 |
22 | 4、去掉了分词器中不必要的synchronized锁,提高了性能
23 |
24 | 5、读取字典文件路径顺序:优先从es的config/analysis-ik/下读取字典文件,
25 | 如未找到,则从plugin下,分词器对应的目录读取
26 |
27 |
28 | ### Dictionary Configuration
29 |
30 | `IKAnalyzer.cfg.xml` 配置文件不再使用,所有自定义扩展词库需要在定义分词器时设置,
31 | 例如
32 | ###
33 | ```
34 | {
35 | "settings": {
36 | "number_of_shards": 1,
37 | "number_of_replicas": 0,
38 | "analysis": {
39 | "tokenizer": {
40 | "my_tokenizer": {
41 | "type": "ik_max_word",
42 | "ext_dic_main": [
43 | "https://xxx.com/sss/ssss.dic",// 该字典是一个远程字典,路径以http或https打头
44 | "dddd.dic"// 该词典文件是ES服务器上的本地文件,需要放到IK的config目录下
45 | ]
46 | }
47 | },
48 | "analyzer":{
49 | "tokenizer":"my_tokenizer",
50 | "filter":["lowercase", "my_stemmer"]
51 | }
52 | }
53 | }
54 | }
55 | ```
56 | Versions
57 | --------
58 |
59 | IK version | ES version
60 | -----------|-----------
61 | master | 6.x
62 |
63 | 其它版本,请自己修改version,打包即可
64 |
65 | Install
66 | -------
67 |
68 | 1.download or compile
69 |
70 | * optional 1 - download pre-build package from here: https://github.com/medcl/elasticsearch-analysis-ik/releases
71 |
72 | create plugin folder `cd your-es-root/plugins/ && mkdir ik`
73 |
74 | unzip plugin to folder `your-es-root/plugins/ik`
75 |
76 | * optional 2 - use elasticsearch-plugin to install ( supported from version v5.5.1 ):
77 |
78 | ```
79 | ./bin/elasticsearch-plugin install https://github.com/medcl/elasticsearch-analysis-ik/releases/download/v6.3.0/elasticsearch-analysis-ik-6.3.0.zip
80 | ```
81 |
82 | NOTE: replace `6.3.0` with your own elasticsearch version
83 |
84 | 2.restart elasticsearch
85 |
86 |
87 |
88 | #### Quick Example
89 |
90 | 1.create a index
91 |
92 | ```bash
93 | curl -XPUT http://localhost:9200/index
94 | ```
95 |
96 | 2.create a mapping
97 |
98 | ```bash
99 | curl -XPOST http://localhost:9200/index/_mapping -H 'Content-Type:application/json' -d'
100 | {
101 | "properties": {
102 | "content": {
103 | "type": "text",
104 | "analyzer": "ik_max_word",
105 | "search_analyzer": "ik_smart"
106 | }
107 | }
108 |
109 | }'
110 | ```
111 |
112 | 3.index some docs
113 |
114 | ```bash
115 | curl -XPOST http://localhost:9200/index/_create/1 -H 'Content-Type:application/json' -d'
116 | {"content":"美国留给伊拉克的是个烂摊子吗"}
117 | '
118 | ```
119 |
120 | ```bash
121 | curl -XPOST http://localhost:9200/index/_create/2 -H 'Content-Type:application/json' -d'
122 | {"content":"公安部:各地校车将享最高路权"}
123 | '
124 | ```
125 |
126 | ```bash
127 | curl -XPOST http://localhost:9200/index/_create/3 -H 'Content-Type:application/json' -d'
128 | {"content":"中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"}
129 | '
130 | ```
131 |
132 | ```bash
133 | curl -XPOST http://localhost:9200/index/_create/4 -H 'Content-Type:application/json' -d'
134 | {"content":"中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"}
135 | '
136 | ```
137 |
138 | 4.query with highlighting
139 |
140 | ```bash
141 | curl -XPOST http://localhost:9200/index/_search -H 'Content-Type:application/json' -d'
142 | {
143 | "query" : { "match" : { "content" : "中国" }},
144 | "highlight" : {
145 | "pre_tags" : ["", ""],
146 | "post_tags" : ["", ""],
147 | "fields" : {
148 | "content" : {}
149 | }
150 | }
151 | }
152 | '
153 | ```
154 |
155 | Result
156 |
157 | ```json
158 | {
159 | "took": 14,
160 | "timed_out": false,
161 | "_shards": {
162 | "total": 5,
163 | "successful": 5,
164 | "failed": 0
165 | },
166 | "hits": {
167 | "total": 2,
168 | "max_score": 2,
169 | "hits": [
170 | {
171 | "_index": "index",
172 | "_type": "fulltext",
173 | "_id": "4",
174 | "_score": 2,
175 | "_source": {
176 | "content": "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首"
177 | },
178 | "highlight": {
179 | "content": [
180 | "中国驻洛杉矶领事馆遭亚裔男子枪击 嫌犯已自首 "
181 | ]
182 | }
183 | },
184 | {
185 | "_index": "index",
186 | "_type": "fulltext",
187 | "_id": "3",
188 | "_score": 2,
189 | "_source": {
190 | "content": "中韩渔警冲突调查:韩警平均每天扣1艘中国渔船"
191 | },
192 | "highlight": {
193 | "content": [
194 | "均每天扣1艘中国渔船 "
195 | ]
196 | }
197 | }
198 | ]
199 | }
200 | }
201 | ```
202 |
203 |
204 |
205 | ### 热更新 IK 分词使用方法
206 |
207 |
208 | 只要远程词库对应的 http server 能正确返回 Last-Modified 和 ETag 响应头,就可以实现热更新分词,不需要重启 ES 实例。
209 |
210 | 可以将需自动更新的热词放在一个 UTF-8 编码的 .txt 文件里,放在 nginx 或其他简易 http server 下,当 .txt 文件修改时,http server 会在客户端请求该文件时自动返回相应的 Last-Modified 和 ETag。可以另外做一个工具来从业务系统提取相关词汇,并更新这个 .txt 文件。
211 |
212 | have fun.
213 |
214 | 常见问题
215 | -------
216 |
217 | 1.自定义词典为什么没有生效?
218 |
219 | 请确保你的扩展词典的文本格式为 UTF8 编码
220 |
221 | 2.如何手动安装?
222 |
223 |
224 | ```bash
225 | git clone https://github.com/medcl/elasticsearch-analysis-ik
226 | cd elasticsearch-analysis-ik
227 | git checkout tags/{version}
228 | mvn clean
229 | mvn compile
230 | mvn package
231 | ```
232 |
233 | 拷贝和解压release下的文件: #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip 到你的 elasticsearch 插件目录, 如: plugins/ik
234 | 重启elasticsearch
235 |
236 | 3.分词测试失败
237 | 请在定义了自定义分词器的索引下调用 _analyze 接口测试,而不是在集群级别直接调用 _analyze 接口(自定义分词器只在其所属索引内可见)
238 | 如:
239 | ```bash
240 | curl -XGET "http://localhost:9200/your_index/_analyze" -H 'Content-Type: application/json' -d'
241 | {
242 | "text":"中华人民共和国MN","tokenizer": "my_ik"
243 | }'
244 | ```
245 |
246 |
247 | 4. ik_max_word 和 ik_smart 什么区别?
248 |
249 |
250 | ik_max_word: 会将文本做最细粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,中华人民,中华,华人,人民共和国,人民,人,民,共和国,共和,和,国国,国歌”,会穷尽各种可能的组合,适合 Term Query;
251 |
252 | ik_smart: 会做最粗粒度的拆分,比如会将“中华人民共和国国歌”拆分为“中华人民共和国,国歌”,适合 Phrase 查询。
253 |
254 | Changes
255 | ------
256 | *自 v5.0.0 起*
257 |
258 | - 移除名为 `ik` 的analyzer和tokenizer,请分别使用 `ik_smart` 和 `ik_max_word`
259 |
260 |
261 | Thanks
262 | ------
263 | YourKit supports IK Analysis for ElasticSearch project with its full-featured Java Profiler.
264 | YourKit, LLC is the creator of innovative and intelligent tools for profiling
265 | Java and .NET applications. Take a look at YourKit's leading software products:
266 | YourKit Java Profiler and
267 | YourKit .NET Profiler.
268 |
--------------------------------------------------------------------------------
/config/extra_single_word_low_freq.dic:
--------------------------------------------------------------------------------
1 | 踧
2 | 覢
3 | 觓
4 | 覛
5 | 覅
6 | 覟
7 | 覗
8 | 覣
9 | 覭
10 | 覂
11 | 觡
12 | 覝
13 | 觟
14 | 褱
15 | 褰
16 | 襒
17 | 覞
18 | 袨
19 | 觏
20 | 赒
21 | 觇
22 | 謍
23 | 讙
24 | 襦
25 | 袤
26 | 誸
27 | 诮
28 | 衩
29 | 茷
30 | 趒
31 | 襌
32 | 诰
33 | 譠
34 | 袄
35 | 聱
36 | 豸
37 | 蠓
38 | 讵
39 | 袅
40 | 诂
41 | 裞
42 | 訄
43 | 荺
44 | 褂
45 | 蠡
46 | 裐
47 | 諴
48 | 芫
49 | 赧
50 | 触
51 | 跫
52 | 褫
53 | 赝
54 | 褡
55 | 衪
56 | 裎
57 | 豜
58 | 褶
59 | 裟
60 | 跏
61 | 袪
62 | 袈
63 | 觐
64 | 跄
65 | 坏
66 | 肱
67 | 裾
68 | 考
69 | 豝
70 | 踰
71 | 覃
72 | 蹓
73 | 黾
74 | 褴
75 | 轲
76 | 裨
77 | 蜇
78 | 鮆
79 | 褥
80 | 誊
81 | 貉
82 | 褊
83 | 蜉
84 | 衔
85 | 詄
86 | 豋
87 | 胼
88 | 荞
89 | 踫
90 | 谗
91 | 耦
92 | 誏
93 | 衮
94 | 胝
95 | 幔
96 | 轭
97 | 赈
98 | 贲
99 | 蓼
100 | 褛
101 | 迵
102 | 觊
103 | 蚜
104 | 讫
105 | 颢
106 | 葄
107 | 觎
108 | 诎
109 | 謢
110 | 蹧
111 | 邬
112 | 芊
113 | 赣
114 | 囱
115 | 蝎
116 | 夆
117 | 蠋
118 | 蠕
119 | 蹼
120 | 臊
121 | 蛭
122 | 颚
123 | 讴
124 | 踽
125 | 菫
126 | 臾
127 | 薮
128 | 蹒
129 | 谀
130 | 菀
131 | 佶
132 | 摀
133 | 佚
134 | 邸
135 | 跺
136 | 豊
137 | 荔
138 | 锌
139 | 诿
140 | 蕤
141 | 诳
142 | 芩
143 | 蹴
144 | 褉
145 | 觔
146 | 舴
147 | 腋
148 | 颍
149 | 膊
150 | 脯
151 | 荪
152 | 郢
153 | 坛
154 | 轫
155 | 醺
156 | 捺
157 | 姝
158 | 胭
159 | 饷
160 | 谪
161 | 驮
162 | 僮
163 | 踯
164 | 忪
165 | 驷
166 | 躅
167 | 忑
168 | 彧
169 | 衲
170 | 唠
171 | 跚
172 | 吃
173 | 诩
174 | 褓
175 | 诤
176 | 豨
177 | 诋
178 | 菈
179 | 逖
180 | 荟
181 | 裆
182 | 喋
183 | 忖
184 | 闾
185 | 诌
186 | 啻
187 | 铀
188 | 菡
189 | 胱
190 | 蹬
191 | 隹
192 | 鹬
193 | 诒
194 | 轧
195 | 萏
196 | 舶
197 | 鳅
198 | 药
199 | 酯
200 | 夯
201 | 偬
202 | 酝
203 | 跻
204 | 咤
205 | 掬
206 | 呆
207 | 蹶
208 | 踞
209 | 蝌
210 | 咋
211 | 谧
212 | 舫
213 | 啐
214 | 茸
215 | 谟
216 | 嵌
217 | 蜿
218 | 魇
219 | 帷
220 | 觑
221 | 鳍
222 | 谏
223 | 哽
224 | 乓
225 | 蚌
226 | 嗙
227 | 巿
228 | 刽
229 | 踱
230 | 腆
231 | 薏
232 | 蜃
233 | 谑
234 | 躄
235 | 鸾
236 | 齁
237 | 腼
238 | 呷
239 | 吆
240 | 荀
241 | 裱
242 | 辇
243 | 睫
244 | 伎
245 | 妲
246 | 菠
247 | 鼐
248 | 麾
249 | 芮
250 | 鲑
251 | 辉
252 | 啜
253 | 苞
254 | 踼
255 | 荃
256 | 杞
257 | 浣
258 | 沬
259 | 胤
260 | 恿
261 | 驭
262 | 逵
263 | 钛
264 | 徕
265 | 贮
266 | 蔫
267 | 锚
268 | 衙
269 | 肄
270 | 豺
271 | 闸
272 | 隋
273 | 腑
274 | 脐
275 | 脓
276 | 叱
277 | 迥
278 | 踝
279 | 馥
280 | 佣
281 | 喳
282 | 迩
283 | 贻
284 | 诙
285 | 椭
286 | 琬
287 | 赂
288 | 诧
289 | 苯
290 | 怂
291 | 蟆
292 | 龊
293 | 漳
294 | 迭
295 | 垛
296 | 铲
297 | 馊
298 | 娓
299 | 葆
300 | 赑
301 | 卍
302 | 遽
303 | 谯
304 | 賏
305 | 蛹
306 | 锤
307 | 粟
308 | 衿
309 | 渥
310 | 铳
311 | 刍
312 | 镳
313 | 匮
314 | 万
315 | 骁
316 | 酣
317 | 酉
318 | 骥
319 | 寨
320 | 蓁
321 | 诽
322 | 钡
323 | 浙
324 | 酗
325 | 跩
326 | 拗
327 | 坷
328 | 雱
329 | 闺
330 | 喈
331 | 晔
332 | 螳
333 | 谙
334 | 蹂
335 | 鞑
336 | 蔗
337 | 账
338 | 垚
339 | 瞩
340 | 谩
341 | 掳
342 | 媲
343 | 葾
344 | 鳗
345 | 钣
346 | 檀
347 | 阕
348 | 聿
349 | 蜍
350 | 仆
351 | 嗅
352 | 峥
353 | 蜈
354 | 垠
355 | 蚓
356 | 麓
357 | 殉
358 | 弩
359 | 朴
360 | 胥
361 | 瘴
362 | 篑
363 | 镍
364 | 鹂
365 | 暐
366 | 榷
367 | 咀
368 | 佯
369 | 蚣
370 | 荻
371 | 鬓
372 | 仝
373 | 裴
374 | 讷
375 | 孺
376 | 咨
377 | 俑
378 | 遴
379 | 吽
380 | 笋
381 | 耀
382 | 霾
383 | 绎
384 | 咿
385 | 骸
386 | 霭
387 | 昕
388 | 漩
389 | 浒
390 | 轼
391 | 婿
392 | 嗳
393 | 钙
394 | 谲
395 | 蛾
396 | 跛
397 | 惺
398 | 翎
399 | 炽
400 | 晒
401 | 钳
402 | 鞘
403 | 谚
404 | 钊
405 | 背
406 | 瀛
407 | 槌
408 | 臀
409 | 跋
410 | 窒
411 | 藤
412 | 噬
413 | 蓊
414 | 褐
415 | 蔺
416 | 鲍
417 | 鲨
418 | 舔
419 | 箔
420 | 萦
421 | 诏
422 | 褔
423 | 咄
424 | 俘
425 | 彪
426 | 饪
427 | 嘱
428 | 诬
429 | 踮
430 | 囝
431 | 佢
432 | 汶
433 | 讹
434 | 踅
435 | 咐
436 | 讼
437 | 玟
438 | 迂
439 | 亵
440 | 婵
441 | 馁
442 | 崭
443 | 惦
444 | 蠹
445 | 濒
446 | 匈
447 | 蟋
448 | 谕
449 | 酪
450 | 眛
451 | 煦
452 | 甭
453 | 谄
454 | 妾
455 | 梧
456 | 芜
457 | 蛎
458 | 颐
459 | 雌
460 | 褒
461 | 臼
462 | 圳
463 | 剔
464 | 噶
465 | 耨
466 | 嗈
467 | 勋
468 | 冶
469 | 扑
470 | 膺
471 | 腺
472 | 荤
473 | 坞
474 | 羲
475 | 栾
476 | 傌
477 | 幌
478 | 噗
479 | 蛀
480 | 觞
481 | 塾
482 | 耙
483 | 枭
484 | 擞
485 | 缅
486 | 踌
487 | 蟀
488 | 侥
489 | 诣
490 | 姜
491 | 甸
492 | 俭
493 | 泠
494 | 躇
495 | 萌
496 | 虏
497 | 匕
498 | 藩
499 | 嗽
500 | 蜻
501 | 咛
502 | 艹
503 | 跎
504 | 蔬
505 | 鸠
506 | 跆
507 | 肋
508 | 巅
509 | 芯
510 | 荐
511 | 荼
512 | 慵
513 | 咸
514 | 杭
515 | 樟
516 | 夸
517 | 戮
518 | 吱
519 | 模
520 | 葔
521 | 迢
522 | 砰
523 | 须
524 | 蒜
525 | 骐
526 | 茱
527 | 痊
528 | 蛤
529 | 蜴
530 | 诟
531 | 俾
532 | 疮
533 | 悴
534 | 袒
535 | 蒹
536 | 镖
537 | 娥
538 | 鹉
539 | 婊
540 | 噫
541 | 矜
542 | 岳
543 | 鹦
544 | 葭
545 | 褚
546 | 嵩
547 | 丫
548 | 凛
549 | 峦
550 | 惚
551 | 懊
552 | 韶
553 | 憋
554 | 聋
555 | 讪
556 | 瘫
557 | 霓
558 | 哺
559 | 蝙
560 | 靥
561 | 堇
562 | 铺
563 | 趾
564 | 褪
565 | 缆
566 | 媛
567 | 胧
568 | 肛
569 | 珈
570 | 畴
571 | 驹
572 | 熔
573 | 臆
574 | 肘
575 | 豁
576 | 冕
577 | 吊
578 | 韧
579 | 炜
580 | 舱
581 | 恁
582 | 巳
583 | 舵
584 | 臻
585 | 戊
586 | 稽
587 | 诲
588 | 隽
589 | 铐
590 | 鲫
591 | 畸
592 | 饥
593 | 茉
594 | 蒲
595 | 矶
596 | 峨
597 | 蚵
598 | 蔼
599 | 诛
600 | 焰
601 | 偈
602 | 蚱
603 | 骯
604 | 盔
605 | 巩
606 | 折
607 | 偕
608 | 嗓
609 | 辙
610 | 鸶
611 | 酵
612 | 莘
613 | 耘
614 | 汹
615 | 楞
616 | 陡
617 | 裳
618 | 憎
619 | 讳
620 | 荆
621 | 笃
622 | 屉
623 | 霈
624 | 恬
625 | 蹦
626 | 扬
627 | 侃
628 | 艳
629 | 璇
630 | 韬
631 | 烬
632 | 傀
633 | 铮
634 | 曦
635 | 搂
636 | 蝠
637 | 霄
638 | 胺
639 | 遐
640 | 飨
641 | 郡
642 | 困
643 | 呎
644 | 墅
645 | 鞠
646 | 瘤
647 | 藻
648 | 咆
649 | 踹
650 | 狷
651 | 镀
652 | 桐
653 | 赘
654 | 揽
655 | 炬
656 | 氢
657 | 膛
658 | 搪
659 | 湿
660 | 唆
661 | 兑
662 | 暸
663 | 厮
664 | 懈
665 | 媳
666 | 塘
667 | 靡
668 | 鹭
669 | 祟
670 | 冀
671 | 豚
672 | 蹄
673 | 橙
674 | 阎
675 | 硫
676 | 埠
677 | 噱
678 | 妃
679 | 搓
680 | 啃
681 | 俞
682 | 龚
683 | 橄
684 | 嚎
685 | 椎
686 | 蓦
687 | 朔
688 | 痘
689 | 鳞
690 | 铠
691 | 叽
692 | 跤
693 | 裔
694 | 诃
695 | 岫
696 | 怯
697 | 讥
698 | 聂
699 | 垢
700 | 藐
701 | 濑
702 | 莒
703 | 淇
704 | 毯
705 | 礁
706 | 赃
707 | 庐
708 | 辕
709 | 瞌
710 | 锯
711 | 莓
712 | 涡
713 | 昼
714 | 捌
715 | 嗡
716 | 倌
717 | 禹
718 | 蹋
719 | 卯
720 | 粪
721 | 耽
722 | 闰
723 | 曳
724 | 苔
725 | 诵
726 | 菇
727 | 斟
728 | 芥
729 | 莅
730 | 喀
731 | 麒
732 | 颊
733 | 扛
734 | 曜
735 | 咎
736 | 缮
737 | 诫
738 | 躁
739 | 茜
740 | 缤
741 | 暧
742 | 郄
743 | 酥
744 | 僻
745 | 躬
746 | 峙
747 | 驯
748 | 噎
749 | 厦
750 | 澜
751 | 杏
752 | 樽
753 | 勘
754 | 煤
755 | 茎
756 | 嚷
757 | 昆
758 | 铸
759 | 烘
760 | 邹
761 | 廓
762 | 拚
763 | 俐
764 | 裘
765 | 饵
766 | 恃
767 | 蔓
768 | 笙
769 | 茁
770 | 楷
771 | 嚼
772 | 锻
773 | 蕊
774 | 脖
775 | 茍
776 | 壤
777 | 琮
778 | 莽
779 | 塌
780 | 蚤
781 | 膳
782 | 磋
783 | 蓓
784 | 澈
785 | 萎
786 | 擒
787 | 禄
788 | 儡
789 | 懦
790 | 瞻
791 | 虔
792 | 粥
793 | 赦
794 | 畜
795 | 彷
796 | 寥
797 | 揣
798 | 嫖
799 | 朽
800 | 挂
801 | 啄
802 | 浇
803 | 崖
804 | 棠
805 | 禽
806 | 台
807 | 邂
808 | 矫
809 | 茅
810 | 惫
811 | 吠
812 | 苟
813 | 叩
814 | 徊
815 | 巍
816 | 舆
817 | 邵
818 | 彗
819 | 萃
820 | 拱
821 | 嘶
822 | 貂
823 | 趴
824 | 愿
825 | 脊
826 | 冗
827 | 杆
828 | 蕙
829 | 铎
830 | 囚
831 | 啼
832 | 谤
833 | 徘
834 | 芹
835 | 骆
836 | 夭
837 | 饺
838 | 馒
839 | 溺
840 | 咫
841 | 屐
842 | 绅
843 | 诅
844 | 缉
845 | 渣
846 | 敞
847 | 萱
848 | 丰
849 | 俏
850 | 螃
851 | 蜀
852 | 徽
853 | 逞
854 | 跪
855 | 虞
856 | 隙
857 | 匀
858 | 憧
859 | 辄
860 | 鸳
861 | 疵
862 | 跷
863 | 呱
864 | 穆
865 | 阑
866 | 搏
867 | 肾
868 | 靶
869 | 阱
870 | 囡
871 | 寰
872 | 庄
873 | 蟾
874 | 怠
875 | 腕
876 | 烟
877 | 巾
878 | 奢
879 | 垄
880 | 姨
881 | 躯
882 | 肺
883 | 钰
884 | 佰
885 | 阙
886 | 雏
887 | 溉
888 | 焚
889 | 丑
890 | 锥
891 | 诘
892 | 瞪
893 | 茹
894 | 绊
895 | 蚀
896 | 袱
897 | 煽
898 | 窕
899 | 掷
900 | 沮
901 | 钞
902 | 涕
903 | 浏
904 | 仄
905 | 孰
906 | 峻
907 | 皱
908 | 芦
909 | 膏
910 | 晰
911 | 衬
912 | 谍
913 | 丞
914 | 绽
915 | 蔽
916 | 呕
917 | 轿
918 | 隶
919 | 楠
920 | 匣
921 | 葵
922 | 沫
923 | 刃
924 | 禧
925 | 晦
926 | 哔
927 | 晖
928 | 绣
929 | 仟
930 | 窟
931 | 谛
932 | 瀚
933 | 黛
934 | 忿
935 | 姚
936 | 蜘
937 | 耸
938 | 捍
939 | 斐
940 | 卜
941 | 辗
942 | 刁
943 | 涅
944 | 泓
945 | 梵
946 | 扳
947 | 暇
948 | 袜
949 | 柠
950 | 傍
951 | 逮
952 | 呃
953 | 蜗
954 | 窍
955 | 琉
956 | 喃
957 | 溢
958 | 抉
959 | 旷
960 | 卅
961 | 亟
962 | 膝
963 | 伶
964 | 闇
965 | 莺
966 | 蔚
967 | 醋
968 | 瑛
969 | 拭
970 | 绮
971 | 鑫
972 | 圭
973 | 脂
974 | 酿
975 | 诈
976 | 膨
977 | 隧
978 | 惭
979 | 庚
980 | 衅
981 | 哨
982 | 凋
983 | 里
984 | 祯
985 | 撼
986 | 谭
987 | 稻
988 | 迋
989 | 碌
990 | 罕
991 | 逾
992 | 嗜
993 | 蹲
994 | 檬
995 | 肖
996 | 辖
997 | 襟
998 | 扎
999 | 槟
1000 | 缔
1001 | 袂
1002 | 敷
1003 | 腥
1004 | 喘
1005 | 簿
1006 | 鳖
1007 | 出
1008 | 噢
1009 | 炫
1010 | 佑
1011 | 贷
1012 | 粮
1013 | 荳
1014 | 桦
1015 | 颉
1016 | 哑
1017 | 倪
1018 | 颤
1019 | 御
1020 | 芽
1021 | 朦
1022 | 裹
1023 | 贬
1024 | 蕉
1025 | 蝉
1026 | 赎
1027 | 崔
1028 | 滔
1029 | 茵
1030 | 径
1031 | 克
1032 | 啤
1033 | 拯
1034 | 坟
1035 | 葱
1036 | 芋
1037 | 瞒
1038 | 掠
1039 | 绳
1040 | 蛛
1041 | 匠
1042 | 凸
1043 | 苛
1044 | 押
1045 | 楣
1046 | 芙
1047 | 酌
1048 | 俺
1049 | 掏
1050 | 倡
1051 | 唾
1052 | 瞄
1053 | 磊
1054 | 吼
1055 | 搅
1056 | 溃
1057 | 聆
1058 | 沌
1059 | 蝇
1060 | 鸥
1061 | 妒
1062 | 焕
1063 | 拙
1064 | 夷
1065 | 迄
1066 | 绰
1067 | 锵
1068 | 耿
1069 | 祺
1070 | 吶
1071 | 惶
1072 | 廊
1073 | 兜
1074 | 倩
1075 | 杖
1076 | 窄
1077 | 僚
1078 | 竖
1079 | 芷
1080 | 咚
1081 | 鲢
1082 | 沛
1083 | 挪
1084 | 柄
1085 | 顷
1086 | 璞
1087 | 裸
1088 | 鵰
1089 | 郊
1090 | 屿
1091 | 仕
1092 | 艘
1093 | 铅
1094 | 铝
1095 | 饲
1096 | 黯
1097 | 疫
1098 | 栽
1099 | 喉
1100 | 逗
1101 | 祇
1102 | 阪
1103 | 侍
1104 | 抒
1105 | 弗
1106 | 尬
1107 | 浦
1108 | 鄙
1109 | 盏
1110 | 喽
1111 | 炳
1112 | 卵
1113 | 肌
1114 | 迦
1115 | 擅
1116 | 豹
1117 | 胏
1118 | 炼
1119 | 悸
1120 | 谴
1121 | 贾
1122 | 胀
1123 | 疋
1124 | 矿
1125 | 梨
1126 | 碑
1127 | 髓
1128 | 巢
1129 | 叹
1130 | 屡
1131 | 滩
1132 | 侮
1133 | 橘
1134 | 嘲
1135 | 酬
1136 | 枚
1137 | 氓
1138 | 菌
1139 | 颁
1140 | 萝
1141 | 谘
1142 | 曝
1143 | 薯
1144 | 襄
1145 | 辽
1146 | 萄
1147 | 寇
1148 | 舜
1149 | 颂
1150 | 撰
1151 | 腻
1152 | 崩
1153 | 咕
1154 | 癌
1155 | 歇
1156 | 汰
1157 | 烁
1158 | 撇
1159 | 宴
1160 | 惩
1161 | 烛
1162 | 贰
1163 | 呻
1164 | 呒
1165 | 翩
1166 | 绑
1167 | 捞
1168 | 爹
1169 | 秉
1170 | 棉
1171 | 妓
1172 | 尉
1173 | 霍
1174 | 甫
1175 | 尝
1176 | 葡
1177 | 蒸
1178 | 鸦
1179 | 挚
1180 | 奸
1181 | 纬
1182 | 艰
1183 | 履
1184 | 葬
1185 | 滨
1186 | 耕
1187 | 婴
1188 | 醇
1189 | 堵
1190 | 钉
1191 | 喧
1192 | 遂
1193 | 锣
1194 | 垮
1195 | 蓬
1196 | 薛
1197 | 虐
1198 | 睁
1199 | 厨
1200 | 娶
1201 | 浆
1202 | 挨
1203 | 矢
1204 | 蕾
1205 | 伺
1206 | 券
1207 | 鹏
1208 | 削
1209 | 蓄
1210 | 琦
1211 | 熄
1212 | 湘
1213 | 慌
1214 | 枕
1215 | 衍
1216 | 薇
1217 | 囊
1218 | 喂
1219 | 蕴
1220 | 倘
1221 | 峡
1222 | 浊
1223 | 窃
1224 | 颈
1225 | 裙
1226 | 晕
1227 | 缚
1228 | 获
1229 | 帕
1230 | 脾
1231 | 莹
1232 | 逍
1233 | 姬
1234 | 韦
1235 | 畔
1236 | 伐
1237 | 霞
1238 | 嘘
1239 | 盐
1240 | 摧
1241 | 债
1242 | 佩
1243 | 畏
1244 | 驴
1245 | 氧
1246 | 奴
1247 | 瘦
1248 | 菊
1249 | 廿
1250 | 狭
1251 | 赴
1252 | 碳
1253 | 坊
1254 | 盆
1255 | 趟
1256 | 匿
1257 | 肇
1258 | 溶
1259 | 揭
1260 | 剥
1261 | 沦
1262 | 秃
1263 | 郝
1264 | 唔
1265 | 锡
1266 | 娇
1267 | 抚
1268 | 屎
1269 | 甩
1270 | 娱
1271 | 表
1272 | 犬
1273 | 魁
1274 | 蒂
1275 | 皓
1276 | 祷
1277 | 瞎
1278 | 瘾
1279 | 煎
1280 | 螺
1281 | 遮
1282 | 坠
1283 | 剎
1284 | 筝
1285 | 棵
1286 | 冤
1287 | 崎
1288 | 昔
1289 | 驼
1290 | 竿
1291 | 甄
1292 | 斑
1293 | 歹
1294 | 骏
1295 | 缝
1296 | 鞭
1297 | 垫
1298 | 淹
1299 | 并
1300 | 遨
1301 | 宠
1302 | 掰
1303 | 枯
1304 | 艇
1305 | 豫
1306 | 募
1307 | 郁
1308 | 稚
1309 | 懿
1310 | 辐
1311 | 酱
1312 | 恕
1313 | 范
1314 | 涂
1315 | 滤
1316 | 肃
1317 | 膜
1318 | 佬
1319 | 哼
1320 | 慨
1321 | 穗
1322 | 辰
1323 | 雁
1324 | 瑟
1325 | 帆
1326 | 拢
1327 | 汁
1328 | 蝴
1329 | 冈
1330 | 诠
1331 | 蹈
1332 | 黏
1333 | 痞
1334 | 屑
1335 | 潇
1336 | 觅
1337 | 钧
1338 | 挣
1339 | 谐
1340 | 霜
1341 | 诊
1342 | 熬
1343 | 讽
1344 | 歧
1345 | 戈
1346 | 闯
1347 | 饶
1348 | 斤
1349 | 婉
1350 | 致
1351 | 贿
1352 | 苑
1353 | 矮
1354 | 毋
1355 | 詹
1356 | 祈
1357 | 咳
1358 | 昱
1359 | 佐
1360 | 帖
1361 | 猩
1362 | 尹
1363 | 诇
1364 | 肆
1365 | 亭
1366 | 丘
1367 | 淘
1368 | 颠
1369 | 勃
1370 | 讶
1371 | 抖
1372 | 袁
1373 | 柱
1374 | 僧
1375 | 蚊
1376 | 匹
1377 | 辣
1378 | 螂
1379 | 澡
1380 | 昧
1381 | 诡
1382 | 槽
1383 | 穴
1384 | 斩
1385 | 聘
1386 | 扶
1387 | 熙
1388 | 驰
1389 | 棍
1390 | 兆
1391 | 蟑
1392 | 矩
1393 | 谬
1394 | 贫
1395 | 鼎
1396 | 践
1397 | 盲
1398 | 眷
1399 | 尿
1400 | 伫
1401 | 饿
1402 | 砸
1403 | 妄
1404 | 荡
1405 | 炒
1406 | 冥
1407 | 偿
1408 | 墓
1409 | 骄
1410 | 毙
1411 | 淋
1412 | 芝
1413 | 胃
1414 | 宅
1415 | 董
1416 | 梭
1417 | 凑
1418 | 宰
1419 | 卑
1420 | 丛
1421 | 纠
1422 | 肢
1423 | 闽
1424 | 铜
1425 | 寺
1426 | 瞬
1427 | 澳
1428 | 庞
1429 | 腔
1430 | 泼
1431 | 昂
1432 | 梁
1433 | 躺
1434 | 姻
1435 | 潭
1436 | 吋
1437 | 撤
1438 | 殖
1439 | 轴
1440 | 颖
1441 | 冻
1442 | 琼
1443 | 恳
1444 | 衫
1445 | 譬
1446 | 猎
1447 | 衰
1448 | 桶
1449 | 辜
1450 | 筒
1451 | 赫
1452 | 仗
1453 | 膀
1454 | 乳
1455 | 嚣
1456 | 划
1457 | 玮
1458 | 卿
1459 | 枉
1460 | 埃
1461 | 跨
1462 | 粹
1463 | 猴
1464 | 愤
1465 | 壹
1466 | 卢
1467 | 尧
1468 | 翰
1469 | 叮
1470 | 媚
1471 | 钮
1472 | 袖
1473 | 斌
1474 | 卓
1475 | 粽
1476 | 雀
1477 | 谦
1478 | 傅
1479 | 殿
1480 | 睹
1481 | 菁
1482 | 桂
1483 | 诱
1484 | 舌
1485 | 惟
1486 | 岗
1487 | 衷
1488 | 屈
1489 | 陋
1490 | 陌
1491 | 宵
1492 | 麟
1493 | 魏
1494 | 贸
1495 | 几
1496 | 埔
1497 | 谎
1498 | 袍
1499 | 卸
1500 | 仓
1501 | 匪
1502 | 叛
1503 | 肠
1504 | 肝
1505 | 俄
1506 | 孕
1507 | 庙
1508 | 嫁
1509 | 肤
1510 | 拦
1511 | 羯
1512 | 匙
1513 | 咏
1514 | 蠢
1515 | 纽
1516 | 拘
1517 | 旨
1518 | 胁
1519 | 馨
1520 | 珊
1521 | 签
1522 | 赔
1523 | 秩
1524 | 喻
1525 | 谜
1526 | 翠
1527 | 芭
1528 | 摊
1529 | 侣
1530 | 灿
1531 | 寡
1532 | 罐
1533 | 贼
1534 | 叙
1535 | 谨
1536 | 体
1537 | 敲
1538 | 浴
1539 | 吻
1540 | 臂
1541 | 袭
1542 | 煮
1543 | 腹
1544 | 暮
1545 | 曹
1546 | 虹
1547 | 抑
1548 | 贩
1549 | 踩
1550 | 澎
1551 | 糖
1552 | 催
1553 | 萍
1554 | 垂
1555 | 斥
1556 | 侬
1557 | 拷
1558 | 唤
1559 | 匆
1560 | 阮
1561 | 飙
1562 | 柴
1563 | 剂
1564 | 妖
1565 | 添
1566 | 畅
1567 | 汗
1568 | 鸭
1569 | 稀
1570 | 晋
1571 | 埋
1572 | 弊
1573 | 返
1574 | 叡
1575 | 娟
1576 | 玻
1577 | 腾
1578 | 栋
1579 | 歪
1580 | 邓
1581 | 渴
1582 | 粒
1583 | 泣
1584 | 疾
1585 | 蓉
1586 | 塑
1587 | 祂
1588 | 储
1589 | 劣
1590 | 柯
1591 | 陶
1592 | 患
1593 | 蛇
1594 | 腐
1595 | 琳
1596 | 慎
1597 | 泊
1598 | 牢
1599 | 呈
1600 | 趁
1601 | 恶
1602 | 浑
1603 | 扮
1604 | 樱
1605 | 臣
1606 | 遵
1607 | 缠
1608 | 虫
1609 | 撒
1610 | 叉
1611 | 刑
1612 | 苗
1613 | 脉
1614 | 盈
1615 | 津
1616 | 愧
1617 | 摔
1618 | 盒
1619 | 丧
1620 | 鹤
1621 | 呦
1622 | 厕
1623 | 斜
1624 | 芒
1625 | 翅
1626 | 悄
1627 | 晃
1628 | 茂
1629 | 寸
1630 | 杉
1631 | 旺
1632 | 俩
1633 | 雯
1634 | 霖
1635 | 递
1636 | 胶
1637 | 氛
1638 | 谣
1639 | 捉
1640 | 虾
1641 | 秘
1642 | 漠
1643 | 扭
1644 | 贞
1645 | 陵
1646 | 叔
1647 | 轨
1648 | 鹅
1649 | 液
1650 | 妥
1651 | 贱
1652 | 涨
1653 | 滥
1654 | 痕
1655 | 沿
1656 | 秤
1657 | 措
1658 | 巡
1659 | 丈
1660 | 魅
1661 | 欲
1662 | 缸
1663 | 鹿
1664 | 汝
1665 | 迁
1666 | 矣
1667 | 肩
1668 | 烤
1669 | 笛
1670 | 迅
1671 | 劫
1672 | 趋
1673 | 披
1674 | 荷
1675 | 卒
1676 | 丙
1677 | 碗
1678 | 伙
1679 | 椅
1680 | 赞
1681 | 侦
1682 | 灾
1683 | 秦
1684 | 蛙
1685 | 禅
1686 | 慰
1687 | 余
1688 | 朗
1689 | 辱
1690 | 征
1691 | 愚
1692 | 抛
1693 | 挺
1694 | 彭
1695 | 允
1696 | 靖
1697 | 滋
1698 | 凝
1699 | 赠
1700 | 莎
1701 | 顽
1702 | 狠
1703 | 堕
1704 | 翘
1705 | 惹
1706 | 纲
1707 | 贯
1708 | 饼
1709 | 抬
1710 | 逆
1711 | 堪
1712 | 坤
1713 | 斗
1714 | 钦
1715 | 疏
1716 | 羞
1717 | 扇
1718 | 蜂
1719 | 赌
1720 | 驻
1721 | 屏
1722 | 爵
1723 | 轰
1724 | 契
1725 | 悦
1726 | 邻
1727 | 哉
1728 | 陀
1729 | 裂
1730 | 刷
1731 | 毅
1732 | 拾
1733 | 疼
1734 | 阔
1735 | 耍
1736 | 亏
1737 | 吟
1738 | 锐
1739 | 惧
1740 | 锅
1741 | 蝶
1742 | 壳
1743 | 糕
1744 | 舟
1745 | 牧
1746 | 妮
1747 | 粗
1748 | 仇
1749 | 驶
1750 | 促
1751 | 孝
1752 | 裤
1753 | 誉
1754 | 家
1755 | 迈
1756 | 姿
1757 | 踪
1758 | 兔
1759 | 综
1760 | 旭
1761 | 韵
1762 | 齿
1763 | 乔
1764 | 怖
1765 | 晴
1766 | 闷
1767 | 墨
1768 | 咬
1769 | 侧
1770 | 狱
1771 | 琪
1772 | 梯
1773 | 宾
1774 | 枫
1775 | 锦
1776 | 瑜
1777 | 敦
1778 | 矛
1779 | 弘
1780 | 玛
1781 | 茫
1782 | 迪
1783 | 览
1784 | 挤
1785 | 雳
1786 | 岚
1787 | 卷
1788 | 黎
1789 | 薄
1790 | 柳
1791 | 咦
1792 | 廷
1793 | 瞧
1794 | 幅
1795 | 挖
1796 | 唬
1797 | 侯
1798 | 祸
1799 | 饰
1800 | 儒
1801 | 捡
1802 | 筋
1803 | 融
1804 | 耗
1805 | 铃
1806 | 奉
1807 | 鼻
1808 | 坜
1809 | 曼
1810 | 贡
1811 | 嗨
1812 | 炎
1813 | 啡
1814 | 捐
1815 | 炮
1816 | 霹
1817 | 貌
1818 | 鸣
1819 | 饱
1820 | 廉
1821 | 绘
1822 | 咪
1823 | 吝
1824 | 肚
1825 | 云
1826 | 翼
1827 | 氏
1828 | 骚
1829 | 爷
1830 | 寿
1831 | 绕
1832 | 唷
1833 | 牺
1834 | 屠
1835 | 谋
1836 | 彻
1837 | 俱
1838 | 粉
1839 | 雾
1840 | 涵
1841 | 侨
1842 | 础
1843 | 疗
1844 | 署
1845 | 稿
1846 | 涉
1847 | 稣
1848 | 誓
1849 | 箭
1850 | 涯
1851 | 锺
1852 | 迹
1853 | 抄
1854 | 踢
1855 | 贪
1856 | 咖
1857 | 莱
1858 | 夺
1859 | 勉
1860 | 焦
1861 | 蒋
1862 | 桑
1863 | 沧
1864 | 恰
1865 | 泳
1866 | 牲
1867 | 戒
1868 | 恼
1869 | 夕
1870 | 棚
1871 | 爬
1872 | 菲
1873 | 翁
1874 | 奔
1875 | 滴
1876 | 玄
1877 | 捷
1878 | 曰
1879 | 愉
1880 | 逊
1881 | 憾
1882 | 钓
1883 | 壁
1884 | 躲
1885 | 嫌
1886 | 姆
1887 | 乏
1888 | 洛
1889 | 逼
1890 | 磨
1891 | 剪
1892 | 逝
1893 | 亨
1894 | 盼
1895 | 杯
1896 | 敝
1897 | 碍
1898 | 痴
1899 | 植
1900 | 瑰
1901 | 勤
1902 | 悟
1903 | 彬
1904 | 删
1905 | 薪
1906 | 悠
1907 | 胎
1908 | 侵
1909 | 坪
1910 | 赋
1911 | 弯
1912 | 丹
1913 | 巫
1914 | 轩
1915 | 辨
1916 | 吐
1917 | 么
1918 | 盾
1919 | 扯
1920 | 割
1921 | 艾
1922 | 幼
1923 | 捕
1924 | 召
1925 | 怒
1926 | 坡
1927 | 缓
1928 | 猛
1929 | 驾
1930 | 莉
1931 | 彦
1932 | 韩
1933 | 鞋
1934 | 碧
1935 | 泽
1936 | 泉
1937 | 缴
1938 | 跃
1939 | 喇
1940 | 腿
1941 | 糟
1942 | 胆
1943 | 摘
1944 | 朵
1945 | 逛
1946 | 甜
1947 | 拔
1948 | 劲
1949 | 悉
1950 | 穷
1951 | 汤
1952 | 唐
1953 | 臭
1954 | 玲
1955 | 怡
1956 | 舍
1957 | 欺
1958 | 蜜
1959 | 耻
1960 | 坦
1961 | 叭
1962 | 亿
1963 | 忌
1964 | 鲁
1965 | 繁
1966 | 泥
1967 | 伸
1968 | 壮
1969 | 串
1970 | 圾
1971 | 币
1972 | 荒
1973 | 垃
1974 | 妇
1975 | 旦
1976 | 截
1977 | 喷
1978 | 碎
1979 | 吕
1980 | 犹
1981 | 抹
1982 | 脆
1983 | 煞
1984 | 胞
1985 | 晶
1986 | 潜
1987 | 玫
1988 | 妻
1989 | 估
1990 | 陷
1991 | 孔
1992 | 娃
1993 | 兽
1994 | 肥
1995 | 凉
1996 | 岂
1997 | 逻
1998 | 胸
1999 | 杜
2000 | 袋
2001 | 甘
2002 | 邀
2003 | 培
2004 | 龄
2005 | 辆
2006 | 廖
2007 | 冲
2008 | 渡
2009 | 羽
2010 | 秒
2011 | 辞
2012 | 倾
2013 | 窝
2014 | 柏
2015 | 淑
2016 | 诞
2017 | 漏
2018 | 姑
2019 | 托
2020 | 吾
2021 | 纷
2022 | 拆
2023 | 浩
2024 | 税
2025 | 邱
2026 | 迟
2027 | 筹
2028 | 监
2029 | 汪
2030 | 擎
2031 | 衡
2032 | 狐
2033 | 灰
2034 | 尖
2035 | 番
2036 | 罚
2037 | 证
2038 | 盗
2039 | 祥
2040 | 毫
2041 | 彰
2042 | 扩
2043 | 幽
2044 | 阐
2045 | 喊
2046 | 菩
2047 | 赐
2048 | 奋
2049 | 鲜
2050 | 劝
2051 | 栏
2052 | 慈
2053 | 扫
2054 | 尽
2055 | 穹
2056 | 丌
2057 | 绪
2058 | 砂
2059 | 勿
2060 | 抢
2061 | 啪
2062 | 庸
2063 | 赤
2064 | 饮
2065 | 萨
2066 | 兼
2067 | 访
2068 | 舒
2069 | 裕
2070 | 逸
2071 | 宙
2072 | 丸
2073 | 准
2074 | 魂
2075 | 厚
2076 | 励
2077 | 仰
2078 | 糊
2079 | 顿
2080 | 闭
2081 | 塔
2082 | 枪
2083 | 睛
2084 | 斋
2085 | 奥
2086 | 恭
2087 | 翔
2088 | 遥
2089 | 航
2090 | 孟
2091 | 昌
2092 | 卧
2093 | 颇
2094 | 革
2095 | 邪
2096 | 阻
2097 | 蟹
2098 | 裁
2099 | 后
2100 | 函
2101 | 于
2102 | 拳
2103 | 宽
2104 | 锋
2105 | 州
2106 | 葛
2107 | 拒
2108 | 池
2109 | 镇
2110 | 芬
2111 | 岸
2112 | 寞
2113 | 凭
2114 | 姊
2115 | 殊
2116 | 板
2117 | 勒
2118 | 慕
2119 | 跌
2120 | 踏
2121 | 填
2122 | 陪
2123 | 逐
2124 | 洽
2125 | 描
2126 | 妨
2127 | 仪
2128 | 摄
2129 | 紫
2130 | 谅
2131 | 阅
2132 | 邦
2133 | 麦
2134 | 莲
2135 | 闪
2136 | 纵
2137 | 庭
2138 | 圈
2139 | 榜
2140 | 滑
2141 | 舰
2142 | 面
2143 | 献
2144 | 浅
2145 | 飘
2146 | 宋
2147 | 俗
2148 | 沟
2149 | 巷
2150 | 眠
2151 | 帽
2152 | 惑
2153 | 羊
2154 | 牵
2155 | 净
2156 | 厉
2157 | 撞
2158 | 崇
2159 | 竞
2160 | 回
2161 | 乙
2162 | 聪
2163 | 桃
2164 | 伍
2165 | 役
2166 | 潮
2167 | 损
2168 | 凯
2169 | 锁
2170 | 震
2171 | 醉
2172 | 屁
2173 | 牠
2174 | 孙
2175 | 酷
2176 | 染
2177 | 尺
2178 | 摸
2179 | 盛
2180 | 闹
2181 | 棋
2182 | 吓
2183 | 迫
2184 | 瓜
2185 | 松
2186 | 搬
2187 | 戴
2188 | 瞭
2189 | 乌
2190 | 谱
2191 | 滚
2192 | 赚
2193 | 障
2194 | 逃
2195 | 齐
2196 | 牙
2197 | 怨
2198 | 拖
2199 | 皇
2200 | 贺
2201 | 横
2202 | 塞
2203 | 摆
2204 | 农
2205 | 倍
2206 | 额
2207 | 乘
2208 | 户
2209 | 奈
2210 | 川
2211 | 徐
2212 | 井
2213 | 寝
2214 | 洞
2215 | 劳
2216 | 船
2217 | 域
2218 | 屋
2219 | 胖
2220 | 藉
2221 | 销
2222 | 拼
2223 | 桌
2224 | 忧
2225 | 违
2226 | 拟
2227 | 吵
2228 | 媒
2229 | 辩
2230 | 妙
2231 | 鸿
2232 | 恩
2233 | 映
2234 | 耳
2235 | 傻
2236 | 京
2237 | 搭
2238 | 残
2239 | 稍
2240 | 颜
2241 | 固
2242 | 眉
2243 | 龟
2244 | 哀
2245 | 发
2246 | 沈
2247 | 拨
2248 | 丁
2249 | 愁
2250 | 耐
2251 | 宪
2252 | 覆
2253 | 盟
2254 | 昭
2255 | 握
2256 | 萧
2257 | 延
2258 | 豆
2259 | 弱
2260 | 隆
2261 | 页
2262 | 烧
2263 | 遍
2264 | 距
2265 | 摩
2266 | 祖
2267 | 探
2268 | 倚
2269 | 寂
2270 | 阴
2271 | 悔
2272 | 库
2273 | 嘴
2274 | 沉
2275 | 伊
2276 | 暂
2277 | 霸
2278 | 喵
2279 | 频
2280 | 鼓
2281 | 冒
2282 | 鼠
2283 | 企
2284 | 副
2285 | 菜
2286 | 款
2287 | 忽
2288 | 尾
2289 | 租
2290 | 椰
2291 | 隔
2292 | 狼
2293 | 浮
2294 | 惠
2295 | 峰
2296 | 索
2297 | 芳
2298 | 摇
2299 | 洪
2300 | 伦
2301 | 骨
2302 | 吹
2303 | 郑
2304 | 哩
2305 | 珍
2306 | 纳
2307 | 零
2308 | 哲
2309 | 遭
2310 | 瓶
2311 | 亡
2312 | 振
2313 | 予
2314 | 村
2315 | 旅
2316 | 惨
2317 | 汽
2318 | 爸
2319 | 隐
2320 | 械
2321 | 寒
2322 | 危
2323 | 邮
2324 | 贝
2325 | 阶
2326 | 赖
2327 | 茶
2328 | 谊
2329 | 涛
2330 | 惯
2331 | 尘
2332 | 丝
2333 | 森
2334 | 询
2335 | 露
2336 | 稳
2337 | 桥
2338 | 夏
2339 | 哭
2340 | 坚
2341 | 籍
2342 | 厌
2343 | 苍
2344 | 析
2345 | 冰
2346 | 仙
2347 | 布
2348 | 箱
2349 | 脱
2350 | 贤
2351 | 途
2352 | 订
2353 | 财
2354 | 欧
2355 | 赢
2356 | 枢
2357 | 泪
2358 | 废
2359 | 钢
2360 | 渐
2361 | 泡
2362 | 刊
2363 | 肯
2364 | 恨
2365 | 砍
2366 | 抽
2367 | 股
2368 | 咧
2369 | 婆
2370 | 禁
2371 | 郎
2372 | 默
2373 | 符
2374 | 缩
2375 | 童
2376 | 绿
2377 | 骗
2378 | 辈
2379 | 尼
2380 | 届
2381 | 彼
2382 | 兮
2383 | 聚
2384 | 宇
2385 | 辛
2386 | 疯
2387 | 减
2388 | 米
2389 | 念
2390 | 降
2391 | 街
2392 | 临
2393 | 敏
2394 | 洗
2395 | 玉
2396 | 伴
2397 | 辅
2398 | 诺
2399 | 鸡
2400 | 侠
2401 | 健
2402 | 熊
2403 | 顶
2404 | 挑
2405 | 替
2406 | 豪
2407 | 掌
2408 | 饭
2409 | 银
2410 | 圆
2411 | 志
2412 | 休
2413 | 材
2414 | 灭
2415 | 烈
2416 | 爆
2417 | 透
2418 | 遗
2419 | 虚
2420 | 醒
2421 | 货
2422 | 雅
2423 | 宏
2424 | 帅
2425 | 宫
2426 | 港
2427 | 偶
2428 | 丢
2429 | 篮
2430 | 凡
2431 | 瑞
2432 | 硕
2433 | 雪
2434 | 忠
2435 | 蔡
2436 | 插
2437 | 积
2438 | 乖
2439 | 挥
2440 | 抗
2441 | 察
2442 | 末
2443 | 盖
2444 | 厅
2445 | 移
2446 | 吸
2447 | 括
2448 | 笨
2449 | 孤
2450 | 译
2451 | 避
2452 | 秀
2453 | 富
2454 | 漂
2455 | 柔
2456 | 私
2457 | 围
2458 | 狮
2459 | 祝
2460 | 庆
2461 | 序
2462 | 拥
2463 | 洲
2464 | 徒
2465 | 借
2466 | 晓
2467 | 嘉
2468 | 诗
2469 | 淡
2470 | 束
2471 | 姓
2472 | 颗
2473 | 勇
2474 | 犯
2475 | 喝
2476 | 食
2477 | 镜
2478 | 偏
2479 | 猜
2480 | 层
2481 | 帐
2482 | 仅
2483 | 购
2484 | 衣
2485 | 申
2486 | 伯
2487 | 紧
2488 | 县
2489 | 婚
2490 | 季
2491 | 敬
2492 | 弃
2493 | 尊
2494 | 蛋
2495 | 鹰
2496 | 熟
2497 | 冠
2498 | 唯
2499 | 混
2500 | 藏
2501 | 河
2502 | 忍
2503 | 窗
2504 | 朝
2505 | 轮
2506 | 册
2507 | 乡
2508 | 敌
2509 | 散
2510 | 沙
2511 | 幻
2512 | 短
2513 | 略
2514 | 批
2515 | 游
2516 | 奖
2517 | 岛
2518 | 逢
2519 | 脸
2520 | 顾
2521 | 督
2522 | 协
2523 | 雷
2524 | 详
2525 | 穿
2526 | 慧
2527 | 巧
2528 | 罢
2529 | 呼
2530 | 暗
2531 | 贴
2532 | 纸
2533 | 歉
2534 | 郭
2535 | 努
2536 | 担
2537 | 蓝
2538 | 训
2539 | 享
2540 | 架
2541 | 济
2542 | 猪
2543 | 派
2544 | 均
2545 | 妈
2546 | 哦
2547 | 宣
2548 | 检
2549 | 鬼
2550 | 灯
2551 | 策
2552 | 梅
2553 | 启
2554 | 嘿
2555 | 洋
2556 | 伟
2557 | 萤
2558 | 磁
2559 | 啰
2560 | 付
2561 | 弄
2562 | 寄
2563 | 钟
2564 | 播
2565 | 险
2566 | 载
2567 | 赏
2568 | 汉
2569 | 块
2570 | 刀
2571 | 铭
2572 | 施
2573 | 卫
2574 | 弹
2575 | 售
2576 | 叶
2577 | 皆
2578 | 罪
2579 | 虎
2580 | 归
2581 | 毛
2582 | 昨
2583 | 荣
2584 | 律
2585 | 树
2586 | 奏
2587 | 注
2588 | 扁
2589 | 笔
2590 | 旁
2591 | 键
2592 | 制
2593 | 莫
2594 | 堆
2595 | 射
2596 | 承
2597 | 波
2598 | 皮
2599 | 释
2600 | 判
2601 | 含
2602 | 既
2603 | 退
2604 | 纪
2605 | 刻
2606 | 肉
2607 | 靠
2608 | 麻
2609 | 湖
2610 | 继
2611 | 诚
2612 | 姐
2613 | 益
2614 | 置
2615 | 惜
2616 | 艺
2617 | 尚
2618 | 纯
2619 | 骂
2620 | 琴
2621 | 漫
2622 | 援
2623 | 缺
2624 | 诸
2625 | 尤
2626 | 忆
2627 | 景
2628 | 府
2629 | 委
2630 | 刘
2631 | 绍
2632 | 虑
2633 | 暴
2634 | 草
2635 | 充
2636 | 授
2637 | 防
2638 | 素
2639 | 房
2640 | 搞
2641 | 典
2642 | 仔
2643 | 父
2644 | 吉
2645 | 招
2646 | 剑
2647 | 脚
2648 | 突
2649 | 牌
2650 | 餐
2651 | 仁
2652 | 酒
2653 | 礼
2654 | 巴
2655 | 丽
2656 | 亮
2657 | 恐
2658 | 述
2659 | 周
2660 | 杂
2661 | 旧
2662 | 套
2663 | 赵
2664 | 堂
2665 | 创
2666 | 母
2667 | 辑
2668 | 络
2669 | 俊
2670 | 毒
2671 | 威
2672 | 冷
2673 | 蛮
2674 | 普
2675 | 登
2676 | 微
2677 | 控
2678 | 爽
2679 | 香
2680 | 坐
2681 | 缘
2682 | 幕
2683 | 兰
2684 | 悲
2685 | 势
2686 | 午
2687 | 睡
2688 | 密
2689 | 垒
2690 | 警
2691 | 宗
2692 | 严
2693 | 阵
2694 | 江
2695 | 亚
2696 | 攻
2697 | 静
2698 | 抱
2699 | 啥
2700 | 急
2701 | 宿
2702 | 剧
2703 | 词
2704 | 忙
2705 | 牛
2706 | 吴
2707 | 陆
2708 | 维
2709 | 激
2710 | 增
2711 | 聊
2712 | 浪
2713 | 状
2714 | 良
--------------------------------------------------------------------------------
/config/extra_stopword.dic:
--------------------------------------------------------------------------------
1 | 也
2 | 了
3 | 仍
4 | 从
5 | 以
6 | 使
7 | 则
8 | 却
9 | 又
10 | 及
11 | 对
12 | 就
13 | 并
14 | 很
15 | 或
16 | 把
17 | 是
18 | 的
19 | 着
20 | 给
21 | 而
22 | 被
23 | 让
24 | 在
25 | 还
26 | 比
27 | 等
28 | 当
29 | 与
30 | 于
31 | 但
--------------------------------------------------------------------------------
/config/extra_test.dic:
--------------------------------------------------------------------------------
1 | 我是中国人
--------------------------------------------------------------------------------
/config/preposition.dic:
--------------------------------------------------------------------------------
1 | 不
2 | 也
3 | 了
4 | 仍
5 | 从
6 | 以
7 | 使
8 | 则
9 | 却
10 | 又
11 | 及
12 | 对
13 | 就
14 | 并
15 | 很
16 | 或
17 | 把
18 | 是
19 | 的
20 | 着
21 | 给
22 | 而
23 | 被
24 | 让
25 | 但
--------------------------------------------------------------------------------
/config/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/config/stopword.dic:
--------------------------------------------------------------------------------
1 | a
2 | an
3 | and
4 | are
5 | as
6 | at
7 | be
8 | but
9 | by
10 | for
11 | if
12 | in
13 | into
14 | is
15 | it
16 | no
17 | not
18 | of
19 | on
20 | or
21 | such
22 | that
23 | the
24 | their
25 | then
26 | there
27 | these
28 | they
29 | this
30 | to
31 | was
32 | will
33 | with
--------------------------------------------------------------------------------
/config/suffix.dic:
--------------------------------------------------------------------------------
1 | 乡
2 | 井
3 | 亭
4 | 党
5 | 区
6 | 厅
7 | 县
8 | 园
9 | 塔
10 | 家
11 | 寺
12 | 局
13 | 巷
14 | 市
15 | 弄
16 | 所
17 | 斯基
18 | 楼
19 | 江
20 | 河
21 | 海
22 | 湖
23 | 省
24 | 维奇
25 | 署
26 | 苑
27 | 街
28 | 觀
29 | 观
30 | 诺夫
31 | 路
32 | 部
33 | 镇
34 | 阁
35 | 山
36 | 子
37 | 娃
--------------------------------------------------------------------------------
/config/surname.dic:
--------------------------------------------------------------------------------
1 | 丁
2 | 万
3 | 万俟
4 | 上官
5 | 东方
6 | 乔
7 | 于
8 | 令狐
9 | 仲孙
10 | 任
11 | 何
12 | 余
13 | 候
14 | 傅
15 | 公冶
16 | 公孙
17 | 公羊
18 | 冯
19 | 刘
20 | 单
21 | 单于
22 | 卢
23 | 史
24 | 叶
25 | 司徒
26 | 司空
27 | 司马
28 | 吕
29 | 吴
30 | 周
31 | 唐
32 | 夏
33 | 夏侯
34 | 太叔
35 | 姚
36 | 姜
37 | 孔
38 | 孙
39 | 孟
40 | 宇文
41 | 宋
42 | 宗政
43 | 尉迟
44 | 尹
45 | 崔
46 | 常
47 | 康
48 | 廖
49 | 张
50 | 彭
51 | 徐
52 | 慕容
53 | 戴
54 | 文
55 | 方
56 | 易
57 | 曹
58 | 曾
59 | 朱
60 | 李
61 | 杜
62 | 杨
63 | 林
64 | 梁
65 | 欧阳
66 | 武
67 | 段
68 | 毛
69 | 江
70 | 汤
71 | 沈
72 | 淳于
73 | 潘
74 | 澹台
75 | 濮阳
76 | 熊
77 | 王
78 | 田
79 | 申屠
80 | 白
81 | 皇甫
82 | 石
83 | 秦
84 | 程
85 | 罗
86 | 肖
87 | 胡
88 | 苏
89 | 范
90 | 董
91 | 蒋
92 | 薛
93 | 袁
94 | 许
95 | 诸葛
96 | 谢
97 | 谭
98 | 贺
99 | 贾
100 | 赖
101 | 赫连
102 | 赵
103 | 轩辕
104 | 邓
105 | 邱
106 | 邵
107 | 邹
108 | 郑
109 | 郝
110 | 郭
111 | 金
112 | 钟
113 | 钟离
114 | 钱
115 | 长孙
116 | 闻人
117 | 闾丘
118 | 阎
119 | 陆
120 | 陈
121 | 雷
122 | 韩
123 | 顾
124 | 马
125 | 高
126 | 魏
127 | 鲜于
128 | 黄
129 | 黎
130 | 龙
131 | 龚
--------------------------------------------------------------------------------
/index/_1.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.cfe
--------------------------------------------------------------------------------
/index/_1.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.cfs
--------------------------------------------------------------------------------
/index/_1.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_1.si
--------------------------------------------------------------------------------
/index/_3.cfe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.cfe
--------------------------------------------------------------------------------
/index/_3.cfs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.cfs
--------------------------------------------------------------------------------
/index/_3.si:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/_3.si
--------------------------------------------------------------------------------
/index/segments_2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/segments_2
--------------------------------------------------------------------------------
/index/segments_4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/segments_4
--------------------------------------------------------------------------------
/index/write.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PeterMen/elasticsearch-analysis-ik/bde9bcc05a236353fddfa9d01f6c019c717ee507/index/write.lock
--------------------------------------------------------------------------------
/licenses/lucene-LICENSE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
204 |
205 |
206 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
207 | derived from unicode conversion examples available at
208 | http://www.unicode.org/Public/PROGRAMS/CVTUTF. Here is the copyright
209 | from those sources:
210 |
211 | /*
212 | * Copyright 2001-2004 Unicode, Inc.
213 | *
214 | * Disclaimer
215 | *
216 | * This source code is provided as is by Unicode, Inc. No claims are
217 | * made as to fitness for any particular purpose. No warranties of any
218 | * kind are expressed or implied. The recipient agrees to determine
219 | * applicability of information provided. If this file has been
220 | * purchased on magnetic or optical media from Unicode, Inc., the
221 | * sole remedy for any claim will be exchange of defective media
222 | * within 90 days of receipt.
223 | *
224 | * Limitations on Rights to Redistribute This Code
225 | *
226 | * Unicode, Inc. hereby grants the right to freely use the information
227 | * supplied in this file in the creation of products supporting the
228 | * Unicode Standard, and to make copies of this file in any form
229 | * for internal or external distribution as long as this notice
230 | * remains attached.
231 | */
232 |
233 |
234 | Some code in core/src/java/org/apache/lucene/util/ArrayUtil.java was
235 | derived from Python 2.4.2 sources available at
236 | http://www.python.org. Full license is here:
237 |
238 | http://www.python.org/download/releases/2.4.2/license/
239 |
240 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
241 | derived from Python 3.1.2 sources available at
242 | http://www.python.org. Full license is here:
243 |
244 | http://www.python.org/download/releases/3.1.2/license/
245 |
246 | Some code in core/src/java/org/apache/lucene/util/automaton was
247 | derived from Brics automaton sources available at
248 | www.brics.dk/automaton/. Here is the copyright from those sources:
249 |
250 | /*
251 | * Copyright (c) 2001-2009 Anders Moeller
252 | * All rights reserved.
253 | *
254 | * Redistribution and use in source and binary forms, with or without
255 | * modification, are permitted provided that the following conditions
256 | * are met:
257 | * 1. Redistributions of source code must retain the above copyright
258 | * notice, this list of conditions and the following disclaimer.
259 | * 2. Redistributions in binary form must reproduce the above copyright
260 | * notice, this list of conditions and the following disclaimer in the
261 | * documentation and/or other materials provided with the distribution.
262 | * 3. The name of the author may not be used to endorse or promote products
263 | * derived from this software without specific prior written permission.
264 | *
265 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
266 | * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
267 | * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
268 | * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
269 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
270 | * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
271 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
272 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
273 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
274 | * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
275 | */
276 |
277 | The levenshtein automata tables in core/src/java/org/apache/lucene/util/automaton
278 | were automatically generated with the moman/finenight FSA package.
279 | Here is the copyright for those sources:
280 |
281 | # Copyright (c) 2010, Jean-Philippe Barrette-LaPierre,
282 | #
283 | # Permission is hereby granted, free of charge, to any person
284 | # obtaining a copy of this software and associated documentation
285 | # files (the "Software"), to deal in the Software without
286 | # restriction, including without limitation the rights to use,
287 | # copy, modify, merge, publish, distribute, sublicense, and/or sell
288 | # copies of the Software, and to permit persons to whom the
289 | # Software is furnished to do so, subject to the following
290 | # conditions:
291 | #
292 | # The above copyright notice and this permission notice shall be
293 | # included in all copies or substantial portions of the Software.
294 | #
295 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
296 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
297 | # OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
298 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
299 | # HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
300 | # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
301 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
302 | # OTHER DEALINGS IN THE SOFTWARE.
303 |
304 | Some code in core/src/java/org/apache/lucene/util/UnicodeUtil.java was
305 | derived from ICU (http://www.icu-project.org)
306 | The full license is available here:
307 | http://source.icu-project.org/repos/icu/icu/trunk/license.html
308 |
309 | /*
310 | * Copyright (C) 1999-2010, International Business Machines
311 | * Corporation and others. All Rights Reserved.
312 | *
313 | * Permission is hereby granted, free of charge, to any person obtaining a copy
314 | * of this software and associated documentation files (the "Software"), to deal
315 | * in the Software without restriction, including without limitation the rights
316 | * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
317 | * Software, and to permit persons to whom the Software is furnished to do so,
318 | * provided that the above copyright notice(s) and this permission notice appear
319 | * in all copies of the Software and that both the above copyright notice(s) and
320 | * this permission notice appear in supporting documentation.
321 | *
322 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
323 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
324 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
325 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
326 | * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
327 | * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
328 | * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
329 | * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
330 | *
331 | * Except as contained in this notice, the name of a copyright holder shall not
332 | * be used in advertising or otherwise to promote the sale, use or other
333 | * dealings in this Software without prior written authorization of the
334 | * copyright holder.
335 | */
336 |
337 | The following license applies to the Snowball stemmers:
338 |
339 | Copyright (c) 2001, Dr Martin Porter
340 | Copyright (c) 2002, Richard Boulton
341 | All rights reserved.
342 |
343 | Redistribution and use in source and binary forms, with or without
344 | modification, are permitted provided that the following conditions are met:
345 |
346 | * Redistributions of source code must retain the above copyright notice,
347 | * this list of conditions and the following disclaimer.
348 | * Redistributions in binary form must reproduce the above copyright
349 | * notice, this list of conditions and the following disclaimer in the
350 | * documentation and/or other materials provided with the distribution.
351 | * Neither the name of the copyright holders nor the names of its contributors
352 | * may be used to endorse or promote products derived from this software
353 | * without specific prior written permission.
354 |
355 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
356 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
357 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
358 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
359 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
360 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
361 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
362 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
363 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
364 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
365 |
366 | The following license applies to the KStemmer:
367 |
368 | Copyright © 2003,
369 | Center for Intelligent Information Retrieval,
370 | University of Massachusetts, Amherst.
371 | All rights reserved.
372 |
373 | Redistribution and use in source and binary forms, with or without modification,
374 | are permitted provided that the following conditions are met:
375 |
376 | 1. Redistributions of source code must retain the above copyright notice, this
377 | list of conditions and the following disclaimer.
378 |
379 | 2. Redistributions in binary form must reproduce the above copyright notice,
380 | this list of conditions and the following disclaimer in the documentation
381 | and/or other materials provided with the distribution.
382 |
383 | 3. The names "Center for Intelligent Information Retrieval" and
384 | "University of Massachusetts" must not be used to endorse or promote products
385 | derived from this software without prior written permission. To obtain
386 | permission, contact info@ciir.cs.umass.edu.
387 |
388 | THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
389 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
390 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
391 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
392 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
393 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
394 | GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
395 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
396 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
397 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
398 | SUCH DAMAGE.
399 |
400 | The following license applies to the Morfologik project:
401 |
402 | Copyright (c) 2006 Dawid Weiss
403 | Copyright (c) 2007-2011 Dawid Weiss, Marcin Miłkowski
404 | All rights reserved.
405 |
406 | Redistribution and use in source and binary forms, with or without modification,
407 | are permitted provided that the following conditions are met:
408 |
409 | * Redistributions of source code must retain the above copyright notice,
410 | this list of conditions and the following disclaimer.
411 |
412 | * Redistributions in binary form must reproduce the above copyright notice,
413 | this list of conditions and the following disclaimer in the documentation
414 | and/or other materials provided with the distribution.
415 |
416 | * Neither the name of Morfologik nor the names of its contributors
417 | may be used to endorse or promote products derived from this software
418 | without specific prior written permission.
419 |
420 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
421 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
422 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
423 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
424 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
425 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
426 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
427 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
428 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
429 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
430 |
431 | ---
432 |
433 | The dictionary comes from Morfologik project. Morfologik uses data from
434 | Polish ispell/myspell dictionary hosted at http://www.sjp.pl/slownik/en/ and
435 | is licenced on the terms of (inter alia) LGPL and Creative Commons
436 | ShareAlike. The part-of-speech tags were added in Morfologik project and
437 | are not found in the data from sjp.pl. The tagset is similar to IPI PAN
438 | tagset.
439 |
440 | ---
441 |
442 | The following license applies to the Morfeusz project,
443 | used by org.apache.lucene.analysis.morfologik.
444 |
445 | BSD-licensed dictionary of Polish (SGJP)
446 | http://sgjp.pl/morfeusz/
447 |
448 | Copyright © 2011 Zygmunt Saloni, Włodzimierz Gruszczyński,
449 | Marcin Woliński, Robert Wołosz
450 |
451 | All rights reserved.
452 |
453 | Redistribution and use in source and binary forms, with or without
454 | modification, are permitted provided that the following conditions are
455 | met:
456 |
457 | 1. Redistributions of source code must retain the above copyright
458 | notice, this list of conditions and the following disclaimer.
459 |
460 | 2. Redistributions in binary form must reproduce the above copyright
461 | notice, this list of conditions and the following disclaimer in the
462 | documentation and/or other materials provided with the
463 | distribution.
464 |
465 | THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDERS “AS IS” AND ANY EXPRESS
466 | OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
467 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
468 | DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS OR CONTRIBUTORS BE
469 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
470 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
471 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
472 | BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
473 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
474 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
475 | IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
476 |
--------------------------------------------------------------------------------
/licenses/lucene-NOTICE.txt:
--------------------------------------------------------------------------------
1 | Apache Lucene
2 | Copyright 2014 The Apache Software Foundation
3 |
4 | This product includes software developed at
5 | The Apache Software Foundation (http://www.apache.org/).
6 |
7 | Includes software from other Apache Software Foundation projects,
8 | including, but not limited to:
9 | - Apache Ant
10 | - Apache Jakarta Regexp
11 | - Apache Commons
12 | - Apache Xerces
13 |
14 | ICU4J, (under analysis/icu) is licensed under an MIT styles license
15 | and Copyright (c) 1995-2008 International Business Machines Corporation and others
16 |
17 | Some data files (under analysis/icu/src/data) are derived from Unicode data such
18 | as the Unicode Character Database. See http://unicode.org/copyright.html for more
19 | details.
20 |
21 | Brics Automaton (under core/src/java/org/apache/lucene/util/automaton) is
22 | BSD-licensed, created by Anders Møller. See http://www.brics.dk/automaton/
23 |
24 | The levenshtein automata tables (under core/src/java/org/apache/lucene/util/automaton) were
25 | automatically generated with the moman/finenight FSA library, created by
26 | Jean-Philippe Barrette-LaPierre. This library is available under an MIT license,
27 | see http://sites.google.com/site/rrettesite/moman and
28 | http://bitbucket.org/jpbarrette/moman/overview/
29 |
30 | The class org.apache.lucene.util.WeakIdentityMap was derived from
31 | the Apache CXF project and is Apache License 2.0.
32 |
33 | The Google Code Prettify is Apache License 2.0.
34 | See http://code.google.com/p/google-code-prettify/
35 |
36 | JUnit (junit-4.10) is licensed under the Common Public License v. 1.0
37 | See http://junit.sourceforge.net/cpl-v10.html
38 |
39 | This product includes code (JaspellTernarySearchTrie) from Java Spelling Checkin
40 | g Package (jaspell): http://jaspell.sourceforge.net/
41 | License: The BSD License (http://www.opensource.org/licenses/bsd-license.php)
42 |
43 | The snowball stemmers in
44 | analysis/common/src/java/net/sf/snowball
45 | were developed by Martin Porter and Richard Boulton.
46 | The snowball stopword lists in
47 | analysis/common/src/resources/org/apache/lucene/analysis/snowball
48 | were developed by Martin Porter and Richard Boulton.
49 | The full snowball package is available from
50 | http://snowball.tartarus.org/
51 |
52 | The KStem stemmer in
53 | analysis/common/src/org/apache/lucene/analysis/en
54 | was developed by Bob Krovetz and Sergio Guzman-Lara (CIIR-UMass Amherst)
55 | under the BSD-license.
56 |
57 | The Arabic,Persian,Romanian,Bulgarian, and Hindi analyzers (common) come with a default
58 | stopword list that is BSD-licensed created by Jacques Savoy. These files reside in:
59 | analysis/common/src/resources/org/apache/lucene/analysis/ar/stopwords.txt,
60 | analysis/common/src/resources/org/apache/lucene/analysis/fa/stopwords.txt,
61 | analysis/common/src/resources/org/apache/lucene/analysis/ro/stopwords.txt,
62 | analysis/common/src/resources/org/apache/lucene/analysis/bg/stopwords.txt,
63 | analysis/common/src/resources/org/apache/lucene/analysis/hi/stopwords.txt
64 | See http://members.unine.ch/jacques.savoy/clef/index.html.
65 |
66 | The German,Spanish,Finnish,French,Hungarian,Italian,Portuguese,Russian and Swedish light stemmers
67 | (common) are based on BSD-licensed reference implementations created by Jacques Savoy and
68 | Ljiljana Dolamic. These files reside in:
69 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanLightStemmer.java
70 | analysis/common/src/java/org/apache/lucene/analysis/de/GermanMinimalStemmer.java
71 | analysis/common/src/java/org/apache/lucene/analysis/es/SpanishLightStemmer.java
72 | analysis/common/src/java/org/apache/lucene/analysis/fi/FinnishLightStemmer.java
73 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchLightStemmer.java
74 | analysis/common/src/java/org/apache/lucene/analysis/fr/FrenchMinimalStemmer.java
75 | analysis/common/src/java/org/apache/lucene/analysis/hu/HungarianLightStemmer.java
76 | analysis/common/src/java/org/apache/lucene/analysis/it/ItalianLightStemmer.java
77 | analysis/common/src/java/org/apache/lucene/analysis/pt/PortugueseLightStemmer.java
78 | analysis/common/src/java/org/apache/lucene/analysis/ru/RussianLightStemmer.java
79 | analysis/common/src/java/org/apache/lucene/analysis/sv/SwedishLightStemmer.java
80 |
81 | The Stempel analyzer (stempel) includes BSD-licensed software developed
82 | by the Egothor project http://egothor.sf.net/, created by Leo Galambos, Martin Kvapil,
83 | and Edmond Nolan.
84 |
85 | The Polish analyzer (stempel) comes with a default
86 | stopword list that is BSD-licensed created by the Carrot2 project. The file resides
87 | in stempel/src/resources/org/apache/lucene/analysis/pl/stopwords.txt.
88 | See http://project.carrot2.org/license.html.
89 |
90 | The SmartChineseAnalyzer source code (smartcn) was
91 | provided by Xiaoping Gao and copyright 2009 by www.imdict.net.
92 |
93 | WordBreakTestUnicode_*.java (under modules/analysis/common/src/test/)
94 | is derived from Unicode data such as the Unicode Character Database.
95 | See http://unicode.org/copyright.html for more details.
96 |
97 | The Morfologik analyzer (morfologik) includes BSD-licensed software
98 | developed by Dawid Weiss and Marcin Miłkowski (http://morfologik.blogspot.com/).
99 |
100 | Morfologik uses data from Polish ispell/myspell dictionary
101 | (http://www.sjp.pl/slownik/en/) licenced on the terms of (inter alia)
102 | LGPL and Creative Commons ShareAlike.
103 |
104 | Morfologic includes data from BSD-licensed dictionary of Polish (SGJP)
105 | (http://sgjp.pl/morfeusz/)
106 |
107 | Servlet-api.jar and javax.servlet-*.jar are under the CDDL license, the original
108 | source code for this can be found at http://www.eclipse.org/jetty/downloads.php
109 |
110 | ===========================================================================
111 | Kuromoji Japanese Morphological Analyzer - Apache Lucene Integration
112 | ===========================================================================
113 |
114 | This software includes a binary and/or source version of data from
115 |
116 | mecab-ipadic-2.7.0-20070801
117 |
118 | which can be obtained from
119 |
120 | http://atilika.com/releases/mecab-ipadic/mecab-ipadic-2.7.0-20070801.tar.gz
121 |
122 | or
123 |
124 | http://jaist.dl.sourceforge.net/project/mecab/mecab-ipadic/2.7.0-20070801/mecab-ipadic-2.7.0-20070801.tar.gz
125 |
126 | ===========================================================================
127 | mecab-ipadic-2.7.0-20070801 Notice
128 | ===========================================================================
129 |
130 | Nara Institute of Science and Technology (NAIST),
131 | the copyright holders, disclaims all warranties with regard to this
132 | software, including all implied warranties of merchantability and
133 | fitness, in no event shall NAIST be liable for
134 | any special, indirect or consequential damages or any damages
135 | whatsoever resulting from loss of use, data or profits, whether in an
136 | action of contract, negligence or other tortuous action, arising out
137 | of or in connection with the use or performance of this software.
138 |
139 | A large portion of the dictionary entries
140 | originate from ICOT Free Software. The following conditions for ICOT
141 | Free Software applies to the current dictionary as well.
142 |
143 | Each User may also freely distribute the Program, whether in its
144 | original form or modified, to any third party or parties, PROVIDED
145 | that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
146 | on, or be attached to, the Program, which is distributed substantially
147 | in the same form as set out herein and that such intended
148 | distribution, if actually made, will neither violate or otherwise
149 | contravene any of the laws and regulations of the countries having
150 | jurisdiction over the User or the intended distribution itself.
151 |
152 | NO WARRANTY
153 |
154 | The program was produced on an experimental basis in the course of the
155 | research and development conducted during the project and is provided
156 | to users as so produced on an experimental basis. Accordingly, the
157 | program is provided without any warranty whatsoever, whether express,
158 | implied, statutory or otherwise. The term "warranty" used herein
159 | includes, but is not limited to, any warranty of the quality,
160 | performance, merchantability and fitness for a particular purpose of
161 | the program and the nonexistence of any infringement or violation of
162 | any right of any third party.
163 |
164 | Each user of the program will agree and understand, and be deemed to
165 | have agreed and understood, that there is no warranty whatsoever for
166 | the program and, accordingly, the entire risk arising from or
167 | otherwise connected with the program is assumed by the user.
168 |
169 | Therefore, neither ICOT, the copyright holder, or any other
170 | organization that participated in or was otherwise related to the
171 | development of the program and their respective officials, directors,
172 | officers and other employees shall be held liable for any and all
173 | damages, including, without limitation, general, special, incidental
174 | and consequential damages, arising out of or otherwise in connection
175 | with the use or inability to use the program or any product, material
176 | or result produced or otherwise obtained by using the program,
177 | regardless of whether they have been advised of, or otherwise had
178 | knowledge of, the possibility of such damages at any time during the
179 | project or thereafter. Each user will be deemed to have agreed to the
180 | foregoing by his or her commencement of use of the program. The term
181 | "use" as used herein includes, but is not limited to, the use,
182 | modification, copying and distribution of the program and the
183 | production of secondary products from the program.
184 |
185 | In the case where the program, whether in its original form or
186 | modified, was distributed or delivered to or received by a user from
187 | any person, organization or entity other than ICOT, unless it makes or
188 | grants independently of ICOT any specific warranty to the user in
189 | writing, such person, organization or entity, will also be exempted
190 | from and not be held liable to the user for any such damages as noted
191 | above as far as the program is concerned.
192 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 | elasticsearch-analysis-ik
6 | 4.0.0
7 | org.elasticsearch
8 | elasticsearch-analysis-ik
9 | ${elasticsearch.version}
10 | jar
11 | IK Analyzer for Elasticsearch
12 | 2011
13 |
14 |
15 | 6.5.4
16 | 1.8
17 | ${project.basedir}/src/main/assemblies/plugin.xml
18 | analysis-ik
19 | org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin
20 | true
21 | false
22 | true
23 | 4E899B30
24 | true
25 |
26 |
27 |
28 |
29 | The Apache Software License, Version 2.0
30 | http://www.apache.org/licenses/LICENSE-2.0.txt
31 | repo
32 |
33 |
34 |
35 |
36 |
37 | Medcl
38 | medcl@elastic.co
39 | elastic
40 | http://www.elastic.co
41 |
42 |
43 |
44 |
45 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git
46 | scm:git:git@github.com:medcl/elasticsearch-analysis-ik.git
47 |
48 | http://github.com/medcl/elasticsearch-analysis-ik
49 |
50 |
51 |
52 | org.sonatype.oss
53 | oss-parent
54 | 9
55 |
56 |
57 |
58 |
59 | oss.sonatype.org
60 | https://oss.sonatype.org/content/repositories/snapshots
61 |
62 |
63 | oss.sonatype.org
64 | https://oss.sonatype.org/service/local/staging/deploy/maven2/
65 |
66 |
67 |
68 |
69 |
70 | oss.sonatype.org
71 | OSS Sonatype
72 | true
73 | true
74 | http://oss.sonatype.org/content/repositories/releases/
75 |
76 |
77 |
78 |
79 |
80 | org.elasticsearch
81 | elasticsearch
82 | ${elasticsearch.version}
83 | compile
84 |
85 |
86 |
87 |
88 | org.apache.httpcomponents
89 | httpclient
90 | 4.5.2
91 |
92 |
93 |
94 | org.apache.logging.log4j
95 | log4j-api
96 | 2.3
97 |
98 |
99 |
100 | org.hamcrest
101 | hamcrest-core
102 | 1.3
103 | test
104 |
105 |
106 |
107 | org.hamcrest
108 | hamcrest-library
109 | 1.3
110 | test
111 |
112 |
113 | junit
114 | junit
115 | 4.12
116 | test
117 |
118 |
119 |
120 |
121 |
122 |
123 | org.apache.maven.plugins
124 | maven-compiler-plugin
125 | 3.5.1
126 |
127 | ${maven.compiler.target}
128 | ${maven.compiler.target}
129 |
130 |
131 |
132 | org.apache.maven.plugins
133 | maven-surefire-plugin
134 | 2.11
135 |
136 |
137 | **/*Tests.java
138 |
139 |
140 |
141 |
142 | org.apache.maven.plugins
143 | maven-source-plugin
144 | 2.1.2
145 |
146 |
147 | attach-sources
148 |
149 | jar
150 |
151 |
152 |
153 |
154 |
155 | maven-assembly-plugin
156 |
157 |
158 | false
159 | ${project.build.directory}/releases/
160 |
161 | ${basedir}/src/main/assemblies/plugin.xml
162 |
163 |
164 |
165 | fully.qualified.MainClass
166 |
167 |
168 |
169 |
170 |
171 | package
172 |
173 | single
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 | disable-java8-doclint
183 |
184 | [1.8,)
185 |
186 |
187 | -Xdoclint:none
188 |
189 |
190 |
191 | release
192 |
193 |
194 |
195 | org.sonatype.plugins
196 | nexus-staging-maven-plugin
197 | 1.6.3
198 | true
199 |
200 | oss
201 | https://oss.sonatype.org/
202 | true
203 |
204 |
205 |
206 | org.apache.maven.plugins
207 | maven-release-plugin
208 | 2.1
209 |
210 | true
211 | false
212 | release
213 | deploy
214 |
215 |
216 |
217 | org.apache.maven.plugins
218 | maven-compiler-plugin
219 | 3.5.1
220 |
221 | ${maven.compiler.target}
222 | ${maven.compiler.target}
223 |
224 |
225 |
226 | org.apache.maven.plugins
227 | maven-gpg-plugin
228 | 1.5
229 |
230 |
231 | sign-artifacts
232 | verify
233 |
234 | sign
235 |
236 |
237 |
238 |
239 |
240 | org.apache.maven.plugins
241 | maven-source-plugin
242 | 2.2.1
243 |
244 |
245 | attach-sources
246 |
247 | jar-no-fork
248 |
249 |
250 |
251 |
252 |
253 | org.apache.maven.plugins
254 | maven-javadoc-plugin
255 | 2.9
256 |
257 |
258 | attach-javadocs
259 |
260 | jar
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
--------------------------------------------------------------------------------
/src/main/assemblies/plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | analysis-ik-release
4 |
5 | zip
6 |
7 | false
8 |
9 |
10 | ${project.basedir}/config
11 | config
12 |
13 |
14 |
15 |
16 |
17 | ${project.basedir}/src/main/resources/plugin-descriptor.properties
18 |
19 | true
20 |
21 |
22 | ${project.basedir}/src/main/resources/plugin-security.policy
23 |
24 | true
25 |
26 |
27 |
28 |
29 |
30 | true
31 | true
32 |
33 | org.elasticsearch:elasticsearch
34 |
35 |
36 |
37 |
38 | true
39 | true
40 |
41 | org.apache.httpcomponents:httpclient
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/IkAnalyzerProvider.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.elasticsearch.common.settings.Settings;
4 | import org.elasticsearch.env.Environment;
5 | import org.elasticsearch.index.IndexSettings;
6 | import org.wltea.analyzer.cfg.Configuration;
7 | import org.wltea.analyzer.lucene.IKAnalyzer;
8 |
9 | public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider {
10 | private final IKAnalyzer analyzer;
11 |
12 | public IkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings,boolean useSmart) {
13 | super(indexSettings, name, settings);
14 |
15 | Configuration configuration=new Configuration(env,settings).setUseSmart(useSmart);
16 | analyzer=new IKAnalyzer(configuration);
17 | }
18 |
19 | public static IkAnalyzerProvider getIkSmartAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
20 | return new IkAnalyzerProvider(indexSettings,env,name,settings,true);
21 | }
22 |
23 | public static IkAnalyzerProvider getIkAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
24 | return new IkAnalyzerProvider(indexSettings,env,name,settings,false);
25 | }
26 |
27 | @Override public IKAnalyzer get() {
28 | return this.analyzer;
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/index/analysis/IkTokenizerFactory.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.index.analysis;
2 |
3 | import org.apache.lucene.analysis.Tokenizer;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.wltea.analyzer.cfg.Configuration;
8 | import org.wltea.analyzer.lucene.IKTokenizer;
9 |
10 | public class IkTokenizerFactory extends AbstractTokenizerFactory {
11 | private Configuration configuration;
12 |
13 | public IkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
14 | super(indexSettings, name, settings);
15 | configuration=new Configuration(env,settings);
16 | }
17 |
18 | public static IkTokenizerFactory getIkTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
19 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(false);
20 | }
21 |
22 | public static IkTokenizerFactory getIkSmartTokenizerFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
23 | return new IkTokenizerFactory(indexSettings,env, name, settings).setSmart(true);
24 | }
25 |
26 | public IkTokenizerFactory setSmart(boolean smart){
27 | this.configuration.setUseSmart(smart);
28 | return this;
29 | }
30 |
31 | @Override
32 | public Tokenizer create() {
33 | return new IKTokenizer(configuration); }
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/elasticsearch/plugin/analysis/ik/AnalysisIkPlugin.java:
--------------------------------------------------------------------------------
1 | package org.elasticsearch.plugin.analysis.ik;
2 |
3 | import org.apache.lucene.analysis.Analyzer;
4 | import org.elasticsearch.index.analysis.AnalyzerProvider;
5 | import org.elasticsearch.index.analysis.IkAnalyzerProvider;
6 | import org.elasticsearch.index.analysis.IkTokenizerFactory;
7 | import org.elasticsearch.index.analysis.TokenizerFactory;
8 | import org.elasticsearch.indices.analysis.AnalysisModule;
9 | import org.elasticsearch.plugins.AnalysisPlugin;
10 | import org.elasticsearch.plugins.Plugin;
11 |
12 | import java.util.HashMap;
13 | import java.util.Map;
14 |
15 |
16 | public class AnalysisIkPlugin extends Plugin implements AnalysisPlugin {
17 |
18 | public static String PLUGIN_NAME = "analysis-ik";
19 |
20 | @Override
21 | public Map> getTokenizers() {
22 | Map> extra = new HashMap<>();
23 |
24 |
25 | extra.put("ik_smart", IkTokenizerFactory::getIkSmartTokenizerFactory);
26 | extra.put("ik_max_word", IkTokenizerFactory::getIkTokenizerFactory);
27 |
28 | return extra;
29 | }
30 |
31 | @Override
32 | public Map>> getAnalyzers() {
33 | Map>> extra = new HashMap<>();
34 |
35 | extra.put("ik_smart", IkAnalyzerProvider::getIkSmartAnalyzerProvider);
36 | extra.put("ik_max_word", IkAnalyzerProvider::getIkAnalyzerProvider);
37 |
38 | return extra;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package org.wltea.analyzer.cfg;
5 |
6 | import org.elasticsearch.common.inject.Inject;
7 | import org.elasticsearch.common.io.PathUtils;
8 | import org.elasticsearch.common.settings.Settings;
9 | import org.elasticsearch.env.Environment;
10 | import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
11 | import org.wltea.analyzer.dic.DicFile;
12 |
13 | import java.io.File;
14 | import java.io.UnsupportedEncodingException;
15 | import java.nio.file.Path;
16 | import java.security.MessageDigest;
17 | import java.security.NoSuchAlgorithmException;
18 | import java.util.ArrayList;
19 | import java.util.List;
20 |
21 | public class Configuration {
22 |
23 |
24 | private static final String PATH_DIC_MAIN = "main.dic";
25 | private static final String PATH_DIC_SURNAME = "surname.dic";
26 | private static final String PATH_DIC_QUANTIFIER = "quantifier.dic";
27 | private static final String PATH_DIC_SUFFIX = "suffix.dic";
28 | private static final String PATH_DIC_PREP = "preposition.dic";
29 | private static final String PATH_DIC_STOP = "stopword.dic";
30 | // 要使用的词典文件
31 | private List dicFiles = new ArrayList<>();
32 |
33 | //是否启用智能分词
34 | private boolean useSmart;
35 |
36 | //是否启用远程词典加载
37 | private boolean enableRemoteDict=false;
38 |
39 | //是否启用小写处理
40 | private boolean enableLowercase=true;
41 | // 用于读取插件绝对路径下文件
42 | private String absolutePath;
43 |
44 | /**
45 | * settings是分词器定义时的配置信息
46 | * */
47 | @Inject
48 | public Configuration(Environment env,Settings settings) {
49 | this.absolutePath = env.configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME).toAbsolutePath().toString();
50 | // this.absolutePath = "C:\\Users\\jm005113\\Desktop\\workspace\\elasticsearch-analysis-ik\\config";
51 | this.useSmart = settings.get("use_smart", "false").equals("true");
52 | this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
53 | this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
54 |
55 | // 以下部分为初始化分词器配置的词典文件
56 | // 基础整词(必选词典文件)
57 | DicFile mainDic = new DicFile(absolutePath);
58 | mainDic.setDicName("main");
59 | mainDic.setDicPath(PATH_DIC_MAIN);
60 | mainDic.setRemote(false);
61 | mainDic.setDictType(DicFile.DictType.INTACT_WORDS);
62 | this.dicFiles.add(mainDic);
63 |
64 | // 基础量词(必选词典文件)
65 | DicFile quantifierDic = new DicFile(absolutePath);
66 | quantifierDic.setDicName("quantifier");
67 | quantifierDic.setDicPath(PATH_DIC_QUANTIFIER);
68 | quantifierDic.setRemote(false);
69 | quantifierDic.setDictType(DicFile.DictType.QUANTIFIER);
70 | this.dicFiles.add(quantifierDic);
71 |
72 | // 基础停词(必选词典文件)
73 | DicFile stopwordsDic = new DicFile(absolutePath);
74 | stopwordsDic.setDicName("stopwords");
75 | stopwordsDic.setDicPath(PATH_DIC_STOP);
76 | stopwordsDic.setRemote(false);
77 | stopwordsDic.setDictType(DicFile.DictType.STOPWORDS);
78 | this.dicFiles.add(stopwordsDic);
79 |
80 | // 基础前缀词(必选词典文件)
81 | DicFile suffixDic = new DicFile(absolutePath);
82 | suffixDic.setDicName("suffix");
83 | suffixDic.setDicPath(PATH_DIC_SUFFIX);
84 | suffixDic.setRemote(false);
85 | suffixDic.setDictType(DicFile.DictType.SUFFIX);
86 | this.dicFiles.add(suffixDic);
87 |
88 | // 基础前姓氏(必选词典文件)
89 | DicFile surnameDic = new DicFile(absolutePath);
90 | surnameDic.setDicName("surname");
91 | surnameDic.setDicPath(PATH_DIC_SURNAME);
92 | surnameDic.setRemote(false);
93 | surnameDic.setDictType(DicFile.DictType.SURNAME);
94 | this.dicFiles.add(surnameDic);
95 |
96 | // 配置用户设置的词典文件
97 | List mainDics = settings.getAsList("ext_dic_main");
98 | if(mainDics != null && mainDics.size() > 0 ){
99 | mainDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.INTACT_WORDS)));
100 | }
101 | // 配置用户设置的词典文件
102 | List stopDics = settings.getAsList("ext_dic_stop");
103 | if(stopDics != null && stopDics.size() > 0 ){
104 | stopDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.STOPWORDS)));
105 | }
106 | // 配置用户设置的词典文件
107 | List quantifierDics = settings.getAsList("ext_dic_quantifier");
108 | if(quantifierDics != null && quantifierDics.size() > 0 ){
109 | quantifierDics.forEach(dicFileStr -> this.dicFiles.add(str2DicFile(absolutePath, dicFileStr).setDictType(DicFile.DictType.QUANTIFIER)));
110 | }
111 | }
112 |
113 | /**
114 | * 解析配置好的词典文件,示例:#dicName$extra#dicPath$extra_test.dic#isRemote$false
115 | * 解析说明:#为key的开始,$是value的开始
116 | * */
117 | private static DicFile str2DicFile(String absolutePath, String dicPath){
118 | DicFile dicFile = new DicFile(absolutePath);
119 | dicFile.setRemote(dicPath.startsWith("http:") || dicPath.startsWith("https:") || dicPath.startsWith("ftp:"));
120 | dicFile.setDicName(getMD5(dicPath));
121 | dicFile.setDicPath(dicPath);
122 | return dicFile;
123 | }
124 |
125 | public static String getMD5(String string) {
126 | byte[] hash;
127 | try {
128 | //创建一个MD5算法对象,并获得MD5字节数组,16*8=128位
129 | hash = MessageDigest.getInstance("MD5").digest(string.getBytes("UTF-8"));
130 | } catch (NoSuchAlgorithmException e) {
131 | throw new RuntimeException("Huh, MD5 should be supported?", e);
132 | } catch (UnsupportedEncodingException e) {
133 | throw new RuntimeException("Huh, UTF-8 should be supported?", e);
134 | }
135 |
136 | //转换为十六进制字符串
137 | StringBuilder hex = new StringBuilder(hash.length * 2);
138 | for (byte b : hash) {
139 | if ((b & 0xFF) < 0x10) hex.append("0");
140 | hex.append(Integer.toHexString(b & 0xFF));
141 | }
142 | return hex.toString().toLowerCase();
143 | }
144 |
145 | public Path getConfigInPluginDir() {
146 | return PathUtils
147 | .get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
148 | .getParent(), "config")
149 | .toAbsolutePath();
150 | }
151 |
152 | public boolean isUseSmart() {
153 | return useSmart;
154 | }
155 |
156 | public Configuration setUseSmart(boolean useSmart) {
157 | this.useSmart = useSmart;
158 | return this;
159 | }
160 |
161 | public boolean isEnableRemoteDict() {
162 | return enableRemoteDict;
163 | }
164 |
165 | public boolean isEnableLowercase() {
166 | return enableLowercase;
167 | }
168 |
169 | public List getDicFiles() {
170 | return dicFiles;
171 | }
172 |
173 | public void addDic(List dicFiles) {
174 | this.dicFiles.addAll(dicFiles);
175 | }
176 | }
177 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.io.IOException;
28 | import java.io.Reader;
29 | import java.util.*;
30 | import java.util.stream.Collectors;
31 |
32 | import org.wltea.analyzer.cfg.Configuration;
33 | import org.wltea.analyzer.dic.DicFile;
34 | import org.wltea.analyzer.dic.Dictionary;
35 |
36 | /**
37 | *
38 | * 分词器上下文状态
39 | *
40 | */
41 | public class AnalyzeContext {
42 |
43 | //默认缓冲区大小
44 | private static final int BUFF_SIZE = 4096;
45 | //缓冲区耗尽的临界值
46 | private static final int BUFF_EXHAUST_CRITICAL = 100;
47 |
48 |
49 | //字符串读取缓冲
50 | private char[] segmentBuff;
51 | //字符类型数组
52 | private int[] charTypes;
53 |
54 |
55 | //记录Reader内已分析的字串总长度
56 | //在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移
57 | private int buffOffset;
58 | //当前缓冲区位置指针
59 | private int cursor;
60 | //最近一次读入的,可处理的字串长度
61 | private int available;
62 |
63 |
64 | //子分词器锁
65 | //该集合非空,说明有子分词器在占用segmentBuff
66 | private Set buffLocker;
67 |
68 | //原始分词结果集合,未经歧义处理
69 | private QuickSortSet orgLexemes;
70 | //LexemePath位置索引表
71 | private Map pathMap;
72 | //最终分词结果集
73 | private LinkedList results;
74 | // 是否大小写过滤
75 | private boolean isEnableLowerCase;
76 | private boolean isUseSmart;
77 | private List mainDicNames;
78 | private List quantifierNames;
79 | private List stopwordDicFile;
80 |
81 | public AnalyzeContext(Configuration configuration){
82 | this.isUseSmart = configuration.isUseSmart();
83 | this.isEnableLowerCase = configuration.isEnableLowercase();
84 | this.segmentBuff = new char[BUFF_SIZE];
85 | this.charTypes = new int[BUFF_SIZE];
86 | this.buffLocker = new HashSet();
87 | this.orgLexemes = new QuickSortSet();
88 | this.pathMap = new HashMap();
89 | this.results = new LinkedList();
90 | this.mainDicNames = new ArrayList<>();
91 | this.quantifierNames = new ArrayList<>();
92 | this.stopwordDicFile = new ArrayList<>();
93 | // 将定义分词器时的词典文件列表设置到分词器的context中,在分词时,从context中读取词典列表,
94 | // 好实现不同IK分词器使用不同词典的逻辑
95 | configuration.getDicFiles().forEach(dicFile -> {
96 | if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){
97 | mainDicNames.add(dicFile.getDicName());
98 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){
99 | quantifierNames.add(dicFile.getDicName());
100 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){
101 | stopwordDicFile.add(dicFile.getDicName());
102 | }
103 | });
104 | mainDicNames = mainDicNames.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
105 | quantifierNames = quantifierNames.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
106 | stopwordDicFile = stopwordDicFile.stream().sorted((o1, o2) -> -1 ).collect(Collectors.toList());
107 | }
108 |
109 | public int getCursor(){
110 | return this.cursor;
111 | }
112 |
113 | public char[] getSegmentBuff(){
114 | return this.segmentBuff;
115 | }
116 |
117 | public char getCurrentChar(){
118 | return this.segmentBuff[this.cursor];
119 | }
120 |
121 | public int getCurrentCharType(){
122 | return this.charTypes[this.cursor];
123 | }
124 |
125 | public int getBufferOffset(){
126 | return this.buffOffset;
127 | }
128 |
129 | public List getMainDicNames() {
130 | return mainDicNames;
131 | }
132 |
133 | public List getQuantifierNames() {
134 | return quantifierNames;
135 | }
136 |
137 | /**
138 | * 根据context的上下文情况,填充segmentBuff
139 | * @param reader
140 | * @return 返回待分析的(有效的)字串长度
141 | * @throws java.io.IOException
142 | */
143 | public int fillBuffer(Reader reader) throws IOException{
144 | int readCount = 0;
145 | if(this.buffOffset == 0){
146 | //首次读取reader
147 | readCount = reader.read(segmentBuff);
148 | }else{
149 | int offset = this.available - this.cursor;
150 | if(offset > 0){
151 | //最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部
152 | System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
153 | readCount = offset;
154 | }
155 | //继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分
156 | readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
157 | }
158 | //记录最后一次从Reader中读入的可用字符长度
159 | this.available = readCount;
160 | //重置当前指针
161 | this.cursor = 0;
162 | return readCount;
163 | }
164 |
165 | /**
166 | * 初始化buff指针,处理第一个字符
167 | */
168 | public void initCursor(){
169 | this.cursor = 0;
170 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],this.isEnableLowerCase);
171 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
172 | }
173 |
174 | /**
175 | * 指针+1
176 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false
177 | * 并处理当前字符
178 | */
179 | public boolean moveCursor(){
180 | if(this.cursor < this.available - 1){
181 | this.cursor++;
182 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],this.isEnableLowerCase);
183 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
184 | return true;
185 | }else{
186 | return false;
187 | }
188 | }
189 |
190 | /**
191 | * 设置当前segmentBuff为锁定状态
192 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff
193 | * @param segmenterName
194 | */
195 | public void lockBuffer(String segmenterName){
196 | this.buffLocker.add(segmenterName);
197 | }
198 |
199 | /**
200 | * 移除指定的子分词器名,释放对segmentBuff的占用
201 | * @param segmenterName
202 | */
203 | public void unlockBuffer(String segmenterName){
204 | this.buffLocker.remove(segmenterName);
205 | }
206 |
207 | /**
208 | * 只要buffLocker中存在segmenterName
209 | * 则buffer被锁定
210 | * @return boolean 缓冲去是否被锁定
211 | */
212 | boolean isBufferLocked(){
213 | return this.buffLocker.size() > 0;
214 | }
215 |
216 | /**
217 | * 判断当前segmentBuff是否已经用完
218 | * 当前执针cursor移至segmentBuff末端this.available - 1
219 | * @return
220 | */
221 | public boolean isBufferConsumed(){
222 | return this.cursor == this.available - 1;
223 | }
224 |
225 | /**
226 | * 判断segmentBuff是否需要读取新数据
227 | *
228 | * 满足一下条件时,
229 | * 1.available == BUFF_SIZE 表示buffer满载
230 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内
231 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer
232 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作)
233 | * @return
234 | */
235 | boolean needRefillBuffer(){
236 | return this.available == BUFF_SIZE
237 | && this.cursor < this.available - 1
238 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL
239 | && !this.isBufferLocked();
240 | }
241 |
242 | /**
243 | * 累计当前的segmentBuff相对于reader起始位置的位移
244 | */
245 | void markBufferOffset(){
246 | this.buffOffset += this.cursor;
247 | }
248 |
249 | /**
250 | * 向分词结果集添加词元
251 | * @param lexeme
252 | */
253 | public void addLexeme(Lexeme lexeme){
254 | this.orgLexemes.addLexeme(lexeme);
255 | }
256 |
257 | /**
258 | * 添加分词结果路径
259 | * 路径起始位置 ---> 路径 映射表
260 | * @param path
261 | */
262 | public void addLexemePath(LexemePath path){
263 | if(path != null){
264 | this.pathMap.put(path.getPathBegin(), path);
265 | }
266 | }
267 |
268 |
269 | /**
270 | * 返回原始分词结果
271 | * @return
272 | */
273 | public QuickSortSet getOrgLexemes(){
274 | return this.orgLexemes;
275 | }
276 |
277 | /**
278 | * 推送分词结果到结果集合
279 | * 1.从buff头部遍历到this.cursor已处理位置
280 | * 2.将map中存在的分词结果推入results
281 | * 3.将map中不存在的CJDK字符以单字方式推入results
282 | */
283 | public void outputToResult(){
284 | int index = 0;
285 | for( ; index <= this.cursor ;){
286 | //跳过非CJK字符
287 | if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
288 | index++;
289 | continue;
290 | }
291 | //从pathMap找出对应index位置的LexemePath
292 | LexemePath path = this.pathMap.get(index);
293 | if(path != null){
294 | //输出LexemePath中的lexeme到results集合
295 | Lexeme l = path.pollFirst();
296 | while(l != null){
297 | this.results.add(l);
298 | //字典中无单字,但是词元冲突了,切分出相交词元的前一个词元中的单字
299 | /*int innerIndex = index + 1;
300 | for (; innerIndex < index + l.getLength(); innerIndex++) {
301 | Lexeme innerL = path.peekFirst();
302 | if (innerL != null && innerIndex == innerL.getBegin()) {
303 | this.outputSingleCJK(innerIndex - 1);
304 | }
305 | }*/
306 |
307 | //将index移至lexeme后
308 | index = l.getBegin() + l.getLength();
309 | l = path.pollFirst();
310 | if(l != null){
311 | //输出path内部,词元间遗漏的单字
312 | for(;index < l.getBegin();index++){
313 | this.outputSingleCJK(index);
314 | }
315 | }
316 | }
317 | }else{//pathMap中找不到index对应的LexemePath
318 | //单字输出
319 | this.outputSingleCJK(index);
320 | index++;
321 | }
322 | }
323 | //清空当前的Map
324 | this.pathMap.clear();
325 | }
326 |
327 | /**
328 | * 对CJK字符进行单字输出
329 | * @param index
330 | */
331 | private void outputSingleCJK(int index){
332 | if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){
333 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
334 | this.results.add(singleCharLexeme);
335 | }else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
336 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
337 | this.results.add(singleCharLexeme);
338 | }
339 | }
340 |
341 | /**
342 | * 返回lexeme
343 | *
344 | * 同时处理合并
345 | * @return
346 | */
347 | Lexeme getNextLexeme(){
348 | //从结果集取出,并移除第一个Lexme
349 | Lexeme result = this.results.pollFirst();
350 | while(result != null){
351 | //数量词合并
352 | this.compound(result);
353 | if(Dictionary.getSingleton().isStopWord(this.stopwordDicFile, this.segmentBuff , result.getBegin() , result.getLength())){
354 | //是停止词继续取列表的下一个
355 | result = this.results.pollFirst();
356 | }else{
357 | //不是停止词, 生成lexeme的词元文本,输出
358 | result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
359 | break;
360 | }
361 | }
362 | return result;
363 | }
364 |
365 | /**
366 | * 重置分词上下文状态
367 | */
368 | void reset(){
369 | this.buffLocker.clear();
370 | this.orgLexemes = new QuickSortSet();
371 | this.available =0;
372 | this.buffOffset = 0;
373 | this.charTypes = new int[BUFF_SIZE];
374 | this.cursor = 0;
375 | this.results.clear();
376 | this.segmentBuff = new char[BUFF_SIZE];
377 | this.pathMap.clear();
378 | }
379 |
380 | /**
381 | * 组合词元
382 | */
383 | private void compound(Lexeme result){
384 |
385 | if(!this.isUseSmart){
386 | return ;
387 | }
388 | //数量词合并处理
389 | if(!this.results.isEmpty()){
390 |
391 | if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
392 | Lexeme nextLexeme = this.results.peekFirst();
393 | boolean appendOk = false;
394 | if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
395 | //合并英文数词+中文数词
396 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
397 | }else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
398 | //合并英文数词+中文量词
399 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
400 | }
401 | if(appendOk){
402 | //弹出
403 | this.results.pollFirst();
404 | }
405 | }
406 |
407 | //可能存在第二轮合并
408 | if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
409 | Lexeme nextLexeme = this.results.peekFirst();
410 | boolean appendOk = false;
411 | if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
412 | //合并中文数词+中文量词
413 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
414 | }
415 | if(appendOk){
416 | //弹出
417 | this.results.pollFirst();
418 | }
419 | }
420 |
421 | }
422 | }
423 |
424 | }
425 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | * 字符集识别工具类
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | /**
29 | *
30 | * 字符集识别工具类
31 | */
/**
 * Character classification and normalization utilities used by the segmenters.
 */
public class CharacterUtil {

    public static final int CHAR_USELESS = 0;

    public static final int CHAR_ARABIC = 0X00000001;

    public static final int CHAR_ENGLISH = 0X00000002;

    public static final int CHAR_CHINESE = 0X00000004;

    public static final int CHAR_OTHER_CJK = 0X00000008;


    /**
     * Classify a single character.
     * @param input character to classify
     * @return one of the CHAR_* constants defined by this class
     */
    static int identifyCharType(char input) {
        boolean isAsciiDigit = input >= '0' && input <= '9';
        if (isAsciiDigit) {
            return CHAR_ARABIC;
        }

        boolean isAsciiLetter = (input >= 'a' && input <= 'z')
                || (input >= 'A' && input <= 'Z');
        if (isAsciiLetter) {
            return CHAR_ENGLISH;
        }

        Character.UnicodeBlock block = Character.UnicodeBlock.of(input);

        // Chinese ideograph blocks handled by the CJK segmenter
        if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
            return CHAR_CHINESE;
        }

        if (block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // full-width digits, JP/KR forms
                // Korean blocks
                || block == Character.UnicodeBlock.HANGUL_SYLLABLES
                || block == Character.UnicodeBlock.HANGUL_JAMO
                || block == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                // Japanese blocks
                || block == Character.UnicodeBlock.HIRAGANA // hiragana
                || block == Character.UnicodeBlock.KATAKANA // katakana
                || block == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
            return CHAR_OTHER_CJK;
        }

        // anything else is ignored by the segmenters
        return CHAR_USELESS;
    }

    /**
     * Normalize a character: full-width to half-width and, when requested,
     * upper-case ASCII to lower-case. Exactly one transformation is applied
     * per call (a converted full-width letter is NOT also lower-cased).
     * @return the normalized character
     */
    static char regularize(char input, boolean lowercase) {
        if (input == 12288) {
            // ideographic space -> ASCII space
            return (char) 32;
        }
        if (input > 65280 && input < 65375) {
            // full-width form -> corresponding half-width form
            return (char) (input - 65248);
        }
        if (lowercase && input >= 'A' && input <= 'Z') {
            return (char) (input + 32);
        }
        return input;
    }
}
103 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/IKArbitrator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Stack;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | * IK分词歧义裁决器
32 | */
33 | class IKArbitrator {
34 |
35 | IKArbitrator(){
36 |
37 | }
38 |
39 | /**
40 | * 分词歧义处理
41 | // * @param orgLexemes
42 | * @param useSmart
43 | */
44 | void process(AnalyzeContext context , boolean useSmart){
45 | QuickSortSet orgLexemes = context.getOrgLexemes();
46 | Lexeme orgLexeme = orgLexemes.pollFirst();
47 |
48 | LexemePath crossPath = new LexemePath();
49 | while(orgLexeme != null){
50 | if(!crossPath.addCrossLexeme(orgLexeme)){
51 | //找到与crossPath不相交的下一个crossPath
52 | if(crossPath.size() == 1 || !useSmart){
53 | //crossPath没有歧义 或者 不做歧义处理
54 | //直接输出当前crossPath
55 | context.addLexemePath(crossPath);
56 | }else{
57 | //对当前的crossPath进行歧义处理
58 | QuickSortSet.Cell headCell = crossPath.getHead();
59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
60 | //输出歧义处理结果judgeResult
61 | context.addLexemePath(judgeResult);
62 | }
63 |
64 | //把orgLexeme加入新的crossPath中
65 | crossPath = new LexemePath();
66 | crossPath.addCrossLexeme(orgLexeme);
67 | }
68 | orgLexeme = orgLexemes.pollFirst();
69 | }
70 |
71 |
72 | //处理最后的path
73 | if(crossPath.size() == 1 || !useSmart){
74 | //crossPath没有歧义 或者 不做歧义处理
75 | //直接输出当前crossPath
76 | context.addLexemePath(crossPath);
77 | }else{
78 | //对当前的crossPath进行歧义处理
79 | QuickSortSet.Cell headCell = crossPath.getHead();
80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
81 | //输出歧义处理结果judgeResult
82 | context.addLexemePath(judgeResult);
83 | }
84 | }
85 |
86 | /**
87 | * 歧义识别
88 | * @param lexemeCell 歧义路径链表头
89 | * @param fullTextLength 歧义路径文本长度
90 | * @return
91 | */
92 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
93 | //候选路径集合
94 | TreeSet pathOptions = new TreeSet();
95 | //候选结果路径
96 | LexemePath option = new LexemePath();
97 |
98 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈
99 | Stack lexemeStack = this.forwardPath(lexemeCell , option);
100 |
101 | //当前词元链并非最理想的,加入候选路径集合
102 | pathOptions.add(option.copy());
103 |
104 | //存在歧义词,处理
105 | QuickSortSet.Cell c = null;
106 | while(!lexemeStack.isEmpty()){
107 | c = lexemeStack.pop();
108 | //回滚词元链
109 | this.backPath(c.getLexeme() , option);
110 | //从歧义词位置开始,递归,生成可选方案
111 | this.forwardPath(c , option);
112 | pathOptions.add(option.copy());
113 | }
114 |
115 | //返回集合中的最优方案
116 | return pathOptions.first();
117 |
118 | }
119 |
120 | /**
121 | * 向前遍历,添加词元,构造一个无歧义词元组合
122 | // * @param LexemePath path
123 | * @return
124 | */
125 | private Stack forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
126 | //发生冲突的Lexeme栈
127 | Stack conflictStack = new Stack();
128 | QuickSortSet.Cell c = lexemeCell;
129 | //迭代遍历Lexeme链表
130 | while(c != null && c.getLexeme() != null){
131 | if(!option.addNotCrossLexeme(c.getLexeme())){
132 | //词元交叉,添加失败则加入lexemeStack栈
133 | conflictStack.push(c);
134 | }
135 | c = c.getNext();
136 | }
137 | return conflictStack;
138 | }
139 |
140 | /**
141 | * 回滚词元链,直到它能够接受指定的词元
142 | // * @param lexeme
143 | * @param l
144 | */
145 | private void backPath(Lexeme l , LexemePath option){
146 | while(option.checkCross(l)){
147 | option.removeTail();
148 | }
149 |
150 | }
151 |
152 | }
153 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/IKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | */
24 | package org.wltea.analyzer.core;
25 |
26 | import org.wltea.analyzer.cfg.Configuration;
27 | import org.wltea.analyzer.core.segmenter.CJKSegmenter;
28 | import org.wltea.analyzer.core.segmenter.CN_QuantifierSegmenter;
29 | import org.wltea.analyzer.core.segmenter.ISegmenter;
30 | import org.wltea.analyzer.core.segmenter.LetterSegmenter;
31 |
32 | import java.io.IOException;
33 | import java.io.Reader;
34 | import java.util.ArrayList;
35 | import java.util.List;
36 |
37 | /**
38 | * IK分词器主类
39 | *
40 | */
41 | public final class IKSegmenter {
42 |
43 | //字符窜reader
44 | private Reader input;
45 | //分词器上下文
46 | private AnalyzeContext context;
47 | //分词处理器列表
48 | private List segmenters;
49 | //分词歧义裁决器
50 | private IKArbitrator arbitrator;
51 | private Configuration configuration;
52 |
53 |
54 | /**
55 | * IK分词器构造函数
56 | * @param input
57 | */
58 | public IKSegmenter(Reader input ,Configuration configuration){
59 | this.input = input;
60 | this.configuration = configuration;
61 | this.init();
62 | }
63 |
64 |
65 | /**
66 | * 初始化
67 | */
68 | private void init(){
69 | //初始化分词上下文
70 | this.context = new AnalyzeContext(configuration);
71 | //加载子分词器
72 | this.segmenters = this.loadSegmenters();
73 | //加载歧义裁决器
74 | this.arbitrator = new IKArbitrator();
75 | }
76 |
77 | /**
78 | * 初始化词典,加载子分词器实现
79 | * @return List
80 | */
81 | private List loadSegmenters(){
82 | List segmenters = new ArrayList(4);
83 | //处理字母的子分词器
84 | segmenters.add(new LetterSegmenter());
85 | //处理中文数量词的子分词器
86 | segmenters.add(new CN_QuantifierSegmenter());
87 | //处理中文词的子分词器
88 | segmenters.add(new CJKSegmenter());
89 | return segmenters;
90 | }
91 |
92 | /**
93 | * 分词,获取下一个词元
94 | * @return Lexeme 词元对象
95 | * @throws java.io.IOException
96 | */
97 | // TODO 待测试 该锁没有必要,tokenizer重用策略是被缓存在ThreadLocal里,所以,不同线程使用的是不同的tokenizer,不会有多线程问题
98 | public Lexeme next()throws IOException{
99 | Lexeme l = null;
100 | while((l = context.getNextLexeme()) == null ){
101 | /*
102 | * 从reader中读取数据,填充buffer
103 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理
104 | * 移位处理上次读入的但未处理的数据
105 | */
106 | int available = context.fillBuffer(this.input);
107 | if(available <= 0){
108 | //reader已经读完
109 | context.reset();
110 | return null;
111 |
112 | }else{
113 | //初始化指针
114 | context.initCursor();
115 | do{
116 | //遍历子分词器
117 | for(ISegmenter segmenter : segmenters){
118 | segmenter.analyze(context);
119 | }
120 | //字符缓冲区接近读完,需要读入新的字符
121 | if(context.needRefillBuffer()){
122 | break;
123 | }
124 | //向前移动指针
125 | }while(context.moveCursor());
126 | //重置子分词器,为下轮循环进行初始化
127 | for(ISegmenter segmenter : segmenters){
128 | segmenter.reset();
129 | }
130 | }
131 | //对分词进行歧义处理
132 | this.arbitrator.process(context, configuration.isUseSmart());
133 | //将分词结果输出到结果集,并处理未切分的单个CJK字符
134 | context.outputToResult();
135 | //记录本次分词的缓冲区位移
136 | context.markBufferOffset();
137 | }
138 | return l;
139 | }
140 |
141 | /**
142 | * 重置分词器到初始状态
143 | * @param input
144 | */
145 | public synchronized void reset(Reader input) {
146 | this.input = input;
147 | context.reset();
148 | for(ISegmenter segmenter : segmenters){
149 | segmenter.reset();
150 | }
151 | }
152 | }
153 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK词元对象
29 | */
30 | public class Lexeme implements Comparable{
31 | //lexemeType常量
32 | //未知
33 | public static final int TYPE_UNKNOWN = 0;
34 | //英文
35 | public static final int TYPE_ENGLISH = 1;
36 | //数字
37 | public static final int TYPE_ARABIC = 2;
38 | //英文数字混合
39 | public static final int TYPE_LETTER = 3;
40 | //中文词元
41 | public static final int TYPE_CNWORD = 4;
42 | //中文单字
43 | public static final int TYPE_CNCHAR = 64;
44 | //日韩文字
45 | public static final int TYPE_OTHER_CJK = 8;
46 | //中文数词
47 | public static final int TYPE_CNUM = 16;
48 | //中文量词
49 | public static final int TYPE_COUNT = 32;
50 | //中文数量词
51 | public static final int TYPE_CQUAN = 48;
52 |
53 | //词元的起始位移
54 | private int offset;
55 | //词元的相对起始位置
56 | private int begin;
57 | //词元的长度
58 | private int length;
59 | //词元文本
60 | private String lexemeText;
61 | //词元类型
62 | private int lexemeType;
63 |
64 |
65 | public Lexeme(int offset , int begin , int length , int lexemeType){
66 | this.offset = offset;
67 | this.begin = begin;
68 | if(length < 0){
69 | throw new IllegalArgumentException("length < 0");
70 | }
71 | this.length = length;
72 | this.lexemeType = lexemeType;
73 | }
74 |
75 | /*
76 | * 判断词元相等算法
77 | * 起始位置偏移、起始位置、终止位置相同
78 | * @see java.lang.Object#equals(Object o)
79 | */
80 | public boolean equals(Object o){
81 | if(o == null){
82 | return false;
83 | }
84 |
85 | if(this == o){
86 | return true;
87 | }
88 |
89 | if(o instanceof Lexeme){
90 | Lexeme other = (Lexeme)o;
91 | if(this.offset == other.getOffset()
92 | && this.begin == other.getBegin()
93 | && this.length == other.getLength()){
94 | return true;
95 | }else{
96 | return false;
97 | }
98 | }else{
99 | return false;
100 | }
101 | }
102 |
103 | /*
104 | * 词元哈希编码算法
105 | * @see java.lang.Object#hashCode()
106 | */
107 | public int hashCode(){
108 | int absBegin = getBeginPosition();
109 | int absEnd = getEndPosition();
110 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
111 | }
112 |
113 | /*
114 | * 词元在排序集合中的比较算法
115 | * @see java.lang.Comparable#compareTo(java.lang.Object)
116 | */
117 | public int compareTo(Lexeme other) {
118 | //起始位置优先
119 | if(this.begin < other.getBegin()){
120 | return -1;
121 | }else if(this.begin == other.getBegin()){
122 | //词元长度优先
123 | if(this.length > other.getLength()){
124 | return -1;
125 | }else if(this.length == other.getLength()){
126 | return 0;
127 | }else {//this.length < other.getLength()
128 | return 1;
129 | }
130 |
131 | }else{//this.begin > other.getBegin()
132 | return 1;
133 | }
134 | }
135 |
136 | public int getOffset() {
137 | return offset;
138 | }
139 |
140 | public void setOffset(int offset) {
141 | this.offset = offset;
142 | }
143 |
144 | public int getBegin() {
145 | return begin;
146 | }
147 | /**
148 | * 获取词元在文本中的起始位置
149 | * @return int
150 | */
151 | public int getBeginPosition(){
152 | return offset + begin;
153 | }
154 |
155 | public void setBegin(int begin) {
156 | this.begin = begin;
157 | }
158 |
159 | /**
160 | * 获取词元在文本中的结束位置
161 | * @return int
162 | */
163 | public int getEndPosition(){
164 | return offset + begin + length;
165 | }
166 |
167 | /**
168 | * 获取词元的字符长度
169 | * @return int
170 | */
171 | public int getLength(){
172 | return this.length;
173 | }
174 |
175 | public void setLength(int length) {
176 | if(this.length < 0){
177 | throw new IllegalArgumentException("length < 0");
178 | }
179 | this.length = length;
180 | }
181 |
182 | /**
183 | * 获取词元的文本内容
184 | * @return String
185 | */
186 | public String getLexemeText() {
187 | if(lexemeText == null){
188 | return "";
189 | }
190 | return lexemeText;
191 | }
192 |
193 | public void setLexemeText(String lexemeText) {
194 | if(lexemeText == null){
195 | this.lexemeText = "";
196 | this.length = 0;
197 | }else{
198 | this.lexemeText = lexemeText;
199 | this.length = lexemeText.length();
200 | }
201 | }
202 |
203 | /**
204 | * 获取词元类型
205 | * @return int
206 | */
207 | public int getLexemeType() {
208 | return lexemeType;
209 | }
210 |
211 | /**
212 | * 获取词元类型标示字符串
213 | * @return String
214 | */
215 | public String getLexemeTypeString(){
216 | switch(lexemeType) {
217 |
218 | case TYPE_ENGLISH :
219 | return "ENGLISH";
220 |
221 | case TYPE_ARABIC :
222 | return "ARABIC";
223 |
224 | case TYPE_LETTER :
225 | return "LETTER";
226 |
227 | case TYPE_CNWORD :
228 | return "CN_WORD";
229 |
230 | case TYPE_CNCHAR :
231 | return "CN_CHAR";
232 |
233 | case TYPE_OTHER_CJK :
234 | return "OTHER_CJK";
235 |
236 | case TYPE_COUNT :
237 | return "COUNT";
238 |
239 | case TYPE_CNUM :
240 | return "TYPE_CNUM";
241 |
242 | case TYPE_CQUAN:
243 | return "TYPE_CQUAN";
244 |
245 | default :
246 | return "UNKONW";
247 | }
248 | }
249 |
250 |
251 | public void setLexemeType(int lexemeType) {
252 | this.lexemeType = lexemeType;
253 | }
254 |
255 | /**
256 | * 合并两个相邻的词元
257 | * @param l
258 | * @param lexemeType
259 | * @return boolean 词元是否成功合并
260 | */
261 | public boolean append(Lexeme l , int lexemeType){
262 | if(l != null && this.getEndPosition() == l.getBeginPosition()){
263 | this.length += l.getLength();
264 | this.lexemeType = lexemeType;
265 | return true;
266 | }else {
267 | return false;
268 | }
269 | }
270 |
271 |
272 | /**
273 | *
274 | */
275 | public String toString(){
276 | StringBuffer strbuf = new StringBuffer();
277 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
278 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
279 | strbuf.append(this.getLexemeTypeString());
280 | return strbuf.toString();
281 | }
282 |
283 |
284 | }
285 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 |
28 | /**
29 | * Lexeme链(路径)
30 | */
31 | class LexemePath extends QuickSortSet implements Comparable{
32 |
33 | //起始位置
34 | private int pathBegin;
35 | //结束
36 | private int pathEnd;
37 | //词元链的有效字符长度
38 | private int payloadLength;
39 |
40 | LexemePath(){
41 | this.pathBegin = -1;
42 | this.pathEnd = -1;
43 | this.payloadLength = 0;
44 | }
45 |
46 | /**
47 | * 向LexemePath追加相交的Lexeme
48 | * @param lexeme
49 | * @return
50 | */
51 | boolean addCrossLexeme(Lexeme lexeme){
52 | if(this.isEmpty()){
53 | this.addLexeme(lexeme);
54 | this.pathBegin = lexeme.getBegin();
55 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
56 | this.payloadLength += lexeme.getLength();
57 | return true;
58 |
59 | }else if(this.checkCross(lexeme)){
60 | this.addLexeme(lexeme);
61 | if(lexeme.getBegin() + lexeme.getLength() > this.pathEnd){
62 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
63 | }
64 | this.payloadLength = this.pathEnd - this.pathBegin;
65 | return true;
66 |
67 | }else{
68 | return false;
69 |
70 | }
71 | }
72 |
73 | /**
74 | * 向LexemePath追加不相交的Lexeme
75 | * @param lexeme
76 | * @return
77 | */
78 | boolean addNotCrossLexeme(Lexeme lexeme){
79 | if(this.isEmpty()){
80 | this.addLexeme(lexeme);
81 | this.pathBegin = lexeme.getBegin();
82 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
83 | this.payloadLength += lexeme.getLength();
84 | return true;
85 |
86 | }else if(this.checkCross(lexeme)){
87 | return false;
88 |
89 | }else{
90 | this.addLexeme(lexeme);
91 | this.payloadLength += lexeme.getLength();
92 | Lexeme head = this.peekFirst();
93 | this.pathBegin = head.getBegin();
94 | Lexeme tail = this.peekLast();
95 | this.pathEnd = tail.getBegin() + tail.getLength();
96 | return true;
97 |
98 | }
99 | }
100 |
101 | /**
102 | * 移除尾部的Lexeme
103 | * @return
104 | */
105 | Lexeme removeTail(){
106 | Lexeme tail = this.pollLast();
107 | if(this.isEmpty()){
108 | this.pathBegin = -1;
109 | this.pathEnd = -1;
110 | this.payloadLength = 0;
111 | }else{
112 | this.payloadLength -= tail.getLength();
113 | Lexeme newTail = this.peekLast();
114 | this.pathEnd = newTail.getBegin() + newTail.getLength();
115 | }
116 | return tail;
117 | }
118 |
119 | /**
120 | * 检测词元位置交叉(有歧义的切分)
121 | * @param lexeme
122 | * @return
123 | */
124 | boolean checkCross(Lexeme lexeme){
125 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
126 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()+ lexeme.getLength());
127 | }
128 |
129 | int getPathBegin() {
130 | return pathBegin;
131 | }
132 |
133 | int getPathEnd() {
134 | return pathEnd;
135 | }
136 |
137 | /**
138 | * 获取Path的有效词长
139 | * @return
140 | */
141 | int getPayloadLength(){
142 | return this.payloadLength;
143 | }
144 |
145 | /**
146 | * 获取LexemePath的路径长度
147 | * @return
148 | */
149 | int getPathLength(){
150 | return this.pathEnd - this.pathBegin;
151 | }
152 |
153 |
154 | /**
155 | * X权重(词元长度积)
156 | * @return
157 | */
158 | int getXWeight(){
159 | int product = 1;
160 | Cell c = this.getHead();
161 | while( c != null && c.getLexeme() != null){
162 | product *= c.getLexeme().getLength();
163 | c = c.getNext();
164 | }
165 | return product;
166 | }
167 |
168 | /**
169 | * 词元位置权重
170 | * @return
171 | */
172 | int getPWeight(){
173 | int pWeight = 0;
174 | int p = 0;
175 | Cell c = this.getHead();
176 | while( c != null && c.getLexeme() != null){
177 | p++;
178 | pWeight += p * c.getLexeme().getLength() ;
179 | c = c.getNext();
180 | }
181 | return pWeight;
182 | }
183 |
184 | LexemePath copy(){
185 | LexemePath theCopy = new LexemePath();
186 | theCopy.pathBegin = this.pathBegin;
187 | theCopy.pathEnd = this.pathEnd;
188 | theCopy.payloadLength = this.payloadLength;
189 | Cell c = this.getHead();
190 | while( c != null && c.getLexeme() != null){
191 | theCopy.addLexeme(c.getLexeme());
192 | c = c.getNext();
193 | }
194 | return theCopy;
195 | }
196 |
197 | public int compareTo(LexemePath o) {
198 | //比较有效文本长度
199 | if(this.payloadLength > o.payloadLength){
200 | return -1;
201 | }else if(this.payloadLength < o.payloadLength){
202 | return 1;
203 | }else{
204 | //比较词元个数,越少越好
205 | if(this.size() < o.size()){
206 | return -1;
207 | }else if (this.size() > o.size()){
208 | return 1;
209 | }else{
210 | //路径跨度越大越好
211 | if(this.getPathLength() > o.getPathLength()){
212 | return -1;
213 | }else if(this.getPathLength() < o.getPathLength()){
214 | return 1;
215 | }else {
216 | //根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先
217 | if(this.pathEnd > o.pathEnd){
218 | return -1;
219 | }else if(pathEnd < o.pathEnd){
220 | return 1;
221 | }else{
222 | //词长越平均越好
223 | if(this.getXWeight() > o.getXWeight()){
224 | return -1;
225 | }else if(this.getXWeight() < o.getXWeight()){
226 | return 1;
227 | }else {
228 | //词元位置权重比较
229 | if(this.getPWeight() > o.getPWeight()){
230 | return -1;
231 | }else if(this.getPWeight() < o.getPWeight()){
232 | return 1;
233 | }
234 |
235 | }
236 | }
237 | }
238 | }
239 | }
240 | return 0;
241 | }
242 |
243 | public String toString(){
244 | StringBuffer sb = new StringBuffer();
245 | sb.append("pathBegin : ").append(pathBegin).append("\r\n");
246 | sb.append("pathEnd : ").append(pathEnd).append("\r\n");
247 | sb.append("payloadLength : ").append(payloadLength).append("\r\n");
248 | Cell head = this.getHead();
249 | while(head != null){
250 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
251 | head = head.getNext();
252 | }
253 | return sb.toString();
254 | }
255 |
256 | }
257 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK分词器专用的Lexem快速排序集合
29 | */
30 | public class QuickSortSet {
31 | //链表头
32 | private Cell head;
33 | //链表尾
34 | private Cell tail;
35 | //链表的实际大小
36 | private int size;
37 |
38 | QuickSortSet(){
39 | this.size = 0;
40 | }
41 |
42 | /**
43 | * 向链表集合添加词元
44 | * @param lexeme
45 | */
46 | public boolean addLexeme(Lexeme lexeme){
47 | Cell newCell = new Cell(lexeme);
48 | if(this.size == 0){
49 | this.head = newCell;
50 | this.tail = newCell;
51 | this.size++;
52 | return true;
53 |
54 | }else{
55 | if(this.tail.compareTo(newCell) == 0){//词元与尾部词元相同,不放入集合
56 | return false;
57 |
58 | }else if(this.tail.compareTo(newCell) < 0){//词元接入链表尾部
59 | this.tail.next = newCell;
60 | newCell.prev = this.tail;
61 | this.tail = newCell;
62 | this.size++;
63 | return true;
64 |
65 | }else if(this.head.compareTo(newCell) > 0){//词元接入链表头部
66 | this.head.prev = newCell;
67 | newCell.next = this.head;
68 | this.head = newCell;
69 | this.size++;
70 | return true;
71 |
72 | }else{
73 | //从尾部上逆
74 | Cell index = this.tail;
75 | while(index != null && index.compareTo(newCell) > 0){
76 | index = index.prev;
77 | }
78 | if(index.compareTo(newCell) == 0){//词元与集合中的词元重复,不放入集合
79 | return false;
80 |
81 | }else if(index.compareTo(newCell) < 0){//词元插入链表中的某个位置
82 | newCell.prev = index;
83 | newCell.next = index.next;
84 | index.next.prev = newCell;
85 | index.next = newCell;
86 | this.size++;
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 |
94 | /**
95 | * 返回链表头部元素
96 | * @return
97 | */
98 | public Lexeme peekFirst(){
99 | if(this.head != null){
100 | return this.head.lexeme;
101 | }
102 | return null;
103 | }
104 |
105 | /**
106 | * 取出链表集合的第一个元素
107 | * @return Lexeme
108 | */
109 | public Lexeme pollFirst(){
110 | if(this.size == 1){
111 | Lexeme first = this.head.lexeme;
112 | this.head = null;
113 | this.tail = null;
114 | this.size--;
115 | return first;
116 | }else if(this.size > 1){
117 | Lexeme first = this.head.lexeme;
118 | this.head = this.head.next;
119 | this.size --;
120 | return first;
121 | }else{
122 | return null;
123 | }
124 | }
125 |
126 | /**
127 | * 返回链表尾部元素
128 | * @return
129 | */
130 | public Lexeme peekLast(){
131 | if(this.tail != null){
132 | return this.tail.lexeme;
133 | }
134 | return null;
135 | }
136 |
137 | /**
138 | * 取出链表集合的最后一个元素
139 | * @return Lexeme
140 | */
141 | public Lexeme pollLast(){
142 | if(this.size == 1){
143 | Lexeme last = this.head.lexeme;
144 | this.head = null;
145 | this.tail = null;
146 | this.size--;
147 | return last;
148 |
149 | }else if(this.size > 1){
150 | Lexeme last = this.tail.lexeme;
151 | this.tail = this.tail.prev;
152 | this.size--;
153 | return last;
154 |
155 | }else{
156 | return null;
157 | }
158 | }
159 |
160 | /**
161 | * 返回集合大小
162 | * @return
163 | */
164 | public int size(){
165 | return this.size;
166 | }
167 |
168 | /**
169 | * 判断集合是否为空
170 | * @return
171 | */
172 | public boolean isEmpty(){
173 | return this.size == 0;
174 | }
175 |
176 | /**
177 | * 返回lexeme链的头部
178 | * @return
179 | */
180 | public Cell getHead(){
181 | return this.head;
182 | }
183 |
184 | /**
185 | *
186 | * IK 中文分词 版本 5.0
187 | * IK Analyzer release 5.0
188 | *
189 | * Licensed to the Apache Software Foundation (ASF) under one or more
190 | * contributor license agreements. See the NOTICE file distributed with
191 | * this work for additional information regarding copyright ownership.
192 | * The ASF licenses this file to You under the Apache License, Version 2.0
193 | * (the "License"); you may not use this file except in compliance with
194 | * the License. You may obtain a copy of the License at
195 | *
196 | * http://www.apache.org/licenses/LICENSE-2.0
197 | *
198 | * Unless required by applicable law or agreed to in writing, software
199 | * distributed under the License is distributed on an "AS IS" BASIS,
200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | * See the License for the specific language governing permissions and
202 | * limitations under the License.
203 | *
204 | * 源代码由林良益(linliangyi2005@gmail.com)提供
205 | * 版权声明 2012,乌龙茶工作室
206 | * provided by Linliangyi and copyright 2012 by Oolong studio
207 | *
208 | * QuickSortSet集合单元
209 | *
210 | */
211 | public class Cell implements Comparable{
212 | private Cell prev;
213 | private Cell next;
214 | private Lexeme lexeme;
215 |
216 | Cell(Lexeme lexeme){
217 | if(lexeme == null){
218 | throw new IllegalArgumentException("lexeme must not be null");
219 | }
220 | this.lexeme = lexeme;
221 | }
222 |
223 | public int compareTo(Cell o) {
224 | return this.lexeme.compareTo(o.lexeme);
225 | }
226 |
227 | public Cell getPrev(){
228 | return this.prev;
229 | }
230 |
231 | public Cell getNext(){
232 | return this.next;
233 | }
234 |
235 | public Lexeme getLexeme(){
236 | return this.lexeme;
237 | }
238 | }
239 | }
240 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/segmenter/CJKSegmenter.java:
--------------------------------------------------------------------------------
1 |
2 | /**
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.core.segmenter;
27 |
28 | import org.wltea.analyzer.core.AnalyzeContext;
29 | import org.wltea.analyzer.core.CharacterUtil;
30 | import org.wltea.analyzer.core.Lexeme;
31 | import org.wltea.analyzer.dic.Dictionary;
32 | import org.wltea.analyzer.dic.Hit;
33 |
34 | import java.util.LinkedList;
35 | import java.util.List;
36 |
37 |
38 | /**
39 | * 中文-日韩文子分词器
40 | */
41 | public class CJKSegmenter implements ISegmenter {
42 |
43 | //子分词器标签
44 | static final String SEGMENTER_NAME = "CJK_SEGMENTER";
45 | //待处理的分词hit队列
46 | private List tmpHits;
47 |
48 |
49 | public CJKSegmenter(){
50 | this.tmpHits = new LinkedList();
51 | }
52 |
53 | /* (non-Javadoc)
54 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
55 | */
56 | public void analyze(AnalyzeContext context) {
57 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){
58 |
59 | //优先处理tmpHits中的hit
60 | if(!this.tmpHits.isEmpty()){
61 | //处理词段队列
62 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
63 | for(Hit hit : tmpArray){
64 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
65 | if(hit.isMatch()){
66 | //输出当前的词
67 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
68 | context.addLexeme(newLexeme);
69 |
70 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
71 | this.tmpHits.remove(hit);
72 | }
73 |
74 | }else if(hit.isUnmatch()){
75 | //hit不是词,移除
76 | this.tmpHits.remove(hit);
77 | }
78 | }
79 | }
80 |
81 | //*********************************
82 | //再对当前指针位置的字符进行单字匹配
83 | // 分词器选择的词典文件是该分词器实例化时,configuration里的字典文件列表
84 | List singleCharHits = Dictionary.getSingleton().matchInMainDict(context.getMainDicNames(), context.getSegmentBuff(), context.getCursor(), 1);
85 | for(Hit singleCharHit : singleCharHits){
86 | if(singleCharHit.isMatch()){//首字成词
87 | //输出当前的词
88 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
89 | context.addLexeme(newLexeme);
90 |
91 | //同时也是词前缀
92 | if(singleCharHit.isPrefix()){
93 | //前缀匹配则放入hit列表
94 | this.tmpHits.add(singleCharHit);
95 | }
96 | }else if(singleCharHit.isPrefix()){//首字为词前缀
97 | //前缀匹配则放入hit列表
98 | this.tmpHits.add(singleCharHit);
99 | }
100 | }
101 |
102 | }else{
103 | //遇到CHAR_USELESS字符
104 | //清空队列
105 | this.tmpHits.clear();
106 | }
107 |
108 | //判断缓冲区是否已经读完
109 | if(context.isBufferConsumed()){
110 | //清空队列
111 | this.tmpHits.clear();
112 | }
113 |
114 | //判断是否锁定缓冲区
115 | if(this.tmpHits.size() == 0){
116 | context.unlockBuffer(SEGMENTER_NAME);
117 |
118 | }else{
119 | context.lockBuffer(SEGMENTER_NAME);
120 | }
121 | }
122 |
123 | /* (non-Javadoc)
124 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#reset()
125 | */
126 | public void reset() {
127 | //清空队列
128 | this.tmpHits.clear();
129 | }
130 |
131 | }
132 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/segmenter/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core.segmenter;
26 |
27 | import java.util.HashSet;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Set;
31 |
32 | import org.wltea.analyzer.core.AnalyzeContext;
33 | import org.wltea.analyzer.core.CharacterUtil;
34 | import org.wltea.analyzer.core.Lexeme;
35 | import org.wltea.analyzer.dic.Dictionary;
36 | import org.wltea.analyzer.dic.Hit;
37 |
38 | /**
39 | *
40 | * 中文数量词子分词器
41 | */
42 | public class CN_QuantifierSegmenter implements ISegmenter{
43 |
44 | //子分词器标签
45 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
46 |
47 | //中文数词
48 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";//Cnum
49 | private static Set ChnNumberChars = new HashSet();
50 | static{
51 | char[] ca = Chn_Num.toCharArray();
52 | for(char nChar : ca){
53 | ChnNumberChars.add(nChar);
54 | }
55 | }
56 |
57 | /*
58 | * 词元的开始位置,
59 | * 同时作为子分词器状态标识
60 | * 当start > -1 时,标识当前的分词器正在处理字符
61 | */
62 | private int nStart;
63 | /*
64 | * 记录词元结束位置
65 | * end记录的是在词元中最后一个出现的合理的数词结束
66 | */
67 | private int nEnd;
68 |
69 | //待处理的量词hit队列
70 | private List countHits;
71 |
72 |
73 | public CN_QuantifierSegmenter(){
74 | nStart = -1;
75 | nEnd = -1;
76 | this.countHits = new LinkedList();
77 | }
78 |
79 | /**
80 | * 分词
81 | */
82 | public void analyze(AnalyzeContext context) {
83 | //处理中文数词
84 | this.processCNumber(context);
85 | //处理中文量词
86 | this.processCount(context);
87 |
88 | //判断是否锁定缓冲区
89 | if(this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()){
90 | //对缓冲区解锁
91 | context.unlockBuffer(SEGMENTER_NAME);
92 | }else{
93 | context.lockBuffer(SEGMENTER_NAME);
94 | }
95 | }
96 |
97 |
98 | /**
99 | * 重置子分词器状态
100 | */
101 | public void reset() {
102 | nStart = -1;
103 | nEnd = -1;
104 | countHits.clear();
105 | }
106 |
107 | /**
108 | * 处理数词
109 | */
110 | private void processCNumber(AnalyzeContext context){
111 | if(nStart == -1 && nEnd == -1){//初始状态
112 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
113 | && ChnNumberChars.contains(context.getCurrentChar())){
114 | //记录数词的起始、结束位置
115 | nStart = context.getCursor();
116 | nEnd = context.getCursor();
117 | }
118 | }else{//正在处理状态
119 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
120 | && ChnNumberChars.contains(context.getCurrentChar())){
121 | //记录数词的结束位置
122 | nEnd = context.getCursor();
123 | }else{
124 | //输出数词
125 | this.outputNumLexeme(context);
126 | //重置头尾指针
127 | nStart = -1;
128 | nEnd = -1;
129 | }
130 | }
131 |
132 | //缓冲区已经用完,还有尚未输出的数词
133 | if(context.isBufferConsumed() && (nStart != -1 && nEnd != -1)){
134 | //输出数词
135 | outputNumLexeme(context);
136 | //重置头尾指针
137 | nStart = -1;
138 | nEnd = -1;
139 | }
140 | }
141 |
142 | /**
143 | * 处理中文量词
144 | * @param context
145 | */
146 | private void processCount(AnalyzeContext context){
147 | // 判断是否需要启动量词扫描
148 | if(!this.needCountScan(context)){
149 | return;
150 | }
151 |
152 | if(CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()){
153 |
154 | //优先处理countHits中的hit
155 | if(!this.countHits.isEmpty()){
156 | //处理词段队列
157 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
158 | for(Hit hit : tmpArray){
159 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
160 | if(hit.isMatch()){
161 | //输出当前的词
162 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
163 | context.addLexeme(newLexeme);
164 |
165 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除
166 | this.countHits.remove(hit);
167 | }
168 |
169 | }else if(hit.isUnmatch()){
170 | //hit不是词,移除
171 | this.countHits.remove(hit);
172 | }
173 | }
174 | }
175 |
176 | //*********************************
177 | //对当前指针位置的字符进行单字匹配
178 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getQuantifierNames(), context.getSegmentBuff(), context.getCursor(), 1);
179 | if(singleCharHit.isMatch()){//首字成量词词
180 | //输出当前的词
181 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
182 | context.addLexeme(newLexeme);
183 |
184 | //同时也是词前缀
185 | if(singleCharHit.isPrefix()){
186 | //前缀匹配则放入hit列表
187 | this.countHits.add(singleCharHit);
188 | }
189 | }else if(singleCharHit.isPrefix()){//首字为量词前缀
190 | //前缀匹配则放入hit列表
191 | this.countHits.add(singleCharHit);
192 | }
193 |
194 |
195 | }else{
196 | //输入的不是中文字符
197 | //清空未成形的量词
198 | this.countHits.clear();
199 | }
200 |
201 | //缓冲区数据已经读完,还有尚未输出的量词
202 | if(context.isBufferConsumed()){
203 | //清空未成形的量词
204 | this.countHits.clear();
205 | }
206 | }
207 |
208 | /**
209 | * 判断是否需要扫描量词
210 | * @return
211 | */
212 | private boolean needCountScan(AnalyzeContext context){
213 | if((nStart != -1 && nEnd != -1 ) || !countHits.isEmpty()){
214 | //正在处理中文数词,或者正在处理量词
215 | return true;
216 | }else{
217 | //找到一个相邻的数词
218 | if(!context.getOrgLexemes().isEmpty()){
219 | Lexeme l = context.getOrgLexemes().peekLast();
220 | if((Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType())
221 | && (l.getBegin() + l.getLength() == context.getCursor())){
222 | return true;
223 | }
224 | }
225 | }
226 | return false;
227 | }
228 |
229 | /**
230 | * 添加数词词元到结果集
231 | * @param context
232 | */
233 | private void outputNumLexeme(AnalyzeContext context){
234 | if(nStart > -1 && nEnd > -1){
235 | //输出数词
236 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , nStart , nEnd - nStart + 1 , Lexeme.TYPE_CNUM);
237 | context.addLexeme(newLexeme);
238 |
239 | }
240 | }
241 |
242 | }
243 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/segmenter/ISegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core.segmenter;
26 |
27 |
28 | import org.wltea.analyzer.core.AnalyzeContext;
29 |
30 | /**
31 | *
32 | * 子分词器接口
33 | */
34 | public interface ISegmenter {
35 |
36 | /**
37 | * 从分析器读取下一个可能分解的词元对象
38 | * @param context 分词算法上下文
39 | */
40 | void analyze(AnalyzeContext context);
41 |
42 |
43 | /**
44 | * 重置子分析器状态
45 | */
46 | void reset();
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/segmenter/LetterSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core.segmenter;
26 |
27 | import org.wltea.analyzer.core.AnalyzeContext;
28 | import org.wltea.analyzer.core.CharacterUtil;
29 | import org.wltea.analyzer.core.Lexeme;
30 |
31 | import java.util.Arrays;
32 |
33 | /**
34 | *
35 | * 英文字符及阿拉伯数字子分词器
36 | */
37 | public class LetterSegmenter implements ISegmenter {
38 |
39 | //子分词器标签
40 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
41 | //链接符号
42 | private static final char[] Letter_Connector = new char[]{'#' , '&' , '+' , '-' , '.' , '@' , '_'};
43 |
44 | //数字符号
45 | private static final char[] Num_Connector = new char[]{',' , '.'};
46 |
47 | /*
48 | * 词元的开始位置,
49 | * 同时作为子分词器状态标识
50 | * 当start > -1 时,标识当前的分词器正在处理字符
51 | */
52 | private int start;
53 | /*
54 | * 记录词元结束位置
55 | * end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置
56 | */
57 | private int end;
58 |
59 | /*
60 | * 字母起始位置
61 | */
62 | private int englishStart;
63 |
64 | /*
65 | * 字母结束位置
66 | */
67 | private int englishEnd;
68 |
69 | /*
70 | * 阿拉伯数字起始位置
71 | */
72 | private int arabicStart;
73 |
74 | /*
75 | * 阿拉伯数字结束位置
76 | */
77 | private int arabicEnd;
78 |
79 | public LetterSegmenter(){
80 | Arrays.sort(Letter_Connector);
81 | Arrays.sort(Num_Connector);
82 | this.start = -1;
83 | this.end = -1;
84 | this.englishStart = -1;
85 | this.englishEnd = -1;
86 | this.arabicStart = -1;
87 | this.arabicEnd = -1;
88 | }
89 |
90 |
91 | /* (non-Javadoc)
92 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
93 | */
94 | public void analyze(AnalyzeContext context) {
95 | boolean bufferLockFlag = false;
96 | //处理英文字母
97 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
98 | //处理阿拉伯字母
99 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
100 | //处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复)
101 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
102 |
103 | //判断是否锁定缓冲区
104 | if(bufferLockFlag){
105 | context.lockBuffer(SEGMENTER_NAME);
106 | }else{
107 | //对缓冲区解锁
108 | context.unlockBuffer(SEGMENTER_NAME);
109 | }
110 | }
111 |
112 | /* (non-Javadoc)
113 | * @see org.wltea.analyzer.core.segmenter.ISegmenter#reset()
114 | */
115 | public void reset() {
116 | this.start = -1;
117 | this.end = -1;
118 | this.englishStart = -1;
119 | this.englishEnd = -1;
120 | this.arabicStart = -1;
121 | this.arabicEnd = -1;
122 | }
123 |
124 | /**
125 | * 处理数字字母混合输出
126 | * 如:windos2000 | linliangyi2005@gmail.com
127 | // * @param input
128 | * @param context
129 | * @return
130 | */
131 | private boolean processMixLetter(AnalyzeContext context){
132 | boolean needLock = false;
133 |
134 | if(this.start == -1){//当前的分词器尚未开始处理字符
135 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
136 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
137 | //记录起始指针的位置,标明分词器进入处理状态
138 | this.start = context.getCursor();
139 | this.end = start;
140 | }
141 |
142 | }else{//当前的分词器正在处理字符
143 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
144 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
145 | //记录下可能的结束位置
146 | this.end = context.getCursor();
147 |
148 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
149 | && this.isLetterConnector(context.getCurrentChar())){
150 | //记录下可能的结束位置
151 | this.end = context.getCursor();
152 | }else{
153 | //遇到非Letter字符,输出词元
154 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
155 | context.addLexeme(newLexeme);
156 | this.start = -1;
157 | this.end = -1;
158 | }
159 | }
160 |
161 | //判断缓冲区是否已经读完
162 | if(context.isBufferConsumed() && (this.start != -1 && this.end != -1)){
163 | //缓冲以读完,输出词元
164 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.start , this.end - this.start + 1 , Lexeme.TYPE_LETTER);
165 | context.addLexeme(newLexeme);
166 | this.start = -1;
167 | this.end = -1;
168 | }
169 |
170 | //判断是否锁定缓冲区
171 | if(this.start == -1 && this.end == -1){
172 | //对缓冲区解锁
173 | needLock = false;
174 | }else{
175 | needLock = true;
176 | }
177 | return needLock;
178 | }
179 |
180 | /**
181 | * 处理纯英文字母输出
182 | * @param context
183 | * @return
184 | */
185 | private boolean processEnglishLetter(AnalyzeContext context){
186 | boolean needLock = false;
187 |
188 | if(this.englishStart == -1){//当前的分词器尚未开始处理英文字符
189 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
190 | //记录起始指针的位置,标明分词器进入处理状态
191 | this.englishStart = context.getCursor();
192 | this.englishEnd = this.englishStart;
193 | }
194 | }else {//当前的分词器正在处理英文字符
195 | if(CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()){
196 | //记录当前指针位置为结束位置
197 | this.englishEnd = context.getCursor();
198 | }else{
199 | //遇到非English字符,输出词元
200 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
201 | context.addLexeme(newLexeme);
202 | this.englishStart = -1;
203 | this.englishEnd= -1;
204 | }
205 | }
206 |
207 | //判断缓冲区是否已经读完
208 | if(context.isBufferConsumed() && (this.englishStart != -1 && this.englishEnd != -1)){
209 | //缓冲以读完,输出词元
210 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.englishStart , this.englishEnd - this.englishStart + 1 , Lexeme.TYPE_ENGLISH);
211 | context.addLexeme(newLexeme);
212 | this.englishStart = -1;
213 | this.englishEnd= -1;
214 | }
215 |
216 | //判断是否锁定缓冲区
217 | if(this.englishStart == -1 && this.englishEnd == -1){
218 | //对缓冲区解锁
219 | needLock = false;
220 | }else{
221 | needLock = true;
222 | }
223 | return needLock;
224 | }
225 |
226 | /**
227 | * 处理阿拉伯数字输出
228 | * @param context
229 | * @return
230 | */
231 | private boolean processArabicLetter(AnalyzeContext context){
232 | boolean needLock = false;
233 |
234 | if(this.arabicStart == -1){//当前的分词器尚未开始处理数字字符
235 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
236 | //记录起始指针的位置,标明分词器进入处理状态
237 | this.arabicStart = context.getCursor();
238 | this.arabicEnd = this.arabicStart;
239 | }
240 | }else {//当前的分词器正在处理数字字符
241 | if(CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()){
242 | //记录当前指针位置为结束位置
243 | this.arabicEnd = context.getCursor();
244 | }else if(CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
245 | && this.isNumConnector(context.getCurrentChar())){
246 | //不输出数字,但不标记结束
247 | }else{
248 | ////遇到非Arabic字符,输出词元
249 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
250 | context.addLexeme(newLexeme);
251 | this.arabicStart = -1;
252 | this.arabicEnd = -1;
253 | }
254 | }
255 |
256 | //判断缓冲区是否已经读完
257 | if(context.isBufferConsumed() && (this.arabicStart != -1 && this.arabicEnd != -1)){
258 | //生成已切分的词元
259 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , this.arabicStart , this.arabicEnd - this.arabicStart + 1 , Lexeme.TYPE_ARABIC);
260 | context.addLexeme(newLexeme);
261 | this.arabicStart = -1;
262 | this.arabicEnd = -1;
263 | }
264 |
265 | //判断是否锁定缓冲区
266 | if(this.arabicStart == -1 && this.arabicEnd == -1){
267 | //对缓冲区解锁
268 | needLock = false;
269 | }else{
270 | needLock = true;
271 | }
272 | return needLock;
273 | }
274 |
275 | /**
276 | * 判断是否是字母连接符号
277 | * @param input
278 | * @return
279 | */
280 | private boolean isLetterConnector(char input){
281 | int index = Arrays.binarySearch(Letter_Connector, input);
282 | return index >= 0;
283 | }
284 |
285 | /**
286 | * 判断是否是数字连接符号
287 | * @param input
288 | * @return
289 | */
290 | private boolean isNumConnector(char input){
291 | int index = Arrays.binarySearch(Num_Connector, input);
292 | return index >= 0;
293 | }
294 | }
295 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/DicFile.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.dic;
2 |
3 | /**
4 | * 字典信息描述
5 | * */
6 | public class DicFile {
7 |
8 | /** 字典名称 */
9 | private String dicName;
10 |
11 | /** 字典文件路径*/
12 | private String dicPath;
13 |
14 | /** 是远程文件还是本地字典文件, 默认为本地字典文件*/
15 | private Boolean isRemote = false;
16 |
17 | private DictType dictType;
18 |
19 | private String absolutePath;
20 |
21 | public DicFile(String absolutePath){
22 | this.absolutePath = absolutePath;
23 | }
24 |
25 | public String getAbsolutePath() {
26 | return absolutePath;
27 | }
28 | public String getDicName() {
29 | return dicName;
30 | }
31 |
32 | public void setDicName(String dicName) {
33 | this.dicName = dicName;
34 | }
35 |
36 | public String getDicPath() {
37 | return dicPath;
38 | }
39 |
40 | public void setDicPath(String dicPath) {
41 | this.dicPath = dicPath;
42 | }
43 |
44 | public Boolean isRemote() {
45 | return isRemote;
46 | }
47 |
48 | public void setRemote(Boolean remote) {
49 | isRemote = remote;
50 | }
51 |
52 | public DictType getDictType() {
53 | return dictType;
54 | }
55 |
56 | public DicFile setDictType(DictType dictType) {
57 | this.dictType = dictType;
58 | return this;
59 | }
60 |
61 | public enum DictType{
62 | /**整词*/
63 | INTACT_WORDS,
64 | /**量词*/
65 | QUANTIFIER,
66 | /**停词*/
67 | STOPWORDS,
68 | SUFFIX,
69 | SURNAME;
70 | }
71 | }
72 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/DictSegment.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.util.Arrays;
29 | import java.util.Map;
30 | import java.util.concurrent.ConcurrentHashMap;
31 |
32 | /**
33 | * 词典树分段,表示词典树的一个分枝
34 | */
35 | class DictSegment implements Comparable{
36 |
37 | //公用字典表,存储汉字
38 | private static final Map charMap = new ConcurrentHashMap(16 , 0.95f);
39 | //数组大小上限
40 | private static final int ARRAY_LENGTH_LIMIT = 3;
41 |
42 |
43 | //Map存储结构
44 | private Map childrenMap;
45 | //数组方式存储结构
46 | private DictSegment[] childrenArray;
47 |
48 |
49 | //当前节点上存储的字符
50 | private Character nodeChar;
51 | //当前节点存储的Segment数目
52 | //storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
53 | private int storeSize = 0;
54 | //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
55 | private int nodeState = 0;
56 |
57 |
58 | DictSegment(Character nodeChar){
59 | if(nodeChar == null){
60 | throw new IllegalArgumentException("参数为空异常,字符不能为空");
61 | }
62 | this.nodeChar = nodeChar;
63 | }
64 |
65 | Character getNodeChar() {
66 | return nodeChar;
67 | }
68 |
69 | /*
70 | * 判断是否有下一个节点
71 | */
72 | boolean hasNextNode(){
73 | return this.storeSize > 0;
74 | }
75 |
76 | /**
77 | * 匹配词段
78 | * @param charArray
79 | * @return Hit
80 | */
81 | Hit match(char[] charArray){
82 | return this.match(charArray , 0 , charArray.length , null);
83 | }
84 |
85 | /**
86 | * 匹配词段
87 | * @param charArray
88 | * @param begin
89 | * @param length
90 | * @return Hit
91 | */
92 | Hit match(char[] charArray , int begin , int length){
93 | return this.match(charArray , begin , length , null);
94 | }
95 |
96 | /**
97 | * 匹配词段
98 | * @param charArray
99 | * @param begin
100 | * @param length
101 | * @param searchHit
102 | * @return Hit
103 | */
104 | Hit match(char[] charArray , int begin , int length , Hit searchHit){
105 |
106 | if(searchHit == null){
107 | //如果hit为空,新建
108 | searchHit= new Hit();
109 | //设置hit的其实文本位置
110 | searchHit.setBegin(begin);
111 | }else{
112 | //否则要将HIT状态重置
113 | searchHit.setUnmatch();
114 | }
115 | //设置hit的当前处理位置
116 | searchHit.setEnd(begin);
117 |
118 | Character keyChar = Character.valueOf(charArray[begin]);
119 | DictSegment ds = null;
120 |
121 | //引用实例变量为本地变量,避免查询时遇到更新的同步问题
122 | DictSegment[] segmentArray = this.childrenArray;
123 | Map segmentMap = this.childrenMap;
124 |
125 | //STEP1 在节点中查找keyChar对应的DictSegment
126 | if(segmentArray != null){
127 | //在数组中查找
128 | DictSegment keySegment = new DictSegment(keyChar);
129 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize , keySegment);
130 | if(position >= 0){
131 | ds = segmentArray[position];
132 | }
133 |
134 | }else if(segmentMap != null){
135 | //在map中查找
136 | ds = (DictSegment)segmentMap.get(keyChar);
137 | }
138 |
139 | //STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
140 | if(ds != null){
141 | if(length > 1){
142 | //词未匹配完,继续往下搜索
143 | return ds.match(charArray, begin + 1 , length - 1 , searchHit);
144 | }else if (length == 1){
145 |
146 | //搜索最后一个char
147 | if(ds.nodeState == 1){
148 | //添加HIT状态为完全匹配
149 | searchHit.setMatch();
150 | }
151 | if(ds.hasNextNode()){
152 | //添加HIT状态为前缀匹配
153 | searchHit.setPrefix();
154 | //记录当前位置的DictSegment
155 | searchHit.setMatchedDictSegment(ds);
156 | }
157 | return searchHit;
158 | }
159 |
160 | }
161 | //STEP3 没有找到DictSegment, 将HIT设置为不匹配
162 | return searchHit;
163 | }
164 |
165 | /**
166 | * 加载填充词典片段
167 | * @param charArray
168 | */
169 | void fillSegment(char[] charArray){
170 | this.fillSegment(charArray, 0 , charArray.length , 1);
171 | }
172 |
173 | /**
174 | * 屏蔽词典中的一个词
175 | * @param charArray
176 | */
177 | void disableSegment(char[] charArray){
178 | this.fillSegment(charArray, 0 , charArray.length , 0);
179 | }
180 |
181 | /**
182 | * 加载填充词典片段
183 | * @param charArray
184 | * @param begin
185 | * @param length
186 | * @param enabled
187 | */
188 | private synchronized void fillSegment(char[] charArray , int begin , int length , int enabled){
189 | //获取字典表中的汉字对象
190 | Character beginChar = Character.valueOf(charArray[begin]);
191 | Character keyChar = charMap.get(beginChar);
192 | //字典中没有该字,则将其添加入字典
193 | if(keyChar == null){
194 | charMap.put(beginChar, beginChar);
195 | keyChar = beginChar;
196 | }
197 |
198 | //搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建
199 | DictSegment ds = lookforSegment(keyChar , enabled);
200 | if(ds != null){
201 | //处理keyChar对应的segment
202 | if(length > 1){
203 | //词元还没有完全加入词典树
204 | ds.fillSegment(charArray, begin + 1, length - 1 , enabled);
205 | }else if (length == 1){
206 | //已经是词元的最后一个char,设置当前节点状态为enabled,
207 | //enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
208 | ds.nodeState = enabled;
209 | }
210 | }
211 |
212 | }
213 |
214 | /**
215 | * 查找本节点下对应的keyChar的segment *
216 | * @param keyChar
217 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
218 | * @return
219 | */
220 | private DictSegment lookforSegment(Character keyChar , int create){
221 |
222 | DictSegment ds = null;
223 |
224 | if(this.storeSize <= ARRAY_LENGTH_LIMIT){
225 | //获取数组容器,如果数组未创建则创建数组
226 | DictSegment[] segmentArray = getChildrenArray();
227 | //搜寻数组
228 | DictSegment keySegment = new DictSegment(keyChar);
229 | int position = Arrays.binarySearch(segmentArray, 0 , this.storeSize, keySegment);
230 | if(position >= 0){
231 | ds = segmentArray[position];
232 | }
233 |
234 | //遍历数组后没有找到对应的segment
235 | if(ds == null && create == 1){
236 | ds = keySegment;
237 | if(this.storeSize < ARRAY_LENGTH_LIMIT){
238 | //数组容量未满,使用数组存储
239 | segmentArray[this.storeSize] = ds;
240 | //segment数目+1
241 | this.storeSize++;
242 | Arrays.sort(segmentArray , 0 , this.storeSize);
243 |
244 | }else{
245 | //数组容量已满,切换Map存储
246 | //获取Map容器,如果Map未创建,则创建Map
247 | Map segmentMap = getChildrenMap();
248 | //将数组中的segment迁移到Map中
249 | migrate(segmentArray , segmentMap);
250 | //存储新的segment
251 | segmentMap.put(keyChar, ds);
252 | //segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
253 | this.storeSize++;
254 | //释放当前的数组引用
255 | this.childrenArray = null;
256 | }
257 |
258 | }
259 |
260 | }else{
261 | //获取Map容器,如果Map未创建,则创建Map
262 | Map segmentMap = getChildrenMap();
263 | //搜索Map
264 | ds = (DictSegment)segmentMap.get(keyChar);
265 | if(ds == null && create == 1){
266 | //构造新的segment
267 | ds = new DictSegment(keyChar);
268 | segmentMap.put(keyChar , ds);
269 | //当前节点存储segment数目+1
270 | this.storeSize ++;
271 | }
272 | }
273 |
274 | return ds;
275 | }
276 |
277 |
278 | /**
279 | * 获取数组容器
280 | * 线程同步方法
281 | */
282 | private DictSegment[] getChildrenArray(){
283 | synchronized(this){
284 | if(this.childrenArray == null){
285 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
286 | }
287 | }
288 | return this.childrenArray;
289 | }
290 |
291 | /**
292 | * 获取Map容器
293 | * 线程同步方法
294 | */
295 | private Map getChildrenMap(){
296 | synchronized(this){
297 | if(this.childrenMap == null){
298 | this.childrenMap = new ConcurrentHashMap(ARRAY_LENGTH_LIMIT * 2,0.8f);
299 | }
300 | }
301 | return this.childrenMap;
302 | }
303 |
304 | /**
305 | * 将数组中的segment迁移到Map中
306 | * @param segmentArray
307 | */
308 | private void migrate(DictSegment[] segmentArray , Map segmentMap){
309 | for(DictSegment segment : segmentArray){
310 | if(segment != null){
311 | segmentMap.put(segment.nodeChar, segment);
312 | }
313 | }
314 | }
315 |
316 | /**
317 | * 实现Comparable接口
318 | * @param o
319 | * @return int
320 | */
321 | public int compareTo(DictSegment o) {
322 | //对当前节点存储的char进行比较
323 | return this.nodeChar.compareTo(o.nodeChar);
324 | }
325 |
326 | }
327 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Dictionary.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.io.*;
29 | import java.nio.file.Files;
30 | import java.nio.file.Path;
31 | import java.nio.file.Paths;
32 | import java.util.*;
33 | import java.util.concurrent.Executors;
34 | import java.util.concurrent.ScheduledExecutorService;
35 | import java.util.concurrent.TimeUnit;
36 | import org.apache.logging.log4j.Logger;
37 | import org.elasticsearch.common.io.PathUtils;
38 | import org.wltea.analyzer.help.ESPluginLoggerFactory;
39 |
40 |
41 | /**
42 | * 词典管理类,单子模式
43 | */
44 | public class Dictionary {
45 |
46 | /*
47 | * 词典单子实例
48 | */
49 | private static Dictionary singleton;
50 | /*
51 | * 主词典对象
52 | */
53 | private Map _MainDict = new HashMap<>(4);
54 | /*
55 | * 量词词典
56 | */
57 | private Map _QuantifierDict = new HashMap<>(4);
58 | /*
59 | * 停止词集合
60 | */
61 | private Map _StopWords = new HashMap<>(4);
62 | /*
63 | * 姓氏词典
64 | */
65 | private Map _SurnameDict = new HashMap<>(4);
66 | /*
67 | * 后缀词典
68 | */
69 | private Map _SuffixDict = new HashMap<>(4);
70 | /*
71 | * 副词,介词词典
72 | */
73 | private Map _PrepDict = new HashMap<>(4);
74 |
75 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDicMonitor.class.getName());
76 |
77 | private static ScheduledExecutorService pool;
78 |
79 | private RemoteDicMonitor dicMonitor;
80 |
81 | private Dictionary(){}
82 |
83 | public void loadAllDictFiles(List dicFiles) {
84 | dicFiles.forEach(dicFile -> {
85 | if(needLoad(dicFile)){
86 | DictSegment dictSegment;
87 | if(dicFile.isRemote()){
88 | // 从远程加载
89 | dictSegment = RemoteDicMonitor.loadRemoteDic(dicFile);
90 | // 添加监控任务
91 | addMonitorTask(dicFile);
92 | } else {
93 | dictSegment = loadLocalDictFile(dicFile);
94 | }
95 | if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){
96 | _MainDict.put(dicFile.getDicName(), dictSegment);
97 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){
98 | _QuantifierDict.put(dicFile.getDicName(), dictSegment);
99 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){
100 | _StopWords.put(dicFile.getDicName(), dictSegment);
101 | } else if(dicFile.getDictType() == DicFile.DictType.SUFFIX){
102 | _SuffixDict.put(dicFile.getDicName(), dictSegment);
103 | } else if(dicFile.getDictType() == DicFile.DictType.SURNAME){
104 | _SurnameDict.put(dicFile.getDicName(), dictSegment);
105 | }
106 | }
107 | });
108 | }
109 |
110 | private void addMonitorTask(DicFile dicFile) {
111 | if(pool == null){
112 | synchronized (Dictionary.class){
113 | if(pool == null){
114 | // 初始化监控任务
115 | initRemoteMoniter();
116 | }
117 | }
118 | }
119 | RemoteDicMonitor.RemoteDicFile remoteDicFile = new RemoteDicMonitor.RemoteDicFile(dicFile.getAbsolutePath());
120 | remoteDicFile.setDicName(dicFile.getDicName());
121 | remoteDicFile.setDicPath(dicFile.getDicPath());
122 | remoteDicFile.setDictType(dicFile.getDictType());
123 | remoteDicFile.setRemote(true);
124 | this.dicMonitor.addFile(remoteDicFile);
125 | }
126 |
127 | private boolean needLoad(DicFile dicFile){
128 | if(dicFile.getDictType() == DicFile.DictType.INTACT_WORDS){
129 | return _MainDict.get(dicFile.getDicName()) == null;
130 | } else if(dicFile.getDictType() == DicFile.DictType.QUANTIFIER){
131 | return _QuantifierDict.get(dicFile.getDicName()) == null;
132 | } else if(dicFile.getDictType() == DicFile.DictType.STOPWORDS){
133 | return _StopWords.get(dicFile.getDicName()) == null;
134 | } else if(dicFile.getDictType() == DicFile.DictType.SUFFIX){
135 | return _SuffixDict.get(dicFile.getDicName()) == null;
136 | } else if(dicFile.getDictType() == DicFile.DictType.SURNAME){
137 | return _SurnameDict.get(dicFile.getDicName()) == null;
138 | }
139 | return false;
140 | }
141 |
142 | private static DictSegment loadLocalDictFile(DicFile dicFile) {
143 | DictSegment dictSegment = new DictSegment((char) 0);
144 |
145 | // check file exist
146 | // 读取字典文件路径顺序:优先从es的config/analysis-ik/下读取字典文件,
147 | // 如未找到,则从plugin下,分词器对应的目录读取
148 | Path dicFilePath = Paths.get(dicFile.getAbsolutePath(), dicFile.getDicPath());
149 | if(!Files.exists(dicFilePath)){
150 | Path configInPluginDir = PathUtils.get(new File(Dictionary.class.getProtectionDomain().getCodeSource().getLocation().getPath())
151 | .getParent(), "config").toAbsolutePath();
152 | dicFilePath = configInPluginDir.resolve(dicFile.getDicPath());
153 | }
154 | // 读取词典文件
155 | try (InputStream is = new FileInputStream(dicFilePath.toFile());
156 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512)){
157 | String word = br.readLine();
158 | if (word != null) {
159 | if (word.startsWith("\uFEFF"))
160 | word = word.substring(1);
161 | for (; word != null; word = br.readLine()) {
162 | word = word.trim();
163 | if (word.isEmpty()) continue;
164 | dictSegment.fillSegment(word.toCharArray());
165 | }
166 | }
167 | } catch (FileNotFoundException e) {
168 | logger.error("ik-analyzer: " + dicFile.getDicName() + " not found", e);
169 | throw new RuntimeException("ik-analyzer: " + dicFile.getDicName() + " not found!!!", e);
170 | } catch (IOException e) {
171 | logger.error("ik-analyzer: " + dicFile.getDicName() + " loading failed", e);
172 | }
173 | return dictSegment;
174 | }
175 |
176 | /**
177 | * 获取词典单子实例
178 | *
179 | * @return Dictionary 单例对象
180 | */
181 | public static Dictionary getSingleton() {
182 | if (singleton == null) {
183 | synchronized (Dictionary.class){
184 | if(singleton == null){
185 | singleton = new Dictionary();
186 | }
187 | }
188 | }
189 | return singleton;
190 | }
191 |
192 | public static void initRemoteMoniter(){
193 | // 开启远程词典文件监控任务
194 | singleton.dicMonitor = new RemoteDicMonitor();
195 | pool = Executors.newScheduledThreadPool(1);
196 | pool.scheduleAtFixedRate(singleton.dicMonitor, 10, 60, TimeUnit.SECONDS);
197 | }
198 |
199 |
200 | /**
201 | * 批量加载新词条
202 | *
203 | * @param words
204 | * Collection词条列表
205 | */
206 | public void addWords(String fileName, Collection words) {
207 | if (words != null) {
208 | for (String word : words) {
209 | if (word != null) {
210 | // 批量加载词条到主内存词典中
211 | singleton._MainDict.get(fileName).fillSegment(word.trim().toCharArray());
212 | }
213 | }
214 | }
215 | }
216 |
217 | /**
218 | * 批量移除(屏蔽)词条
219 | */
220 | public void disableWords(String fileName, Collection words) {
221 | if (words != null) {
222 | for (String word : words) {
223 | if (word != null) {
224 | // 批量屏蔽词条
225 | singleton._MainDict.get(fileName).disableSegment(word.trim().toCharArray());
226 | }
227 | }
228 | }
229 | }
230 |
231 | /**
232 | * 检索匹配主词典
233 | *
234 | * @return Hit 匹配结果描述
235 | */
236 | public Hit matchInMainDict(String fileName, char[] charArray) {
237 | return singleton._MainDict.get(fileName).match(charArray);
238 | }
239 |
240 | /**
241 | * 检索匹配主词典
242 | *
243 | * @return Hit 匹配结果描述
244 | */
245 | public List matchInMainDict(List dicNames, char[] charArray, int begin, int length) {
246 | ArrayList tmpHits = new ArrayList(dicNames.size());
247 | for(String dicName : dicNames){
248 | // 成词优先级比前缀优先级高
249 | Hit tmpHit = singleton._MainDict.get(dicName).match(charArray, begin, length);
250 | if(tmpHit.isMatch() || tmpHit.isPrefix()) tmpHits.add(tmpHit);
251 | }
252 | return tmpHits;
253 | }
254 |
255 | /**
256 | * 检索匹配量词词典
257 | *
258 | * @return Hit 匹配结果描述
259 | */
260 | public Hit matchInQuantifierDict(List fileNames, char[] charArray, int begin, int length) {
261 | Hit tmpHit = new Hit();
262 | for(String fileName : fileNames){
263 | // 成词优先级比前缀优先级高
264 | tmpHit = singleton._QuantifierDict.get(fileName).match(charArray, begin, length);
265 | if(tmpHit.isMatch() || tmpHit.isPrefix()) return tmpHit;
266 | }
267 | return tmpHit;
268 | }
269 |
270 | /**
271 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配
272 | *
273 | * @return Hit
274 | */
275 | public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
276 | DictSegment ds = matchedHit.getMatchedDictSegment();
277 | return ds.match(charArray, currentIndex, 1, matchedHit);
278 | }
279 |
280 | /**
281 | * 判断是否是停止词
282 | *
283 | * @return boolean
284 | */
285 | public boolean isStopWord(List fileNames, char[] charArray, int begin, int length) {
286 | for(String fileName : fileNames){
287 | // 满足任意词典里的停词,则认为是停词,都不满足,则不是停词
288 | if(singleton._StopWords.get(fileName).match(charArray, begin, length).isMatch())
289 | return true;
290 | }
291 | return false;
292 | }
293 |
294 | /**
295 | * 检索匹配姓氏词典
296 | * @param charArray
297 | * @param begin
298 | * @param length
299 | * @return Hit 匹配结果描述
300 | */
301 | public static Hit matchInSurnameDict(String fileName, char[] charArray , int begin, int length){
302 | return singleton._SurnameDict.get(fileName).match(charArray, begin, length);
303 | }
304 |
305 | /**
306 | * 检索匹配在后缀词典
307 | * @param charArray
308 | * @param begin
309 | * @param length
310 | * @return Hit 匹配结果描述
311 | */
312 | public static Hit matchInSuffixDict(String fileName, char[] charArray , int begin, int length){
313 | return singleton._SuffixDict.get(fileName).match(charArray, begin, length);
314 | }
315 |
316 | /**
317 | * 检索匹配介词、副词词典
318 | * @param charArray
319 | * @param begin
320 | * @param length
321 | * @return Hit 匹配结果描述
322 | */
323 | public static Hit matchInPrepDict(String fileName, char[] charArray , int begin, int length){
324 | return singleton._PrepDict.get(fileName).match(charArray, begin, length);
325 | }
326 | }
327 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Hit.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | /**
29 | * 表示一次词典匹配的命中
30 | */
31 | public class Hit {
32 | //Hit不匹配
33 | private static final int UNMATCH = 0x00000000;
34 | //Hit完全匹配
35 | private static final int MATCH = 0x00000001;
36 | //Hit前缀匹配
37 | private static final int PREFIX = 0x00000010;
38 |
39 |
40 | //该HIT当前状态,默认未匹配
41 | private int hitState = UNMATCH;
42 |
43 | //记录词典匹配过程中,当前匹配到的词典分支节点
44 | private DictSegment matchedDictSegment;
45 | /*
46 | * 词段开始位置
47 | */
48 | private int begin;
49 | /*
50 | * 词段的结束位置
51 | */
52 | private int end;
53 |
54 |
55 | /**
56 | * 判断是否完全匹配
57 | */
58 | public boolean isMatch() {
59 | return (this.hitState & MATCH) > 0;
60 | }
61 | /**
62 | *
63 | */
64 | public void setMatch() {
65 | this.hitState = this.hitState | MATCH;
66 | }
67 |
68 | /**
69 | * 判断是否是词的前缀
70 | */
71 | public boolean isPrefix() {
72 | return (this.hitState & PREFIX) > 0;
73 | }
74 | /**
75 | *
76 | */
77 | public void setPrefix() {
78 | this.hitState = this.hitState | PREFIX;
79 | }
80 | /**
81 | * 判断是否是不匹配
82 | */
83 | public boolean isUnmatch() {
84 | return this.hitState == UNMATCH ;
85 | }
86 | /**
87 | *
88 | */
89 | public void setUnmatch() {
90 | this.hitState = UNMATCH;
91 | }
92 |
93 | public DictSegment getMatchedDictSegment() {
94 | return matchedDictSegment;
95 | }
96 |
97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) {
98 | this.matchedDictSegment = matchedDictSegment;
99 | }
100 |
101 | public int getBegin() {
102 | return begin;
103 | }
104 |
105 | public void setBegin(int begin) {
106 | this.begin = begin;
107 | }
108 |
109 | public int getEnd() {
110 | return end;
111 | }
112 |
113 | public void setEnd(int end) {
114 | this.end = end;
115 | }
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/RemoteDicMonitor.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.dic;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStreamReader;
6 | import java.security.AccessController;
7 | import java.security.PrivilegedAction;
8 | import java.util.ArrayList;
9 | import java.util.List;
10 | import java.util.concurrent.ConcurrentLinkedQueue;
11 |
12 | import org.apache.http.Header;
13 | import org.apache.http.HttpEntity;
14 | import org.apache.http.client.config.RequestConfig;
15 | import org.apache.http.client.methods.CloseableHttpResponse;
16 | import org.apache.http.client.methods.HttpGet;
17 | import org.apache.http.client.methods.HttpHead;
18 | import org.apache.http.impl.client.CloseableHttpClient;
19 | import org.apache.http.impl.client.HttpClients;
20 | import org.apache.logging.log4j.Logger;
21 | import org.elasticsearch.SpecialPermission;
22 | import org.wltea.analyzer.help.ESPluginLoggerFactory;
23 |
24 | public class RemoteDicMonitor implements Runnable {
25 |
26 | private static final Logger logger = ESPluginLoggerFactory.getLogger(RemoteDicMonitor.class.getName());
27 |
28 | private static CloseableHttpClient httpclient = HttpClients.createDefault();
29 |
30 | public static class RemoteDicFile extends DicFile{
31 | /** 上次更改时间 */
32 | private String last_modified;
33 | /** 资源属性 */
34 | private String eTags;
35 |
36 | public RemoteDicFile(String absolutePath) {
37 | super(absolutePath);
38 | }
39 |
40 | public String getLast_modified() {
41 | return last_modified;
42 | }
43 |
44 | public void setLast_modified(String last_modified) {
45 | this.last_modified = last_modified;
46 | }
47 |
48 | public String getETags() {
49 | return eTags;
50 | }
51 |
52 | public void setETags(String eTags) {
53 | this.eTags = eTags;
54 | }
55 | }
56 |
57 | /*
58 | * 请求地址
59 | */
60 | private ConcurrentLinkedQueue monitorFiles = new ConcurrentLinkedQueue<>();
61 |
62 | public void addFile(RemoteDicFile dicFile){
63 | boolean hasAdd = monitorFiles.stream().anyMatch(r -> r.getDicName().equals(dicFile.getDicName()));
64 | if(!hasAdd) {
65 | monitorFiles.offer(dicFile);
66 | }
67 | }
68 |
69 | public void run() {
70 | SpecialPermission.check();
71 | monitorFiles.forEach(dicFile -> {
72 | AccessController.doPrivileged((PrivilegedAction) () -> {
73 | this.runUnprivileged(dicFile);
74 | return null;
75 | });
76 | });
77 | }
78 |
79 | /**
80 | * 监控流程:
81 | * ①向词库服务器发送Head请求
82 | * ②从响应中获取Last-Modify、ETags字段值,判断是否变化
83 | * ③如果未变化,休眠1min,返回第①步
84 | * ④如果有变化,重新加载词典
85 | * ⑤休眠1min,返回第①步
86 | */
87 |
88 | public void runUnprivileged(RemoteDicFile dicFile) {
89 |
90 | //超时设置
91 | RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
92 | .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();
93 |
94 | HttpHead httpHead = new HttpHead(dicFile.getDicPath());
95 | httpHead.setConfig(rc);
96 |
97 | //设置请求头
98 | if (dicFile.getLast_modified() != null) {
99 | httpHead.setHeader("If-Modified-Since", dicFile.getLast_modified());
100 | }
101 | if (dicFile.getETags() != null) {
102 | httpHead.setHeader("If-None-Match", dicFile.getETags());
103 | }
104 |
105 | CloseableHttpResponse response = null;
106 | try {
107 |
108 | response = httpclient.execute(httpHead);
109 |
110 | //返回200 才做操作
111 | if(response.getStatusLine().getStatusCode()==200){
112 |
113 | if (((response.getLastHeader("Last-Modified")!=null) && !response.getLastHeader("Last-Modified").getValue().equalsIgnoreCase(dicFile.getLast_modified()))
114 | ||((response.getLastHeader("ETag")!=null) && !response.getLastHeader("ETag").getValue().equalsIgnoreCase(dicFile.eTags))) {
115 |
116 | // 远程词库有更新,需要重新加载词典,并修改last_modified,eTags
117 | List words = getRemoteWords(dicFile.getDicPath());
118 | Dictionary.getSingleton().addWords(dicFile.getDicName(), words);
119 | dicFile.setLast_modified(response.getLastHeader("Last-Modified")==null?null:response.getLastHeader("Last-Modified").getValue());
120 | dicFile.setETags(response.getLastHeader("ETag")==null?null:response.getLastHeader("ETag").getValue());
121 | }
122 | }else if (response.getStatusLine().getStatusCode()==304) {
123 | //没有修改,不做操作
124 | //noop
125 | }else{
126 | logger.info("remote_ext_dict {} return bad code {}" , dicFile.getDicPath() , response.getStatusLine().getStatusCode() );
127 | }
128 |
129 | } catch (Exception e) {
130 | logger.error("remote_ext_dict {} error!",e , dicFile.getDicPath());
131 | }finally{
132 | try {
133 | if (response != null) {
134 | response.close();
135 | }
136 | } catch (IOException e) {
137 | logger.error(e.getMessage(), e);
138 | }
139 | }
140 | }
141 |
142 | public static DictSegment loadRemoteDic(DicFile dicFile){
143 | logger.info("[Dict Loading] " + dicFile.getDicPath());
144 | DictSegment dictSegment = new DictSegment((char) 0);
145 | List lists = getRemoteWords(dicFile.getDicPath());
146 | // 如果找不到扩展的字典,则忽略
147 | if (lists == null) {
148 | logger.error("[Dict Loading] " + dicFile.getDicPath() + "加载失败");
149 | return dictSegment;
150 | }
151 | for (String theWord : lists) {
152 | if (theWord != null && !"".equals(theWord.trim())) {
153 | logger.info(theWord);
154 | dictSegment.fillSegment(theWord.trim().toLowerCase().toCharArray());
155 | }
156 | }
157 | return dictSegment;
158 | }
159 |
160 | private static List getRemoteWords(String location) {
161 | SpecialPermission.check();
162 | return AccessController.doPrivileged((PrivilegedAction>) () -> {
163 | return getRemoteWordsUnprivileged(location);
164 | });
165 | }
166 |
167 |
168 | /**
169 | * 从远程服务器上下载自定义词条
170 | */
171 | public static List getRemoteWordsUnprivileged(String location) {
172 |
173 | List buffer = new ArrayList();
174 | RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10 * 1000).setConnectTimeout(10 * 1000)
175 | .setSocketTimeout(60 * 1000).build();
176 | CloseableHttpClient httpclient = HttpClients.createDefault();
177 | CloseableHttpResponse response;
178 | BufferedReader in;
179 | HttpGet get = new HttpGet(location);
180 | get.setConfig(rc);
181 | try {
182 | response = httpclient.execute(get);
183 | if (response.getStatusLine().getStatusCode() == 200) {
184 |
185 | String charset = "UTF-8";
186 | // 获取编码,默认为utf-8
187 | HttpEntity entity = response.getEntity();
188 | if(entity!=null){
189 | Header contentType = entity.getContentType();
190 | if(contentType!=null&&contentType.getValue()!=null){
191 | String typeValue = contentType.getValue();
192 | if(typeValue!=null&&typeValue.contains("charset=")){
193 | charset = typeValue.substring(typeValue.lastIndexOf("=") + 1);
194 | }
195 | }
196 |
197 | if (entity.getContentLength() > 0) {
198 | in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
199 | String line;
200 | while ((line = in.readLine()) != null) {
201 | buffer.add(line);
202 | }
203 | in.close();
204 | response.close();
205 | return buffer;
206 | }
207 | }
208 | }
209 | response.close();
210 | } catch (IllegalStateException | IOException e) {
211 | logger.error("getRemoteWords {} error", e, location);
212 | }
213 | return buffer;
214 | }
215 | }
216 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/CharacterHelper.java:
--------------------------------------------------------------------------------
1 |
2 | package org.wltea.analyzer.help;
3 |
/**
 * Character classification and normalization helpers used by the tokenizer.
 */
public class CharacterHelper {

    /**
     * @return true for whitespace-like characters: backspace(8), tab(9),
     *         LF(10), CR(13), space(32) and no-break space(160)
     */
    public static boolean isSpaceLetter(char input) {
        switch (input) {
            case 8:
            case 9:
            case 10:
            case 13:
            case 32:
            case 160:
                return true;
            default:
                return false;
        }
    }

    /** @return true for ASCII letters a-z / A-Z */
    public static boolean isEnglishLetter(char input) {
        return ('a' <= input && input <= 'z') || ('A' <= input && input <= 'Z');
    }

    /** @return true for ASCII digits 0-9 */
    public static boolean isArabicNumber(char input) {
        return '0' <= input && input <= '9';
    }

    /**
     * @return true if the character belongs to a CJK-related Unicode block:
     *         CJK ideographs (incl. compatibility and Extension A),
     *         halfwidth/fullwidth forms, Hangul, or Japanese kana
     */
    public static boolean isCJKCharacter(char input) {
        Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
        return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS
                || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
                || ub == Character.UnicodeBlock.HANGUL_JAMO
                || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
                || ub == Character.UnicodeBlock.HIRAGANA
                || ub == Character.UnicodeBlock.KATAKANA
                || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS;
    }

    /**
     * Normalizes a character: the ideographic space U+3000 (12288) becomes an
     * ASCII space; fullwidth forms U+FF01..U+FF5E are shifted down by 65248
     * to their ASCII counterparts; ASCII uppercase is lowered.
     */
    public static char regularize(char input) {
        if (input == 12288) {
            return (char) 32;
        }
        if (input > 65280 && input < 65375) {
            return (char) (input - 65248);
        }
        if (input >= 'A' && input <= 'Z') {
            return (char) (input + 32);
        }
        return input;
    }

}
61 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/ESPluginLoggerFactory.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 |
3 | import org.apache.logging.log4j.LogManager;
4 | import org.apache.logging.log4j.Logger;
5 | import org.apache.logging.log4j.spi.ExtendedLogger;
6 |
7 | public class ESPluginLoggerFactory {
8 |
9 | private ESPluginLoggerFactory() {
10 | }
11 |
12 | static public Logger getLogger(String name) {
13 | return getLogger("", LogManager.getLogger(name));
14 | }
15 |
16 | static public Logger getLogger(String prefix, String name) {
17 | return getLogger(prefix, LogManager.getLogger(name));
18 | }
19 |
20 | static public Logger getLogger(String prefix, Class> clazz) {
21 | return getLogger(prefix, LogManager.getLogger(clazz.getName()));
22 | }
23 |
24 | static public Logger getLogger(String prefix, Logger logger) {
25 | return (Logger)(prefix != null && prefix.length() != 0 ? new PrefixPluginLogger((ExtendedLogger)logger, logger.getName(), prefix) : logger);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/PrefixPluginLogger.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 |
3 | import org.apache.logging.log4j.Level;
4 | import org.apache.logging.log4j.Marker;
5 | import org.apache.logging.log4j.MarkerManager;
6 | import org.apache.logging.log4j.message.Message;
7 | import org.apache.logging.log4j.message.MessageFactory;
8 | import org.apache.logging.log4j.spi.ExtendedLogger;
9 | import org.apache.logging.log4j.spi.ExtendedLoggerWrapper;
10 |
11 | import java.util.WeakHashMap;
12 |
13 | public class PrefixPluginLogger extends ExtendedLoggerWrapper {
14 | private static final WeakHashMap markers = new WeakHashMap();
15 | private final Marker marker;
16 |
17 | static int markersSize() {
18 | return markers.size();
19 | }
20 |
21 | public String prefix() {
22 | return this.marker.getName();
23 | }
24 |
25 | PrefixPluginLogger(ExtendedLogger logger, String name, String prefix) {
26 | super(logger, name, (MessageFactory) null);
27 | String actualPrefix = prefix == null ? "" : prefix;
28 | WeakHashMap var6 = markers;
29 | MarkerManager.Log4jMarker actualMarker;
30 | synchronized (markers) {
31 | MarkerManager.Log4jMarker maybeMarker = (MarkerManager.Log4jMarker) markers.get(actualPrefix);
32 | if (maybeMarker == null) {
33 | actualMarker = new MarkerManager.Log4jMarker(actualPrefix);
34 | markers.put(new String(actualPrefix), actualMarker);
35 | } else {
36 | actualMarker = maybeMarker;
37 | }
38 | }
39 |
40 | this.marker = (Marker) actualMarker;
41 | }
42 |
43 | public void logMessage(String fqcn, Level level, Marker marker, Message message, Throwable t) {
44 | assert marker == null;
45 |
46 | super.logMessage(fqcn, level, this.marker, message, t);
47 | }
48 | }
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/help/Sleep.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer.help;
2 |
3 | import org.apache.logging.log4j.Logger;
4 |
5 | public class Sleep {
6 |
7 | private static final Logger logger = ESPluginLoggerFactory.getLogger(Sleep.class.getName());
8 |
9 | public enum Type {MSEC, SEC, MIN, HOUR}
10 |
11 | ;
12 |
13 | public static void sleep(Type type, int num) {
14 | try {
15 | switch (type) {
16 | case MSEC:
17 | Thread.sleep(num);
18 | return;
19 | case SEC:
20 | Thread.sleep(num * 1000);
21 | return;
22 | case MIN:
23 | Thread.sleep(num * 60 * 1000);
24 | return;
25 | case HOUR:
26 | Thread.sleep(num * 60 * 60 * 1000);
27 | return;
28 | default:
29 | System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
30 | return;
31 | }
32 | } catch (InterruptedException e) {
33 | logger.error(e.getMessage(), e);
34 | }
35 | }
36 |
37 |
38 | }
39 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.lucene;
26 |
27 | import org.apache.lucene.analysis.Analyzer;
28 | import org.apache.lucene.analysis.Tokenizer;
29 | import org.wltea.analyzer.cfg.Configuration;
30 |
31 | /**
32 | * IK分词器,Lucene Analyzer接口实现
33 | * 兼容Lucene 4.0版本
34 | */
35 | public final class IKAnalyzer extends Analyzer{
36 |
37 | private Configuration configuration;
38 |
39 | /**
40 | * IK分词器Lucene Analyzer接口实现类
41 | *
42 | * 默认细粒度切分算法
43 | */
44 | private IKAnalyzer(){
45 | }
46 |
47 | /**
48 | * IK分词器Lucene Analyzer接口实现类
49 | *
50 | * @param configuration IK配置
51 | */
52 | public IKAnalyzer(Configuration configuration){
53 | super();
54 | this.configuration = configuration;
55 | }
56 |
57 |
58 | /**
59 | * 重载Analyzer接口,构造分词组件
60 | */
61 | @Override
62 | protected TokenStreamComponents createComponents(String fieldName) {
63 | Tokenizer _IKTokenizer = new IKTokenizer(configuration);
64 | return new TokenStreamComponents(_IKTokenizer);
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |
25 | *
26 | */
27 | package org.wltea.analyzer.lucene;
28 |
29 | import org.apache.lucene.analysis.Tokenizer;
30 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
32 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
33 | import org.wltea.analyzer.cfg.Configuration;
34 | import org.wltea.analyzer.core.IKSegmenter;
35 | import org.wltea.analyzer.core.Lexeme;
36 |
37 | import java.io.IOException;
38 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
39 | import org.wltea.analyzer.dic.Dictionary;
40 |
41 | /**
42 | * IK分词器 Lucene Tokenizer适配器类
43 | * 兼容Lucene 4.0版本
44 | */
/**
 * IK segmenter adapter for the Lucene {@link Tokenizer} API.
 * Compatible with Lucene 4.0+.
 */
public final class IKTokenizer extends Tokenizer {

    // the underlying IK segmenter implementation
    private IKSegmenter _IKImplement;

    // term text attribute
    private final CharTermAttribute termAtt;
    // term offset attribute
    private final OffsetAttribute offsetAtt;
    // term type attribute (categories defined in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // end offset of the last emitted term; used as the final offset in end()
    private int endPosition;

    // positions skipped before the current term; reset per token and in reset()
    private int skippedPositions;

    private PositionIncrementAttribute posIncrAtt;


    /**
     * Builds the tokenizer: registers the Lucene attributes, loads all
     * dictionary files from the configuration, and creates the segmenter
     * over this tokenizer's input reader.
     */
    public IKTokenizer(Configuration configuration){
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        posIncrAtt = addAttribute(PositionIncrementAttribute.class);
        // initialize the dictionaries
        Dictionary.getSingleton().loadAllDictFiles(configuration.getDicFiles());
        // create the segmenter
        _IKImplement = new IKSegmenter(input,configuration);
    }

    /* (non-Javadoc)
     * @see org.apache.lucene.analysis.TokenStream#incrementToken()
     */
    @Override
    public boolean incrementToken() throws IOException {
        // clear all token attributes before populating the next token
        clearAttributes();
        skippedPositions = 0;

        Lexeme nextLexeme = _IKImplement.next();
        if(nextLexeme != null){
            // NOTE(review): skippedPositions is always 0 here, so the
            // increment is always 1 — confirm whether gaps should be counted
            posIncrAtt.setPositionIncrement(skippedPositions +1 );

            // copy the Lexeme into the Lucene attributes:
            // term text
            termAtt.append(nextLexeme.getLexemeText());
            // term length
            termAtt.setLength(nextLexeme.getLength());
            // term offsets, corrected for any char filters
            offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));

            // remember the last end position for end()
            endPosition = nextLexeme.getEndPosition();
            // term type/category
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // true: another token was produced
            return true;
        }
        // false: the token stream is exhausted
        return false;
    }

    /*
     * (non-Javadoc)
     * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
        skippedPositions = 0;
    }

    /** Sets the final offset and position increment once the stream ends. */
    @Override
    public final void end() throws IOException {
        super.end();
        // set final offset
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
        posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
    }
}
131 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-descriptor.properties:
--------------------------------------------------------------------------------
1 | # Elasticsearch plugin descriptor file
2 | # This file must exist as 'plugin-descriptor.properties' at
3 | # the root directory of all plugins.
4 | #
5 | # A plugin can be 'site', 'jvm', or both.
6 | #
7 | ### example site plugin for "foo":
8 | #
9 | # foo.zip <-- zip file for the plugin, with this structure:
10 | # _site/ <-- the contents that will be served
11 | # plugin-descriptor.properties <-- example contents below:
12 | #
13 | # site=true
14 | # description=My cool plugin
15 | # version=1.0
16 | #
17 | ### example jvm plugin for "foo"
18 | #
19 | # foo.zip <-- zip file for the plugin, with this structure:
20 | # .jar <-- classes, resources, dependencies
21 | # .jar <-- any number of jars
22 | # plugin-descriptor.properties <-- example contents below:
23 | #
24 | # jvm=true
25 | # classname=foo.bar.BazPlugin
26 | # description=My cool plugin
27 | # version=2.0.0-rc1
28 | # elasticsearch.version=2.0
29 | # java.version=1.7
30 | #
31 | ### mandatory elements for all plugins:
32 | #
33 | # 'description': simple summary of the plugin
34 | description=${project.description}
35 | #
36 | # 'version': plugin's version
37 | version=${project.version}
38 | #
39 | # 'name': the plugin name
40 | name=${elasticsearch.plugin.name}
41 | #
42 | # 'classname': the name of the class to load, fully-qualified.
43 | classname=${elasticsearch.plugin.classname}
44 | #
45 | # 'java.version' version of java the code is built against
46 | # use the system property java.specification.version
47 | # version string must be a sequence of nonnegative decimal integers
48 | # separated by "."'s and may have leading zeros
49 | java.version=${maven.compiler.target}
50 | #
51 | # 'elasticsearch.version' version of elasticsearch compiled against
52 | # You will have to release a new version of the plugin for each new
53 | # elasticsearch release. This version is checked when the plugin
54 | # is loaded so Elasticsearch will refuse to start in the presence of
55 | # plugins with the incorrect elasticsearch.version.
56 | elasticsearch.version=${elasticsearch.version}
57 |
--------------------------------------------------------------------------------
/src/main/resources/plugin-security.policy:
--------------------------------------------------------------------------------
1 | grant {
2 | // needed because of the hot reload functionality
3 | permission java.net.SocketPermission "*", "connect,resolve";
4 | };
--------------------------------------------------------------------------------
/src/test/java/org/wltea/analyzer/TokenizerTest.java:
--------------------------------------------------------------------------------
1 | package org.wltea.analyzer;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
5 | import org.elasticsearch.common.settings.Settings;
6 | import org.junit.Test;
7 | import org.wltea.analyzer.cfg.Configuration;
8 | import org.wltea.analyzer.dic.RemoteDicMonitor;
9 | import org.wltea.analyzer.lucene.IKAnalyzer;
10 |
11 | import java.io.IOException;
12 | import java.io.StringReader;
13 | import java.util.Arrays;
14 |
15 | public class TokenizerTest {
16 |
17 | @Test
18 | public void testAnalyzer() throws IOException {
19 | Settings settings = Settings.builder()
20 | .put("use_smart", false)
21 | .put("enable_lowercase", false)
22 | .put("enable_remote_dict", false)
23 | .putList("ext_dic_main", Arrays.asList("http://intact.dic"))
24 | .build();
25 | Configuration configuration=new Configuration(null,settings) ;
26 |
27 | IKAnalyzer ik =new IKAnalyzer(configuration);
28 |
29 |
30 | // String t = "连身裙";
31 | // String t = "分词器";
32 | String t = "双肩包";
33 | TokenStream tokenStream = ik.tokenStream("", new StringReader(t));
34 | tokenStream.reset();
35 | CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
36 | while(tokenStream.incrementToken()){
37 | System.out.println(termAtt);
38 | }
39 | tokenStream.end();
40 | tokenStream.close();
41 | }
42 |
43 | @Test
44 | public void testRemoteFileLoad(){
45 |
46 | RemoteDicMonitor.RemoteDicFile remoteDicFile = new RemoteDicMonitor.RemoteDicFile("");
47 | remoteDicFile.setDicPath("http://intact.dic");
48 |
49 | RemoteDicMonitor monitor = new RemoteDicMonitor();
50 | System.out.println(monitor.getRemoteWordsUnprivileged(remoteDicFile.getDicPath()));
51 |
52 | monitor.runUnprivileged(remoteDicFile);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
|