├── README.md
├── pom.xml
└── src
    └── main
        └── java
            └── org
                └── wltea
                    └── analyzer
                        ├── cfg
                        │   ├── Configuration.java
                        │   └── DefaultConfig.java
                        ├── core
                        │   ├── AnalyzeContext.java
                        │   ├── CJKSegmenter.java
                        │   ├── CN_QuantifierSegmenter.java
                        │   ├── CharacterUtil.java
                        │   ├── IKArbitrator.java
                        │   ├── IKSegmenter.java
                        │   ├── ISegmenter.java
                        │   ├── LetterSegmenter.java
                        │   ├── Lexeme.java
                        │   ├── LexemePath.java
                        │   └── QuickSortSet.java
                        ├── dic
                        │   ├── DictSegment.java
                        │   ├── Dictionary.java
                        │   ├── Hit.java
                        │   ├── main2012.dic
                        │   └── quantifier.dic
                        ├── lucene
                        │   ├── IKAnalyzer.java
                        │   └── IKTokenizer.java
                        ├── query
                        │   ├── IKQueryExpressionParser.java
                        │   └── SWMCQueryBuilder.java
                        └── sample
                            ├── IKAnalzyerDemo.java
                            └── LuceneIndexAndSearchDemo.java
/README.md:
--------------------------------------------------------------------------------
1 | IKAnalyzer
2 | ==========
3 |
4 | An open-source Chinese word segmenter with Lucene support. See http://code.google.com/p/ik-analyzer/
5 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0">
3 |   <modelVersion>4.0.0</modelVersion>
4 |
5 |   <groupId>org.wltea.analyzer</groupId>
6 |   <artifactId>ikanalyzer</artifactId>
7 |   <version>4.10.0</version>
8 |   <packaging>jar</packaging>
9 |
10 |   <name>IKAnalyzer</name>
11 |   <url>http://maven.apache.org</url>
12 |
13 |   <properties>
14 |     <lucene.version>4.10.0</lucene.version>
15 |     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
16 |   </properties>
17 |
18 |   <dependencies>
19 |     <dependency>
20 |       <groupId>org.apache.lucene</groupId>
21 |       <artifactId>lucene-core</artifactId>
22 |       <version>${lucene.version}</version>
23 |     </dependency>
24 |     <dependency>
25 |       <groupId>org.apache.lucene</groupId>
26 |       <artifactId>lucene-queryparser</artifactId>
27 |       <version>${lucene.version}</version>
28 |     </dependency>
29 |     <dependency>
30 |       <groupId>org.apache.lucene</groupId>
31 |       <artifactId>lucene-analyzers-common</artifactId>
32 |       <version>${lucene.version}</version>
33 |     </dependency>
34 |   </dependencies>
35 |
36 |   <build>
37 |     <resources>
38 |       <resource>
39 |         <directory>src/main/java/org/wltea/analyzer/dic</directory>
40 |         <targetPath>org/wltea/analyzer/dic</targetPath>
41 |         <filtering>true</filtering>
42 |         <includes>
43 |           <include>*.dic</include>
44 |         </includes>
45 |       </resource>
46 |     </resources>
47 |   </build>
48 | </project>
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/cfg/Configuration.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.cfg;
26 |
27 | import java.util.List;
28 |
29 | /**
30 | *
31 | * Configuration management interface
32 | *
33 | */
34 | public interface Configuration {
35 |
36 | /**
37 | * Returns the useSmart flag.
38 | * useSmart = true: the segmenter uses the smart strategy; false: fine-grained segmentation
39 | * @return useSmart
40 | */
41 | public boolean useSmart();
42 |
43 | /**
44 | * Sets the useSmart flag.
45 | * useSmart = true: the segmenter uses the smart strategy; false: fine-grained segmentation
46 | * @param useSmart
47 | */
48 | public void setUseSmart(boolean useSmart);
49 |
50 | /**
51 | * Gets the main dictionary path.
52 | *
53 | * @return String main dictionary path
54 | */
55 | public String getMainDictionary();
56 |
57 | /**
58 | * Gets the quantifier dictionary path.
59 | * @return String quantifier dictionary path
60 | */
61 | public String getQuantifierDicionary();
62 |
63 | /**
64 | * Gets the configured extension dictionary paths.
65 | * @return List<String> paths relative to the class loader
66 | */
67 | public List<String> getExtDictionarys();
68 |
69 | /**
70 | * Gets the configured extension stopword dictionary paths.
71 | * @return List<String> paths relative to the class loader
72 | */
73 | public List<String> getExtStopWordDictionarys();
74 |
75 | }
76 |
--------------------------------------------------------------------------------
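
Because Configuration is a public interface, callers are not tied to DefaultConfig; IKSegmenter (further below) accepts any implementation. A minimal hand-rolled sketch, assuming the bundled classpath dictionaries and no extension dictionaries (FixedConfig is a hypothetical name, not part of the project):

    package org.wltea.analyzer.cfg;

    import java.util.Collections;
    import java.util.List;

    // Hypothetical fixed-path Configuration, for illustration only.
    public class FixedConfig implements Configuration {
        private boolean useSmart = true;

        public boolean useSmart() { return this.useSmart; }
        public void setUseSmart(boolean useSmart) { this.useSmart = useSmart; }
        // reuse the dictionaries bundled on the classpath
        public String getMainDictionary() { return "org/wltea/analyzer/dic/main2012.dic"; }
        public String getQuantifierDicionary() { return "org/wltea/analyzer/dic/quantifier.dic"; }
        // no extension or stopword dictionaries
        public List<String> getExtDictionarys() { return Collections.emptyList(); }
        public List<String> getExtStopWordDictionarys() { return Collections.emptyList(); }
    }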
/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.cfg;
27 |
28 | import java.io.IOException;
29 | import java.io.InputStream;
30 | import java.util.ArrayList;
31 | import java.util.InvalidPropertiesFormatException;
32 | import java.util.List;
33 | import java.util.Properties;
34 |
35 | /**
36 | * Default Configuration implementation
37 | * 2012-5-8
38 | *
39 | */
40 | public class DefaultConfig implements Configuration {
41 |
42 | /*
43 | * Default dictionary paths for the segmenter
44 | */
45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic";
46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic";
47 |
48 | /*
49 | * Path of the segmenter configuration file
50 | */
51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml";
52 | // Configuration property: extension dictionaries
53 | private static final String EXT_DICT = "ext_dict";
54 | // Configuration property: extension stopword dictionaries
55 | private static final String EXT_STOP = "ext_stopwords";
56 |
57 | private Properties props;
58 | /*
59 | * Whether to segment in smart mode
60 | */
61 | private boolean useSmart;
62 |
63 | /**
64 | * Returns a Configuration instance (note: a new DefaultConfig on each call)
65 | * @return Configuration instance
66 | */
67 | public static Configuration getInstance() {
68 | return new DefaultConfig();
69 | }
70 |
71 | /*
72 | * Loads the configuration file
73 | */
74 | private DefaultConfig() {
75 | props = new Properties();
76 |
77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME);
78 | if (input != null) {
79 | try {
80 | props.loadFromXML(input);
81 | } catch (InvalidPropertiesFormatException e) {
82 | e.printStackTrace();
83 | } catch (IOException e) {
84 | e.printStackTrace();
85 | }
86 | }
87 | }
88 |
89 | /**
90 | * Returns the useSmart flag.
91 | * useSmart = true: the segmenter uses the smart strategy; false: fine-grained segmentation
92 | * @return useSmart
93 | */
94 | public boolean useSmart() {
95 | return useSmart;
96 | }
97 |
98 | /**
99 | * Sets the useSmart flag.
100 | * useSmart = true: the segmenter uses the smart strategy; false: fine-grained segmentation
101 | * @param useSmart
102 | */
103 | public void setUseSmart(boolean useSmart) {
104 | this.useSmart = useSmart;
105 | }
106 |
107 | /**
108 | * Gets the main dictionary path.
109 | *
110 | * @return String main dictionary path
111 | */
112 | public String getMainDictionary() {
113 | return PATH_DIC_MAIN;
114 | }
115 |
116 | /**
117 | * Gets the quantifier dictionary path.
118 | * @return String quantifier dictionary path
119 | */
120 | public String getQuantifierDicionary() {
121 | return PATH_DIC_QUANTIFIER;
122 | }
123 |
124 | /**
125 | * Gets the configured extension dictionary paths.
126 | * @return List<String> paths relative to the class loader
127 | */
128 | public List<String> getExtDictionarys() {
129 | List<String> extDictFiles = new ArrayList<String>(2);
130 | String extDictCfg = props.getProperty(EXT_DICT);
131 | if (extDictCfg != null) {
132 | // Multiple extension dictionaries are separated by ';'
133 | String[] filePaths = extDictCfg.split(";");
134 | if (filePaths != null) {
135 | for (String filePath : filePaths) {
136 | if (filePath != null && !"".equals(filePath.trim())) {
137 | extDictFiles.add(filePath.trim());
138 | }
139 | }
140 | }
141 | }
142 | return extDictFiles;
143 | }
144 |
145 | /**
146 | * Gets the configured extension stopword dictionary paths.
147 | * @return List<String> paths relative to the class loader
148 | */
149 | public List<String> getExtStopWordDictionarys() {
150 | List<String> extStopWordDictFiles = new ArrayList<String>(2);
151 | String extStopWordDictCfg = props.getProperty(EXT_STOP);
152 | if (extStopWordDictCfg != null) {
153 | // Multiple extension stopword dictionaries are separated by ';'
154 | String[] filePaths = extStopWordDictCfg.split(";");
155 | if (filePaths != null) {
156 | for (String filePath : filePaths) {
157 | if (filePath != null && !"".equals(filePath.trim())) {
158 | extStopWordDictFiles.add(filePath.trim());
159 | }
160 | }
161 | }
162 | }
163 | return extStopWordDictFiles;
164 | }
165 |
166 | }
167 |
--------------------------------------------------------------------------------
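
DefaultConfig loads IKAnalyzer.cfg.xml from the classpath with Properties.loadFromXML(), so the file uses the standard Java XML properties format, with multiple paths separated by ';'. A sketch of what such a file could look like (the .dic file names here are placeholders):

    <?xml version="1.0" encoding="UTF-8"?>
    <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
    <properties>
        <comment>IK Analyzer extension configuration</comment>
        <!-- extension dictionaries, resolved relative to the class loader -->
        <entry key="ext_dict">ext.dic;mywords.dic</entry>
        <!-- extension stopword dictionaries -->
        <entry key="ext_stopwords">stopword.dic</entry>
    </properties>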
/src/main/java/org/wltea/analyzer/core/AnalyzeContext.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.io.IOException;
28 | import java.io.Reader;
29 | import java.util.HashMap;
30 | import java.util.HashSet;
31 | import java.util.LinkedList;
32 | import java.util.Map;
33 | import java.util.Set;
34 |
35 | import org.wltea.analyzer.cfg.Configuration;
36 | import org.wltea.analyzer.dic.Dictionary;
37 |
38 | /**
39 | *
40 | * Segmentation context state
41 | *
42 | */
43 | class AnalyzeContext {
44 |
45 | // Default buffer size
46 | private static final int BUFF_SIZE = 4096;
47 | // Threshold at which the buffer counts as nearly exhausted
48 | private static final int BUFF_EXHAUST_CRITICAL = 100;
49 |
50 | // Character read buffer
51 | private char[] segmentBuff;
52 | // Character type array
53 | private int[] charTypes;
54 |
55 | // Total length of text from the Reader that has been analyzed
56 | // When analyzing in multiple chunks, accumulates the offset of the current segmentBuff relative to the start of the reader
57 | private int buffOffset;
58 | // Current position pointer in the buffer
59 | private int cursor;
60 | // Length of the most recently read, processable text
61 | private int available;
62 |
63 | // Sub-segmenter locks
64 | // A non-empty set means a sub-segmenter is still occupying segmentBuff
65 | private Set<String> buffLocker;
66 |
67 | // Raw segmentation results, before ambiguity resolution
68 | private QuickSortSet orgLexemes;
69 | // Index of LexemePath by start position
70 | private Map<Integer, LexemePath> pathMap;
71 | // Final segmentation results
72 | private LinkedList<Lexeme> results;
73 |
74 | // Segmenter configuration
75 | private Configuration cfg;
76 |
77 | public AnalyzeContext(Configuration cfg) {
78 | this.cfg = cfg;
79 | this.segmentBuff = new char[BUFF_SIZE];
80 | this.charTypes = new int[BUFF_SIZE];
81 | this.buffLocker = new HashSet<String>();
82 | this.orgLexemes = new QuickSortSet();
83 | this.pathMap = new HashMap<Integer, LexemePath>();
84 | this.results = new LinkedList<Lexeme>();
85 | }
86 |
87 | int getCursor() {
88 | return this.cursor;
89 | }
90 |
91 | //
92 | // void setCursor(int cursor){
93 | // this.cursor = cursor;
94 | // }
95 |
96 | char[] getSegmentBuff() {
97 | return this.segmentBuff;
98 | }
99 |
100 | char getCurrentChar() {
101 | return this.segmentBuff[this.cursor];
102 | }
103 |
104 | int getCurrentCharType() {
105 | return this.charTypes[this.cursor];
106 | }
107 |
108 | int getBufferOffset() {
109 | return this.buffOffset;
110 | }
111 |
112 | /**
113 | * Fills segmentBuff according to the context state
114 | * @param reader
115 | * @return the length of the valid text available for analysis
116 | * @throws IOException
117 | */
118 | int fillBuffer(Reader reader) throws IOException {
119 | int readCount = 0;
120 | if (this.buffOffset == 0) {
121 | // First read from the reader
122 | readCount = reader.read(segmentBuff);
123 | } else {
124 | int offset = this.available - this.cursor;
125 | if (offset > 0) {
126 | // More was read than processed; copy the unprocessed text to the head of segmentBuff
127 | System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset);
128 | readCount = offset;
129 | }
130 | // Keep reading from the reader, filling the rest of segmentBuff after the carried-over text
131 | readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset);
132 | }
133 | // Record the usable char count of the last read from the Reader
134 | this.available = readCount;
135 | // Reset the current pointer
136 | this.cursor = 0;
137 | return readCount;
138 | }
139 |
140 | /**
141 | * Initializes the buffer pointer and processes the first character
142 | */
143 | void initCursor() {
144 | this.cursor = 0;
145 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
146 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
147 | }
148 |
149 | /**
150 | * Advances the pointer by one and processes the character at the new position.
151 | * Returns true on success; returns false when the pointer is already at the
152 | * end of the buffer and cannot advance.
153 | */
154 | boolean moveCursor() {
155 | if (this.cursor < this.available - 1) {
156 | this.cursor++;
157 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
158 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
159 | return true;
160 | } else {
161 | return false;
162 | }
163 | }
164 |
165 | /**
166 | * Marks the current segmentBuff as locked:
167 | * adds the name of the sub-segmenter occupying segmentBuff
168 | * @param segmenterName
169 | */
170 | void lockBuffer(String segmenterName) {
171 | this.buffLocker.add(segmenterName);
172 | }
173 |
174 | /**
175 | * Removes the given sub-segmenter name, releasing its hold on segmentBuff
176 | * @param segmenterName
177 | */
178 | void unlockBuffer(String segmenterName) {
179 | this.buffLocker.remove(segmenterName);
180 | }
181 |
182 | /**
183 | * The buffer is locked as long as buffLocker contains
184 | * any sub-segmenter name
185 | * @return boolean whether the buffer is locked
186 | */
187 | boolean isBufferLocked() {
188 | return this.buffLocker.size() > 0;
189 | }
190 |
191 | /**
192 | * Checks whether the current segmentBuff has been fully consumed,
193 | * i.e. the cursor has reached the end of segmentBuff at this.available - 1
194 | * @return
195 | */
196 | boolean isBufferConsumed() {
197 | return this.cursor == this.available - 1;
198 | }
199 |
200 | /**
201 | * Checks whether segmentBuff needs to read new data.
202 | *
203 | * The current loop must be interrupted (the buffer shifted and refilled) when
204 | * all of the following hold:
205 | * 1. available == BUFF_SIZE: the buffer is fully loaded
206 | * 2. cursor < available - 1 && cursor > available - BUFF_EXHAUST_CRITICAL: the pointer is in the critical tail region
207 | * 3. !context.isBufferLocked(): no segmenter is occupying the buffer
208 | * @return
209 | */
210 | boolean needRefillBuffer() {
211 | return this.available == BUFF_SIZE && this.cursor < this.available - 1
212 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL && !this.isBufferLocked();
213 | }
214 |
215 | /**
216 | * Accumulates the offset of the current segmentBuff relative to the start of the reader
217 | */
218 | void markBufferOffset() {
219 | this.buffOffset += this.cursor;
220 | }
221 |
222 | /**
223 | * Adds a lexeme to the raw result set
224 | * @param lexeme
225 | */
226 | void addLexeme(Lexeme lexeme) {
227 | this.orgLexemes.addLexeme(lexeme);
228 | }
229 |
230 | /**
231 | * Adds a segmentation result path
232 | * to the (path start position ---> path) map
233 | * @param path
234 | */
235 | void addLexemePath(LexemePath path) {
236 | if (path != null) {
237 | this.pathMap.put(path.getPathBegin(), path);
238 | }
239 | }
240 |
241 | /**
242 | * Returns the raw segmentation results
243 | * @return
244 | */
245 | QuickSortSet getOrgLexemes() {
246 | return this.orgLexemes;
247 | }
248 |
249 | /**
250 | * Pushes segmentation results into the result list:
251 | * 1. walk the buffer from the head to the processed position this.cursor
252 | * 2. push results present in the map into results
253 | * 3. push CJK characters absent from the map into results as single characters
254 | */
255 | void outputToResult() {
256 | int index = 0;
257 | for (; index <= this.cursor;) {
258 | // Skip non-CJK characters
259 | if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) {
260 | index++;
261 | continue;
262 | }
263 | // Look up the LexemePath starting at index in pathMap
264 | LexemePath path = this.pathMap.get(index);
265 | if (path != null) {
266 | // Emit the lexemes of the LexemePath into results
267 | Lexeme l = path.pollFirst();
268 | while (l != null) {
269 | this.results.add(l);
270 | // Move index past the lexeme
271 | index = l.getBegin() + l.getLength();
272 | l = path.pollFirst();
273 | if (l != null) {
274 | // Emit single characters skipped between lexemes inside the path
275 | for (; index < l.getBegin(); index++) {
276 | this.outputSingleCJK(index);
277 | }
278 | }
279 | }
280 | } else {// no LexemePath for index in pathMap
281 | // emit as a single character
282 | this.outputSingleCJK(index);
283 | index++;
284 | }
285 | }
286 | // Clear the map
287 | this.pathMap.clear();
288 | }
289 |
290 | /**
291 | * Emits a CJK character as a single-character lexeme
292 | * @param index
293 | */
294 | private void outputSingleCJK(int index) {
295 | if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) {
296 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR);
297 | this.results.add(singleCharLexeme);
298 | } else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) {
299 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK);
300 | this.results.add(singleCharLexeme);
301 | }
302 | }
303 |
304 | /**
305 | * Returns the next lexeme from the result list,
306 | *
307 | * merging compounds and skipping stopwords along the way
308 | * @return
309 | */
310 | Lexeme getNextLexeme() {
311 | // Take and remove the first Lexeme from the result list
312 | Lexeme result = this.results.pollFirst();
313 | while (result != null) {
314 | // Merge numerals and classifiers
315 | this.compound(result);
316 | if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(),
317 | result.getLength())) {
318 | // Stopword: take the next one from the list
319 | result = this.results.pollFirst();
320 | } else {
321 | // Not a stopword: generate the lexeme text and emit it
322 | result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength()));
323 | break;
324 | }
325 | }
326 | return result;
327 | }
328 |
329 | /**
330 | * Resets the segmentation context state
331 | */
332 | void reset() {
333 | this.buffLocker.clear();
334 | this.orgLexemes = new QuickSortSet();
335 | this.available = 0;
336 | this.buffOffset = 0;
337 | this.charTypes = new int[BUFF_SIZE];
338 | this.cursor = 0;
339 | this.results.clear();
340 | this.segmentBuff = new char[BUFF_SIZE];
341 | this.pathMap.clear();
342 | }
343 |
344 | /**
345 | * Merges compound lexemes (numeral + classifier)
346 | */
347 | private void compound(Lexeme result) {
348 | if (!this.cfg.useSmart()) {
349 | return;
350 | }
351 | // Numeral-classifier merging
352 | if (!this.results.isEmpty()) {
353 |
354 | if (Lexeme.TYPE_ARABIC == result.getLexemeType()) {
355 | Lexeme nextLexeme = this.results.peekFirst();
356 | boolean appendOk = false;
357 | if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) {
358 | // Merge Arabic numeral + Chinese numeral
359 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
360 | } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
361 | // Merge Arabic numeral + Chinese classifier
362 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
363 | }
364 | if (appendOk) {
365 | // Pop it
366 | this.results.pollFirst();
367 | }
368 | }
369 |
370 | // A second round of merging may apply
371 | if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) {
372 | Lexeme nextLexeme = this.results.peekFirst();
373 | boolean appendOk = false;
374 | if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) {
375 | // Merge Chinese numeral + Chinese classifier
376 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
377 | }
378 | if (appendOk) {
379 | // Pop it
380 | this.results.pollFirst();
381 | }
382 | }
383 |
384 | }
385 | }
386 |
387 | }
388 |
--------------------------------------------------------------------------------
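
The subtlest part of AnalyzeContext is fillBuffer(): on every refill after the first, the unprocessed tail of segmentBuff is copied to the front before more characters are read. A stripped-down sketch of that shift-and-refill pattern (not project code; unlike the original it also guards the -1 end-of-stream return explicitly):

    import java.io.IOException;
    import java.io.Reader;

    class ShiftRefillSketch {
        private static final int BUFF_SIZE = 4096;       // same size as AnalyzeContext
        private final char[] segmentBuff = new char[BUFF_SIZE];
        private int cursor;                              // current position in the buffer
        private int available;                           // valid chars currently in the buffer

        int refill(Reader reader) throws IOException {
            int carried = this.available - this.cursor;  // unprocessed tail length
            if (carried > 0) {
                // move the unprocessed chars to the head of the buffer
                System.arraycopy(segmentBuff, cursor, segmentBuff, 0, carried);
            }
            // fill the remainder; read() returns -1 at end of stream
            int read = reader.read(segmentBuff, carried, BUFF_SIZE - carried);
            this.available = carried + Math.max(read, 0);
            this.cursor = 0;
            return this.available;
        }
    }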
/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.LinkedList;
28 | import java.util.List;
29 |
30 | import org.wltea.analyzer.dic.Dictionary;
31 | import org.wltea.analyzer.dic.Hit;
32 |
33 | /**
34 | * Sub-segmenter for Chinese and other CJK (Japanese/Korean) text
35 | */
36 | class CJKSegmenter implements ISegmenter {
37 |
38 | // Sub-segmenter label
39 | static final String SEGMENTER_NAME = "CJK_SEGMENTER";
40 | // Queue of pending dictionary hits
41 | private List<Hit> tmpHits;
42 |
43 | CJKSegmenter() {
44 | this.tmpHits = new LinkedList<Hit>();
45 | }
46 |
47 | /*
48 | * (non-Javadoc)
49 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
50 | */
51 | public void analyze(AnalyzeContext context) {
52 | if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) {
53 |
54 | // Process the hits already in tmpHits first
55 | if (!this.tmpHits.isEmpty()) {
56 | // Process the pending hit queue
57 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
58 | for (Hit hit : tmpArray) {
59 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(),
60 | context.getCursor(), hit);
61 | if (hit.isMatch()) {
62 | // Emit the current word
63 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(),
64 | context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD);
65 | context.addLexeme(newLexeme);
66 |
67 | if (!hit.isPrefix()) {// not a word prefix; the hit cannot match further, remove it
68 | this.tmpHits.remove(hit);
69 | }
70 |
71 | } else if (hit.isUnmatch()) {
72 | // the hit is not a word; remove it
73 | this.tmpHits.remove(hit);
74 | }
75 | }
76 | }
77 |
78 | // *********************************
79 | // Then do a single-character match at the current cursor position
80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(),
81 | context.getCursor(), 1);
82 | if (singleCharHit.isMatch()) {// the character is itself a word
83 | // Emit the current word
84 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1,
85 | Lexeme.TYPE_CNWORD);
86 | context.addLexeme(newLexeme);
87 |
88 | // It is also a word prefix
89 | if (singleCharHit.isPrefix()) {
90 | // add the prefix match to the hit list
91 | this.tmpHits.add(singleCharHit);
92 | }
93 | } else if (singleCharHit.isPrefix()) {// the character is a word prefix
94 | // add the prefix match to the hit list
95 | this.tmpHits.add(singleCharHit);
96 | }
97 |
98 | } else {
99 | // Hit a CHAR_USELESS character
100 | // Clear the queue
101 | this.tmpHits.clear();
102 | }
103 |
104 | // If the buffer is fully consumed
105 | if (context.isBufferConsumed()) {
106 | // clear the queue
107 | this.tmpHits.clear();
108 | }
109 |
110 | // Decide whether to keep the buffer locked
111 | if (this.tmpHits.size() == 0) {
112 | context.unlockBuffer(SEGMENTER_NAME);
113 |
114 | } else {
115 | context.lockBuffer(SEGMENTER_NAME);
116 | }
117 | }
118 |
119 | /*
120 | * (non-Javadoc)
121 | * @see org.wltea.analyzer.core.ISegmenter#reset()
122 | */
123 | public void reset() {
124 | // Clear the queue
125 | this.tmpHits.clear();
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
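
The analyze() loop above is easier to follow once the Hit protocol is clear: matchInMainDict() seeds a hit at the current character, matchWithHit() extends it one character at a time, and a hit can simultaneously be a match (a complete word) and a prefix (a longer word may follow). A hand-driven sketch of that incremental matching, assuming the bundled main2012.dic contains the usual entries for the sample text:

    import org.wltea.analyzer.cfg.DefaultConfig;
    import org.wltea.analyzer.dic.Dictionary;
    import org.wltea.analyzer.dic.Hit;

    public class MatchSketch {
        public static void main(String[] args) {
            Dictionary.initial(DefaultConfig.getInstance());
            char[] text = "中华人民共和国".toCharArray();
            // seed with the first character, then extend one char at a time
            Hit hit = Dictionary.getSingleton().matchInMainDict(text, 0, 1);
            for (int i = 1; i < text.length && hit.isPrefix(); i++) {
                hit = Dictionary.getSingleton().matchWithHit(text, i, hit);
                if (hit.isMatch()) { // a complete dictionary word ends at position i
                    System.out.println(new String(text, hit.getBegin(), i - hit.getBegin() + 1));
                }
            }
        }
    }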
/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.HashSet;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Set;
31 |
32 | import org.wltea.analyzer.dic.Dictionary;
33 | import org.wltea.analyzer.dic.Hit;
34 |
35 | /**
36 | *
37 | * Sub-segmenter for Chinese numerals and classifiers
38 | */
39 | class CN_QuantifierSegmenter implements ISegmenter {
40 |
41 | // Sub-segmenter label
42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER";
43 |
44 | // 中文数词
45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum
46 | private static Set ChnNumberChars = new HashSet();
47 | static {
48 | char[] ca = Chn_Num.toCharArray();
49 | for (char nChar : ca) {
50 | ChnNumberChars.add(nChar);
51 | }
52 | }
53 |
54 | /*
55 | * Start position of the lexeme; doubles as the sub-segmenter state flag: when nStart > -1 the segmenter is processing characters
56 | */
57 | private int nStart;
58 | /*
59 | * End position of the lexeme; nEnd records the end of the last valid numeral character seen
60 | */
61 | private int nEnd;
62 |
63 | // Queue of pending classifier hits
64 | private List<Hit> countHits;
65 |
66 | CN_QuantifierSegmenter() {
67 | nStart = -1;
68 | nEnd = -1;
69 | this.countHits = new LinkedList<Hit>();
70 | }
71 |
72 | /**
73 | * Segmentation pass
74 | */
75 | public void analyze(AnalyzeContext context) {
76 | // Process Chinese numerals
77 | this.processCNumber(context);
78 | // Process Chinese classifiers
79 | this.processCount(context);
80 |
81 | // Decide whether to keep the buffer locked
82 | if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
83 | // Unlock the buffer
84 | context.unlockBuffer(SEGMENTER_NAME);
85 | } else {
86 | context.lockBuffer(SEGMENTER_NAME);
87 | }
88 | }
89 |
90 | /**
91 | * Resets the sub-segmenter state
92 | */
93 | public void reset() {
94 | nStart = -1;
95 | nEnd = -1;
96 | countHits.clear();
97 | }
98 |
99 | /**
100 | * Processes Chinese numerals
101 | */
102 | private void processCNumber(AnalyzeContext context) {
103 | if (nStart == -1 && nEnd == -1) {// initial state
104 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
105 | && ChnNumberChars.contains(context.getCurrentChar())) {
106 | // Record the numeral's start and end positions
107 | nStart = context.getCursor();
108 | nEnd = context.getCursor();
109 | }
110 | } else {// processing state
111 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
112 | && ChnNumberChars.contains(context.getCurrentChar())) {
113 | // Record the numeral's end position
114 | nEnd = context.getCursor();
115 | } else {
116 | // Emit the numeral
117 | this.outputNumLexeme(context);
118 | // Reset the head and tail pointers
119 | nStart = -1;
120 | nEnd = -1;
121 | }
122 | }
123 |
124 | // Buffer exhausted with a numeral still pending
125 | if (context.isBufferConsumed()) {
126 | if (nStart != -1 && nEnd != -1) {
127 | // Emit the numeral
128 | outputNumLexeme(context);
129 | // Reset the head and tail pointers
130 | nStart = -1;
131 | nEnd = -1;
132 | }
133 | }
134 | }
135 |
136 | /**
137 | * Processes Chinese classifiers
138 | * @param context
139 | */
140 | private void processCount(AnalyzeContext context) {
141 | // Check whether classifier scanning should start
142 | if (!this.needCountScan(context)) {
143 | return;
144 | }
145 |
146 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {
147 |
148 | // Process the hits already in countHits first
149 | if (!this.countHits.isEmpty()) {
150 | // Process the pending hit queue
151 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
152 | for (Hit hit : tmpArray) {
153 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(),
154 | context.getCursor(), hit);
155 | if (hit.isMatch()) {
156 | // Emit the current word
157 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(),
158 | context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
159 | context.addLexeme(newLexeme);
160 |
161 | if (!hit.isPrefix()) {// not a word prefix; the hit cannot match further, remove it
162 | this.countHits.remove(hit);
163 | }
164 |
165 | } else if (hit.isUnmatch()) {
166 | // the hit is not a word; remove it
167 | this.countHits.remove(hit);
168 | }
169 | }
170 | }
171 |
172 | // *********************************
173 | // Do a single-character match at the current cursor position
174 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(),
175 | context.getCursor(), 1);
176 | if (singleCharHit.isMatch()) {// the character is itself a classifier
177 | // Emit the current word
178 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1,
179 | Lexeme.TYPE_COUNT);
180 | context.addLexeme(newLexeme);
181 |
182 | // It is also a word prefix
183 | if (singleCharHit.isPrefix()) {
184 | // add the prefix match to the hit list
185 | this.countHits.add(singleCharHit);
186 | }
187 | } else if (singleCharHit.isPrefix()) {// the character is a classifier prefix
188 | // add the prefix match to the hit list
189 | this.countHits.add(singleCharHit);
190 | }
191 |
192 | } else {
193 | // Not a Chinese character:
194 | // discard partially formed classifiers
195 | this.countHits.clear();
196 | }
197 |
198 | // Buffer fully consumed with classifiers still pending
199 | if (context.isBufferConsumed()) {
200 | // discard partially formed classifiers
201 | this.countHits.clear();
202 | }
203 | }
204 |
205 | /**
206 | * Checks whether classifier scanning is needed
207 | * @return
208 | */
209 | private boolean needCountScan(AnalyzeContext context) {
210 | if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
211 | // Currently processing a Chinese numeral or a classifier
212 | return true;
213 | } else {
214 | // Look for an immediately preceding numeral
215 | if (!context.getOrgLexemes().isEmpty()) {
216 | Lexeme l = context.getOrgLexemes().peekLast();
217 | if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
218 | if (l.getBegin() + l.getLength() == context.getCursor()) {
219 | return true;
220 | }
221 | }
222 | }
223 | }
224 | return false;
225 | }
226 |
227 | /**
228 | * Adds a numeral lexeme to the result set
229 | * @param context
230 | */
231 | private void outputNumLexeme(AnalyzeContext context) {
232 | if (nStart > -1 && nEnd > -1) {
233 | // Emit the numeral
234 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1,
235 | Lexeme.TYPE_CNUM);
236 | context.addLexeme(newLexeme);
237 |
238 | }
239 | }
240 |
241 | }
242 |
--------------------------------------------------------------------------------
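
Together with AnalyzeContext.compound() above, this segmenter is where smart mode's numeral-classifier merging comes from: processCNumber() emits a numeral such as 三 as TYPE_CNUM, processCount() emits a classifier such as 个 as TYPE_COUNT, and compound() then fuses the adjacent pair into a single TYPE_CQUAN lexeme. A quick way to observe the difference (a sketch; the exact tokens depend on the bundled dictionaries):

    import java.io.IOException;
    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class CompoundSketch {
        public static void main(String[] args) throws IOException {
            for (boolean smart : new boolean[] { true, false }) {
                IKSegmenter ik = new IKSegmenter(new StringReader("三个"), smart);
                Lexeme l;
                while ((l = ik.next()) != null) {
                    System.out.println("useSmart=" + smart + " -> " + l.getLexemeText()
                            + " [" + l.getLexemeTypeString() + "]");
                }
            }
        }
    }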
/src/main/java/org/wltea/analyzer/core/CharacterUtil.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | * Character type identification utility
25 | */
26 | package org.wltea.analyzer.core;
27 |
28 | /**
29 | *
30 | * Character type identification utility
31 | */
32 | class CharacterUtil {
33 |
34 | public static final int CHAR_USELESS = 0;
35 |
36 | public static final int CHAR_ARABIC = 0X00000001;
37 |
38 | public static final int CHAR_ENGLISH = 0X00000002;
39 |
40 | public static final int CHAR_CHINESE = 0X00000004;
41 |
42 | public static final int CHAR_OTHER_CJK = 0X00000008;
43 |
44 | /**
45 | * Identifies the character type
46 | * @param input
47 | * @return int one of the character type constants defined in CharacterUtil
48 | */
49 | static int identifyCharType(char input) {
50 | if (input >= '0' && input <= '9') {
51 | return CHAR_ARABIC;
52 |
53 | } else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) {
54 | return CHAR_ENGLISH;
55 |
56 | } else {
57 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input);
58 |
59 | if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
60 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
61 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) {
62 | // Currently recognized Chinese ideograph blocks
63 | return CHAR_CHINESE;
64 |
65 | } else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // full-width forms and Japanese/Korean characters
66 | // Korean (Hangul) blocks
67 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES
68 | || ub == Character.UnicodeBlock.HANGUL_JAMO
69 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO
70 | // Japanese blocks
71 | || ub == Character.UnicodeBlock.HIRAGANA // hiragana
72 | || ub == Character.UnicodeBlock.KATAKANA // katakana
73 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) {
74 | return CHAR_OTHER_CJK;
75 |
76 | }
77 | }
78 | // All other characters are left unprocessed
79 | return CHAR_USELESS;
80 | }
81 |
82 | /**
83 | * Normalizes a character (full-width to half-width, upper case to lower case)
84 | * @param input
85 | * @return char
86 | */
87 | static char regularize(char input) {
88 | if (input == 12288) {
89 | input = (char) 32;
90 |
91 | } else if (input > 65280 && input < 65375) {
92 | input = (char) (input - 65248);
93 |
94 | } else if (input >= 'A' && input <= 'Z') {
95 | input += 32;
96 | }
97 |
98 | return input;
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
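
A small demonstration of the arithmetic behind regularize(): the full-width forms U+FF01..U+FF5E sit exactly 65248 code points above their ASCII counterparts U+0021..U+007E, the ideographic space U+3000 (12288) maps to an ordinary space (32), and upper-case letters are lowered by adding 32 (a standalone sketch, not project code):

    public class RegularizeSketch {
        public static void main(String[] args) {
            char fullWidthA = '\uFF21';                     // 'Ａ', code point 65313
            char halfWidthA = (char) (fullWidthA - 65248);  // 'A', code point 65
            char lowered = (char) (halfWidthA + 32);        // 'a', code point 97
            System.out.println(halfWidthA + " " + lowered); // prints: A a
        }
    }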
/src/main/java/org/wltea/analyzer/core/IKArbitrator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Stack;
28 | import java.util.TreeSet;
29 |
30 | /**
31 | * IK segmentation ambiguity arbitrator
32 | */
33 | class IKArbitrator {
34 |
35 | IKArbitrator() {
36 |
37 | }
38 |
39 | /**
40 | * Resolves segmentation ambiguity
41 | * @param context
42 | * @param useSmart
43 | */
44 | void process(AnalyzeContext context, boolean useSmart) {
45 | QuickSortSet orgLexemes = context.getOrgLexemes();
46 | Lexeme orgLexeme = orgLexemes.pollFirst();
47 |
48 | LexemePath crossPath = new LexemePath();
49 | while (orgLexeme != null) {
50 | if (!crossPath.addCrossLexeme(orgLexeme)) {
51 | // the lexeme does not cross the current crossPath; close this one out
52 | if (crossPath.size() == 1 || !useSmart) {
53 | // crossPath has no ambiguity, or ambiguity handling is disabled:
54 | // emit the current crossPath directly
55 | context.addLexemePath(crossPath);
56 | } else {
57 | // Disambiguate the current crossPath
58 | QuickSortSet.Cell headCell = crossPath.getHead();
59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
60 | // Emit the arbitration result judgeResult
61 | context.addLexemePath(judgeResult);
62 | }
63 |
64 | // Start a new crossPath with orgLexeme
65 | crossPath = new LexemePath();
66 | crossPath.addCrossLexeme(orgLexeme);
67 | }
68 | orgLexeme = orgLexemes.pollFirst();
69 | }
70 |
71 | // Handle the final path
72 | if (crossPath.size() == 1 || !useSmart) {
73 | // crossPath has no ambiguity, or ambiguity handling is disabled:
74 | // emit the current crossPath directly
75 | context.addLexemePath(crossPath);
76 | } else {
77 | // Disambiguate the current crossPath
78 | QuickSortSet.Cell headCell = crossPath.getHead();
79 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength());
80 | // Emit the arbitration result judgeResult
81 | context.addLexemePath(judgeResult);
82 | }
83 | }
84 |
85 | /**
86 | * Ambiguity arbitration
87 | * @param lexemeCell head of the ambiguous lexeme chain
88 | * @param fullTextLength text length of the ambiguous path
89 | *
90 | * @return LexemePath the best candidate path
91 | */
92 | private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) {
93 | // Candidate path set
94 | TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>();
95 | // Candidate result path
96 | LexemePath option = new LexemePath();
97 |
98 | // Traverse crossPath once, collecting the conflicting lexemes on a stack
99 | Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell, option);
100 |
101 | // The current chain may not be optimal; add it to the candidate set
102 | pathOptions.add(option.copy());
103 |
104 | // Process the ambiguous lexemes
105 | QuickSortSet.Cell c = null;
106 | while (!lexemeStack.isEmpty()) {
107 | c = lexemeStack.pop();
108 | // Roll back the lexeme chain
109 | this.backPath(c.getLexeme(), option);
110 | // From the ambiguous position, walk forward again to build an alternative
111 | this.forwardPath(c, option);
112 | pathOptions.add(option.copy());
113 | }
114 |
115 | // Return the best option in the set
116 | return pathOptions.first();
117 |
118 | }
119 |
120 | /**
121 | * Walks forward adding lexemes, building a cross-free lexeme combination
122 | * @param option the candidate path under construction
123 | * @return the stack of conflicting lexemes
124 | */
125 | private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) {
126 | // Stack of conflicting lexemes
127 | Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>();
128 | QuickSortSet.Cell c = lexemeCell;
129 | // Iterate over the lexeme chain
130 | while (c != null && c.getLexeme() != null) {
131 | if (!option.addNotCrossLexeme(c.getLexeme())) {
132 | // the lexeme crosses the path; on failed add, push it onto the conflict stack
133 | conflictStack.push(c);
134 | }
135 | c = c.getNext();
136 | }
137 | return conflictStack;
138 | }
139 |
140 | /**
141 | * Rolls the path back until it can accept the given lexeme
142 | * @param l
143 | * @param option
144 | */
145 | private void backPath(Lexeme l, LexemePath option) {
146 | while (option.checkCross(l)) {
147 | option.removeTail();
148 | }
149 |
150 | }
151 |
152 | }
153 |
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/IKSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | */
24 | package org.wltea.analyzer.core;
25 |
26 | import java.io.IOException;
27 | import java.io.Reader;
28 | import java.util.ArrayList;
29 | import java.util.List;
30 |
31 | import org.wltea.analyzer.cfg.Configuration;
32 | import org.wltea.analyzer.cfg.DefaultConfig;
33 | import org.wltea.analyzer.dic.Dictionary;
34 |
35 | /**
36 | * Main IK segmenter class
37 | *
38 | */
39 | public final class IKSegmenter {
40 |
41 | // Input character reader
42 | private Reader input;
43 | // Segmenter configuration
44 | private Configuration cfg;
45 | // Segmentation context
46 | private AnalyzeContext context;
47 | // List of sub-segmenters
48 | private List<ISegmenter> segmenters;
49 | // Ambiguity arbitrator
50 | private IKArbitrator arbitrator;
51 |
52 | /**
53 | * IK segmenter constructor
54 | * @param input
55 | * @param useSmart true to use the smart segmentation strategy
56 | *
57 | * Non-smart segmentation: fine-grained output of every possible split
58 | * Smart segmentation: merges numerals and classifiers and disambiguates the results
59 | */
60 | public IKSegmenter(Reader input, boolean useSmart) {
61 | this.input = input;
62 | this.cfg = DefaultConfig.getInstance();
63 | this.cfg.setUseSmart(useSmart);
64 | this.init();
65 | }
66 |
67 | /**
68 | * IK segmenter constructor
69 | * @param input
70 | * @param cfg custom Configuration used to build the segmenter
71 | *
72 | */
73 | public IKSegmenter(Reader input, Configuration cfg) {
74 | this.input = input;
75 | this.cfg = cfg;
76 | this.init();
77 | }
78 |
79 | /**
80 | * Initialization
81 | */
82 | private void init() {
83 | // Initialize the dictionary singleton
84 | Dictionary.initial(this.cfg);
85 | // Initialize the segmentation context
86 | this.context = new AnalyzeContext(this.cfg);
87 | // Load the sub-segmenters
88 | this.segmenters = this.loadSegmenters();
89 | // Create the ambiguity arbitrator
90 | this.arbitrator = new IKArbitrator();
91 | }
92 |
93 | /**
94 | * Loads the sub-segmenter implementations
95 | * @return List<ISegmenter>
96 | */
97 | private List<ISegmenter> loadSegmenters() {
98 | List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
99 | // Sub-segmenter for letters and digits
100 | segmenters.add(new LetterSegmenter());
101 | // Sub-segmenter for Chinese numerals and classifiers
102 | segmenters.add(new CN_QuantifierSegmenter());
103 | // Sub-segmenter for Chinese words
104 | segmenters.add(new CJKSegmenter());
105 | return segmenters;
106 | }
107 |
108 | /**
109 | * Segments the input and returns the next lexeme
110 | * @return Lexeme the next lexeme, or null when the input is exhausted
111 | * @throws IOException
112 | */
113 | public synchronized Lexeme next() throws IOException {
114 | Lexeme l = null;
115 | while ((l = context.getNextLexeme()) == null) {
116 | /*
117 | * Read data from the reader to fill the buffer. If the reader is consumed in several reads, the buffer is shifted so that data read earlier but not yet processed is carried over.
118 | */
119 | int available = context.fillBuffer(this.input);
120 | if (available <= 0) {
121 | // The reader is exhausted
122 | context.reset();
123 | return null;
124 |
125 | } else {
126 | // Initialize the cursor
127 | context.initCursor();
128 | do {
129 | // Run each sub-segmenter
130 | for (ISegmenter segmenter : segmenters) {
131 | segmenter.analyze(context);
132 | }
133 | // The buffer is nearly consumed; new characters must be read
134 | if (context.needRefillBuffer()) {
135 | break;
136 | }
137 | // Advance the cursor
138 | } while (context.moveCursor());
139 | // Reset the sub-segmenters for the next round
140 | for (ISegmenter segmenter : segmenters) {
141 | segmenter.reset();
142 | }
143 | }
144 | // Disambiguate the results
145 | this.arbitrator.process(context, this.cfg.useSmart());
146 | // Flush results to the output, emitting unsegmented single CJK characters
147 | context.outputToResult();
148 | // Record the buffer offset for this pass
149 | context.markBufferOffset();
150 | }
151 | return l;
152 | }
153 |
154 | /**
155 | * Resets the segmenter to its initial state
156 | * @param input
157 | */
158 | public synchronized void reset(Reader input) {
159 | this.input = input;
160 | context.reset();
161 | for (ISegmenter segmenter : segmenters) {
162 | segmenter.reset();
163 | }
164 | }
165 | }
166 |
--------------------------------------------------------------------------------
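
Putting the pieces together, typical use of the segmenter is a construct-and-poll loop; next() returns null once the reader is exhausted. A minimal usage sketch (the bundled sample/IKAnalzyerDemo.java presumably shows a similar pattern via the Lucene Analyzer):

    import java.io.IOException;
    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class TokenizeSketch {
        public static void main(String[] args) throws IOException {
            IKSegmenter ik = new IKSegmenter(new StringReader("IK Analyzer是一个开源的中文分词工具包"), true);
            Lexeme lexeme;
            while ((lexeme = ik.next()) != null) {
                System.out.println(lexeme.getBeginPosition() + "-" + lexeme.getEndPosition()
                        + " : " + lexeme.getLexemeText() + " [" + lexeme.getLexemeTypeString() + "]");
            }
        }
    }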
/src/main/java/org/wltea/analyzer/core/ISegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | *
29 | * Sub-segmenter interface
30 | */
31 | interface ISegmenter {
32 |
33 | /**
34 | * Reads the next candidate lexeme from the analyzer
35 | * @param context the segmentation context
36 | */
37 | void analyze(AnalyzeContext context);
38 |
39 | /**
40 | * Resets the sub-segmenter state
41 | */
42 | void reset();
43 |
44 | }
45 |
--------------------------------------------------------------------------------
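
Since the interface is package-private, any new sub-segmenter would have to live in org.wltea.analyzer.core, and IKSegmenter.loadSegmenters() would need to register it. A skeletal sketch of the contract (NoOpSegmenter is a hypothetical name, not part of the project):

    package org.wltea.analyzer.core;

    class NoOpSegmenter implements ISegmenter {
        public void analyze(AnalyzeContext context) {
            // inspect context.getCurrentChar() / getCurrentCharType(),
            // emit results via context.addLexeme(...), and call
            // context.lockBuffer(...) / unlockBuffer(...) as needed
        }

        public void reset() {
            // clear any state carried between buffer passes
        }
    }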
/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | import java.util.Arrays;
28 |
29 | /**
30 | *
31 | * Sub-segmenter for English letters and Arabic digits
32 | */
33 | class LetterSegmenter implements ISegmenter {
34 |
35 | // Sub-segmenter label
36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
37 | // Letter connector characters
38 | private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' };
39 |
40 | // Numeric connector characters
41 | private static final char[] Num_Connector = new char[] { ',', '.' };
42 |
43 | /*
44 | * Start position of the lexeme; doubles as the state flag: when start > -1 the segmenter is processing characters
45 | */
46 | private int start;
47 | /*
48 | * End position of the lexeme; end records the position of the last letter character that is not a connector
49 | */
50 | private int end;
51 |
52 | /*
53 | * Start of an English letter run
54 | */
55 | private int englishStart;
56 |
57 | /*
58 | * End of an English letter run
59 | */
60 | private int englishEnd;
61 |
62 | /*
63 | * Start of an Arabic digit run
64 | */
65 | private int arabicStart;
66 |
67 | /*
68 | * End of an Arabic digit run
69 | */
70 | private int arabicEnd;
71 |
72 | LetterSegmenter() {
73 | Arrays.sort(Letter_Connector);
74 | Arrays.sort(Num_Connector);
75 | this.start = -1;
76 | this.end = -1;
77 | this.englishStart = -1;
78 | this.englishEnd = -1;
79 | this.arabicStart = -1;
80 | this.arabicEnd = -1;
81 | }
82 |
83 | /*
84 | * (non-Javadoc)
85 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
86 | */
87 | public void analyze(AnalyzeContext context) {
88 | boolean bufferLockFlag = false;
89 | // Process English letters
90 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
91 | // Process Arabic digits
92 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
93 | // Process mixed alphanumerics (kept last so QuickSortSet can drop duplicates)
94 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;
95 |
96 | // Decide whether the buffer must stay locked
97 | if (bufferLockFlag) {
98 | context.lockBuffer(SEGMENTER_NAME);
99 | } else {
100 | // Unlock the buffer
101 | context.unlockBuffer(SEGMENTER_NAME);
102 | }
103 | }
104 |
105 | /*
106 | * (non-Javadoc)
107 | * @see org.wltea.analyzer.core.ISegmenter#reset()
108 | */
109 | public void reset() {
110 | this.start = -1;
111 | this.end = -1;
112 | this.englishStart = -1;
113 | this.englishEnd = -1;
114 | this.arabicStart = -1;
115 | this.arabicEnd = -1;
116 | }
117 |
118 | /**
119 | * Handles mixed alphanumeric output,
120 | * e.g. windows2000 | linliangyi2005@gmail.com
121 | *
122 | * @param context
123 | * @return whether the buffer must stay locked
124 | */
125 | private boolean processMixLetter(AnalyzeContext context) {
126 | boolean needLock = false;
127 |
128 | if (this.start == -1) {// the segmenter has not started processing yet
129 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
130 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
131 | // Record the start position; the segmenter enters the processing state
132 | this.start = context.getCursor();
133 | this.end = start;
134 | }
135 |
136 | } else {// the segmenter is processing characters
137 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
138 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
139 | // Record a possible end position
140 | this.end = context.getCursor();
141 |
142 | } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
143 | && this.isLetterConnector(context.getCurrentChar())) {
144 | // Record a possible end position
145 | this.end = context.getCursor();
146 | } else {
147 | // Non-letter character: emit the lexeme
148 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
149 | + 1, Lexeme.TYPE_LETTER);
150 | context.addLexeme(newLexeme);
151 | this.start = -1;
152 | this.end = -1;
153 | }
154 | }
155 |
156 | // If the buffer is fully consumed
157 | if (context.isBufferConsumed()) {
158 | if (this.start != -1 && this.end != -1) {
159 | // buffer fully read: emit the pending lexeme
160 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start
161 | + 1, Lexeme.TYPE_LETTER);
162 | context.addLexeme(newLexeme);
163 | this.start = -1;
164 | this.end = -1;
165 | }
166 | }
167 |
168 | // Decide whether the buffer must stay locked
169 | if (this.start == -1 && this.end == -1) {
170 | // unlock the buffer
171 | needLock = false;
172 | } else {
173 | needLock = true;
174 | }
175 | return needLock;
176 | }
177 |
178 | /**
179 | * Handles pure English letter output
180 | * @param context
181 | * @return
182 | */
183 | private boolean processEnglishLetter(AnalyzeContext context) {
184 | boolean needLock = false;
185 |
186 | if (this.englishStart == -1) {// the segmenter has not started processing English characters
187 | if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
188 | // Record the start position; the segmenter enters the processing state
189 | this.englishStart = context.getCursor();
190 | this.englishEnd = this.englishStart;
191 | }
192 | } else {// the segmenter is processing English characters
193 | if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
194 | // Record the current cursor position as the end
195 | this.englishEnd = context.getCursor();
196 | } else {
197 | // Non-English character: emit the lexeme
198 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
199 | - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
200 | context.addLexeme(newLexeme);
201 | this.englishStart = -1;
202 | this.englishEnd = -1;
203 | }
204 | }
205 |
206 | // If the buffer is fully consumed
207 | if (context.isBufferConsumed()) {
208 | if (this.englishStart != -1 && this.englishEnd != -1) {
209 | // buffer fully read: emit the pending lexeme
210 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd
211 | - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
212 | context.addLexeme(newLexeme);
213 | this.englishStart = -1;
214 | this.englishEnd = -1;
215 | }
216 | }
217 |
218 | // Decide whether the buffer must stay locked
219 | if (this.englishStart == -1 && this.englishEnd == -1) {
220 | // unlock the buffer
221 | needLock = false;
222 | } else {
223 | needLock = true;
224 | }
225 | return needLock;
226 | }
227 |
228 | /**
229 | * Handles Arabic digit output
230 | * @param context
231 | * @return
232 | */
233 | private boolean processArabicLetter(AnalyzeContext context) {
234 | boolean needLock = false;
235 |
236 | if (this.arabicStart == -1) {// the segmenter has not started processing digit characters
237 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
238 | // Record the start position; the segmenter enters the processing state
239 | this.arabicStart = context.getCursor();
240 | this.arabicEnd = this.arabicStart;
241 | }
242 | } else {// the segmenter is processing digit characters
243 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
244 | // Record the current cursor position as the end
245 | this.arabicEnd = context.getCursor();
246 | } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
247 | && this.isNumConnector(context.getCurrentChar())) {
248 | // connector inside a number: emit nothing, but do not mark the end
249 | } else {
250 | // Non-Arabic character: emit the lexeme
251 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
252 | - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
253 | context.addLexeme(newLexeme);
254 | this.arabicStart = -1;
255 | this.arabicEnd = -1;
256 | }
257 | }
258 |
259 | // If the buffer is fully consumed
260 | if (context.isBufferConsumed()) {
261 | if (this.arabicStart != -1 && this.arabicEnd != -1) {
262 | // emit the pending lexeme
263 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd
264 | - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
265 | context.addLexeme(newLexeme);
266 | this.arabicStart = -1;
267 | this.arabicEnd = -1;
268 | }
269 | }
270 |
271 | // Decide whether the buffer must stay locked
272 | if (this.arabicStart == -1 && this.arabicEnd == -1) {
273 | // unlock the buffer
274 | needLock = false;
275 | } else {
276 | needLock = true;
277 | }
278 | return needLock;
279 | }
280 |
281 | /**
282 | * Checks whether the character is a letter connector
283 | * @param input
284 | * @return
285 | */
286 | private boolean isLetterConnector(char input) {
287 | int index = Arrays.binarySearch(Letter_Connector, input);
288 | return index >= 0;
289 | }
290 |
291 | /**
292 | * Checks whether the character is a numeric connector
293 | * @param input
294 | * @return
295 | */
296 | private boolean isNumConnector(char input) {
297 | int index = Arrays.binarySearch(Num_Connector, input);
298 | return index >= 0;
299 | }
300 | }
301 |
--------------------------------------------------------------------------------
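
The connector checks at the bottom are a sorted-array membership test: the constructor sorts Letter_Connector and Num_Connector once so Arrays.binarySearch() can be used per character. The same pattern in isolation (a standalone sketch):

    import java.util.Arrays;

    public class ConnectorSketch {
        public static void main(String[] args) {
            char[] connectors = { '#', '&', '+', '-', '.', '@', '_' };
            Arrays.sort(connectors); // binarySearch requires a sorted array
            System.out.println(Arrays.binarySearch(connectors, '@') >= 0); // true
            System.out.println(Arrays.binarySearch(connectors, '!') >= 0); // false
        }
    }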
/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK Chinese Word Segmentation, version 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * Source code provided by Lin Liangyi (linliangyi2005@gmail.com)
21 | * Copyright 2012, Oolong Studio
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * IK lexeme object
29 | */
30 | public class Lexeme implements Comparable<Lexeme> {
31 | // lexemeType constants
32 | // Unknown
33 | public static final int TYPE_UNKNOWN = 0;
34 | // English
35 | public static final int TYPE_ENGLISH = 1;
36 | // Arabic numeral
37 | public static final int TYPE_ARABIC = 2;
38 | // Mixed English letters and digits
39 | public static final int TYPE_LETTER = 3;
40 | // Chinese word
41 | public static final int TYPE_CNWORD = 4;
42 | // Chinese single character
43 | public static final int TYPE_CNCHAR = 64;
44 | // Other CJK (Japanese/Korean)
45 | public static final int TYPE_OTHER_CJK = 8;
46 | // Chinese numeral
47 | public static final int TYPE_CNUM = 16;
48 | // Chinese classifier
49 | public static final int TYPE_COUNT = 32;
50 | // Chinese numeral-classifier compound
51 | public static final int TYPE_CQUAN = 48;
52 |
53 | // Start offset of the lexeme (offset of its buffer)
54 | private int offset;
55 | // Relative start position of the lexeme
56 | private int begin;
57 | // Length of the lexeme
58 | private int length;
59 | // Lexeme text
60 | private String lexemeText;
61 | // Lexeme type
62 | private int lexemeType;
63 |
64 | public Lexeme(int offset, int begin, int length, int lexemeType) {
65 | this.offset = offset;
66 | this.begin = begin;
67 | if (length < 0) {
68 | throw new IllegalArgumentException("length < 0");
69 | }
70 | this.length = length;
71 | this.lexemeType = lexemeType;
72 | }
73 |
74 | /*
75 | * Lexeme equality: equal when the offset, begin, and length all match
76 | * @see java.lang.Object#equals(Object o)
77 | */
78 | public boolean equals(Object o) {
79 | if (o == null) {
80 | return false;
81 | }
82 |
83 | if (this == o) {
84 | return true;
85 | }
86 |
87 | if (o instanceof Lexeme) {
88 | Lexeme other = (Lexeme) o;
89 | if (this.offset == other.getOffset() && this.begin == other.getBegin()
90 | && this.length == other.getLength()) {
91 | return true;
92 | } else {
93 | return false;
94 | }
95 | } else {
96 | return false;
97 | }
98 | }
99 |
100 | /*
101 | * Lexeme hash code algorithm
102 | * @see java.lang.Object#hashCode()
103 | */
104 | public int hashCode() {
105 | int absBegin = getBeginPosition();
106 | int absEnd = getEndPosition();
107 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
108 | }
109 |
110 | /*
111 | * Comparison used to order lexemes in sorted sets
112 | * @see java.lang.Comparable#compareTo(java.lang.Object)
113 | */
114 | public int compareTo(Lexeme other) {
115 | // Earlier start position first
116 | if (this.begin < other.getBegin()) {
117 | return -1;
118 | } else if (this.begin == other.getBegin()) {
119 | // Longer lexeme first
120 | if (this.length > other.getLength()) {
121 | return -1;
122 | } else if (this.length == other.getLength()) {
123 | return 0;
124 | } else {// this.length < other.getLength()
125 | return 1;
126 | }
127 |
128 | } else {// this.begin > other.getBegin()
129 | return 1;
130 | }
131 | }
132 |
133 | public int getOffset() {
134 | return offset;
135 | }
136 |
137 | public void setOffset(int offset) {
138 | this.offset = offset;
139 | }
140 |
141 | public int getBegin() {
142 | return begin;
143 | }
144 |
145 | /**
146 | * Absolute start position of the lexeme in the text
147 | * @return int
148 | */
149 | public int getBeginPosition() {
150 | return offset + begin;
151 | }
152 |
153 | public void setBegin(int begin) {
154 | this.begin = begin;
155 | }
156 |
157 | /**
158 | * Absolute end position of the lexeme in the text
159 | * @return int
160 | */
161 | public int getEndPosition() {
162 | return offset + begin + length;
163 | }
164 |
165 | /**
166 | * Character length of the lexeme
167 | * @return int
168 | */
169 | public int getLength() {
170 | return this.length;
171 | }
172 |
173 | public void setLength(int length) {
174 | if (length < 0) {
175 | throw new IllegalArgumentException("length < 0");
176 | }
177 | this.length = length;
178 | }
179 |
180 | /**
181 | * Text content of the lexeme (never null)
182 | * @return String
183 | */
184 | public String getLexemeText() {
185 | if (lexemeText == null) {
186 | return "";
187 | }
188 | return lexemeText;
189 | }
190 |
191 | public void setLexemeText(String lexemeText) {
192 | if (lexemeText == null) {
193 | this.lexemeText = "";
194 | this.length = 0;
195 | } else {
196 | this.lexemeText = lexemeText;
197 | this.length = lexemeText.length();
198 | }
199 | }
200 |
201 | /**
202 | * Lexeme type constant
203 | * @return int
204 | */
205 | public int getLexemeType() {
206 | return lexemeType;
207 | }
208 |
209 | /**
210 | * Human-readable name of the lexeme type
211 | * @return String
212 | */
213 | public String getLexemeTypeString() {
214 | switch (lexemeType) {
215 |
216 | case TYPE_ENGLISH:
217 | return "ENGLISH";
218 |
219 | case TYPE_ARABIC:
220 | return "ARABIC";
221 |
222 | case TYPE_LETTER:
223 | return "LETTER";
224 |
225 | case TYPE_CNWORD:
226 | return "CN_WORD";
227 |
228 | case TYPE_CNCHAR:
229 | return "CN_CHAR";
230 |
231 | case TYPE_OTHER_CJK:
232 | return "OTHER_CJK";
233 |
234 | case TYPE_COUNT:
235 | return "COUNT";
236 |
237 | case TYPE_CNUM:
238 | return "TYPE_CNUM";
239 |
240 | case TYPE_CQUAN:
241 | return "TYPE_CQUAN";
242 |
243 | default:
244 | return "UNKNOWN";
245 | }
246 | }
247 |
248 | public void setLexemeType(int lexemeType) {
249 | this.lexemeType = lexemeType;
250 | }
251 |
252 | /**
253 | * Merge an adjacent lexeme into this one (extends the span, keeps this text)
254 | * @param l
255 | * @param lexemeType
256 | * @return boolean whether the merge succeeded
257 | */
258 | public boolean append(Lexeme l, int lexemeType) {
259 | if (l != null && this.getEndPosition() == l.getBeginPosition()) {
260 | this.length += l.getLength();
261 | this.lexemeType = lexemeType;
262 | return true;
263 | } else {
264 | return false;
265 | }
266 | }
267 |
268 | /**
269 | * Formats the lexeme as "begin-end : text : TYPE"
270 | */
271 | public String toString() {
272 | StringBuilder strbuf = new StringBuilder();
273 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
274 | strbuf.append(" : ").append(this.lexemeText).append(" : \t");
275 | strbuf.append(this.getLexemeTypeString());
276 | return strbuf.toString();
277 | }
278 |
279 | }
280 |
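A minimal sketch (not part of the source tree; the sample strings and values are hypothetical) of how Lexeme coordinates compose and how append() merges adjacent lexemes. The absolute span of a lexeme is [offset + begin, offset + begin + length).

import org.wltea.analyzer.core.Lexeme;

public class LexemeSketch {
    public static void main(String[] args) {
        Lexeme num = new Lexeme(0, 0, 2, Lexeme.TYPE_CNUM);    // hypothetical numeral "三十"
        num.setLexemeText("三十");
        Lexeme count = new Lexeme(0, 2, 1, Lexeme.TYPE_COUNT); // hypothetical classifier "个"
        count.setLexemeText("个");
        // count begins exactly where num ends, so append() extends num's span
        // to [0, 3) and retags it; note that it does not concatenate the text
        boolean merged = num.append(count, Lexeme.TYPE_CQUAN);
        System.out.println(merged);          // true
        System.out.println(num.getLength()); // 3
        System.out.println(num);             // 0-3 : 三十 : TYPE_CQUAN
    }
}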
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * Lexeme chain: one candidate segmentation path
29 | */
30 | class LexemePath extends QuickSortSet implements Comparable<LexemePath> {
31 |
32 | // start position of the path
33 | private int pathBegin;
34 | // end position of the path
35 | private int pathEnd;
36 | // number of characters actually covered by the lexeme chain
37 | private int payloadLength;
38 |
39 | LexemePath() {
40 | this.pathBegin = -1;
41 | this.pathEnd = -1;
42 | this.payloadLength = 0;
43 | }
44 |
45 | /**
46 | * Append a lexeme that overlaps this path
47 | * @param lexeme
48 | * @return boolean whether the lexeme was accepted
49 | */
50 | boolean addCrossLexeme(Lexeme lexeme) {
51 | if (this.isEmpty()) {
52 | this.addLexeme(lexeme);
53 | this.pathBegin = lexeme.getBegin();
54 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
55 | this.payloadLength += lexeme.getLength();
56 | return true;
57 |
58 | } else if (this.checkCross(lexeme)) {
59 | this.addLexeme(lexeme);
60 | if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
61 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
62 | }
63 | this.payloadLength = this.pathEnd - this.pathBegin;
64 | return true;
65 |
66 | } else {
67 | return false;
68 |
69 | }
70 | }
71 |
72 | /**
73 | * Append a lexeme that does not overlap this path
74 | * @param lexeme
75 | * @return boolean whether the lexeme was accepted
76 | */
77 | boolean addNotCrossLexeme(Lexeme lexeme) {
78 | if (this.isEmpty()) {
79 | this.addLexeme(lexeme);
80 | this.pathBegin = lexeme.getBegin();
81 | this.pathEnd = lexeme.getBegin() + lexeme.getLength();
82 | this.payloadLength += lexeme.getLength();
83 | return true;
84 |
85 | } else if (this.checkCross(lexeme)) {
86 | return false;
87 |
88 | } else {
89 | this.addLexeme(lexeme);
90 | this.payloadLength += lexeme.getLength();
91 | Lexeme head = this.peekFirst();
92 | this.pathBegin = head.getBegin();
93 | Lexeme tail = this.peekLast();
94 | this.pathEnd = tail.getBegin() + tail.getLength();
95 | return true;
96 |
97 | }
98 | }
99 |
100 | /**
101 | * Remove and return the tail lexeme
102 | * @return Lexeme
103 | */
104 | Lexeme removeTail() {
105 | Lexeme tail = this.pollLast();
106 | if (this.isEmpty()) {
107 | this.pathBegin = -1;
108 | this.pathEnd = -1;
109 | this.payloadLength = 0;
110 | } else {
111 | this.payloadLength -= tail.getLength();
112 | Lexeme newTail = this.peekLast();
113 | this.pathEnd = newTail.getBegin() + newTail.getLength();
114 | }
115 | return tail;
116 | }
117 |
118 | /**
119 | * Check whether a lexeme overlaps this path (an ambiguous split)
120 | * @param lexeme
121 | * @return boolean
122 | */
123 | boolean checkCross(Lexeme lexeme) {
124 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
125 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin()
126 | + lexeme.getLength());
127 | }
128 |
129 | int getPathBegin() {
130 | return pathBegin;
131 | }
132 |
133 | int getPathEnd() {
134 | return pathEnd;
135 | }
136 |
137 | /**
138 | * Effective character length covered by the path
139 | * @return int
140 | */
141 | int getPayloadLength() {
142 | return this.payloadLength;
143 | }
144 |
145 | /**
146 | * Span of the path (pathEnd - pathBegin)
147 | * @return int
148 | */
149 | int getPathLength() {
150 | return this.pathEnd - this.pathBegin;
151 | }
152 |
153 | /**
154 | * X weight: product of the lexeme lengths (higher means a more even split)
155 | * @return int
156 | */
157 | int getXWeight() {
158 | int product = 1;
159 | Cell c = this.getHead();
160 | while (c != null && c.getLexeme() != null) {
161 | product *= c.getLexeme().getLength();
162 | c = c.getNext();
163 | }
164 | return product;
165 | }
166 |
167 | /**
168 | * Positional weight: later, longer lexemes score higher
169 | * @return int
170 | */
171 | int getPWeight() {
172 | int pWeight = 0;
173 | int p = 0;
174 | Cell c = this.getHead();
175 | while (c != null && c.getLexeme() != null) {
176 | p++;
177 | pWeight += p * c.getLexeme().getLength();
178 | c = c.getNext();
179 | }
180 | return pWeight;
181 | }
182 |
183 | LexemePath copy() {
184 | LexemePath theCopy = new LexemePath();
185 | theCopy.pathBegin = this.pathBegin;
186 | theCopy.pathEnd = this.pathEnd;
187 | theCopy.payloadLength = this.payloadLength;
188 | Cell c = this.getHead();
189 | while (c != null && c.getLexeme() != null) {
190 | theCopy.addLexeme(c.getLexeme());
191 | c = c.getNext();
192 | }
193 | return theCopy;
194 | }
195 |
196 | public int compareTo(LexemePath o) {
197 | // longer effective text (payload) first
198 | if (this.payloadLength > o.payloadLength) {
199 | return -1;
200 | } else if (this.payloadLength < o.payloadLength) {
201 | return 1;
202 | } else {
203 | // then compare lexeme count: fewer is better
204 | if (this.size() < o.size()) {
205 | return -1;
206 | } else if (this.size() > o.size()) {
207 | return 1;
208 | } else {
209 | // then larger path span is better
210 | if (this.getPathLength() > o.getPathLength()) {
211 | return -1;
212 | } else if (this.getPathLength() < o.getPathLength()) {
213 | return 1;
214 | } else {
215 | // statistically, reverse segmentation beats forward segmentation, so paths ending later win
216 | if (this.pathEnd > o.pathEnd) {
217 | return -1;
218 | } else if (this.pathEnd < o.pathEnd) {
219 | return 1;
220 | } else {
221 | // then the more even split (larger length product) is better
222 | if (this.getXWeight() > o.getXWeight()) {
223 | return -1;
224 | } else if (this.getXWeight() < o.getXWeight()) {
225 | return 1;
226 | } else {
227 | // finally compare the positional weights
228 | if (this.getPWeight() > o.getPWeight()) {
229 | return -1;
230 | } else if (this.getPWeight() < o.getPWeight()) {
231 | return 1;
232 | }
233 |
234 | }
235 | }
236 | }
237 | }
238 | }
239 | return 0;
240 | }
241 |
242 | public String toString() {
243 | StringBuilder sb = new StringBuilder();
244 | sb.append("pathBegin : ").append(pathBegin).append("\r\n");
245 | sb.append("pathEnd : ").append(pathEnd).append("\r\n");
246 | sb.append("payloadLength : ").append(payloadLength).append("\r\n");
247 | Cell head = this.getHead();
248 | while (head != null) {
249 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
250 | head = head.getNext();
251 | }
252 | return sb.toString();
253 | }
254 |
255 | }
256 |
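The compareTo chain above encodes IK's disambiguation heuristics in strict priority order. A package-local sketch (LexemePath and Lexeme are package-private in org.wltea.analyzer.core; the values are hypothetical) of the second rule, fewer lexemes winning on equal payload:

package org.wltea.analyzer.core;

// Disambiguation ordering: longer payload > fewer lexemes > longer span >
// later end > larger length product > larger positional weight.
public class LexemePathSketch {
    public static void main(String[] args) {
        LexemePath oneWord = new LexemePath();
        oneWord.addNotCrossLexeme(new Lexeme(0, 0, 3, Lexeme.TYPE_CNWORD));  // one 3-char word
        LexemePath twoWords = new LexemePath();
        twoWords.addNotCrossLexeme(new Lexeme(0, 0, 1, Lexeme.TYPE_CNCHAR)); // 1 char
        twoWords.addNotCrossLexeme(new Lexeme(0, 1, 2, Lexeme.TYPE_CNWORD)); // + 2 chars
        // equal payload (3) and span (3), but oneWord holds fewer lexemes, so it ranks first
        System.out.println(oneWord.compareTo(twoWords)); // -1
    }
}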
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.core;
26 |
27 | /**
28 | * Sorted lexeme set used by the IK segmenter (a doubly linked list kept in lexeme order)
29 | */
30 | class QuickSortSet {
31 | // list head
32 | private Cell head;
33 | // list tail
34 | private Cell tail;
35 | // actual size of the list
36 | private int size;
37 |
38 | QuickSortSet() {
39 | this.size = 0;
40 | }
41 |
42 | /**
43 | * Insert a lexeme in sorted order; duplicates are rejected
44 | * @param lexeme
45 | */
46 | boolean addLexeme(Lexeme lexeme) {
47 | Cell newCell = new Cell(lexeme);
48 | if (this.size == 0) {
49 | this.head = newCell;
50 | this.tail = newCell;
51 | this.size++;
52 | return true;
53 |
54 | } else {
55 | if (this.tail.compareTo(newCell) == 0) {// equal to the tail lexeme: drop it
56 | return false;
57 |
58 | } else if (this.tail.compareTo(newCell) < 0) {// append at the tail
59 | this.tail.next = newCell;
60 | newCell.prev = this.tail;
61 | this.tail = newCell;
62 | this.size++;
63 | return true;
64 |
65 | } else if (this.head.compareTo(newCell) > 0) {// prepend at the head
66 | this.head.prev = newCell;
67 | newCell.next = this.head;
68 | this.head = newCell;
69 | this.size++;
70 | return true;
71 |
72 | } else {
73 | // walk backwards from the tail
74 | Cell index = this.tail;
75 | while (index != null && index.compareTo(newCell) > 0) {
76 | index = index.prev;
77 | }
78 | if (index.compareTo(newCell) == 0) {// duplicate of an existing lexeme: drop it
79 | return false;
80 |
81 | } else if (index.compareTo(newCell) < 0) {// splice into the middle of the list
82 | newCell.prev = index;
83 | newCell.next = index.next;
84 | index.next.prev = newCell;
85 | index.next = newCell;
86 | this.size++;
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 |
94 | /**
95 | * Peek at the head lexeme without removing it
96 | * @return Lexeme
97 | */
98 | Lexeme peekFirst() {
99 | if (this.head != null) {
100 | return this.head.lexeme;
101 | }
102 | return null;
103 | }
104 |
105 | /**
106 | * Remove and return the head lexeme
107 | * @return Lexeme
108 | */
109 | Lexeme pollFirst() {
110 | if (this.size == 1) {
111 | Lexeme first = this.head.lexeme;
112 | this.head = null;
113 | this.tail = null;
114 | this.size--;
115 | return first;
116 | } else if (this.size > 1) {
117 | Lexeme first = this.head.lexeme;
118 | this.head = this.head.next; this.head.prev = null; // detach the removed cell
119 | this.size--;
120 | return first;
121 | } else {
122 | return null;
123 | }
124 | }
125 |
126 | /**
127 | * Peek at the tail lexeme without removing it
128 | * @return Lexeme
129 | */
130 | Lexeme peekLast() {
131 | if (this.tail != null) {
132 | return this.tail.lexeme;
133 | }
134 | return null;
135 | }
136 |
137 | /**
138 | * Remove and return the tail lexeme
139 | * @return Lexeme
140 | */
141 | Lexeme pollLast() {
142 | if (this.size == 1) {
143 | Lexeme last = this.head.lexeme;
144 | this.head = null;
145 | this.tail = null;
146 | this.size--;
147 | return last;
148 |
149 | } else if (this.size > 1) {
150 | Lexeme last = this.tail.lexeme;
151 | this.tail = this.tail.prev; this.tail.next = null; // detach the removed cell
152 | this.size--;
153 | return last;
154 |
155 | } else {
156 | return null;
157 | }
158 | }
159 |
160 | /**
161 | * Size of the set
162 | * @return int
163 | */
164 | int size() {
165 | return this.size;
166 | }
167 |
168 | /**
169 | * Whether the set is empty
170 | * @return boolean
171 | */
172 | boolean isEmpty() {
173 | return this.size == 0;
174 | }
175 |
176 | /**
177 | * Head cell of the lexeme chain
178 | * @return Cell
179 | */
180 | Cell getHead() {
181 | return this.head;
182 | }
183 |
184 | /**
185 | * QuickSortSet cell: a node of the doubly linked lexeme chain
186 | */
211 | class Cell implements Comparable<Cell> {
212 | private Cell prev;
213 | private Cell next;
214 | private Lexeme lexeme;
215 |
216 | Cell(Lexeme lexeme) {
217 | if (lexeme == null) {
218 | throw new IllegalArgumentException("lexeme must not be null");
219 | }
220 | this.lexeme = lexeme;
221 | }
222 |
223 | public int compareTo(Cell o) {
224 | return this.lexeme.compareTo(o.lexeme);
225 | }
226 |
227 | public Cell getPrev() {
228 | return this.prev;
229 | }
230 |
231 | public Cell getNext() {
232 | return this.next;
233 | }
234 |
235 | public Lexeme getLexeme() {
236 | return this.lexeme;
237 | }
238 | }
239 | }
240 |
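A package-local sketch (hypothetical values) of the set's ordering and de-duplication behavior:

package org.wltea.analyzer.core;

// QuickSortSet keeps lexemes ordered by (begin ascending, length descending)
// and silently rejects a lexeme whose offset/begin/length already exist.
public class QuickSortSetSketch {
    public static void main(String[] args) {
        QuickSortSet set = new QuickSortSet();
        set.addLexeme(new Lexeme(0, 2, 1, Lexeme.TYPE_CNCHAR));
        set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD));
        System.out.println(set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD))); // false: duplicate
        System.out.println(set.size());                 // 2
        System.out.println(set.peekFirst().getBegin()); // 0: smallest begin first
    }
}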
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/DictSegment.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.util.Arrays;
29 | import java.util.HashMap;
30 | import java.util.Map;
31 |
32 | /**
33 | * One segment (branch) of the dictionary trie
34 | */
35 | class DictSegment implements Comparable<DictSegment> {
36 |
37 | // shared character table, interning the CJK characters used by the dictionary
38 | private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16,
39 | 0.95f);
40 | // upper bound of the array storage
41 | private static final int ARRAY_LENGTH_LIMIT = 3;
42 |
43 | // Map storage for children (volatile: read without synchronization in match())
44 | private volatile Map<Character, DictSegment> childrenMap;
45 | // array storage for children (volatile for the same reason)
46 | private volatile DictSegment[] childrenArray;
47 |
48 | // character stored on this node
49 | private Character nodeChar;
50 | // number of child segments on this node
51 | // storeSize <= ARRAY_LENGTH_LIMIT: children live in the array; storeSize > ARRAY_LENGTH_LIMIT: in the Map
52 | private int storeSize = 0;
53 | // node state: default 0; 1 means the path from the root to this node spells a word
54 | private int nodeState = 0;
55 |
56 | DictSegment(Character nodeChar) {
57 | if (nodeChar == null) {
58 | throw new IllegalArgumentException("node character must not be null");
59 | }
60 | this.nodeChar = nodeChar;
61 | }
62 |
63 | Character getNodeChar() {
64 | return nodeChar;
65 | }
66 |
67 | /*
68 | * Whether this node has child nodes
69 | */
70 | boolean hasNextNode() {
71 | return this.storeSize > 0;
72 | }
73 |
74 | /**
75 | * Match a word against the trie
76 | * @param charArray
77 | * @return Hit
78 | */
79 | Hit match(char[] charArray) {
80 | return this.match(charArray, 0, charArray.length, null);
81 | }
82 |
83 | /**
84 | * Match a word against the trie
85 | * @param charArray
86 | * @param begin
87 | * @param length
88 | * @return Hit
89 | */
90 | Hit match(char[] charArray, int begin, int length) {
91 | return this.match(charArray, begin, length, null);
92 | }
93 |
94 | /**
95 | * Match a word against the trie, optionally resuming from a previous Hit
96 | * @param charArray
97 | * @param begin
98 | * @param length
99 | * @param searchHit
100 | * @return Hit
101 | */
102 | Hit match(char[] charArray, int begin, int length, Hit searchHit) {
103 |
104 | if (searchHit == null) {
105 | // no hit passed in: start a fresh one
106 | searchHit = new Hit();
107 | // record the start position of the hit
108 | searchHit.setBegin(begin);
109 | } else {
110 | // otherwise reset the hit's state
111 | searchHit.setUnmatch();
112 | }
113 | // record the current processing position
114 | searchHit.setEnd(begin);
115 |
116 | Character keyChar = Character.valueOf(charArray[begin]);
117 | DictSegment ds = null;
118 |
119 | // copy the instance fields to locals so concurrent updates cannot swap them mid-search
120 | DictSegment[] segmentArray = this.childrenArray;
121 | Map<Character, DictSegment> segmentMap = this.childrenMap;
122 |
123 | // STEP 1: look up the DictSegment for keyChar on this node
124 | if (segmentArray != null) {
125 | // search the array
126 | DictSegment keySegment = new DictSegment(keyChar);
127 | int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
128 | if (position >= 0) {
129 | ds = segmentArray[position];
130 | }
131 |
132 | } else if (segmentMap != null) {
133 | // search the map
134 | ds = segmentMap.get(keyChar);
135 | }
136 |
137 | // STEP 2: found a DictSegment; decide whether to recurse or report the match state
138 | if (ds != null) {
139 | if (length > 1) {
140 | // word not fully consumed yet, keep descending
141 | return ds.match(charArray, begin + 1, length - 1, searchHit);
142 | } else if (length == 1) {
143 |
144 | // this is the last char of the word
145 | if (ds.nodeState == 1) {
146 | // mark the hit as a complete match
147 | searchHit.setMatch();
148 | }
149 | if (ds.hasNextNode()) {
150 | // mark the hit as a prefix match
151 | searchHit.setPrefix();
152 | // remember the current DictSegment so matching can resume here
153 | searchHit.setMatchedDictSegment(ds);
154 | }
155 | return searchHit;
156 | }
157 |
158 | }
159 | // STEP 3: no DictSegment found; the hit stays unmatched
160 | return searchHit;
161 | }
162 |
163 | /**
164 | * Load a word into this trie segment
165 | * @param charArray
166 | */
167 | void fillSegment(char[] charArray) {
168 | this.fillSegment(charArray, 0, charArray.length, 1);
169 | }
170 |
171 | /**
172 | * Disable (mask) a word in the dictionary
173 | * @param charArray
174 | */
175 | void disableSegment(char[] charArray) {
176 | this.fillSegment(charArray, 0, charArray.length, 0);
177 | }
178 |
179 | /**
180 | * Load a word into this trie segment
181 | * @param charArray
182 | * @param begin
183 | * @param length
184 | * @param enabled
185 | */
186 | private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) {
187 | // fetch the interned character from the shared table
188 | Character beginChar = Character.valueOf(charArray[begin]);
189 | Character keyChar = charMap.get(beginChar);
190 | // not in the table yet: add it
191 | if (keyChar == null) {
192 | charMap.put(beginChar, beginChar);
193 | keyChar = beginChar;
194 | }
195 |
196 | // look up the child segment for keyChar on this node, creating it if absent
197 | DictSegment ds = lookforSegment(keyChar, enabled);
198 | if (ds != null) {
199 | // process the segment for keyChar
200 | if (length > 1) {
201 | // the word is not fully inserted yet
202 | ds.fillSegment(charArray, begin + 1, length - 1, enabled);
203 | } else if (length == 1) {
204 | // last char of the word: set the node state to enabled,
205 | // enabled=1 marks a complete word, enabled=0 masks the word
206 | ds.nodeState = enabled;
207 | }
208 | }
209 |
210 | }
211 |
212 | /**
213 | * Find the child segment for keyChar under this node
214 | * @param keyChar
215 | * @param create =1 create a new segment when none is found; =0 return null instead
216 | * @return DictSegment
217 | */
218 | private DictSegment lookforSegment(Character keyChar, int create) {
219 |
220 | DictSegment ds = null;
221 |
222 | if (this.storeSize <= ARRAY_LENGTH_LIMIT) {
223 | // get the array container, creating it on first use
224 | DictSegment[] segmentArray = getChildrenArray();
225 | // search the array
226 | DictSegment keySegment = new DictSegment(keyChar);
227 | int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
228 | if (position >= 0) {
229 | ds = segmentArray[position];
230 | }
231 |
232 | // nothing matching found in the array
233 | if (ds == null && create == 1) {
234 | ds = keySegment;
235 | if (this.storeSize < ARRAY_LENGTH_LIMIT) {
236 | // the array still has room: keep using array storage
237 | segmentArray[this.storeSize] = ds;
238 | // one more segment
239 | this.storeSize++;
240 | Arrays.sort(segmentArray, 0, this.storeSize);
241 |
242 | } else {
243 | // the array is full: switch to Map storage
244 | // get the Map container, creating it on first use
245 | Map<Character, DictSegment> segmentMap = getChildrenMap();
246 | // migrate the segments from the array into the Map
247 | migrate(segmentArray, segmentMap);
248 | // store the new segment
249 | segmentMap.put(keyChar, ds);
250 | // increment storeSize before releasing the array, so a concurrent reader can never observe an empty container
251 | this.storeSize++;
252 | // release the array reference
253 | this.childrenArray = null;
254 | }
255 |
256 | }
257 |
258 | } else {
259 | // get the Map container, creating it on first use
260 | Map<Character, DictSegment> segmentMap = getChildrenMap();
261 | // search the Map
262 | ds = segmentMap.get(keyChar);
263 | if (ds == null && create == 1) {
264 | // build a new segment
265 | ds = new DictSegment(keyChar);
266 | segmentMap.put(keyChar, ds);
267 | // one more segment on this node
268 | this.storeSize++;
269 | }
270 | }
271 |
272 | return ds;
273 | }
274 |
275 | /**
276 | * Lazily create and return the array container
277 | * (double-checked locking; the field is volatile)
278 | */
279 | private DictSegment[] getChildrenArray() {
280 | if (this.childrenArray == null) {
281 | synchronized (this) {
282 | if (this.childrenArray == null) {
283 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
284 | }
285 | }
286 | }
287 | return this.childrenArray;
288 | }
289 |
290 | /**
291 | * Lazily create and return the Map container
292 | * (double-checked locking; the field is volatile)
293 | */
294 | private Map<Character, DictSegment> getChildrenMap() {
295 | if (this.childrenMap == null) {
296 | synchronized (this) {
297 | if (this.childrenMap == null) {
298 | this.childrenMap = new HashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
299 | }
300 | }
301 | }
302 | return this.childrenMap;
303 | }
304 |
305 | /**
306 | * Migrate segments from the array into the Map
307 | * @param segmentArray
308 | */
309 | private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap) {
310 | for (DictSegment segment : segmentArray) {
311 | if (segment != null) {
312 | segmentMap.put(segment.nodeChar, segment);
313 | }
314 | }
315 | }
316 |
317 | /**
318 | * Comparable implementation
319 | * @param o
320 | * @return int
321 | */
322 | public int compareTo(DictSegment o) {
323 | // compare the characters stored on the nodes
324 | return this.nodeChar.compareTo(o.nodeChar);
325 | }
326 |
327 | }
328 |
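A package-local sketch (org.wltea.analyzer.dic; the sample words are hypothetical) of the trie's match semantics, where a span can be a complete word, a prefix of longer words, or both:

package org.wltea.analyzer.dic;

public class DictSegmentSketch {
    public static void main(String[] args) {
        DictSegment root = new DictSegment((char) 0); // same root convention as Dictionary
        root.fillSegment("中国".toCharArray());
        root.fillSegment("中国人".toCharArray());

        Hit hit = root.match("中国".toCharArray());
        System.out.println(hit.isMatch());  // true: "中国" is a word
        System.out.println(hit.isPrefix()); // true: and a prefix of "中国人"

        root.disableSegment("中国".toCharArray());
        System.out.println(root.match("中国".toCharArray()).isMatch()); // false: masked
    }
}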
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Dictionary.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | import java.io.BufferedReader;
29 | import java.io.IOException;
30 | import java.io.InputStream;
31 | import java.io.InputStreamReader;
32 | import java.util.Collection;
33 | import java.util.List;
34 |
35 | import org.wltea.analyzer.cfg.Configuration;
36 |
37 | /**
38 | * Dictionary manager, implemented as a singleton
39 | */
40 | public class Dictionary {
41 |
42 | /*
43 | * singleton instance (volatile: required for double-checked locking)
44 | */
45 | private static volatile Dictionary singleton;
46 |
47 | /*
48 | * main dictionary
49 | */
50 | private DictSegment _MainDict;
51 |
52 | /*
53 | * stop-word dictionary
54 | */
55 | private DictSegment _StopWordDict;
56 | /*
57 | * quantifier (measure word) dictionary
58 | */
59 | private DictSegment _QuantifierDict;
60 |
61 | /**
62 | * configuration object
63 | */
64 | private Configuration cfg;
65 |
66 | private Dictionary(Configuration cfg) {
67 | this.cfg = cfg;
68 | this.loadMainDict();
69 | this.loadStopWordDict();
70 | this.loadQuantifierDict();
71 | }
72 |
73 | /**
74 | * Dictionary initialization.
75 | * IK Analyzer loads its dictionaries lazily through this class's static methods,
76 | * so nothing is read until Dictionary is first used,
77 | * which delays the very first segmentation call.
78 | * This method lets an application initialize the dictionaries at startup instead.
79 | * @return Dictionary
80 | */
81 | public static Dictionary initial(Configuration cfg) {
82 | if (singleton == null) {
83 | synchronized (Dictionary.class) {
84 | if (singleton == null) {
85 | singleton = new Dictionary(cfg);
86 | return singleton;
87 | }
88 | }
89 | }
90 | return singleton;
91 | }
92 |
93 | /**
94 | * Get the dictionary singleton
95 | * @return Dictionary the singleton instance
96 | */
97 | public static Dictionary getSingleton() {
98 | if (singleton == null) {
99 | throw new IllegalStateException("Dictionary not initialized yet; call Dictionary.initial(Configuration) first");
100 | }
101 | return singleton;
102 | }
103 |
104 | /**
105 | * Add words in batch
106 | * @param words Collection of words to add
107 | */
108 | public void addWords(Collection<String> words) {
109 | if (words != null) {
110 | for (String word : words) {
111 | if (word != null) {
112 | // load the words into the in-memory main dictionary
113 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray());
114 | }
115 | }
116 | }
117 | }
118 |
119 | /**
120 | * Remove (mask) words in batch
121 | * @param words
122 | */
123 | public void disableWords(Collection<String> words) {
124 | if (words != null) {
125 | for (String word : words) {
126 | if (word != null) {
127 | // mask the words
128 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray());
129 | }
130 | }
131 | }
132 | }
133 |
134 | /**
135 | * Match against the main dictionary
136 | * @param charArray
137 | * @return Hit match descriptor
138 | */
139 | public Hit matchInMainDict(char[] charArray) {
140 | return singleton._MainDict.match(charArray);
141 | }
142 |
143 | /**
144 | * Match against the main dictionary
145 | * @param charArray
146 | * @param begin
147 | * @param length
148 | * @return Hit match descriptor
149 | */
150 | public Hit matchInMainDict(char[] charArray, int begin, int length) {
151 | return singleton._MainDict.match(charArray, begin, length);
152 | }
153 |
154 | /**
155 | * Match against the quantifier dictionary
156 | * @param charArray
157 | * @param begin
158 | * @param length
159 | * @return Hit match descriptor
160 | */
161 | public Hit matchInQuantifierDict(char[] charArray, int begin, int length) {
162 | return singleton._QuantifierDict.match(charArray, begin, length);
163 | }
164 |
165 | /**
166 | * Resume matching from the DictSegment recorded in a previous Hit
167 | * @param charArray
168 | * @param currentIndex
169 | * @param matchedHit
170 | * @return Hit
171 | */
172 | public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) {
173 | DictSegment ds = matchedHit.getMatchedDictSegment();
174 | return ds.match(charArray, currentIndex, 1, matchedHit);
175 | }
176 |
177 | /**
178 | * Whether the given span is a stop word
179 | * @param charArray
180 | * @param begin
181 | * @param length
182 | * @return boolean
183 | */
184 | public boolean isStopWord(char[] charArray, int begin, int length) {
185 | return singleton._StopWordDict.match(charArray, begin, length).isMatch();
186 | }
187 |
188 | /**
189 | * Load the main dictionary and the extension dictionaries
190 | */
191 | private void loadMainDict() {
192 | // build the main dictionary instance
193 | _MainDict = new DictSegment((char) 0);
194 | // read the main dictionary file
195 | InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary());
196 | if (is == null) {
197 | throw new RuntimeException("Main Dictionary not found!!!");
198 | }
199 |
200 | try {
201 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
202 | String theWord = null;
203 | do {
204 | theWord = br.readLine();
205 | if (theWord != null && !"".equals(theWord.trim())) {
206 | _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
207 | }
208 | } while (theWord != null);
209 |
210 | } catch (IOException ioe) {
211 | System.err.println("Main Dictionary loading exception.");
212 | ioe.printStackTrace();
213 |
214 | } finally {
215 | try {
216 | if (is != null) {
217 | is.close();
218 | is = null;
219 | }
220 | } catch (IOException e) {
221 | e.printStackTrace();
222 | }
223 | }
224 | // load the extension dictionaries
225 | this.loadExtDict();
226 | }
227 |
228 | /**
229 | * Load the user-configured extension dictionaries into the main dictionary
230 | */
231 | private void loadExtDict() {
232 | // read the extension dictionary configuration
233 | List<String> extDictFiles = cfg.getExtDictionarys();
234 | if (extDictFiles != null) {
235 | InputStream is = null;
236 | for (String extDictName : extDictFiles) {
237 | // read the extension dictionary file
238 | System.out.println("Loading extension dictionary: " + extDictName);
239 | is = this.getClass().getClassLoader().getResourceAsStream(extDictName);
240 | // ignore extension dictionaries that cannot be found
241 | if (is == null) {
242 | continue;
243 | }
244 | try {
245 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
246 | String theWord = null;
247 | do {
248 | theWord = br.readLine();
249 | if (theWord != null && !"".equals(theWord.trim())) {
250 | // load the extension words into the in-memory main dictionary
251 | // System.out.println(theWord);
252 | _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
253 | }
254 | } while (theWord != null);
255 |
256 | } catch (IOException ioe) {
257 | System.err.println("Extension Dictionary loading exception.");
258 | ioe.printStackTrace();
259 |
260 | } finally {
261 | try {
262 | if (is != null) {
263 | is.close();
264 | is = null;
265 | }
266 | } catch (IOException e) {
267 | e.printStackTrace();
268 | }
269 | }
270 | }
271 | }
272 | }
273 |
274 | /**
275 | * Load the user-configured extension stop-word dictionaries
276 | */
277 | private void loadStopWordDict() {
278 | // build the stop-word dictionary instance
279 | _StopWordDict = new DictSegment((char) 0);
280 | // load the extension stop-word dictionaries
281 | List<String> extStopWordDictFiles = cfg.getExtStopWordDictionarys();
282 | if (extStopWordDictFiles != null) {
283 | InputStream is = null;
284 | for (String extStopWordDictName : extStopWordDictFiles) {
285 | System.out.println("Loading extension stop-word dictionary: " + extStopWordDictName);
286 | // read the extension dictionary file
287 | is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName);
288 | // ignore extension dictionaries that cannot be found
289 | if (is == null) {
290 | continue;
291 | }
292 | try {
293 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
294 | String theWord = null;
295 | do {
296 | theWord = br.readLine();
297 | if (theWord != null && !"".equals(theWord.trim())) {
298 | // System.out.println(theWord);
299 | // load the extension stop words into memory
300 | _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
301 | }
302 | } while (theWord != null);
303 |
304 | } catch (IOException ioe) {
305 | System.err.println("Extension Stop word Dictionary loading exception.");
306 | ioe.printStackTrace();
307 |
308 | } finally {
309 | try {
310 | if (is != null) {
311 | is.close();
312 | is = null;
313 | }
314 | } catch (IOException e) {
315 | e.printStackTrace();
316 | }
317 | }
318 | }
319 | }
320 | }
321 |
322 | /**
323 | * Load the quantifier dictionary
324 | */
325 | private void loadQuantifierDict() {
326 | // build the quantifier dictionary instance
327 | _QuantifierDict = new DictSegment((char) 0);
328 | // read the quantifier dictionary file
329 | InputStream is = this.getClass().getClassLoader()
330 | .getResourceAsStream(cfg.getQuantifierDicionary());
331 | if (is == null) {
332 | throw new RuntimeException("Quantifier Dictionary not found!!!");
333 | }
334 | try {
335 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512);
336 | String theWord = null;
337 | do {
338 | theWord = br.readLine();
339 | if (theWord != null && !"".equals(theWord.trim())) {
340 | _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray());
341 | }
342 | } while (theWord != null);
343 |
344 | } catch (IOException ioe) {
345 | System.err.println("Quantifier Dictionary loading exception.");
346 | ioe.printStackTrace();
347 |
348 | } finally {
349 | try {
350 | if (is != null) {
351 | is.close();
352 | is = null;
353 | }
354 | } catch (IOException e) {
355 | e.printStackTrace();
356 | }
357 | }
358 | }
359 |
360 | }
361 |
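A usage sketch for eager initialization. It assumes the DefaultConfig class from cfg/DefaultConfig.java exposes a static getInstance() factory, as in the upstream IK distribution; adjust to however your Configuration is obtained:

import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

public class DictionaryWarmup {
    public static void main(String[] args) {
        // initialize the singleton at startup so the first query
        // does not pay the dictionary-loading cost
        Dictionary.initial(DefaultConfig.getInstance()); // assumed factory method
        Hit hit = Dictionary.getSingleton().matchInMainDict("中文".toCharArray());
        System.out.println(hit.isMatch());
    }
}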
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/Hit.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * IK 中文分词 版本 5.0
4 | * IK Analyzer release 5.0
5 | *
6 | * Licensed to the Apache Software Foundation (ASF) under one or more
7 | * contributor license agreements. See the NOTICE file distributed with
8 | * this work for additional information regarding copyright ownership.
9 | * The ASF licenses this file to You under the Apache License, Version 2.0
10 | * (the "License"); you may not use this file except in compliance with
11 | * the License. You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | *
21 | * 源代码由林良益(linliangyi2005@gmail.com)提供
22 | * 版权声明 2012,乌龙茶工作室
23 | * provided by Linliangyi and copyright 2012 by Oolong studio
24 | *
25 | */
26 | package org.wltea.analyzer.dic;
27 |
28 | /**
29 | * Represents one dictionary-match result
30 | */
31 | public class Hit {
32 | // no match
33 | private static final int UNMATCH = 0x00000000;
34 | // complete word match
35 | private static final int MATCH = 0x00000001;
36 | // prefix match
37 | private static final int PREFIX = 0x00000010;
38 |
39 | // current state of this hit, unmatched by default
40 | private int hitState = UNMATCH;
43 | private DictSegment matchedDictSegment;
44 | /*
45 | * 词段开始位置
46 | */
47 | private int begin;
48 | /*
49 | * 词段的结束位置
50 | */
51 | private int end;
52 |
53 | /**
54 | * Whether this is a complete word match
55 | */
56 | public boolean isMatch() {
57 | return (this.hitState & MATCH) > 0;
58 | }
59 |
60 | /**
61 | * Mark this hit as a complete match
62 | */
63 | public void setMatch() {
64 | this.hitState = this.hitState | MATCH;
65 | }
66 |
67 | /**
68 | * Whether the span is a prefix of some dictionary word
69 | */
70 | public boolean isPrefix() {
71 | return (this.hitState & PREFIX) > 0;
72 | }
73 |
74 | /**
75 | * Mark this hit as a prefix match
76 | */
77 | public void setPrefix() {
78 | this.hitState = this.hitState | PREFIX;
79 | }
80 |
81 | /**
82 | * Whether nothing matched
83 | */
84 | public boolean isUnmatch() {
85 | return this.hitState == UNMATCH;
86 | }
87 |
88 | /**
89 | * Reset this hit to the unmatched state
90 | */
91 | public void setUnmatch() {
92 | this.hitState = UNMATCH;
93 | }
94 |
95 | public DictSegment getMatchedDictSegment() {
96 | return matchedDictSegment;
97 | }
98 |
99 | public void setMatchedDictSegment(DictSegment matchedDictSegment) {
100 | this.matchedDictSegment = matchedDictSegment;
101 | }
102 |
103 | public int getBegin() {
104 | return begin;
105 | }
106 |
107 | public void setBegin(int begin) {
108 | this.begin = begin;
109 | }
110 |
111 | public int getEnd() {
112 | return end;
113 | }
114 |
115 | public void setEnd(int end) {
116 | this.end = end;
117 | }
118 |
119 | }
120 |
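The state constants are independent bit flags (0x01 and 0x10), so a span that is both a complete word and the prefix of a longer word reports both states at once. A small sketch:

import org.wltea.analyzer.dic.Hit;

public class HitStates {
    public static void main(String[] args) {
        Hit hit = new Hit();
        hit.setMatch();
        hit.setPrefix();
        // MATCH and PREFIX occupy different bits, so both can be set together
        System.out.println(hit.isMatch() && hit.isPrefix()); // true
        hit.setUnmatch(); // clears the whole state word
        System.out.println(hit.isUnmatch()); // true
    }
}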
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/dic/quantifier.dic:
--------------------------------------------------------------------------------
1 | 丈
2 | 下
3 | 世
4 | 世纪
5 | 两
6 | 个
7 | 中
8 | 串
9 | 亩
10 | 人
11 | 介
12 | 付
13 | 代
14 | 件
15 | 任
16 | 份
17 | 伏
18 | 伙
19 | 位
20 | 位数
21 | 例
22 | 倍
23 | 像素
24 | 元
25 | 克
26 | 克拉
27 | 公亩
28 | 公克
29 | 公分
30 | 公升
31 | 公尺
32 | 公担
33 | 公斤
34 | 公里
35 | 公顷
36 | 具
37 | 册
38 | 出
39 | 刀
40 | 分
41 | 分钟
42 | 分米
43 | 划
44 | 列
45 | 则
46 | 刻
47 | 剂
48 | 剑
49 | 副
50 | 加仑
51 | 勺
52 | 包
53 | 匙
54 | 匹
55 | 区
56 | 千克
57 | 千米
58 | 升
59 | 卷
60 | 厅
61 | 厘
62 | 厘米
63 | 双
64 | 发
65 | 口
66 | 句
67 | 只
68 | 台
69 | 叶
70 | 号
71 | 名
72 | 吨
73 | 听
74 | 员
75 | 周
76 | 周年
77 | 品
78 | 回
79 | 团
80 | 圆
81 | 圈
82 | 地
83 | 场
84 | 块
85 | 坪
86 | 堆
87 | 声
88 | 壶
89 | 处
90 | 夜
91 | 大
92 | 天
93 | 头
94 | 套
95 | 女
96 | 孔
97 | 字
98 | 宗
99 | 室
100 | 家
101 | 寸
102 | 对
103 | 封
104 | 尊
105 | 小时
106 | 尺
107 | 尾
108 | 局
109 | 层
110 | 届
111 | 岁
112 | 师
113 | 帧
114 | 幅
115 | 幕
116 | 幢
117 | 平方
118 | 平方公尺
119 | 平方公里
120 | 平方分米
121 | 平方厘米
122 | 平方码
123 | 平方米
124 | 平方英寸
125 | 平方英尺
126 | 平方英里
127 | 平米
128 | 年
129 | 年代
130 | 年级
131 | 度
132 | 座
133 | 式
134 | 引
135 | 张
136 | 成
137 | 战
138 | 截
139 | 户
140 | 房
141 | 所
142 | 扇
143 | 手
144 | 打
145 | 批
146 | 把
147 | 折
148 | 担
149 | 拍
150 | 招
151 | 拨
152 | 拳
153 | 指
154 | 掌
155 | 排
156 | 撮
157 | 支
158 | 文
159 | 斗
160 | 斤
161 | 方
162 | 族
163 | 日
164 | 时
165 | 曲
166 | 月
167 | 月份
168 | 期
169 | 本
170 | 朵
171 | 村
172 | 束
173 | 条
174 | 来
175 | 杯
176 | 枚
177 | 枝
178 | 枪
179 | 架
180 | 柄
181 | 柜
182 | 栋
183 | 栏
184 | 株
185 | 样
186 | 根
187 | 格
188 | 案
189 | 桌
190 | 档
191 | 桩
192 | 桶
193 | 梯
194 | 棵
195 | 楼
196 | 次
197 | 款
198 | 步
199 | 段
200 | 毛
201 | 毫
202 | 毫升
203 | 毫米
204 | 毫克
205 | 池
206 | 洲
207 | 派
208 | 海里
209 | 滴
210 | 炮
211 | 点
212 | 点钟
213 | 片
214 | 版
215 | 环
216 | 班
217 | 瓣
218 | 瓶
219 | 生
220 | 男
221 | 画
222 | 界
223 | 盆
224 | 盎司
225 | 盏
226 | 盒
227 | 盘
228 | 相
229 | 眼
230 | 石
231 | 码
232 | 碗
233 | 碟
234 | 磅
235 | 种
236 | 科
237 | 秒
238 | 秒钟
239 | 窝
240 | 立方公尺
241 | 立方分米
242 | 立方厘米
243 | 立方码
244 | 立方米
245 | 立方英寸
246 | 立方英尺
247 | 站
248 | 章
249 | 笔
250 | 等
251 | 筐
252 | 筒
253 | 箱
254 | 篇
255 | 篓
256 | 篮
257 | 簇
258 | 米
259 | 类
260 | 粒
261 | 级
262 | 组
263 | 维
264 | 缕
265 | 缸
266 | 罐
267 | 网
268 | 群
269 | 股
270 | 脚
271 | 船
272 | 艇
273 | 艘
274 | 色
275 | 节
276 | 英亩
277 | 英寸
278 | 英尺
279 | 英里
280 | 行
281 | 袋
282 | 角
283 | 言
284 | 课
285 | 起
286 | 趟
287 | 路
288 | 车
289 | 转
290 | 轮
291 | 辆
292 | 辈
293 | 连
294 | 通
295 | 遍
296 | 部
297 | 里
298 | 重
299 | 针
300 | 钟
301 | 钱
302 | 锅
303 | 门
304 | 间
305 | 队
306 | 阶段
307 | 隅
308 | 集
309 | 页
310 | 顶
311 | 顷
312 | 项
313 | 顿
314 | 颗
315 | 餐
316 | 首
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.lucene;
26 |
27 | import java.io.Reader;
28 |
29 | import org.apache.lucene.analysis.Analyzer;
30 | import org.apache.lucene.analysis.Tokenizer;
31 |
32 | /**
33 | * IK analyzer: an implementation of the Lucene Analyzer interface,
34 | * compatible with Lucene 4.x
35 | */
36 | public final class IKAnalyzer extends Analyzer {
37 |
38 | private boolean useSmart;
39 |
40 | public boolean useSmart() {
41 | return useSmart;
42 | }
43 |
44 | public void setUseSmart(boolean useSmart) {
45 | this.useSmart = useSmart;
46 | }
47 |
48 | /**
49 | * IK Analyzer constructor
50 | *
51 | * Defaults to fine-grained segmentation
52 | */
53 | public IKAnalyzer() {
54 | this(false);
55 | }
56 |
57 | /**
58 | * IK Analyzer constructor
59 | *
60 | * @param useSmart true enables smart (coarse-grained) segmentation
61 | */
62 | public IKAnalyzer(boolean useSmart) {
63 | super();
64 | this.useSmart = useSmart;
65 | }
66 |
67 | /**
68 | * Override Analyzer.createComponents to build the tokenization chain
69 | */
70 | @Override
71 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
72 | Tokenizer _IKTokenizer = new IKTokenizer(in, this.useSmart());
73 | return new TokenStreamComponents(_IKTokenizer);
74 | }
75 |
76 | }
77 |
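A usage sketch against the Lucene 4.10 TokenStream API declared in this pom; the field name and sample text are arbitrary:

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerUsage {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new IKAnalyzer(true); // smart segmentation
        try (TokenStream ts = analyzer.tokenStream("content", new StringReader("这是一个中文分词的例子"))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset(); // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
        }
        analyzer.close();
    }
}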
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 |
25 | *
26 | */
27 | package org.wltea.analyzer.lucene;
28 |
29 | import java.io.IOException;
30 | import java.io.Reader;
31 |
32 | import org.apache.lucene.analysis.Tokenizer;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
36 |
37 | import org.wltea.analyzer.core.IKSegmenter;
38 | import org.wltea.analyzer.core.Lexeme;
39 |
40 | /**
41 | * IK Lucene Tokenizer adapter,
42 | * compatible with Lucene 4.x
43 | */
44 | public final class IKTokenizer extends Tokenizer {
45 |
46 | // the IK segmenter implementation
47 | private IKSegmenter _IKImplement;
48 |
49 | // term text attribute
50 | private final CharTermAttribute termAtt;
51 | // term offset attribute
52 | private final OffsetAttribute offsetAtt;
53 | // term type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
54 | private final TypeAttribute typeAtt;
55 | // end position of the last emitted lexeme
56 | private int endPosition;
57 |
58 | /**
59 | * Lucene 4.x Tokenizer adapter constructor
60 | * @param in
61 | * @param useSmart
62 | */
63 | public IKTokenizer(Reader in, boolean useSmart) {
64 | super(in);
65 | offsetAtt = addAttribute(OffsetAttribute.class);
66 | termAtt = addAttribute(CharTermAttribute.class);
67 | typeAtt = addAttribute(TypeAttribute.class);
68 | _IKImplement = new IKSegmenter(input, useSmart);
69 | }
70 |
71 | /*
72 | * (non-Javadoc)
73 | * @see org.apache.lucene.analysis.TokenStream#incrementToken()
74 | */
75 | @Override
76 | public boolean incrementToken() throws IOException {
77 | // clear all token attributes
78 | clearAttributes();
79 | Lexeme nextLexeme = _IKImplement.next();
80 | if (nextLexeme != null) {
81 | // convert the Lexeme into Lucene attributes
82 | // set the term text
83 | termAtt.append(nextLexeme.getLexemeText());
84 | // set the term length
85 | termAtt.setLength(nextLexeme.getLength());
86 | // set the term offsets
87 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
88 | // remember the end position of this lexeme
89 | endPosition = nextLexeme.getEndPosition();
90 | // set the term type
91 | typeAtt.setType(nextLexeme.getLexemeTypeString());
92 | // return true: more lexemes may follow
93 | return true;
94 | }
95 | // return false: no lexemes left
96 | return false;
97 | }
98 |
99 | /*
100 | * (non-Javadoc)
101 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
102 | */
103 | @Override
104 | public void reset() throws IOException {
105 | super.reset();
106 | _IKImplement.reset(input);
107 | }
108 |
109 | @Override
110 | public final void end() throws IOException {
111 | super.end(); // Lucene contract: let the superclass finish first
112 | int finalOffset = correctOffset(this.endPosition);
113 | offsetAtt.setOffset(finalOffset, finalOffset);
114 | }
115 | }
116 |
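The adapter delegates to IKSegmenter, which can also be used directly when Lucene is not involved. A minimal sketch built on the same constructor and next() calls used above:

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterUsage {
    public static void main(String[] args) throws IOException {
        // fine-grained segmentation, no Lucene adapter
        IKSegmenter seg = new IKSegmenter(new StringReader("IKAnalyzer是一个开源的中文分词工具包"), false);
        for (Lexeme lexeme = seg.next(); lexeme != null; lexeme = seg.next()) {
            System.out.println(lexeme); // prints "begin-end : text : TYPE"
        }
    }
}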
--------------------------------------------------------------------------------
/src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.query;
26 |
27 | import java.util.ArrayList;
28 | import java.util.LinkedList;
29 | import java.util.List;
30 | import java.util.Stack;
31 |
32 | import org.apache.lucene.index.Term;
33 | import org.apache.lucene.search.BooleanClause;
34 | import org.apache.lucene.search.BooleanQuery;
35 | import org.apache.lucene.search.Query;
36 | import org.apache.lucene.search.TermQuery;
37 | import org.apache.lucene.search.TermRangeQuery;
38 | import org.apache.lucene.search.BooleanClause.Occur;
39 | import org.apache.lucene.util.BytesRef;
40 |
41 | /**
42 | * IK simple query-expression parser,
43 | * built on the SWMCQuery algorithm
44 | *
45 | * Example expression:
46 | * (id='1231231' && title:'monkey') || (content:'你好吗' || url='www.ik.com') - name:'helloword'
47 | * @author linliangyi
48 | *
49 | */
50 | public class IKQueryExpressionParser {
51 |
52 | // public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],";
53 |
54 | private List<Element> elements = new ArrayList<Element>();
55 |
56 | private Stack<Query> querys = new Stack<Query>();
57 |
58 | private Stack<Element> operates = new Stack<Element>();
59 |
60 | /**
61 | * Parse a query expression into a Lucene Query object
62 | *
63 | * @param expression
64 | * @param quickMode
65 | * @return Lucene query
66 | */
67 | public Query parseExp(String expression, boolean quickMode) {
68 | Query luceneQuery = null;
69 | if (expression != null && !"".equals(expression.trim())) {
70 | try {
71 | // lexical pass
72 | this.splitElements(expression);
73 | // syntactic pass
74 | this.parseSyntax(quickMode);
75 | if (this.querys.size() == 1) {
76 | luceneQuery = this.querys.pop();
77 | } else {
78 | throw new IllegalStateException("expression error: missing logic operator or unbalanced bracket");
79 | }
80 | } finally {
81 | elements.clear();
82 | querys.clear();
83 | operates.clear();
84 | }
85 | }
86 | return luceneQuery;
87 | }
88 |
89 | /**
90 | * Lexical analysis of the expression
91 | * @param expression
92 | */
93 | private void splitElements(String expression) {
94 |
95 | if (expression == null) {
96 | return;
97 | }
98 | Element curretElement = null;
99 |
100 | char[] expChars = expression.toCharArray();
101 | for (int i = 0; i < expChars.length; i++) {
102 | switch (expChars[i]) {
103 | case '&':
104 | if (curretElement == null) {
105 | curretElement = new Element();
106 | curretElement.type = '&';
107 | curretElement.append(expChars[i]);
108 | } else if (curretElement.type == '&') {
109 | curretElement.append(expChars[i]);
110 | this.elements.add(curretElement);
111 | curretElement = null;
112 | } else if (curretElement.type == '\'') {
113 | curretElement.append(expChars[i]);
114 | } else {
115 | this.elements.add(curretElement);
116 | curretElement = new Element();
117 | curretElement.type = '&';
118 | curretElement.append(expChars[i]);
119 | }
120 | break;
121 |
122 | case '|':
123 | if (curretElement == null) {
124 | curretElement = new Element();
125 | curretElement.type = '|';
126 | curretElement.append(expChars[i]);
127 | } else if (curretElement.type == '|') {
128 | curretElement.append(expChars[i]);
129 | this.elements.add(curretElement);
130 | curretElement = null;
131 | } else if (curretElement.type == '\'') {
132 | curretElement.append(expChars[i]);
133 | } else {
134 | this.elements.add(curretElement);
135 | curretElement = new Element();
136 | curretElement.type = '|';
137 | curretElement.append(expChars[i]);
138 | }
139 | break;
140 |
141 | case '-':
142 | if (curretElement != null) {
143 | if (curretElement.type == '\'') {
144 | curretElement.append(expChars[i]);
145 | continue;
146 | } else {
147 | this.elements.add(curretElement);
148 | }
149 | }
150 | curretElement = new Element();
151 | curretElement.type = '-';
152 | curretElement.append(expChars[i]);
153 | this.elements.add(curretElement);
154 | curretElement = null;
155 | break;
156 |
157 | case '(':
158 | if (curretElement != null) {
159 | if (curretElement.type == '\'') {
160 | curretElement.append(expChars[i]);
161 | continue;
162 | } else {
163 | this.elements.add(curretElement);
164 | }
165 | }
166 | curretElement = new Element();
167 | curretElement.type = '(';
168 | curretElement.append(expChars[i]);
169 | this.elements.add(curretElement);
170 | curretElement = null;
171 | break;
172 |
173 | case ')':
174 | if (curretElement != null) {
175 | if (curretElement.type == '\'') {
176 | curretElement.append(expChars[i]);
177 | continue;
178 | } else {
179 | this.elements.add(curretElement);
180 | }
181 | }
182 | curretElement = new Element();
183 | curretElement.type = ')';
184 | curretElement.append(expChars[i]);
185 | this.elements.add(curretElement);
186 | curretElement = null;
187 | break;
188 |
189 | case ':':
190 | if (curretElement != null) {
191 | if (curretElement.type == '\'') {
192 | curretElement.append(expChars[i]);
193 | continue;
194 | } else {
195 | this.elements.add(curretElement);
196 | }
197 | }
198 | curretElement = new Element();
199 | curretElement.type = ':';
200 | curretElement.append(expChars[i]);
201 | this.elements.add(curretElement);
202 | curretElement = null;
203 | break;
204 |
205 | case '=':
206 | if (curretElement != null) {
207 | if (curretElement.type == '\'') {
208 | curretElement.append(expChars[i]);
209 | continue;
210 | } else {
211 | this.elements.add(curretElement);
212 | }
213 | }
214 | curretElement = new Element();
215 | curretElement.type = '=';
216 | curretElement.append(expChars[i]);
217 | this.elements.add(curretElement);
218 | curretElement = null;
219 | break;
220 |
221 | case ' ':
222 | if (curretElement != null) {
223 | if (curretElement.type == '\'') {
224 | curretElement.append(expChars[i]);
225 | } else {
226 | this.elements.add(curretElement);
227 | curretElement = null;
228 | }
229 | }
230 |
231 | break;
232 |
233 | case '\'':
234 | if (curretElement == null) {
235 | curretElement = new Element();
236 | curretElement.type = '\'';
237 |
238 | } else if (curretElement.type == '\'') {
239 | this.elements.add(curretElement);
240 | curretElement = null;
241 |
242 | } else {
243 | this.elements.add(curretElement);
244 | curretElement = new Element();
245 | curretElement.type = '\'';
246 |
247 | }
248 | break;
249 |
250 | case '[':
251 | if (curretElement != null) {
252 | if (curretElement.type == '\'') {
253 | curretElement.append(expChars[i]);
254 | continue;
255 | } else {
256 | this.elements.add(curretElement);
257 | }
258 | }
259 | curretElement = new Element();
260 | curretElement.type = '[';
261 | curretElement.append(expChars[i]);
262 | this.elements.add(curretElement);
263 | curretElement = null;
264 | break;
265 |
266 | case ']':
267 | if (curretElement != null) {
268 | if (curretElement.type == '\'') {
269 | curretElement.append(expChars[i]);
270 | continue;
271 | } else {
272 | this.elements.add(curretElement);
273 | }
274 | }
275 | curretElement = new Element();
276 | curretElement.type = ']';
277 | curretElement.append(expChars[i]);
278 | this.elements.add(curretElement);
279 | curretElement = null;
280 |
281 | break;
282 |
283 | case '{':
284 | if (curretElement != null) {
285 | if (curretElement.type == '\'') {
286 | curretElement.append(expChars[i]);
287 | continue;
288 | } else {
289 | this.elements.add(curretElement);
290 | }
291 | }
292 | curretElement = new Element();
293 | curretElement.type = '{';
294 | curretElement.append(expChars[i]);
295 | this.elements.add(curretElement);
296 | curretElement = null;
297 | break;
298 |
299 | case '}':
300 | if (curretElement != null) {
301 | if (curretElement.type == '\'') {
302 | curretElement.append(expChars[i]);
303 | continue;
304 | } else {
305 | this.elements.add(curretElement);
306 | }
307 | }
308 | curretElement = new Element();
309 | curretElement.type = '}';
310 | curretElement.append(expChars[i]);
311 | this.elements.add(curretElement);
312 | curretElement = null;
313 |
314 | break;
315 | case ',':
316 | if (curretElement != null) {
317 | if (curretElement.type == '\'') {
318 | curretElement.append(expChars[i]);
319 | continue;
320 | } else {
321 | this.elements.add(curretElement);
322 | }
323 | }
324 | curretElement = new Element();
325 | curretElement.type = ',';
326 | curretElement.append(expChars[i]);
327 | this.elements.add(curretElement);
328 | curretElement = null;
329 |
330 | break;
331 |
332 | default:
333 | if (curretElement == null) {
334 | curretElement = new Element();
335 | curretElement.type = 'F';
336 | curretElement.append(expChars[i]);
337 |
338 | } else if (curretElement.type == 'F') {
339 | curretElement.append(expChars[i]);
340 |
341 | } else if (curretElement.type == '\'') {
342 | curretElement.append(expChars[i]);
343 |
344 | } else {
345 | this.elements.add(curretElement);
346 | curretElement = new Element();
347 | curretElement.type = 'F';
348 | curretElement.append(expChars[i]);
349 | }
350 | }
351 | }
352 |
353 | if (curretElement != null) {
354 | this.elements.add(curretElement);
355 | curretElement = null;
356 | }
357 | }
358 |
359 | /**
360 | * Syntax parsing: evaluates the element list shunting-yard style, with an
361 | * operator stack (operates) and a query stack (querys)
362 | */
363 | private void parseSyntax(boolean quickMode) {
364 | for (int i = 0; i < this.elements.size(); i++) {
365 | Element e = this.elements.get(i);
366 | if ('F' == e.type) {
367 | Element e2 = this.elements.get(i + 1);
368 | if ('=' != e2.type && ':' != e2.type) {
369 | throw new IllegalStateException("Expression error: missing '=' or ':'");
370 | }
371 | Element e3 = this.elements.get(i + 2);
372 | // handle the '=' and ':' operators
373 | if ('\'' == e3.type) {
374 | i += 2;
375 | if ('=' == e2.type) {
376 | TermQuery tQuery = new TermQuery(new Term(e.toString(), e3.toString()));
377 | this.querys.push(tQuery);
378 | } else if (':' == e2.type) {
379 | String keyword = e3.toString();
380 | // build a SWMC (Single Word Multi Char) query
381 | Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword, quickMode);
382 | this.querys.push(_SWMCQuery);
383 | }
384 |
385 | } else if ('[' == e3.type || '{' == e3.type) {
386 | i += 2;
387 | // handle [] and {} range expressions
388 | LinkedList<Element> eQueue = new LinkedList<Element>();
389 | eQueue.add(e3);
390 | for (i++; i < this.elements.size(); i++) {
391 | Element eN = this.elements.get(i);
392 | eQueue.add(eN);
393 | if (']' == eN.type || '}' == eN.type) {
394 | break;
395 | }
396 | }
397 | // translate the collected elements into a TermRangeQuery
398 | Query rangeQuery = this.toTermRangeQuery(e, eQueue);
399 | this.querys.push(rangeQuery);
400 | } else {
401 | throw new IllegalStateException("Expression error: missing match value");
402 | }
403 |
404 | } else if ('(' == e.type) {
405 | this.operates.push(e);
406 |
407 | } else if (')' == e.type) {
408 | boolean doPop = true;
409 | while (doPop && !this.operates.empty()) {
410 | Element op = this.operates.pop();
411 | if ('(' == op.type) {
412 | doPop = false;
413 | } else {
414 | Query q = toBooleanQuery(op);
415 | this.querys.push(q);
416 | }
417 |
418 | }
419 | } else {
420 |
421 | if (this.operates.isEmpty()) {
422 | this.operates.push(e);
423 | } else {
424 | boolean doPeek = true;
425 | while (doPeek && !this.operates.isEmpty()) {
426 | Element eleOnTop = this.operates.peek();
427 | if ('(' == eleOnTop.type) {
428 | doPeek = false;
429 | this.operates.push(e);
430 | } else if (compare(e, eleOnTop) == 1) { // incoming operator binds tighter: push it
431 | this.operates.push(e);
432 | doPeek = false;
433 | } else if (compare(e, eleOnTop) == 0) { // equal precedence: reduce the operator on top
434 | Query q = toBooleanQuery(eleOnTop);
435 | this.operates.pop();
436 | this.querys.push(q);
437 | } else { // lower precedence: reduce the operator on top
438 | Query q = toBooleanQuery(eleOnTop);
439 | this.operates.pop();
440 | this.querys.push(q);
441 | }
442 | }
443 |
444 | if (doPeek && this.operates.empty()) {
445 | this.operates.push(e);
446 | }
447 | }
448 | }
449 | }
450 |
451 | while (!this.operates.isEmpty()) {
452 | Element eleOnTop = this.operates.pop();
453 | Query q = toBooleanQuery(eleOnTop);
454 | this.querys.push(q);
455 | }
456 | }
457 |
458 | /**
459 | * Build a BooleanQuery from a logical operator ('&', '|' or '-')
460 | * @param op
461 | * @return
462 | */
463 | private Query toBooleanQuery(Element op) {
464 | if (this.querys.size() == 0) {
465 | return null;
466 | }
467 |
468 | BooleanQuery resultQuery = new BooleanQuery();
469 |
470 | if (this.querys.size() == 1) {
471 | return this.querys.pop(); // pop so the caller's push does not duplicate it on the stack
472 | }
473 |
474 | Query q2 = this.querys.pop();
475 | Query q1 = this.querys.pop();
476 | if ('&' == op.type) {
477 | if (q1 != null) {
478 | if (q1 instanceof BooleanQuery) {
479 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses();
480 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) {
481 | for (BooleanClause c : clauses) {
482 | resultQuery.add(c);
483 | }
484 | } else {
485 | resultQuery.add(q1, Occur.MUST);
486 | }
487 |
488 | } else {
489 | // q1 instanceof TermQuery
490 | // q1 instanceof TermRangeQuery
491 | // q1 instanceof PhraseQuery
492 | // others
493 | resultQuery.add(q1, Occur.MUST);
494 | }
495 | }
496 |
497 | if (q2 != null) {
498 | if (q2 instanceof BooleanQuery) {
499 | BooleanClause[] clauses = ((BooleanQuery) q2).getClauses();
500 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) {
501 | for (BooleanClause c : clauses) {
502 | resultQuery.add(c);
503 | }
504 | } else {
505 | resultQuery.add(q2, Occur.MUST);
506 | }
507 |
508 | } else {
509 | // q2 instanceof TermQuery
510 | // q2 instanceof TermRangeQuery
511 | // q2 instanceof PhraseQuery
512 | // others
513 | resultQuery.add(q2, Occur.MUST);
514 | }
515 | }
516 |
517 | } else if ('|' == op.type) {
518 | if (q1 != null) {
519 | if (q1 instanceof BooleanQuery) {
520 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses();
521 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) {
522 | for (BooleanClause c : clauses) {
523 | resultQuery.add(c);
524 | }
525 | } else {
526 | resultQuery.add(q1, Occur.SHOULD);
527 | }
528 |
529 | } else {
530 | // q1 instanceof TermQuery
531 | // q1 instanceof TermRangeQuery
532 | // q1 instanceof PhraseQuery
533 | // others
534 | resultQuery.add(q1, Occur.SHOULD);
535 | }
536 | }
537 |
538 | if (q2 != null) {
539 | if (q2 instanceof BooleanQuery) {
540 | BooleanClause[] clauses = ((BooleanQuery) q2).getClauses();
541 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) {
542 | for (BooleanClause c : clauses) {
543 | resultQuery.add(c);
544 | }
545 | } else {
546 | resultQuery.add(q2, Occur.SHOULD);
547 | }
548 | } else {
549 | // q2 instanceof TermQuery
550 | // q2 instanceof TermRangeQuery
551 | // q2 instanceof PhraseQuery
552 | // others
553 | resultQuery.add(q2, Occur.SHOULD);
554 |
555 | }
556 | }
557 |
558 | } else if ('-' == op.type) {
559 | if (q1 == null || q2 == null) {
560 | throw new IllegalStateException("Expression error: mismatched number of subqueries");
561 | }
562 |
563 | if (q1 instanceof BooleanQuery) {
564 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses();
565 | if (clauses.length > 0) {
566 | for (BooleanClause c : clauses) {
567 | resultQuery.add(c);
568 | }
569 | } else {
570 | resultQuery.add(q1, Occur.MUST);
571 | }
572 |
573 | } else {
574 | // q1 instanceof TermQuery
575 | // q1 instanceof TermRangeQuery
576 | // q1 instanceof PhraseQuery
577 | // others
578 | resultQuery.add(q1, Occur.MUST);
579 | }
580 |
581 | resultQuery.add(q2, Occur.MUST_NOT);
582 | }
583 | return resultQuery;
584 | }
585 |
586 | /**
587 | * Assemble a TermRangeQuery from the bracketed range elements
588 | * @param elements
589 | * @return
590 | */
591 | private TermRangeQuery toTermRangeQuery(Element fieldNameEle, LinkedList<Element> elements) {
592 |
593 | boolean includeFirst = false;
594 | boolean includeLast = false;
595 | String firstValue = null;
596 | String lastValue = null;
597 | // check that the first element is '[' or '{'
598 | Element first = elements.getFirst();
599 | if ('[' == first.type) {
600 | includeFirst = true;
601 | } else if ('{' == first.type) {
602 | includeFirst = false;
603 | } else {
604 | throw new IllegalStateException("Expression error: range query must open with '[' or '{'");
605 | }
606 | // check that the last element is ']' or '}'
607 | Element last = elements.getLast();
608 | if (']' == last.type) {
609 | includeLast = true;
610 | } else if ('}' == last.type) {
611 | includeLast = false;
612 | } else {
613 | throw new IllegalStateException("Expression error: range query is missing its closing bracket");
614 | }
615 | if (elements.size() < 4 || elements.size() > 5) {
616 | throw new IllegalStateException("Expression error: malformed range query");
617 | }
618 | // read the middle part
619 | Element e2 = elements.get(1);
620 | if ('\'' == e2.type) {
621 | firstValue = e2.toString();
622 | //
623 | Element e3 = elements.get(2);
624 | if (',' != e3.type) {
625 | throw new IllegalStateException("Expression error: range query is missing the comma separator");
626 | }
627 | //
628 | Element e4 = elements.get(3);
629 | if ('\'' == e4.type) {
630 | lastValue = e4.toString();
631 | } else if (e4 != last) {
632 | throw new IllegalStateException("Expression error: malformed range query");
633 | }
634 | } else if (',' == e2.type) {
635 | firstValue = null;
636 | //
637 | Element e3 = elements.get(2);
638 | if ('\'' == e3.type) {
639 | lastValue = e3.toString();
640 | } else {
641 | throw new IllegalStateException("Expression error: malformed range query");
642 | }
643 |
644 | } else {
645 | throw new IllegalStateException("Expression error: malformed range query");
646 | }
647 |
648 | return new TermRangeQuery(fieldNameEle.toString(), firstValue == null ? null : new BytesRef(firstValue),
649 | lastValue == null ? null : new BytesRef(lastValue), includeFirst, includeLast); // a null bound means an open-ended range
650 | }
651 |
652 | /**
653 | * Compare operator precedence ('&' > '|' > '-'): 1 if e1 binds tighter than e2, 0 if equal, -1 if looser
654 | * @param e1
655 | * @param e2
656 | * @return
657 | */
658 | private int compare(Element e1, Element e2) {
659 | if ('&' == e1.type) {
660 | if ('&' == e2.type) {
661 | return 0;
662 | } else {
663 | return 1;
664 | }
665 | } else if ('|' == e1.type) {
666 | if ('&' == e2.type) {
667 | return -1;
668 | } else if ('|' == e2.type) {
669 | return 0;
670 | } else {
671 | return 1;
672 | }
673 | } else {
674 | if ('-' == e2.type) {
675 | return 0;
676 | } else {
677 | return -1;
678 | }
679 | }
680 | }
681 |
682 | /**
683 | * Expression element (operator, FieldName or FieldValue)
684 | * @author linliangyi
685 | * May 20, 2010
686 | */
687 | private class Element {
688 | char type = 0;
689 | StringBuffer eleTextBuff;
690 |
691 | public Element() {
692 | eleTextBuff = new StringBuffer();
693 | }
694 |
695 | public void append(char c) {
696 | this.eleTextBuff.append(c);
697 | }
698 |
699 | public String toString() {
700 | return this.eleTextBuff.toString();
701 | }
702 | }
703 |
704 | public static void main(String[] args) {
705 | IKQueryExpressionParser parser = new IKQueryExpressionParser();
706 | // String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'";
707 | String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'";
708 | Query result = parser.parseExp(ikQueryExp, true);
709 | System.out.println(result);
710 |
711 | }
712 |
713 | }
714 |
--------------------------------------------------------------------------------
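Note: parseExp(expression, quickMode) is the parser's entry point, as demonstrated in main() above: '=' yields a TermQuery, ':' a SWMC query, ['..','..'] / {'..','..'} an inclusive / exclusive TermRangeQuery, and operator precedence is '&&' > '||' > '-'. A minimal usage sketch (the field names here are hypothetical):

    IKQueryExpressionParser parser = new IKQueryExpressionParser();
    // '&&' binds tighter than '||'; '-' (AND-NOT) binds loosest
    Query query = parser.parseExp("title:'lucene' && (date:['20120101','20121231'] || id='X001')", true);
    System.out.println(query);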
/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | */
25 | package org.wltea.analyzer.query;
26 |
27 | import java.io.IOException;
28 | import java.io.StringReader;
29 | import java.util.ArrayList;
30 | import java.util.List;
31 |
32 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
33 | import org.apache.lucene.queryparser.classic.ParseException;
34 | import org.apache.lucene.queryparser.classic.QueryParser;
35 | import org.apache.lucene.search.Query;
36 | import org.apache.lucene.util.Version;
37 | import org.wltea.analyzer.core.IKSegmenter;
38 | import org.wltea.analyzer.core.Lexeme;
39 |
40 | /**
41 | * Single Word Multi Char Query Builder
42 | * Specific to the IK segmentation algorithm
43 | * @author linliangyi
44 | *
45 | */
46 | public class SWMCQueryBuilder {
47 |
48 | /**
49 | * Create a SWMC query
50 | * @param fieldName
51 | * @param keywords
52 | * @param quickMode
53 | * @return Lucene Query
54 | */
55 | public static Query create(String fieldName, String keywords, boolean quickMode) {
56 | if (fieldName == null || keywords == null) {
57 | throw new IllegalArgumentException("Parameters fieldName and keywords must not be null.");
58 | }
59 | // 1. segment the keywords
60 | List<Lexeme> lexemes = doAnalyze(keywords);
61 | // 2. build the SWMC query from the segmentation result
62 | Query _SWMCQuery = getSWMCQuery(fieldName, lexemes, quickMode);
63 | return _SWMCQuery;
64 | }
65 |
66 | /**
67 | * Segment the text and return the resulting lexeme list
68 | * @param keywords
69 | * @return
70 | */
71 | private static List<Lexeme> doAnalyze(String keywords) {
72 | List<Lexeme> lexemes = new ArrayList<Lexeme>();
73 | IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true);
74 | try {
75 | Lexeme l = null;
76 | while ((l = ikSeg.next()) != null) {
77 | lexemes.add(l);
78 | }
79 | } catch (IOException e) {
80 | e.printStackTrace();
81 | }
82 | return lexemes;
83 | }
84 |
85 | /**
86 | * Build the SWMC query from the segmentation result
87 | * @param fieldName
88 | * @param lexemes
89 | * @param quickMode
90 | * @return
91 | */
92 | private static Query getSWMCQuery(String fieldName, List<Lexeme> lexemes, boolean quickMode) {
93 | // full SWMC query expression
94 | StringBuffer keywordBuffer = new StringBuffer();
95 | // condensed SWMC query expression (multi-char lexemes only)
96 | StringBuffer keywordBuffer_Short = new StringBuffer();
97 | // length of the previous lexeme
98 | int lastLexemeLength = 0;
99 | // end position of the previous lexeme
100 | int lastLexemeEnd = -1;
101 |
102 | int shortCount = 0;
103 | int totalCount = 0;
104 | for (Lexeme l : lexemes) {
105 | totalCount += l.getLength();
106 | // condensed expression: keep only multi-char lexemes
107 | if (l.getLength() > 1) {
108 | keywordBuffer_Short.append(' ').append(l.getLexemeText());
109 | shortCount += l.getLength();
110 | }
111 |
112 | if (lastLexemeLength == 0) {
113 | keywordBuffer.append(l.getLexemeText());
114 | } else if (lastLexemeLength == 1 && l.getLength() == 1
115 | && lastLexemeEnd == l.getBeginPosition()) { // adjacent single-char lexemes: merge them
116 | keywordBuffer.append(l.getLexemeText());
117 | } else {
118 | keywordBuffer.append(' ').append(l.getLexemeText());
119 |
120 | }
121 | lastLexemeLength = l.getLength();
122 | lastLexemeEnd = l.getEndPosition();
123 | }
124 |
125 | // use the lucene queryparser to build the SWMC Query
126 | QueryParser qp = new QueryParser(Version.LUCENE_4_10_0, fieldName, new StandardAnalyzer(
127 | Version.LUCENE_4_10_0));
128 | qp.setDefaultOperator(QueryParser.AND_OPERATOR);
129 | qp.setAutoGeneratePhraseQueries(true);
130 |
131 | if (quickMode && (shortCount * 1.0f / totalCount) > 0.5f) {
132 | try {
133 | // System.out.println(keywordBuffer.toString());
134 | Query q = qp.parse(keywordBuffer_Short.toString());
135 | return q;
136 | } catch (ParseException e) {
137 | e.printStackTrace();
138 | }
139 |
140 | } else {
141 | if (keywordBuffer.length() > 0) {
142 | try {
143 | // System.out.println(keywordBuffer.toString());
144 | Query q = qp.parse(keywordBuffer.toString());
145 | return q;
146 | } catch (ParseException e) {
147 | e.printStackTrace();
148 | }
149 | }
150 | }
151 | return null;
152 | }
153 | }
154 |
--------------------------------------------------------------------------------
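Note: SWMCQueryBuilder.create(fieldName, keywords, quickMode) can also be called directly, outside the expression parser. A minimal sketch (the field name and keywords are hypothetical):

    // with quickMode=true the builder parses the condensed expression whenever
    // multi-char lexemes cover more than half of the input characters
    Query swmcQuery = SWMCQueryBuilder.create("content", "中文分词工具包", true);
    System.out.println(swmcQuery);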
/src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0.1
3 | * IK Analyzer release 5.0.1
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.sample;
27 |
28 | import java.io.IOException;
29 | import java.io.StringReader;
30 |
31 | import org.apache.lucene.analysis.Analyzer;
32 | import org.apache.lucene.analysis.TokenStream;
33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
36 | import org.wltea.analyzer.lucene.IKAnalyzer;
37 |
38 | /**
39 | * A demo of text segmentation with IKAnalyzer
40 | * 2012-10-22
41 | *
42 | */
43 | public class IKAnalzyerDemo {
44 |
45 | public static void main(String[] args) {
46 | // build the IK analyzer in smart segmentation mode
47 | Analyzer analyzer = new IKAnalyzer(true);
48 |
49 | // obtain Lucene's TokenStream
50 | TokenStream ts = null;
51 | try {
52 | ts = analyzer.tokenStream("myfield", new StringReader(
53 | "这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
54 | // lexeme offset attribute
55 | OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
56 | // lexeme text attribute
57 | CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
58 | // lexeme type attribute
59 | TypeAttribute type = ts.addAttribute(TypeAttribute.class);
60 |
61 | // reset the TokenStream (resets the StringReader)
62 | ts.reset();
63 | // iterate over the segmentation results
64 | while (ts.incrementToken()) {
65 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : "
66 | + term.toString() + " | " + type.type());
67 | }
68 | // finish the TokenStream
69 | ts.end(); // perform end-of-stream operations, e.g. set the final offset
70 |
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | } finally {
74 | // release all TokenStream resources
75 | if (ts != null) {
76 | try {
77 | ts.close();
78 | } catch (IOException e) {
79 | e.printStackTrace();
80 | }
81 | }
82 | }
83 |
84 | }
85 |
86 | }
87 |
--------------------------------------------------------------------------------
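Note: the demo above runs in smart mode (useSmart=true). To compare smart and fine-grained segmentation, only the constructor flag changes. A small sketch reusing the demo's own imports (the helper name is hypothetical):

    // print the lexemes one analyzer mode produces for the given text
    static void printTokens(Analyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.print(term.toString() + " | ");
        }
        ts.end();
        ts.close();
        System.out.println();
    }

    // printTokens(new IKAnalyzer(true), text);   // smart mode: coarse-grained, disambiguated
    // printTokens(new IKAnalyzer(false), text);  // fine-grained mode: emits overlapping lexemes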
/src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java:
--------------------------------------------------------------------------------
1 | /**
2 | * IK 中文分词 版本 5.0
3 | * IK Analyzer release 5.0
4 | *
5 | * Licensed to the Apache Software Foundation (ASF) under one or more
6 | * contributor license agreements. See the NOTICE file distributed with
7 | * this work for additional information regarding copyright ownership.
8 | * The ASF licenses this file to You under the Apache License, Version 2.0
9 | * (the "License"); you may not use this file except in compliance with
10 | * the License. You may obtain a copy of the License at
11 | *
12 | * http://www.apache.org/licenses/LICENSE-2.0
13 | *
14 | * Unless required by applicable law or agreed to in writing, software
15 | * distributed under the License is distributed on an "AS IS" BASIS,
16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 | * See the License for the specific language governing permissions and
18 | * limitations under the License.
19 | *
20 | * 源代码由林良益(linliangyi2005@gmail.com)提供
21 | * 版权声明 2012,乌龙茶工作室
22 | * provided by Linliangyi and copyright 2012 by Oolong studio
23 | *
24 | *
25 | */
26 | package org.wltea.analyzer.sample;
27 |
28 | import java.io.IOException;
29 |
30 | import org.apache.lucene.analysis.Analyzer;
31 | import org.apache.lucene.document.Document;
32 | import org.apache.lucene.document.Field;
33 | import org.apache.lucene.document.StringField;
34 | import org.apache.lucene.document.TextField;
35 | import org.apache.lucene.index.CorruptIndexException;
36 | import org.apache.lucene.index.DirectoryReader;
37 | import org.apache.lucene.index.IndexReader;
38 | import org.apache.lucene.index.IndexWriter;
39 | import org.apache.lucene.index.IndexWriterConfig;
40 | import org.apache.lucene.index.IndexWriterConfig.OpenMode;
41 | import org.apache.lucene.queryparser.classic.ParseException;
42 | import org.apache.lucene.queryparser.classic.QueryParser;
43 | import org.apache.lucene.search.IndexSearcher;
44 | import org.apache.lucene.search.Query;
45 | import org.apache.lucene.search.ScoreDoc;
46 | import org.apache.lucene.search.TopDocs;
47 | import org.apache.lucene.store.Directory;
48 | import org.apache.lucene.store.LockObtainFailedException;
49 | import org.apache.lucene.store.RAMDirectory;
50 | import org.apache.lucene.util.Version;
51 | import org.wltea.analyzer.lucene.IKAnalyzer;
52 |
53 | /**
54 | * A demo of Lucene indexing and search with IKAnalyzer
55 | * 2012-3-2
56 | *
57 | * Written against the Lucene 4.x API
58 | *
59 | */
60 | public class LuceneIndexAndSearchDemo {
61 |
62 | /**
63 | * Simulation:
64 | * create an index with a single record and search it
65 | * @param args
66 | */
67 | public static void main(String[] args) {
68 | // Lucene Document field name
69 | String fieldName = "text";
70 | // content to index and search
71 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
72 |
73 | // instantiate the IKAnalyzer (smart mode)
74 | Analyzer analyzer = new IKAnalyzer(true);
75 |
76 | Directory directory = null;
77 | IndexWriter iwriter = null;
78 | IndexReader ireader = null;
79 | IndexSearcher isearcher = null;
80 | try {
81 | // create an in-memory index
82 | directory = new RAMDirectory();
83 |
84 | // configure the IndexWriter
85 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
86 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
87 | iwriter = new IndexWriter(directory, iwConfig);
88 | // write the document to the index
89 | Document doc = new Document();
90 | doc.add(new StringField("ID", "10000", Field.Store.YES));
91 | doc.add(new TextField(fieldName, text, Field.Store.YES));
92 | iwriter.addDocument(doc);
93 | iwriter.close();
94 |
95 | // search process ***************************
96 | // instantiate the searcher
97 | ireader = DirectoryReader.open(directory);
98 | isearcher = new IndexSearcher(ireader);
99 |
100 | String keyword = "中文分词工具包";
101 | // build the Query object with QueryParser
102 | QueryParser qp = new QueryParser(Version.LUCENE_4_10_0, fieldName, analyzer);
103 | qp.setDefaultOperator(QueryParser.AND_OPERATOR);
104 | Query query = qp.parse(keyword);
105 | System.out.println("Query = " + query);
106 |
107 | // retrieve the 5 most relevant records
108 | TopDocs topDocs = isearcher.search(query, 5);
109 | System.out.println("命中:" + topDocs.totalHits);
110 | // 输出结果
111 | ScoreDoc[] scoreDocs = topDocs.scoreDocs;
112 | for (int i = 0; i < topDocs.totalHits; i++) {
113 | Document targetDoc = isearcher.doc(scoreDocs[i].doc);
114 | System.out.println("内容:" + targetDoc.toString());
115 | }
116 |
117 | } catch (CorruptIndexException e) {
118 | e.printStackTrace();
119 | } catch (LockObtainFailedException e) {
120 | e.printStackTrace();
121 | } catch (IOException e) {
122 | e.printStackTrace();
123 | } catch (ParseException e) {
124 | e.printStackTrace();
125 | } finally {
126 | if (ireader != null) {
127 | try {
128 | ireader.close();
129 | } catch (IOException e) {
130 | e.printStackTrace();
131 | }
132 | }
133 | if (directory != null) {
134 | try {
135 | directory.close();
136 | } catch (IOException e) {
137 | e.printStackTrace();
138 | }
139 | }
140 | }
141 | }
142 | }
143 |
--------------------------------------------------------------------------------
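Closing note: the search demo builds its Query with Lucene's QueryParser; the IK query classes above can be swapped in. A hypothetical sketch against the demo's "text" field:

    // a SWMC query instead of QueryParser
    Query swmc = SWMCQueryBuilder.create("text", "中文分词工具包", true);
    // or the full IK expression syntax
    Query exp = new IKQueryExpressionParser().parseExp("text:'中文分词' || ID='10000'", true);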