├── README.md ├── pom.xml └── src └── main └── java └── org └── wltea └── analyzer ├── cfg ├── Configuration.java └── DefaultConfig.java ├── core ├── AnalyzeContext.java ├── CJKSegmenter.java ├── CN_QuantifierSegmenter.java ├── CharacterUtil.java ├── IKArbitrator.java ├── IKSegmenter.java ├── ISegmenter.java ├── LetterSegmenter.java ├── Lexeme.java ├── LexemePath.java └── QuickSortSet.java ├── dic ├── DictSegment.java ├── Dictionary.java ├── Hit.java ├── main2012.dic └── quantifier.dic ├── lucene ├── IKAnalyzer.java └── IKTokenizer.java ├── query ├── IKQueryExpressionParser.java └── SWMCQueryBuilder.java └── sample ├── IKAnalzyerDemo.java └── LuceneIndexAndSearchDemo.java /README.md: -------------------------------------------------------------------------------- 1 | IKAnalyzer 2 | ========== 3 | 4 | An open source word breaker with lucene supported. See http://code.google.com/p/ik-analyzer/ 5 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | org.wltea.analyzer 5 | ikanalyzer 6 | 4.10.0 7 | jar 8 | 9 | IKAnalyzer 10 | http://maven.apache.org 11 | 12 | 13 | 4.10.0 14 | UTF-8 15 | 16 | 17 | 18 | 19 | org.apache.lucene 20 | lucene-core 21 | ${lucene.version} 22 | 23 | 24 | org.apache.lucene 25 | lucene-queryparser 26 | ${lucene.version} 27 | 28 | 29 | org.apache.lucene 30 | lucene-analyzers-common 31 | ${lucene.version} 32 | 33 | 34 | 35 | 36 | 37 | 38 | src/main/java/org/wltea/analyzer/dic 39 | org/wltea/analyzer/dic 40 | true 41 | 42 | *.dic 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.cfg; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * 配置管理类接口 32 | * 33 | */ 34 | public interface Configuration { 35 | 36 | /** 37 | * 返回useSmart标志位 38 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 39 | * @return useSmart 40 | */ 41 | public boolean useSmart(); 42 | 43 | /** 44 | * 设置useSmart标志位 45 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 46 | * @param useSmart 47 | */ 48 | public void setUseSmart(boolean useSmart); 49 | 50 | /** 51 | * 获取主词典路径 52 | * 53 | * @return String 主词典路径 54 | */ 55 | public String getMainDictionary(); 56 | 57 | /** 58 | * 获取量词词典路径 59 | * @return String 量词词典路径 60 | */ 61 | public String getQuantifierDicionary(); 62 | 63 | /** 64 | * 获取扩展字典配置路径 65 | * @return List 相对类加载器的路径 66 | */ 67 | public List getExtDictionarys(); 68 | 69 | /** 70 | * 获取扩展停止词典配置路径 71 | * @return List 相对类加载器的路径 72 | */ 73 | public List getExtStopWordDictionarys(); 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.cfg; 27 | 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.util.ArrayList; 31 | import java.util.InvalidPropertiesFormatException; 32 | import java.util.List; 33 | import java.util.Properties; 34 | 35 | /** 36 | * Configuration 默认实现 37 | * 2012-5-8 38 | * 39 | */ 40 | public class DefaultConfig implements Configuration { 41 | 42 | /* 43 | * 分词器默认字典路径 44 | */ 45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic"; 46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic"; 47 | 48 | /* 49 | * 分词器配置文件路径 50 | */ 51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; 52 | // 配置属性——扩展字典 53 | private static final String EXT_DICT = "ext_dict"; 54 | // 配置属性——扩展停止词典 55 | private static final String EXT_STOP = "ext_stopwords"; 56 | 57 | private Properties props; 58 | /* 59 | * 是否使用smart方式分词 60 | */ 61 | private boolean useSmart; 62 | 63 | /** 64 | * 返回单例 65 | * @return Configuration单例 66 | */ 67 | public static Configuration getInstance() { 68 | return new DefaultConfig(); 69 | } 70 | 71 | /* 72 | * 初始化配置文件 73 | */ 74 | private DefaultConfig() { 75 | props = new Properties(); 76 | 77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME); 78 | if (input != null) { 79 | try { 80 | props.loadFromXML(input); 81 | } catch (InvalidPropertiesFormatException e) { 82 | e.printStackTrace(); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | 89 | /** 90 | * 返回useSmart标志位 91 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 92 | * @return useSmart 93 | */ 94 | public boolean useSmart() { 95 | return useSmart; 96 | } 97 | 98 | /** 99 | * 设置useSmart标志位 100 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 101 | * @param useSmart 102 | */ 103 | public void setUseSmart(boolean useSmart) { 104 | this.useSmart = useSmart; 105 | } 106 | 107 | /** 108 | * 获取主词典路径 109 | * 110 | * @return String 主词典路径 111 | */ 112 | public String getMainDictionary() { 113 | return PATH_DIC_MAIN; 114 | } 115 | 116 | /** 117 | * 获取量词词典路径 118 | * @return String 量词词典路径 119 | */ 120 | public String getQuantifierDicionary() { 121 | return PATH_DIC_QUANTIFIER; 122 | } 123 | 124 | /** 125 | * 获取扩展字典配置路径 126 | * @return List 相对类加载器的路径 127 | */ 128 | public List getExtDictionarys() { 129 | List extDictFiles = new ArrayList(2); 130 | String extDictCfg = props.getProperty(EXT_DICT); 131 | if (extDictCfg != null) { 132 | // 使用;分割多个扩展字典配置 133 | String[] filePaths = extDictCfg.split(";"); 134 | if (filePaths != null) { 135 | for (String filePath : filePaths) { 136 | if (filePath != null && !"".equals(filePath.trim())) { 137 | extDictFiles.add(filePath.trim()); 138 | } 139 | } 140 | } 141 | } 142 | return extDictFiles; 143 | } 144 | 145 | /** 146 | * 获取扩展停止词典配置路径 147 | * @return List 相对类加载器的路径 148 | */ 149 | public List getExtStopWordDictionarys() { 150 | List extStopWordDictFiles = new ArrayList(2); 151 | String extStopWordDictCfg = props.getProperty(EXT_STOP); 152 | if (extStopWordDictCfg != null) { 153 | // 使用;分割多个扩展字典配置 154 | String[] filePaths = extStopWordDictCfg.split(";"); 155 | if (filePaths != null) { 156 | for (String filePath : filePaths) { 157 | if (filePath != null && !"".equals(filePath.trim())) { 158 | extStopWordDictFiles.add(filePath.trim()); 159 | } 160 | } 161 | } 162 | } 163 | return extStopWordDictFiles; 164 | } 165 | 166 | } 167 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/AnalyzeContext.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.io.IOException; 28 | import java.io.Reader; 29 | import java.util.HashMap; 30 | import java.util.HashSet; 31 | import java.util.LinkedList; 32 | import java.util.Map; 33 | import java.util.Set; 34 | 35 | import org.wltea.analyzer.cfg.Configuration; 36 | import org.wltea.analyzer.dic.Dictionary; 37 | 38 | /** 39 | * 40 | * 分词器上下文状态 41 | * 42 | */ 43 | class AnalyzeContext { 44 | 45 | // 默认缓冲区大小 46 | private static final int BUFF_SIZE = 4096; 47 | // 缓冲区耗尽的临界值 48 | private static final int BUFF_EXHAUST_CRITICAL = 100; 49 | 50 | // 字符窜读取缓冲 51 | private char[] segmentBuff; 52 | // 字符类型数组 53 | private int[] charTypes; 54 | 55 | // 记录Reader内已分析的字串总长度 56 | // 在分多段分析词元时,该变量累计当前的segmentBuff相对于reader起始位置的位移 57 | private int buffOffset; 58 | // 当前缓冲区位置指针 59 | private int cursor; 60 | // 最近一次读入的,可处理的字串长度 61 | private int available; 62 | 63 | // 子分词器锁 64 | // 该集合非空,说明有子分词器在占用segmentBuff 65 | private Set buffLocker; 66 | 67 | // 原始分词结果集合,未经歧义处理 68 | private QuickSortSet orgLexemes; 69 | // LexemePath位置索引表 70 | private Map pathMap; 71 | // 最终分词结果集 72 | private LinkedList results; 73 | 74 | // 分词器配置项 75 | private Configuration cfg; 76 | 77 | public AnalyzeContext(Configuration cfg) { 78 | this.cfg = cfg; 79 | this.segmentBuff = new char[BUFF_SIZE]; 80 | this.charTypes = new int[BUFF_SIZE]; 81 | this.buffLocker = new HashSet(); 82 | this.orgLexemes = new QuickSortSet(); 83 | this.pathMap = new HashMap(); 84 | this.results = new LinkedList(); 85 | } 86 | 87 | int getCursor() { 88 | return this.cursor; 89 | } 90 | 91 | // 92 | // void setCursor(int cursor){ 93 | // this.cursor = cursor; 94 | // } 95 | 96 | char[] getSegmentBuff() { 97 | return this.segmentBuff; 98 | } 99 | 100 | char getCurrentChar() { 101 | return this.segmentBuff[this.cursor]; 102 | } 103 | 104 | int getCurrentCharType() { 105 | return this.charTypes[this.cursor]; 106 | } 107 | 108 | int getBufferOffset() { 109 | return this.buffOffset; 110 | } 111 | 112 | /** 113 | * 根据context的上下文情况,填充segmentBuff 114 | * @param reader 115 | * @return 返回待分析的(有效的)字串长度 116 | * @throws IOException 117 | */ 118 | int fillBuffer(Reader reader) throws IOException { 119 | int readCount = 0; 120 | if (this.buffOffset == 0) { 121 | // 首次读取reader 122 | readCount = reader.read(segmentBuff); 123 | } else { 124 | int offset = this.available - this.cursor; 125 | if (offset > 0) { 126 | // 最近一次读取的>最近一次处理的,将未处理的字串拷贝到segmentBuff头部 127 | System.arraycopy(this.segmentBuff, this.cursor, this.segmentBuff, 0, offset); 128 | readCount = offset; 129 | } 130 | // 继续读取reader ,以onceReadIn - onceAnalyzed为起始位置,继续填充segmentBuff剩余的部分 131 | readCount += reader.read(this.segmentBuff, offset, BUFF_SIZE - offset); 132 | } 133 | // 记录最后一次从Reader中读入的可用字符长度 134 | this.available = readCount; 135 | // 重置当前指针 136 | this.cursor = 0; 137 | return readCount; 138 | } 139 | 140 | /** 141 | * 初始化buff指针,处理第一个字符 142 | */ 143 | void initCursor() { 144 | this.cursor = 0; 145 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 146 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 147 | } 148 | 149 | /** 150 | * 指针+1 151 | * 成功返回 true; 指针已经到了buff尾部,不能前进,返回false 152 | * 并处理当前字符 153 | */ 154 | boolean moveCursor() { 155 | if (this.cursor < this.available - 1) { 156 | this.cursor++; 157 | this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]); 158 | this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]); 159 | return true; 160 | } else { 161 | return false; 162 | } 163 | } 164 | 165 | /** 166 | * 设置当前segmentBuff为锁定状态 167 | * 加入占用segmentBuff的子分词器名称,表示占用segmentBuff 168 | * @param segmenterName 169 | */ 170 | void lockBuffer(String segmenterName) { 171 | this.buffLocker.add(segmenterName); 172 | } 173 | 174 | /** 175 | * 移除指定的子分词器名,释放对segmentBuff的占用 176 | * @param segmenterName 177 | */ 178 | void unlockBuffer(String segmenterName) { 179 | this.buffLocker.remove(segmenterName); 180 | } 181 | 182 | /** 183 | * 只要buffLocker中存在segmenterName 184 | * 则buffer被锁定 185 | * @return boolean 缓冲去是否被锁定 186 | */ 187 | boolean isBufferLocked() { 188 | return this.buffLocker.size() > 0; 189 | } 190 | 191 | /** 192 | * 判断当前segmentBuff是否已经用完 193 | * 当前执针cursor移至segmentBuff末端this.available - 1 194 | * @return 195 | */ 196 | boolean isBufferConsumed() { 197 | return this.cursor == this.available - 1; 198 | } 199 | 200 | /** 201 | * 判断segmentBuff是否需要读取新数据 202 | * 203 | * 满足一下条件时, 204 | * 1.available == BUFF_SIZE 表示buffer满载 205 | * 2.buffIndex < available - 1 && buffIndex > available - BUFF_EXHAUST_CRITICAL表示当前指针处于临界区内 206 | * 3.!context.isBufferLocked()表示没有segmenter在占用buffer 207 | * 要中断当前循环(buffer要进行移位,并再读取数据的操作) 208 | * @return 209 | */ 210 | boolean needRefillBuffer() { 211 | return this.available == BUFF_SIZE && this.cursor < this.available - 1 212 | && this.cursor > this.available - BUFF_EXHAUST_CRITICAL && !this.isBufferLocked(); 213 | } 214 | 215 | /** 216 | * 累计当前的segmentBuff相对于reader起始位置的位移 217 | */ 218 | void markBufferOffset() { 219 | this.buffOffset += this.cursor; 220 | } 221 | 222 | /** 223 | * 向分词结果集添加词元 224 | * @param lexeme 225 | */ 226 | void addLexeme(Lexeme lexeme) { 227 | this.orgLexemes.addLexeme(lexeme); 228 | } 229 | 230 | /** 231 | * 添加分词结果路径 232 | * 路径起始位置 ---> 路径 映射表 233 | * @param path 234 | */ 235 | void addLexemePath(LexemePath path) { 236 | if (path != null) { 237 | this.pathMap.put(path.getPathBegin(), path); 238 | } 239 | } 240 | 241 | /** 242 | * 返回原始分词结果 243 | * @return 244 | */ 245 | QuickSortSet getOrgLexemes() { 246 | return this.orgLexemes; 247 | } 248 | 249 | /** 250 | * 推送分词结果到结果集合 251 | * 1.从buff头部遍历到this.cursor已处理位置 252 | * 2.将map中存在的分词结果推入results 253 | * 3.将map中不存在的CJDK字符以单字方式推入results 254 | */ 255 | void outputToResult() { 256 | int index = 0; 257 | for (; index <= this.cursor;) { 258 | // 跳过非CJK字符 259 | if (CharacterUtil.CHAR_USELESS == this.charTypes[index]) { 260 | index++; 261 | continue; 262 | } 263 | // 从pathMap找出对应index位置的LexemePath 264 | LexemePath path = this.pathMap.get(index); 265 | if (path != null) { 266 | // 输出LexemePath中的lexeme到results集合 267 | Lexeme l = path.pollFirst(); 268 | while (l != null) { 269 | this.results.add(l); 270 | // 将index移至lexeme后 271 | index = l.getBegin() + l.getLength(); 272 | l = path.pollFirst(); 273 | if (l != null) { 274 | // 输出path内部,词元间遗漏的单字 275 | for (; index < l.getBegin(); index++) { 276 | this.outputSingleCJK(index); 277 | } 278 | } 279 | } 280 | } else {// pathMap中找不到index对应的LexemePath 281 | // 单字输出 282 | this.outputSingleCJK(index); 283 | index++; 284 | } 285 | } 286 | // 清空当前的Map 287 | this.pathMap.clear(); 288 | } 289 | 290 | /** 291 | * 对CJK字符进行单字输出 292 | * @param index 293 | */ 294 | private void outputSingleCJK(int index) { 295 | if (CharacterUtil.CHAR_CHINESE == this.charTypes[index]) { 296 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_CNCHAR); 297 | this.results.add(singleCharLexeme); 298 | } else if (CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]) { 299 | Lexeme singleCharLexeme = new Lexeme(this.buffOffset, index, 1, Lexeme.TYPE_OTHER_CJK); 300 | this.results.add(singleCharLexeme); 301 | } 302 | } 303 | 304 | /** 305 | * 返回lexeme 306 | * 307 | * 同时处理合并 308 | * @return 309 | */ 310 | Lexeme getNextLexeme() { 311 | // 从结果集取出,并移除第一个Lexme 312 | Lexeme result = this.results.pollFirst(); 313 | while (result != null) { 314 | // 数量词合并 315 | this.compound(result); 316 | if (Dictionary.getSingleton().isStopWord(this.segmentBuff, result.getBegin(), 317 | result.getLength())) { 318 | // 是停止词继续取列表的下一个 319 | result = this.results.pollFirst(); 320 | } else { 321 | // 不是停止词, 生成lexeme的词元文本,输出 322 | result.setLexemeText(String.valueOf(segmentBuff, result.getBegin(), result.getLength())); 323 | break; 324 | } 325 | } 326 | return result; 327 | } 328 | 329 | /** 330 | * 重置分词上下文状态 331 | */ 332 | void reset() { 333 | this.buffLocker.clear(); 334 | this.orgLexemes = new QuickSortSet(); 335 | this.available = 0; 336 | this.buffOffset = 0; 337 | this.charTypes = new int[BUFF_SIZE]; 338 | this.cursor = 0; 339 | this.results.clear(); 340 | this.segmentBuff = new char[BUFF_SIZE]; 341 | this.pathMap.clear(); 342 | } 343 | 344 | /** 345 | * 组合词元 346 | */ 347 | private void compound(Lexeme result) { 348 | if (!this.cfg.useSmart()) { 349 | return; 350 | } 351 | // 数量词合并处理 352 | if (!this.results.isEmpty()) { 353 | 354 | if (Lexeme.TYPE_ARABIC == result.getLexemeType()) { 355 | Lexeme nextLexeme = this.results.peekFirst(); 356 | boolean appendOk = false; 357 | if (Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()) { 358 | // 合并英文数词+中文数词 359 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM); 360 | } else if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { 361 | // 合并英文数词+中文量词 362 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 363 | } 364 | if (appendOk) { 365 | // 弹出 366 | this.results.pollFirst(); 367 | } 368 | } 369 | 370 | // 可能存在第二轮合并 371 | if (Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()) { 372 | Lexeme nextLexeme = this.results.peekFirst(); 373 | boolean appendOk = false; 374 | if (Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()) { 375 | // 合并中文数词+中文量词 376 | appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN); 377 | } 378 | if (appendOk) { 379 | // 弹出 380 | this.results.pollFirst(); 381 | } 382 | } 383 | 384 | } 385 | } 386 | 387 | } 388 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.LinkedList; 28 | import java.util.List; 29 | 30 | import org.wltea.analyzer.dic.Dictionary; 31 | import org.wltea.analyzer.dic.Hit; 32 | 33 | /** 34 | * 中文-日韩文子分词器 35 | */ 36 | class CJKSegmenter implements ISegmenter { 37 | 38 | // 子分词器标签 39 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 40 | // 待处理的分词hit队列 41 | private List tmpHits; 42 | 43 | CJKSegmenter() { 44 | this.tmpHits = new LinkedList(); 45 | } 46 | 47 | /* 48 | * (non-Javadoc) 49 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 50 | */ 51 | public void analyze(AnalyzeContext context) { 52 | if (CharacterUtil.CHAR_USELESS != context.getCurrentCharType()) { 53 | 54 | // 优先处理tmpHits中的hit 55 | if (!this.tmpHits.isEmpty()) { 56 | // 处理词段队列 57 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 58 | for (Hit hit : tmpArray) { 59 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), 60 | context.getCursor(), hit); 61 | if (hit.isMatch()) { 62 | // 输出当前的词 63 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), 64 | context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_CNWORD); 65 | context.addLexeme(newLexeme); 66 | 67 | if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除 68 | this.tmpHits.remove(hit); 69 | } 70 | 71 | } else if (hit.isUnmatch()) { 72 | // hit不是词,移除 73 | this.tmpHits.remove(hit); 74 | } 75 | } 76 | } 77 | 78 | // ********************************* 79 | // 再对当前指针位置的字符进行单字匹配 80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), 81 | context.getCursor(), 1); 82 | if (singleCharHit.isMatch()) {// 首字成词 83 | // 输出当前的词 84 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, 85 | Lexeme.TYPE_CNWORD); 86 | context.addLexeme(newLexeme); 87 | 88 | // 同时也是词前缀 89 | if (singleCharHit.isPrefix()) { 90 | // 前缀匹配则放入hit列表 91 | this.tmpHits.add(singleCharHit); 92 | } 93 | } else if (singleCharHit.isPrefix()) {// 首字为词前缀 94 | // 前缀匹配则放入hit列表 95 | this.tmpHits.add(singleCharHit); 96 | } 97 | 98 | } else { 99 | // 遇到CHAR_USELESS字符 100 | // 清空队列 101 | this.tmpHits.clear(); 102 | } 103 | 104 | // 判断缓冲区是否已经读完 105 | if (context.isBufferConsumed()) { 106 | // 清空队列 107 | this.tmpHits.clear(); 108 | } 109 | 110 | // 判断是否锁定缓冲区 111 | if (this.tmpHits.size() == 0) { 112 | context.unlockBuffer(SEGMENTER_NAME); 113 | 114 | } else { 115 | context.lockBuffer(SEGMENTER_NAME); 116 | } 117 | } 118 | 119 | /* 120 | * (non-Javadoc) 121 | * @see org.wltea.analyzer.core.ISegmenter#reset() 122 | */ 123 | public void reset() { 124 | // 清空队列 125 | this.tmpHits.clear(); 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.HashSet; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Set; 31 | 32 | import org.wltea.analyzer.dic.Dictionary; 33 | import org.wltea.analyzer.dic.Hit; 34 | 35 | /** 36 | * 37 | * 中文数量词子分词器 38 | */ 39 | class CN_QuantifierSegmenter implements ISegmenter { 40 | 41 | // 子分词器标签 42 | static final String SEGMENTER_NAME = "QUAN_SEGMENTER"; 43 | 44 | // 中文数词 45 | private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿";// Cnum 46 | private static Set ChnNumberChars = new HashSet(); 47 | static { 48 | char[] ca = Chn_Num.toCharArray(); 49 | for (char nChar : ca) { 50 | ChnNumberChars.add(nChar); 51 | } 52 | } 53 | 54 | /* 55 | * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 56 | */ 57 | private int nStart; 58 | /* 59 | * 记录词元结束位置 end记录的是在词元中最后一个出现的合理的数词结束 60 | */ 61 | private int nEnd; 62 | 63 | // 待处理的量词hit队列 64 | private List countHits; 65 | 66 | CN_QuantifierSegmenter() { 67 | nStart = -1; 68 | nEnd = -1; 69 | this.countHits = new LinkedList(); 70 | } 71 | 72 | /** 73 | * 分词 74 | */ 75 | public void analyze(AnalyzeContext context) { 76 | // 处理中文数词 77 | this.processCNumber(context); 78 | // 处理中文量词 79 | this.processCount(context); 80 | 81 | // 判断是否锁定缓冲区 82 | if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) { 83 | // 对缓冲区解锁 84 | context.unlockBuffer(SEGMENTER_NAME); 85 | } else { 86 | context.lockBuffer(SEGMENTER_NAME); 87 | } 88 | } 89 | 90 | /** 91 | * 重置子分词器状态 92 | */ 93 | public void reset() { 94 | nStart = -1; 95 | nEnd = -1; 96 | countHits.clear(); 97 | } 98 | 99 | /** 100 | * 处理数词 101 | */ 102 | private void processCNumber(AnalyzeContext context) { 103 | if (nStart == -1 && nEnd == -1) {// 初始状态 104 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 105 | && ChnNumberChars.contains(context.getCurrentChar())) { 106 | // 记录数词的起始、结束位置 107 | nStart = context.getCursor(); 108 | nEnd = context.getCursor(); 109 | } 110 | } else {// 正在处理状态 111 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType() 112 | && ChnNumberChars.contains(context.getCurrentChar())) { 113 | // 记录数词的结束位置 114 | nEnd = context.getCursor(); 115 | } else { 116 | // 输出数词 117 | this.outputNumLexeme(context); 118 | // 重置头尾指针 119 | nStart = -1; 120 | nEnd = -1; 121 | } 122 | } 123 | 124 | // 缓冲区已经用完,还有尚未输出的数词 125 | if (context.isBufferConsumed()) { 126 | if (nStart != -1 && nEnd != -1) { 127 | // 输出数词 128 | outputNumLexeme(context); 129 | // 重置头尾指针 130 | nStart = -1; 131 | nEnd = -1; 132 | } 133 | } 134 | } 135 | 136 | /** 137 | * 处理中文量词 138 | * @param context 139 | */ 140 | private void processCount(AnalyzeContext context) { 141 | // 判断是否需要启动量词扫描 142 | if (!this.needCountScan(context)) { 143 | return; 144 | } 145 | 146 | if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) { 147 | 148 | // 优先处理countHits中的hit 149 | if (!this.countHits.isEmpty()) { 150 | // 处理词段队列 151 | Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]); 152 | for (Hit hit : tmpArray) { 153 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), 154 | context.getCursor(), hit); 155 | if (hit.isMatch()) { 156 | // 输出当前的词 157 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), 158 | context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT); 159 | context.addLexeme(newLexeme); 160 | 161 | if (!hit.isPrefix()) {// 不是词前缀,hit不需要继续匹配,移除 162 | this.countHits.remove(hit); 163 | } 164 | 165 | } else if (hit.isUnmatch()) { 166 | // hit不是词,移除 167 | this.countHits.remove(hit); 168 | } 169 | } 170 | } 171 | 172 | // ********************************* 173 | // 对当前指针位置的字符进行单字匹配 174 | Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), 175 | context.getCursor(), 1); 176 | if (singleCharHit.isMatch()) {// 首字成量词词 177 | // 输出当前的词 178 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, 179 | Lexeme.TYPE_COUNT); 180 | context.addLexeme(newLexeme); 181 | 182 | // 同时也是词前缀 183 | if (singleCharHit.isPrefix()) { 184 | // 前缀匹配则放入hit列表 185 | this.countHits.add(singleCharHit); 186 | } 187 | } else if (singleCharHit.isPrefix()) {// 首字为量词前缀 188 | // 前缀匹配则放入hit列表 189 | this.countHits.add(singleCharHit); 190 | } 191 | 192 | } else { 193 | // 输入的不是中文字符 194 | // 清空未成形的量词 195 | this.countHits.clear(); 196 | } 197 | 198 | // 缓冲区数据已经读完,还有尚未输出的量词 199 | if (context.isBufferConsumed()) { 200 | // 清空未成形的量词 201 | this.countHits.clear(); 202 | } 203 | } 204 | 205 | /** 206 | * 判断是否需要扫描量词 207 | * @return 208 | */ 209 | private boolean needCountScan(AnalyzeContext context) { 210 | if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) { 211 | // 正在处理中文数词,或者正在处理量词 212 | return true; 213 | } else { 214 | // 找到一个相邻的数词 215 | if (!context.getOrgLexemes().isEmpty()) { 216 | Lexeme l = context.getOrgLexemes().peekLast(); 217 | if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) { 218 | if (l.getBegin() + l.getLength() == context.getCursor()) { 219 | return true; 220 | } 221 | } 222 | } 223 | } 224 | return false; 225 | } 226 | 227 | /** 228 | * 添加数词词元到结果集 229 | * @param context 230 | */ 231 | private void outputNumLexeme(AnalyzeContext context) { 232 | if (nStart > -1 && nEnd > -1) { 233 | // 输出数词 234 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, 235 | Lexeme.TYPE_CNUM); 236 | context.addLexeme(newLexeme); 237 | 238 | } 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | /** 45 | * 识别字符类型 46 | * @param input 47 | * @return int CharacterUtil定义的字符类型常量 48 | */ 49 | static int identifyCharType(char input) { 50 | if (input >= '0' && input <= '9') { 51 | return CHAR_ARABIC; 52 | 53 | } else if ((input >= 'a' && input <= 'z') || (input >= 'A' && input <= 'Z')) { 54 | return CHAR_ENGLISH; 55 | 56 | } else { 57 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 58 | 59 | if (ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 60 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 61 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A) { 62 | // 目前已知的中文字符UTF-8集合 63 | return CHAR_CHINESE; 64 | 65 | } else if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS // 全角数字字符和日韩字符 66 | // 韩文字符集 67 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 68 | || ub == Character.UnicodeBlock.HANGUL_JAMO 69 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 70 | // 日文字符集 71 | || ub == Character.UnicodeBlock.HIRAGANA // 平假名 72 | || ub == Character.UnicodeBlock.KATAKANA // 片假名 73 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS) { 74 | return CHAR_OTHER_CJK; 75 | 76 | } 77 | } 78 | // 其他的不做处理的字符 79 | return CHAR_USELESS; 80 | } 81 | 82 | /** 83 | * 进行字符规格化(全角转半角,大写转小写处理) 84 | * @param input 85 | * @return char 86 | */ 87 | static char regularize(char input) { 88 | if (input == 12288) { 89 | input = (char) 32; 90 | 91 | } else if (input > 65280 && input < 65375) { 92 | input = (char) (input - 65248); 93 | 94 | } else if (input >= 'A' && input <= 'Z') { 95 | input += 32; 96 | } 97 | 98 | return input; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator() { 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | * @param orgLexemes 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context, boolean useSmart) { 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while (orgLexeme != null) { 50 | if (!crossPath.addCrossLexeme(orgLexeme)) { 51 | // 找到与crossPath不相交的下一个crossPath 52 | if (crossPath.size() == 1 || !useSmart) { 53 | // crossPath没有歧义 或者 不做歧义处理 54 | // 直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | } else { 57 | // 对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 60 | // 输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | // 把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | // 处理最后的path 72 | if (crossPath.size() == 1 || !useSmart) { 73 | // crossPath没有歧义 或者 不做歧义处理 74 | // 直接输出当前crossPath 75 | context.addLexemePath(crossPath); 76 | } else { 77 | // 对当前的crossPath进行歧义处理 78 | QuickSortSet.Cell headCell = crossPath.getHead(); 79 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 80 | // 输出歧义处理结果judgeResult 81 | context.addLexemePath(judgeResult); 82 | } 83 | } 84 | 85 | /** 86 | * 歧义识别 87 | * @param lexemeCell 歧义路径链表头 88 | * @param fullTextLength 歧义路径文本长度 89 | * @param option 候选结果路径 90 | * @return 91 | */ 92 | private LexemePath judge(QuickSortSet.Cell lexemeCell, int fullTextLength) { 93 | // 候选路径集合 94 | TreeSet pathOptions = new TreeSet(); 95 | // 候选结果路径 96 | LexemePath option = new LexemePath(); 97 | 98 | // 对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 99 | Stack lexemeStack = this.forwardPath(lexemeCell, option); 100 | 101 | // 当前词元链并非最理想的,加入候选路径集合 102 | pathOptions.add(option.copy()); 103 | 104 | // 存在歧义词,处理 105 | QuickSortSet.Cell c = null; 106 | while (!lexemeStack.isEmpty()) { 107 | c = lexemeStack.pop(); 108 | // 回滚词元链 109 | this.backPath(c.getLexeme(), option); 110 | // 从歧义词位置开始,递归,生成可选方案 111 | this.forwardPath(c, option); 112 | pathOptions.add(option.copy()); 113 | } 114 | 115 | // 返回集合中的最优方案 116 | return pathOptions.first(); 117 | 118 | } 119 | 120 | /** 121 | * 向前遍历,添加词元,构造一个无歧义词元组合 122 | * @param LexemePath path 123 | * @return 124 | */ 125 | private Stack forwardPath(QuickSortSet.Cell lexemeCell, LexemePath option) { 126 | // 发生冲突的Lexeme栈 127 | Stack conflictStack = new Stack(); 128 | QuickSortSet.Cell c = lexemeCell; 129 | // 迭代遍历Lexeme链表 130 | while (c != null && c.getLexeme() != null) { 131 | if (!option.addNotCrossLexeme(c.getLexeme())) { 132 | // 词元交叉,添加失败则加入lexemeStack栈 133 | conflictStack.push(c); 134 | } 135 | c = c.getNext(); 136 | } 137 | return conflictStack; 138 | } 139 | 140 | /** 141 | * 回滚词元链,直到它能够接受指定的词元 142 | * @param lexeme 143 | * @param l 144 | */ 145 | private void backPath(Lexeme l, LexemePath option) { 146 | while (option.checkCross(l)) { 147 | option.removeTail(); 148 | } 149 | 150 | } 151 | 152 | } 153 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import java.io.IOException; 27 | import java.io.Reader; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.cfg.Configuration; 32 | import org.wltea.analyzer.cfg.DefaultConfig; 33 | import org.wltea.analyzer.dic.Dictionary; 34 | 35 | /** 36 | * IK分词器主类 37 | * 38 | */ 39 | public final class IKSegmenter { 40 | 41 | // 字符窜reader 42 | private Reader input; 43 | // 分词器配置项 44 | private Configuration cfg; 45 | // 分词器上下文 46 | private AnalyzeContext context; 47 | // 分词处理器列表 48 | private List segmenters; 49 | // 分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | 52 | /** 53 | * IK分词器构造函数 54 | * @param input 55 | * @param useSmart 为true,使用智能分词策略 56 | * 57 | * 非智能分词:细粒度输出所有可能的切分结果 58 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断 59 | */ 60 | public IKSegmenter(Reader input, boolean useSmart) { 61 | this.input = input; 62 | this.cfg = DefaultConfig.getInstance(); 63 | this.cfg.setUseSmart(useSmart); 64 | this.init(); 65 | } 66 | 67 | /** 68 | * IK分词器构造函数 69 | * @param input 70 | * @param cfg 使用自定义的Configuration构造分词器 71 | * 72 | */ 73 | public IKSegmenter(Reader input, Configuration cfg) { 74 | this.input = input; 75 | this.cfg = cfg; 76 | this.init(); 77 | } 78 | 79 | /** 80 | * 初始化 81 | */ 82 | private void init() { 83 | // 初始化词典单例 84 | Dictionary.initial(this.cfg); 85 | // 初始化分词上下文 86 | this.context = new AnalyzeContext(this.cfg); 87 | // 加载子分词器 88 | this.segmenters = this.loadSegmenters(); 89 | // 加载歧义裁决器 90 | this.arbitrator = new IKArbitrator(); 91 | } 92 | 93 | /** 94 | * 初始化词典,加载子分词器实现 95 | * @return List 96 | */ 97 | private List loadSegmenters() { 98 | List segmenters = new ArrayList(4); 99 | // 处理字母的子分词器 100 | segmenters.add(new LetterSegmenter()); 101 | // 处理中文数量词的子分词器 102 | segmenters.add(new CN_QuantifierSegmenter()); 103 | // 处理中文词的子分词器 104 | segmenters.add(new CJKSegmenter()); 105 | return segmenters; 106 | } 107 | 108 | /** 109 | * 分词,获取下一个词元 110 | * @return Lexeme 词元对象 111 | * @throws IOException 112 | */ 113 | public synchronized Lexeme next() throws IOException { 114 | Lexeme l = null; 115 | while ((l = context.getNextLexeme()) == null) { 116 | /* 117 | * 从reader中读取数据,填充buffer 如果reader是分次读入buffer的,那么buffer要 进行移位处理 移位处理上次读入的但未处理的数据 118 | */ 119 | int available = context.fillBuffer(this.input); 120 | if (available <= 0) { 121 | // reader已经读完 122 | context.reset(); 123 | return null; 124 | 125 | } else { 126 | // 初始化指针 127 | context.initCursor(); 128 | do { 129 | // 遍历子分词器 130 | for (ISegmenter segmenter : segmenters) { 131 | segmenter.analyze(context); 132 | } 133 | // 字符缓冲区接近读完,需要读入新的字符 134 | if (context.needRefillBuffer()) { 135 | break; 136 | } 137 | // 向前移动指针 138 | } while (context.moveCursor()); 139 | // 重置子分词器,为下轮循环进行初始化 140 | for (ISegmenter segmenter : segmenters) { 141 | segmenter.reset(); 142 | } 143 | } 144 | // 对分词进行歧义处理 145 | this.arbitrator.process(context, this.cfg.useSmart()); 146 | // 将分词结果输出到结果集,并处理未切分的单个CJK字符 147 | context.outputToResult(); 148 | // 记录本次分词的缓冲区位移 149 | context.markBufferOffset(); 150 | } 151 | return l; 152 | } 153 | 154 | /** 155 | * 重置分词器到初始状态 156 | * @param input 157 | */ 158 | public synchronized void reset(Reader input) { 159 | this.input = input; 160 | context.reset(); 161 | for (ISegmenter segmenter : segmenters) { 162 | segmenter.reset(); 163 | } 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * 29 | * 子分词器接口 30 | */ 31 | interface ISegmenter { 32 | 33 | /** 34 | * 从分析器读取下一个可能分解的词元对象 35 | * @param context 分词算法上下文 36 | */ 37 | void analyze(AnalyzeContext context); 38 | 39 | /** 40 | * 重置子分析器状态 41 | */ 42 | void reset(); 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/LetterSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Arrays; 28 | 29 | /** 30 | * 31 | * 英文字符及阿拉伯数字子分词器 32 | */ 33 | class LetterSegmenter implements ISegmenter { 34 | 35 | // 子分词器标签 36 | static final String SEGMENTER_NAME = "LETTER_SEGMENTER"; 37 | // 链接符号 38 | private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' }; 39 | 40 | // 数字符号 41 | private static final char[] Num_Connector = new char[] { ',', '.' }; 42 | 43 | /* 44 | * 词元的开始位置, 同时作为子分词器状态标识 当start > -1 时,标识当前的分词器正在处理字符 45 | */ 46 | private int start; 47 | /* 48 | * 记录词元结束位置 end记录的是在词元中最后一个出现的Letter但非Sign_Connector的字符的位置 49 | */ 50 | private int end; 51 | 52 | /* 53 | * 字母起始位置 54 | */ 55 | private int englishStart; 56 | 57 | /* 58 | * 字母结束位置 59 | */ 60 | private int englishEnd; 61 | 62 | /* 63 | * 阿拉伯数字起始位置 64 | */ 65 | private int arabicStart; 66 | 67 | /* 68 | * 阿拉伯数字结束位置 69 | */ 70 | private int arabicEnd; 71 | 72 | LetterSegmenter() { 73 | Arrays.sort(Letter_Connector); 74 | Arrays.sort(Num_Connector); 75 | this.start = -1; 76 | this.end = -1; 77 | this.englishStart = -1; 78 | this.englishEnd = -1; 79 | this.arabicStart = -1; 80 | this.arabicEnd = -1; 81 | } 82 | 83 | /* 84 | * (non-Javadoc) 85 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 86 | */ 87 | public void analyze(AnalyzeContext context) { 88 | boolean bufferLockFlag = false; 89 | // 处理英文字母 90 | bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag; 91 | // 处理阿拉伯字母 92 | bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag; 93 | // 处理混合字母(这个要放最后处理,可以通过QuickSortSet排除重复) 94 | bufferLockFlag = this.processMixLetter(context) || bufferLockFlag; 95 | 96 | // 判断是否锁定缓冲区 97 | if (bufferLockFlag) { 98 | context.lockBuffer(SEGMENTER_NAME); 99 | } else { 100 | // 对缓冲区解锁 101 | context.unlockBuffer(SEGMENTER_NAME); 102 | } 103 | } 104 | 105 | /* 106 | * (non-Javadoc) 107 | * @see org.wltea.analyzer.core.ISegmenter#reset() 108 | */ 109 | public void reset() { 110 | this.start = -1; 111 | this.end = -1; 112 | this.englishStart = -1; 113 | this.englishEnd = -1; 114 | this.arabicStart = -1; 115 | this.arabicEnd = -1; 116 | } 117 | 118 | /** 119 | * 处理数字字母混合输出 120 | * 如:windos2000 | linliangyi2005@gmail.com 121 | * @param input 122 | * @param context 123 | * @return 124 | */ 125 | private boolean processMixLetter(AnalyzeContext context) { 126 | boolean needLock = false; 127 | 128 | if (this.start == -1) {// 当前的分词器尚未开始处理字符 129 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 130 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { 131 | // 记录起始指针的位置,标明分词器进入处理状态 132 | this.start = context.getCursor(); 133 | this.end = start; 134 | } 135 | 136 | } else {// 当前的分词器正在处理字符 137 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType() 138 | || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { 139 | // 记录下可能的结束位置 140 | this.end = context.getCursor(); 141 | 142 | } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 143 | && this.isLetterConnector(context.getCurrentChar())) { 144 | // 记录下可能的结束位置 145 | this.end = context.getCursor(); 146 | } else { 147 | // 遇到非Letter字符,输出词元 148 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start 149 | + 1, Lexeme.TYPE_LETTER); 150 | context.addLexeme(newLexeme); 151 | this.start = -1; 152 | this.end = -1; 153 | } 154 | } 155 | 156 | // 判断缓冲区是否已经读完 157 | if (context.isBufferConsumed()) { 158 | if (this.start != -1 && this.end != -1) { 159 | // 缓冲以读完,输出词元 160 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start 161 | + 1, Lexeme.TYPE_LETTER); 162 | context.addLexeme(newLexeme); 163 | this.start = -1; 164 | this.end = -1; 165 | } 166 | } 167 | 168 | // 判断是否锁定缓冲区 169 | if (this.start == -1 && this.end == -1) { 170 | // 对缓冲区解锁 171 | needLock = false; 172 | } else { 173 | needLock = true; 174 | } 175 | return needLock; 176 | } 177 | 178 | /** 179 | * 处理纯英文字母输出 180 | * @param context 181 | * @return 182 | */ 183 | private boolean processEnglishLetter(AnalyzeContext context) { 184 | boolean needLock = false; 185 | 186 | if (this.englishStart == -1) {// 当前的分词器尚未开始处理英文字符 187 | if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { 188 | // 记录起始指针的位置,标明分词器进入处理状态 189 | this.englishStart = context.getCursor(); 190 | this.englishEnd = this.englishStart; 191 | } 192 | } else {// 当前的分词器正在处理英文字符 193 | if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) { 194 | // 记录当前指针位置为结束位置 195 | this.englishEnd = context.getCursor(); 196 | } else { 197 | // 遇到非English字符,输出词元 198 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd 199 | - this.englishStart + 1, Lexeme.TYPE_ENGLISH); 200 | context.addLexeme(newLexeme); 201 | this.englishStart = -1; 202 | this.englishEnd = -1; 203 | } 204 | } 205 | 206 | // 判断缓冲区是否已经读完 207 | if (context.isBufferConsumed()) { 208 | if (this.englishStart != -1 && this.englishEnd != -1) { 209 | // 缓冲以读完,输出词元 210 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd 211 | - this.englishStart + 1, Lexeme.TYPE_ENGLISH); 212 | context.addLexeme(newLexeme); 213 | this.englishStart = -1; 214 | this.englishEnd = -1; 215 | } 216 | } 217 | 218 | // 判断是否锁定缓冲区 219 | if (this.englishStart == -1 && this.englishEnd == -1) { 220 | // 对缓冲区解锁 221 | needLock = false; 222 | } else { 223 | needLock = true; 224 | } 225 | return needLock; 226 | } 227 | 228 | /** 229 | * 处理阿拉伯数字输出 230 | * @param context 231 | * @return 232 | */ 233 | private boolean processArabicLetter(AnalyzeContext context) { 234 | boolean needLock = false; 235 | 236 | if (this.arabicStart == -1) {// 当前的分词器尚未开始处理数字字符 237 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) { 238 | // 记录起始指针的位置,标明分词器进入处理状态 239 | this.arabicStart = context.getCursor(); 240 | this.arabicEnd = this.arabicStart; 241 | } 242 | } else {// 当前的分词器正在处理数字字符 243 | if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) { 244 | // 记录当前指针位置为结束位置 245 | this.arabicEnd = context.getCursor(); 246 | } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType() 247 | && this.isNumConnector(context.getCurrentChar())) { 248 | // 不输出数字,但不标记结束 249 | } else { 250 | // //遇到非Arabic字符,输出词元 251 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd 252 | - this.arabicStart + 1, Lexeme.TYPE_ARABIC); 253 | context.addLexeme(newLexeme); 254 | this.arabicStart = -1; 255 | this.arabicEnd = -1; 256 | } 257 | } 258 | 259 | // 判断缓冲区是否已经读完 260 | if (context.isBufferConsumed()) { 261 | if (this.arabicStart != -1 && this.arabicEnd != -1) { 262 | // 生成已切分的词元 263 | Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd 264 | - this.arabicStart + 1, Lexeme.TYPE_ARABIC); 265 | context.addLexeme(newLexeme); 266 | this.arabicStart = -1; 267 | this.arabicEnd = -1; 268 | } 269 | } 270 | 271 | // 判断是否锁定缓冲区 272 | if (this.arabicStart == -1 && this.arabicEnd == -1) { 273 | // 对缓冲区解锁 274 | needLock = false; 275 | } else { 276 | needLock = true; 277 | } 278 | return needLock; 279 | } 280 | 281 | /** 282 | * 判断是否是字母连接符号 283 | * @param input 284 | * @return 285 | */ 286 | private boolean isLetterConnector(char input) { 287 | int index = Arrays.binarySearch(Letter_Connector, input); 288 | return index >= 0; 289 | } 290 | 291 | /** 292 | * 判断是否是数字连接符号 293 | * @param input 294 | * @return 295 | */ 296 | private boolean isNumConnector(char input) { 297 | int index = Arrays.binarySearch(Num_Connector, input); 298 | return index >= 0; 299 | } 300 | } 301 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/Lexeme.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK词元对象 29 | */ 30 | public class Lexeme implements Comparable { 31 | // lexemeType常量 32 | // 未知 33 | public static final int TYPE_UNKNOWN = 0; 34 | // 英文 35 | public static final int TYPE_ENGLISH = 1; 36 | // 数字 37 | public static final int TYPE_ARABIC = 2; 38 | // 英文数字混合 39 | public static final int TYPE_LETTER = 3; 40 | // 中文词元 41 | public static final int TYPE_CNWORD = 4; 42 | // 中文单字 43 | public static final int TYPE_CNCHAR = 64; 44 | // 日韩文字 45 | public static final int TYPE_OTHER_CJK = 8; 46 | // 中文数词 47 | public static final int TYPE_CNUM = 16; 48 | // 中文量词 49 | public static final int TYPE_COUNT = 32; 50 | // 中文数量词 51 | public static final int TYPE_CQUAN = 48; 52 | 53 | // 词元的起始位移 54 | private int offset; 55 | // 词元的相对起始位置 56 | private int begin; 57 | // 词元的长度 58 | private int length; 59 | // 词元文本 60 | private String lexemeText; 61 | // 词元类型 62 | private int lexemeType; 63 | 64 | public Lexeme(int offset, int begin, int length, int lexemeType) { 65 | this.offset = offset; 66 | this.begin = begin; 67 | if (length < 0) { 68 | throw new IllegalArgumentException("length < 0"); 69 | } 70 | this.length = length; 71 | this.lexemeType = lexemeType; 72 | } 73 | 74 | /* 75 | * 判断词元相等算法 起始位置偏移、起始位置、终止位置相同 76 | * @see java.lang.Object#equals(Object o) 77 | */ 78 | public boolean equals(Object o) { 79 | if (o == null) { 80 | return false; 81 | } 82 | 83 | if (this == o) { 84 | return true; 85 | } 86 | 87 | if (o instanceof Lexeme) { 88 | Lexeme other = (Lexeme) o; 89 | if (this.offset == other.getOffset() && this.begin == other.getBegin() 90 | && this.length == other.getLength()) { 91 | return true; 92 | } else { 93 | return false; 94 | } 95 | } else { 96 | return false; 97 | } 98 | } 99 | 100 | /* 101 | * 词元哈希编码算法 102 | * @see java.lang.Object#hashCode() 103 | */ 104 | public int hashCode() { 105 | int absBegin = getBeginPosition(); 106 | int absEnd = getEndPosition(); 107 | return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11; 108 | } 109 | 110 | /* 111 | * 词元在排序集合中的比较算法 112 | * @see java.lang.Comparable#compareTo(java.lang.Object) 113 | */ 114 | public int compareTo(Lexeme other) { 115 | // 起始位置优先 116 | if (this.begin < other.getBegin()) { 117 | return -1; 118 | } else if (this.begin == other.getBegin()) { 119 | // 词元长度优先 120 | if (this.length > other.getLength()) { 121 | return -1; 122 | } else if (this.length == other.getLength()) { 123 | return 0; 124 | } else {// this.length < other.getLength() 125 | return 1; 126 | } 127 | 128 | } else {// this.begin > other.getBegin() 129 | return 1; 130 | } 131 | } 132 | 133 | public int getOffset() { 134 | return offset; 135 | } 136 | 137 | public void setOffset(int offset) { 138 | this.offset = offset; 139 | } 140 | 141 | public int getBegin() { 142 | return begin; 143 | } 144 | 145 | /** 146 | * 获取词元在文本中的起始位置 147 | * @return int 148 | */ 149 | public int getBeginPosition() { 150 | return offset + begin; 151 | } 152 | 153 | public void setBegin(int begin) { 154 | this.begin = begin; 155 | } 156 | 157 | /** 158 | * 获取词元在文本中的结束位置 159 | * @return int 160 | */ 161 | public int getEndPosition() { 162 | return offset + begin + length; 163 | } 164 | 165 | /** 166 | * 获取词元的字符长度 167 | * @return int 168 | */ 169 | public int getLength() { 170 | return this.length; 171 | } 172 | 173 | public void setLength(int length) { 174 | if (this.length < 0) { 175 | throw new IllegalArgumentException("length < 0"); 176 | } 177 | this.length = length; 178 | } 179 | 180 | /** 181 | * 获取词元的文本内容 182 | * @return String 183 | */ 184 | public String getLexemeText() { 185 | if (lexemeText == null) { 186 | return ""; 187 | } 188 | return lexemeText; 189 | } 190 | 191 | public void setLexemeText(String lexemeText) { 192 | if (lexemeText == null) { 193 | this.lexemeText = ""; 194 | this.length = 0; 195 | } else { 196 | this.lexemeText = lexemeText; 197 | this.length = lexemeText.length(); 198 | } 199 | } 200 | 201 | /** 202 | * 获取词元类型 203 | * @return int 204 | */ 205 | public int getLexemeType() { 206 | return lexemeType; 207 | } 208 | 209 | /** 210 | * 获取词元类型标示字符串 211 | * @return String 212 | */ 213 | public String getLexemeTypeString() { 214 | switch (lexemeType) { 215 | 216 | case TYPE_ENGLISH: 217 | return "ENGLISH"; 218 | 219 | case TYPE_ARABIC: 220 | return "ARABIC"; 221 | 222 | case TYPE_LETTER: 223 | return "LETTER"; 224 | 225 | case TYPE_CNWORD: 226 | return "CN_WORD"; 227 | 228 | case TYPE_CNCHAR: 229 | return "CN_CHAR"; 230 | 231 | case TYPE_OTHER_CJK: 232 | return "OTHER_CJK"; 233 | 234 | case TYPE_COUNT: 235 | return "COUNT"; 236 | 237 | case TYPE_CNUM: 238 | return "TYPE_CNUM"; 239 | 240 | case TYPE_CQUAN: 241 | return "TYPE_CQUAN"; 242 | 243 | default: 244 | return "UNKONW"; 245 | } 246 | } 247 | 248 | public void setLexemeType(int lexemeType) { 249 | this.lexemeType = lexemeType; 250 | } 251 | 252 | /** 253 | * 合并两个相邻的词元 254 | * @param l 255 | * @param lexemeType 256 | * @return boolean 词元是否成功合并 257 | */ 258 | public boolean append(Lexeme l, int lexemeType) { 259 | if (l != null && this.getEndPosition() == l.getBeginPosition()) { 260 | this.length += l.getLength(); 261 | this.lexemeType = lexemeType; 262 | return true; 263 | } else { 264 | return false; 265 | } 266 | } 267 | 268 | /** 269 | * 270 | */ 271 | public String toString() { 272 | StringBuffer strbuf = new StringBuffer(); 273 | strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition()); 274 | strbuf.append(" : ").append(this.lexemeText).append(" : \t"); 275 | strbuf.append(this.getLexemeTypeString()); 276 | return strbuf.toString(); 277 | } 278 | 279 | } 280 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/LexemePath.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * Lexeme链(路径) 29 | */ 30 | class LexemePath extends QuickSortSet implements Comparable { 31 | 32 | // 起始位置 33 | private int pathBegin; 34 | // 结束 35 | private int pathEnd; 36 | // 词元链的有效字符长度 37 | private int payloadLength; 38 | 39 | LexemePath() { 40 | this.pathBegin = -1; 41 | this.pathEnd = -1; 42 | this.payloadLength = 0; 43 | } 44 | 45 | /** 46 | * 向LexemePath追加相交的Lexeme 47 | * @param lexeme 48 | * @return 49 | */ 50 | boolean addCrossLexeme(Lexeme lexeme) { 51 | if (this.isEmpty()) { 52 | this.addLexeme(lexeme); 53 | this.pathBegin = lexeme.getBegin(); 54 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 55 | this.payloadLength += lexeme.getLength(); 56 | return true; 57 | 58 | } else if (this.checkCross(lexeme)) { 59 | this.addLexeme(lexeme); 60 | if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) { 61 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 62 | } 63 | this.payloadLength = this.pathEnd - this.pathBegin; 64 | return true; 65 | 66 | } else { 67 | return false; 68 | 69 | } 70 | } 71 | 72 | /** 73 | * 向LexemePath追加不相交的Lexeme 74 | * @param lexeme 75 | * @return 76 | */ 77 | boolean addNotCrossLexeme(Lexeme lexeme) { 78 | if (this.isEmpty()) { 79 | this.addLexeme(lexeme); 80 | this.pathBegin = lexeme.getBegin(); 81 | this.pathEnd = lexeme.getBegin() + lexeme.getLength(); 82 | this.payloadLength += lexeme.getLength(); 83 | return true; 84 | 85 | } else if (this.checkCross(lexeme)) { 86 | return false; 87 | 88 | } else { 89 | this.addLexeme(lexeme); 90 | this.payloadLength += lexeme.getLength(); 91 | Lexeme head = this.peekFirst(); 92 | this.pathBegin = head.getBegin(); 93 | Lexeme tail = this.peekLast(); 94 | this.pathEnd = tail.getBegin() + tail.getLength(); 95 | return true; 96 | 97 | } 98 | } 99 | 100 | /** 101 | * 移除尾部的Lexeme 102 | * @return 103 | */ 104 | Lexeme removeTail() { 105 | Lexeme tail = this.pollLast(); 106 | if (this.isEmpty()) { 107 | this.pathBegin = -1; 108 | this.pathEnd = -1; 109 | this.payloadLength = 0; 110 | } else { 111 | this.payloadLength -= tail.getLength(); 112 | Lexeme newTail = this.peekLast(); 113 | this.pathEnd = newTail.getBegin() + newTail.getLength(); 114 | } 115 | return tail; 116 | } 117 | 118 | /** 119 | * 检测词元位置交叉(有歧义的切分) 120 | * @param lexeme 121 | * @return 122 | */ 123 | boolean checkCross(Lexeme lexeme) { 124 | return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd) 125 | || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin() 126 | + lexeme.getLength()); 127 | } 128 | 129 | int getPathBegin() { 130 | return pathBegin; 131 | } 132 | 133 | int getPathEnd() { 134 | return pathEnd; 135 | } 136 | 137 | /** 138 | * 获取Path的有效词长 139 | * @return 140 | */ 141 | int getPayloadLength() { 142 | return this.payloadLength; 143 | } 144 | 145 | /** 146 | * 获取LexemePath的路径长度 147 | * @return 148 | */ 149 | int getPathLength() { 150 | return this.pathEnd - this.pathBegin; 151 | } 152 | 153 | /** 154 | * X权重(词元长度积) 155 | * @return 156 | */ 157 | int getXWeight() { 158 | int product = 1; 159 | Cell c = this.getHead(); 160 | while (c != null && c.getLexeme() != null) { 161 | product *= c.getLexeme().getLength(); 162 | c = c.getNext(); 163 | } 164 | return product; 165 | } 166 | 167 | /** 168 | * 词元位置权重 169 | * @return 170 | */ 171 | int getPWeight() { 172 | int pWeight = 0; 173 | int p = 0; 174 | Cell c = this.getHead(); 175 | while (c != null && c.getLexeme() != null) { 176 | p++; 177 | pWeight += p * c.getLexeme().getLength(); 178 | c = c.getNext(); 179 | } 180 | return pWeight; 181 | } 182 | 183 | LexemePath copy() { 184 | LexemePath theCopy = new LexemePath(); 185 | theCopy.pathBegin = this.pathBegin; 186 | theCopy.pathEnd = this.pathEnd; 187 | theCopy.payloadLength = this.payloadLength; 188 | Cell c = this.getHead(); 189 | while (c != null && c.getLexeme() != null) { 190 | theCopy.addLexeme(c.getLexeme()); 191 | c = c.getNext(); 192 | } 193 | return theCopy; 194 | } 195 | 196 | public int compareTo(LexemePath o) { 197 | // 比较有效文本长度 198 | if (this.payloadLength > o.payloadLength) { 199 | return -1; 200 | } else if (this.payloadLength < o.payloadLength) { 201 | return 1; 202 | } else { 203 | // 比较词元个数,越少越好 204 | if (this.size() < o.size()) { 205 | return -1; 206 | } else if (this.size() > o.size()) { 207 | return 1; 208 | } else { 209 | // 路径跨度越大越好 210 | if (this.getPathLength() > o.getPathLength()) { 211 | return -1; 212 | } else if (this.getPathLength() < o.getPathLength()) { 213 | return 1; 214 | } else { 215 | // 根据统计学结论,逆向切分概率高于正向切分,因此位置越靠后的优先 216 | if (this.pathEnd > o.pathEnd) { 217 | return -1; 218 | } else if (pathEnd < o.pathEnd) { 219 | return 1; 220 | } else { 221 | // 词长越平均越好 222 | if (this.getXWeight() > o.getXWeight()) { 223 | return -1; 224 | } else if (this.getXWeight() < o.getXWeight()) { 225 | return 1; 226 | } else { 227 | // 词元位置权重比较 228 | if (this.getPWeight() > o.getPWeight()) { 229 | return -1; 230 | } else if (this.getPWeight() < o.getPWeight()) { 231 | return 1; 232 | } 233 | 234 | } 235 | } 236 | } 237 | } 238 | } 239 | return 0; 240 | } 241 | 242 | public String toString() { 243 | StringBuffer sb = new StringBuffer(); 244 | sb.append("pathBegin : ").append(pathBegin).append("\r\n"); 245 | sb.append("pathEnd : ").append(pathEnd).append("\r\n"); 246 | sb.append("payloadLength : ").append(payloadLength).append("\r\n"); 247 | Cell head = this.getHead(); 248 | while (head != null) { 249 | sb.append("lexeme : ").append(head.getLexeme()).append("\r\n"); 250 | head = head.getNext(); 251 | } 252 | return sb.toString(); 253 | } 254 | 255 | } 256 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/core/QuickSortSet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | /** 28 | * IK分词器专用的Lexem快速排序集合 29 | */ 30 | class QuickSortSet { 31 | // 链表头 32 | private Cell head; 33 | // 链表尾 34 | private Cell tail; 35 | // 链表的实际大小 36 | private int size; 37 | 38 | QuickSortSet() { 39 | this.size = 0; 40 | } 41 | 42 | /** 43 | * 向链表集合添加词元 44 | * @param lexeme 45 | */ 46 | boolean addLexeme(Lexeme lexeme) { 47 | Cell newCell = new Cell(lexeme); 48 | if (this.size == 0) { 49 | this.head = newCell; 50 | this.tail = newCell; 51 | this.size++; 52 | return true; 53 | 54 | } else { 55 | if (this.tail.compareTo(newCell) == 0) {// 词元与尾部词元相同,不放入集合 56 | return false; 57 | 58 | } else if (this.tail.compareTo(newCell) < 0) {// 词元接入链表尾部 59 | this.tail.next = newCell; 60 | newCell.prev = this.tail; 61 | this.tail = newCell; 62 | this.size++; 63 | return true; 64 | 65 | } else if (this.head.compareTo(newCell) > 0) {// 词元接入链表头部 66 | this.head.prev = newCell; 67 | newCell.next = this.head; 68 | this.head = newCell; 69 | this.size++; 70 | return true; 71 | 72 | } else { 73 | // 从尾部上逆 74 | Cell index = this.tail; 75 | while (index != null && index.compareTo(newCell) > 0) { 76 | index = index.prev; 77 | } 78 | if (index.compareTo(newCell) == 0) {// 词元与集合中的词元重复,不放入集合 79 | return false; 80 | 81 | } else if (index.compareTo(newCell) < 0) {// 词元插入链表中的某个位置 82 | newCell.prev = index; 83 | newCell.next = index.next; 84 | index.next.prev = newCell; 85 | index.next = newCell; 86 | this.size++; 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | 94 | /** 95 | * 返回链表头部元素 96 | * @return 97 | */ 98 | Lexeme peekFirst() { 99 | if (this.head != null) { 100 | return this.head.lexeme; 101 | } 102 | return null; 103 | } 104 | 105 | /** 106 | * 取出链表集合的第一个元素 107 | * @return Lexeme 108 | */ 109 | Lexeme pollFirst() { 110 | if (this.size == 1) { 111 | Lexeme first = this.head.lexeme; 112 | this.head = null; 113 | this.tail = null; 114 | this.size--; 115 | return first; 116 | } else if (this.size > 1) { 117 | Lexeme first = this.head.lexeme; 118 | this.head = this.head.next; 119 | this.size--; 120 | return first; 121 | } else { 122 | return null; 123 | } 124 | } 125 | 126 | /** 127 | * 返回链表尾部元素 128 | * @return 129 | */ 130 | Lexeme peekLast() { 131 | if (this.tail != null) { 132 | return this.tail.lexeme; 133 | } 134 | return null; 135 | } 136 | 137 | /** 138 | * 取出链表集合的最后一个元素 139 | * @return Lexeme 140 | */ 141 | Lexeme pollLast() { 142 | if (this.size == 1) { 143 | Lexeme last = this.head.lexeme; 144 | this.head = null; 145 | this.tail = null; 146 | this.size--; 147 | return last; 148 | 149 | } else if (this.size > 1) { 150 | Lexeme last = this.tail.lexeme; 151 | this.tail = this.tail.prev; 152 | this.size--; 153 | return last; 154 | 155 | } else { 156 | return null; 157 | } 158 | } 159 | 160 | /** 161 | * 返回集合大小 162 | * @return 163 | */ 164 | int size() { 165 | return this.size; 166 | } 167 | 168 | /** 169 | * 判断集合是否为空 170 | * @return 171 | */ 172 | boolean isEmpty() { 173 | return this.size == 0; 174 | } 175 | 176 | /** 177 | * 返回lexeme链的头部 178 | * @return 179 | */ 180 | Cell getHead() { 181 | return this.head; 182 | } 183 | 184 | /** 185 | * 186 | * IK 中文分词 版本 5.0 187 | * IK Analyzer release 5.0 188 | * 189 | * Licensed to the Apache Software Foundation (ASF) under one or more 190 | * contributor license agreements. See the NOTICE file distributed with 191 | * this work for additional information regarding copyright ownership. 192 | * The ASF licenses this file to You under the Apache License, Version 2.0 193 | * (the "License"); you may not use this file except in compliance with 194 | * the License. You may obtain a copy of the License at 195 | * 196 | * http://www.apache.org/licenses/LICENSE-2.0 197 | * 198 | * Unless required by applicable law or agreed to in writing, software 199 | * distributed under the License is distributed on an "AS IS" BASIS, 200 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | * See the License for the specific language governing permissions and 202 | * limitations under the License. 203 | * 204 | * 源代码由林良益(linliangyi2005@gmail.com)提供 205 | * 版权声明 2012,乌龙茶工作室 206 | * provided by Linliangyi and copyright 2012 by Oolong studio 207 | * 208 | * QuickSortSet集合单元 209 | * 210 | */ 211 | class Cell implements Comparable { 212 | private Cell prev; 213 | private Cell next; 214 | private Lexeme lexeme; 215 | 216 | Cell(Lexeme lexeme) { 217 | if (lexeme == null) { 218 | throw new IllegalArgumentException("lexeme must not be null"); 219 | } 220 | this.lexeme = lexeme; 221 | } 222 | 223 | public int compareTo(Cell o) { 224 | return this.lexeme.compareTo(o.lexeme); 225 | } 226 | 227 | public Cell getPrev() { 228 | return this.prev; 229 | } 230 | 231 | public Cell getNext() { 232 | return this.next; 233 | } 234 | 235 | public Lexeme getLexeme() { 236 | return this.lexeme; 237 | } 238 | } 239 | } 240 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/DictSegment.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.util.Arrays; 29 | import java.util.HashMap; 30 | import java.util.Map; 31 | 32 | /** 33 | * 词典树分段,表示词典树的一个分枝 34 | */ 35 | class DictSegment implements Comparable { 36 | 37 | // 公用字典表,存储汉字 38 | private static final Map charMap = new HashMap(16, 39 | 0.95f); 40 | // 数组大小上限 41 | private static final int ARRAY_LENGTH_LIMIT = 3; 42 | 43 | // Map存储结构 44 | private Map childrenMap; 45 | // 数组方式存储结构 46 | private DictSegment[] childrenArray; 47 | 48 | // 当前节点上存储的字符 49 | private Character nodeChar; 50 | // 当前节点存储的Segment数目 51 | // storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储 52 | private int storeSize = 0; 53 | // 当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词 54 | private int nodeState = 0; 55 | 56 | DictSegment(Character nodeChar) { 57 | if (nodeChar == null) { 58 | throw new IllegalArgumentException("参数为空异常,字符不能为空"); 59 | } 60 | this.nodeChar = nodeChar; 61 | } 62 | 63 | Character getNodeChar() { 64 | return nodeChar; 65 | } 66 | 67 | /* 68 | * 判断是否有下一个节点 69 | */ 70 | boolean hasNextNode() { 71 | return this.storeSize > 0; 72 | } 73 | 74 | /** 75 | * 匹配词段 76 | * @param charArray 77 | * @return Hit 78 | */ 79 | Hit match(char[] charArray) { 80 | return this.match(charArray, 0, charArray.length, null); 81 | } 82 | 83 | /** 84 | * 匹配词段 85 | * @param charArray 86 | * @param begin 87 | * @param length 88 | * @return Hit 89 | */ 90 | Hit match(char[] charArray, int begin, int length) { 91 | return this.match(charArray, begin, length, null); 92 | } 93 | 94 | /** 95 | * 匹配词段 96 | * @param charArray 97 | * @param begin 98 | * @param length 99 | * @param searchHit 100 | * @return Hit 101 | */ 102 | Hit match(char[] charArray, int begin, int length, Hit searchHit) { 103 | 104 | if (searchHit == null) { 105 | // 如果hit为空,新建 106 | searchHit = new Hit(); 107 | // 设置hit的其实文本位置 108 | searchHit.setBegin(begin); 109 | } else { 110 | // 否则要将HIT状态重置 111 | searchHit.setUnmatch(); 112 | } 113 | // 设置hit的当前处理位置 114 | searchHit.setEnd(begin); 115 | 116 | Character keyChar = new Character(charArray[begin]); 117 | DictSegment ds = null; 118 | 119 | // 引用实例变量为本地变量,避免查询时遇到更新的同步问题 120 | DictSegment[] segmentArray = this.childrenArray; 121 | Map segmentMap = this.childrenMap; 122 | 123 | // STEP1 在节点中查找keyChar对应的DictSegment 124 | if (segmentArray != null) { 125 | // 在数组中查找 126 | DictSegment keySegment = new DictSegment(keyChar); 127 | int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment); 128 | if (position >= 0) { 129 | ds = segmentArray[position]; 130 | } 131 | 132 | } else if (segmentMap != null) { 133 | // 在map中查找 134 | ds = (DictSegment) segmentMap.get(keyChar); 135 | } 136 | 137 | // STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果 138 | if (ds != null) { 139 | if (length > 1) { 140 | // 词未匹配完,继续往下搜索 141 | return ds.match(charArray, begin + 1, length - 1, searchHit); 142 | } else if (length == 1) { 143 | 144 | // 搜索最后一个char 145 | if (ds.nodeState == 1) { 146 | // 添加HIT状态为完全匹配 147 | searchHit.setMatch(); 148 | } 149 | if (ds.hasNextNode()) { 150 | // 添加HIT状态为前缀匹配 151 | searchHit.setPrefix(); 152 | // 记录当前位置的DictSegment 153 | searchHit.setMatchedDictSegment(ds); 154 | } 155 | return searchHit; 156 | } 157 | 158 | } 159 | // STEP3 没有找到DictSegment, 将HIT设置为不匹配 160 | return searchHit; 161 | } 162 | 163 | /** 164 | * 加载填充词典片段 165 | * @param charArray 166 | */ 167 | void fillSegment(char[] charArray) { 168 | this.fillSegment(charArray, 0, charArray.length, 1); 169 | } 170 | 171 | /** 172 | * 屏蔽词典中的一个词 173 | * @param charArray 174 | */ 175 | void disableSegment(char[] charArray) { 176 | this.fillSegment(charArray, 0, charArray.length, 0); 177 | } 178 | 179 | /** 180 | * 加载填充词典片段 181 | * @param charArray 182 | * @param begin 183 | * @param length 184 | * @param enabled 185 | */ 186 | private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled) { 187 | // 获取字典表中的汉字对象 188 | Character beginChar = new Character(charArray[begin]); 189 | Character keyChar = charMap.get(beginChar); 190 | // 字典中没有该字,则将其添加入字典 191 | if (keyChar == null) { 192 | charMap.put(beginChar, beginChar); 193 | keyChar = beginChar; 194 | } 195 | 196 | // 搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建 197 | DictSegment ds = lookforSegment(keyChar, enabled); 198 | if (ds != null) { 199 | // 处理keyChar对应的segment 200 | if (length > 1) { 201 | // 词元还没有完全加入词典树 202 | ds.fillSegment(charArray, begin + 1, length - 1, enabled); 203 | } else if (length == 1) { 204 | // 已经是词元的最后一个char,设置当前节点状态为enabled, 205 | // enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词 206 | ds.nodeState = enabled; 207 | } 208 | } 209 | 210 | } 211 | 212 | /** 213 | * 查找本节点下对应的keyChar的segment * 214 | * @param keyChar 215 | * @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null 216 | * @return 217 | */ 218 | private DictSegment lookforSegment(Character keyChar, int create) { 219 | 220 | DictSegment ds = null; 221 | 222 | if (this.storeSize <= ARRAY_LENGTH_LIMIT) { 223 | // 获取数组容器,如果数组未创建则创建数组 224 | DictSegment[] segmentArray = getChildrenArray(); 225 | // 搜寻数组 226 | DictSegment keySegment = new DictSegment(keyChar); 227 | int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment); 228 | if (position >= 0) { 229 | ds = segmentArray[position]; 230 | } 231 | 232 | // 遍历数组后没有找到对应的segment 233 | if (ds == null && create == 1) { 234 | ds = keySegment; 235 | if (this.storeSize < ARRAY_LENGTH_LIMIT) { 236 | // 数组容量未满,使用数组存储 237 | segmentArray[this.storeSize] = ds; 238 | // segment数目+1 239 | this.storeSize++; 240 | Arrays.sort(segmentArray, 0, this.storeSize); 241 | 242 | } else { 243 | // 数组容量已满,切换Map存储 244 | // 获取Map容器,如果Map未创建,则创建Map 245 | Map segmentMap = getChildrenMap(); 246 | // 将数组中的segment迁移到Map中 247 | migrate(segmentArray, segmentMap); 248 | // 存储新的segment 249 | segmentMap.put(keyChar, ds); 250 | // segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组 251 | this.storeSize++; 252 | // 释放当前的数组引用 253 | this.childrenArray = null; 254 | } 255 | 256 | } 257 | 258 | } else { 259 | // 获取Map容器,如果Map未创建,则创建Map 260 | Map segmentMap = getChildrenMap(); 261 | // 搜索Map 262 | ds = (DictSegment) segmentMap.get(keyChar); 263 | if (ds == null && create == 1) { 264 | // 构造新的segment 265 | ds = new DictSegment(keyChar); 266 | segmentMap.put(keyChar, ds); 267 | // 当前节点存储segment数目+1 268 | this.storeSize++; 269 | } 270 | } 271 | 272 | return ds; 273 | } 274 | 275 | /** 276 | * 获取数组容器 277 | * 线程同步方法 278 | */ 279 | private DictSegment[] getChildrenArray() { 280 | if (this.childrenArray == null) { 281 | synchronized (this) { 282 | if (this.childrenArray == null) { 283 | this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT]; 284 | } 285 | } 286 | } 287 | return this.childrenArray; 288 | } 289 | 290 | /** 291 | * 获取Map容器 292 | * 线程同步方法 293 | */ 294 | private Map getChildrenMap() { 295 | if (this.childrenMap == null) { 296 | synchronized (this) { 297 | if (this.childrenMap == null) { 298 | this.childrenMap = new HashMap(ARRAY_LENGTH_LIMIT * 2, 0.8f); 299 | } 300 | } 301 | } 302 | return this.childrenMap; 303 | } 304 | 305 | /** 306 | * 将数组中的segment迁移到Map中 307 | * @param segmentArray 308 | */ 309 | private void migrate(DictSegment[] segmentArray, Map segmentMap) { 310 | for (DictSegment segment : segmentArray) { 311 | if (segment != null) { 312 | segmentMap.put(segment.nodeChar, segment); 313 | } 314 | } 315 | } 316 | 317 | /** 318 | * 实现Comparable接口 319 | * @param o 320 | * @return int 321 | */ 322 | public int compareTo(DictSegment o) { 323 | // 对当前节点存储的char进行比较 324 | return this.nodeChar.compareTo(o.nodeChar); 325 | } 326 | 327 | } 328 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/Dictionary.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | import java.io.BufferedReader; 29 | import java.io.IOException; 30 | import java.io.InputStream; 31 | import java.io.InputStreamReader; 32 | import java.util.Collection; 33 | import java.util.List; 34 | 35 | import org.wltea.analyzer.cfg.Configuration; 36 | 37 | /** 38 | * 词典管理类,单子模式 39 | */ 40 | public class Dictionary { 41 | 42 | /* 43 | * 词典单子实例 44 | */ 45 | private static Dictionary singleton; 46 | 47 | /* 48 | * 主词典对象 49 | */ 50 | private DictSegment _MainDict; 51 | 52 | /* 53 | * 停止词词典 54 | */ 55 | private DictSegment _StopWordDict; 56 | /* 57 | * 量词词典 58 | */ 59 | private DictSegment _QuantifierDict; 60 | 61 | /** 62 | * 配置对象 63 | */ 64 | private Configuration cfg; 65 | 66 | private Dictionary(Configuration cfg) { 67 | this.cfg = cfg; 68 | this.loadMainDict(); 69 | this.loadStopWordDict(); 70 | this.loadQuantifierDict(); 71 | } 72 | 73 | /** 74 | * 词典初始化 75 | * 由于IK Analyzer的词典采用Dictionary类的静态方法进行词典初始化 76 | * 只有当Dictionary类被实际调用时,才会开始载入词典, 77 | * 这将延长首次分词操作的时间 78 | * 该方法提供了一个在应用加载阶段就初始化字典的手段 79 | * @return Dictionary 80 | */ 81 | public static Dictionary initial(Configuration cfg) { 82 | if (singleton == null) { 83 | synchronized (Dictionary.class) { 84 | if (singleton == null) { 85 | singleton = new Dictionary(cfg); 86 | return singleton; 87 | } 88 | } 89 | } 90 | return singleton; 91 | } 92 | 93 | /** 94 | * 获取词典单子实例 95 | * @return Dictionary 单例对象 96 | */ 97 | public static Dictionary getSingleton() { 98 | if (singleton == null) { 99 | throw new IllegalStateException("词典尚未初始化,请先调用initial方法"); 100 | } 101 | return singleton; 102 | } 103 | 104 | /** 105 | * 批量加载新词条 106 | * @param words Collection词条列表 107 | */ 108 | public void addWords(Collection words) { 109 | if (words != null) { 110 | for (String word : words) { 111 | if (word != null) { 112 | // 批量加载词条到主内存词典中 113 | singleton._MainDict.fillSegment(word.trim().toLowerCase().toCharArray()); 114 | } 115 | } 116 | } 117 | } 118 | 119 | /** 120 | * 批量移除(屏蔽)词条 121 | * @param words 122 | */ 123 | public void disableWords(Collection words) { 124 | if (words != null) { 125 | for (String word : words) { 126 | if (word != null) { 127 | // 批量屏蔽词条 128 | singleton._MainDict.disableSegment(word.trim().toLowerCase().toCharArray()); 129 | } 130 | } 131 | } 132 | } 133 | 134 | /** 135 | * 检索匹配主词典 136 | * @param charArray 137 | * @return Hit 匹配结果描述 138 | */ 139 | public Hit matchInMainDict(char[] charArray) { 140 | return singleton._MainDict.match(charArray); 141 | } 142 | 143 | /** 144 | * 检索匹配主词典 145 | * @param charArray 146 | * @param begin 147 | * @param length 148 | * @return Hit 匹配结果描述 149 | */ 150 | public Hit matchInMainDict(char[] charArray, int begin, int length) { 151 | return singleton._MainDict.match(charArray, begin, length); 152 | } 153 | 154 | /** 155 | * 检索匹配量词词典 156 | * @param charArray 157 | * @param begin 158 | * @param length 159 | * @return Hit 匹配结果描述 160 | */ 161 | public Hit matchInQuantifierDict(char[] charArray, int begin, int length) { 162 | return singleton._QuantifierDict.match(charArray, begin, length); 163 | } 164 | 165 | /** 166 | * 从已匹配的Hit中直接取出DictSegment,继续向下匹配 167 | * @param charArray 168 | * @param currentIndex 169 | * @param matchedHit 170 | * @return Hit 171 | */ 172 | public Hit matchWithHit(char[] charArray, int currentIndex, Hit matchedHit) { 173 | DictSegment ds = matchedHit.getMatchedDictSegment(); 174 | return ds.match(charArray, currentIndex, 1, matchedHit); 175 | } 176 | 177 | /** 178 | * 判断是否是停止词 179 | * @param charArray 180 | * @param begin 181 | * @param length 182 | * @return boolean 183 | */ 184 | public boolean isStopWord(char[] charArray, int begin, int length) { 185 | return singleton._StopWordDict.match(charArray, begin, length).isMatch(); 186 | } 187 | 188 | /** 189 | * 加载主词典及扩展词典 190 | */ 191 | private void loadMainDict() { 192 | // 建立一个主词典实例 193 | _MainDict = new DictSegment((char) 0); 194 | // 读取主词典文件 195 | InputStream is = this.getClass().getClassLoader().getResourceAsStream(cfg.getMainDictionary()); 196 | if (is == null) { 197 | throw new RuntimeException("Main Dictionary not found!!!"); 198 | } 199 | 200 | try { 201 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); 202 | String theWord = null; 203 | do { 204 | theWord = br.readLine(); 205 | if (theWord != null && !"".equals(theWord.trim())) { 206 | _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); 207 | } 208 | } while (theWord != null); 209 | 210 | } catch (IOException ioe) { 211 | System.err.println("Main Dictionary loading exception."); 212 | ioe.printStackTrace(); 213 | 214 | } finally { 215 | try { 216 | if (is != null) { 217 | is.close(); 218 | is = null; 219 | } 220 | } catch (IOException e) { 221 | e.printStackTrace(); 222 | } 223 | } 224 | // 加载扩展词典 225 | this.loadExtDict(); 226 | } 227 | 228 | /** 229 | * 加载用户配置的扩展词典到主词库表 230 | */ 231 | private void loadExtDict() { 232 | // 加载扩展词典配置 233 | List extDictFiles = cfg.getExtDictionarys(); 234 | if (extDictFiles != null) { 235 | InputStream is = null; 236 | for (String extDictName : extDictFiles) { 237 | // 读取扩展词典文件 238 | System.out.println("加载扩展词典:" + extDictName); 239 | is = this.getClass().getClassLoader().getResourceAsStream(extDictName); 240 | // 如果找不到扩展的字典,则忽略 241 | if (is == null) { 242 | continue; 243 | } 244 | try { 245 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); 246 | String theWord = null; 247 | do { 248 | theWord = br.readLine(); 249 | if (theWord != null && !"".equals(theWord.trim())) { 250 | // 加载扩展词典数据到主内存词典中 251 | // System.out.println(theWord); 252 | _MainDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); 253 | } 254 | } while (theWord != null); 255 | 256 | } catch (IOException ioe) { 257 | System.err.println("Extension Dictionary loading exception."); 258 | ioe.printStackTrace(); 259 | 260 | } finally { 261 | try { 262 | if (is != null) { 263 | is.close(); 264 | is = null; 265 | } 266 | } catch (IOException e) { 267 | e.printStackTrace(); 268 | } 269 | } 270 | } 271 | } 272 | } 273 | 274 | /** 275 | * 加载用户扩展的停止词词典 276 | */ 277 | private void loadStopWordDict() { 278 | // 建立一个主词典实例 279 | _StopWordDict = new DictSegment((char) 0); 280 | // 加载扩展停止词典 281 | List extStopWordDictFiles = cfg.getExtStopWordDictionarys(); 282 | if (extStopWordDictFiles != null) { 283 | InputStream is = null; 284 | for (String extStopWordDictName : extStopWordDictFiles) { 285 | System.out.println("加载扩展停止词典:" + extStopWordDictName); 286 | // 读取扩展词典文件 287 | is = this.getClass().getClassLoader().getResourceAsStream(extStopWordDictName); 288 | // 如果找不到扩展的字典,则忽略 289 | if (is == null) { 290 | continue; 291 | } 292 | try { 293 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); 294 | String theWord = null; 295 | do { 296 | theWord = br.readLine(); 297 | if (theWord != null && !"".equals(theWord.trim())) { 298 | // System.out.println(theWord); 299 | // 加载扩展停止词典数据到内存中 300 | _StopWordDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); 301 | } 302 | } while (theWord != null); 303 | 304 | } catch (IOException ioe) { 305 | System.err.println("Extension Stop word Dictionary loading exception."); 306 | ioe.printStackTrace(); 307 | 308 | } finally { 309 | try { 310 | if (is != null) { 311 | is.close(); 312 | is = null; 313 | } 314 | } catch (IOException e) { 315 | e.printStackTrace(); 316 | } 317 | } 318 | } 319 | } 320 | } 321 | 322 | /** 323 | * 加载量词词典 324 | */ 325 | private void loadQuantifierDict() { 326 | // 建立一个量词典实例 327 | _QuantifierDict = new DictSegment((char) 0); 328 | // 读取量词词典文件 329 | InputStream is = this.getClass().getClassLoader() 330 | .getResourceAsStream(cfg.getQuantifierDicionary()); 331 | if (is == null) { 332 | throw new RuntimeException("Quantifier Dictionary not found!!!"); 333 | } 334 | try { 335 | BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"), 512); 336 | String theWord = null; 337 | do { 338 | theWord = br.readLine(); 339 | if (theWord != null && !"".equals(theWord.trim())) { 340 | _QuantifierDict.fillSegment(theWord.trim().toLowerCase().toCharArray()); 341 | } 342 | } while (theWord != null); 343 | 344 | } catch (IOException ioe) { 345 | System.err.println("Quantifier Dictionary loading exception."); 346 | ioe.printStackTrace(); 347 | 348 | } finally { 349 | try { 350 | if (is != null) { 351 | is.close(); 352 | is = null; 353 | } 354 | } catch (IOException e) { 355 | e.printStackTrace(); 356 | } 357 | } 358 | } 359 | 360 | } 361 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | // Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | // Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | // Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | // 该HIT当前状态,默认未匹配 40 | private int hitState = UNMATCH; 41 | 42 | // 记录词典匹配过程中,当前匹配到的词典分支节点 43 | private DictSegment matchedDictSegment; 44 | /* 45 | * 词段开始位置 46 | */ 47 | private int begin; 48 | /* 49 | * 词段的结束位置 50 | */ 51 | private int end; 52 | 53 | /** 54 | * 判断是否完全匹配 55 | */ 56 | public boolean isMatch() { 57 | return (this.hitState & MATCH) > 0; 58 | } 59 | 60 | /** 61 | * 62 | */ 63 | public void setMatch() { 64 | this.hitState = this.hitState | MATCH; 65 | } 66 | 67 | /** 68 | * 判断是否是词的前缀 69 | */ 70 | public boolean isPrefix() { 71 | return (this.hitState & PREFIX) > 0; 72 | } 73 | 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | 81 | /** 82 | * 判断是否是不匹配 83 | */ 84 | public boolean isUnmatch() { 85 | return this.hitState == UNMATCH; 86 | } 87 | 88 | /** 89 | * 90 | */ 91 | public void setUnmatch() { 92 | this.hitState = UNMATCH; 93 | } 94 | 95 | public DictSegment getMatchedDictSegment() { 96 | return matchedDictSegment; 97 | } 98 | 99 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 100 | this.matchedDictSegment = matchedDictSegment; 101 | } 102 | 103 | public int getBegin() { 104 | return begin; 105 | } 106 | 107 | public void setBegin(int begin) { 108 | this.begin = begin; 109 | } 110 | 111 | public int getEnd() { 112 | return end; 113 | } 114 | 115 | public void setEnd(int end) { 116 | this.end = end; 117 | } 118 | 119 | } 120 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/dic/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 | 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.lucene; 26 | 27 | import java.io.Reader; 28 | 29 | import org.apache.lucene.analysis.Analyzer; 30 | import org.apache.lucene.analysis.Tokenizer; 31 | 32 | /** 33 | * IK分词器,Lucene Analyzer接口实现 34 | * 兼容Lucene 4.0版本 35 | */ 36 | public final class IKAnalyzer extends Analyzer { 37 | 38 | private boolean useSmart; 39 | 40 | public boolean useSmart() { 41 | return useSmart; 42 | } 43 | 44 | public void setUseSmart(boolean useSmart) { 45 | this.useSmart = useSmart; 46 | } 47 | 48 | /** 49 | * IK分词器Lucene Analyzer接口实现类 50 | * 51 | * 默认细粒度切分算法 52 | */ 53 | public IKAnalyzer() { 54 | this(false); 55 | } 56 | 57 | /** 58 | * IK分词器Lucene Analyzer接口实现类 59 | * 60 | * @param useSmart 当为true时,分词器进行智能切分 61 | */ 62 | public IKAnalyzer(boolean useSmart) { 63 | super(); 64 | this.useSmart = useSmart; 65 | } 66 | 67 | /** 68 | * 重载Analyzer接口,构造分词组件 69 | */ 70 | @Override 71 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) { 72 | Tokenizer _IKTokenizer = new IKTokenizer(in, this.useSmart()); 73 | return new TokenStreamComponents(_IKTokenizer); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * IK分词器 Lucene Tokenizer适配器类 42 | * 兼容Lucene 4.0版本 43 | */ 44 | public final class IKTokenizer extends Tokenizer { 45 | 46 | // IK分词器实现 47 | private IKSegmenter _IKImplement; 48 | 49 | // 词元文本属性 50 | private final CharTermAttribute termAtt; 51 | // 词元位移属性 52 | private final OffsetAttribute offsetAtt; 53 | // 词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 54 | private final TypeAttribute typeAtt; 55 | // 记录最后一个词元的结束位置 56 | private int endPosition; 57 | 58 | /** 59 | * Lucene 4.0 Tokenizer适配器类构造函数 60 | * @param in 61 | * @param useSmart 62 | */ 63 | public IKTokenizer(Reader in, boolean useSmart) { 64 | super(in); 65 | offsetAtt = addAttribute(OffsetAttribute.class); 66 | termAtt = addAttribute(CharTermAttribute.class); 67 | typeAtt = addAttribute(TypeAttribute.class); 68 | _IKImplement = new IKSegmenter(input, useSmart); 69 | } 70 | 71 | /* 72 | * (non-Javadoc) 73 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 74 | */ 75 | @Override 76 | public boolean incrementToken() throws IOException { 77 | // 清除所有的词元属性 78 | clearAttributes(); 79 | Lexeme nextLexeme = _IKImplement.next(); 80 | if (nextLexeme != null) { 81 | // 将Lexeme转成Attributes 82 | // 设置词元文本 83 | termAtt.append(nextLexeme.getLexemeText()); 84 | // 设置词元长度 85 | termAtt.setLength(nextLexeme.getLength()); 86 | // 设置词元位移 87 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); 88 | // 记录分词的最后位置 89 | endPosition = nextLexeme.getEndPosition(); 90 | // 记录词元分类 91 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 92 | // 返会true告知还有下个词元 93 | return true; 94 | } 95 | // 返会false告知词元输出完毕 96 | return false; 97 | } 98 | 99 | /* 100 | * (non-Javadoc) 101 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 102 | */ 103 | @Override 104 | public void reset() throws IOException { 105 | super.reset(); 106 | _IKImplement.reset(input); 107 | } 108 | 109 | @Override 110 | public final void end() { 111 | // set final offset 112 | int finalOffset = correctOffset(this.endPosition); 113 | offsetAtt.setOffset(finalOffset, finalOffset); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.query; 26 | 27 | import java.util.ArrayList; 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | import java.util.Stack; 31 | 32 | import org.apache.lucene.index.Term; 33 | import org.apache.lucene.search.BooleanClause; 34 | import org.apache.lucene.search.BooleanQuery; 35 | import org.apache.lucene.search.Query; 36 | import org.apache.lucene.search.TermQuery; 37 | import org.apache.lucene.search.TermRangeQuery; 38 | import org.apache.lucene.search.BooleanClause.Occur; 39 | import org.apache.lucene.util.BytesRef; 40 | 41 | /** 42 | * IK简易查询表达式解析 43 | * 结合SWMCQuery算法 44 | * 45 | * 表达式例子 : 46 | * (id='1231231' && title:'monkey') || (content:'你好吗' || ulr='www.ik.com') - name:'helloword' 47 | * @author linliangyi 48 | * 49 | */ 50 | public class IKQueryExpressionParser { 51 | 52 | // public static final String LUCENE_SPECIAL_CHAR = "&&||-()':={}[],"; 53 | 54 | private List elements = new ArrayList(); 55 | 56 | private Stack querys = new Stack(); 57 | 58 | private Stack operates = new Stack(); 59 | 60 | /** 61 | * 解析查询表达式,生成Lucene Query对象 62 | * 63 | * @param expression 64 | * @param quickMode 65 | * @return Lucene query 66 | */ 67 | public Query parseExp(String expression, boolean quickMode) { 68 | Query lucenceQuery = null; 69 | if (expression != null && !"".equals(expression.trim())) { 70 | try { 71 | // 文法解析 72 | this.splitElements(expression); 73 | // 语法解析 74 | this.parseSyntax(quickMode); 75 | if (this.querys.size() == 1) { 76 | lucenceQuery = this.querys.pop(); 77 | } else { 78 | throw new IllegalStateException("表达式异常: 缺少逻辑操作符 或 括号缺失"); 79 | } 80 | } finally { 81 | elements.clear(); 82 | querys.clear(); 83 | operates.clear(); 84 | } 85 | } 86 | return lucenceQuery; 87 | } 88 | 89 | /** 90 | * 表达式文法解析 91 | * @param expression 92 | */ 93 | private void splitElements(String expression) { 94 | 95 | if (expression == null) { 96 | return; 97 | } 98 | Element curretElement = null; 99 | 100 | char[] expChars = expression.toCharArray(); 101 | for (int i = 0; i < expChars.length; i++) { 102 | switch (expChars[i]) { 103 | case '&': 104 | if (curretElement == null) { 105 | curretElement = new Element(); 106 | curretElement.type = '&'; 107 | curretElement.append(expChars[i]); 108 | } else if (curretElement.type == '&') { 109 | curretElement.append(expChars[i]); 110 | this.elements.add(curretElement); 111 | curretElement = null; 112 | } else if (curretElement.type == '\'') { 113 | curretElement.append(expChars[i]); 114 | } else { 115 | this.elements.add(curretElement); 116 | curretElement = new Element(); 117 | curretElement.type = '&'; 118 | curretElement.append(expChars[i]); 119 | } 120 | break; 121 | 122 | case '|': 123 | if (curretElement == null) { 124 | curretElement = new Element(); 125 | curretElement.type = '|'; 126 | curretElement.append(expChars[i]); 127 | } else if (curretElement.type == '|') { 128 | curretElement.append(expChars[i]); 129 | this.elements.add(curretElement); 130 | curretElement = null; 131 | } else if (curretElement.type == '\'') { 132 | curretElement.append(expChars[i]); 133 | } else { 134 | this.elements.add(curretElement); 135 | curretElement = new Element(); 136 | curretElement.type = '|'; 137 | curretElement.append(expChars[i]); 138 | } 139 | break; 140 | 141 | case '-': 142 | if (curretElement != null) { 143 | if (curretElement.type == '\'') { 144 | curretElement.append(expChars[i]); 145 | continue; 146 | } else { 147 | this.elements.add(curretElement); 148 | } 149 | } 150 | curretElement = new Element(); 151 | curretElement.type = '-'; 152 | curretElement.append(expChars[i]); 153 | this.elements.add(curretElement); 154 | curretElement = null; 155 | break; 156 | 157 | case '(': 158 | if (curretElement != null) { 159 | if (curretElement.type == '\'') { 160 | curretElement.append(expChars[i]); 161 | continue; 162 | } else { 163 | this.elements.add(curretElement); 164 | } 165 | } 166 | curretElement = new Element(); 167 | curretElement.type = '('; 168 | curretElement.append(expChars[i]); 169 | this.elements.add(curretElement); 170 | curretElement = null; 171 | break; 172 | 173 | case ')': 174 | if (curretElement != null) { 175 | if (curretElement.type == '\'') { 176 | curretElement.append(expChars[i]); 177 | continue; 178 | } else { 179 | this.elements.add(curretElement); 180 | } 181 | } 182 | curretElement = new Element(); 183 | curretElement.type = ')'; 184 | curretElement.append(expChars[i]); 185 | this.elements.add(curretElement); 186 | curretElement = null; 187 | break; 188 | 189 | case ':': 190 | if (curretElement != null) { 191 | if (curretElement.type == '\'') { 192 | curretElement.append(expChars[i]); 193 | continue; 194 | } else { 195 | this.elements.add(curretElement); 196 | } 197 | } 198 | curretElement = new Element(); 199 | curretElement.type = ':'; 200 | curretElement.append(expChars[i]); 201 | this.elements.add(curretElement); 202 | curretElement = null; 203 | break; 204 | 205 | case '=': 206 | if (curretElement != null) { 207 | if (curretElement.type == '\'') { 208 | curretElement.append(expChars[i]); 209 | continue; 210 | } else { 211 | this.elements.add(curretElement); 212 | } 213 | } 214 | curretElement = new Element(); 215 | curretElement.type = '='; 216 | curretElement.append(expChars[i]); 217 | this.elements.add(curretElement); 218 | curretElement = null; 219 | break; 220 | 221 | case ' ': 222 | if (curretElement != null) { 223 | if (curretElement.type == '\'') { 224 | curretElement.append(expChars[i]); 225 | } else { 226 | this.elements.add(curretElement); 227 | curretElement = null; 228 | } 229 | } 230 | 231 | break; 232 | 233 | case '\'': 234 | if (curretElement == null) { 235 | curretElement = new Element(); 236 | curretElement.type = '\''; 237 | 238 | } else if (curretElement.type == '\'') { 239 | this.elements.add(curretElement); 240 | curretElement = null; 241 | 242 | } else { 243 | this.elements.add(curretElement); 244 | curretElement = new Element(); 245 | curretElement.type = '\''; 246 | 247 | } 248 | break; 249 | 250 | case '[': 251 | if (curretElement != null) { 252 | if (curretElement.type == '\'') { 253 | curretElement.append(expChars[i]); 254 | continue; 255 | } else { 256 | this.elements.add(curretElement); 257 | } 258 | } 259 | curretElement = new Element(); 260 | curretElement.type = '['; 261 | curretElement.append(expChars[i]); 262 | this.elements.add(curretElement); 263 | curretElement = null; 264 | break; 265 | 266 | case ']': 267 | if (curretElement != null) { 268 | if (curretElement.type == '\'') { 269 | curretElement.append(expChars[i]); 270 | continue; 271 | } else { 272 | this.elements.add(curretElement); 273 | } 274 | } 275 | curretElement = new Element(); 276 | curretElement.type = ']'; 277 | curretElement.append(expChars[i]); 278 | this.elements.add(curretElement); 279 | curretElement = null; 280 | 281 | break; 282 | 283 | case '{': 284 | if (curretElement != null) { 285 | if (curretElement.type == '\'') { 286 | curretElement.append(expChars[i]); 287 | continue; 288 | } else { 289 | this.elements.add(curretElement); 290 | } 291 | } 292 | curretElement = new Element(); 293 | curretElement.type = '{'; 294 | curretElement.append(expChars[i]); 295 | this.elements.add(curretElement); 296 | curretElement = null; 297 | break; 298 | 299 | case '}': 300 | if (curretElement != null) { 301 | if (curretElement.type == '\'') { 302 | curretElement.append(expChars[i]); 303 | continue; 304 | } else { 305 | this.elements.add(curretElement); 306 | } 307 | } 308 | curretElement = new Element(); 309 | curretElement.type = '}'; 310 | curretElement.append(expChars[i]); 311 | this.elements.add(curretElement); 312 | curretElement = null; 313 | 314 | break; 315 | case ',': 316 | if (curretElement != null) { 317 | if (curretElement.type == '\'') { 318 | curretElement.append(expChars[i]); 319 | continue; 320 | } else { 321 | this.elements.add(curretElement); 322 | } 323 | } 324 | curretElement = new Element(); 325 | curretElement.type = ','; 326 | curretElement.append(expChars[i]); 327 | this.elements.add(curretElement); 328 | curretElement = null; 329 | 330 | break; 331 | 332 | default: 333 | if (curretElement == null) { 334 | curretElement = new Element(); 335 | curretElement.type = 'F'; 336 | curretElement.append(expChars[i]); 337 | 338 | } else if (curretElement.type == 'F') { 339 | curretElement.append(expChars[i]); 340 | 341 | } else if (curretElement.type == '\'') { 342 | curretElement.append(expChars[i]); 343 | 344 | } else { 345 | this.elements.add(curretElement); 346 | curretElement = new Element(); 347 | curretElement.type = 'F'; 348 | curretElement.append(expChars[i]); 349 | } 350 | } 351 | } 352 | 353 | if (curretElement != null) { 354 | this.elements.add(curretElement); 355 | curretElement = null; 356 | } 357 | } 358 | 359 | /** 360 | * 语法解析 361 | * 362 | */ 363 | private void parseSyntax(boolean quickMode) { 364 | for (int i = 0; i < this.elements.size(); i++) { 365 | Element e = this.elements.get(i); 366 | if ('F' == e.type) { 367 | Element e2 = this.elements.get(i + 1); 368 | if ('=' != e2.type && ':' != e2.type) { 369 | throw new IllegalStateException("表达式异常: = 或 : 号丢失"); 370 | } 371 | Element e3 = this.elements.get(i + 2); 372 | // 处理 = 和 : 运算 373 | if ('\'' == e3.type) { 374 | i += 2; 375 | if ('=' == e2.type) { 376 | TermQuery tQuery = new TermQuery(new Term(e.toString(), e3.toString())); 377 | this.querys.push(tQuery); 378 | } else if (':' == e2.type) { 379 | String keyword = e3.toString(); 380 | // SWMCQuery Here 381 | Query _SWMCQuery = SWMCQueryBuilder.create(e.toString(), keyword, quickMode); 382 | this.querys.push(_SWMCQuery); 383 | } 384 | 385 | } else if ('[' == e3.type || '{' == e3.type) { 386 | i += 2; 387 | // 处理 [] 和 {} 388 | LinkedList eQueue = new LinkedList(); 389 | eQueue.add(e3); 390 | for (i++; i < this.elements.size(); i++) { 391 | Element eN = this.elements.get(i); 392 | eQueue.add(eN); 393 | if (']' == eN.type || '}' == eN.type) { 394 | break; 395 | } 396 | } 397 | // 翻译RangeQuery 398 | Query rangeQuery = this.toTermRangeQuery(e, eQueue); 399 | this.querys.push(rangeQuery); 400 | } else { 401 | throw new IllegalStateException("表达式异常:匹配值丢失"); 402 | } 403 | 404 | } else if ('(' == e.type) { 405 | this.operates.push(e); 406 | 407 | } else if (')' == e.type) { 408 | boolean doPop = true; 409 | while (doPop && !this.operates.empty()) { 410 | Element op = this.operates.pop(); 411 | if ('(' == op.type) { 412 | doPop = false; 413 | } else { 414 | Query q = toBooleanQuery(op); 415 | this.querys.push(q); 416 | } 417 | 418 | } 419 | } else { 420 | 421 | if (this.operates.isEmpty()) { 422 | this.operates.push(e); 423 | } else { 424 | boolean doPeek = true; 425 | while (doPeek && !this.operates.isEmpty()) { 426 | Element eleOnTop = this.operates.peek(); 427 | if ('(' == eleOnTop.type) { 428 | doPeek = false; 429 | this.operates.push(e); 430 | } else if (compare(e, eleOnTop) == 1) { 431 | this.operates.push(e); 432 | doPeek = false; 433 | } else if (compare(e, eleOnTop) == 0) { 434 | Query q = toBooleanQuery(eleOnTop); 435 | this.operates.pop(); 436 | this.querys.push(q); 437 | } else { 438 | Query q = toBooleanQuery(eleOnTop); 439 | this.operates.pop(); 440 | this.querys.push(q); 441 | } 442 | } 443 | 444 | if (doPeek && this.operates.empty()) { 445 | this.operates.push(e); 446 | } 447 | } 448 | } 449 | } 450 | 451 | while (!this.operates.isEmpty()) { 452 | Element eleOnTop = this.operates.pop(); 453 | Query q = toBooleanQuery(eleOnTop); 454 | this.querys.push(q); 455 | } 456 | } 457 | 458 | /** 459 | * 根据逻辑操作符,生成BooleanQuery 460 | * @param op 461 | * @return 462 | */ 463 | private Query toBooleanQuery(Element op) { 464 | if (this.querys.size() == 0) { 465 | return null; 466 | } 467 | 468 | BooleanQuery resultQuery = new BooleanQuery(); 469 | 470 | if (this.querys.size() == 1) { 471 | return this.querys.get(0); 472 | } 473 | 474 | Query q2 = this.querys.pop(); 475 | Query q1 = this.querys.pop(); 476 | if ('&' == op.type) { 477 | if (q1 != null) { 478 | if (q1 instanceof BooleanQuery) { 479 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); 480 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) { 481 | for (BooleanClause c : clauses) { 482 | resultQuery.add(c); 483 | } 484 | } else { 485 | resultQuery.add(q1, Occur.MUST); 486 | } 487 | 488 | } else { 489 | // q1 instanceof TermQuery 490 | // q1 instanceof TermRangeQuery 491 | // q1 instanceof PhraseQuery 492 | // others 493 | resultQuery.add(q1, Occur.MUST); 494 | } 495 | } 496 | 497 | if (q2 != null) { 498 | if (q2 instanceof BooleanQuery) { 499 | BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); 500 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.MUST) { 501 | for (BooleanClause c : clauses) { 502 | resultQuery.add(c); 503 | } 504 | } else { 505 | resultQuery.add(q2, Occur.MUST); 506 | } 507 | 508 | } else { 509 | // q1 instanceof TermQuery 510 | // q1 instanceof TermRangeQuery 511 | // q1 instanceof PhraseQuery 512 | // others 513 | resultQuery.add(q2, Occur.MUST); 514 | } 515 | } 516 | 517 | } else if ('|' == op.type) { 518 | if (q1 != null) { 519 | if (q1 instanceof BooleanQuery) { 520 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); 521 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) { 522 | for (BooleanClause c : clauses) { 523 | resultQuery.add(c); 524 | } 525 | } else { 526 | resultQuery.add(q1, Occur.SHOULD); 527 | } 528 | 529 | } else { 530 | // q1 instanceof TermQuery 531 | // q1 instanceof TermRangeQuery 532 | // q1 instanceof PhraseQuery 533 | // others 534 | resultQuery.add(q1, Occur.SHOULD); 535 | } 536 | } 537 | 538 | if (q2 != null) { 539 | if (q2 instanceof BooleanQuery) { 540 | BooleanClause[] clauses = ((BooleanQuery) q2).getClauses(); 541 | if (clauses.length > 0 && clauses[0].getOccur() == Occur.SHOULD) { 542 | for (BooleanClause c : clauses) { 543 | resultQuery.add(c); 544 | } 545 | } else { 546 | resultQuery.add(q2, Occur.SHOULD); 547 | } 548 | } else { 549 | // q2 instanceof TermQuery 550 | // q2 instanceof TermRangeQuery 551 | // q2 instanceof PhraseQuery 552 | // others 553 | resultQuery.add(q2, Occur.SHOULD); 554 | 555 | } 556 | } 557 | 558 | } else if ('-' == op.type) { 559 | if (q1 == null || q2 == null) { 560 | throw new IllegalStateException("表达式异常:SubQuery 个数不匹配"); 561 | } 562 | 563 | if (q1 instanceof BooleanQuery) { 564 | BooleanClause[] clauses = ((BooleanQuery) q1).getClauses(); 565 | if (clauses.length > 0) { 566 | for (BooleanClause c : clauses) { 567 | resultQuery.add(c); 568 | } 569 | } else { 570 | resultQuery.add(q1, Occur.MUST); 571 | } 572 | 573 | } else { 574 | // q1 instanceof TermQuery 575 | // q1 instanceof TermRangeQuery 576 | // q1 instanceof PhraseQuery 577 | // others 578 | resultQuery.add(q1, Occur.MUST); 579 | } 580 | 581 | resultQuery.add(q2, Occur.MUST_NOT); 582 | } 583 | return resultQuery; 584 | } 585 | 586 | /** 587 | * 组装TermRangeQuery 588 | * @param elements 589 | * @return 590 | */ 591 | private TermRangeQuery toTermRangeQuery(Element fieldNameEle, LinkedList elements) { 592 | 593 | boolean includeFirst = false; 594 | boolean includeLast = false; 595 | String firstValue = null; 596 | String lastValue = null; 597 | // 检查第一个元素是否是[或者{ 598 | Element first = elements.getFirst(); 599 | if ('[' == first.type) { 600 | includeFirst = true; 601 | } else if ('{' == first.type) { 602 | includeFirst = false; 603 | } else { 604 | throw new IllegalStateException("表达式异常"); 605 | } 606 | // 检查最后一个元素是否是]或者} 607 | Element last = elements.getLast(); 608 | if (']' == last.type) { 609 | includeLast = true; 610 | } else if ('}' == last.type) { 611 | includeLast = false; 612 | } else { 613 | throw new IllegalStateException("表达式异常, RangeQuery缺少结束括号"); 614 | } 615 | if (elements.size() < 4 || elements.size() > 5) { 616 | throw new IllegalStateException("表达式异常, RangeQuery 错误"); 617 | } 618 | // 读出中间部分 619 | Element e2 = elements.get(1); 620 | if ('\'' == e2.type) { 621 | firstValue = e2.toString(); 622 | // 623 | Element e3 = elements.get(2); 624 | if (',' != e3.type) { 625 | throw new IllegalStateException("表达式异常, RangeQuery缺少逗号分隔"); 626 | } 627 | // 628 | Element e4 = elements.get(3); 629 | if ('\'' == e4.type) { 630 | lastValue = e4.toString(); 631 | } else if (e4 != last) { 632 | throw new IllegalStateException("表达式异常,RangeQuery格式错误"); 633 | } 634 | } else if (',' == e2.type) { 635 | firstValue = null; 636 | // 637 | Element e3 = elements.get(2); 638 | if ('\'' == e3.type) { 639 | lastValue = e3.toString(); 640 | } else { 641 | throw new IllegalStateException("表达式异常,RangeQuery格式错误"); 642 | } 643 | 644 | } else { 645 | throw new IllegalStateException("表达式异常, RangeQuery格式错误"); 646 | } 647 | 648 | return new TermRangeQuery(fieldNameEle.toString(), new BytesRef(firstValue), new BytesRef( 649 | lastValue), includeFirst, includeLast); 650 | } 651 | 652 | /** 653 | * 比较操作符优先级 654 | * @param e1 655 | * @param e2 656 | * @return 657 | */ 658 | private int compare(Element e1, Element e2) { 659 | if ('&' == e1.type) { 660 | if ('&' == e2.type) { 661 | return 0; 662 | } else { 663 | return 1; 664 | } 665 | } else if ('|' == e1.type) { 666 | if ('&' == e2.type) { 667 | return -1; 668 | } else if ('|' == e2.type) { 669 | return 0; 670 | } else { 671 | return 1; 672 | } 673 | } else { 674 | if ('-' == e2.type) { 675 | return 0; 676 | } else { 677 | return -1; 678 | } 679 | } 680 | } 681 | 682 | /** 683 | * 表达式元素(操作符、FieldName、FieldValue) 684 | * @author linliangyi 685 | * May 20, 2010 686 | */ 687 | private class Element { 688 | char type = 0; 689 | StringBuffer eleTextBuff; 690 | 691 | public Element() { 692 | eleTextBuff = new StringBuffer(); 693 | } 694 | 695 | public void append(char c) { 696 | this.eleTextBuff.append(c); 697 | } 698 | 699 | public String toString() { 700 | return this.eleTextBuff.toString(); 701 | } 702 | } 703 | 704 | public static void main(String[] args) { 705 | IKQueryExpressionParser parser = new IKQueryExpressionParser(); 706 | // String ikQueryExp = "newsTitle:'的两款《魔兽世界》插件Bigfoot和月光宝盒'"; 707 | String ikQueryExp = "(id='ABcdRf' && date:{'20010101','20110101'} && keyword:'魔兽中国') || (content:'KSHT-KSH-A001-18' || ulr='www.ik.com') - name:'林良益'"; 708 | Query result = parser.parseExp(ikQueryExp, true); 709 | System.out.println(result); 710 | 711 | } 712 | 713 | } 714 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.query; 26 | 27 | import java.io.IOException; 28 | import java.io.StringReader; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | 32 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 33 | import org.apache.lucene.queryparser.classic.ParseException; 34 | import org.apache.lucene.queryparser.classic.QueryParser; 35 | import org.apache.lucene.search.Query; 36 | import org.apache.lucene.util.Version; 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * Single Word Multi Char Query Builder 42 | * IK分词算法专用 43 | * @author linliangyi 44 | * 45 | */ 46 | public class SWMCQueryBuilder { 47 | 48 | /** 49 | * 生成SWMCQuery 50 | * @param fieldName 51 | * @param keywords 52 | * @param quickMode 53 | * @return Lucene Query 54 | */ 55 | public static Query create(String fieldName, String keywords, boolean quickMode) { 56 | if (fieldName == null || keywords == null) { 57 | throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); 58 | } 59 | // 1.对keywords进行分词处理 60 | List lexemes = doAnalyze(keywords); 61 | // 2.根据分词结果,生成SWMCQuery 62 | Query _SWMCQuery = getSWMCQuery(fieldName, lexemes, quickMode); 63 | return _SWMCQuery; 64 | } 65 | 66 | /** 67 | * 分词切分,并返回结链表 68 | * @param keywords 69 | * @return 70 | */ 71 | private static List doAnalyze(String keywords) { 72 | List lexemes = new ArrayList(); 73 | IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords), true); 74 | try { 75 | Lexeme l = null; 76 | while ((l = ikSeg.next()) != null) { 77 | lexemes.add(l); 78 | } 79 | } catch (IOException e) { 80 | e.printStackTrace(); 81 | } 82 | return lexemes; 83 | } 84 | 85 | /** 86 | * 根据分词结果生成SWMC搜索 87 | * @param fieldName 88 | * @param pathOption 89 | * @param quickMode 90 | * @return 91 | */ 92 | private static Query getSWMCQuery(String fieldName, List lexemes, boolean quickMode) { 93 | // 构造SWMC的查询表达式 94 | StringBuffer keywordBuffer = new StringBuffer(); 95 | // 精简的SWMC的查询表达式 96 | StringBuffer keywordBuffer_Short = new StringBuffer(); 97 | // 记录最后词元长度 98 | int lastLexemeLength = 0; 99 | // 记录最后词元结束位置 100 | int lastLexemeEnd = -1; 101 | 102 | int shortCount = 0; 103 | int totalCount = 0; 104 | for (Lexeme l : lexemes) { 105 | totalCount += l.getLength(); 106 | // 精简表达式 107 | if (l.getLength() > 1) { 108 | keywordBuffer_Short.append(' ').append(l.getLexemeText()); 109 | shortCount += l.getLength(); 110 | } 111 | 112 | if (lastLexemeLength == 0) { 113 | keywordBuffer.append(l.getLexemeText()); 114 | } else if (lastLexemeLength == 1 && l.getLength() == 1 115 | && lastLexemeEnd == l.getBeginPosition()) {// 单字位置相邻,长度为一,合并) 116 | keywordBuffer.append(l.getLexemeText()); 117 | } else { 118 | keywordBuffer.append(' ').append(l.getLexemeText()); 119 | 120 | } 121 | lastLexemeLength = l.getLength(); 122 | lastLexemeEnd = l.getEndPosition(); 123 | } 124 | 125 | // 借助lucene queryparser 生成SWMC Query 126 | QueryParser qp = new QueryParser(Version.LUCENE_43, fieldName, new StandardAnalyzer( 127 | Version.LUCENE_43)); 128 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 129 | qp.setAutoGeneratePhraseQueries(true); 130 | 131 | if (quickMode && (shortCount * 1.0f / totalCount) > 0.5f) { 132 | try { 133 | // System.out.println(keywordBuffer.toString()); 134 | Query q = qp.parse(keywordBuffer_Short.toString()); 135 | return q; 136 | } catch (ParseException e) { 137 | e.printStackTrace(); 138 | } 139 | 140 | } else { 141 | if (keywordBuffer.length() > 0) { 142 | try { 143 | // System.out.println(keywordBuffer.toString()); 144 | Query q = qp.parse(keywordBuffer.toString()); 145 | return q; 146 | } catch (ParseException e) { 147 | e.printStackTrace(); 148 | } 149 | } 150 | } 151 | return null; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.sample; 27 | 28 | import java.io.IOException; 29 | import java.io.StringReader; 30 | 31 | import org.apache.lucene.analysis.Analyzer; 32 | import org.apache.lucene.analysis.TokenStream; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | import org.wltea.analyzer.lucene.IKAnalyzer; 37 | 38 | /** 39 | * 使用IKAnalyzer进行分词的演示 40 | * 2012-10-22 41 | * 42 | */ 43 | public class IKAnalzyerDemo { 44 | 45 | public static void main(String[] args) { 46 | // 构建IK分词器,使用smart分词模式 47 | Analyzer analyzer = new IKAnalyzer(true); 48 | 49 | // 获取Lucene的TokenStream对象 50 | TokenStream ts = null; 51 | try { 52 | ts = analyzer.tokenStream("myfield", new StringReader( 53 | "这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); 54 | // 获取词元位置属性 55 | OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); 56 | // 获取词元文本属性 57 | CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); 58 | // 获取词元文本属性 59 | TypeAttribute type = ts.addAttribute(TypeAttribute.class); 60 | 61 | // 重置TokenStream(重置StringReader) 62 | ts.reset(); 63 | // 迭代获取分词结果 64 | while (ts.incrementToken()) { 65 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " 66 | + term.toString() + " | " + type.type()); 67 | } 68 | // 关闭TokenStream(关闭StringReader) 69 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset. 70 | 71 | } catch (IOException e) { 72 | e.printStackTrace(); 73 | } finally { 74 | // 释放TokenStream的所有资源 75 | if (ts != null) { 76 | try { 77 | ts.close(); 78 | } catch (IOException e) { 79 | e.printStackTrace(); 80 | } 81 | } 82 | } 83 | 84 | } 85 | 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.sample; 27 | 28 | import java.io.IOException; 29 | 30 | import org.apache.lucene.analysis.Analyzer; 31 | import org.apache.lucene.document.Document; 32 | import org.apache.lucene.document.Field; 33 | import org.apache.lucene.document.StringField; 34 | import org.apache.lucene.document.TextField; 35 | import org.apache.lucene.index.CorruptIndexException; 36 | import org.apache.lucene.index.DirectoryReader; 37 | import org.apache.lucene.index.IndexReader; 38 | import org.apache.lucene.index.IndexWriter; 39 | import org.apache.lucene.index.IndexWriterConfig; 40 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 41 | import org.apache.lucene.queryparser.classic.ParseException; 42 | import org.apache.lucene.queryparser.classic.QueryParser; 43 | import org.apache.lucene.search.IndexSearcher; 44 | import org.apache.lucene.search.Query; 45 | import org.apache.lucene.search.ScoreDoc; 46 | import org.apache.lucene.search.TopDocs; 47 | import org.apache.lucene.store.Directory; 48 | import org.apache.lucene.store.LockObtainFailedException; 49 | import org.apache.lucene.store.RAMDirectory; 50 | import org.apache.lucene.util.Version; 51 | import org.wltea.analyzer.lucene.IKAnalyzer; 52 | 53 | /** 54 | * 使用IKAnalyzer进行Lucene索引和查询的演示 55 | * 2012-3-2 56 | * 57 | * 以下是结合Lucene4.0 API的写法 58 | * 59 | */ 60 | public class LuceneIndexAndSearchDemo { 61 | 62 | /** 63 | * 模拟: 64 | * 创建一个单条记录的索引,并对其进行搜索 65 | * @param args 66 | */ 67 | public static void main(String[] args) { 68 | // Lucene Document的域名 69 | String fieldName = "text"; 70 | // 检索内容 71 | String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"; 72 | 73 | // 实例化IKAnalyzer分词器 74 | Analyzer analyzer = new IKAnalyzer(true); 75 | 76 | Directory directory = null; 77 | IndexWriter iwriter = null; 78 | IndexReader ireader = null; 79 | IndexSearcher isearcher = null; 80 | try { 81 | // 建立内存索引对象 82 | directory = new RAMDirectory(); 83 | 84 | // 配置IndexWriterConfig 85 | IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); 86 | iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); 87 | iwriter = new IndexWriter(directory, iwConfig); 88 | // 写入索引 89 | Document doc = new Document(); 90 | doc.add(new StringField("ID", "10000", Field.Store.YES)); 91 | doc.add(new TextField(fieldName, text, Field.Store.YES)); 92 | iwriter.addDocument(doc); 93 | iwriter.close(); 94 | 95 | // 搜索过程********************************** 96 | // 实例化搜索器 97 | ireader = DirectoryReader.open(directory); 98 | isearcher = new IndexSearcher(ireader); 99 | 100 | String keyword = "中文分词工具包"; 101 | // 使用QueryParser查询分析器构造Query对象 102 | QueryParser qp = new QueryParser(Version.LUCENE_43, fieldName, analyzer); 103 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 104 | Query query = qp.parse(keyword); 105 | System.out.println("Query = " + query); 106 | 107 | // 搜索相似度最高的5条记录 108 | TopDocs topDocs = isearcher.search(query, 5); 109 | System.out.println("命中:" + topDocs.totalHits); 110 | // 输出结果 111 | ScoreDoc[] scoreDocs = topDocs.scoreDocs; 112 | for (int i = 0; i < topDocs.totalHits; i++) { 113 | Document targetDoc = isearcher.doc(scoreDocs[i].doc); 114 | System.out.println("内容:" + targetDoc.toString()); 115 | } 116 | 117 | } catch (CorruptIndexException e) { 118 | e.printStackTrace(); 119 | } catch (LockObtainFailedException e) { 120 | e.printStackTrace(); 121 | } catch (IOException e) { 122 | e.printStackTrace(); 123 | } catch (ParseException e) { 124 | e.printStackTrace(); 125 | } finally { 126 | if (ireader != null) { 127 | try { 128 | ireader.close(); 129 | } catch (IOException e) { 130 | e.printStackTrace(); 131 | } 132 | } 133 | if (directory != null) { 134 | try { 135 | directory.close(); 136 | } catch (IOException e) { 137 | e.printStackTrace(); 138 | } 139 | } 140 | } 141 | } 142 | } 143 | --------------------------------------------------------------------------------