├── xultimate-ikanalyzer ├── src │ ├── test │ │ ├── resources │ │ │ ├── ext.dic │ │ │ ├── IKAnalyzer.cfg.xml │ │ │ ├── stopword.dic │ │ │ └── databases.properties │ │ └── java │ │ │ └── org │ │ │ ├── danielli │ │ │ └── xultimate │ │ │ │ └── ikanalyzer │ │ │ │ └── InitializerTest.java │ │ │ └── wltea │ │ │ └── analyzer │ │ │ └── sample │ │ │ ├── IKAnalzyerDemo.java │ │ │ └── LuceneIndexAndSearchDemo.java │ └── main │ │ ├── java │ │ └── org │ │ │ ├── danielli │ │ │ └── xultimate │ │ │ │ └── searching │ │ │ │ ├── biz │ │ │ │ ├── ExtKeywordBiz.java │ │ │ │ ├── StopKeywordBiz.java │ │ │ │ ├── SynonymKeywordBiz.java │ │ │ │ └── impl │ │ │ │ │ ├── MyBatisExtKeywordBiz.java │ │ │ │ │ ├── MyBatisStopKeywordBiz.java │ │ │ │ │ └── MyBatisSynonymKeywordBiz.java │ │ │ │ ├── service │ │ │ │ ├── ExtKeywordService.java │ │ │ │ ├── StopKeywordService.java │ │ │ │ ├── SynonymKeywordService.java │ │ │ │ └── impl │ │ │ │ │ ├── MyBatisExtKeywordService.java │ │ │ │ │ ├── MyBatisStopKeywordService.java │ │ │ │ │ └── MyBatisSynonymKeywordService.java │ │ │ │ ├── dao │ │ │ │ ├── ExtKeywordDAO.java │ │ │ │ ├── StopKeywordDAO.java │ │ │ │ └── SynonymKeywordDAO.java │ │ │ │ ├── po │ │ │ │ ├── ExtKeyword.java │ │ │ │ ├── StopKeyword.java │ │ │ │ └── SynonymKeyword.java │ │ │ │ ├── IKTokenizerFactory.java │ │ │ │ ├── ExtKeywordInitializer.java │ │ │ │ ├── StopKeywordInitializer.java │ │ │ │ ├── SolrSynonymDtabaseLoader.java │ │ │ │ └── SynonymFilterFactory.java │ │ │ └── wltea │ │ │ └── analyzer │ │ │ ├── core │ │ │ ├── ISegmenter.java │ │ │ ├── CharacterUtil.java │ │ │ ├── CJKSegmenter.java │ │ │ ├── IKArbitrator.java │ │ │ ├── IKSegmenter.java │ │ │ ├── QuickSortSet.java │ │ │ ├── CN_QuantifierSegmenter.java │ │ │ ├── LexemePath.java │ │ │ ├── Lexeme.java │ │ │ └── LetterSegmenter.java │ │ │ ├── cfg │ │ │ ├── Configuration.java │ │ │ └── DefaultConfig.java │ │ │ ├── lucene │ │ │ ├── IKAnalyzer.java │ │ │ └── IKTokenizer.java │ │ │ ├── dic │ │ │ ├── quantifier.dic │ │ │ ├── Hit.java │ │ │ └── DictSegment.java │ │ │ └── query │ │ │ └── SWMCQueryBuilder.java │ │ └── resources │ │ ├── mybatis │ │ ├── ExtKeywordDAO.xml │ │ ├── StopKeywordDAO.xml │ │ ├── SynonymKeywordDAO.xml │ │ └── mybatis-3-mapper.dtd │ │ ├── solr_db_init.sql │ │ ├── applicationContext-service-config.xml │ │ ├── applicationContext-service-generic.xml │ │ ├── applicationContext-service-crypto.xml │ │ ├── applicationContext-dao-base.xml │ │ └── applicationContext-dao-generic.xml └── pom.xml ├── .gitignore ├── xultimate-lucene ├── src │ ├── main │ │ └── java │ │ │ └── org │ │ │ └── danielli │ │ │ └── xultimate │ │ │ └── lucene │ │ │ └── util │ │ │ └── AnalyzerUtils.java │ └── test │ │ ├── java │ │ └── org │ │ │ └── danielli │ │ │ └── xultimate │ │ │ └── lucene │ │ │ ├── IdFilter.java │ │ │ ├── IdCollector.java │ │ │ ├── TestUtils.java │ │ │ ├── IdFieldComparator.java │ │ │ ├── NearRealtimeSearchTest2.java │ │ │ ├── NearRealtimeSearchTest1.java │ │ │ └── NearRealtimeSearchTest3.java │ │ └── resources │ │ └── org │ │ └── danielli │ │ └── xultimate │ │ └── lucene │ │ ├── applicationContext-service-lucene1.xml │ │ ├── applicationContext-service-lucene2.xml │ │ └── applicationContext-service-lucene3.xml └── pom.xml ├── xultimate-solr ├── pom.xml └── src │ └── test │ ├── resources │ └── applicationContext-service-solr-client.xml │ └── java │ └── org │ └── danielli │ └── xultimate │ └── solr │ └── SolrServerTest.java ├── pom.xml └── README.md /xultimate-ikanalyzer/src/test/resources/ext.dic: 
-------------------------------------------------------------------------------- 1 | 诛仙 2 | 诛仙2 3 | 梦幻诛仙 4 | 梦幻诛仙2 5 | 李天朋 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .settings 2 | .project 3 | .classpath 4 | 5 | *.class 6 | 7 | # Package Files # 8 | *.jar 9 | *.war 10 | *.ear 11 | 12 | target 13 | .springBeans -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/ExtKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz; 2 | 3 | import java.util.List; 4 | 5 | public interface ExtKeywordBiz { 6 | 7 | List<String> find(Integer pageNo, Integer pageSize); 8 | } 9 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/StopKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz; 2 | 3 | import java.util.List; 4 | 5 | public interface StopKeywordBiz { 6 | 7 | List<String> find(Integer pageNo, Integer pageSize); 8 | } 9 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/ExtKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service; 2 | 3 | import java.util.List; 4 | 5 | public interface ExtKeywordService { 6 | 7 | List<String> find(Integer pageNo, Integer pageSize); 8 | } 9 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/StopKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service; 2 | 3 | import java.util.List; 4 | 5 | public interface StopKeywordService { 6 | 7 | List<String> find(Integer pageNo, Integer pageSize); 8 | } 9 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/SynonymKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz; 2 | 3 | import java.util.List; 4 | 5 | import org.danielli.xultimate.searching.po.SynonymKeyword; 6 | 7 | public interface SynonymKeywordBiz { 8 | 9 | List<SynonymKeyword> find(Integer pageNo, Integer pageSize); 10 | } 11 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/dao/ExtKeywordDAO.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.dao; 2 | 3 | import java.util.List; 4 | 5 | import org.danielli.xultimate.orm.mybatis.MyBatisRepository; 6 | 7 | @MyBatisRepository 8 | public interface ExtKeywordDAO { 9 | 10 | List<String> find(Integer offset, Integer rows); 11 | } 12 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/dao/StopKeywordDAO.java: -------------------------------------------------------------------------------- 1 |
package org.danielli.xultimate.searching.dao; 2 | 3 | import java.util.List; 4 | 5 | import org.danielli.xultimate.orm.mybatis.MyBatisRepository; 6 | 7 | @MyBatisRepository 8 | public interface StopKeywordDAO { 9 | 10 | List<String> find(Integer offset, Integer rows); 11 | } 12 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/SynonymKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service; 2 | 3 | import java.util.List; 4 | 5 | import org.danielli.xultimate.searching.po.SynonymKeyword; 6 | 7 | public interface SynonymKeywordService { 8 | 9 | List<SynonymKeyword> find(Integer pageNo, Integer pageSize); 10 | } 11 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/resources/IKAnalyzer.cfg.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd"> 3 | <properties> 4 | <comment>IK Analyzer extension configuration</comment> 5 | <!-- configure your own extension dictionaries here --> 6 | <entry key="ext_dict">ext.dic;</entry> 7 | 8 | <!-- configure your own extension stopword dictionaries here --> 9 | <entry key="ext_stopwords">stopword.dic;</entry> 10 | </properties> 11 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/dao/SynonymKeywordDAO.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.dao; 2 | 3 | import java.util.List; 4 | 5 | import org.danielli.xultimate.orm.mybatis.MyBatisRepository; 6 | import org.danielli.xultimate.searching.po.SynonymKeyword; 7 | 8 | @MyBatisRepository 9 | public interface SynonymKeywordDAO { 10 | 11 | List<SynonymKeyword> find(Integer offset, Integer rows); 12 | } 13 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/resources/stopword.dic: -------------------------------------------------------------------------------- 1 | a 2 | an 3 | and 4 | are 5 | as 6 | at 7 | be 8 | but 9 | by 10 | for 11 | if 12 | in 13 | into 14 | is 15 | it 16 | no 17 | not 18 | of 19 | on 20 | or 21 | such 22 | that 23 | the 24 | their 25 | then 26 | there 27 | these 28 | they 29 | this 30 | to 31 | was 32 | will 33 | with 34 | 也 35 | 了 36 | 仍 37 | 从 38 | 以 39 | 使 40 | 则 41 | 却 42 | 又 43 | 及 44 | 对 45 | 就 46 | 并 47 | 很 48 | 或 49 | 把 50 | 是 51 | 的 52 | 着 53 | 给 54 | 而 55 | 被 56 | 让 57 | 在 58 | 还 59 | 比 60 | 等 61 | 当 62 | 与 63 | 于 64 | 但 -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/mybatis/ExtKeywordDAO.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/mybatis/StopKeywordDAO.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/impl/MyBatisExtKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.ExtKeywordBiz; 8 | import org.danielli.xultimate.searching.service.ExtKeywordService; 9 | import
org.springframework.stereotype.Service; 10 | 11 | @Service("myBatisExtKeywordService") 12 | public class MyBatisExtKeywordService implements ExtKeywordService { 13 | 14 | @Resource(name = "myBatisExtKeywordBiz") 15 | private ExtKeywordBiz extKeywordBiz; 16 | 17 | @Override 18 | public List<String> find(Integer pageNo, Integer pageSize) { 19 | return extKeywordBiz.find(pageNo, pageSize); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/impl/MyBatisStopKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.StopKeywordBiz; 8 | import org.danielli.xultimate.searching.service.StopKeywordService; 9 | import org.springframework.stereotype.Service; 10 | 11 | @Service("myBatisStopKeywordService") 12 | public class MyBatisStopKeywordService implements StopKeywordService { 13 | 14 | @Resource(name = "myBatisStopKeywordBiz") 15 | private StopKeywordBiz stopKeywordBiz; 16 | 17 | @Override 18 | public List<String> find(Integer pageNo, Integer pageSize) { 19 | return stopKeywordBiz.find(pageNo, pageSize); 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/resources/databases.properties: -------------------------------------------------------------------------------- 1 | #------------ MySQL ------------ 2 | datasource.solrDb.jdbc.driver=com.mysql.jdbc.Driver 3 | datasource.solrDb.jdbc.url=jdbc:mysql://127.0.0.1:3306/orm_db?useUnicode=true&characterEncoding=UTF-8&zeroDateTimeBehavior=convertToNull 4 | datasource.solrDb.jdbc.username=64c15dd4a9f5523d71dde127e8e8db96 5 | datasource.solrDb.jdbc.password=64c15dd4a9f5523d71dde127e8e8db96 6 | datasource.solrDb.pool.maxActive=20 7 | datasource.solrDb.pool.minIdle=10 8 | 9 | #------------ Oracle ------------ 10 | #datasource.solrDb.jdbc.driver=oracle.jdbc.driver.OracleDriver 11 | #datasource.solrDb.jdbc.url=jdbc:oracle:thin:@localhost:1521:orm_db 12 | #datasource.solrDb.jdbc.username=64c15dd4a9f5523d71dde127e8e8db96 13 | #datasource.solrDb.jdbc.password=64c15dd4a9f5523d71dde127e8e8db96 14 | #datasource.solrDb.pool.maxActive=20 15 | #datasource.solrDb.pool.minIdle=10 -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/service/impl/MyBatisSynonymKeywordService.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.service.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.SynonymKeywordBiz; 8 | import org.danielli.xultimate.searching.po.SynonymKeyword; 9 | import org.danielli.xultimate.searching.service.SynonymKeywordService; 10 | import org.springframework.stereotype.Service; 11 | 12 | @Service("myBatisSynonymKeywordService") 13 | public class MyBatisSynonymKeywordService implements SynonymKeywordService { 14 | 15 | @Resource(name = "myBatisSynonymKeywordBiz") 16 | private SynonymKeywordBiz synonymKeywordBiz; 17 | 18 | @Override 19 | public List<SynonymKeyword> find(Integer pageNo, Integer pageSize) { 20 | return synonymKeywordBiz.find(pageNo, pageSize); 21 | } 22 | 23 | } 24 |
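The MyBatis mapper XMLs in this module (ExtKeywordDAO.xml and StopKeywordDAO.xml above, SynonymKeywordDAO.xml below) lost their markup in this dump; only empty line markers survive. As a rough guide to the mapping they perform, here is a hedged annotation-style sketch of the synonym mapper, assuming the solr_synonym_keywords table from solr_db_init.sql and MySQL LIMIT paging; the interface name, column aliases, and @Param names are illustrative assumptions, not recovered code:

```java
package org.danielli.xultimate.searching.dao;

import java.util.List;

import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;
import org.danielli.xultimate.searching.po.SynonymKeyword;

// Hypothetical annotation-based equivalent of the stripped SynonymKeywordDAO.xml body.
public interface SynonymKeywordDAOSketch {

	// Maps the snake_case columns from solr_db_init.sql onto the SynonymKeyword bean properties.
	@Select("SELECT id, keyword, synonym_keyword AS synonymKeyword, "
			+ "create_time AS createTime, update_time AS updateTime "
			+ "FROM solr_synonym_keywords LIMIT #{offset}, #{rows}")
	List<SynonymKeyword> find(@Param("offset") Integer offset, @Param("rows") Integer rows);
}
```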
-------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/mybatis/SynonymKeywordDAO.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /xultimate-lucene/src/main/java/org/danielli/xultimate/lucene/util/AnalyzerUtils.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene.util; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import org.apache.lucene.analysis.Analyzer; 9 | import org.apache.lucene.analysis.TokenStream; 10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 | 12 | public class AnalyzerUtils { 13 | 14 | public static List<String> tokenStream(Analyzer analyzer, Reader analyzerReader) throws IOException { 15 | List<String> result = new ArrayList<String>(); 16 | TokenStream tokenStream = analyzer.tokenStream(null, analyzerReader); 17 | CharTermAttribute attribute = tokenStream.getAttribute(CharTermAttribute.class); 18 | tokenStream.reset(); // required before incrementToken() as of Lucene 4 19 | while (tokenStream.incrementToken()) { 20 | result.add(attribute.toString()); 21 | } 22 | tokenStream.end(); 23 | tokenStream.close(); 24 | return result; 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/impl/MyBatisExtKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.ExtKeywordBiz; 8 | import org.danielli.xultimate.searching.dao.ExtKeywordDAO; 9 | import org.danielli.xultimate.util.math.NumberUtils; 10 | import org.springframework.stereotype.Service; 11 | 12 | @Service("myBatisExtKeywordBiz") 13 | public class MyBatisExtKeywordBiz implements ExtKeywordBiz { 14 | 15 | @Resource(name = "extKeywordDAO") 16 | private ExtKeywordDAO extKeywordDAO; 17 | 18 | @Override 19 | public List<String> find(Integer pageNo, Integer pageSize) { 20 | if (!NumberUtils.isPositiveNumber(pageNo)) { 21 | pageNo = 1; 22 | } 23 | Integer offset = (pageNo - 1) * pageSize; 24 | return extKeywordDAO.find(offset, pageSize); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/impl/MyBatisStopKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.StopKeywordBiz; 8 | import org.danielli.xultimate.searching.dao.StopKeywordDAO; 9 | import org.danielli.xultimate.util.math.NumberUtils; 10 | import org.springframework.stereotype.Service; 11 | 12 | @Service("myBatisStopKeywordBiz") 13 | public class MyBatisStopKeywordBiz implements StopKeywordBiz { 14 | 15 | @Resource(name = "stopKeywordDAO") 16 | private StopKeywordDAO stopKeywordDAO; 17 | 18 | @Override 19 | public List<String> find(Integer pageNo, Integer pageSize) { 20 | if (!NumberUtils.isPositiveNumber(pageNo)) { 21 | pageNo = 1; 22 | } 23 | Integer offset = (pageNo - 1) * pageSize; 24 | return stopKeywordDAO.find(offset, pageSize); 25 | } 26 |
27 | } 28 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/po/ExtKeyword.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.po; 2 | 3 | import java.util.Date; 4 | 5 | public class ExtKeyword { 6 | 7 | private Long id; 8 | 9 | private String keyword; 10 | 11 | private Date createTime; 12 | 13 | private Date updateTime; 14 | 15 | public Long getId() { 16 | return id; 17 | } 18 | 19 | public void setId(Long id) { 20 | this.id = id; 21 | } 22 | 23 | public String getKeyword() { 24 | return keyword; 25 | } 26 | 27 | public void setKeyword(String keyword) { 28 | this.keyword = keyword; 29 | } 30 | 31 | public Date getCreateTime() { 32 | return createTime; 33 | } 34 | 35 | public void setCreateTime(Date createTime) { 36 | this.createTime = createTime; 37 | } 38 | 39 | public Date getUpdateTime() { 40 | return updateTime; 41 | } 42 | 43 | public void setUpdateTime(Date updateTime) { 44 | this.updateTime = updateTime; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/po/StopKeyword.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.po; 2 | 3 | import java.util.Date; 4 | 5 | public class StopKeyword { 6 | 7 | private Long id; 8 | 9 | private String keyword; 10 | 11 | private Date createTime; 12 | 13 | private Date updateTime; 14 | 15 | public Long getId() { 16 | return id; 17 | } 18 | 19 | public void setId(Long id) { 20 | this.id = id; 21 | } 22 | 23 | public String getKeyword() { 24 | return keyword; 25 | } 26 | 27 | public void setKeyword(String keyword) { 28 | this.keyword = keyword; 29 | } 30 | 31 | public Date getCreateTime() { 32 | return createTime; 33 | } 34 | 35 | public void setCreateTime(Date createTime) { 36 | this.createTime = createTime; 37 | } 38 | 39 | public Date getUpdateTime() { 40 | return updateTime; 41 | } 42 | 43 | public void setUpdateTime(Date updateTime) { 44 | this.updateTime = updateTime; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/biz/impl/MyBatisSynonymKeywordBiz.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.biz.impl; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.danielli.xultimate.searching.biz.SynonymKeywordBiz; 8 | import org.danielli.xultimate.searching.dao.SynonymKeywordDAO; 9 | import org.danielli.xultimate.searching.po.SynonymKeyword; 10 | import org.danielli.xultimate.util.math.NumberUtils; 11 | import org.springframework.stereotype.Service; 12 | 13 | @Service("myBatisSynonymKeywordBiz") 14 | public class MyBatisSynonymKeywordBiz implements SynonymKeywordBiz { 15 | 16 | @Resource(name = "synonymKeywordDAO") 17 | private SynonymKeywordDAO synonymKeywordDAO; 18 | 19 | @Override 20 | public List<SynonymKeyword> find(Integer pageNo, Integer pageSize) { 21 | if (!NumberUtils.isPositiveNumber(pageNo)) { 22 | pageNo = 1; 23 | } 24 | Integer offset = (pageNo - 1) * pageSize; 25 | return synonymKeywordDAO.find(offset, pageSize); 26 | } 27 | 28 | } 29 | --------------------------------------------------------------------------------
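The IdFilter test helper that follows restricts a Lucene search to a fixed set of document ids by marking the matching documents in a per-segment OpenBitSet. A minimal usage sketch against the Lucene 4.x API; the class name, the in-memory index setup, and the Version.LUCENE_44 constant are assumptions for illustration, not repo code:

```java
package org.danielli.xultimate.lucene;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class IdFilterUsageSketch {

	public static void main(String[] args) throws Exception {
		// Index five documents whose "id" field matches what IdFilter looks up:
		// an untokenized StringField, exactly as TestUtils.createDocument does.
		Directory directory = new RAMDirectory();
		IndexWriter writer = new IndexWriter(directory,
				new IndexWriterConfig(Version.LUCENE_44, new StandardAnalyzer(Version.LUCENE_44)));
		for (long id = 0; id < 5; id++) {
			Document document = new Document();
			document.add(new StringField("id", String.valueOf(id), Store.YES));
			writer.addDocument(document);
		}
		writer.close();

		// Restrict a match-all query to the documents whose id is 1 or 3.
		DirectoryReader reader = DirectoryReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), new IdFilter(new Long[] { 1L, 3L }), 10);
		for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
			System.out.println(searcher.doc(scoreDoc.doc).get("id")); // prints 1 and 3
		}
		reader.close();
	}
}
```

TestUtils.getIdList later in this section combines the same filter with the IdCollector to pull back matching ids in bulk.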
/xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/IdFilter.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.index.AtomicReaderContext; 6 | import org.apache.lucene.index.DocsEnum; 7 | import org.apache.lucene.index.Term; 8 | import org.apache.lucene.search.DocIdSet; 9 | import org.apache.lucene.search.DocIdSetIterator; 10 | import org.apache.lucene.search.Filter; 11 | import org.apache.lucene.util.Bits; 12 | import org.apache.lucene.util.OpenBitSet; 13 | 14 | public class IdFilter extends Filter { 15 | 16 | private Long[] existIds; 17 | 18 | public IdFilter(Long[] existIds) { 19 | this.existIds = existIds; 20 | } 21 | 22 | @Override 23 | public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException { 24 | OpenBitSet bits = new OpenBitSet(context.reader().maxDoc()); 25 | for (Long id : existIds) { 26 | DocsEnum docsEnum = context.reader().termDocsEnum(new Term("id", String.valueOf(id))); 27 | // nextDoc() signals exhaustion with NO_MORE_DOCS (Integer.MAX_VALUE), never -1. 28 | if (docsEnum != null && docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { 29 | bits.set(docsEnum.docID()); 30 | } 31 | } 32 | return bits; 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/IKTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching; 2 | 3 | import java.io.Reader; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.util.TokenizerFactory; 8 | import org.apache.lucene.util.AttributeSource.AttributeFactory; 9 | import org.wltea.analyzer.lucene.IKTokenizer; 10 | 11 | 12 | public class IKTokenizerFactory extends TokenizerFactory { 13 | 14 | public static final boolean DEFAULT_USE_SMART = false; 15 | 16 | private boolean useSmart; 17 | 18 | public boolean useSmart() { 19 | return useSmart; 20 | } 21 | 22 | public void setUseSmart(boolean useSmart) { 23 | this.useSmart = useSmart; 24 | } 25 | 26 | public IKTokenizerFactory(Map<String, String> args) { 27 | super(args); 28 | assureMatchVersion(); 29 | useSmart = getBoolean(args, "useSmart", DEFAULT_USE_SMART); 30 | } 31 | 32 | @Override 33 | public Tokenizer create(AttributeFactory factory, Reader input) { 34 | Tokenizer _IKTokenizer = new IKTokenizer(input, this.useSmart()); 35 | return _IKTokenizer; 36 | } 37 | 38 | } 39 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/po/SynonymKeyword.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching.po; 2 | 3 | import java.util.Date; 4 | 5 | public class SynonymKeyword { 6 | 7 | private Long id; 8 | 9 | private String keyword; 10 | 11 | private String synonymKeyword; 12 | 13 | private Date createTime; 14 | 15 | private Date updateTime; 16 | 17 | public Long getId() { 18 | return id; 19 | } 20 | 21 | public void setId(Long id) { 22 | this.id = id; 23 | } 24 | 25 | public String getKeyword() { 26 | return keyword; 27 | } 28 | 29 | public void setKeyword(String keyword) { 30 | this.keyword = keyword; 31 | } 32 | 33 | public String getSynonymKeyword() { 34 | return synonymKeyword; 35 | } 36 | 37 | public void setSynonymKeyword(String synonymKeyword) { 38 | this.synonymKeyword = synonymKeyword; 39 | } 40 | 41 | public Date getCreateTime() { 42 | return
createTime; 43 | } 44 | 45 | public void setCreateTime(Date createTime) { 46 | this.createTime = createTime; 47 | } 48 | 49 | public Date getUpdateTime() { 50 | return updateTime; 51 | } 52 | 53 | public void setUpdateTime(Date updateTime) { 54 | this.updateTime = updateTime; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /xultimate-lucene/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.danielli.xultimate 8 | xultimate-searching 9 | 1.0.0-SNAPSHOT 10 | 11 | 12 | xultimate-lucene 13 | 14 | The X-Ultimate Searching Lucene 15 | The X-Ultimate Searching lucene project. 16 | https://github.com/daniellitoc/xultimate-searching/tree/master/xultimate-lucene 17 | 18 | 19 | 20 | ${project.groupId} 21 | xultimate-ikanalyzer 22 | ${project.version} 23 | 24 | 25 | junit 26 | junit 27 | 28 | 29 | org.springframework 30 | spring-test 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/solr_db_init.sql: -------------------------------------------------------------------------------- 1 | create table IF NOT EXISTS `solr_ext_keywords` ( 2 | `id` bigint NOT NULL, 3 | `keyword` varchar(100) NOT NULL, 4 | `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 5 | `update_time` timestamp NOT NULL, 6 | PRIMARY KEY (`id`) 7 | ) ENGINE = MyISAM; 8 | 9 | insert IGNORE into `solr_ext_keywords` values (1, '李天棚', '1990-04-17 00:00:01', '1990-04-17 00:00:01'); 10 | 11 | create table IF NOT EXISTS `solr_stop_keywords` ( 12 | `id` bigint NOT NULL, 13 | `keyword` varchar(100) NOT NULL, 14 | `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 15 | `update_time` timestamp NOT NULL, 16 | PRIMARY KEY (`id`) 17 | ) ENGINE = MyISAM; 18 | 19 | insert IGNORE into `solr_stop_keywords` values (1, '的', '1990-04-17 00:00:01', '1990-04-17 00:00:01'); 20 | 21 | create table IF NOT EXISTS `solr_synonym_keywords` ( 22 | `id` bigint(20) NOT NULL, 23 | `keyword` varchar(100) NOT NULL, 24 | `synonym_keyword` varchar(100) NOT NULL, 25 | `create_time` timestamp NOT NULL, 26 | `update_time` timestamp NOT NULL, 27 | PRIMARY KEY (`id`) 28 | ) ENGINE = MyISAM; 29 | 30 | insert IGNORE into `solr_synonym_keywords` values (1, 'Daniel Li', '李天棚', '1990-04-17 00:00:01', '1990-04-17 00:00:01'); 31 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/ExtKeywordInitializer.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.PostConstruct; 6 | import javax.annotation.Resource; 7 | 8 | import org.danielli.xultimate.searching.service.ExtKeywordService; 9 | import org.danielli.xultimate.util.collections.CollectionUtils; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.context.annotation.Lazy; 13 | import org.springframework.stereotype.Service; 14 | import org.wltea.analyzer.cfg.DefaultConfig; 15 | import org.wltea.analyzer.dic.Dictionary; 16 | 17 | @Service("extKeywordInitializer") 18 | @Lazy(false) 19 | public class ExtKeywordInitializer { 20 | 21 | private static final Logger LOGGER = LoggerFactory.getLogger(ExtKeywordInitializer.class); 22 | 23 | @Resource(name = "myBatisExtKeywordService") 24 | private ExtKeywordService extKeywordService; 25 | 
26 | @PostConstruct 27 | public void init() { 28 | LOGGER.info("Loading the extension word dictionary from the database"); 29 | Dictionary.initial(DefaultConfig.getInstance()); 30 | for (int pageNo = 1; ; pageNo++) { 31 | List<String> extKeywordList = extKeywordService.find(pageNo, 10000); 32 | if (CollectionUtils.isEmpty(extKeywordList)) { 33 | break; 34 | } 35 | Dictionary.getSingleton().addWords(extKeywordList); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/StopKeywordInitializer.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching; 2 | 3 | import java.util.List; 4 | 5 | import javax.annotation.PostConstruct; 6 | import javax.annotation.Resource; 7 | 8 | import org.danielli.xultimate.searching.service.StopKeywordService; 9 | import org.danielli.xultimate.util.collections.CollectionUtils; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | import org.springframework.context.annotation.Lazy; 13 | import org.springframework.stereotype.Service; 14 | import org.wltea.analyzer.cfg.DefaultConfig; 15 | import org.wltea.analyzer.dic.Dictionary; 16 | 17 | @Service("stopKeywordInitializer") 18 | @Lazy(false) 19 | public class StopKeywordInitializer { 20 | 21 | private static final Logger LOGGER = LoggerFactory.getLogger(StopKeywordInitializer.class); 22 | 23 | @Resource(name = "myBatisStopKeywordService") 24 | private StopKeywordService stopKeywordService; 25 | 26 | @PostConstruct 27 | public void init() { 28 | LOGGER.info("Loading the stopword dictionary from the database"); 29 | Dictionary.initial(DefaultConfig.getInstance()); 30 | for (int pageNo = 1; ; pageNo++) { 31 | List<String> stopKeywordList = stopKeywordService.find(pageNo, 10000); 32 | if (CollectionUtils.isEmpty(stopKeywordList)) { 33 | break; 34 | } 35 | Dictionary.getSingleton().addStopWords(stopKeywordList); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/IdCollector.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.LinkedHashSet; 6 | import java.util.List; 7 | import java.util.Set; 8 | 9 | import org.apache.lucene.index.AtomicReaderContext; 10 | import org.apache.lucene.search.Collector; 11 | import org.apache.lucene.search.FieldCache; 12 | import org.apache.lucene.search.FieldCache.Longs; 13 | import org.apache.lucene.search.Scorer; 14 | 15 | public class IdCollector extends Collector { 16 | 17 | @SuppressWarnings("unused") 18 | private Scorer scorer; 19 | private Longs currentValues; 20 | private Set<Long> result = new LinkedHashSet<Long>(); 21 | 22 | @Override 23 | public void setScorer(Scorer scorer) throws IOException { 24 | this.scorer = scorer; 25 | } 26 | 27 | @Override 28 | public void setNextReader(AtomicReaderContext context) throws IOException { 29 | this.currentValues = FieldCache.DEFAULT.getLongs(context.reader(), "id", false); 30 | } 31 | 32 | @Override 33 | public void collect(int doc) throws IOException { 34 | Long userId = this.currentValues.get(doc); 35 | result.add(userId); 36 | } 37 | 38 | public Integer getMatchUserCount() { 39 | return result.size(); 40 | } 41 | 42 | public List<Long> getResult() { 43 | return new ArrayList<Long>(result); 44 | } 45 | 46 | @Override 47 | public boolean acceptsDocsOutOfOrder() { 48 |
return false; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/ISegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | 28 | /** 29 | * 30 | * 子分词器接口 31 | */ 32 | interface ISegmenter { 33 | 34 | /** 35 | * 从分析器读取下一个可能分解的词元对象 36 | * @param context 分词算法上下文 37 | */ 38 | void analyze(AnalyzeContext context); 39 | 40 | 41 | /** 42 | * 重置子分析器状态 43 | */ 44 | void reset(); 45 | 46 | } 47 | -------------------------------------------------------------------------------- /xultimate-solr/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | org.danielli.xultimate 8 | xultimate-searching 9 | 1.0.0-SNAPSHOT 10 | 11 | 12 | xultimate-solr 13 | 14 | The X-Ultimate Toolkit Solr 15 | The X-Ultimate Searching solr project. 16 | https://github.com/daniellitoc/xultimate-searching/tree/master/xultimate-solr 17 | 18 | 19 | 20 | ${project.groupId} 21 | xultimate-core 22 | ${project.version} 23 | 24 | 25 | ${project.groupId} 26 | xultimate-context 27 | ${project.version} 28 | 29 | 30 | ${project.groupId} 31 | xultimate-web 32 | ${project.version} 33 | 34 | 35 | org.apache.solr 36 | solr-solrj 37 | 38 | 39 | junit 40 | junit 41 | 42 | 43 | org.springframework 44 | spring-test 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/cfg/Configuration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.cfg; 26 | 27 | import java.util.List; 28 | 29 | /** 30 | * 31 | * 配置管理类接口 32 | * 33 | */ 34 | public interface Configuration { 35 | 36 | 37 | 38 | /** 39 | * 返回useSmart标志位 40 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 41 | * @return useSmart 42 | */ 43 | public boolean useSmart(); 44 | 45 | /** 46 | * 设置useSmart标志位 47 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 48 | * @param useSmart 49 | */ 50 | public void setUseSmart(boolean useSmart); 51 | 52 | 53 | /** 54 | * 获取主词典路径 55 | * 56 | * @return String 主词典路径 57 | */ 58 | public String getMainDictionary(); 59 | 60 | /** 61 | * 获取量词词典路径 62 | * @return String 量词词典路径 63 | */ 64 | public String getQuantifierDicionary(); 65 | 66 | /** 67 | * 获取扩展字典配置路径 68 | * @return List 相对类加载器的路径 69 | */ 70 | public List getExtDictionarys(); 71 | 72 | 73 | /** 74 | * 获取扩展停止词典配置路径 75 | * @return List 相对类加载器的路径 76 | */ 77 | public List getExtStopWordDictionarys(); 78 | 79 | } 80 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.lucene; 26 | 27 | import java.io.Reader; 28 | 29 | import org.apache.lucene.analysis.Analyzer; 30 | import org.apache.lucene.analysis.Tokenizer; 31 | 32 | /** 33 | * IK分词器,Lucene Analyzer接口实现 34 | * 兼容Lucene 4.0版本 35 | */ 36 | public final class IKAnalyzer extends Analyzer{ 37 | 38 | private boolean useSmart; 39 | 40 | public boolean useSmart() { 41 | return useSmart; 42 | } 43 | 44 | public void setUseSmart(boolean useSmart) { 45 | this.useSmart = useSmart; 46 | } 47 | 48 | /** 49 | * IK分词器Lucene Analyzer接口实现类 50 | * 51 | * 默认细粒度切分算法 52 | */ 53 | public IKAnalyzer(){ 54 | this(false); 55 | } 56 | 57 | /** 58 | * IK分词器Lucene Analyzer接口实现类 59 | * 60 | * @param useSmart 当为true时,分词器进行智能切分 61 | */ 62 | public IKAnalyzer(boolean useSmart){ 63 | super(); 64 | this.useSmart = useSmart; 65 | } 66 | 67 | /** 68 | * 重载Analyzer接口,构造分词组件 69 | */ 70 | @Override 71 | protected TokenStreamComponents createComponents(String fieldName, final Reader in) { 72 | Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart()); 73 | return new TokenStreamComponents(_IKTokenizer); 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/java/org/danielli/xultimate/ikanalyzer/InitializerTest.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.ikanalyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | 6 | import org.apache.lucene.analysis.Analyzer; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | import org.junit.Test; 12 | import org.junit.runner.RunWith; 13 | import org.springframework.test.context.ContextConfiguration; 14 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 15 | import org.wltea.analyzer.lucene.IKAnalyzer; 16 | 17 | 18 | @RunWith(SpringJUnit4ClassRunner.class) 19 | @ContextConfiguration(locations = { "classpath:applicationContext-service-config.xml", "classpath:applicationContext-service-crypto.xml", "classpath:applicationContext-dao-base.xml", "classpath:applicationContext-dao-generic.xml", "classpath:applicationContext-service-generic.xml" }) 20 | public class InitializerTest { 21 | 22 | @Test 23 | public void test() { 24 | //构建IK分词器,使用smart分词模式 25 | Analyzer analyzer = new IKAnalyzer(true); 26 | 27 | //获取Lucene的TokenStream对象 28 | TokenStream ts = null; 29 | try { 30 | ts = analyzer.tokenStream("myfield", new StringReader("李天棚的测试")); 31 | //获取词元位置属性 32 | OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); 33 | //获取词元文本属性 34 | CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); 35 | //获取词元文本属性 36 | TypeAttribute type = ts.addAttribute(TypeAttribute.class); 37 | 38 | 39 | //重置TokenStream(重置StringReader) 40 | ts.reset(); 41 | //迭代获取分词结果 42 | while (ts.incrementToken()) { 43 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); 44 | } 45 | //关闭TokenStream(关闭StringReader) 46 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset. 
47 | 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } finally { 51 | //释放TokenStream的所有资源 52 | if(ts != null){ 53 | try { 54 | ts.close(); 55 | } catch (IOException e) { 56 | e.printStackTrace(); 57 | } 58 | } 59 | } 60 | } 61 | 62 | } 63 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | 6 | org.danielli.xultimate 7 | xultimate-searching 8 | 1.0.0-SNAPSHOT 9 | 10 | 11 | xultimate-ikanalyzer 12 | 13 | The X-Ultimate Searching IKAnalyzer 14 | The X-Ultimate Searching IKAnalyzer project. 15 | https://github.com/daniellitoc/xultimate-searching/tree/master/xultimate-ikanalyzer 16 | 17 | 18 | 19 | ${project.groupId} 20 | xultimate-core 21 | ${project.version} 22 | 23 | 24 | ${project.groupId} 25 | xultimate-context 26 | ${project.version} 27 | 28 | 29 | ${project.groupId} 30 | xultimate-jdbc 31 | ${project.version} 32 | 33 | 34 | ${project.groupId} 35 | xultimate-web 36 | ${project.version} 37 | 38 | 39 | ${project.groupId} 40 | xultimate-context-support 41 | ${project.version} 42 | 43 | 44 | ${project.groupId} 45 | xultimate-mybatis 46 | ${project.version} 47 | 48 | 49 | org.apache.lucene 50 | lucene-core 51 | 52 | 53 | org.apache.lucene 54 | lucene-queryparser 55 | 56 | 57 | org.apache.lucene 58 | lucene-analyzers-common 59 | 60 | 61 | junit 62 | junit 63 | 64 | 65 | org.springframework 66 | spring-test 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/applicationContext-service-config.xml: -------------------------------------------------------------------------------- 1 | 2 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/SolrSynonymDtabaseLoader.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.ByteArrayInputStream; 6 | import java.io.ByteArrayOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.io.OutputStreamWriter; 10 | import java.io.Writer; 11 | import java.text.ParseException; 12 | import java.util.List; 13 | import java.util.Scanner; 14 | 15 | import org.apache.lucene.analysis.synonym.SolrSynonymParser; 16 | import org.danielli.xultimate.context.util.ApplicationContextUtils; 17 | import org.danielli.xultimate.context.util.BeanFactoryContext; 18 | import org.danielli.xultimate.searching.po.SynonymKeyword; 19 | import org.danielli.xultimate.searching.service.SynonymKeywordService; 20 | import org.danielli.xultimate.util.collections.CollectionUtils; 21 | import org.slf4j.Logger; 22 | import org.slf4j.LoggerFactory; 23 | 24 | import com.alibaba.fastjson.util.IOUtils; 25 | 26 | public class SolrSynonymDtabaseLoader { 27 | 28 | private static final Logger LOGGER = LoggerFactory.getLogger(SolrSynonymDtabaseLoader.class); 29 | 30 | public void handle(SolrSynonymParser synonymParser) throws ParseException, IOException { 31 | SynonymKeywordService synonymKeywordService = ApplicationContextUtils.getBean(BeanFactoryContext.currentApplicationContext(), SynonymKeywordService.class); 32 | LOGGER.info("开始加载相近词词库从数据库"); 33 | ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); 34 
| Writer writer = new BufferedWriter(new OutputStreamWriter(outputStream)); 35 | try { 36 | for (int pageNo = 1; ; pageNo++) { 37 | List synonymKeywordList = synonymKeywordService.find(pageNo, 10000); 38 | if (CollectionUtils.isEmpty(synonymKeywordList)) { 39 | break; 40 | } 41 | for (SynonymKeyword synonymKeyword : synonymKeywordList) { 42 | writer.write(synonymKeyword.getKeyword()); 43 | writer.write("=>"); 44 | writer.write(synonymKeyword.getSynonymKeyword()); 45 | writer.write("\n"); 46 | } 47 | } 48 | writer.flush(); 49 | Scanner scanner = new Scanner(new ByteArrayInputStream(outputStream.toByteArray())); 50 | while (scanner.hasNextLine()) { 51 | System.out.println(scanner.nextLine()); 52 | } 53 | scanner.close(); 54 | synonymParser.add(new BufferedReader(new InputStreamReader(new ByteArrayInputStream(outputStream.toByteArray())))); 55 | } finally { 56 | IOUtils.close(writer); 57 | } 58 | 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/applicationContext-service-generic.xml: -------------------------------------------------------------------------------- 1 | 2 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/dic/quantifier.dic: -------------------------------------------------------------------------------- 1 | 丈 2 | 下 3 | 世 4 | 世纪 5 | 两 6 | 个 7 | 中 8 | 串 9 | 亩 10 | 人 11 | 介 12 | 付 13 | 代 14 | 件 15 | 任 16 | 份 17 | 伏 18 | 伙 19 | 位 20 | 位数 21 | 例 22 | 倍 23 | 像素 24 | 元 25 | 克 26 | 克拉 27 | 公亩 28 | 公克 29 | 公分 30 | 公升 31 | 公尺 32 | 公担 33 | 公斤 34 | 公里 35 | 公顷 36 | 具 37 | 册 38 | 出 39 | 刀 40 | 分 41 | 分钟 42 | 分米 43 | 划 44 | 列 45 | 则 46 | 刻 47 | 剂 48 | 剑 49 | 副 50 | 加仑 51 | 勺 52 | 包 53 | 匙 54 | 匹 55 | 区 56 | 千克 57 | 千米 58 | 升 59 | 卷 60 | 厅 61 | 厘 62 | 厘米 63 | 双 64 | 发 65 | 口 66 | 句 67 | 只 68 | 台 69 | 叶 70 | 号 71 | 名 72 | 吨 73 | 听 74 | 员 75 | 周 76 | 周年 77 | 品 78 | 回 79 | 团 80 | 圆 81 | 圈 82 | 地 83 | 场 84 | 块 85 | 坪 86 | 堆 87 | 声 88 | 壶 89 | 处 90 | 夜 91 | 大 92 | 天 93 | 头 94 | 套 95 | 女 96 | 孔 97 | 字 98 | 宗 99 | 室 100 | 家 101 | 寸 102 | 对 103 | 封 104 | 尊 105 | 小时 106 | 尺 107 | 尾 108 | 局 109 | 层 110 | 届 111 | 岁 112 | 师 113 | 帧 114 | 幅 115 | 幕 116 | 幢 117 | 平方 118 | 平方公尺 119 | 平方公里 120 | 平方分米 121 | 平方厘米 122 | 平方码 123 | 平方米 124 | 平方英寸 125 | 平方英尺 126 | 平方英里 127 | 平米 128 | 年 129 | 年代 130 | 年级 131 | 度 132 | 座 133 | 式 134 | 引 135 | 张 136 | 成 137 | 战 138 | 截 139 | 户 140 | 房 141 | 所 142 | 扇 143 | 手 144 | 打 145 | 批 146 | 把 147 | 折 148 | 担 149 | 拍 150 | 招 151 | 拨 152 | 拳 153 | 指 154 | 掌 155 | 排 156 | 撮 157 | 支 158 | 文 159 | 斗 160 | 斤 161 | 方 162 | 族 163 | 日 164 | 时 165 | 曲 166 | 月 167 | 月份 168 | 期 169 | 本 170 | 朵 171 | 村 172 | 束 173 | 条 174 | 来 175 | 杯 176 | 枚 177 | 枝 178 | 枪 179 | 架 180 | 柄 181 | 柜 182 | 栋 183 | 栏 184 | 株 185 | 样 186 | 根 187 | 格 188 | 案 189 | 桌 190 | 档 191 | 桩 192 | 桶 193 | 梯 194 | 棵 195 | 楼 196 | 次 197 | 款 198 | 步 199 | 段 200 | 毛 201 | 毫 202 | 毫升 203 | 毫米 204 | 毫克 205 | 池 206 | 洲 207 | 派 208 | 海里 209 | 滴 210 | 炮 211 | 点 212 | 点钟 213 | 片 214 | 版 215 | 环 216 | 班 217 | 瓣 218 | 瓶 219 | 生 220 | 男 221 | 画 222 | 界 223 | 盆 224 | 盎司 225 | 盏 226 | 盒 227 | 盘 228 | 相 229 | 眼 230 | 石 231 | 码 232 | 碗 233 | 碟 234 | 磅 235 | 种 236 | 科 237 | 秒 238 | 秒钟 239 | 窝 240 | 立方公尺 241 | 立方分米 242 | 立方厘米 243 | 立方码 244 | 立方米 245 | 立方英寸 246 | 立方英尺 247 | 站 248 | 章 249 | 笔 250 | 等 251 | 筐 252 | 筒 253 | 箱 254 | 篇 255 | 篓 256 | 篮 257 | 簇 258 | 米 259 | 类 260 | 粒 261 | 级 262 | 组 263 | 维 264 | 缕 265 
| 缸 266 | 罐 267 | 网 268 | 群 269 | 股 270 | 脚 271 | 船 272 | 艇 273 | 艘 274 | 色 275 | 节 276 | 英亩 277 | 英寸 278 | 英尺 279 | 英里 280 | 行 281 | 袋 282 | 角 283 | 言 284 | 课 285 | 起 286 | 趟 287 | 路 288 | 车 289 | 转 290 | 轮 291 | 辆 292 | 辈 293 | 连 294 | 通 295 | 遍 296 | 部 297 | 里 298 | 重 299 | 针 300 | 钟 301 | 钱 302 | 锅 303 | 门 304 | 间 305 | 队 306 | 阶段 307 | 隅 308 | 集 309 | 页 310 | 顶 311 | 顷 312 | 项 313 | 顿 314 | 颗 315 | 餐 316 | 首 -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/resources/applicationContext-service-crypto.xml: -------------------------------------------------------------------------------- 1 | 2 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/TestUtils.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.document.Field.Store; 9 | import org.apache.lucene.document.LongField; 10 | import org.apache.lucene.document.StringField; 11 | import org.apache.lucene.document.TextField; 12 | import org.apache.lucene.search.Filter; 13 | import org.apache.lucene.search.IndexSearcher; 14 | import org.apache.lucene.search.Query; 15 | import org.apache.lucene.search.ScoreDoc; 16 | import org.apache.lucene.search.SearcherManager; 17 | import org.apache.lucene.search.Sort; 18 | import org.apache.lucene.search.TopDocs; 19 | 20 | public class TestUtils { 21 | 22 | private static String[] values = { "网站访问量搜索", "近实时搜索" }; 23 | 24 | public static Document[] getDocuments() { 25 | Document[] documents = new Document[100]; 26 | for (int i = 0; i < documents.length; i++) { 27 | documents[i] = createDocument((long) i); 28 | } 29 | return documents; 30 | } 31 | 32 | private static Document createDocument(Long id) { 33 | Document document = new Document(); 34 | document.add(new LongField("time", System.currentTimeMillis(), Store.YES)); 35 | document.add(new StringField("id", String.valueOf(id), Store.YES)); 36 | document.add(new StringField("noAnalyzer", values[(int) (id % 2)], Store.YES)); 37 | document.add(new TextField("analyzer", values[(int) (id % 2)], Store.YES)); 38 | return document; 39 | } 40 | 41 | private static void release(SearcherManager searcherManager, IndexSearcher indexSearcher) { 42 | if (indexSearcher != null) { 43 | try { 44 | searcherManager.release(indexSearcher); 45 | } catch (IOException e) { 46 | e.printStackTrace(); 47 | } 48 | } 49 | } 50 | 51 | public static List getDocuments(SearcherManager searcherManager, Query query, Sort sort) { 52 | IndexSearcher indexSearcher = null; 53 | try { 54 | indexSearcher = searcherManager.acquire(); 55 | TopDocs topDocs = indexSearcher.search(query, 200, sort); 56 | List documents = new ArrayList<>(); 57 | for (ScoreDoc scoreDoc : topDocs.scoreDocs) { 58 | documents.add(indexSearcher.doc(scoreDoc.doc)); 59 | } 60 | return documents; 61 | } catch (Exception e) { 62 | e.printStackTrace(); 63 | return null; 64 | } finally { 65 | release(searcherManager, indexSearcher); 66 | } 67 | } 68 | 69 | public static List getIdList(SearcherManager searcherManager, Query query, Filter filter) { 70 | IndexSearcher indexSearcher = null; 71 | try { 72 | indexSearcher = searcherManager.acquire(); 73 | IdCollector collector = 
new IdCollector(); 74 | indexSearcher.search(query, filter, collector); 75 | return collector.getResult(); 76 | } catch (Exception e) { 77 | e.printStackTrace(); 78 | return null; 79 | } finally { 80 | release(searcherManager, indexSearcher); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/java/org/wltea/analyzer/sample/IKAnalzyerDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.sample; 27 | 28 | import java.io.IOException; 29 | import java.io.StringReader; 30 | 31 | import org.apache.lucene.analysis.Analyzer; 32 | import org.apache.lucene.analysis.TokenStream; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | import org.wltea.analyzer.lucene.IKAnalyzer; 37 | 38 | /** 39 | * 使用IKAnalyzer进行分词的演示 40 | * 2012-10-22 41 | * 42 | */ 43 | public class IKAnalzyerDemo { 44 | 45 | public static void main(String[] args){ 46 | //构建IK分词器,使用smart分词模式 47 | Analyzer analyzer = new IKAnalyzer(true); 48 | 49 | //获取Lucene的TokenStream对象 50 | TokenStream ts = null; 51 | try { 52 | ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too")); 53 | //获取词元位置属性 54 | OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class); 55 | //获取词元文本属性 56 | CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); 57 | //获取词元文本属性 58 | TypeAttribute type = ts.addAttribute(TypeAttribute.class); 59 | 60 | 61 | //重置TokenStream(重置StringReader) 62 | ts.reset(); 63 | //迭代获取分词结果 64 | while (ts.incrementToken()) { 65 | System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type()); 66 | } 67 | //关闭TokenStream(关闭StringReader) 68 | ts.end(); // Perform end-of-stream operations, e.g. set the final offset. 
69 | 70 | } catch (IOException e) { 71 | e.printStackTrace(); 72 | } finally { 73 | //释放TokenStream的所有资源 74 | if(ts != null){ 75 | try { 76 | ts.close(); 77 | } catch (IOException e) { 78 | e.printStackTrace(); 79 | } 80 | } 81 | } 82 | 83 | } 84 | 85 | } 86 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/dic/Hit.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.dic; 27 | 28 | /** 29 | * 表示一次词典匹配的命中 30 | */ 31 | public class Hit { 32 | //Hit不匹配 33 | private static final int UNMATCH = 0x00000000; 34 | //Hit完全匹配 35 | private static final int MATCH = 0x00000001; 36 | //Hit前缀匹配 37 | private static final int PREFIX = 0x00000010; 38 | 39 | 40 | //该HIT当前状态,默认未匹配 41 | private int hitState = UNMATCH; 42 | 43 | //记录词典匹配过程中,当前匹配到的词典分支节点 44 | private DictSegment matchedDictSegment; 45 | /* 46 | * 词段开始位置 47 | */ 48 | private int begin; 49 | /* 50 | * 词段的结束位置 51 | */ 52 | private int end; 53 | 54 | 55 | /** 56 | * 判断是否完全匹配 57 | */ 58 | public boolean isMatch() { 59 | return (this.hitState & MATCH) > 0; 60 | } 61 | /** 62 | * 63 | */ 64 | public void setMatch() { 65 | this.hitState = this.hitState | MATCH; 66 | } 67 | 68 | /** 69 | * 判断是否是词的前缀 70 | */ 71 | public boolean isPrefix() { 72 | return (this.hitState & PREFIX) > 0; 73 | } 74 | /** 75 | * 76 | */ 77 | public void setPrefix() { 78 | this.hitState = this.hitState | PREFIX; 79 | } 80 | /** 81 | * 判断是否是不匹配 82 | */ 83 | public boolean isUnmatch() { 84 | return this.hitState == UNMATCH ; 85 | } 86 | /** 87 | * 88 | */ 89 | public void setUnmatch() { 90 | this.hitState = UNMATCH; 91 | } 92 | 93 | public DictSegment getMatchedDictSegment() { 94 | return matchedDictSegment; 95 | } 96 | 97 | public void setMatchedDictSegment(DictSegment matchedDictSegment) { 98 | this.matchedDictSegment = matchedDictSegment; 99 | } 100 | 101 | public int getBegin() { 102 | return begin; 103 | } 104 | 105 | public void setBegin(int begin) { 106 | this.begin = begin; 107 | } 108 | 109 | public int getEnd() { 110 | return end; 111 | } 112 | 113 | public void setEnd(int end) { 114 | this.end = end; 115 | } 116 | 117 | } 118 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/IdFieldComparator.java: -------------------------------------------------------------------------------- 1 | 
package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.commons.lang3.ArrayUtils; 6 | import org.apache.lucene.index.AtomicReaderContext; 7 | import org.apache.lucene.search.FieldCache; 8 | import org.apache.lucene.search.FieldComparator; 9 | 10 | public class IdFieldComparator extends FieldComparator<Long> { 11 | private final long[] values; 12 | protected final String field; 13 | private FieldCache.Longs currentReaderValues; 14 | private long bottom; 15 | 16 | private long[] headIds; 17 | 18 | public IdFieldComparator(int numHits, String field, long[] headIds) { 19 | this.field = field; 20 | values = new long[numHits]; 21 | this.headIds = headIds; 22 | } 23 | 24 | public int customCompare(long v1, long v2) { 25 | if (v1 == v2) { 26 | return 0; 27 | } 28 | 29 | boolean v1Exists = false; 30 | if (ArrayUtils.contains(headIds, v1)) { 31 | v1Exists = true; 32 | } 33 | boolean v2Exists = false; 34 | if (ArrayUtils.contains(headIds, v2)) { 35 | v2Exists = true; 36 | } 37 | 38 | if (v1Exists && v2Exists) { 39 | return v1 > v2 ? -1 : 1; 40 | } else if (v1Exists) { 41 | return -1; 42 | } else if (v2Exists) { 43 | return 1; 44 | } else { 45 | return v1 > v2 ? -1 : 1; 46 | } 47 | } 48 | 49 | @Override 50 | public int compare(int slot1, int slot2) { 51 | final long v1 = values[slot1]; 52 | final long v2 = values[slot2]; 53 | // if (v1 > v2) { 54 | // return 1; 55 | // } else if (v1 < v2) { 56 | // return -1; 57 | // } else { 58 | // return 0; 59 | // } 60 | return customCompare(v1, v2); 61 | } 62 | 63 | @Override 64 | public void setBottom(int slot) { 65 | this.bottom = values[slot]; 66 | } 67 | 68 | @Override 69 | public int compareBottom(int doc) throws IOException { 70 | long v2 = currentReaderValues.get(doc); 71 | // if (bottom > v2) { 72 | // return 1; 73 | // } else if (bottom < v2) { 74 | // return -1; 75 | // } else { 76 | // return 0; 77 | // } 78 | return customCompare(bottom, v2); 79 | } 80 | 81 | @Override 82 | public void copy(int slot, int doc) throws IOException { 83 | long v2 = currentReaderValues.get(doc); 84 | values[slot] = v2; 85 | } 86 | 87 | @Override 88 | public FieldComparator<Long> setNextReader(AtomicReaderContext context) throws IOException { 89 | currentReaderValues = FieldCache.DEFAULT.getLongs(context.reader(), field, false); 90 | return this; 91 | } 92 | 93 | @Override 94 | public Long value(int slot) { 95 | return Long.valueOf(values[slot]); 96 | } 97 | 98 | @Override 99 | public int compareDocToValue(int doc, Long value) throws IOException { 100 | final long valueLong = value.longValue(); 101 | long docValue = currentReaderValues.get(doc); 102 | // if (docValue < valueLong) { 103 | // return -1; 104 | // } else if (docValue > valueLong) { 105 | // return 1; 106 | // } else { 107 | // return 0; 108 | // } 109 | return customCompare(docValue, valueLong); 110 | } 111 | 112 | } 113 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/resources/org/danielli/xultimate/lucene/applicationContext-service-lucene1.xml: -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/CharacterUtil.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 |
5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 字符集识别工具类 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | /** 29 | * 30 | * 字符集识别工具类 31 | */ 32 | class CharacterUtil { 33 | 34 | public static final int CHAR_USELESS = 0; 35 | 36 | public static final int CHAR_ARABIC = 0X00000001; 37 | 38 | public static final int CHAR_ENGLISH = 0X00000002; 39 | 40 | public static final int CHAR_CHINESE = 0X00000004; 41 | 42 | public static final int CHAR_OTHER_CJK = 0X00000008; 43 | 44 | 45 | /** 46 | * 识别字符类型 47 | * @param input 48 | * @return int CharacterUtil定义的字符类型常量 49 | */ 50 | static int identifyCharType(char input){ 51 | if(input >= '0' && input <= '9'){ 52 | return CHAR_ARABIC; 53 | 54 | }else if((input >= 'a' && input <= 'z') 55 | || (input >= 'A' && input <= 'Z')){ 56 | return CHAR_ENGLISH; 57 | 58 | }else { 59 | Character.UnicodeBlock ub = Character.UnicodeBlock.of(input); 60 | 61 | if(ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS 62 | || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS 63 | || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A){ 64 | //目前已知的中文字符UTF-8集合 65 | return CHAR_CHINESE; 66 | 67 | }else if(ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS //全角数字字符和日韩字符 68 | //韩文字符集 69 | || ub == Character.UnicodeBlock.HANGUL_SYLLABLES 70 | || ub == Character.UnicodeBlock.HANGUL_JAMO 71 | || ub == Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO 72 | //日文字符集 73 | || ub == Character.UnicodeBlock.HIRAGANA //平假名 74 | || ub == Character.UnicodeBlock.KATAKANA //片假名 75 | || ub == Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS){ 76 | return CHAR_OTHER_CJK; 77 | 78 | } 79 | } 80 | //其他的不做处理的字符 81 | return CHAR_USELESS; 82 | } 83 | 84 | /** 85 | * 进行字符规格化(全角转半角,大写转小写处理) 86 | * @param input 87 | * @return char 88 | */ 89 | static char regularize(char input){ 90 | if (input == 12288) { 91 | input = (char) 32; 92 | 93 | }else if (input > 65280 && input < 65375) { 94 | input = (char) (input - 65248); 95 | 96 | }else if (input >= 'A' && input <= 'Z') { 97 | input += 32; 98 | } 99 | 100 | return input; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/resources/org/danielli/xultimate/lucene/applicationContext-service-lucene2.xml: --------------------------------------------------------------------------------
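A quick illustration of what CharacterUtil does for the segmenters may help here. The sketch below is not part of the original sources; it assumes a hypothetical demo class placed in the org.wltea.analyzer.core package, since CharacterUtil and its methods are package-private:

```java
package org.wltea.analyzer.core;

// Hypothetical demo class, not in the original repo.
public class CharacterUtilSketch {
    public static void main(String[] args) {
        // Type identification: digits, Latin letters and CJK ideographs map to distinct constants.
        System.out.println(CharacterUtil.identifyCharType('9') == CharacterUtil.CHAR_ARABIC);   // true
        System.out.println(CharacterUtil.identifyCharType('k') == CharacterUtil.CHAR_ENGLISH);  // true
        System.out.println(CharacterUtil.identifyCharType('词') == CharacterUtil.CHAR_CHINESE); // true
        // Regularization: full-width space becomes a half-width space, upper case becomes lower case.
        System.out.println(CharacterUtil.regularize((char) 12288) == ' '); // true
        System.out.println(CharacterUtil.regularize('A'));                 // prints: a
        // Note the if/else-if chain: a full-width 'A' (U+FF21) is narrowed but NOT lower-cased.
        System.out.println(CharacterUtil.regularize((char) 65313));        // prints: A
    }
}
```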
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0.1 3 | * IK Analyzer release 5.0.1 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | 25 | * 26 | */ 27 | package org.wltea.analyzer.lucene; 28 | 29 | import java.io.IOException; 30 | import java.io.Reader; 31 | 32 | import org.apache.lucene.analysis.Tokenizer; 33 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 34 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 35 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 36 | 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * IK分词器 Lucene Tokenizer适配器类 42 | * 兼容Lucene 4.0版本 43 | */ 44 | public final class IKTokenizer extends Tokenizer { 45 | 46 | //IK分词器实现 47 | private IKSegmenter _IKImplement; 48 | 49 | //词元文本属性 50 | private final CharTermAttribute termAtt; 51 | //词元位移属性 52 | private final OffsetAttribute offsetAtt; 53 | //词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量) 54 | private final TypeAttribute typeAtt; 55 | //记录最后一个词元的结束位置 56 | private int endPosition; 57 | 58 | /** 59 | * Lucene 4.0 Tokenizer适配器类构造函数 60 | * @param in 61 | * @param useSmart 62 | */ 63 | public IKTokenizer(Reader in , boolean useSmart){ 64 | super(in); 65 | offsetAtt = addAttribute(OffsetAttribute.class); 66 | termAtt = addAttribute(CharTermAttribute.class); 67 | typeAtt = addAttribute(TypeAttribute.class); 68 | _IKImplement = new IKSegmenter(input , useSmart); 69 | } 70 | 71 | /* (non-Javadoc) 72 | * @see org.apache.lucene.analysis.TokenStream#incrementToken() 73 | */ 74 | @Override 75 | public boolean incrementToken() throws IOException { 76 | //清除所有的词元属性 77 | clearAttributes(); 78 | Lexeme nextLexeme = _IKImplement.next(); 79 | if(nextLexeme != null){ 80 | //将Lexeme转成Attributes 81 | //设置词元文本 82 | termAtt.append(nextLexeme.getLexemeText()); 83 | //设置词元长度 84 | termAtt.setLength(nextLexeme.getLength()); 85 | //设置词元位移 86 | offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition()); 87 | //记录分词的最后位置 88 | endPosition = nextLexeme.getEndPosition(); 89 | //记录词元分类 90 | typeAtt.setType(nextLexeme.getLexemeTypeString()); 91 | //返回true告知还有下个词元 92 | return true; 93 | } 94 | //返回false告知词元输出完毕 95 | return false; 96 | } 97 | 98 | /* 99 | * (non-Javadoc) 100 | * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader) 101 | */ 102 | @Override 103 | public void reset() throws IOException { 104 | super.reset(); 105 |
_IKImplement.reset(input); 106 | } 107 | 108 | @Override 109 | public final void end() { 110 | // set final offset 111 | int finalOffset = correctOffset(this.endPosition); 112 | offsetAtt.setOffset(finalOffset, finalOffset); 113 | } 114 | } 115 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/resources/org/danielli/xultimate/lucene/applicationContext-service-lucene3.xml: -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/CJKSegmenter.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * IK 中文分词 版本 5.0 4 | * IK Analyzer release 5.0 5 | * 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * 21 | * 源代码由林良益(linliangyi2005@gmail.com)提供 22 | * 版权声明 2012,乌龙茶工作室 23 | * provided by Linliangyi and copyright 2012 by Oolong studio 24 | * 25 | */ 26 | package org.wltea.analyzer.core; 27 | 28 | import java.util.LinkedList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.dic.Dictionary; 32 | import org.wltea.analyzer.dic.Hit; 33 | 34 | 35 | /** 36 | * 中文-日韩文子分词器 37 | */ 38 | class CJKSegmenter implements ISegmenter { 39 | 40 | //子分词器标签 41 | static final String SEGMENTER_NAME = "CJK_SEGMENTER"; 42 | //待处理的分词hit队列 43 | private List<Hit> tmpHits; 44 | 45 | 46 | CJKSegmenter(){ 47 | this.tmpHits = new LinkedList<Hit>(); 48 | } 49 | 50 | /* (non-Javadoc) 51 | * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext) 52 | */ 53 | public void analyze(AnalyzeContext context) { 54 | if(CharacterUtil.CHAR_USELESS != context.getCurrentCharType()){ 55 | 56 | //优先处理tmpHits中的hit 57 | if(!this.tmpHits.isEmpty()){ 58 | //处理词段队列 59 | Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]); 60 | for(Hit hit : tmpArray){ 61 | hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit); 62 | if(hit.isMatch()){ 63 | //输出当前的词 64 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD); 65 | context.addLexeme(newLexeme); 66 | 67 | if(!hit.isPrefix()){//不是词前缀,hit不需要继续匹配,移除 68 | this.tmpHits.remove(hit); 69 | } 70 | 71 | }else if(hit.isUnmatch()){ 72 | //hit不是词,移除 73 | this.tmpHits.remove(hit); 74 | } 75 | } 76 | } 77 | 78 | //********************************* 79 | //再对当前指针位置的字符进行单字匹配 80 | Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1); 81
| if(singleCharHit.isMatch()){//首字成词 82 | //输出当前的词 83 | Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD); 84 | context.addLexeme(newLexeme); 85 | 86 | //同时也是词前缀 87 | if(singleCharHit.isPrefix()){ 88 | //前缀匹配则放入hit列表 89 | this.tmpHits.add(singleCharHit); 90 | } 91 | }else if(singleCharHit.isPrefix()){//首字为词前缀 92 | //前缀匹配则放入hit列表 93 | this.tmpHits.add(singleCharHit); 94 | } 95 | 96 | 97 | }else{ 98 | //遇到CHAR_USELESS字符 99 | //清空队列 100 | this.tmpHits.clear(); 101 | } 102 | 103 | //判断缓冲区是否已经读完 104 | if(context.isBufferConsumed()){ 105 | //清空队列 106 | this.tmpHits.clear(); 107 | } 108 | 109 | //判断是否锁定缓冲区 110 | if(this.tmpHits.size() == 0){ 111 | context.unlockBuffer(SEGMENTER_NAME); 112 | 113 | }else{ 114 | context.lockBuffer(SEGMENTER_NAME); 115 | } 116 | } 117 | 118 | /* (non-Javadoc) 119 | * @see org.wltea.analyzer.core.ISegmenter#reset() 120 | */ 121 | public void reset() { 122 | //清空队列 123 | this.tmpHits.clear(); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/NearRealtimeSearchTest2.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.index.Term; 9 | import org.apache.lucene.index.TrackingIndexWriter; 10 | import org.apache.lucene.queryparser.classic.ParseException; 11 | import org.apache.lucene.queryparser.classic.QueryParser; 12 | import org.apache.lucene.search.BooleanQuery; 13 | import org.apache.lucene.search.FieldComparator; 14 | import org.apache.lucene.search.FieldComparatorSource; 15 | import org.apache.lucene.search.QueryWrapperFilter; 16 | import org.apache.lucene.search.SearcherManager; 17 | import org.apache.lucene.search.Sort; 18 | import org.apache.lucene.search.SortField; 19 | import org.apache.lucene.search.TermQuery; 20 | import org.apache.lucene.search.BooleanClause.Occur; 21 | import org.apache.lucene.search.SortField.Type; 22 | import org.apache.lucene.util.Version; 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | import org.junit.runner.RunWith; 26 | import org.springframework.test.context.ContextConfiguration; 27 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 28 | import org.wltea.analyzer.lucene.IKAnalyzer; 29 | 30 | @RunWith(SpringJUnit4ClassRunner.class) 31 | @ContextConfiguration(locations = { "classpath*:/org/danielli/xultimate/lucene/applicationContext-service-lucene2.xml"}) 32 | public class NearRealtimeSearchTest2 { 33 | 34 | @Resource 35 | private TrackingIndexWriter trackingIndexWriter; 36 | 37 | @Resource 38 | private SearcherManager searcherManager; 39 | 40 | @Resource 41 | private IKAnalyzer ikAnalyzer; 42 | 43 | public void saveDocument(Document document) { 44 | try { 45 | trackingIndexWriter.addDocument(document); 46 | searcherManager.maybeRefreshBlocking(); 47 | } catch (Exception e) { 48 | e.printStackTrace(); 49 | } 50 | } 51 | 52 | public void deleteDocumentById(Long id) { 53 | try { 54 | trackingIndexWriter.deleteDocuments(new Term("id", String.valueOf(id))); 55 | searcherManager.maybeRefreshBlocking(); 56 | } catch (Exception e) { 57 | e.printStackTrace(); 58 | } 59 | } 60 | 61 | @Before 62 | public void before() { 63 | for (Document document : TestUtils.getDocuments()) { 64 | saveDocument(document); 
65 | } 66 | } 67 | 68 | @Test 69 | public void test() throws ParseException { 70 | /* Search */ 71 | // 未分词 + 排序 72 | BooleanQuery noAnalyzerBooleanQuery = new BooleanQuery(); 73 | noAnalyzerBooleanQuery.add(new TermQuery(new Term("noAnalyzer", "近实时搜索")), Occur.MUST); 74 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 75 | // 删除 76 | deleteDocumentById(1L); 77 | // 未分词 + 排序 78 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 79 | // 分词 + 排序 80 | BooleanQuery analyzerBooleanQuery = new BooleanQuery(); 81 | analyzerBooleanQuery.add(new QueryParser(Version.LUCENE_45, "analyzer", ikAnalyzer).parse("近实时搜索"), Occur.MUST); 82 | System.out.println(TestUtils.getDocuments(searcherManager, analyzerBooleanQuery, Sort.RELEVANCE).size()); 83 | /* Filter + Collector */ 84 | // 自定义Filter + 自定义Collector 85 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, new IdFilter(new Long[] { 3L, 5L })).size()); 86 | // Filter + 自定义Collector 87 | BooleanQuery booleanQuery = new BooleanQuery(); 88 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(3L))), Occur.SHOULD); 89 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(5L))), Occur.SHOULD); 90 | QueryWrapperFilter filter = new QueryWrapperFilter(booleanQuery); 91 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, filter).size()); 92 | /* 排序 */ 93 | // ID倒排 94 | Sort idSort = new Sort(new SortField("id", Type.LONG, true)); 95 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 96 | // 自定义排序 97 | idSort = new Sort(new SortField("id", new FieldComparatorSource() { 98 | 99 | @Override 100 | public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException { 101 | return new IdFieldComparator(numHits, fieldname, new long[] { 59L, 57L }); 102 | } 103 | })); 104 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 105 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(1).get("id")); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/NearRealtimeSearchTest1.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.index.IndexWriter; 9 | import org.apache.lucene.index.Term; 10 | import org.apache.lucene.queryparser.classic.ParseException; 11 | import org.apache.lucene.queryparser.classic.QueryParser; 12 | import org.apache.lucene.search.BooleanQuery; 13 | import org.apache.lucene.search.FieldComparator; 14 | import org.apache.lucene.search.FieldComparatorSource; 15 | import org.apache.lucene.search.QueryWrapperFilter; 16 | import org.apache.lucene.search.SearcherManager; 17 | import org.apache.lucene.search.Sort; 18 | import org.apache.lucene.search.SortField; 19 | import org.apache.lucene.search.SortField.Type; 20 | import org.apache.lucene.search.TermQuery; 21 | import org.apache.lucene.search.BooleanClause.Occur; 22 | import org.apache.lucene.util.Version; 23 | import org.junit.Before; 24 | import org.junit.Test; 
25 | import org.junit.runner.RunWith; 26 | import org.springframework.test.context.ContextConfiguration; 27 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 28 | import org.wltea.analyzer.lucene.IKAnalyzer; 29 | 30 | @RunWith(SpringJUnit4ClassRunner.class) 31 | @ContextConfiguration(locations = { "classpath:/org/danielli/xultimate/lucene/applicationContext-service-lucene1.xml"}) 32 | public class NearRealtimeSearchTest1 { 33 | 34 | @Resource 35 | private IndexWriter indexWriter; 36 | 37 | @Resource 38 | private SearcherManager searcherManager; 39 | 40 | @Resource 41 | private IKAnalyzer ikAnalyzer; 42 | 43 | public void saveDocument(Document document) { 44 | try { 45 | indexWriter.addDocument(document); 46 | indexWriter.commit(); 47 | searcherManager.maybeRefreshBlocking(); 48 | } catch (Exception e) { 49 | e.printStackTrace(); 50 | } 51 | } 52 | 53 | public void deleteDocumentById(Long id) { 54 | try { 55 | indexWriter.deleteDocuments(new Term("id", String.valueOf(id))); 56 | indexWriter.commit(); 57 | searcherManager.maybeRefreshBlocking(); 58 | } catch (Exception e) { 59 | e.printStackTrace(); 60 | } 61 | } 62 | 63 | @Before 64 | public void before() { 65 | for (Document document : TestUtils.getDocuments()) { 66 | saveDocument(document); 67 | } 68 | } 69 | 70 | @Test 71 | public void test() throws ParseException { 72 | /* Search */ 73 | // 未分词 + 排序 74 | BooleanQuery noAnalyzerBooleanQuery = new BooleanQuery(); 75 | noAnalyzerBooleanQuery.add(new TermQuery(new Term("noAnalyzer", "近实时搜索")), Occur.MUST); 76 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 77 | // 删除 78 | deleteDocumentById(1L); 79 | // 未分词 + 排序 80 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 81 | // 分词 + 排序 82 | BooleanQuery analyzerBooleanQuery = new BooleanQuery(); 83 | analyzerBooleanQuery.add(new QueryParser(Version.LUCENE_45, "analyzer", ikAnalyzer).parse("近实时搜索"), Occur.MUST); 84 | System.out.println(TestUtils.getDocuments(searcherManager, analyzerBooleanQuery, Sort.RELEVANCE).size()); 85 | /* Filter + Collector */ 86 | // 自定义Filter + 自定义Collector 87 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, new IdFilter(new Long[] { 3L, 5L })).size()); 88 | // Filter + 自定义Collector 89 | BooleanQuery booleanQuery = new BooleanQuery(); 90 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(3L))), Occur.SHOULD); 91 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(5L))), Occur.SHOULD); 92 | QueryWrapperFilter filter = new QueryWrapperFilter(booleanQuery); 93 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, filter).size()); 94 | /* 排序 */ 95 | // ID倒排 96 | Sort idSort = new Sort(new SortField("id", Type.LONG, true)); 97 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 98 | // 自定义排序 99 | idSort = new Sort(new SortField("id", new FieldComparatorSource() { 100 | 101 | @Override 102 | public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException { 103 | return new IdFieldComparator(numHits, fieldname, new long[] { 59L, 57L }); 104 | } 105 | })); 106 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 107 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(1).get("id")); 108 | } 
109 | } 110 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/danielli/xultimate/searching/SynonymFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.searching; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | import java.text.ParseException; 6 | import java.util.HashMap; 7 | import java.util.Iterator; 8 | import java.util.Map; 9 | 10 | import org.apache.lucene.analysis.Analyzer; 11 | import org.apache.lucene.analysis.TokenStream; 12 | import org.apache.lucene.analysis.Tokenizer; 13 | import org.apache.lucene.analysis.core.LowerCaseFilter; 14 | import org.apache.lucene.analysis.core.WhitespaceTokenizer; 15 | import org.apache.lucene.analysis.synonym.SolrSynonymParser; 16 | import org.apache.lucene.analysis.synonym.SynonymFilter; 17 | import org.apache.lucene.analysis.synonym.SynonymMap; 18 | import org.apache.lucene.analysis.util.ResourceLoader; 19 | import org.apache.lucene.analysis.util.ResourceLoaderAware; 20 | import org.apache.lucene.analysis.util.TokenFilterFactory; 21 | import org.apache.lucene.analysis.util.TokenizerFactory; 22 | import org.apache.lucene.util.Version; 23 | 24 | public class SynonymFilterFactory extends TokenFilterFactory implements ResourceLoaderAware { 25 | 26 | private final boolean ignoreCase; 27 | private final String tokenizerFactory; 28 | private final String format; 29 | private final boolean expand; 30 | private final Map<String, String> tokArgs = new HashMap<String, String>(); 31 | 32 | private SynonymMap map; 33 | 34 | public SynonymFilterFactory(Map<String, String> args) { 35 | super(args); 36 | ignoreCase = getBoolean(args, "ignoreCase", false); 37 | format = get(args, "format"); 38 | expand = getBoolean(args, "expand", true); 39 | 40 | tokenizerFactory = get(args, "tokenizerFactory"); 41 | if (tokenizerFactory != null) { 42 | assureMatchVersion(); 43 | tokArgs.put("luceneMatchVersion", getLuceneMatchVersion().toString()); 44 | for (Iterator<String> itr = args.keySet().iterator(); itr.hasNext();) { 45 | String key = itr.next(); 46 | tokArgs.put(key.replaceAll("^tokenizerFactory\\.", ""), args.get(key)); 47 | itr.remove(); 48 | } 49 | } 50 | if (!args.isEmpty()) { 51 | throw new IllegalArgumentException("Unknown parameters: " + args); 52 | } 53 | } 54 | 55 | @SuppressWarnings("resource") 56 | @Override 57 | public TokenStream create(TokenStream input) { 58 | // if the fst is null, it means there's actually no synonyms... just return the original stream 59 | // as there is nothing to do here. 60 | return map.fst == null ? input : new SynonymFilter(input, map, ignoreCase); 61 | } 62 | 63 | @SuppressWarnings("resource") 64 | @Override 65 | public void inform(ResourceLoader loader) throws IOException { 66 | final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory); 67 | 68 | Analyzer analyzer = new Analyzer() { 69 | @Override 70 | protected TokenStreamComponents createComponents(String fieldName, Reader reader) { 71 | Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_45, reader) : factory.create(reader); 72 | TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_45, tokenizer) : tokenizer; 73 | return new TokenStreamComponents(tokenizer, stream); 74 | } 75 | }; 76 | 77 | try { 78 | if (format == null || format.equals("solr")) { 79 | // TODO: expose dedup as a parameter?
80 | map = loadSolrSynonyms(loader, true, analyzer); 81 | } else { 82 | // TODO: somehow make this more pluggable 83 | throw new IllegalArgumentException("Unrecognized synonyms format: " + format); 84 | } 85 | } catch (ParseException e) { 86 | throw new IOException("Error parsing synonyms file:", e); 87 | } 88 | } 89 | 90 | /** 91 | * Load synonyms from the solr format, "format=solr". 92 | */ 93 | private SynonymMap loadSolrSynonyms(ResourceLoader loader, boolean dedup, Analyzer analyzer) throws IOException, ParseException { 94 | SolrSynonymParser parser = new SolrSynonymParser(dedup, expand, analyzer); 95 | 96 | SolrSynonymDtabaseLoader solrSynonymDtabaseLoader = new SolrSynonymDtabaseLoader(); 97 | solrSynonymDtabaseLoader.handle(parser); 98 | 99 | return parser.build(); 100 | } 101 | 102 | // (there are no tests for this functionality) 103 | private TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname) throws IOException { 104 | Class<? extends TokenizerFactory> clazz = loader.findClass(cname, TokenizerFactory.class); 105 | try { 106 | TokenizerFactory tokFactory = clazz.getConstructor(Map.class).newInstance(tokArgs); 107 | if (tokFactory instanceof ResourceLoaderAware) { 108 | ((ResourceLoaderAware) tokFactory).inform(loader); 109 | } 110 | return tokFactory; 111 | } catch (Exception e) { 112 | throw new RuntimeException(e); 113 | } 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/IKArbitrator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License.
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.core; 26 | 27 | import java.util.Stack; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * IK分词歧义裁决器 32 | */ 33 | class IKArbitrator { 34 | 35 | IKArbitrator(){ 36 | 37 | } 38 | 39 | /** 40 | * 分词歧义处理 41 | * @param context 42 | * @param useSmart 43 | */ 44 | void process(AnalyzeContext context , boolean useSmart){ 45 | QuickSortSet orgLexemes = context.getOrgLexemes(); 46 | Lexeme orgLexeme = orgLexemes.pollFirst(); 47 | 48 | LexemePath crossPath = new LexemePath(); 49 | while(orgLexeme != null){ 50 | if(!crossPath.addCrossLexeme(orgLexeme)){ 51 | //找到与crossPath不相交的下一个crossPath 52 | if(crossPath.size() == 1 || !useSmart){ 53 | //crossPath没有歧义 或者 不做歧义处理 54 | //直接输出当前crossPath 55 | context.addLexemePath(crossPath); 56 | }else{ 57 | //对当前的crossPath进行歧义处理 58 | QuickSortSet.Cell headCell = crossPath.getHead(); 59 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 60 | //输出歧义处理结果judgeResult 61 | context.addLexemePath(judgeResult); 62 | } 63 | 64 | //把orgLexeme加入新的crossPath中 65 | crossPath = new LexemePath(); 66 | crossPath.addCrossLexeme(orgLexeme); 67 | } 68 | orgLexeme = orgLexemes.pollFirst(); 69 | } 70 | 71 | 72 | //处理最后的path 73 | if(crossPath.size() == 1 || !useSmart){ 74 | //crossPath没有歧义 或者 不做歧义处理 75 | //直接输出当前crossPath 76 | context.addLexemePath(crossPath); 77 | }else{ 78 | //对当前的crossPath进行歧义处理 79 | QuickSortSet.Cell headCell = crossPath.getHead(); 80 | LexemePath judgeResult = this.judge(headCell, crossPath.getPathLength()); 81 | //输出歧义处理结果judgeResult 82 | context.addLexemePath(judgeResult); 83 | } 84 | } 85 | 86 | /** 87 | * 歧义识别 88 | * @param lexemeCell 歧义路径链表头 89 | * @param fullTextLength 歧义路径文本长度 90 | * @return 候选路径集合中的最优方案 91 | */ 92 | private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){ 93 | 94 | //候选路径集合 95 | TreeSet<LexemePath> pathOptions = new TreeSet<LexemePath>(); 96 | //候选结果路径 97 | LexemePath option = new LexemePath(); 98 | 99 | //对crossPath进行一次遍历,同时返回本次遍历中有冲突的Lexeme栈 100 | Stack<QuickSortSet.Cell> lexemeStack = this.forwardPath(lexemeCell , option); 101 | 102 | //当前词元链并非最理想的,加入候选路径集合 103 | pathOptions.add(option.copy()); 104 | 105 | //存在歧义词,处理 106 | QuickSortSet.Cell c = null; 107 | while(!lexemeStack.isEmpty()){ 108 | c = lexemeStack.pop(); 109 | //回滚词元链 110 | this.backPath(c.getLexeme() , option); 111 | //从歧义词位置开始,递归,生成可选方案 112 | this.forwardPath(c , option); 113 | pathOptions.add(option.copy()); 114 | } 115 | 116 | //返回集合中的最优方案 117 | return pathOptions.first(); 118 | 119 | } 120 | 121 | /** 122 | * 向前遍历,添加词元,构造一个无歧义词元组合 123 | * @param lexemeCell 124 | * @param option 125 | */ 126 | private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){ 127 | //发生冲突的Lexeme栈 128 | Stack<QuickSortSet.Cell> conflictStack = new Stack<QuickSortSet.Cell>(); 129 | QuickSortSet.Cell c = lexemeCell; 130 | //迭代遍历Lexeme链表 131 | while(c != null && c.getLexeme() != null){ 132 | if(!option.addNotCrossLexeme(c.getLexeme())){ 133 | //词元交叉,添加失败则加入lexemeStack栈 134 | conflictStack.push(c); 135 | } 136 | c = c.getNext(); 137 | } 138 | return conflictStack; 139 | } 140 | 141 | /** 142 | * 回滚词元链,直到它能够接受指定的词元 143 | * @param l 144 | * @param option 145 | */ 146 | private void backPath(Lexeme l , LexemePath option){ 147 | while(option.checkCross(l)){ 148 | option.removeTail(); 149 | } 150 | 151 | } 152 | 153 | } 154 | --------------------------------------------------------------------------------
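IKArbitrator is only exercised when useSmart is enabled; with useSmart=false every crossing lexeme is emitted as-is. A minimal sketch of the observable difference, driven through IKSegmenter, which is the public entry point that invokes the arbitrator (hypothetical demo class; the sample string is arbitrary):

```java
package org.wltea.analyzer.sample;

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Hypothetical demo class, not in the original repo.
public class ArbitratorSketch {
    public static void main(String[] args) throws IOException {
        for (boolean useSmart : new boolean[] { false, true }) {
            // useSmart=false: fine-grained output, overlapping lexemes are all emitted.
            // useSmart=true: IKArbitrator.process(...) keeps one non-crossing path per ambiguity.
            IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国"), useSmart);
            StringBuilder out = new StringBuilder("useSmart=" + useSmart + ":");
            for (Lexeme l = seg.next(); l != null; l = seg.next()) {
                out.append(' ').append(l.getLexemeText());
            }
            System.out.println(out);
        }
    }
}
```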
/xultimate-ikanalyzer/src/main/resources/applicationContext-dao-base.xml: -------------------------------------------------------------------------------- /xultimate-lucene/src/test/java/org/danielli/xultimate/lucene/NearRealtimeSearchTest3.java: -------------------------------------------------------------------------------- 1 | package org.danielli.xultimate.lucene; 2 | 3 | import java.io.IOException; 4 | 5 | import javax.annotation.Resource; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.index.Term; 9 | import org.apache.lucene.index.TrackingIndexWriter; 10 | import org.apache.lucene.queryparser.classic.ParseException; 11 | import org.apache.lucene.queryparser.classic.QueryParser; 12 | import org.apache.lucene.search.BooleanClause.Occur; 13 | import org.apache.lucene.search.BooleanQuery; 14 | import org.apache.lucene.search.FieldComparator; 15 | import org.apache.lucene.search.FieldComparatorSource; 16 | import org.apache.lucene.search.QueryWrapperFilter; 17 | import org.apache.lucene.search.SearcherManager; 18 | import org.apache.lucene.search.Sort; 19 | import org.apache.lucene.search.SortField; 20 | import org.apache.lucene.search.SortField.Type; 21 | import org.apache.lucene.search.TermQuery; 22 | import org.apache.lucene.util.Version; 23 | import org.junit.Before; 24 | import org.junit.Test; 25 | import org.junit.runner.RunWith; 26 | import org.springframework.test.context.ContextConfiguration; 27 | import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; 28 | import org.wltea.analyzer.lucene.IKAnalyzer; 29 | 30 | @RunWith(SpringJUnit4ClassRunner.class) 31 | @ContextConfiguration(locations = { "classpath*:/org/danielli/xultimate/lucene/applicationContext-service-lucene3.xml"}) 32 | public class NearRealtimeSearchTest3 { 33 | 34 | @Resource 35 | private TrackingIndexWriter trackingIndexWriter; 36 | 37 | @Resource 38 | private SearcherManager searcherManager; 39 | 40 | @Resource 41 | private IKAnalyzer ikAnalyzer; 42 | 43 | public void saveDocument(Document document) { 44 | try { 45 | trackingIndexWriter.addDocument(document); 46 | } catch (Exception e) { 47 | e.printStackTrace(); 48 | } 49 | } 50 | 51 | public void deleteDocumentById(Long id) { 52 | try { 53 | trackingIndexWriter.deleteDocuments(new Term("id", String.valueOf(id))); 54 | } catch (Exception e) { 55 | e.printStackTrace(); 56 | } 57 | } 58 | 59 | @Before 60 | public void before() { 61 | for (Document document : TestUtils.getDocuments()) { 62 | saveDocument(document); 63 | } 64 | } 65 | 66 | @Test 67 | public void test() throws InterruptedException, ParseException { 68 | /* Search */ 69 | // 未分词 + 排序 70 | BooleanQuery noAnalyzerBooleanQuery = new BooleanQuery(); 71 | noAnalyzerBooleanQuery.add(new TermQuery(new Term("noAnalyzer", "近实时搜索")), Occur.MUST); 72 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 73 | Thread.sleep(3 * 1000); 74 | // 未分词 + 排序 75 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 76 | // 删除 77 | deleteDocumentById(1L); 78 | // 未分词 + 排序 79 | System.out.println(TestUtils.getDocuments(searcherManager,
noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 80 | Thread.sleep(3 * 1000); 81 | // 未分词 + 排序 82 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, Sort.RELEVANCE).size()); 83 | // 分词 + 排序 84 | BooleanQuery analyzerBooleanQuery = new BooleanQuery(); 85 | analyzerBooleanQuery.add(new QueryParser(Version.LUCENE_45, "analyzer", ikAnalyzer).parse("近实时搜索"), Occur.MUST); 86 | System.out.println(TestUtils.getDocuments(searcherManager, analyzerBooleanQuery, Sort.RELEVANCE).size()); 87 | /* Filter + Collector */ 88 | // 自定义Filter + 自定义Collector 89 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, new IdFilter(new Long[] { 3L, 5L })).size()); 90 | // Filter + 自定义Collector 91 | BooleanQuery booleanQuery = new BooleanQuery(); 92 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(3L))), Occur.SHOULD); 93 | booleanQuery.add(new TermQuery(new Term("id", String.valueOf(5L))), Occur.SHOULD); 94 | QueryWrapperFilter filter = new QueryWrapperFilter(booleanQuery); 95 | System.out.println(TestUtils.getIdList(searcherManager, noAnalyzerBooleanQuery, filter).size()); 96 | /* 排序 */ 97 | // ID倒排 98 | Sort idSort = new Sort(new SortField("id", Type.LONG, true)); 99 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 100 | // 自定义排序 101 | idSort = new Sort(new SortField("id", new FieldComparatorSource() { 102 | 103 | @Override 104 | public FieldComparator newComparator(String fieldname, int numHits, int sortPos, boolean reversed) throws IOException { 105 | return new IdFieldComparator(numHits, fieldname, new long[] { 59L, 57L }); 106 | } 107 | })); 108 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(0).get("id")); 109 | System.out.println(TestUtils.getDocuments(searcherManager, noAnalyzerBooleanQuery, idSort).get(1).get("id")); 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/IKSegmenter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | */ 24 | package org.wltea.analyzer.core; 25 | 26 | import java.io.IOException; 27 | import java.io.Reader; 28 | import java.util.ArrayList; 29 | import java.util.List; 30 | 31 | import org.wltea.analyzer.cfg.Configuration; 32 | import org.wltea.analyzer.cfg.DefaultConfig; 33 | import org.wltea.analyzer.dic.Dictionary; 34 | 35 | /** 36 | * IK分词器主类 37 | * 38 | */ 39 | public final class IKSegmenter { 40 | 41 | //字符串reader 42 | private Reader input; 43 | //分词器配置项 44 | private Configuration cfg; 45 | //分词器上下文 46 | private AnalyzeContext context; 47 | //分词处理器列表 48 | private List<ISegmenter> segmenters; 49 | //分词歧义裁决器 50 | private IKArbitrator arbitrator; 51 | 52 | 53 | /** 54 | * IK分词器构造函数 55 | * @param input 56 | * @param useSmart 为true,使用智能分词策略 57 | * 58 | * 非智能分词:细粒度输出所有可能的切分结果 59 | * 智能分词: 合并数词和量词,对分词结果进行歧义判断 60 | */ 61 | public IKSegmenter(Reader input , boolean useSmart){ 62 | this.input = input; 63 | this.cfg = DefaultConfig.getInstance(); 64 | this.cfg.setUseSmart(useSmart); 65 | this.init(); 66 | } 67 | 68 | /** 69 | * IK分词器构造函数 70 | * @param input 71 | * @param cfg 使用自定义的Configuration构造分词器 72 | * 73 | */ 74 | public IKSegmenter(Reader input , Configuration cfg){ 75 | this.input = input; 76 | this.cfg = cfg; 77 | this.init(); 78 | } 79 | 80 | /** 81 | * 初始化 82 | */ 83 | private void init(){ 84 | //初始化词典单例 85 | Dictionary.initial(this.cfg); 86 | //初始化分词上下文 87 | this.context = new AnalyzeContext(this.cfg); 88 | //加载子分词器 89 | this.segmenters = this.loadSegmenters(); 90 | //加载歧义裁决器 91 | this.arbitrator = new IKArbitrator(); 92 | } 93 | 94 | /** 95 | * 初始化词典,加载子分词器实现 96 | * @return List<ISegmenter> 97 | */ 98 | private List<ISegmenter> loadSegmenters(){ 99 | List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4); 100 | //处理字母的子分词器 101 | segmenters.add(new LetterSegmenter()); 102 | //处理中文数量词的子分词器 103 | segmenters.add(new CN_QuantifierSegmenter()); 104 | //处理中文词的子分词器 105 | segmenters.add(new CJKSegmenter()); 106 | return segmenters; 107 | } 108 | 109 | /** 110 | * 分词,获取下一个词元 111 | * @return Lexeme 词元对象 112 | * @throws IOException 113 | */ 114 | public synchronized Lexeme next()throws IOException{ 115 | Lexeme l = null; 116 | while((l = context.getNextLexeme()) == null ){ 117 | /* 118 | * 从reader中读取数据,填充buffer 119 | * 如果reader是分次读入buffer的,那么buffer要 进行移位处理 120 | * 移位处理上次读入的但未处理的数据 121 | */ 122 | int available = context.fillBuffer(this.input); 123 | if(available <= 0){ 124 | //reader已经读完 125 | context.reset(); 126 | return null; 127 | 128 | }else{ 129 | //初始化指针 130 | context.initCursor(); 131 | do{ 132 | //遍历子分词器 133 | for(ISegmenter segmenter : segmenters){ 134 | segmenter.analyze(context); 135 | } 136 | //字符缓冲区接近读完,需要读入新的字符 137 | if(context.needRefillBuffer()){ 138 | break; 139 | } 140 | //向前移动指针 141 | }while(context.moveCursor()); 142 | //重置子分词器,为下轮循环进行初始化 143 | for(ISegmenter segmenter : segmenters){ 144 | segmenter.reset(); 145 | } 146 | } 147 | //对分词进行歧义处理 148 | this.arbitrator.process(context, this.cfg.useSmart()); 149 | //将分词结果输出到结果集,并处理未切分的单个CJK字符 150 | context.outputToResult(); 151 | //记录本次分词的缓冲区位移 152 | context.markBufferOffset(); 153 | } 154 | return l; 155 | } 156 | 157 | /** 158 | * 重置分词器到初始状态 159 | * @param input 160 | */ 161 | public synchronized void reset(Reader input) { 162 | this.input = input; 163 | context.reset(); 164 | for(ISegmenter segmenter : segmenters){ 165 | segmenter.reset(); 166 | } 167 | } 168 | } 169 |
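Because next() and reset(Reader) are synchronized and reset() reinitializes the context and sub-segmenters, one IKSegmenter instance can be reused across inputs; this is how IKTokenizer above recycles its _IKImplement field. A minimal reuse sketch (hypothetical demo class; the sample strings are arbitrary):

```java
package org.wltea.analyzer.sample;

import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Hypothetical demo class, not in the original repo.
public class SegmenterReuseSketch {
    public static void main(String[] args) throws IOException {
        IKSegmenter seg = new IKSegmenter(new StringReader("近实时搜索"), true);
        while (seg.next() != null) {
            // drain the first input; next() returns null once the reader is exhausted
        }
        // reset() rewinds the analysis context, so the same instance can take new input.
        seg.reset(new StringReader("中文分词"));
        Lexeme l;
        while ((l = seg.next()) != null) {
            System.out.println(l.getLexemeText() + " [" + l.getBeginPosition() + "-" + l.getEndPosition() + "]");
        }
    }
}
```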
-------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/cfg/DefaultConfig.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | * 25 | */ 26 | package org.wltea.analyzer.cfg; 27 | 28 | import java.io.IOException; 29 | import java.io.InputStream; 30 | import java.util.ArrayList; 31 | import java.util.InvalidPropertiesFormatException; 32 | import java.util.List; 33 | import java.util.Properties; 34 | 35 | /** 36 | * Configuration 默认实现 37 | * 2012-5-8 38 | * 39 | */ 40 | public class DefaultConfig implements Configuration{ 41 | 42 | /* 43 | * 分词器默认字典路径 44 | */ 45 | private static final String PATH_DIC_MAIN = "org/wltea/analyzer/dic/main2012.dic"; 46 | private static final String PATH_DIC_QUANTIFIER = "org/wltea/analyzer/dic/quantifier.dic"; 47 | 48 | /* 49 | * 分词器配置文件路径 50 | */ 51 | private static final String FILE_NAME = "IKAnalyzer.cfg.xml"; 52 | //配置属性——扩展字典 53 | private static final String EXT_DICT = "ext_dict"; 54 | //配置属性——扩展停止词典 55 | private static final String EXT_STOP = "ext_stopwords"; 56 | 57 | private Properties props; 58 | /* 59 | * 是否使用smart方式分词 60 | */ 61 | private boolean useSmart; 62 | 63 | /** 64 | * 返回单例 65 | * @return Configuration单例 66 | */ 67 | public static Configuration getInstance(){ 68 | return new DefaultConfig(); 69 | } 70 | 71 | /* 72 | * 初始化配置文件 73 | */ 74 | private DefaultConfig(){ 75 | props = new Properties(); 76 | 77 | InputStream input = this.getClass().getClassLoader().getResourceAsStream(FILE_NAME); 78 | if(input != null){ 79 | try { 80 | props.loadFromXML(input); 81 | } catch (InvalidPropertiesFormatException e) { 82 | e.printStackTrace(); 83 | } catch (IOException e) { 84 | e.printStackTrace(); 85 | } 86 | } 87 | } 88 | 89 | 90 | /** 91 | * 返回useSmart标志位 92 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 93 | * @return useSmart 94 | */ 95 | public boolean useSmart() { 96 | return useSmart; 97 | } 98 | 99 | /** 100 | * 设置useSmart标志位 101 | * useSmart =true ,分词器使用智能切分策略, =false则使用细粒度切分 102 | * @param useSmart 103 | */ 104 | public void setUseSmart(boolean useSmart) { 105 | this.useSmart = useSmart; 106 | } 107 | 108 | /** 109 | * 获取主词典路径 110 | * 111 | * @return String 主词典路径 112 | */ 113 | public String getMainDictionary(){ 114 | return PATH_DIC_MAIN; 115 | } 116 | 117 | /** 118 | * 获取量词词典路径 119 | * @return String 量词词典路径 120 | */ 121 | public String getQuantifierDicionary(){ 122 | return PATH_DIC_QUANTIFIER; 
123 | } 124 | 125 | /** 126 | * 获取扩展字典配置路径 127 | * @return List<String> 相对类加载器的路径 128 | */ 129 | public List<String> getExtDictionarys(){ 130 | List<String> extDictFiles = new ArrayList<String>(2); 131 | String extDictCfg = props.getProperty(EXT_DICT); 132 | if(extDictCfg != null){ 133 | //使用;分割多个扩展字典配置 134 | String[] filePaths = extDictCfg.split(";"); 135 | if(filePaths != null){ 136 | for(String filePath : filePaths){ 137 | if(filePath != null && !"".equals(filePath.trim())){ 138 | extDictFiles.add(filePath.trim()); 139 | } 140 | } 141 | } 142 | } 143 | return extDictFiles; 144 | } 145 | 146 | 147 | /** 148 | * 获取扩展停止词典配置路径 149 | * @return List<String> 相对类加载器的路径 150 | */ 151 | public List<String> getExtStopWordDictionarys(){ 152 | List<String> extStopWordDictFiles = new ArrayList<String>(2); 153 | String extStopWordDictCfg = props.getProperty(EXT_STOP); 154 | if(extStopWordDictCfg != null){ 155 | //使用;分割多个扩展字典配置 156 | String[] filePaths = extStopWordDictCfg.split(";"); 157 | if(filePaths != null){ 158 | for(String filePath : filePaths){ 159 | if(filePath != null && !"".equals(filePath.trim())){ 160 | extStopWordDictFiles.add(filePath.trim()); 161 | } 162 | } 163 | } 164 | } 165 | return extStopWordDictFiles; 166 | } 167 | 168 | 169 | } 170 | -------------------------------------------------------------------------------- /pom.xml: --------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>org.danielli.xultimate</groupId>
  <artifactId>xultimate-searching</artifactId>
  <version>1.0.0-SNAPSHOT</version>
  <packaging>pom</packaging>

  <name>The X-Ultimate Searching Parent</name>
  <description>The X-Ultimate Searching parent project.</description>
  <url>https://github.com/daniellitoc/xultimate-searching/</url>

  <modules>
    <module>xultimate-lucene</module>
    <module>xultimate-ikanalyzer</module>
    <module>xultimate-solr</module>
  </modules>

  <properties>
    <jdk.version>1.7</jdk.version>
    <junit.version>4.11</junit.version>
    <org.springframework.version>3.2.3.RELEASE</org.springframework.version>
    <org.apache.lucene.version>4.5.0</org.apache.lucene.version>
    <org.apache.solr.version>4.5.0</org.apache.solr.version>
  </properties>

  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>${org.apache.lucene.version}</version>
      </dependency>
      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>${org.apache.lucene.version}</version>
      </dependency>
      <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>${org.apache.lucene.version}</version>
      </dependency>
      <dependency>
        <groupId>org.apache.solr</groupId>
        <artifactId>solr-solrj</artifactId>
        <version>${org.apache.solr.version}</version>
        <exclusions>
          <exclusion>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
          </exclusion>
        </exclusions>
      </dependency>
      <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>${junit.version}</version>
        <scope>test</scope>
      </dependency>
      <dependency>
        <groupId>org.springframework</groupId>
        <artifactId>spring-test</artifactId>
        <version>${org.springframework.version}</version>
        <scope>test</scope>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <build>
    <resources>
      <resource>
        <directory>src/main/resources</directory>
        <filtering>true</filtering>
      </resource>
    </resources>
    <testResources>
      <testResource>
        <directory>src/test/resources</directory>
        <filtering>true</filtering>
      </testResource>
    </testResources>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>${jdk.version}</source>
          <target>${jdk.version}</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-resources-plugin</artifactId>
        <configuration>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-source-plugin</artifactId>
        <executions>
          <execution>
            <id>attach-sources</id>
            <phase>verify</phase>
            <goals>
              <goal>jar-no-fork</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-javadoc-plugin</artifactId>
        <executions>
          <execution>
            <id>attach-javadocs</id>
            <goals>
              <goal>jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-jar-plugin</artifactId>
        <version>2.2</version>
        <executions>
          <execution>
            <goals>
              <goal>test-jar</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <distributionManagement>
    <repository>
      <id>nexus-releases</id>
      <name>Nexus Release Repository</name>
      <url>http://localhost/nexus/content/repositories/releases</url>
    </repository>
    <snapshotRepository>
      <id>nexus-snapshots</id>
      <name>Nexus Snapshots Repository</name>
      <url>http://localhost/nexus/content/repositories/snapshots</url>
    </snapshotRepository>
  </distributionManagement>
</project>
--------------------------------------------------------------------------------
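Since DefaultConfig above reads IKAnalyzer.cfg.xml via Properties.loadFromXML(), the config file follows the java.util.Properties XML format, with ext_dict and ext_stopwords entries holding ';'-separated classpath-relative paths. A minimal sketch of loading and inspecting the configuration (hypothetical demo class; the entry values shown in the comment are illustrative):

```java
package org.wltea.analyzer.sample;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;

// Hypothetical demo class, not in the original repo.
// Expected IKAnalyzer.cfg.xml shape (java.util.Properties XML format):
//   <?xml version="1.0" encoding="UTF-8"?>
//   <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
//   <properties>
//     <entry key="ext_dict">ext.dic</entry>
//     <entry key="ext_stopwords">stopword.dic</entry>
//   </properties>
public class ConfigSketch {
    public static void main(String[] args) {
        Configuration cfg = DefaultConfig.getInstance();
        cfg.setUseSmart(true);
        // The dictionary paths are resolved against the classloader by the Dictionary singleton.
        System.out.println("main dict: " + cfg.getMainDictionary());
        System.out.println("ext dicts: " + cfg.getExtDictionarys());
        System.out.println("stopword dicts: " + cfg.getExtStopWordDictionarys());
    }
}
```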
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | * 20 | * 源代码由林良益(linliangyi2005@gmail.com)提供 21 | * 版权声明 2012,乌龙茶工作室 22 | * provided by Linliangyi and copyright 2012 by Oolong studio 23 | * 24 | */ 25 | package org.wltea.analyzer.query; 26 | 27 | import java.io.IOException; 28 | import java.io.StringReader; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | 32 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 33 | import org.apache.lucene.queryparser.classic.ParseException; 34 | import org.apache.lucene.queryparser.classic.QueryParser; 35 | import org.apache.lucene.search.Query; 36 | import org.apache.lucene.util.Version; 37 | import org.wltea.analyzer.core.IKSegmenter; 38 | import org.wltea.analyzer.core.Lexeme; 39 | 40 | /** 41 | * Single Word Multi Char Query Builder 42 | * IK分词算法专用 43 | * @author linliangyi 44 | * 45 | */ 46 | public class SWMCQueryBuilder { 47 | 48 | /** 49 | * 生成SWMCQuery 50 | * @param fieldName 51 | * @param keywords 52 | * @param quickMode 53 | * @return Lucene Query 54 | */ 55 | public static Query create(String fieldName ,String keywords , boolean quickMode){ 56 | if(fieldName == null || keywords == null){ 57 | throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null."); 58 | } 59 | //1.对keywords进行分词处理 60 | List<Lexeme> lexemes = doAnalyze(keywords); 61 | //2.根据分词结果,生成SWMCQuery 62 | Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode); 63 | return _SWMCQuery; 64 | } 65 | 66 | /** 67 | * 分词切分,并返回词元链表 68 | * @param keywords 69 | * @return 70 | */ 71 | private static List<Lexeme> doAnalyze(String keywords){ 72 | List<Lexeme> lexemes = new ArrayList<Lexeme>(); 73 | IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true); 74 | try{ 75 | Lexeme l = null; 76 | while( (l = ikSeg.next()) != null){ 77 | lexemes.add(l); 78 | } 79 | }catch(IOException e){ 80 | e.printStackTrace(); 81 | } 82 | return lexemes; 83 | } 84 | 85 | 86 | /** 87 | * 根据分词结果生成SWMC搜索 88 | * @param fieldName 89 | * @param lexemes 90 | * @param quickMode 91 | * @return 92 | */ 93 | private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){ 94 | //构造SWMC的查询表达式 95 | StringBuffer keywordBuffer = new StringBuffer(); 96 | //精简的SWMC的查询表达式 97 | StringBuffer keywordBuffer_Short = new StringBuffer(); 98 | //记录最后词元长度 99 | int lastLexemeLength = 0; 100 | //记录最后词元结束位置 101 | int lastLexemeEnd = -1; 102 | 103 | int shortCount = 0; 104 | int totalCount = 0; 105 | for(Lexeme l : lexemes){ 106 | totalCount += l.getLength(); 107 | //精简表达式 108 |
if(l.getLength() > 1){ 109 | keywordBuffer_Short.append(' ').append(l.getLexemeText()); 110 | shortCount += l.getLength(); 111 | } 112 | 113 | if(lastLexemeLength == 0){ 114 | keywordBuffer.append(l.getLexemeText()); 115 | }else if(lastLexemeLength == 1 && l.getLength() == 1 116 | && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并) 117 | keywordBuffer.append(l.getLexemeText()); 118 | }else{ 119 | keywordBuffer.append(' ').append(l.getLexemeText()); 120 | 121 | } 122 | lastLexemeLength = l.getLength(); 123 | lastLexemeEnd = l.getEndPosition(); 124 | } 125 | 126 | //借助lucene queryparser 生成SWMC Query 127 | QueryParser qp = new QueryParser(Version.LUCENE_45, fieldName, new StandardAnalyzer(Version.LUCENE_45)); 128 | qp.setDefaultOperator(QueryParser.AND_OPERATOR); 129 | qp.setAutoGeneratePhraseQueries(true); 130 | 131 | if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){ 132 | try { 133 | //System.out.println(keywordBuffer.toString()); 134 | Query q = qp.parse(keywordBuffer_Short.toString()); 135 | return q; 136 | } catch (ParseException e) { 137 | e.printStackTrace(); 138 | } 139 | 140 | }else{ 141 | if(keywordBuffer.length() > 0){ 142 | try { 143 | //System.out.println(keywordBuffer.toString()); 144 | Query q = qp.parse(keywordBuffer.toString()); 145 | return q; 146 | } catch (ParseException e) { 147 | e.printStackTrace(); 148 | } 149 | } 150 | } 151 | return null; 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /xultimate-ikanalyzer/src/test/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * IK 中文分词 版本 5.0 3 | * IK Analyzer release 5.0 4 | * 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/test/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.sample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Demo of Lucene indexing and search using IKAnalyzer
 * 2012-3-2
 *
 * Written against the Lucene 4.x API.
 */
public class LuceneIndexAndSearchDemo {

    /**
     * Simulation: creates an index holding a single document and searches it.
     * @param args
     */
    public static void main(String[] args) {
        // Lucene document field name
        String fieldName = "text";
        // Content to index
        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

        // Instantiate the IKAnalyzer tokenizer (smart mode)
        Analyzer analyzer = new IKAnalyzer(true);

        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        IndexSearcher isearcher = null;
        try {
            // Build an in-memory index
            directory = new RAMDirectory();

            // Configure the IndexWriter
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_45, analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory, iwConfig);
            // Write the index
            Document doc = new Document();
            doc.add(new StringField("ID", "10000", Field.Store.YES));
            doc.add(new TextField(fieldName, text, Field.Store.YES));
            iwriter.addDocument(doc);
            iwriter.close();

            // Search **********************************
            // Instantiate the searcher
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "中文分词工具包";
            // Build the Query with the QueryParser
            QueryParser qp = new QueryParser(Version.LUCENE_45, fieldName, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // Fetch the 5 highest-scoring documents
            TopDocs topDocs = isearcher.search(query, 5);
            System.out.println("Hits: " + topDocs.totalHits);
            // Print the results. Iterate over scoreDocs rather than totalHits:
            // at most 5 documents were returned, so the original loop bound of
            // totalHits could run past the end of the array.
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (int i = 0; i < scoreDocs.length; i++) {
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("Content: " + targetDoc.toString());
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally {
            if (ireader != null) {
                try {
                    ireader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (directory != null) {
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
--------------------------------------------------------------------------------
/xultimate-solr/src/test/resources/applicationContext-service-solr-client.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump. Per the README and SolrServerTest below, this file defines the Spring beans for the SolrJ clients: httpSolrServer, concurrentUpdateSolrServer, lbHttpSolrServer, and cloudSolrServer.]
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/resources/applicationContext-dao-generic.xml:
--------------------------------------------------------------------------------
[XML content not preserved in this dump. Spring data-source and MyBatis DAO wiring; the only surviving fragments are the property keys datasource.solrDb.jdbc.password and datasource.solrDb.jdbc.username.]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# xultimate-searching #

A search service built on IKAnalyzer and Solr. Dictionaries are maintained in a database instead of in files.

## xultimate-solr ##

* SolrCloud-based showcases for SolrJ, including Spring configurations for HttpSolrServer, ConcurrentUpdateSolrServer, LBHttpSolrServer, and CloudSolrServer.

#### SolrCloud: create a collection with automatic sharding, then assign replicas manually ####

    http://192.168.2.150:8080/solr/admin/collections?action=CREATE&name=mycollection&numShards=4&replicationFactor=1&maxShardsPerNode=4
    // Manually create the replicas for 192.168.2.150. Assume shard1 lives on 192.168.1.150
    http://192.168.2.151:8080/solr/admin/cores?action=CREATE&name=mycollection_shard1_replica_2&collection=mycollection&shard=shard1
    http://192.168.2.152:8080/solr/admin/cores?action=CREATE&name=mycollection_shard1_replica_3&collection=mycollection&shard=shard1
    http://192.168.2.153:8080/solr/admin/cores?action=CREATE&name=mycollection_shard1_replica_4&collection=mycollection&shard=shard1
    // Manually create the replicas for 192.168.2.153. Assume shard2 lives on 192.168.1.153
    http://192.168.2.150:8080/solr/admin/cores?action=CREATE&name=mycollection_shard2_replica_2&collection=mycollection&shard=shard2
    http://192.168.2.151:8080/solr/admin/cores?action=CREATE&name=mycollection_shard2_replica_3&collection=mycollection&shard=shard2
    http://192.168.2.152:8080/solr/admin/cores?action=CREATE&name=mycollection_shard2_replica_4&collection=mycollection&shard=shard2
    // Manually create the replicas for 192.168.2.152. Assume shard3 lives on 192.168.1.152
    http://192.168.2.150:8080/solr/admin/cores?action=CREATE&name=mycollection_shard3_replica_2&collection=mycollection&shard=shard3
    http://192.168.2.151:8080/solr/admin/cores?action=CREATE&name=mycollection_shard3_replica_3&collection=mycollection&shard=shard3
    http://192.168.2.153:8080/solr/admin/cores?action=CREATE&name=mycollection_shard3_replica_4&collection=mycollection&shard=shard3
    // Manually create the replicas for 192.168.2.151. Assume shard4 lives on 192.168.1.151
    http://192.168.2.150:8080/solr/admin/cores?action=CREATE&name=mycollection_shard4_replica_2&collection=mycollection&shard=shard4
    http://192.168.2.152:8080/solr/admin/cores?action=CREATE&name=mycollection_shard4_replica_3&collection=mycollection&shard=shard4
    http://192.168.2.153:8080/solr/admin/cores?action=CREATE&name=mycollection_shard4_replica_4&collection=mycollection&shard=shard4

#### SolrCloud: delete a collection ####

    http://192.168.2.150:8080/solr/admin/collections?action=DELETE&name=mycollection

#### SolrCloud: create a collection with manual sharding (all numShards shards must be created, or the collection is unusable) ####

    http://192.168.2.150:8080/solr/admin/cores?action=CREATE&name=mycollection2_shard1_replica_1&collection=mycollection2&shard=shard1&numShards=4
    http://192.168.2.151:8080/solr/admin/cores?action=CREATE&name=mycollection2_shard2_replica_1&collection=mycollection2&shard=shard2
    http://192.168.2.152:8080/solr/admin/cores?action=CREATE&name=mycollection2_shard3_replica_1&collection=mycollection2&shard=shard3
    http://192.168.2.153:8080/solr/admin/cores?action=CREATE&name=mycollection2_shard4_replica_1&collection=mycollection2&shard=shard4

#### SolrCloud: split an existing shard ####

    http://192.168.2.150:8080/solr/admin/collections?action=CREATE&name=mycollection3&numShards=1&replicationFactor=1&maxShardsPerNode=4

or

    http://192.168.2.150:8080/solr/admin/cores?action=CREATE&name=mycollection3_shard1_replica_1&collection=mycollection3&shard=shard1&numShards=1
    http://192.168.2.150:8080/solr/admin/collections?action=SPLITSHARD&collection=mycollection3&shard=shard1  // Only automatically created shards can be split.
    http://192.168.2.153:8080/solr/admin/cores?action=UNLOAD&core=mycollection3_shard1_replica1  // mycollection3_shard1_replica1 lives on 192.168.2.153.

## xultimate-ikanalyzer ##

* Extends the Dictionary class with hooks for stopwords and quantifiers.
* ExtKeywordInitializer maintains the IKAnalyzer extension dictionary through a database.
* StopKeywordInitializer maintains the IKAnalyzer stopword dictionary through a database.
* SynonymFilterFactory maintains the Solr synonym dictionary through a database.
* IKTokenizerFactory works around IKAnalyzer's inability to apply useSmart at query time inside Solr.

#### Loading extension words, stopwords, and synonyms from files ####

* Export the project as a jar and copy it to solr/WEB-INF/lib/.
* Copy ext.dic, IKAnalyzer.cfg.xml, and stopword.dic from src/test/resources to solr/WEB-INF/classes/.
* Edit $SOLR_HOME/collection1/conf/synonyms.txt.
* Edit $SOLR_HOME/collection1/conf/schema.xml and add the field type:

    [fieldType definition not preserved in this dump]

#### Loading extension words, stopwords, and synonyms from a database ####

* Export the project as a jar and copy it to solr/WEB-INF/lib/.
* Copy the dependency jars of xultimate-ikanalyzer to solr/WEB-INF/lib/.
* Remove slf4j-log4j12-1.6.6.jar, jcl-over-slf4j-1.6.6.jar, jul-to-slf4j-1.6.6.jar, log4j-1.2.16.jar, slf4j-api-1.6.6.jar, commons-lang-2.4.jar, and log4j.properties.
* Copy databases.properties from src/test/resources to solr/WEB-INF/classes/.
* Edit solr/WEB-INF/web.xml and add:

    <context-param>
        <param-name>contextConfigLocation</param-name>
        <param-value>classpath:applicationContext-service-config.xml, classpath:applicationContext-service-crypto.xml, classpath:applicationContext-dao-base.xml, classpath:applicationContext-dao-generic.xml, classpath:applicationContext-service-generic.xml</param-value>
    </context-param>
    <listener>
        <listener-class>org.springframework.web.context.ContextLoaderListener</listener-class>
    </listener>

* Edit $SOLR_HOME/collection1/conf/schema.xml and add the field type:

    [fieldType definition not preserved in this dump]

## xultimate-lucene ##

* Three showcases of Lucene near-real-time search, covering a custom Collector, a custom Filter, and a custom Sort.
* Beans are managed through Spring.
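For a quick smoke test against a cluster like the one above, a minimal SolrJ sketch of the CloudSolrServer client that the showcases wire up through Spring. The ZooKeeper address and collection name are illustrative assumptions, not project configuration:

    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.CloudSolrServer;
    import org.apache.solr.client.solrj.response.QueryResponse;

    public class CloudClientSketch {
        public static void main(String[] args) throws Exception {
            // Assumed ZooKeeper ensemble address.
            CloudSolrServer server = new CloudSolrServer("192.168.2.150:2181");
            server.setDefaultCollection("mycollection");
            QueryResponse rsp = server.query(new SolrQuery("*:*"));
            System.out.println("numFound = " + rsp.getResults().getNumFound());
            server.shutdown();
        }
    }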
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/QuickSortSet.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.core;

/**
 * Sorted lexeme set dedicated to the IK tokenizer
 */
class QuickSortSet {
    // Head of the linked list
    private Cell head;
    // Tail of the linked list
    private Cell tail;
    // Actual size of the linked list
    private int size;

    QuickSortSet() {
        this.size = 0;
    }

    /**
     * Adds a lexeme to the set.
     * @param lexeme
     */
    boolean addLexeme(Lexeme lexeme) {
        Cell newCell = new Cell(lexeme);
        if (this.size == 0) {
            this.head = newCell;
            this.tail = newCell;
            this.size++;
            return true;

        } else {
            if (this.tail.compareTo(newCell) == 0) { // equal to the tail lexeme; not added
                return false;

            } else if (this.tail.compareTo(newCell) < 0) { // append at the tail
                this.tail.next = newCell;
                newCell.prev = this.tail;
                this.tail = newCell;
                this.size++;
                return true;

            } else if (this.head.compareTo(newCell) > 0) { // insert at the head
                this.head.prev = newCell;
                newCell.next = this.head;
                this.head = newCell;
                this.size++;
                return true;

            } else {
                // Walk backwards from the tail
                Cell index = this.tail;
                while (index != null && index.compareTo(newCell) > 0) {
                    index = index.prev;
                }
                if (index.compareTo(newCell) == 0) { // duplicate of an existing lexeme; not added
                    return false;

                } else if (index.compareTo(newCell) < 0) { // insert at some position inside the list
                    newCell.prev = index;
                    newCell.next = index.next;
                    index.next.prev = newCell;
                    index.next = newCell;
                    this.size++;
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Returns the head element without removing it.
     */
    Lexeme peekFirst() {
        if (this.head != null) {
            return this.head.lexeme;
        }
        return null;
    }

    /**
     * Removes and returns the first element of the set.
     * @return Lexeme
     */
    Lexeme pollFirst() {
        if (this.size == 1) {
            Lexeme first = this.head.lexeme;
            this.head = null;
            this.tail = null;
            this.size--;
            return first;
        } else if (this.size > 1) {
            Lexeme first = this.head.lexeme;
            this.head = this.head.next;
            this.size--;
            return first;
        } else {
            return null;
        }
    }

    /**
     * Returns the tail element without removing it.
     */
    Lexeme peekLast() {
        if (this.tail != null) {
            return this.tail.lexeme;
        }
        return null;
    }

    /**
     * Removes and returns the last element of the set.
     * @return Lexeme
     */
    Lexeme pollLast() {
        if (this.size == 1) {
            Lexeme last = this.head.lexeme;
            this.head = null;
            this.tail = null;
            this.size--;
            return last;

        } else if (this.size > 1) {
            Lexeme last = this.tail.lexeme;
            this.tail = this.tail.prev;
            this.size--;
            return last;

        } else {
            return null;
        }
    }

    /**
     * Returns the size of the set.
     */
    int size() {
        return this.size;
    }

    /**
     * Checks whether the set is empty.
     */
    boolean isEmpty() {
        return this.size == 0;
    }

    /**
     * Returns the head cell of the lexeme chain.
     */
    Cell getHead() {
        return this.head;
    }
    /**
     * QuickSortSet cell (linked-list node).
     */
    class Cell implements Comparable<Cell> {
        private Cell prev;
        private Cell next;
        private Lexeme lexeme;

        Cell(Lexeme lexeme) {
            if (lexeme == null) {
                throw new IllegalArgumentException("lexeme must not be null");
            }
            this.lexeme = lexeme;
        }

        public int compareTo(Cell o) {
            return this.lexeme.compareTo(o.lexeme);
        }

        public Cell getPrev() {
            return this.prev;
        }

        public Cell getNext() {
            return this.next;
        }

        public Lexeme getLexeme() {
            return this.lexeme;
        }
    }
}
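Note: a minimal sketch of the ordering and deduplication behavior. It is hypothetical, assumes it is compiled inside the org.wltea.analyzer.core package (QuickSortSet and its addLexeme method are package-private), and uses the public Lexeme constructor defined later in this dump:

    package org.wltea.analyzer.core;

    public class QuickSortSetSketch {
        public static void main(String[] args) {
            QuickSortSet set = new QuickSortSet();
            // Lexemes sort by begin position first, then by length (longer first).
            set.addLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD));
            set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD));
            set.addLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD)); // duplicate; rejected
            System.out.println(set.size());                  // 2
            System.out.println(set.pollFirst().getBegin());  // 0
            System.out.println(set.pollFirst().getBegin());  // 2
        }
    }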
--------------------------------------------------------------------------------
/xultimate-solr/src/test/java/org/danielli/xultimate/solr/SolrServerTest.java:
--------------------------------------------------------------------------------
package org.danielli.xultimate.solr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import javax.annotation.Resource;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.beans.Field;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.danielli.xultimate.util.math.RandomNumberUtils;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = { "classpath:/applicationContext-service-solr-client.xml" })
public class SolrServerTest {

    // @Resource(name = "httpSolrServer")
    // @Resource(name = "lbHttpSolrServer")
    @Resource(name = "cloudSolrServer")
    private SolrServer querySolrServer;

    // @Resource(name = "concurrentUpdateSolrServer")
    // @Resource(name = "lbHttpSolrServer")
    @Resource(name = "cloudSolrServer")
    private SolrServer updateSolrServer;

    @Test
    public void testAddDocument() throws SolrServerException, IOException {
        addDocument();
        queryReturnDocument();
        deleteAll();
        queryReturnDocument();
    }

    private void addDocument() throws SolrServerException, IOException {
        List<SolrInputDocument> solrInputDocuments = new ArrayList<>();
        for (int i = 0; i < 100; i++) {
            SolrInputDocument document = new SolrInputDocument();
            document.addField("id", "id" + RandomNumberUtils.nextInt(1000), 1.0f);
            document.addField("name", "doc" + i, 1.0f);
            document.addField("price", RandomNumberUtils.nextInt(1000));
            solrInputDocuments.add(document);
        }

        updateSolrServer.add(solrInputDocuments); // Add the documents to Solr.
        updateSolrServer.commit(); // Do a commit, wait flush, wait searcher.
    }

    public void deleteAll() throws SolrServerException, IOException {
        updateSolrServer.deleteByQuery("*:*"); // CAUTION: deletes everything!
        updateSolrServer.commit();
    }

    private void queryReturnDocument() throws SolrServerException {
        SolrQuery query = new SolrQuery();
        query.setQuery("*:*");
        query.addSort("price", SolrQuery.ORDER.asc);
        QueryResponse rsp = querySolrServer.query(query);
        SolrDocumentList documentList = rsp.getResults();
        for (SolrDocument resultDoc : documentList) {
            String id = (String) resultDoc.getFieldValue("id"); // id is the uniqueKey field
            System.out.println(id);
        }
        System.out.println("Found " + documentList.getNumFound() + " documents");
    }

    // @Test
    public void testAddBean() throws SolrServerException, IOException {
        addBean();
        queryReturnEntity();
        deleteAll();
        queryReturnEntity();
    }

    private void addBean() throws IOException, SolrServerException {
        Item item1 = new Item();
        item1.setId("id" + RandomNumberUtils.nextInt(1000));
        item1.setFeatures(Arrays.asList("aaa", "bbb", "ccc"));

        Item item2 = new Item();
        item2.setId("id" + RandomNumberUtils.nextInt(1000));
        item2.setFeatures(Arrays.asList("ddd", "eee", "fff"));

        updateSolrServer.addBeans(Arrays.asList(item1, item2)); // Add the beans to Solr
        updateSolrServer.commit(); // Do a commit
    }

    public static class Item {
        @Field
        private String id;

        @Field("cat")
        private String[] categories;

        @Field
        private List<String> features;

        public String getId() {
            return id;
        }

        public void setId(String id) {
            this.id = id;
        }

        public String[] getCategories() {
            return categories;
        }

        // The @Field annotation can also be applied to setter methods
        // @Field("cat")
        public void setCategories(String[] categories) {
            this.categories = categories;
        }

        public List<String> getFeatures() {
            return features;
        }

        public void setFeatures(List<String> features) {
            this.features = features;
        }
    }

    private void queryReturnEntity() throws SolrServerException {
        SolrQuery query = new SolrQuery();
        query.setQuery("*:*");
        query.addSort("price", SolrQuery.ORDER.asc);
        QueryResponse rsp = querySolrServer.query(query);
        List<Item> beans = rsp.getBeans(Item.class);
        for (Item bean : beans) {
            String id = bean.getId(); // id is the uniqueKey field
            System.out.println(id);
        }
    }

    // @Test
    public void testQueryHighlighting() throws SolrServerException, IOException {
        addBean();
        queryHighlighting();
        deleteAll();
        queryHighlighting();
    }

    private void queryHighlighting() throws SolrServerException {
        SolrQuery query = new SolrQuery();
        query.setQuery("bbb");

        query.setHighlight(true).setHighlightSnippets(1); // set other params as needed
        query.setParam("hl.fl", "features"); // set the highlighted field
        QueryResponse queryResponse = querySolrServer.query(query);

        SolrDocumentList documentList = queryResponse.getResults();
        for (SolrDocument resultDoc : documentList) {
            Collection<Object> features = resultDoc.getFieldValues("features");
            String id = (String) resultDoc.getFieldValue("id"); // id is the uniqueKey field
            if (queryResponse.getHighlighting().get(id) != null) {
                List<String> highlightSnippets = queryResponse.getHighlighting().get(id).get("features");
                System.out.println(Arrays.toString(highlightSnippets.toArray()));
            } else {
                System.out.println(Arrays.toString(features.toArray()));
            }
        }

        List<Item> beans = queryResponse.getBeans(Item.class);
        for (Item bean : beans) {
            String id = bean.getId();
            List<String> features = bean.getFeatures();
            if (queryResponse.getHighlighting().get(id) != null) {
                List<String> highlightSnippets = queryResponse.getHighlighting().get(id).get("features");
                System.out.println(Arrays.toString(highlightSnippets.toArray()));
            } else {
                System.out.println(Arrays.toString(features.toArray()));
            }
        }
    }
}
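Note: the commented-out @Resource names in the test hint at the alternative SolrJ clients. A minimal sketch of ConcurrentUpdateSolrServer for bulk updates; the URL, queue size, and thread count are illustrative assumptions, and the project injects its client via Spring instead:

    import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
    import org.apache.solr.common.SolrInputDocument;

    public class ConcurrentUpdateSketch {
        public static void main(String[] args) throws Exception {
            // Buffers up to 100 documents and sends them with 4 background threads.
            ConcurrentUpdateSolrServer updater =
                    new ConcurrentUpdateSolrServer("http://192.168.2.150:8080/solr/mycollection", 100, 4);
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", "id1");
            updater.add(doc);              // queued, sent asynchronously
            updater.commit();
            updater.blockUntilFinished();  // wait for the queue to drain
            updater.shutdown();
        }
    }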
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.core;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;

/**
 * Sub-segmenter for Chinese numerals and quantifiers
 */
class CN_QuantifierSegmenter implements ISegmenter {

    // Sub-segmenter label
    static final String SEGMENTER_NAME = "QUAN_SEGMENTER";

    // Chinese numeral characters
    private static String Chn_Num = "一二两三四五六七八九十零壹贰叁肆伍陆柒捌玖拾百千万亿拾佰仟萬億兆卅廿"; // Cnum
    private static Set<Character> ChnNumberChars = new HashSet<Character>();
    static {
        char[] ca = Chn_Num.toCharArray();
        for (char nChar : ca) {
            ChnNumberChars.add(nChar);
        }
    }

    /*
     * Start position of the numeral lexeme; doubles as the segmenter state flag:
     * nStart > -1 means this segmenter is currently consuming characters.
     */
    private int nStart;
    /*
     * End position of the numeral lexeme:
     * where the last valid numeral character in the lexeme ends.
     */
    private int nEnd;

    // Queue of quantifier hits still being matched
    private List<Hit> countHits;

    CN_QuantifierSegmenter() {
        nStart = -1;
        nEnd = -1;
        this.countHits = new LinkedList<Hit>();
    }

    /**
     * Segmentation entry point.
     */
    public void analyze(AnalyzeContext context) {
        // Process Chinese numerals
        this.processCNumber(context);
        // Process Chinese quantifiers
        this.processCount(context);

        // Decide whether the buffer must stay locked
        if (this.nStart == -1 && this.nEnd == -1 && countHits.isEmpty()) {
            // Unlock the buffer
            context.unlockBuffer(SEGMENTER_NAME);
        } else {
            context.lockBuffer(SEGMENTER_NAME);
        }
    }

    /**
     * Resets the segmenter state.
     */
    public void reset() {
        nStart = -1;
        nEnd = -1;
        countHits.clear();
    }

    /**
     * Processes Chinese numerals.
     */
    private void processCNumber(AnalyzeContext context) {
        if (nStart == -1 && nEnd == -1) { // initial state
            if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
                    && ChnNumberChars.contains(context.getCurrentChar())) {
                // Record the start and end positions of the numeral
                nStart = context.getCursor();
                nEnd = context.getCursor();
            }
        } else { // in-progress state
            if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()
                    && ChnNumberChars.contains(context.getCurrentChar())) {
                // Extend the end position of the numeral
                nEnd = context.getCursor();
            } else {
                // Emit the numeral
                this.outputNumLexeme(context);
                // Reset the start and end pointers
                nStart = -1;
                nEnd = -1;
            }
        }

        // Buffer exhausted with a numeral still pending
        if (context.isBufferConsumed()) {
            if (nStart != -1 && nEnd != -1) {
                // Emit the numeral
                outputNumLexeme(context);
                // Reset the start and end pointers
                nStart = -1;
                nEnd = -1;
            }
        }
    }

    /**
     * Processes Chinese quantifiers.
     * @param context
     */
    private void processCount(AnalyzeContext context) {
        // Check whether a quantifier scan should start
        if (!this.needCountScan(context)) {
            return;
        }

        if (CharacterUtil.CHAR_CHINESE == context.getCurrentCharType()) {

            // Process the pending hits in countHits first
            if (!this.countHits.isEmpty()) {
                // Process the hit queue
                Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
                for (Hit hit : tmpArray) {
                    hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor(), hit);
                    if (hit.isMatch()) {
                        // Emit the current word
                        Lexeme newLexeme = new Lexeme(context.getBufferOffset(), hit.getBegin(), context.getCursor() - hit.getBegin() + 1, Lexeme.TYPE_COUNT);
                        context.addLexeme(newLexeme);

                        if (!hit.isPrefix()) { // not a word prefix; the hit cannot match further, remove it
                            this.countHits.remove(hit);
                        }

                    } else if (hit.isUnmatch()) {
                        // The hit is not a word; remove it
                        this.countHits.remove(hit);
                    }
                }
            }

            //*********************************
            // Single-character match at the current cursor position
            Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
            if (singleCharHit.isMatch()) { // the character is itself a quantifier word
                // Emit the current word
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), context.getCursor(), 1, Lexeme.TYPE_COUNT);
                context.addLexeme(newLexeme);

                // It is also a word prefix
                if (singleCharHit.isPrefix()) {
                    // Prefix matches go into the hit list
                    this.countHits.add(singleCharHit);
                }
            } else if (singleCharHit.isPrefix()) { // the character is a quantifier prefix
                // Prefix matches go into the hit list
                this.countHits.add(singleCharHit);
            }

        } else {
            // The input is not a Chinese character:
            // discard the unfinished quantifiers
            this.countHits.clear();
        }

        // Buffer exhausted with quantifiers still pending:
        // discard the unfinished quantifiers
        if (context.isBufferConsumed()) {
            this.countHits.clear();
        }
    }

    /**
     * Checks whether a quantifier scan is needed.
     */
    private boolean needCountScan(AnalyzeContext context) {
        if ((nStart != -1 && nEnd != -1) || !countHits.isEmpty()) {
            // Currently processing a Chinese numeral or a quantifier
            return true;
        } else {
            // Look for an adjacent numeral
            if (!context.getOrgLexemes().isEmpty()) {
                Lexeme l = context.getOrgLexemes().peekLast();
                if (Lexeme.TYPE_CNUM == l.getLexemeType() || Lexeme.TYPE_ARABIC == l.getLexemeType()) {
                    if (l.getBegin() + l.getLength() == context.getCursor()) {
                        return true;
                    }
                }
            }
        }
        return false;
    }
    /**
     * Emits the numeral lexeme to the result set.
     * @param context
     */
    private void outputNumLexeme(AnalyzeContext context) {
        if (nStart > -1 && nEnd > -1) {
            // Emit the numeral
            Lexeme newLexeme = new Lexeme(context.getBufferOffset(), nStart, nEnd - nStart + 1, Lexeme.TYPE_CNUM);
            context.addLexeme(newLexeme);
        }
    }

}
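Note: the segmenter's effect can be observed by running IKSegmenter (whose constructor is used the same way in SWMCQueryBuilder above) over a numeral-plus-quantifier phrase. A hedged sketch; the exact lexemes depend on the dictionaries and on smart-mode merging, which may combine numeral and quantifier into a single TYPE_CQUAN lexeme:

    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class QuantifierSketch {
        public static void main(String[] args) throws Exception {
            IKSegmenter seg = new IKSegmenter(new StringReader("三十个人"), true);
            for (Lexeme l = seg.next(); l != null; l = seg.next()) {
                // Expect roughly a TYPE_CNUM lexeme for "三十" and a TYPE_COUNT
                // lexeme for "个", possibly merged into TYPE_CQUAN in smart mode.
                System.out.println(l);
            }
        }
    }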
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/LexemePath.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.core;

/**
 * Lexeme chain (a candidate segmentation path)
 */
class LexemePath extends QuickSortSet implements Comparable<LexemePath> {

    // Start position of the path
    private int pathBegin;
    // End position of the path
    private int pathEnd;
    // Effective character length covered by the chain
    private int payloadLength;

    LexemePath() {
        this.pathBegin = -1;
        this.pathEnd = -1;
        this.payloadLength = 0;
    }

    /**
     * Appends a lexeme that crosses (overlaps) this path.
     * @param lexeme
     */
    boolean addCrossLexeme(Lexeme lexeme) {
        if (this.isEmpty()) {
            this.addLexeme(lexeme);
            this.pathBegin = lexeme.getBegin();
            this.pathEnd = lexeme.getBegin() + lexeme.getLength();
            this.payloadLength += lexeme.getLength();
            return true;

        } else if (this.checkCross(lexeme)) {
            this.addLexeme(lexeme);
            if (lexeme.getBegin() + lexeme.getLength() > this.pathEnd) {
                this.pathEnd = lexeme.getBegin() + lexeme.getLength();
            }
            this.payloadLength = this.pathEnd - this.pathBegin;
            return true;

        } else {
            return false;
        }
    }

    /**
     * Appends a lexeme that does not cross this path.
     * @param lexeme
     */
    boolean addNotCrossLexeme(Lexeme lexeme) {
        if (this.isEmpty()) {
            this.addLexeme(lexeme);
            this.pathBegin = lexeme.getBegin();
            this.pathEnd = lexeme.getBegin() + lexeme.getLength();
            this.payloadLength += lexeme.getLength();
            return true;

        } else if (this.checkCross(lexeme)) {
            return false;

        } else {
            this.addLexeme(lexeme);
            this.payloadLength += lexeme.getLength();
            Lexeme head = this.peekFirst();
            this.pathBegin = head.getBegin();
            Lexeme tail = this.peekLast();
            this.pathEnd = tail.getBegin() + tail.getLength();
            return true;
        }
    }

    /**
     * Removes and returns the tail lexeme.
     */
    Lexeme removeTail() {
        Lexeme tail = this.pollLast();
        if (this.isEmpty()) {
            this.pathBegin = -1;
            this.pathEnd = -1;
            this.payloadLength = 0;
        } else {
            this.payloadLength -= tail.getLength();
            Lexeme newTail = this.peekLast();
            this.pathEnd = newTail.getBegin() + newTail.getLength();
        }
        return tail;
    }

    /**
     * Checks whether a lexeme's position crosses this path (an ambiguous cut).
     * @param lexeme
     */
    boolean checkCross(Lexeme lexeme) {
        return (lexeme.getBegin() >= this.pathBegin && lexeme.getBegin() < this.pathEnd)
                || (this.pathBegin >= lexeme.getBegin() && this.pathBegin < lexeme.getBegin() + lexeme.getLength());
    }

    int getPathBegin() {
        return pathBegin;
    }

    int getPathEnd() {
        return pathEnd;
    }

    /**
     * Returns the effective word length of the path.
     */
    int getPayloadLength() {
        return this.payloadLength;
    }

    /**
     * Returns the span length of the path.
     */
    int getPathLength() {
        return this.pathEnd - this.pathBegin;
    }

    /**
     * X weight (product of the lexeme lengths).
     */
    int getXWeight() {
        int product = 1;
        Cell c = this.getHead();
        while (c != null && c.getLexeme() != null) {
            product *= c.getLexeme().getLength();
            c = c.getNext();
        }
        return product;
    }

    /**
     * Positional weight of the lexemes.
     */
    int getPWeight() {
        int pWeight = 0;
        int p = 0;
        Cell c = this.getHead();
        while (c != null && c.getLexeme() != null) {
            p++;
            pWeight += p * c.getLexeme().getLength();
            c = c.getNext();
        }
        return pWeight;
    }

    LexemePath copy() {
        LexemePath theCopy = new LexemePath();
        theCopy.pathBegin = this.pathBegin;
        theCopy.pathEnd = this.pathEnd;
        theCopy.payloadLength = this.payloadLength;
        Cell c = this.getHead();
        while (c != null && c.getLexeme() != null) {
            theCopy.addLexeme(c.getLexeme());
            c = c.getNext();
        }
        return theCopy;
    }

    public int compareTo(LexemePath o) {
        // Longer effective text length wins
        if (this.payloadLength > o.payloadLength) {
            return -1;
        } else if (this.payloadLength < o.payloadLength) {
            return 1;
        } else {
            // Fewer lexemes is better
            if (this.size() < o.size()) {
                return -1;
            } else if (this.size() > o.size()) {
                return 1;
            } else {
                // A larger path span is better
                if (this.getPathLength() > o.getPathLength()) {
                    return -1;
                } else if (this.getPathLength() < o.getPathLength()) {
                    return 1;
                } else {
                    // Statistically, reverse segmentation beats forward segmentation,
                    // so the path that ends later takes priority
                    if (this.pathEnd > o.pathEnd) {
                        return -1;
                    } else if (pathEnd < o.pathEnd) {
                        return 1;
                    } else {
                        // More evenly sized lexemes are better
                        if (this.getXWeight() > o.getXWeight()) {
                            return -1;
                        } else if (this.getXWeight() < o.getXWeight()) {
                            return 1;
                        } else {
                            // Compare positional weight
                            if (this.getPWeight() > o.getPWeight()) {
                                return -1;
                            } else if (this.getPWeight() < o.getPWeight()) {
                                return 1;
                            }
                        }
                    }
                }
            }
        }
        return 0;
    }

    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("pathBegin  : ").append(pathBegin).append("\r\n");
        sb.append("pathEnd  : ").append(pathEnd).append("\r\n");
        sb.append("payloadLength  : ").append(payloadLength).append("\r\n");
        Cell head = this.getHead();
        while (head != null) {
            sb.append("lexeme : ").append(head.getLexeme()).append("\r\n");
            head = head.getNext();
        }
        return sb.toString();
    }

}
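Note: a minimal sketch of how compareTo arbitrates between two ambiguous paths over the same four-character span. It is hypothetical, assumes compilation inside org.wltea.analyzer.core (LexemePath is package-private), and the positions are illustrative:

    package org.wltea.analyzer.core;

    public class LexemePathSketch {
        public static void main(String[] args) {
            // Path A: two two-char lexemes covering chars 0-1 and 2-3.
            LexemePath a = new LexemePath();
            a.addNotCrossLexeme(new Lexeme(0, 0, 2, Lexeme.TYPE_CNWORD));
            a.addNotCrossLexeme(new Lexeme(0, 2, 2, Lexeme.TYPE_CNWORD));

            // Path B: a three-char lexeme at 0-2 plus a single char at 3.
            LexemePath b = new LexemePath();
            b.addNotCrossLexeme(new Lexeme(0, 0, 3, Lexeme.TYPE_CNWORD));
            b.addNotCrossLexeme(new Lexeme(0, 3, 1, Lexeme.TYPE_CNCHAR));

            // Payload (4), lexeme count (2), span (4), and path end (4) all tie;
            // the X-weight rule prefers the more even 2*2=4 over 3*1=3.
            System.out.println(a.compareTo(b)); // -1: path A wins
        }
    }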
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/Lexeme.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.core;

/**
 * IK lexeme object
 */
public class Lexeme implements Comparable<Lexeme> {
    // lexemeType constants
    // Unknown
    public static final int TYPE_UNKNOWN = 0;
    // English
    public static final int TYPE_ENGLISH = 1;
    // Number
    public static final int TYPE_ARABIC = 2;
    // Mixed English letters and digits
    public static final int TYPE_LETTER = 3;
    // Chinese word
    public static final int TYPE_CNWORD = 4;
    // Single Chinese character
    public static final int TYPE_CNCHAR = 64;
    // Japanese/Korean characters
    public static final int TYPE_OTHER_CJK = 8;
    // Chinese numeral
    public static final int TYPE_CNUM = 16;
    // Chinese quantifier
    public static final int TYPE_COUNT = 32;
    // Chinese numeral plus quantifier
    public static final int TYPE_CQUAN = 48;

    // Buffer offset of the lexeme
    private int offset;
    // Start position of the lexeme relative to the buffer
    private int begin;
    // Length of the lexeme
    private int length;
    // Lexeme text
    private String lexemeText;
    // Lexeme type
    private int lexemeType;

    public Lexeme(int offset, int begin, int length, int lexemeType) {
        this.offset = offset;
        this.begin = begin;
        if (length < 0) {
            throw new IllegalArgumentException("length < 0");
        }
        this.length = length;
        this.lexemeType = lexemeType;
    }

    /*
     * Lexeme equality: same buffer offset, same start position, same end position.
     * @see java.lang.Object#equals(Object o)
     */
    public boolean equals(Object o) {
        if (o == null) {
            return false;
        }

        if (this == o) {
            return true;
        }

        if (o instanceof Lexeme) {
            Lexeme other = (Lexeme) o;
            if (this.offset == other.getOffset()
                    && this.begin == other.getBegin()
                    && this.length == other.getLength()) {
                return true;
            } else {
                return false;
            }
        } else {
            return false;
        }
    }

    /*
     * Lexeme hash code.
     * @see java.lang.Object#hashCode()
     */
    public int hashCode() {
        int absBegin = getBeginPosition();
        int absEnd = getEndPosition();
        return (absBegin * 37) + (absEnd * 31) + ((absBegin * absEnd) % getLength()) * 11;
    }

    /*
     * Ordering of lexemes in sorted sets.
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    public int compareTo(Lexeme other) {
        // Earlier start position first
        if (this.begin < other.getBegin()) {
            return -1;
        } else if (this.begin == other.getBegin()) {
            // Longer lexeme first
            if (this.length > other.getLength()) {
                return -1;
            } else if (this.length == other.getLength()) {
                return 0;
            } else { // this.length < other.getLength()
                return 1;
            }

        } else { // this.begin > other.getBegin()
            return 1;
        }
    }

    public int getOffset() {
        return offset;
    }

    public void setOffset(int offset) {
        this.offset = offset;
    }

    public int getBegin() {
        return begin;
    }

    /**
     * Returns the start position of the lexeme in the source text.
     * @return int
     */
    public int getBeginPosition() {
        return offset + begin;
    }

    public void setBegin(int begin) {
        this.begin = begin;
    }

    /**
     * Returns the end position of the lexeme in the source text.
     * @return int
     */
    public int getEndPosition() {
        return offset + begin + length;
    }

    /**
     * Returns the character length of the lexeme.
     * @return int
     */
    public int getLength() {
        return this.length;
    }

    public void setLength(int length) {
        // The original checked this.length (the old field value) instead of the
        // argument, so invalid input was never rejected; fixed here.
        if (length < 0) {
            throw new IllegalArgumentException("length < 0");
        }
        this.length = length;
    }

    /**
     * Returns the text of the lexeme.
     * @return String
     */
    public String getLexemeText() {
        if (lexemeText == null) {
            return "";
        }
        return lexemeText;
    }

    public void setLexemeText(String lexemeText) {
        if (lexemeText == null) {
            this.lexemeText = "";
            this.length = 0;
        } else {
            this.lexemeText = lexemeText;
            this.length = lexemeText.length();
        }
    }

    /**
     * Returns the lexeme type.
     * @return int
     */
    public int getLexemeType() {
        return lexemeType;
    }

    /**
     * Returns a display string for the lexeme type.
     * @return String
     */
    public String getLexemeTypeString() {
        switch (lexemeType) {

        case TYPE_ENGLISH:
            return "ENGLISH";

        case TYPE_ARABIC:
            return "ARABIC";

        case TYPE_LETTER:
            return "LETTER";

        case TYPE_CNWORD:
            return "CN_WORD";

        case TYPE_CNCHAR:
            return "CN_CHAR";

        case TYPE_OTHER_CJK:
            return "OTHER_CJK";

        case TYPE_COUNT:
            return "COUNT";

        case TYPE_CNUM:
            return "TYPE_CNUM";

        case TYPE_CQUAN:
            return "TYPE_CQUAN";

        default:
            return "UNKNOWN"; // was misspelled "UNKONW"
        }
    }

    public void setLexemeType(int lexemeType) {
        this.lexemeType = lexemeType;
    }

    /**
     * Merges an adjacent lexeme into this one.
     * @param l
     * @param lexemeType
     * @return boolean whether the merge succeeded
     */
    public boolean append(Lexeme l, int lexemeType) {
        if (l != null && this.getEndPosition() == l.getBeginPosition()) {
            this.length += l.getLength();
            this.lexemeType = lexemeType;
            return true;
        } else {
            return false;
        }
    }

    public String toString() {
        StringBuffer strbuf = new StringBuffer();
        strbuf.append(this.getBeginPosition()).append("-").append(this.getEndPosition());
        strbuf.append(" : ").append(this.lexemeText).append(" : \t");
        strbuf.append(this.getLexemeTypeString());
        return strbuf.toString();
    }

}
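Note: Lexeme is public, so the merge rule in append can be shown directly. A minimal sketch; the positions are illustrative:

    import org.wltea.analyzer.core.Lexeme;

    public class LexemeAppendSketch {
        public static void main(String[] args) {
            // "windows" at positions 0-6 and "2000" at 7-10: end meets begin, so they merge.
            Lexeme word = new Lexeme(0, 0, 7, Lexeme.TYPE_ENGLISH);
            Lexeme num  = new Lexeme(0, 7, 4, Lexeme.TYPE_ARABIC);
            boolean merged = word.append(num, Lexeme.TYPE_LETTER);
            System.out.println(merged);                     // true
            System.out.println(word.getLength());           // 11
            System.out.println(word.getLexemeTypeString()); // LETTER
        }
    }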
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/resources/mybatis/mybatis-3-mapper.dtd:
--------------------------------------------------------------------------------
[Content not preserved in this dump; the file is the standard MyBatis 3 mapper DTD.]
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/core/LetterSegmenter.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.core;

import java.util.Arrays;

/**
 * Sub-segmenter for English letters and Arabic digits
 */
class LetterSegmenter implements ISegmenter {

    // Sub-segmenter label
    static final String SEGMENTER_NAME = "LETTER_SEGMENTER";
    // Connector characters for mixed tokens
    private static final char[] Letter_Connector = new char[] { '#', '&', '+', '-', '.', '@', '_' };
    // Connector characters for numbers
    private static final char[] Num_Connector = new char[] { ',', '.' };

    /*
     * Start position of the lexeme; doubles as the segmenter state flag:
     * start > -1 means this segmenter is currently consuming characters.
     */
    private int start;
    /*
     * End position of the lexeme: the position of the last letter or digit
     * in the lexeme that is not a connector character.
     */
    private int end;

    /*
     * Start position of the English run
     */
    private int englishStart;

    /*
     * End position of the English run
     */
    private int englishEnd;

    /*
     * Start position of the Arabic-digit run
     */
    private int arabicStart;

    /*
     * End position of the Arabic-digit run
     */
    private int arabicEnd;

    LetterSegmenter() {
        Arrays.sort(Letter_Connector);
        Arrays.sort(Num_Connector);
        this.start = -1;
        this.end = -1;
        this.englishStart = -1;
        this.englishEnd = -1;
        this.arabicStart = -1;
        this.arabicEnd = -1;
    }

    /* (non-Javadoc)
     * @see org.wltea.analyzer.core.ISegmenter#analyze(org.wltea.analyzer.core.AnalyzeContext)
     */
    public void analyze(AnalyzeContext context) {
        boolean bufferLockFlag = false;
        // Process English letters
        bufferLockFlag = this.processEnglishLetter(context) || bufferLockFlag;
        // Process Arabic digits
        bufferLockFlag = this.processArabicLetter(context) || bufferLockFlag;
        // Process mixed tokens last, so QuickSortSet can weed out duplicates
        bufferLockFlag = this.processMixLetter(context) || bufferLockFlag;

        // Decide whether the buffer must stay locked
        if (bufferLockFlag) {
            context.lockBuffer(SEGMENTER_NAME);
        } else {
            // Unlock the buffer
            context.unlockBuffer(SEGMENTER_NAME);
        }
    }

    /* (non-Javadoc)
     * @see org.wltea.analyzer.core.ISegmenter#reset()
     */
    public void reset() {
        this.start = -1;
        this.end = -1;
        this.englishStart = -1;
        this.englishEnd = -1;
        this.arabicStart = -1;
        this.arabicEnd = -1;
    }

    /**
     * Emits mixed letter/digit tokens,
     * e.g. windows2000 or linliangyi2005@gmail.com
     * @param context
     */
    private boolean processMixLetter(AnalyzeContext context) {
        boolean needLock = false;

        if (this.start == -1) { // this segmenter is not processing characters yet
            if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
                    || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
                // Record the start pointer; the segmenter enters the processing state
                this.start = context.getCursor();
                this.end = start;
            }

        } else { // the segmenter is processing characters
            if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()
                    || CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
                // Record a possible end position
                this.end = context.getCursor();

            } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
                    && this.isLetterConnector(context.getCurrentChar())) {
                // Record a possible end position
                this.end = context.getCursor();
            } else {
                // Hit a non-letter character: emit the lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
                context.addLexeme(newLexeme);
                this.start = -1;
                this.end = -1;
            }
        }

        // Check whether the buffer is exhausted
        if (context.isBufferConsumed()) {
            if (this.start != -1 && this.end != -1) {
                // Buffer exhausted: emit the lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.start, this.end - this.start + 1, Lexeme.TYPE_LETTER);
                context.addLexeme(newLexeme);
                this.start = -1;
                this.end = -1;
            }
        }

        // Decide whether the buffer must stay locked
        if (this.start == -1 && this.end == -1) {
            // Unlock the buffer
            needLock = false;
        } else {
            needLock = true;
        }
        return needLock;
    }

    /**
     * Emits runs of pure English letters.
     * @param context
     */
    private boolean processEnglishLetter(AnalyzeContext context) {
        boolean needLock = false;

        if (this.englishStart == -1) { // not processing English characters yet
            if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
                // Record the start pointer; the segmenter enters the processing state
                this.englishStart = context.getCursor();
                this.englishEnd = this.englishStart;
            }
        } else { // currently processing English characters
            if (CharacterUtil.CHAR_ENGLISH == context.getCurrentCharType()) {
                // Record the current cursor as the end position
                this.englishEnd = context.getCursor();
            } else {
                // Hit a non-English character: emit the lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
                context.addLexeme(newLexeme);
                this.englishStart = -1;
                this.englishEnd = -1;
            }
        }

        // Check whether the buffer is exhausted
        if (context.isBufferConsumed()) {
            if (this.englishStart != -1 && this.englishEnd != -1) {
                // Buffer exhausted: emit the lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.englishStart, this.englishEnd - this.englishStart + 1, Lexeme.TYPE_ENGLISH);
                context.addLexeme(newLexeme);
                this.englishStart = -1;
                this.englishEnd = -1;
            }
        }

        // Decide whether the buffer must stay locked
        if (this.englishStart == -1 && this.englishEnd == -1) {
            // Unlock the buffer
            needLock = false;
        } else {
            needLock = true;
        }
        return needLock;
    }

    /**
     * Emits runs of Arabic digits.
     * @param context
     */
    private boolean processArabicLetter(AnalyzeContext context) {
        boolean needLock = false;

        if (this.arabicStart == -1) { // not processing digit characters yet
            if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
                // Record the start pointer; the segmenter enters the processing state
                this.arabicStart = context.getCursor();
                this.arabicEnd = this.arabicStart;
            }
        } else { // currently processing digit characters
            if (CharacterUtil.CHAR_ARABIC == context.getCurrentCharType()) {
                // Record the current cursor as the end position
                this.arabicEnd = context.getCursor();
            } else if (CharacterUtil.CHAR_USELESS == context.getCurrentCharType()
                    && this.isNumConnector(context.getCurrentChar())) {
                // Do not emit the number, but do not mark the end either
            } else {
                // Hit a non-Arabic character: emit the lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
                context.addLexeme(newLexeme);
                this.arabicStart = -1;
                this.arabicEnd = -1;
            }
        }

        // Check whether the buffer is exhausted
        if (context.isBufferConsumed()) {
            if (this.arabicStart != -1 && this.arabicEnd != -1) {
                // Emit the segmented lexeme
                Lexeme newLexeme = new Lexeme(context.getBufferOffset(), this.arabicStart, this.arabicEnd - this.arabicStart + 1, Lexeme.TYPE_ARABIC);
                context.addLexeme(newLexeme);
                this.arabicStart = -1;
                this.arabicEnd = -1;
            }
        }

        // Decide whether the buffer must stay locked
        if (this.arabicStart == -1 && this.arabicEnd == -1) {
            // Unlock the buffer
            needLock = false;
        } else {
            needLock = true;
        }
        return needLock;
    }

    /**
     * Checks whether the character is a letter connector.
     * @param input
     */
    private boolean isLetterConnector(char input) {
        int index = Arrays.binarySearch(Letter_Connector, input);
        return index >= 0;
    }

    /**
     * Checks whether the character is a number connector.
     * @param input
     */
    private boolean isNumConnector(char input) {
        int index = Arrays.binarySearch(Num_Connector, input);
        return index >= 0;
    }
}
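Note: the effect of the three passes can be observed by running IKSegmenter over a mixed token. A hedged sketch; the exact lexemes and their order depend on dictionary and mode settings:

    import java.io.StringReader;
    import org.wltea.analyzer.core.IKSegmenter;
    import org.wltea.analyzer.core.Lexeme;

    public class LetterSketch {
        public static void main(String[] args) throws Exception {
            // Non-smart mode keeps all candidate lexemes.
            IKSegmenter seg = new IKSegmenter(new StringReader("windows2000"), false);
            for (Lexeme l = seg.next(); l != null; l = seg.next()) {
                // Expect, in some order: windows (ENGLISH), 2000 (ARABIC),
                // and the merged windows2000 (LETTER).
                System.out.println(l.getLexemeText() + " : " + l.getLexemeTypeString());
            }
        }
    }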
--------------------------------------------------------------------------------
/xultimate-ikanalyzer/src/main/java/org/wltea/analyzer/dic/DictSegment.java:
--------------------------------------------------------------------------------
/*
 * IK Analyzer release 5.0. Apache License, Version 2.0 (see the header in
 * SWMCQueryBuilder.java above). Source code provided by Lin Liangyi
 * (linliangyi2005@gmail.com); copyright 2012, Oolong Studio.
 */
package org.wltea.analyzer.dic;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

/**
 * Dictionary segment: one branch of the dictionary trie
 */
class DictSegment implements Comparable<DictSegment> {

    // Shared character table storing the Chinese characters
    private static final Map<Character, Character> charMap = new HashMap<Character, Character>(16, 0.95f);
    // Upper bound for the array storage
    private static final int ARRAY_LENGTH_LIMIT = 3;

    // Map storage for the children
    private Map<Character, DictSegment> childrenMap;
    // Array storage for the children
    private DictSegment[] childrenArray;

    // Character stored on this node
    private Character nodeChar;
    // Number of child segments on this node:
    // storeSize <= ARRAY_LENGTH_LIMIT uses the array; storeSize > ARRAY_LENGTH_LIMIT uses the map
    private int storeSize = 0;
    // Node state: 0 by default; 1 means the path from the root to this node spells a word
    private int nodeState = 0;

    DictSegment(Character nodeChar) {
        if (nodeChar == null) {
            throw new IllegalArgumentException("The node character must not be null");
        }
        this.nodeChar = nodeChar;
    }

    Character getNodeChar() {
        return nodeChar;
    }

    /*
     * Checks whether this node has children.
     */
    boolean hasNextNode() {
        return this.storeSize > 0;
    }

    /**
     * Matches a word segment.
     * @param charArray
     * @return Hit
     */
    Hit match(char[] charArray) {
        return this.match(charArray, 0, charArray.length, null);
    }

    /**
     * Matches a word segment.
     * @param charArray
     * @param begin
     * @param length
     * @return Hit
     */
    Hit match(char[] charArray, int begin, int length) {
        return this.match(charArray, begin, length, null);
    }

    /**
     * Matches a word segment.
     * @param charArray
     * @param begin
     * @param length
     * @param searchHit
     * @return Hit
     */
    Hit match(char[] charArray, int begin, int length, Hit searchHit) {

        if (searchHit == null) {
            // No hit yet: create one and set its start position
            searchHit = new Hit();
            searchHit.setBegin(begin);
        } else {
            // Otherwise reset the hit state
            searchHit.setUnmatch();
        }
        // Set the hit's current processing position
        searchHit.setEnd(begin);

        Character keyChar = new Character(charArray[begin]);
        DictSegment ds = null;
		// copy the instance fields into locals, to avoid synchronization problems
		// when a lookup races an update
		DictSegment[] segmentArray = this.childrenArray;
		Map<Character, DictSegment> segmentMap = this.childrenMap;

		// STEP 1: look up the DictSegment for keyChar on this node
		if(segmentArray != null){
			// search the array
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}

		}else if(segmentMap != null){
			// search the map
			ds = segmentMap.get(keyChar);
		}

		// STEP 2: a DictSegment was found; decide whether to recurse or to return the result
		if(ds != null){
			if(length > 1){
				// the word is not fully matched yet; keep searching downwards
				return ds.match(charArray, begin + 1, length - 1, searchHit);
			}else if (length == 1){

				// this is the last char to search
				if(ds.nodeState == 1){
					// mark the hit as a full match
					searchHit.setMatch();
				}
				if(ds.hasNextNode()){
					// mark the hit as a prefix match
					searchHit.setPrefix();
					// remember the DictSegment at the current position
					searchHit.setMatchedDictSegment(ds);
				}
				return searchHit;
			}

		}
		// STEP 3: no DictSegment was found; the hit stays unmatched
		return searchHit;
	}

	/**
	 * Load a word fragment into the dictionary.
	 * @param charArray
	 */
	void fillSegment(char[] charArray){
		this.fillSegment(charArray, 0, charArray.length, 1);
	}

	/**
	 * Disable one word in the dictionary.
	 * @param charArray
	 */
	void disableSegment(char[] charArray){
		this.fillSegment(charArray, 0, charArray.length, 0);
	}

	/**
	 * Load a word fragment into the dictionary.
	 * @param charArray
	 * @param begin
	 * @param length
	 * @param enabled
	 */
	private synchronized void fillSegment(char[] charArray, int begin, int length, int enabled){
		// fetch the character object from the shared character table
		Character beginChar = Character.valueOf(charArray[begin]);
		Character keyChar = charMap.get(beginChar);
		// the character is not in the table yet: add it
		if(keyChar == null){
			charMap.put(beginChar, beginChar);
			keyChar = beginChar;
		}

		// search this node's storage for the segment matching keyChar, creating it if absent
		DictSegment ds = lookforSegment(keyChar, enabled);
		if(ds != null){
			// process the segment for keyChar
			if(length > 1){
				// the word is not fully inserted into the trie yet
				ds.fillSegment(charArray, begin + 1, length - 1, enabled);
			}else if (length == 1){
				// this is the word's last char: set this node's state to enabled;
				// enabled=1 marks a complete word, enabled=0 disables the word in the dictionary
				ds.nodeState = enabled;
			}
		}

	}

	/**
	 * Find the child segment for keyChar under this node.
	 * @param keyChar
	 * @param create =1: create a new segment if none is found; =0: do not create, return null
	 * @return
	 */
	private DictSegment lookforSegment(Character keyChar, int create){

		DictSegment ds = null;

		if(this.storeSize <= ARRAY_LENGTH_LIMIT){
			// get the array container, creating it on first use
			DictSegment[] segmentArray = getChildrenArray();
			// search the array
			DictSegment keySegment = new DictSegment(keyChar);
			int position = Arrays.binarySearch(segmentArray, 0, this.storeSize, keySegment);
			if(position >= 0){
				ds = segmentArray[position];
			}

			// no matching segment found in the array
			if(ds == null && create == 1){
				ds = keySegment;
				if(this.storeSize < ARRAY_LENGTH_LIMIT){
					// the array still has room: keep using array storage
					segmentArray[this.storeSize] = ds;
					// one more segment
					this.storeSize++;
					Arrays.sort(segmentArray, 0, this.storeSize);

				}else{
					// the array is full: switch to map storage
					// get the map container, creating it on first use
					Map<Character, DictSegment> segmentMap = getChildrenMap();
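					// this whole array-to-map switch runs under fillSegment()'s lock
					// (lookforSegment is only called from there), so there is a single
					// writer; the write ordering below protects the lock-free match()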
					// migrate the existing segments from the array into the map
					migrate(segmentArray, segmentMap);
					// store the new segment
					segmentMap.put(keyChar, ds);
					// increment the segment count; storeSize++ must happen before the array
					// reference is released, so a reader can never observe an empty container
					this.storeSize++;
					// release the array reference
					this.childrenArray = null;
				}

			}

		}else{
			// get the map container, creating it on first use
			Map<Character, DictSegment> segmentMap = getChildrenMap();
			// search the map
			ds = segmentMap.get(keyChar);
			if(ds == null && create == 1){
				// build a new segment
				ds = new DictSegment(keyChar);
				segmentMap.put(keyChar, ds);
				// one more segment on this node
				this.storeSize++;
			}
		}

		return ds;
	}


	/**
	 * Get the array container.
	 * Thread-safe lazy initialization.
	 */
	private DictSegment[] getChildrenArray(){
		if(this.childrenArray == null){
			synchronized(this){
				if(this.childrenArray == null){
					this.childrenArray = new DictSegment[ARRAY_LENGTH_LIMIT];
				}
			}
		}
		return this.childrenArray;
	}

	/**
	 * Get the map container.
	 * Thread-safe lazy initialization.
	 */
	private Map<Character, DictSegment> getChildrenMap(){
		if(this.childrenMap == null){
			synchronized(this){
				if(this.childrenMap == null){
					this.childrenMap = new HashMap<Character, DictSegment>(ARRAY_LENGTH_LIMIT * 2, 0.8f);
				}
			}
		}
		return this.childrenMap;
	}

	/**
	 * Migrate the segments stored in the array into the map.
	 * @param segmentArray
	 * @param segmentMap
	 */
	private void migrate(DictSegment[] segmentArray, Map<Character, DictSegment> segmentMap){
		for(DictSegment segment : segmentArray){
			if(segment != null){
				segmentMap.put(segment.nodeChar, segment);
			}
		}
	}

	/**
	 * Implement the Comparable interface.
	 * @param o
	 * @return int
	 */
	public int compareTo(DictSegment o) {
		// compare the characters stored on the two nodes
		return this.nodeChar.compareTo(o.nodeChar);
	}

}
--------------------------------------------------------------------------------
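The interplay of the full-match and prefix-match states is the heart of DictSegment. A minimal sketch of how a caller would observe them follows; DictSegmentDemo is a hypothetical class and must live in org.wltea.analyzer.dic (DictSegment and Hit are package-private), and Hit.isMatch()/Hit.isPrefix() are assumed from Hit.java, which this listing does not reproduce:

package org.wltea.analyzer.dic;

// Hypothetical demo exercising the trie directly (not part of the project).
public class DictSegmentDemo {
    public static void main(String[] args) {
        // an artificial root node; (char) 0 is just a placeholder character
        DictSegment root = new DictSegment(Character.valueOf((char) 0));
        root.fillSegment("中国".toCharArray());
        root.fillSegment("中国人".toCharArray());

        // "中国" is both a complete word and the prefix of "中国人"
        Hit hit = root.match("中国".toCharArray());
        System.out.println(hit.isMatch());   // expected: true
        System.out.println(hit.isPrefix());  // expected: true

        // "中" alone is only a prefix, not a stored word
        hit = root.match("中".toCharArray());
        System.out.println(hit.isMatch());   // expected: false
        System.out.println(hit.isPrefix());  // expected: true
    }
}

A hit that is both a match and a prefix is what lets the CJK segmenter emit a word immediately while continuing to extend it toward a longer dictionary entry.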